예제 #1
0
  def test_html_scan(self):
    """Scan a remote HTML page using the lexicon produced by test_dumpData.

    Runs test_dumpData first, then loads the pickle at self.pkl into a
    fresh Subtitle, parses the page and checks the resulting counts via
    sub_assert.
    NOTE(review): the input is a live URL, so this test presumably needs
    network access - confirm before running it offline.
    """
    # Re-run the dump test so the pickle at self.pkl is available to load.
    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
    scanner.parse()

    expected = dict(lex=4156, stem_lex=3104, words=127, stem_words=71, new_words=32)
    self.sub_assert(scanner, **expected)
예제 #2
0
  def test_txt_scan(self):
    """Scan an .srt subtitle file using the lexicon produced by test_dumpData.

    Runs test_dumpData first, then loads the pickle at self.pkl into a
    fresh Subtitle, parses the subtitle file and checks:
      * the resulting counts, via sub_assert;
      * that the character names are present in nameSet;
      * that their lower-cased forms are absent from newWords.
    """
    # Re-run the dump test so the pickle at self.pkl is available to load.
    self.test_dumpData()

    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    sub_scan.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
    sub_scan.parse()
    self.sub_assert(sub_scan,lex=4156, stem_lex=3104, words=1936, stem_words=943, new_words=518)

    names_mv = ['Yuri', 'Simeon']
    # Use the unittest helpers instead of bare ``assert``: bare asserts are
    # removed when Python runs with -O, and the helpers report the offending
    # value on failure.
    for n in names_mv:
      self.assertIn(n, sub_scan.nameSet)
    for n in names_mv:
      self.assertNotIn(n.lower(), sub_scan.newWords)
예제 #3
0
def main(argv=None, logger=None):
  """Command-line entry point: parse options, run a Subtitle scan, print results.

  Args:
    argv: list of command-line arguments (without the program name) handed
      to getopt.getopt.
    logger: logger passed to Subtitle; when None one is created with
      createLog(logname="subtitle", level=logging.INFO).

  Recognised options:
    -h, -?, --help     call usage() and exit
    -v, --version      call version() and exit
    -c, --checkup      set sub.checkup to True
    -d, --dir DIR      not offered yet: print a notice and exit
    -p, --pickle FILE  lexicon file passed to sub.setLexiconFile
    -f, --file FILE    input passed to sub.addFile
    -w, --word WORD    word passed to sub.addWord
    -t, --type TYPE    'word' or 'scan'; 'word' calls sub.dumpData() at the end
    -m, --limit N      integer passed to sub.words_show; also enables it
    -l                 call sub.lines_show() after parsing
    -W                 call sub.words_show() after parsing
    -D                 set the logger level to DEBUG

  Remaining positional arguments are passed to sub.addFiles.

  Raises:
    SystemExit: on option errors (status 2) and for the options that exit.
    ValueError: if the -m/--limit argument is not an integer.
  """
  if logger is None:
    logger = createLog(logname="subtitle", level=logging.INFO)

  startDtime = datetime.now()
  # Single-argument print(...) produces the same output under Python 2's
  # print statement and Python 3's print function.
  print("Start time: " + str(startDtime))
  print("")
  sub = Subtitle(logger)

  try:
    # Each long option must be its own list element. Without the comma after
    # "checkup", adjacent literals concatenate into "checkupfile=" and both
    # --checkup and --file are rejected by getopt.
    opts, args = getopt.getopt(
      argv,
      "hvf:w:t:d:p:?lm:WDc",
      ["help", "version", "checkup", "file=", "word=", "type=", "dir=", "pickle=", "limit="])
    logger.info("opts:{0};args:{1}".format(opts, args))
  except getopt.GetoptError as msg:
    print("error happened when get options!!! error:{0}".format(msg))
    usage()
    logger.error("getopt.GetoptError:{0}, exit!".format(msg))
    sys.exit(2)
  except Exception as msg:
    logger.error("error:{0}, exit!".format(msg))
    sys.exit(2)

  _is_lines_show = False
  _is_words_show = False
  sub_type = ""
  words_limit = None
  for opt, arg in opts:
    if opt in ("-?", "-h", "--help"):
      usage()
      sys.exit()
    elif opt in ("-v", "--version"):
      version()
      sys.exit()
    elif opt in ("-c", "--checkup"):
      sub.checkup = True
    elif opt in ("-d", "--dir"):
      print("Sorry, -d --dir option still not offer")
      sys.exit()
    elif opt in ("-p", "--pickle"):
      sub.setLexiconFile(arg)
    elif opt in ('-f', "--file"):
      sub.addFile(arg)
    elif opt == '-D':
      logger.setLevel(logging.DEBUG)
      sub.setLogger(logger)
    elif opt in ("-w", "--word"):
      sub.addWord(arg)
    elif opt in ("-t", "--type"):
      sub_type = arg
      if sub_type not in ('word', 'scan'):
        usage()
        sys.exit()
    elif opt in ("-m", "--limit"):
      # Giving a limit also switches the word listing on.
      words_limit = int(arg)
      _is_words_show = True
    elif opt == '-l':
      _is_lines_show = True
    elif opt == '-W':
      _is_words_show = True

  # Fall back to a default lexicon file when -p/--pickle was not given.
  if sub.lexicon_path is None:
    sub.setLexiconFile("lexicon.pickle")
  sub.loadOldData()

  sub.addFiles(args)

  sub.parse()

  if _is_lines_show:
    sub.lines_show()

  if _is_words_show:
    sub.words_show(words_limit)
  sub.show()

  if sub_type == 'word':
    sub.dumpData()

  print("")
  endDtime = datetime.now()
  print("End time: " + str(endDtime))
  timedelta = endDtime - startDtime
  print("Cost time: " + str(timedelta))
예제 #4
0
class Sub_testCase(unittest.TestCase):
  """Unit tests for the Subtitle class.

  The tests read vocabulary and subtitle fixtures via relative paths under
  ../data and write a scratch pickle at self.pkl, which tearDown removes.
  """

  def __init__(self, *args, **kwargs):
    unittest.TestCase.__init__(self, *args, **kwargs)
    # Scratch lexicon pickle; deleted again in tearDown.
    self.pkl = "../data/test.pickle"
    # Vocabulary list used by test_word and test_dumpData.
    self.fname = '../data/vocabulary/Vocabulary -juniorHighschool(chinese) .txt'

  def setUp(self):
    """Create a fresh Subtitle instance for each test."""
    self.sub = Subtitle(loglevel=logging.DEBUG)

  def tearDown(self):
    """Delete the scratch pickle if a test created it."""
    if os.path.exists(self.pkl):
      os.remove(self.pkl)

  def sub_assert(self, sub, lex=None, stem_lex=None, words=None, stem_words=None, new_words=None):
    """Check the sizes of a Subtitle object's collections.

    Args:
      sub: object exposing lexicon, stem_lexicon, wordSet, stem_newWords
        and newWords.
      lex: expected exact len(sub.lexicon).
      stem_lex: expected exact len(sub.stem_lexicon).
      words: upper bound for len(sub.wordSet).
      stem_words: upper bound for len(sub.stem_newWords).
      new_words: upper bound for len(sub.newWords).

    An argument left as None skips that check.
    """
    # Compare against None rather than relying on truthiness, so that an
    # expected count of 0 is actually verified instead of silently skipped.
    if lex is not None:
      self.assertEqual(len(sub.lexicon), lex)
    if stem_lex is not None:
      self.assertEqual(len(sub.stem_lexicon), stem_lex)
    if words is not None:
      self.assertLessEqual(len(sub.wordSet), words)
    if stem_words is not None:
      self.assertLessEqual(len(sub.stem_newWords), stem_words)
    if new_words is not None:
      self.assertLessEqual(len(sub.newWords), new_words)

  def test_addWord(self):
    """Words added via addWord/addWords are only counted after parse()."""
    self.sub.setLexiconFile(self.pkl)
    self.sub.addWord('eyes')
    # Before the first parse() newWords has not been built yet.
    self.assertIs(self.sub.newWords, None)
    self.sub_assert(self.sub,lex=0, stem_lex=0)

    self.sub.parse()
    self.assertIs(type(self.sub.newWords), Counter)
    self.sub_assert(self.sub,lex=0, stem_lex=0, words=1, stem_words=1, new_words=1)

    self.sub.addWords(['anymore','sold'])
    # Bounds are expected to still hold until parse() is called again.
    self.sub_assert(self.sub,lex=0, stem_lex=0, words=1, stem_words=1, new_words=1)
    self.sub.parse()
    self.sub_assert(self.sub,lex=0, stem_lex=0, words=2, stem_words=2, new_words=2)

  def test_dumpData(self):
    """Parse three vocabulary files and dump the result to self.pkl.

    Also called by the scan tests to produce the pickle they load.
    """
    self.sub.setLexiconFile(self.pkl)
    self.sub.addFile(self.fname)
    self.sub.addFile('../data/vocabulary/Vocabulary -highschool(chinese).txt')
    self.sub.addFile('../data/vocabulary/Vocabulary-cet-4 (chinese).txt')
    self.sub.parse()
    self.sub.dumpData()
    self.sub_assert(self.sub, lex=0, stem_lex=0, words=4156, stem_words=4156, new_words=4156)

  def test_html_scan(self):
    """Scan a remote HTML page using the lexicon dumped by test_dumpData.

    NOTE(review): the input is a live URL, so this test presumably needs
    network access - confirm before running it offline.
    """
    self.test_dumpData()

    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    sub_scan.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
    sub_scan.parse()
    self.sub_assert(sub_scan,lex=4156, stem_lex=3104, words=127, stem_words=71, new_words=32)

  def test_txt_scan(self):
    """Scan an .srt subtitle file using the lexicon dumped by test_dumpData.

    Besides the size checks, verifies that the character names appear in
    nameSet and that their lower-cased forms are absent from newWords.
    """
    self.test_dumpData()

    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    sub_scan.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
    sub_scan.parse()
    self.sub_assert(sub_scan,lex=4156, stem_lex=3104, words=1936, stem_words=943, new_words=518)

    names_mv = ['Yuri', 'Simeon']
    # Use the unittest helpers instead of bare ``assert``: bare asserts are
    # removed when Python runs with -O, and the helpers report the offending
    # value on failure.
    for n in names_mv:
      self.assertIn(n, sub_scan.nameSet)
    for n in names_mv:
      self.assertNotIn(n.lower(), sub_scan.newWords)

  def test_word(self):
    """Parse a single vocabulary file and check the resulting counts."""
    self.sub.setLexiconFile(self.pkl)
    self.sub.addFile(self.fname)
    self.sub.parse()

    self.sub_assert(self.sub, lex=0, stem_lex=0, words=1599, stem_words=1599, new_words=1599)