def test_html_scan(self):
    """Scan an HTML page against the lexicon dumped by test_dumpData.

    Runs test_dumpData() first so that ``self.pkl`` exists, loads that
    pickle into a fresh Subtitle instance, parses the remote page and
    checks the resulting counts via sub_assert().

    NOTE(review): the page is fetched from a live URL, so this test
    presumably needs network access -- confirm before running offline.
    """
    # Populate the pickle that the scanner below loads.
    self.test_dumpData()

    scanner = Subtitle(loglevel=logging.DEBUG)
    scanner.setLexiconFile(self.pkl)
    scanner.loadOldData()
    scanner.addFile(
        'https://selenium-python.readthedocs.org/en/latest/index.html')
    scanner.parse()

    self.sub_assert(
        scanner,
        lex=4156,
        stem_lex=3104,
        words=127,
        stem_words=71,
        new_words=32,
    )
def test_txt_scan(self):
    """Scan an .srt subtitle file against the lexicon dumped by test_dumpData.

    Runs test_dumpData() first so that ``self.pkl`` exists, loads that
    pickle into a fresh Subtitle instance, parses the subtitle file and
    checks the resulting counts via sub_assert().  It then verifies that
    the character names are recorded in ``nameSet`` and that their
    lower-cased forms are not reported in ``newWords``.
    """
    # Populate the pickle that the scanner below loads.
    self.test_dumpData()

    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    sub_scan.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
    sub_scan.parse()

    self.sub_assert(sub_scan, lex=4156, stem_lex=3104,
                    words=1936, stem_words=943, new_words=518)

    names_mv = ['Yuri', 'Simeon']

    # Use the unittest helpers instead of bare ``assert``: they are not
    # stripped under ``python -O`` and report the offending values.
    for name in names_mv:
        self.assertIn(name, sub_scan.nameSet)

    for name in names_mv:
        self.assertNotIn(name.lower(), sub_scan.newWords)
def main(argv=None, logger=None):
    """Command-line entry point: parse options, scan the inputs, show results.

    Args:
        argv: Argument list handed to ``getopt.getopt`` (without the
            program name).  Positional arguments left over after option
            parsing are passed to ``sub.addFiles``.
        logger: Logger to use.  When ``None`` one is created with
            ``createLog(logname="subtitle", level=logging.INFO)``.

    Options:
        -h, -?, --help    print usage and exit
        -v, --version     print version and exit
        -c, --checkup     set ``sub.checkup`` to True
        -d, --dir         not supported: prints a notice and exits
        -p, --pickle F    use F as the lexicon file
        -f, --file F      add F as an input
        -w, --word W      add the single word W
        -t, --type T      'word' or 'scan'; 'word' dumps data at the end
        -m, --limit N     show words, limited to N (an integer)
        -l                show lines
        -W                show words
        -D                switch the logger to DEBUG

    Raises:
        SystemExit: with status 2 when option parsing fails; with the
            default status for help/version, the unsupported ``--dir``
            option or an invalid ``--type`` value.
    """
    if logger is None:
        logger = createLog(logname="subtitle", level=logging.INFO)

    # print(...) with a single argument behaves identically as a Python 2
    # print statement and as a Python 3 call; print("") replaces bare print.
    startDtime = datetime.now()
    print("Start time: " + str(startDtime))
    print("")

    sub = Subtitle(logger)

    try:
        # The comma after "checkup" is required: without it the adjacent
        # literals fuse into "checkupfile=", which makes getopt reject
        # --file and demand an argument for --checkup.
        opts, args = getopt.getopt(
            argv,
            "hvf:w:t:d:p:?lm:WDc",
            ["help", "version", "checkup",
             "file=", "word=", "type=", "dir=", "pickle=", "limit="])
        logger.info("opts:{0};args:{1}".format(opts, args))
    except getopt.GetoptError as msg:
        print("error happened when get options!!! error:{0}".format(msg))
        usage()
        logger.error("getopt.GetoptError:{0}, exit!".format(msg))
        sys.exit(2)
    except Exception as msg:
        logger.error("error:{0}, exit!".format(msg))
        sys.exit(2)

    _is_lines_show = False
    _is_words_show = False
    sub_type = ""
    words_limit = None

    for opt, arg in opts:
        if opt in ("-?", "-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
        elif opt in ("-c", "--checkup"):
            sub.checkup = True
        elif opt in ("-d", "--dir"):
            print("Sorry, -d --dir option still not offer")
            sys.exit()
        elif opt in ("-p", "--pickle"):
            sub.setLexiconFile(arg)
        elif opt in ("-f", "--file"):
            sub.addFile(arg)
        elif opt == "-D":
            logger.setLevel(logging.DEBUG)
            sub.setLogger(logger)
        elif opt in ("-w", "--word"):
            sub.addWord(arg)
        elif opt in ("-t", "--type"):
            sub_type = arg
            if sub_type not in ('word', 'scan'):
                usage()
                sys.exit()
        elif opt in ("-m", "--limit"):
            # Giving a limit implies that the word list should be shown.
            words_limit = int(arg)
            _is_words_show = True
        elif opt == "-l":
            _is_lines_show = True
        elif opt == "-W":
            _is_words_show = True

    # Fall back to the default lexicon file when -p/--pickle was not given.
    if sub.lexicon_path is None:
        sub.setLexiconFile("lexicon.pickle")

    sub.loadOldData()
    sub.addFiles(args)
    sub.parse()

    if _is_lines_show:
        sub.lines_show()
    if _is_words_show:
        sub.words_show(words_limit)

    sub.show()

    if sub_type == 'word':
        sub.dumpData()

    print("")
    endDtime = datetime.now()
    print("End time: " + str(endDtime))
    elapsed = endDtime - startDtime
    print("Cost time: " + str(elapsed))
class Sub_testCase(unittest.TestCase):
    """Unit tests for the Subtitle class."""

    def __init__(self, *args, **kwargs):
        unittest.TestCase.__init__(self, *args, **kwargs)
        # Pickle file written by the tests; removed again in tearDown().
        self.pkl = "../data/test.pickle"
        # Vocabulary list used as the default input file.
        self.fname = '../data/vocabulary/Vocabulary -juniorHighschool(chinese) .txt'

    def setUp(self):
        """Create a fresh Subtitle instance for every test."""
        self.sub = Subtitle(loglevel=logging.DEBUG)

    def tearDown(self):
        """Delete the pickle file if a test created it."""
        if os.path.exists(self.pkl):
            os.remove(self.pkl)

    def sub_assert(self, sub, lex=None, stem_lex=None, words=None,
                   stem_words=None, new_words=None):
        """Check the sizes of the collections held by ``sub``.

        Args:
            sub: Object exposing ``lexicon``, ``stem_lexicon``, ``wordSet``,
                ``stem_newWords`` and ``newWords``.
            lex: Exact expected ``len(sub.lexicon)``.
            stem_lex: Exact expected ``len(sub.stem_lexicon)``.
            words: Upper bound for ``len(sub.wordSet)``.
            stem_words: Upper bound for ``len(sub.stem_newWords)``.
            new_words: Upper bound for ``len(sub.newWords)``.

        A check is skipped only when its argument is ``None``.  The test
        is ``is not None`` rather than truthiness so that an expectation
        of ``0`` is actually verified instead of being silently ignored.
        """
        if lex is not None:
            self.assertEqual(len(sub.lexicon), lex)
        if stem_lex is not None:
            self.assertEqual(len(sub.stem_lexicon), stem_lex)
        if words is not None:
            self.assertLessEqual(len(sub.wordSet), words)
        if stem_words is not None:
            self.assertLessEqual(len(sub.stem_newWords), stem_words)
        if new_words is not None:
            self.assertLessEqual(len(sub.newWords), new_words)

    def test_addWord(self):
        """Words added one at a time or in bulk are counted after parse()."""
        self.sub.setLexiconFile(self.pkl)
        self.sub.addWord('eyes')
        # newWords is expected to stay None until parse() has run.
        self.assertIsNone(self.sub.newWords)
        self.sub_assert(self.sub, lex=0, stem_lex=0)

        self.sub.parse()
        self.assertIs(type(self.sub.newWords), Counter)
        self.sub_assert(self.sub, lex=0, stem_lex=0,
                        words=1, stem_words=1, new_words=1)

        # Adding more words must not change the counts before parse().
        self.sub.addWords(['anymore', 'sold'])
        self.sub_assert(self.sub, lex=0, stem_lex=0,
                        words=1, stem_words=1, new_words=1)

        self.sub.parse()
        self.sub_assert(self.sub, lex=0, stem_lex=0,
                        words=2, stem_words=2, new_words=2)

    def test_dumpData(self):
        """Parse three vocabulary files and dump the result to ``self.pkl``.

        Also called by the scan tests to create the pickle they load.
        """
        self.sub.setLexiconFile(self.pkl)
        self.sub.addFile(self.fname)
        self.sub.addFile('../data/vocabulary/Vocabulary -highschool(chinese).txt')
        self.sub.addFile('../data/vocabulary/Vocabulary-cet-4 (chinese).txt')
        self.sub.parse()
        self.sub.dumpData()
        self.sub_assert(self.sub, lex=0, stem_lex=0,
                        words=4156, stem_words=4156, new_words=4156)

    def test_html_scan(self):
        """Scan an HTML page against the lexicon dumped by test_dumpData.

        NOTE(review): the page is fetched from a live URL, so this test
        presumably needs network access -- confirm before running offline.
        """
        self.test_dumpData()
        sub_scan = Subtitle(loglevel=logging.DEBUG)
        sub_scan.setLexiconFile(self.pkl)
        sub_scan.loadOldData()
        sub_scan.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
        sub_scan.parse()
        self.sub_assert(sub_scan, lex=4156, stem_lex=3104,
                        words=127, stem_words=71, new_words=32)

    def test_txt_scan(self):
        """Scan an .srt subtitle file against the dumped lexicon.

        Besides the size checks, verifies that the character names are
        recorded in ``nameSet`` and that their lower-cased forms are not
        reported in ``newWords``.
        """
        self.test_dumpData()
        sub_scan = Subtitle(loglevel=logging.DEBUG)
        sub_scan.setLexiconFile(self.pkl)
        sub_scan.loadOldData()
        sub_scan.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
        sub_scan.parse()
        self.sub_assert(sub_scan, lex=4156, stem_lex=3104,
                        words=1936, stem_words=943, new_words=518)

        names_mv = ['Yuri', 'Simeon']

        # unittest helpers instead of bare ``assert``: they survive
        # ``python -O`` and report the offending values on failure.
        for name in names_mv:
            self.assertIn(name, sub_scan.nameSet)

        for name in names_mv:
            self.assertNotIn(name.lower(), sub_scan.newWords)

    def test_word(self):
        """Parse the default vocabulary file and check the word counts."""
        self.sub.setLexiconFile(self.pkl)
        self.sub.addFile(self.fname)
        self.sub.parse()
        self.sub_assert(self.sub, lex=0, stem_lex=0,
                        words=1599, stem_words=1599, new_words=1599)