def test_CreateNewFile(self):
    vocab = vocab_utils.VocabStore(self.filename1)
    vocab.add_vocab_words(['abc', 'abc', 'def'])
    vocab.save_to_file()
    with open(self.filename2, 'w', encoding='utf8') as the_file:
        self._write_special_tokens(the_file)
        the_file.write('abc\n')
        the_file.write('def\n')
    self.assertTrue(filecmp.cmp(self.filename1, self.filename2))
def test_print_report(self):
    vocab = vocab_utils.VocabStore(self.filename1)
    vocab.add_vocab_words(['abc', 'abc', 'def'])
    vocab.save_to_file()

    def myprint(*arg):
        a = [str(elem) for elem in arg]
        print(' '.join(a))

    vocab.print_report(myprint)
def test_parse_h3(self):
    """Test H3"""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = phb.Parser(corpus_store, 'h3')
    parser.parse(self.htmlStruct)
    expected = [
        ['H3-A', 'body H3-A-1'],
        ['H3-A', 'body H3-A-1']
    ]
    self.assertListEqual(expected, corpus_store.data)
def test_parse_h1(self):
    """Test H1"""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = phb.Parser(corpus_store, 'h1')
    parser.parse(self.htmlStruct)
    expected = [
        ['H1-A', 'body H1-A-1\nbody H2-A-1\nbody H2-B-1\nbody H2-B-2\nbody H3-A-1\nbody H2-C-1'],
        ['H1-B', 'body H3-A-1\nbody H2-A-1']
    ]
    self.assertListEqual(expected, corpus_store.data)
def generate(output_dir):
    """ Generate the standard vocabulary file. """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    file_path = os.path.join(output_dir, 'vocab.src')

    # Store the loaded characters into VocabStore, then export the vocab file.
    vocab = vocab_utils.VocabStore(file_path)
    _add_standard_words(vocab)
    vocab.sort_by_unicode()
    vocab.save_to_file()
    vocab.print_report()
    # For analysis purposes, also export the vocabulary as Unicode code points.
    vocab.save_unicode_list(file_path + '.txt')
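# Usage sketch (assumption: the output path below is a hypothetical example).
# generate() creates the directory if needed, writes 'vocab.src' into it, and also
# writes a 'vocab.src.txt' code-point listing for analysis.
#
#   generate('/tmp/vocab_out')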
def test_AtomicParserJpn(self):
    """Test the targeted atomic parser with a Japanese sentence."""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = pat.Parser(corpus_store)
    parser.parse(self.html)
    expected = [
        ['test', 'ヘッダー1'],
        ['ヘッダー1', 'こんにちは。'],
        ['こんにちは。', 'さようなら。'],
    ]
    self.assertListEqual(expected, corpus_store.data)
    expected = ['test', 'ヘッダー', '1', 'こんにちは', '。', 'さようなら']
    self.assertListEqual(expected, vocab.words_new)
def test_AtomicHeaderBodyParser(self):
    """Test the targeted atomic parser that is going to be in actual use."""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = pah.Parser(corpus_store)
    parser.parse(self.htmlStruct)
    expected = [
        ['H4-A', 'body H4-A-1'],
        ['H1-A', 'body H1-A-1'],
        ['H1-A', 'H3-A'],
        ['H1-A', 'H2-A'],
        ['H3-A', 'body H3-A-1'],
        ['H2-A', 'body H2-A-1'],
        ['H1-B', 'body H1-B-1']
    ]
    self.assertListEqual(expected, corpus_store.data)
def test_AtomicHeaderBodyParserJpn(self):
    """Test the targeted atomic parser with a Japanese sentence."""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = pah.Parser(corpus_store)
    html = ('<html><head><title>test</title></head>'
            '<body>'
            '<h1>ヘッダー1</h1>'
            '<p>こんにちは。さようなら。</p>'
            '</body></html>')
    parser.parse(html)
    expected = [
        ['ヘッダー1', 'こんにちは。'],
        ['ヘッダー1', 'さようなら。']
    ]
    self.assertListEqual(expected, corpus_store.data)
def test_LoadExistingFile(self):
    # Create the original file before constructing VocabStore.
    with open(self.filename1, 'w', encoding='utf8') as the_file:
        self._write_special_tokens(the_file)
        the_file.write('abc\n')
        the_file.write('def\n')
    vocab = vocab_utils.VocabStore(self.filename1)
    vocab.add_vocab_words(['abc', '123'])
    vocab.save_to_file()
    with open(self.filename2, 'w', encoding='utf8') as the_file:
        self._write_special_tokens(the_file)
        the_file.write('abc\n')
        the_file.write('def\n')
        the_file.write('123\n')
    self.assertTrue(filecmp.cmp(self.filename1, self.filename2))
def test_parse_script(self):
    """Test that a script element within a font element is not parsed."""
    vocab = vocab_utils.VocabStore()
    corpus_store = corpus_utils.CorpusStore(vocab)
    parser = phb.Parser(corpus_store)
    html = ('<html><head><title>test</title></head>'
            '<body>'
            '<h1>H1-A</h1>'
            '<font>font element body'
            '<script>script shouldn\'t be parsed</script>'
            '</font>'
            '</body></html>')
    parser.parse(html)
    expected = [
        ['H1-A', 'font element body'],
    ]
    self.assertListEqual(expected, corpus_store.data)
def compile(input_path, vocab_path, output_dir):
    """ Compile the corpus files and generate a set of NMT data files (train/dev/test).
    input_path is the corpus data, either a folder path or a file path, given as an absolute path.
    vocab_path is the vocabulary file, either a folder path or a file path, given as an absolute path.
        If a folder path is given, the file name defaults to 'vocab.src'.
    output_dir is the path to the folder where the data set is generated.
    """
    # Create the output directory if it does not exist.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Create the vocab file directory if it does not exist, and resolve the file path.
    if not os.path.isfile(vocab_path):
        if not os.path.exists(vocab_path):
            os.makedirs(vocab_path)
        vocab_path = os.path.join(vocab_path, 'vocab.src')

    # Store the compilation details into a log file.
    with open(os.path.join(output_dir, 'compile.log'), 'w') as lf:
        def log_print(*arg):
            """ Log print function. """
            texts = [str(elem) for elem in arg]
            log = ' '.join(texts)
            print(log)
            timeString = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            lf.write("{0} {1}\n".format(timeString, log))

        if os.path.isfile(input_path):
            log_print("The input file is", input_path)
            log_print("The vocab file is", vocab_path)
            log_print("The output directory is", output_dir)
            files = [[input_path, os.path.basename(input_path)]]
        else:
            input_dir = input_path
            log_print("The input directory is", input_dir)
            log_print("The vocab file is", vocab_path)
            log_print("The output directory is", output_dir)
            log_print("Searching corpus files in the input directory...")
            files = file_utils.get_filelist_in_path("cor", input_dir, True)

        vocab = vocab_utils.VocabStore(vocab_path)
        corpus_store = corpus_utils.CorpusStore(vocab)
        log_print("Total", len(files), "files to process. Loading...")
        for idx, file in enumerate(files):
            f_abst = file[0]  # absolute path
            f_rel = file[1]   # relative path
            log_print("(", idx, "of", len(files), ") file", f_rel)
            # Import into the corpus store.
            # Don't restore the vocabulary here; it's time consuming. It is restored during export later on.
            corpus_store.import_corpus(f_abst, False)

        # Split the corpus data randomly into 3 blocks - train, dev and test.
        # The distribution ratio is train 98%, dev 1% and test 1%.
        # Be careful not to make the dev and test files too big, otherwise TensorFlow training
        # fails with an out-of-memory error (even on a GPU machine).
        train, dev, test = corpus_store.split_rnd((0.98, 0.01, 0.01))

        def process(corpus_store, subject, size_limit_KB=None):
            """ size_limit_KB is the limit of the file size to be written, in kilobytes (1024 bytes). """
            # Export the corpus data into a file. The vocabulary is also restored here.
            log_print("Exporting the", subject, "data into file...")
            corpus_store.export_to_file(output_dir, subject, size_limit_KB, True)
            corpus_store.print_report(log_print)

        # Generate each file set.
        process(train, "train")
        process(dev, "dev", 100)
        process(test, "test", 100)

        # Generate the vocabulary file that contains words detected in all 3 file sets.
        vocab.sort_by_unicode()
        vocab.save_to_file()
        vocab.print_report(log_print)
        vocab.save_unicode_list(vocab_path + '.txt')
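# Usage sketch (assumption: the paths below are hypothetical examples).
# compile() accepts either a single corpus file or a directory of '.cor' files as
# input_path, and either a vocab file path or a directory (in which case 'vocab.src'
# is assumed) as vocab_path. It writes the train/dev/test sets, the vocabulary file,
# and a compile.log into the output directory.
#
#   compile('/data/corpus', '/data/vocab', '/data/output')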