Example #1
    def test_CreateNewFile(self):
        vocab = vocab_utils.VocabStore(self.filename1)
        vocab.add_vocab_words(['abc', 'abc', 'def'])
        vocab.save_to_file()

        with open(self.filename2, 'w', encoding='utf8') as the_file:
            self._write_special_tokens(the_file)
            the_file.write('abc\n')
            the_file.write('def\n')
        self.assertTrue(filecmp.cmp(self.filename1, self.filename2))
Example #2
    def test_print_report(self):
        vocab = vocab_utils.VocabStore(self.filename1)
        vocab.add_vocab_words(['abc', 'abc', 'def'])
        vocab.save_to_file()

        def myprint(*args):
            # Join all arguments into one space-separated line, mirroring the built-in print().
            texts = [str(elem) for elem in args]
            print(' '.join(texts))

        vocab.print_report(myprint)
Example #3
    def test_parse_h3(self):
        """Test H3"""
        vocab = vocab_utils.VocabStore()
        corpus_store = corpus_utils.CorpusStore(vocab)
        parser = phb.Parser(corpus_store, 'h3')
        parser.parse(self.htmlStruct)

        expected = [
            ['H3-A', 'body H3-A-1'],
            ['H3-A', 'body H3-A-1']
        ]
        self.assertListEqual(expected, corpus_store.data)
Example #4
    def test_parse_h1(self):
        """Test H1"""
        vocab = vocab_utils.VocabStore()
        corpus_store = corpus_utils.CorpusStore(vocab)
        parser = phb.Parser(corpus_store, 'h1')
        parser.parse(self.htmlStruct)

        expected = [
            ['H1-A', 'body H1-A-1\nbody H2-A-1\nbody H2-B-1\nbody H2-B-2\nbody H3-A-1\nbody H2-C-1'],
            ['H1-B', 'body H3-A-1\nbody H2-A-1']
        ]
        self.assertListEqual(expected, corpus_store.data)
Example #5
def generate(output_dir):
    """ Generate the standard vocaburary file
    """
    if not os.path.exists(output_dir): os.makedirs(output_dir)

    file_path = os.path.join(output_dir, 'vocab.src')
    # Store the loaded characters into VocabStore, then export the vocab file.
    vocab = vocab_utils.VocabStore(file_path)
    _add_standard_words(vocab)
    vocab.sort_by_unicode()
    vocab.save_to_file()
    vocab.print_report()
    # For analysis purposes, also export the vocabulary as Unicode code points.
    vocab.save_unicode_list(file_path + '.txt')
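A minimal usage sketch for generate(); the output directory name below is a hypothetical placeholder, not taken from the project:

if __name__ == '__main__':
    # 'dataset' is a hypothetical output folder; generate() creates it if needed,
    # then writes vocab.src and the analysis file vocab.src.txt inside it.
    generate('dataset')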
Example #6
    def test_AtomicParserJpn(self):
        """Test the targetted atomic parser with Japanese sentence."""
        vocab = vocab_utils.VocabStore()
        corpus_store = corpus_utils.CorpusStore(vocab)
        parser = pat.Parser(corpus_store)
        parser.parse(self.html)

        expected = [
            ['test', 'ヘッダー1'],
            ['ヘッダー1', 'こんにちは。'],
            ['こんにちは。', 'さようなら。'],
        ]
        self.assertListEqual(expected, corpus_store.data)

        expected = ['test', 'ヘッダー', '1', 'こんにちは', '。', 'さようなら']
        self.assertListEqual(expected, vocab.words_new)
Example #7
    def test_AtomicHeaderBodyParser(self):
        """Test the targetted atomic parser which is going to be in the actual use."""
        vocab = vocab_utils.VocabStore()
        corpus_store = corpus_utils.CorpusStore(vocab)
        parser = pah.Parser(corpus_store)
        parser.parse(self.htmlStruct)

        expected = [
            ['H4-A', 'body H4-A-1'],
            ['H1-A', 'body H1-A-1'],
            ['H1-A', 'H3-A'],
            ['H1-A', 'H2-A'],
            ['H3-A', 'body H3-A-1'],
            ['H2-A', 'body H2-A-1'],
            ['H1-B', 'body H1-B-1']
        ]
        self.assertListEqual(expected, corpus_store.data)
Example #8
    def test_AtomicHeaderBodyParserJpn(self):
        """Test the targetted atomic parser with Japanese sentence."""
        vocab = vocab_utils.VocabStore()
        corpus_store = corpus_utils.CorpusStore(vocab)
        parser = pah.Parser(corpus_store)
        html = ('<html><head><title>test</title></head>'
            '<body>'
            '<h1>ヘッダー1</h1>'
            '<p>こんにちは。さようなら。</p>'
            '</body></html>')
        parser.parse(html)

        expected = [
            ['ヘッダー1', 'こんにちは。'],
            ['ヘッダー1', 'さようなら。']
        ]
        self.assertListEqual(expected, corpus_store.data)
Example #9
    def test_LoadExistingFile(self):
        # Create the original file before constructing the VocabStore.
        with open(self.filename1, 'w', encoding='utf8') as the_file:
            self._write_special_tokens(the_file)
            the_file.write('abc\n')
            the_file.write('def\n')

        vocab = vocab_utils.VocabStore(self.filename1)
        vocab.add_vocab_words(['abc', '123'])
        vocab.save_to_file()

        with open(self.filename2, 'w', encoding='utf8') as the_file:
            self._write_special_tokens(the_file)
            the_file.write('abc\n')
            the_file.write('def\n')
            the_file.write('123\n')

        self.assertTrue(filecmp.cmp(self.filename1, self.filename2))
Example #10
    def test_parse_script(self):
        """Test script element within a font element"""
        vocab = vocab_utils.VocabStore()
        corpus_store = corpus_utils.CorpusStore(vocab)
        parser = phb.Parser(corpus_store)

        html = ('<html><head><title>test</title></head>'
            '<body>'
            '<h1>H1-A</h1>'
            '<font>font element body'
            '<script>script shouldn\'t be parsed</script>'
            '</font>'
            '</body></html>')

        parser.parse(html)

        expected = [
            ['H1-A', 'font element body'],
        ]
        self.assertListEqual(expected, corpus_store.data)
Example #11
def compile(input_path, vocab_path, output_dir):
    """ Compile the corpus files and generate a set of NMT data files (train/dev/test).
        input_path is the corpus data, either a folder path or file path, both in absolute path.
        vocab_path is the vocaburary file, either a folder path or file path, both in absolute path. 
        If folder path is given, the file name defaults 'vocab.src'.
        output_dir is the path to the folder where the data set is generated.
    """
    # Create the output directory if it does not exist.
    if not os.path.exists(output_dir): os.makedirs(output_dir)

    # Create the vocab file directory if it does not exist, and resolve the file path.
    if not os.path.isfile(vocab_path):
        if not os.path.exists(vocab_path): os.makedirs(vocab_path)
        vocab_path = os.path.join(vocab_path, 'vocab.src')

    # Store the compilation details in a log file.
    with open(os.path.join(output_dir, 'compile.log'), 'w') as lf:

        def log_print(*arg):
            """ Log print function. """
            texts = [str(elem) for elem in arg]
            log = ' '.join(texts)
            print(log)
            timeString = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            lf.write("{0} {1}\n".format(timeString, log))

        if os.path.isfile(input_path):
            log_print("The input file is", input_path)
            log_print("The vocab file is", vocab_path)
            log_print("The output directory is", output_dir)
            files = [[input_path, os.path.basename(input_path)]]
        else:
            input_dir = input_path
            log_print("The input directory is", input_dir)
            log_print("The vocab file is", vocab_path)
            log_print("The output directory is", output_dir)
            log_print("Searching corpus files in the input directory...")
            files = file_utils.get_filelist_in_path("cor", input_dir, True)

        vocab = vocab_utils.VocabStore(vocab_path)
        corpus_store = corpus_utils.CorpusStore(vocab)
        log_print("Total", len(files), "files to process. Loading...")
        for idx, file in enumerate(files):
            f_abst = file[0]  # absolute path
            f_rel = file[1]  # relative path
            log_print("(", idx, "of", len(files), ") file", f_rel)
            # Import and restore corpus store.
            # Don't restore the vocabulary here; it is time consuming. It will be restored during export later on.
            corpus_store.import_corpus(f_abst, False)

        # Split the corpus data randomly into 3 blocks - train, dev and test.
        # The distribution ratio is train 98%, dev 1% and test 1%.
        # Be careful not to make the dev and test files too big, otherwise TensorFlow training
        # fails with out-of-memory errors (even on a GPU machine).
        train, dev, test = corpus_store.split_rnd((0.98, 0.01, 0.01))

        def process(corpus_store, subject, size_limit_KB=None):
            """ size_limit_KB is the limit of file size to be written. The size is in Kilo bite (1024 bytes)
            """
            # Export the corpus data into a file. The vocabulary is also restored here.
            log_print("Exporting the", subject, "data into file...")
            corpus_store.export_to_file(output_dir, subject, size_limit_KB,
                                        True)
            corpus_store.print_report(log_print)

        # Generate each file set
        process(train, "train")
        process(dev, "dev", 100)
        process(test, "test", 100)

        # Generate the vocabulary file containing the words detected across all three data sets.
        vocab.sort_by_unicode()
        vocab.save_to_file()
        vocab.print_report(log_print)
        vocab.save_unicode_list(vocab_path + '.txt')
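A minimal invocation sketch for compile(); the absolute paths below are hypothetical placeholders, and per the docstring the first two arguments may be either folder or file paths:

if __name__ == '__main__':
    # Hypothetical paths for illustration only.
    compile('/data/corpus',        # folder of corpus (.cor) files, or a single corpus file
            '/data/vocab',         # folder where vocab.src is created, or a direct vocab file path
            '/data/nmt_dataset')   # folder receiving the train/dev/test files and compile.log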