Exemplo n.º 1
0
    def _make_training_folder(self, bilingual_corpora, monolingual_corpora, domains, folder):
        for corpus in bilingual_corpora:
            dest_corpus = BilingualCorpus.make_parallel(domains[corpus.name], folder, corpus.langs)

            for lang in corpus.langs:
                os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

        for corpus in monolingual_corpora:
            dest_corpus = BilingualCorpus.make_parallel(corpus.name, folder, corpus.langs)

            for lang in corpus.langs:
                os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

        return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=folder)
Exemplo n.º 2
0
    def _make_training_folder(self, bilingual_corpora, monolingual_corpora, domains, folder):
        for corpus in bilingual_corpora:
            dest_corpus = BilingualCorpus.make_parallel(domains[corpus.name], folder, corpus.langs)

            for lang in corpus.langs:
                os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

        for corpus in monolingual_corpora:
            dest_corpus = BilingualCorpus.make_parallel(corpus.name, folder, corpus.langs)

            for lang in corpus.langs:
                os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

        return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=folder)
Exemplo n.º 3
0
    def build(self, corpora, working_dir='.', log_file=None):
        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)
        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        merged_corpus = BilingualCorpus.make_parallel('merge', working_dir, (self._source_lang, self._target_lang))

        fileutils.merge([corpus.get_file(self._source_lang) for corpus in corpora],
                        merged_corpus.get_file(self._source_lang))
        fileutils.merge([corpus.get_file(self._target_lang) for corpus in corpora],
                        merged_corpus.get_file(self._target_lang))

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Train model
            command = [self._build_bin,
                       '-s', merged_corpus.get_file(self._source_lang), '-t', merged_corpus.get_file(self._target_lang),
                       '-m', self._model, '-I', '4']
            shell.execute(command, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Exemplo n.º 4
0
    def train(self, corpora, aligner, working_dir='.', log_file=None):
        if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
            raise Exception('Model already exists at ' + self._model)

        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Prepare training folder
            for corpus in corpora:
                dest_corpus = BilingualCorpus.make_parallel(corpus.name, working_dir,
                                                            (self._source_lang, self._target_lang))
                source_file = corpus.get_file(self._source_lang)
                target_file = corpus.get_file(self._target_lang)

                os.symlink(source_file, dest_corpus.get_file(self._source_lang))
                os.symlink(target_file, dest_corpus.get_file(self._target_lang))

                aligner.align(corpus, os.path.join(working_dir, corpus.name + '.align'))

            # Build models
            command = [self._build_bin, '--input', working_dir, '--model', self._model,
                       '-s', self._source_lang, '-t', self._target_lang]
            shell.execute(command, stdout=log, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Exemplo n.º 5
0
    def build(self, corpora, working_dir='.', log_file=None):
        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)
        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        merged_corpus = BilingualCorpus.make_parallel(
            'merge', working_dir, (self._source_lang, self._target_lang))

        fileutils.merge(
            [corpus.get_file(self._source_lang) for corpus in corpora],
            merged_corpus.get_file(self._source_lang))
        fileutils.merge(
            [corpus.get_file(self._target_lang) for corpus in corpora],
            merged_corpus.get_file(self._target_lang))

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Train model
            command = [
                self._build_bin, '-s',
                merged_corpus.get_file(self._source_lang), '-t',
                merged_corpus.get_file(self._target_lang), '-m', self._model,
                '-I', '4'
            ]
            shell.execute(command, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Exemplo n.º 6
0
    def train(self, corpora, aligner, working_dir='.', log=None):
        if log is None:
            log = shell.DEVNULL

        if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
            raise Exception('Model already exists at ' + self._model)

        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        train_corpora = []  # Prepare training folder
        for corpus in corpora:
            dest_corpus = BilingualCorpus.make_parallel(corpus.name, working_dir,
                                                        (self._source_lang, self._target_lang))
            source_file = corpus.get_file(self._source_lang)
            target_file = corpus.get_file(self._target_lang)

            os.symlink(source_file, dest_corpus.get_file(self._source_lang))
            os.symlink(target_file, dest_corpus.get_file(self._target_lang))

            train_corpora.append(dest_corpus)

        # Align corpora
        aligner.align(train_corpora, working_dir, log=log)

        # Build models
        command = [self._build_bin, '--input', working_dir, '--model', self._model,
                   '-s', self._source_lang, '-t', self._target_lang]
        shell.execute(command, stdout=log, stderr=log)
Exemplo n.º 7
0
    def build(self, corpora, working_dir='.', log=None):
        if log is None:
            log = shell.DEVNULL

        shutil.rmtree(self._model, ignore_errors=True)
        fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        merged_corpus = BilingualCorpus.make_parallel(
            'merge', working_dir, (self._source_lang, self._target_lang))

        fileutils.merge(
            [corpus.get_file(self._source_lang) for corpus in corpora],
            merged_corpus.get_file(self._source_lang))
        fileutils.merge(
            [corpus.get_file(self._target_lang) for corpus in corpora],
            merged_corpus.get_file(self._target_lang))

        command = [
            self._build_bin, '-s',
            merged_corpus.get_file(self._source_lang), '-t',
            merged_corpus.get_file(self._target_lang), '-m', self._model, '-I',
            '4'
        ]
        shell.execute(command, stdout=log, stderr=log)
Exemplo n.º 8
0
    def process(self, corpora, dest_folder, print_tags=True, print_placeholders=False, original_spacing=False):
        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

                self.__process_file(source, dest, lang, print_tags, print_placeholders, original_spacing)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 9
0
    def process_corpora(self, corpora, dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

                self.process_file(source, dest, lang)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 10
0
    def process_corpora(self, corpora, dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

                self.process_file(source, dest, lang)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 11
0
    def encode(self, corpora, dest_folder):
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest_file = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang]).get_file(lang)

                self.encode_file(source, dest_file, delete_nl=True)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 12
0
    def encode(self, corpora, dest_folder):
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest_file = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang]).get_file(lang)

                self.encode_file(source, dest_file, delete_nl=True)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 13
0
    def train(self, corpora, aligner, working_dir='.', log_file=None):
        if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
            raise Exception('Model already exists at ' + self._model)

        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Prepare training folder
            for corpus in corpora:
                dest_corpus = BilingualCorpus.make_parallel(
                    corpus.name, working_dir,
                    (self._source_lang, self._target_lang))
                source_file = corpus.get_file(self._source_lang)
                target_file = corpus.get_file(self._target_lang)

                os.symlink(source_file,
                           dest_corpus.get_file(self._source_lang))
                os.symlink(target_file,
                           dest_corpus.get_file(self._target_lang))

                aligner.align(
                    corpus, os.path.join(working_dir, corpus.name + '.align'))

            # Build models
            command = [
                self._build_bin, '--input', working_dir, '--model',
                self._model, '-s', self._source_lang, '-t', self._target_lang
            ]
            shell.execute(command, stdout=log, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Exemplo n.º 14
0
    def train(self, corpora, aligner, working_dir='.', log_file=None):
        if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
            raise Exception('Model already exists at ' + self._model)

        if not os.path.isdir(self._model):
            fileutils.makedirs(self._model, exist_ok=True)

        if not os.path.isdir(working_dir):
            fileutils.makedirs(working_dir, exist_ok=True)

        l1 = self._source_lang
        l2 = self._target_lang
        langs = (l1, l2)
        langs_suffix = l1 + '-' + l2

        mct_base = self._get_model_basename()
        dmp_file = mct_base + '.dmp'
        mam_file = mct_base + '.' + langs_suffix + '.mam'
        lex_file = mct_base + '.' + langs_suffix + '.lex'

        log = shell.DEVNULL

        try:
            if log_file is not None:
                log = open(log_file, 'a')

            # Clean corpus for training
            clean_output = os.path.join(working_dir, 'clean_corpora')
            fileutils.makedirs(clean_output, exist_ok=True)
            corpora = self._cleaner.clean(corpora, clean_output, (self._source_lang, self._target_lang))

            # Create merged corpus and domains list file (dmp)
            merged_corpus = BilingualCorpus.make_parallel(os.path.basename(mct_base), working_dir, langs)

            fileutils.merge([corpus.get_file(l1) for corpus in corpora], merged_corpus.get_file(l1))
            fileutils.merge([corpus.get_file(l2) for corpus in corpora], merged_corpus.get_file(l2))
            with open(dmp_file, 'w') as dmp:
                for corpus in corpora:
                    dmp.write(str(corpus.name) + ' ' + str(corpus.count_lines()) + '\n')

            # Create alignments in 'bal' file and symmetrize
            bal_file = aligner.align(merged_corpus, langs, self._model, working_dir, log_file)

            symal_file = os.path.join(working_dir, 'alignments.' + langs_suffix + '.symal')
            symal_command = [self._symal_bin, '-a=g', '-d=yes', '-f=yes', '-b=yes']
            with open(bal_file) as stdin:
                with open(symal_file, 'w') as stdout:
                    shell.execute(symal_command, stdin=stdin, stdout=stdout, stderr=log)

            # Execute mtt-build
            mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l1)
            with open(merged_corpus.get_file(l1)) as stdin:
                shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

            mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l2)
            with open(merged_corpus.get_file(l2)) as stdin:
                shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

            # Create 'mam' file
            mam_command = [self._symal2mam_bin, mam_file]
            with open(symal_file) as stdin:
                shell.execute(mam_command, stdin=stdin, stdout=log, stderr=log)

            # Create 'lex' file
            lex_command = [self._mmlexbuild_bin, mct_base + '.', l1, l2, '-o', lex_file]
            shell.execute(lex_command, stdout=log, stderr=log)
        finally:
            if log_file is not None:
                log.close()
Exemplo n.º 15
0
    def tune(self,
             corpora=None,
             debug=False,
             context_enabled=True,
             random_seeds=False,
             max_iterations=25,
             early_stopping_value=None):
        if corpora is None:
            corpora = BilingualCorpus.list(
                os.path.join(self.engine.data_path,
                             TrainingPreprocessor.DEV_FOLDER_NAME))

        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        corpora = [
            corpus for corpus in corpora
            if source_lang in corpus.langs and target_lang in corpus.langs
        ]
        if len(corpora) == 0:
            raise IllegalArgumentException(
                'No %s > %s corpora found into specified path' %
                (source_lang, target_lang))

        source_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [source_lang]) for corpus in corpora
        ]
        reference_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [target_lang]) for corpus in corpora
        ]

        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization
            tokenizer = Tokenizer(target_lang)
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(
                    reference_corpora, tokenized_output)

            # Create merged corpus
            with cmdlogger.step('Merging corpus') as _:
                # source
                source_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + source_lang)

                with open(source_merged_corpus, 'wb') as out:
                    for corpus in source_corpora:
                        out.write(corpus.get_file(source_lang) + '\n')

                # target
                target_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + target_lang)
                fileutils.merge([
                    corpus.get_file(target_lang)
                    for corpus in reference_corpora
                ], target_merged_corpus)

            # Run MERT algorithm
            with cmdlogger.step('Tuning') as _:
                # Start MERT
                decoder_flags = ['--port', str(self.api.port)]

                if self.api.root is not None:
                    decoder_flags += ['--root', self.api.root]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [
                        self._mert_script, source_merged_corpus,
                        target_merged_corpus, self._mert_i_script,
                        runtime_moses_ini.name, '--threads',
                        str(multiprocessing.cpu_count()), '--mertdir',
                        cli.BIN_DIR, '--mertargs',
                        '\'--binary --sctype BLEU\'', '--working-dir', mert_wd,
                        '--nbest', '100', '--decoder-flags',
                        '"' + ' '.join(decoder_flags) + '"', '--nonorm',
                        '--closest', '--no-filter-phrase-table'
                    ]

                    if early_stopping_value is not None:
                        command += [
                            '--bleuscorer', self._scorer_script,
                            '--bleuscorer-flags "-nt" --early-stopping-value %d'
                            % early_stopping_value
                        ]

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(
                            num=max_iterations))

                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command),
                                      stdout=log,
                                      stderr=log)

            # Read optimized configuration
            with cmdlogger.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [
                                float(val) for val in tokens[1:]
                            ]
                        elif line.startswith('# BLEU'):
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                _ = self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir("tuning")
Exemplo n.º 16
0
    def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False, max_iterations=25):
        if corpora is None:
            corpora = BilingualCorpus.list(os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if not self.is_running():
            raise IllegalStateException('No MMT Server running, start the engine first')

        tokenizer = Tokenizer()

        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                          for corpus in corpora]
        reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                             for corpus in corpora]

        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

            # Create merged corpus
            with cmdlogger.step('Merging corpus') as _:
                # source
                source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

                with open(source_merged_corpus, 'wb') as out:
                    for corpus in source_corpora:
                        out.write(corpus.get_file(source_lang) + '\n')

                # target
                target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora], target_merged_corpus)

            # Run MERT algorithm
            with cmdlogger.step('Tuning') as _:
                # Start MERT
                decoder_flags = ['--port', str(self.api.port)]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                               self._mert_i_script, runtime_moses_ini.name, '--threads',
                               str(multiprocessing.cpu_count()), '--mertdir', cli.BIN_DIR,
                               '--mertargs', '\'--binary --sctype BLEU\'', '--working-dir', mert_wd, '--nbest', '100',
                               '--decoder-flags', '"' + ' '.join(decoder_flags) + '"', '--nonorm', '--closest',
                               '--no-filter-phrase-table']

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(num=max_iterations))

                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command), stdout=log, stderr=log)

            # Read optimized configuration
            with cmdlogger.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                        elif line.startswith('# BLEU'):
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                _ = self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir()