Exemplo n.º 1
0
    def _translate_with(self, translator, corpora, working_dir,
                        expected_segments):
        """Translate *corpora* with *translator* and collect the outcome.

        Returns a ``self._Entry`` whose ``translation_file`` points at the
        concatenated output and whose ``translation_time`` is the average
        seconds per segment; on failure ``error`` is set instead — this
        method never raises.
        """
        result = self._Entry(translator)

        translations_path = os.path.join(working_dir, 'translations',
                                         result.id)
        osutils.makedirs(translations_path, exist_ok=True)

        try:
            begin_time = time.time()
            segments_count = translator.translate_corpora(
                corpora, translations_path)
            # Average per-segment time, reported later in the speed summary.
            result.translation_time = (time.time() -
                                       begin_time) / float(segments_count)

            if expected_segments != segments_count:
                raise TranslateError(
                    'Invalid line count for translator %s: expected %d, found %d.'
                    % (translator.name, expected_segments, segments_count))

            # Concatenate the per-corpus outputs into a single target file.
            result.translation_file = os.path.join(
                working_dir, result.id + '.' + self._target_lang)
            osutils.concat([
                os.path.join(translations_path,
                             corpus.name + '.' + self._target_lang)
                for corpus in corpora
            ], result.translation_file)
        except TranslateError as e:
            result.error = e
        except Exception as e:
            # BUG FIX: 'e.message' is deprecated in Python 2.6+ and removed
            # in Python 3; str(e) renders the exception message portably.
            result.error = TranslateError('Unexpected ERROR: ' + str(e))

        return result
Exemplo n.º 2
0
 def _get_tempdir(self, name, delete_if_exists=False):
     path = os.path.join(self._temp_dir, name)
     if delete_if_exists:
         shutil.rmtree(path, ignore_errors=True)
     if not os.path.isdir(path):
         osutils.makedirs(path, exist_ok=True)
     return path
Exemplo n.º 3
0
    def get_logfile(self, name, ensure=True, append=False):
        """Return the path of the '<name>.log' file in the logs folder.

        With *ensure* the logs directory is created on demand and, unless
        *append* is requested, a stale log file is removed first.
        """
        if ensure and not os.path.isdir(self.logs_path):
            osutils.makedirs(self.logs_path, exist_ok=True)

        path = os.path.join(self.logs_path, '%s.log' % name)

        # Start from a clean file unless the caller wants to append.
        if ensure and not append and os.path.isfile(path):
            os.remove(path)

        return path
Exemplo n.º 4
0
    def get_tempdir(self, name, ensure=True):
        """Return a per-name temporary directory under the temp root.

        With *ensure* the temp root is created on demand and the named
        directory is reset to a fresh, empty state.
        """
        if ensure and not os.path.isdir(self.temp_path):
            osutils.makedirs(self.temp_path, exist_ok=True)

        path = os.path.join(self.temp_path, name)

        if ensure:
            # Recreate from scratch so stale content never leaks through.
            shutil.rmtree(path, ignore_errors=True)
            os.makedirs(path)

        return path
Exemplo n.º 5
0
    def train_model(self,
                    train_dir,
                    output_dir,
                    batch_size=1024,
                    n_train_steps=None,
                    n_eval_steps=1000,
                    hparams='transformer_base',
                    log=None,
                    fromModel=None):
        """Run the external 't2t-trainer' on the data prepared in *train_dir*.

        Checkpoints are written to *output_dir*. When *n_train_steps* is
        None the trainer runs effectively unbounded (100000000 steps).
        *log* is a file object receiving trainer stdout/stderr (defaults
        to the null device). *fromModel*, when given, is a directory with
        an existing checkpoint to warm-start from.

        Raises ShellError when the trainer exits with a non-zero code.
        """
        if log is None:
            log = osutils.DEVNULL

        if not os.path.isdir(output_dir):
            osutils.makedirs(output_dir)

        # if an existing checkpoint is loaded for starting the training (i.e fromModel != None)
        # copy the checkpoint files into the right location
        if fromModel is not None:
            self._copy_and_fix_model(fromModel, output_dir, gpus=self._gpus)

        data_dir = os.path.join(train_dir, 'data')

        # Link the sub-token vocabulary next to the checkpoints so the
        # trained model directory is self-contained.
        src_model_vocab = os.path.join(data_dir, 'model.vcb')
        tgt_model_vocab = os.path.join(output_dir, 'model.vcb')

        if not os.path.isfile(tgt_model_vocab):
            os.symlink(src_model_vocab, tgt_model_vocab)

        env = self._get_env()
        hparams_p = 'batch_size=%d' % batch_size
        command = [
            't2t-trainer', '--t2t_usr_dir', self._t2t_dir,
            '--data_dir=%s' % data_dir, '--problem=translate_mmt',
            '--model=transformer',
            '--hparams_set=%s' % hparams,
            '--output_dir=%s' % output_dir,
            '--local_eval_frequency=%d' % n_eval_steps,
            '--train_steps=%d' %
            (n_train_steps if n_train_steps is not None else 100000000),
            '--worker_gpu=%d' % len(self._gpus), '--hparams', hparams_p
        ]

        # Run in background so we keep control of the process handle and
        # can kill it on interrupt.
        process = osutils.shell_exec(command,
                                     stdout=log,
                                     stderr=log,
                                     env=env,
                                     background=True)

        try:
            return_code = process.wait()
            if return_code != 0:
                raise ShellError(' '.join(command), return_code, None)
        except KeyboardInterrupt:
            # On Ctrl-C make sure the background trainer does not outlive us.
            process.kill()
Exemplo n.º 6
0
    def process_corpora(self, corpora, output_folder):
        """Process every file of every corpus into *output_folder*.

        Each corpus keeps its name and language set; the processed
        corpora are re-listed from the output folder and returned.
        """
        osutils.makedirs(output_folder, exist_ok=True)

        for source_corpus in corpora:
            destination = BilingualCorpus.make_parallel(
                source_corpus.name, output_folder, source_corpus.langs)

            for lang in source_corpus.langs:
                self.process_file(source_corpus.get_file(lang),
                                  destination.get_file(lang), lang)

        return BilingualCorpus.list(self._source_lang, self._target_lang,
                                    output_folder)
Exemplo n.º 7
0
    def build(self, corpora, log=None):
        """Build the model from *corpora*, recreating the model folder.

        All input corpora must live in a single folder, which is passed
        to the external build binary.
        """
        if log is None:
            log = osutils.DEVNULL

        # Always start from an empty model directory.
        shutil.rmtree(self._model, ignore_errors=True)
        osutils.makedirs(self._model, exist_ok=True)

        # All input corpora are expected to share one parent folder.
        folders = {corpus.get_folder() for corpus in corpora}
        assert len(folders) == 1
        input_path = folders.pop()

        command = [
            self._build_bin, '-s', self._source_lang, '-t', self._target_lang,
            '-i', input_path, '-m', self._model, '-I', '4'
        ]
        osutils.shell_exec(command, stdout=log, stderr=log)
Exemplo n.º 8
0
    def encode(self, corpora, dest_folder):
        """Encode every corpus file into *dest_folder*.

        Returns the list of encoded parallel corpora, one per input
        corpus, preserving names and language sets.
        """
        if not os.path.isdir(dest_folder):
            osutils.makedirs(dest_folder, exist_ok=True)

        encoded = []
        for corpus in corpora:
            target = BilingualCorpus.make_parallel(corpus.name, dest_folder,
                                                   corpus.langs)

            for lang in corpus.langs:
                self.encode_file(corpus.get_file(lang),
                                 target.get_file(lang),
                                 delete_nl=True)

            encoded.append(target)

        return encoded
Exemplo n.º 9
0
    def prepare_data(self,
                     train_corpora,
                     eval_corpora,
                     output_path,
                     log=None,
                     bpe_symbols=2**15,
                     fromModel=None):
        """Run 't2t-datagen' to prepare training/eval data under *output_path*.

        Creates '<output_path>/data' (wiped first) and '<output_path>/tmp'.
        *bpe_symbols* is the target sub-token vocabulary size, passed via
        the environment. *log* is a file object receiving datagen
        stdout/stderr (defaults to the null device).
        """
        if log is None:
            log = osutils.DEVNULL

        data_dir = os.path.join(output_path, 'data')
        tmp_dir = os.path.join(output_path, 'tmp')

        # Both corpora sets must each live in a single common folder.
        train_path = self._get_common_root(train_corpora)
        eval_path = self._get_common_root(eval_corpora)

        # Always regenerate the data directory from scratch.
        shutil.rmtree(data_dir, ignore_errors=True)
        osutils.makedirs(data_dir)

        # if an existing checkpoint is loaded for starting the training (i.e fromModel!=None)
        # copy the subtoken vocabulary associated to the existing checkpoint into the right location,
        # so that the subtoken vocabulary is not re-created from the new training data,
        # and so that it is only exploited to bpe-fy the new data
        # it assumes that the vocabulary is called "model.vcb" and is located in the same directory of the checkpoint
        if fromModel is not None:
            shutil.copyfile(os.path.join(fromModel, 'model.vcb'),
                            os.path.join(data_dir, 'model.vcb'))

        if not os.path.isdir(tmp_dir):
            osutils.makedirs(tmp_dir)

        env = self._get_env(train_path, eval_path, bpe=bpe_symbols)
        command = [
            't2t-datagen', '--t2t_usr_dir', self._t2t_dir,
            '--data_dir=%s' % data_dir,
            '--tmp_dir=%s' % tmp_dir, '--problem=translate_mmt'
        ]

        osutils.shell_exec(command, stdout=log, stderr=log, env=env)
Exemplo n.º 10
0
 def get_tempfile(self, name, ensure=True):
     """Return the path for a temp file named *name*.

     With *ensure* the temp root directory is created when missing; the
     file itself is never created.
     """
     root = self.temp_path
     if ensure and not os.path.isdir(root):
         osutils.makedirs(root, exist_ok=True)
     return os.path.join(root, name)
Exemplo n.º 11
0
    def evaluate(self, corpora, heval_output=None, debug=False):
        """Evaluate all configured translators on the given test corpora.

        NOTE(review): Python 2 code (print statements, basestring).

        Filters *corpora* down to the engine language pair, translates
        the concatenated test set with every translator, scores each
        output with every scorer and prints a comparison report (scores
        plus per-sentence speed). When *heval_output* is given, the
        source/reference/translation files are also written there for
        human evaluation. Unless *debug* is set, the temporary working
        directory is cleared at the end.

        Raises IllegalArgumentException when no corpus covers the
        engine's language pair.
        """
        # Keep only corpora that contain both engine languages.
        corpora = [
            corpus for corpus in corpora if self._source_lang in corpus.langs
            and self._target_lang in corpus.langs
        ]
        if len(corpora) == 0:
            raise IllegalArgumentException(
                'No %s > %s corpora found into specified path' %
                (self._source_lang, self._target_lang))

        print '\n============== EVALUATION ==============\n'
        print 'Testing on %d lines:\n' % sum(
            [corpus.count_lines() for corpus in corpora])

        if heval_output is not None:
            osutils.makedirs(heval_output, exist_ok=True)

        step_logger = _StepLogger()
        human_eval_outputter = HumanEvaluationFileOutputter(
        ) if heval_output is not None else None

        working_dir = self._engine.get_tempdir('evaluation')

        try:
            # Process references
            with step_logger.step('Preparing corpora') as _:
                # Concatenate all source-side files into a single input.
                source = os.path.join(working_dir,
                                      'source.' + self._source_lang)
                osutils.concat(
                    [corpus.get_file(self._source_lang) for corpus in corpora],
                    source)

                # Concatenate the target side, then XML-encode it via a
                # temporary file.
                reference = os.path.join(working_dir,
                                         'reference.' + self._target_lang)
                osutils.concat(
                    [corpus.get_file(self._target_lang) for corpus in corpora],
                    reference + '.tmp')
                XMLEncoder().encode_file(reference + '.tmp', reference)
                os.remove(reference + '.tmp')

                if human_eval_outputter is not None:
                    human_eval_outputter.write(lang=self._target_lang,
                                               input_file=reference,
                                               output_file=os.path.join(
                                                   heval_output, 'reference.' +
                                                   self._target_lang))
                    human_eval_outputter.write(lang=self._source_lang,
                                               input_file=source,
                                               output_file=os.path.join(
                                                   heval_output, 'source.' +
                                                   self._source_lang))

                # Reference line count is the expected segment count for
                # every translator run below.
                total_line_count = osutils.lc(reference)

            # Translate
            entries = []
            for translator in self._translators:
                with step_logger.step('Translating with %s' %
                                      translator.name) as _:
                    entry = self._translate_with(translator, corpora,
                                                 working_dir, total_line_count)
                    entries.append(entry)

                    if entry.error is None and human_eval_outputter is not None:
                        human_eval_file = os.path.join(
                            heval_output,
                            os.path.basename(entry.translation_file))
                        human_eval_outputter.write(
                            lang=self._target_lang,
                            input_file=entry.translation_file,
                            output_file=human_eval_file)

            # Scoring
            for scorer in self._scorers:
                with step_logger.step('Calculating %s' % scorer.name()) as _:
                    for entry in entries:
                        if entry.error is not None:
                            continue
                        try:
                            entry.scores[scorer] = scorer.calculate(
                                entry.translation_file, reference)
                        except Exception as e:
                            # A failed scorer stores its message in place
                            # of the numeric score.
                            entry.scores[scorer] = str(e)

            # Print results
            print '\n=============== RESULTS ================\n'

            for scorer in self._scorers:
                print scorer.name() + ':'

                # Highest score first; failed entries sort as 0.
                for i, entry in enumerate(
                        sorted(entries,
                               key=lambda x: x.scores[scorer]
                               if x.error is None else 0,
                               reverse=True)):
                    if entry.error is None:
                        value = entry.scores[scorer]
                        # A string score is a scorer failure message.
                        if isinstance(value, basestring):
                            text = value
                        else:
                            text = '%.2f' % (value * 100)
                            if i == 0:
                                text += ' (Winner)'
                    else:
                        text = str(entry.error)

                    print '  %s: %s' % (entry.translator.name.ljust(20), text)
                print

            # Fastest first; failed entries sort last (infinite time).
            print 'Translation Speed:'
            for entry in sorted(entries,
                                key=lambda x: x.translation_time
                                if x.error is None else float('inf')):
                if entry.error is None:
                    text = '%.2fs per sentence' % entry.translation_time
                else:
                    text = str(entry.error)

                print '  %s: %s' % (entry.translator.name.ljust(20), text)
            print
        finally:
            if not debug:
                self._engine.clear_tempdir('evaluation')