def _translate_with(self, translator, corpora, working_dir, expected_segments):
    """Translate *corpora* with a single translator and collect the outcome.

    Returns an ``_Entry`` holding either the per-segment translation time and
    the path of the concatenated translation file, or the error that occurred.
    This method never raises: any failure is stored in the returned entry's
    ``error`` field so the evaluation of other translators can proceed.
    """
    result = self._Entry(translator)

    translations_path = os.path.join(working_dir, 'translations', result.id)
    osutils.makedirs(translations_path, exist_ok=True)

    try:
        begin_time = time.time()
        segments_count = translator.translate_corpora(corpora, translations_path)
        elapsed = time.time() - begin_time

        # Validate the segment count BEFORE computing the per-segment time:
        # the original code divided first, so a zero segment count raised a
        # ZeroDivisionError (reported as "Unexpected ERROR") instead of the
        # intended TranslateError.
        if expected_segments != segments_count:
            raise TranslateError('Invalid line count for translator %s: expected %d, found %d.'
                                 % (translator.name, expected_segments, segments_count))

        result.translation_time = elapsed / float(segments_count) if segments_count > 0 else 0.

        result.translation_file = os.path.join(working_dir, result.id + '.' + self._target_lang)
        osutils.concat([os.path.join(translations_path, corpus.name + '.' + self._target_lang)
                        for corpus in corpora], result.translation_file)
    except TranslateError as e:
        result.error = e
    except Exception as e:
        # str(e) instead of the deprecated (PEP 352) and often-empty e.message
        result.error = TranslateError('Unexpected ERROR: ' + str(e))

    return result
def _get_tempdir(self, name, delete_if_exists=False): path = os.path.join(self._temp_dir, name) if delete_if_exists: shutil.rmtree(path, ignore_errors=True) if not os.path.isdir(path): osutils.makedirs(path, exist_ok=True) return path
def get_logfile(self, name, ensure=True, append=False):
    """Return the path of the log file for *name*, optionally resetting it.

    With ensure=True the logs folder is created when missing and, unless
    append is requested, a stale log file with the same name is removed so
    the caller starts with a fresh file.
    """
    if ensure and not os.path.isdir(self.logs_path):
        osutils.makedirs(self.logs_path, exist_ok=True)

    logfile = os.path.join(self.logs_path, name + '.log')

    # Remove a leftover file from a previous run unless the caller appends
    if ensure and not append and os.path.isfile(logfile):
        os.remove(logfile)

    return logfile
def get_tempdir(self, name, ensure=True):
    """Return the path of the named temp folder.

    With ensure=True the folder is recreated empty (any previous content is
    deleted); with ensure=False the path is returned as-is, whether or not
    it exists.
    """
    if ensure and not os.path.isdir(self.temp_path):
        osutils.makedirs(self.temp_path, exist_ok=True)

    folder = os.path.join(self.temp_path, name)

    if ensure:
        # Wipe any leftover content so the caller starts from scratch
        shutil.rmtree(folder, ignore_errors=True)
        os.makedirs(folder)

    return folder
def train_model(self, train_dir, output_dir, batch_size=1024, n_train_steps=None, n_eval_steps=1000,
                hparams='transformer_base', log=None, fromModel=None):
    """Train a transformer model with t2t-trainer.

    train_dir -- folder containing the 'data' sub-folder produced by prepare_data()
    output_dir -- destination folder for checkpoints and the model vocabulary
    batch_size -- training batch size (passed through --hparams)
    n_train_steps -- number of training steps (practically unlimited when None)
    n_eval_steps -- evaluation frequency, in steps
    hparams -- t2t hparams set name
    log -- file object receiving the trainer's stdout/stderr (discarded when None)
    fromModel -- optional folder with an existing checkpoint to warm-start from

    Raises ShellError if t2t-trainer exits with a non-zero status.
    """
    if log is None:
        log = osutils.DEVNULL

    if not os.path.isdir(output_dir):
        osutils.makedirs(output_dir, exist_ok=True)

    # If an existing checkpoint is loaded for starting the training (i.e. fromModel is not None)
    # copy the checkpoint files into the right location.
    if fromModel is not None:
        self._copy_and_fix_model(fromModel, output_dir, gpus=self._gpus)

    data_dir = os.path.join(train_dir, 'data')

    # Expose the training vocabulary next to the checkpoints via a symlink
    src_model_vocab = os.path.join(data_dir, 'model.vcb')
    tgt_model_vocab = os.path.join(output_dir, 'model.vcb')
    if not os.path.isfile(tgt_model_vocab):
        os.symlink(src_model_vocab, tgt_model_vocab)

    env = self._get_env()
    hparams_p = 'batch_size=%d' % batch_size

    command = ['t2t-trainer', '--t2t_usr_dir', self._t2t_dir,
               '--data_dir=%s' % data_dir, '--problem=translate_mmt', '--model=transformer',
               '--hparams_set=%s' % hparams, '--output_dir=%s' % output_dir,
               '--local_eval_frequency=%d' % n_eval_steps,
               '--train_steps=%d' % (n_train_steps if n_train_steps is not None else 100000000),
               '--worker_gpu=%d' % len(self._gpus), '--hparams', hparams_p]

    process = osutils.shell_exec(command, stdout=log, stderr=log, env=env, background=True)

    try:
        return_code = process.wait()
        if return_code != 0:
            raise ShellError(' '.join(command), return_code, None)
    except KeyboardInterrupt:
        # Kill the child trainer, then re-raise: the original code swallowed
        # the interrupt, making the method return as if training succeeded.
        process.kill()
        raise
def process_corpora(self, corpora, output_folder):
    """Run process_file() on every file of every corpus, writing to *output_folder*.

    Returns the list of processed bilingual corpora found in *output_folder*
    for the engine's language pair.
    """
    osutils.makedirs(output_folder, exist_ok=True)

    for corpus in corpora:
        out_corpus = BilingualCorpus.make_parallel(corpus.name, output_folder, corpus.langs)

        # Process each language side of the corpus independently
        for lang in corpus.langs:
            self.process_file(corpus.get_file(lang), out_corpus.get_file(lang), lang)

    return BilingualCorpus.list(self._source_lang, self._target_lang, output_folder)
def build(self, corpora, log=None):
    """Build the model from scratch out of *corpora*.

    All corpora must live in the same folder (asserted below), which is handed
    to the external build binary. Any previous model is deleted first.
    """
    if log is None:
        log = osutils.DEVNULL

    # Always start from an empty model folder
    shutil.rmtree(self._model, ignore_errors=True)
    osutils.makedirs(self._model, exist_ok=True)

    folders = set(corpus.get_folder() for corpus in corpora)
    assert len(folders) == 1
    source_path = folders.pop()

    command = [self._build_bin,
               '-s', self._source_lang, '-t', self._target_lang,
               '-i', source_path, '-m', self._model, '-I', '4']
    osutils.shell_exec(command, stdout=log, stderr=log)
def encode(self, corpora, dest_folder):
    """Encode every file of every corpus into *dest_folder*.

    Returns the list of encoded bilingual corpora, in the same order as the
    input corpora.
    """
    if not os.path.isdir(dest_folder):
        osutils.makedirs(dest_folder, exist_ok=True)

    encoded = []
    for corpus in corpora:
        out_corpus = BilingualCorpus.make_parallel(corpus.name, dest_folder, corpus.langs)

        # Encode each language side of the corpus independently
        for lang in corpus.langs:
            self.encode_file(corpus.get_file(lang), out_corpus.get_file(lang), delete_nl=True)

        encoded.append(out_corpus)

    return encoded
def prepare_data(self, train_corpora, eval_corpora, output_path, log=None, bpe_symbols=2 ** 15, fromModel=None):
    """Run t2t-datagen over the training/evaluation corpora.

    The generated data ends up in '<output_path>/data'; '<output_path>/tmp'
    is used as the datagen scratch folder. *bpe_symbols* controls the size of
    the subtoken vocabulary created from the training data.
    """
    if log is None:
        log = osutils.DEVNULL

    data_dir = os.path.join(output_path, 'data')
    tmp_dir = os.path.join(output_path, 'tmp')

    train_path = self._get_common_root(train_corpora)
    eval_path = self._get_common_root(eval_corpora)

    # Always regenerate the data folder from scratch
    shutil.rmtree(data_dir, ignore_errors=True)
    osutils.makedirs(data_dir)

    # If an existing checkpoint is loaded for starting the training (i.e. fromModel is not None),
    # copy the subtoken vocabulary associated with that checkpoint into the right location,
    # so that the vocabulary is NOT re-created from the new training data and is only used
    # to bpe-fy the new data. The vocabulary is assumed to be called "model.vcb" and to live
    # in the same directory as the checkpoint.
    if fromModel is not None:
        shutil.copyfile(os.path.join(fromModel, 'model.vcb'), os.path.join(data_dir, 'model.vcb'))

    if not os.path.isdir(tmp_dir):
        osutils.makedirs(tmp_dir)

    env = self._get_env(train_path, eval_path, bpe=bpe_symbols)
    command = ['t2t-datagen', '--t2t_usr_dir', self._t2t_dir,
               '--data_dir=%s' % data_dir, '--tmp_dir=%s' % tmp_dir, '--problem=translate_mmt']
    osutils.shell_exec(command, stdout=log, stderr=log, env=env)
def get_tempfile(self, name, ensure=True):
    """Return the path of the named file inside the temp folder.

    With ensure=True the temp folder itself is created when missing; the
    file is never created by this method.
    """
    folder = self.temp_path
    if ensure and not os.path.isdir(folder):
        osutils.makedirs(folder, exist_ok=True)
    return os.path.join(folder, name)
def evaluate(self, corpora, heval_output=None, debug=False):
    """Run the full evaluation pipeline over *corpora* and print a report.

    corpora -- candidate corpora; only those covering the engine's language
               pair are used (raises IllegalArgumentException if none match)
    heval_output -- optional folder where source/reference/translation files
                    are exported for human evaluation
    debug -- when True, the temporary working folder is kept for inspection

    Pipeline: concatenate sources/references, translate with every configured
    translator, score each translation with every configured scorer, then
    print ranked scores and translation speed.
    """
    # Keep only the corpora that cover this engine's language pair
    corpora = [corpus for corpus in corpora
               if self._source_lang in corpus.langs and self._target_lang in corpus.langs]
    if len(corpora) == 0:
        raise IllegalArgumentException('No %s > %s corpora found into specified path'
                                       % (self._source_lang, self._target_lang))

    print '\n============== EVALUATION ==============\n'
    print 'Testing on %d lines:\n' % sum([corpus.count_lines() for corpus in corpora])

    if heval_output is not None:
        osutils.makedirs(heval_output, exist_ok=True)

    step_logger = _StepLogger()
    human_eval_outputter = HumanEvaluationFileOutputter() if heval_output is not None else None

    working_dir = self._engine.get_tempdir('evaluation')
    try:
        # Process references: build one concatenated source file and one
        # XML-encoded concatenated reference file inside the working dir
        with step_logger.step('Preparing corpora') as _:
            source = os.path.join(working_dir, 'source.' + self._source_lang)
            osutils.concat([corpus.get_file(self._source_lang) for corpus in corpora], source)

            reference = os.path.join(working_dir, 'reference.' + self._target_lang)
            osutils.concat([corpus.get_file(self._target_lang) for corpus in corpora], reference + '.tmp')
            XMLEncoder().encode_file(reference + '.tmp', reference)
            os.remove(reference + '.tmp')

            # Optionally export source and reference for human evaluation
            if human_eval_outputter is not None:
                human_eval_outputter.write(lang=self._target_lang, input_file=reference,
                                           output_file=os.path.join(heval_output,
                                                                    'reference.' + self._target_lang))
                human_eval_outputter.write(lang=self._source_lang, input_file=source,
                                           output_file=os.path.join(heval_output,
                                                                    'source.' + self._source_lang))

            total_line_count = osutils.lc(reference)

        # Translate: one entry per translator; errors are stored in the
        # entry (by _translate_with) rather than aborting the evaluation
        entries = []
        for translator in self._translators:
            with step_logger.step('Translating with %s' % translator.name) as _:
                entry = self._translate_with(translator, corpora, working_dir, total_line_count)
                entries.append(entry)

                if entry.error is None and human_eval_outputter is not None:
                    human_eval_file = os.path.join(heval_output, os.path.basename(entry.translation_file))
                    human_eval_outputter.write(lang=self._target_lang,
                                               input_file=entry.translation_file,
                                               output_file=human_eval_file)

        # Scoring: a scorer failure is recorded as its message string in
        # place of the numeric score, so the report can still be printed
        for scorer in self._scorers:
            with step_logger.step('Calculating %s' % scorer.name()) as _:
                for entry in entries:
                    if entry.error is not None:
                        continue
                    try:
                        entry.scores[scorer] = scorer.calculate(entry.translation_file, reference)
                    except Exception as e:
                        entry.scores[scorer] = str(e)

        # Print results
        print '\n=============== RESULTS ================\n'

        for scorer in self._scorers:
            print scorer.name() + ':'
            # Failed entries sort with score 0; the best entry is tagged "(Winner)"
            for i, entry in enumerate(sorted(entries,
                                             key=lambda x: x.scores[scorer] if x.error is None else 0,
                                             reverse=True)):
                if entry.error is None:
                    value = entry.scores[scorer]
                    # A string value means the scorer failed for this entry
                    if isinstance(value, basestring):
                        text = value
                    else:
                        text = '%.2f' % (value * 100)
                        if i == 0:
                            text += ' (Winner)'
                else:
                    text = str(entry.error)
                print ' %s: %s' % (entry.translator.name.ljust(20), text)
            print

        print 'Translation Speed:'
        # Failed entries sort last (infinite time)
        for entry in sorted(entries,
                            key=lambda x: x.translation_time if x.error is None else float('inf')):
            if entry.error is None:
                text = '%.2fs per sentence' % entry.translation_time
            else:
                text = str(entry.error)
            print ' %s: %s' % (entry.translator.name.ljust(20), text)
        print
    finally:
        # Keep the working folder only when debugging
        if not debug:
            self._engine.clear_tempdir('evaluation')