def clean(self, corpora, dest_folder, langs=None):
    """Run ``self._clean_file`` over every corpus and list the output folder.

    When ``langs`` is not supplied and ``corpora`` is non-empty, the language
    pair defaults to the first two entries of the first corpus' ``langs``.

    For each input corpus a destination ``ParallelCorpus`` with the same name
    and languages, located in ``dest_folder``, is prepared; the
    ``(corpus, destination, langs)`` triples are handed to ``self._pool_exec``
    together with ``self._clean_file``.

    Returns the result of ``ParallelCorpus.list(dest_folder)``.
    """
    if langs is None and len(corpora) > 0:
        first = corpora[0]
        langs = (first.langs[0], first.langs[1])

    # One job per corpus: (input corpus, destination corpus, language pair).
    jobs = []
    for corpus in corpora:
        destination = ParallelCorpus(corpus.name, dest_folder, corpus.langs)
        jobs.append((corpus, destination, langs))

    self._pool_exec(self._clean_file, jobs)

    return ParallelCorpus.list(dest_folder)
def encode(self, corpora, dest_folder):
    """Encode every language file of every corpus into ``dest_folder``.

    ``dest_folder`` is created first when it is not already a directory.
    For each (corpus, language) pair the source path comes from
    ``corpus.get_file(lang)`` and the destination path from a single-language
    ``ParallelCorpus`` with the same name placed in ``dest_folder``; both
    paths are passed to ``self.encode_file``.

    Returns the result of ``ParallelCorpus.list(dest_folder)``.
    """
    folder_missing = not os.path.isdir(dest_folder)
    if folder_missing:
        fileutils.makedirs(dest_folder, exist_ok=True)

    # Lazily walk all (corpus, language) combinations in corpus order.
    pairs = ((corpus, lang) for corpus in corpora for lang in corpus.langs)

    for corpus, lang in pairs:
        in_path = corpus.get_file(lang)
        out_corpus = ParallelCorpus(corpus.name, dest_folder, [lang])
        out_path = out_corpus.get_file(lang)
        self.encode_file(in_path, out_path)

    return ParallelCorpus.list(dest_folder)
def process(self, corpora, dest_folder, print_tags=True, print_placeholders=False, original_spacing=False):
    """Process every language file of every corpus into ``dest_folder``.

    For each (corpus, language) pair the source path comes from
    ``corpus.get_file(lang)`` and the destination path from a single-language
    ``ParallelCorpus`` with the same name placed in ``dest_folder``. Both
    paths, the language, and the three flags (``print_tags``,
    ``print_placeholders``, ``original_spacing``) are forwarded unchanged to
    the private per-file processing routine.

    Returns the result of ``ParallelCorpus.list(dest_folder)``.
    """
    # Lazily walk all (corpus, language) combinations in corpus order.
    pairs = ((corpus, lang) for corpus in corpora for lang in corpus.langs)

    for corpus, lang in pairs:
        in_path = corpus.get_file(lang)
        out_corpus = ParallelCorpus(corpus.name, dest_folder, [lang])
        out_path = out_corpus.get_file(lang)
        self.__process_file(in_path, out_path, lang, print_tags, print_placeholders, original_spacing)

    return ParallelCorpus.list(dest_folder)
def train(self, corpora, aligner, working_dir='.', log_file=None):
    """Build the model files for the configured language pair.

    The steps run strictly in this order, each consuming the previous output:

    1. Clean ``corpora`` into ``working_dir/clean_corpora`` through
       ``self._cleaner.clean``.
    2. Merge the cleaned corpora into one corpus per language and write the
       '.dmp' file, one "NAME LINE_COUNT" row per cleaned corpus.
    3. Ask ``aligner.align`` for an alignments file and pipe it through the
       symal binary into ``working_dir/alignments.L1-L2.symal``.
    4. Run the mtt-build command once per language, feeding it the merged
       corpus file on stdin.
    5. Feed the symal output to the symal2mam binary to create the '.mam' file.
    6. Run the mmlexbuild binary to create the '.lex' file.

    Args:
        corpora: corpora to train on; passed to ``self._cleaner.clean``.
        aligner: object whose ``align(corpus, langs, model, working_dir,
            log_file)`` return value is opened as the alignments input file.
        working_dir: folder for intermediate files; created if missing.
        log_file: optional path opened in append mode to receive the output
            of the external commands. When ``None``, ``shell.DEVNULL`` is
            used instead.

    Raises:
        Exception: if ``self._model`` is an existing, non-empty directory.
    """
    if os.path.isdir(self._model) and len(os.listdir(self._model)) > 0:
        raise Exception('Model already exists at ' + self._model)

    if not os.path.isdir(self._model):
        fileutils.makedirs(self._model, exist_ok=True)

    if not os.path.isdir(working_dir):
        fileutils.makedirs(working_dir, exist_ok=True)

    l1 = self._source_lang
    l2 = self._target_lang
    langs = (l1, l2)
    langs_suffix = l1 + '-' + l2

    mct_base = self._get_model_basename()
    dmp_file = mct_base + '.dmp'
    mam_file = mct_base + '.' + langs_suffix + '.mam'
    lex_file = mct_base + '.' + langs_suffix + '.lex'

    log = shell.DEVNULL
    # Handle opened by this call, if any. Tracked separately from `log` so the
    # cleanup below never touches the shared shell.DEVNULL sentinel, even when
    # opening the log file itself fails.
    opened_log = None

    try:
        if log_file is not None:
            opened_log = open(log_file, 'a')
            log = opened_log

        # Clean corpus for training
        clean_output = os.path.join(working_dir, 'clean_corpora')
        fileutils.makedirs(clean_output, exist_ok=True)
        corpora = self._cleaner.clean(corpora, clean_output, langs)

        # Create merged corpus and domains list file (dmp)
        merged_corpus = ParallelCorpus(os.path.basename(mct_base), working_dir, langs)

        fileutils.merge([corpus.get_file(l1) for corpus in corpora], merged_corpus.get_file(l1))
        fileutils.merge([corpus.get_file(l2) for corpus in corpora], merged_corpus.get_file(l2))
        with open(dmp_file, 'w') as dmp:
            for corpus in corpora:
                dmp.write(str(corpus.name) + ' ' + str(corpus.count_lines()) + '\n')

        # Create alignments in 'bal' file and symmetrize
        bal_file = aligner.align(merged_corpus, langs, self._model, working_dir, log_file)

        symal_file = os.path.join(working_dir, 'alignments.' + langs_suffix + '.symal')
        symal_command = [self._symal_bin, '-a=g', '-d=yes', '-f=yes', '-b=yes']
        with open(bal_file) as stdin, open(symal_file, 'w') as stdout:
            shell.execute(symal_command, stdin=stdin, stdout=stdout, stderr=log)

        # Execute mtt-build (source language first, then target language)
        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l1)
        with open(merged_corpus.get_file(l1)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        mttbuild_command = self._get_mttbuild_command(mct_base, dmp_file, l2)
        with open(merged_corpus.get_file(l2)) as stdin:
            shell.execute(mttbuild_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'mam' file
        mam_command = [self._symal2mam_bin, mam_file]
        with open(symal_file) as stdin:
            shell.execute(mam_command, stdin=stdin, stdout=log, stderr=log)

        # Create 'lex' file
        lex_command = [self._mmlexbuild_bin, mct_base + '.', l1, l2, '-o', lex_file]
        shell.execute(lex_command, stdout=log, stderr=log)
    finally:
        # Close only what this call opened.
        if opened_log is not None:
            opened_log.close()