Exemplo n.º 1
0
def mmt_dedup(src_lang, tgt_lang, in_path, out_path, length_threshold=None):
    """Run the ``eu.modernmt.cli.DeduplicationMain`` Java CLI.

    The argument list is wrapped by ``mmt_java`` and executed through
    ``osutils.shell_exec`` with the environment returned by ``__mmt_env()``.

    Args:
        src_lang: Value passed to the ``-s`` option.
        tgt_lang: Value passed to the ``-t`` option.
        in_path: Value passed to the ``--input`` option.
        out_path: Value passed to the ``--output`` option.
        length_threshold: Optional number passed to the ``-l`` option. The
            option is omitted when this is ``None`` or not positive.
    """
    args = ['-s', src_lang, '-t', tgt_lang, '--input', in_path, '--output', out_path]
    if length_threshold is not None and length_threshold > 0:
        # Stringify the number: process argument lists are expected to hold
        # strings only, and a raw int/float would be rejected at launch time.
        args += ['-l', str(length_threshold)]

    command = mmt_java('eu.modernmt.cli.DeduplicationMain', args)
    osutils.shell_exec(command, env=__mmt_env())
Exemplo n.º 2
0
    def __test(self, src, tgt, filename):
        """Run the XML tag projector on one dataset and validate each line.

        The dataset consists of ``<filename>.<src>``, ``<filename>.<tgt>`` and
        ``<filename>.alg`` under ``TEST_RESOURCES/tag_projection_dataset``.
        The test is skipped when any of the three files is missing.

        The Java main class is executed with its standard output captured in
        a temporary file; that output is then read in lockstep with the source
        and alignment files. The test fails when the set of tags extracted
        from a source line differs from the set extracted from the matching
        output line, or when ``__validate_tags`` rejects the output tags.
        """
        dataset_dir = os.path.join(TEST_RESOURCES, 'tag_projection_dataset')
        src_file = os.path.join(dataset_dir, filename + '.' + src)
        tgt_file = os.path.join(dataset_dir, filename + '.' + tgt)
        alg_file = os.path.join(dataset_dir, filename + '.alg')

        required_files = (src_file, tgt_file, alg_file)
        if not all(os.path.isfile(path) for path in required_files):
            self.skipTest("external resource not available")

        main_class = 'eu.modernmt.processing.tags.cli.XMLProjectorTestMain'
        java_cmd = mmt_java(main_class, [src_file, tgt_file, alg_file])

        with tempfile.NamedTemporaryFile() as out_stream:
            osutils.shell_exec(java_cmd, stdout=out_stream)
            out_stream.flush()

            # The projector output replaces the target file when reading back.
            with _Reader(src_file, out_stream.name, alg_file) as reader:
                for src_line, tgt_line, alg_line in reader:
                    src_line = src_line.rstrip()
                    tgt_line = tgt_line.rstrip()

                    src_tags = self._extract_tags(src_line)
                    tgt_tags = self._extract_tags(tgt_line)
                    context = (src_line, tgt_line, alg_line)

                    if set(src_tags) != set(tgt_tags):
                        self.fail('Not all tags were projected:\n\t%s\n\t%s\n\t%s' % context)

                    if not self.__validate_tags(tgt_tags):
                        self.fail('Invalid tag projection:\n\t%s\n\t%s\n\t%s' % context)
Exemplo n.º 3
0
    def datagen(self):
        """Run ``fairseq-preprocess`` on the encoded corpora.

        Creates ``self.args.output_path`` if needed, then runs the command
        with the ``train`` and ``dev`` prefixes found under
        ``self.state.encoded_corpora``. The vocabulary in ``self.state.vocab``
        is used as a joined dictionary, and one worker is requested per CPU
        reported by ``multiprocessing.cpu_count()``. Both output streams of
        the command are redirected to ``self.log_fobj``.
        """
        output_path = self.args.output_path
        os.makedirs(output_path, exist_ok=True)

        corpora_dir = self.state.encoded_corpora
        train_pref = os.path.join(corpora_dir, 'train')
        valid_pref = os.path.join(corpora_dir, 'dev')
        workers = str(multiprocessing.cpu_count())

        cmd = [
            'fairseq-preprocess',
            '--source-lang', 'sl',
            '--target-lang', 'tl',
            '--user-dir', MMT_FAIRSEQ_USER_DIR,
            '--task', 'mmt_translation',
            '--trainpref', train_pref,
            '--validpref', valid_pref,
            '--destdir', output_path,
            '--workers', workers,
            '--srcdict', self.state.vocab,
            '--joined-dictionary',
            '--dataset-impl', 'mmap',
        ]

        osutils.shell_exec(cmd, stdout=self.log_fobj, stderr=self.log_fobj)
Exemplo n.º 4
0
def mmt_tmsclean(src_lang, tgt_lang, in_path, out_path, out_format=None, filters=None):
    """Run the ``eu.modernmt.cli.CleaningPipelineMain`` Java CLI.

    Args:
        src_lang: Value passed to the ``-s`` option.
        tgt_lang: Value passed to the ``-t`` option.
        in_path: Value passed to the ``--input`` option.
        out_path: Value passed to the ``--output`` option.
        out_format: Optional value for ``--output-format``.
        filters: Optional list of values appended after ``--filters``. The
            option is omitted when the list is ``None`` or empty.
    """
    cli_args = ['-s', src_lang, '-t', tgt_lang, '--input', in_path, '--output', out_path]

    if out_format is not None:
        cli_args.extend(['--output-format', out_format])
    if filters is not None and len(filters) > 0:
        cli_args.extend(['--filters'] + filters)

    # The JVM max heap is set to 90% of the value reported by osutils.mem_size().
    extended_heap_mb = int(osutils.mem_size() * 90 / 100)

    # JVM system properties setting the XML entity limits to 0.
    java_ops = [
        '-DentityExpansionLimit=0',
        '-DtotalEntitySizeLimit=0',
        '-Djdk.xml.totalEntitySizeLimit=0',
    ]

    command = mmt_java('eu.modernmt.cli.CleaningPipelineMain', cli_args,
                       max_heap_mb=extended_heap_mb, java_ops=java_ops)
    osutils.shell_exec(command, env=__mmt_env())
Exemplo n.º 5
0
def mmt_preprocess(src_lang, tgt_lang, in_paths, out_path, dev_path=None, test_path=None, partition_size=None):
    """Run the ``eu.modernmt.cli.TrainingPipelineMain`` Java CLI.

    Args:
        src_lang: Value passed to the ``-s`` option.
        tgt_lang: Value passed to the ``-t`` option.
        in_paths: A single path string or a collection of paths, all listed
            after the ``--input`` option.
        out_path: Value passed to the ``--output`` option.
        dev_path: Optional value for ``--dev``.
        test_path: Optional value for ``--test``.
        partition_size: Optional value for ``--size``; converted to a string.
    """
    # A lone path is treated as a one-element list.
    if isinstance(in_paths, str):
        in_paths = [in_paths]

    args = ['-s', src_lang, '-t', tgt_lang, '--output', out_path, '--input']
    args.extend(in_paths)

    size_value = None if partition_size is None else str(partition_size)
    optional_flags = (
        ('--size', size_value),
        ('--dev', dev_path),
        ('--test', test_path),
    )
    for flag, value in optional_flags:
        if value is not None:
            args.extend([flag, value])

    command = mmt_java('eu.modernmt.cli.TrainingPipelineMain', args)
    osutils.shell_exec(command, env=__mmt_env())
Exemplo n.º 6
0
 def cli(self, *args, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
     """Invoke the script at ``self._mmt_script`` with the given arguments.

     Positional arguments are appended to the command line after the script
     path; ``stdin``, ``stdout`` and ``stderr`` are forwarded unchanged to
     ``osutils.shell_exec``, whose result is returned.
     """
     command = [self._mmt_script, *args]
     return osutils.shell_exec(command, stdin=stdin, stdout=stdout, stderr=stderr)
Exemplo n.º 7
0
def fastalign_build(src_lang, tgt_lang, in_path, out_model, iterations=None,
                    case_sensitive=True, favor_diagonal=True, log=None):
    """Run the ``fa_build`` binary from ``MMT_BIN_DIR``.

    Args:
        src_lang: Value passed to the ``-s`` option.
        tgt_lang: Value passed to the ``-t`` option.
        in_path: Value passed to the ``-i`` option.
        out_model: Directory (created if missing) that receives the model
            file ``<src_lang>__<tgt_lang>.fam``.
        iterations: Optional value for ``-I``.
        case_sensitive: When false, ``--case-insensitive`` is added.
        favor_diagonal: When false, ``--no-favor-diagonal`` is added.
        log: Destination for both stdout and stderr of the command; defaults
            to ``osutils.DEVNULL``.
    """
    os.makedirs(out_model, exist_ok=True)
    model_file = os.path.join(out_model, '%s__%s.fam' % (src_lang, tgt_lang))

    sink = osutils.DEVNULL if log is None else log

    executable = os.path.join(MMT_BIN_DIR, 'fa_build')
    command = [executable, '-s', src_lang, '-t', tgt_lang, '-i', in_path, '-m', model_file]

    if iterations is not None:
        command += ['-I', str(iterations)]
    if not case_sensitive:
        command += ['--case-insensitive']
    if not favor_diagonal:
        command += ['--no-favor-diagonal']

    osutils.shell_exec(command, stdout=sink, stderr=sink, env=__mmt_env())
Exemplo n.º 8
0
    def __init__(self, model_path) -> None:
        """Load the content of ``<model_path>/storage`` grouped by memory id.

        Runs the Java ``Dump`` utility and parses its standard output. Each
        line is expected to hold at least four tab-separated fields: memory
        id, source language, target language and the remaining text. The
        last three are stored, re-joined with tabs, in a set keyed by the
        integer memory id.
        """
        storage_path = os.path.join(model_path, 'storage')
        dump_cmd = ['java', '-cp', mmt.MMT_JAR,
                    'eu.modernmt.context.lucene.storage.utils.Dump', storage_path]
        std_out, _ = osutils.shell_exec(dump_cmd)

        self._content_by_memory = defaultdict(set)

        for raw_line in std_out.splitlines(keepends=False):
            # maxsplit=3 keeps any further tabs inside the trailing text field.
            memory, src_lang, tgt_lang, text = raw_line.strip().split('\t', maxsplit=3)
            bucket = self._content_by_memory[int(memory)]
            bucket.add('%s\t%s\t%s' % (src_lang, tgt_lang, text))
Exemplo n.º 9
0
    def __init__(self, model_path, main_class=None) -> None:
        """Load the entries dumped from ``model_path`` grouped by memory id.

        Runs a Java dump utility (``main_class``, defaulting to the neural
        decoder memory ``Dump`` class) and parses its standard output. Each
        line must hold exactly five tab-separated fields: memory id, source
        language, target language, source line and target line. The last
        four are wrapped in ``self.Entry`` and stored in a set keyed by the
        integer memory id.
        """
        dump_class = main_class
        if dump_class is None:
            dump_class = 'eu.modernmt.decoder.neural.memory.lucene.utils.Dump'

        std_out, _ = osutils.shell_exec(['java', '-cp', mmt.MMT_JAR, dump_class, model_path])

        self._content_by_memory = defaultdict(set)

        for raw_line in std_out.splitlines(keepends=False):
            fields = raw_line.strip().split('\t')
            memory, src_lang, tgt_lang, src_line, tgt_line = fields
            bucket = self._content_by_memory[int(memory)]
            bucket.add(self.Entry(src_lang, tgt_lang, src_line, tgt_line))
Exemplo n.º 10
0
def fastalign_score(src_lang, tgt_lang, model_path, in_path, out_path=None):
    """Run the ``fa_score`` binary and return the statistics it prints.

    The model file ``<src_lang>__<tgt_lang>.fam`` is looked up inside
    ``model_path``. When ``out_path`` is not given (or is falsy), ``in_path``
    is also passed as the ``-o`` value.

    The command's standard output is parsed as ``key=value`` lines with
    float values.

    Returns:
        The tuple ``(good_avg, good_std_dev, bad_avg, bad_std_dev)`` taken
        from the parsed output.
    """
    model_file = os.path.join(model_path, '%s__%s.fam' % (src_lang, tgt_lang))
    destination = out_path or in_path

    executable = os.path.join(MMT_BIN_DIR, 'fa_score')
    command = [executable, '-s', src_lang, '-t', tgt_lang,
               '-m', model_file, '-i', in_path, '-o', destination]
    stdout, _ = osutils.shell_exec(command, env=__mmt_env())

    pairs = (line.split('=', maxsplit=1) for line in stdout.splitlines(keepends=False))
    result = {key: float(value) for key, value in pairs}

    good_avg = result['good_avg']
    good_std_dev = result['good_std_dev']
    bad_avg = result['bad_avg']
    bad_std_dev = result['bad_std_dev']
    return good_avg, good_std_dev, bad_avg, bad_std_dev
Exemplo n.º 11
0
def __get_java_version():
    """Return the major Java version reported by ``java -version``.

    Both output streams of the command are scanned for the first line that
    contains the token ``version`` followed by a parsable version string.
    Surrounding double quotes are removed, and a leading ``1.`` is dropped so
    that e.g. ``"1.8.0_201"`` yields ``8`` and ``"11.0.2"`` yields ``11``.

    Returns:
        The major version as an ``int``, or ``None`` when the command cannot
        be launched (``OSError``) or no version number can be extracted from
        its output.
    """
    try:
        stdout, stderr = osutils.shell_exec(['java', '-version'])
    except OSError:
        return None

    java_output = stdout + '\n' + stderr

    for line in java_output.split('\n'):
        tokens = line.split()
        if 'version' not in tokens:
            continue

        value_index = tokens.index('version') + 1
        if value_index >= len(tokens):
            # 'version' is the last word on the line: nothing to parse here.
            continue

        version = tokens[value_index].strip('"')
        if version.startswith('1.'):
            version = version[2:]

        match = re.match(r'^[0-9]+', version)
        if match is not None:
            return int(match.group())
        # Not a numeric version string: keep scanning instead of failing
        # with AttributeError on a None match.

    return None
Exemplo n.º 12
0
 def start(self):
     """Launch ``eu.modernmt.cli.BackupDaemonMain`` as a background process.

     The daemon is started for ``self.engine.name`` with the fixed options
     ``-i 3600`` and ``-l 1``. Its environment is a copy of the current one
     plus ``MMT_Q_HOST`` set to ``network.get_ip()``. The value returned by
     ``osutils.shell_exec`` is kept in ``self._process``.
     """
     daemon_args = ['-e', self.engine.name, '-i', '3600', '-l', '1']
     command = mmt_java('eu.modernmt.cli.BackupDaemonMain', args=daemon_args)

     host_ip = network.get_ip()
     env = dict(os.environ)
     env['MMT_Q_HOST'] = host_ip

     self._process = osutils.shell_exec(command, background=True, env=env)
Exemplo n.º 13
0
    def train_nn(self):
        """Train the neural model with ``fairseq-train`` and supervise the run.

        The model directory is ``self.wdir('nn_model')``. When it has no
        ``checkpoint_last.pt`` and ``self.args.init_model`` is set, that model
        is copied in as the last checkpoint. If ``self.args.tensorboard_port``
        is set, a ``tensorboard`` server is started in the background with its
        output written to ``server.log`` in the tensorboard log directory.

        With ``self.args.train_steps`` set, training runs until the process
        exits. Otherwise the process is polled every 5 minutes and terminated
        once a new checkpoint has appeared and ``self._training_should_stop()``
        returns true. A ``KeyboardInterrupt`` while waiting terminates the
        training process and ends the method normally.

        The tensorboard server is always terminated, and its log file closed,
        before the method returns or raises.

        Raises:
            ShellError: if the training process exits with a non-zero code.
        """
        self.state.nn_path = self.wdir('nn_model')

        last_ckpt_path = os.path.join(self.state.nn_path, 'checkpoint_last.pt')
        if not os.path.isfile(last_ckpt_path) and self.args.init_model is not None:
            shutil.copy(self.args.init_model, last_ckpt_path)

        # Create command
        tensorboard_logdir = self.state.tensorboard_logdir = self.wdir('tensorboard_logdir')

        cmd = ['fairseq-train', self.args.data_path, '--save-dir', self.state.nn_path, '--task', 'mmt_translation',
               '--user-dir', MMT_FAIRSEQ_USER_DIR, '--share-all-embeddings', '--no-progress-bar',
               '--tensorboard-logdir', tensorboard_logdir, '--dataset-impl', 'mmap']

        if self.args.train_steps is not None:
            cmd.extend(['--max-update', str(self.args.train_steps)])

        cmd += self.extra_argv

        # Create environment
        env = None
        if self.args.gpus is not None:
            env = os.environ.copy()
            env['CUDA_VISIBLE_DEVICES'] = ','.join([str(gpu) for gpu in self.args.gpus])

        # Without a fixed number of steps the process is polled periodically
        # so that the termination policy can be evaluated.
        process_timeout = None
        if self.args.train_steps is None:
            process_timeout = 5 * 60  # 5 minutes

        tensorboard = None
        tensorboard_log = None
        last_checkpoint = None

        # Everything that owns a resource is started inside the try block so
        # the finally clause can release it even when a later launch fails.
        try:
            if self.args.tensorboard_port is not None:
                # Tensorboard is started with no visible GPU.
                tensorboard_env = os.environ.copy()
                tensorboard_env['CUDA_VISIBLE_DEVICES'] = ''

                tensorboard_log = open(os.path.join(self.state.tensorboard_logdir, 'server.log'), 'wb')
                tensorboard_cmd = ['tensorboard', '--logdir', tensorboard_logdir,
                                   '--port', str(self.args.tensorboard_port)]
                tensorboard = osutils.shell_exec(tensorboard_cmd, stderr=tensorboard_log, stdout=tensorboard_log,
                                                 env=tensorboard_env, background=True)

            # Start process
            process = osutils.shell_exec(cmd, stderr=self.log_fobj, stdout=self.log_fobj, background=True, env=env)

            while True:
                try:
                    return_code = process.wait(process_timeout)

                    if return_code != 0:
                        raise ShellError(' '.join(cmd), return_code)

                    break
                except KeyboardInterrupt:
                    process.terminate()
                    self._logger.info('Training manually interrupted by user')
                    break
                except TimeoutExpired:
                    checkpoints = _last_n_checkpoints(self.state.nn_path, 1)
                    checkpoint = checkpoints[0] if len(checkpoints) > 0 else None

                    # The stop policy is only consulted when the latest
                    # checkpoint differs from the one seen at the last poll.
                    if last_checkpoint != checkpoint and self._training_should_stop():
                        process.terminate()
                        self._logger.info('Training interrupted by termination policy: '
                                          'validation loss has reached its plateau')
                        break

                    last_checkpoint = checkpoint
        finally:
            if tensorboard is not None:
                tensorboard.terminate()
            if tensorboard_log is not None:
                # Release the log file handle, which was previously leaked.
                tensorboard_log.close()
Exemplo n.º 14
0
def pip_install():
    """Install the packages listed in ``<MMT_HOME_DIR>/requirements.txt``.

    Runs ``pip3 install -r`` on that file, forwarding the command's stdout
    and stderr to this process's own streams.
    """
    requirements_txt = os.path.join(mmt.MMT_HOME_DIR, 'requirements.txt')
    command = ['pip3', 'install', '-r', requirements_txt]
    osutils.shell_exec(command, stderr=sys.stderr, stdout=sys.stdout)