Exemplo n.º 1
0
def main_sweep(argv):
    parser = argparse.ArgumentParser(description='Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument('-e', '--engine', dest='engine', help='the engine name, \'default\' will be used if absent',
                        default=None)
    parser.add_argument('--path', dest='corpora_path', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically splitted sample)')
    args = parser.parse_args(argv)

    samples = [int(e) for e in '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.split()]

    injector = dependency.Injector()
    #injector.read_args(args)
    engine = MMTEngine(args.engine)
    injector.inject(engine)

    node = ClusterNode(engine, api_port=DEFAULT_MMT_API_PORT)

    # more or less copy-pasted from mmt evaluate:

    evaluator = Evaluator(node, google_key='1234', use_sessions=True)

    corpora = BilingualCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else BilingualCorpus.list(os.path.join(node.engine.data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    lines = 0
    for corpus in corpora:
        lines += corpus.count_lines()

    # end copy-paste

    print('sample bleu')

    for sample in samples:
        node.engine.set_config_option('suffixarrays', 'sample', sample)
        injector.read_config(node.engine.config)  # to get engine.set() to affect MosesFeatures -> moses.ini
        injector.inject(node.engine)
        node.engine.write_configs()
        node.restart()

        scores = evaluator.evaluate(corpora=corpora, heval_output=None,
                                    debug=False)

        engine_scores = [r for r in scores if r.id == 'MMT'][0]

        if engine_scores.error:
            raise RuntimeError(engine_scores.error)

        bleu = engine_scores.bleu
        print(sample, '%.2f' % (bleu * 100))
Exemplo n.º 2
0
    def translate(self, corpora, output):
        """
        Translate the given corpora in parallel processing fashion.
        :param corpora: list of ParallelCorpus
        :param output:  path to output directory
        :return: ([ParallelCorpus, ...], time_per_sentence, parallelism)
        """
        pool = multithread.Pool(self._threads)

        try:
            translations = []
            start_time = datetime.now()

            for corpus in corpora:
                self._before_translate(corpus)

                with open(corpus.get_file(self.source_lang)) as source:
                    output_path = os.path.join(
                        output, corpus.name + '.' + self.target_lang)

                    for line in source:
                        translation = pool.apply_async(self._get_translation,
                                                       (line, corpus))
                        translations.append((translation, output_path))

                self._after_translate(corpus)

            elapsed_time = 0
            translation_count = 0

            path = None
            stream = None

            for translation_job, output_path in translations:
                translation, elapsed = translation_job.get()

                if output_path != path:
                    if stream is not None:
                        stream.close()

                    stream = open(output_path, 'wb')
                    path = output_path

                stream.write(translation.encode('utf-8'))
                stream.write('\n')

                elapsed_time += elapsed
                translation_count += 1

            if stream is not None:
                stream.close()

            end_time = datetime.now()
            total_time = end_time - start_time

            return BilingualCorpus.list(output), (
                elapsed_time / translation_count), (elapsed_time /
                                                    total_time.total_seconds())
        finally:
            pool.terminate()
Exemplo n.º 3
0
    def _step_preprocess(self,
                         bilingual_corpora,
                         monolingual_corpora,
                         _,
                         skip=False,
                         logger=None,
                         delete_on_exit=False):
        preprocessed_folder = self._get_tempdir('preprocessed')
        cleaned_folder = self._get_tempdir('clean_corpora')

        # if skip is true, then we are in resume mode, so return the already existing results
        if skip:
            processed_bicorpora, processed_monocorpora = BilingualCorpus.splitlist(
                self._engine.source_lang,
                self._engine.target_lang,
                roots=preprocessed_folder)
            cleaned_bicorpora = BilingualCorpus.list(cleaned_folder)
        else:
            processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                bilingual_corpora + monolingual_corpora,
                preprocessed_folder,
                (self._engine.data_path if self._split_trainingset else None),
                log=logger.stream)
            cleaned_bicorpora = self._engine.training_preprocessor.clean(
                processed_bicorpora, cleaned_folder)

        return processed_bicorpora, processed_monocorpora, cleaned_bicorpora
Exemplo n.º 4
0
    def translate(self, corpora, output):
        """
        Translate the given corpora in parallel processing fashion.
        :param corpora: list of ParallelCorpus
        :param output:  path to output directory
        :return: ([ParallelCorpus, ...], time_per_sentence, parallelism)
        """
        pool = multithread.Pool(self._threads)

        try:
            translations = []
            start_time = datetime.now()

            for corpus in corpora:
                self._before_translate(corpus)

                with open(corpus.get_file(self.source_lang)) as source:
                    output_path = os.path.join(output, corpus.name + '.' + self.target_lang)

                    for line in source:
                        translation = pool.apply_async(self._get_translation, (line, corpus))
                        translations.append((translation, output_path))

                self._after_translate(corpus)

            elapsed_time = 0
            translation_count = 0

            path = None
            stream = None

            for translation_job, output_path in translations:
                translation, elapsed = translation_job.get()

                if output_path != path:
                    if stream is not None:
                        stream.close()

                    stream = open(output_path, 'wb')
                    path = output_path

                stream.write(translation.encode('utf-8'))
                stream.write('\n')

                elapsed_time += elapsed
                translation_count += 1

            if stream is not None:
                stream.close()

            end_time = datetime.now()
            total_time = end_time - start_time

            return BilingualCorpus.list(output), (elapsed_time / translation_count), (
                elapsed_time / total_time.total_seconds())
        finally:
            pool.terminate()
Exemplo n.º 5
0
    def process(self, corpora, dest_folder, print_tags=True, print_placeholders=False, original_spacing=False):
        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

                self.__process_file(source, dest, lang, print_tags, print_placeholders, original_spacing)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 6
0
    def process_corpora(self, corpora, dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

                self.process_file(source, dest, lang)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 7
0
    def process_corpora(self, corpora, dest_folder):
        fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang])

                self.process_file(source, dest, lang)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 8
0
def main_sweep(argv):
    parser = argparse.ArgumentParser(description='Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument('-e', '--engine', dest='engine', help='the engine name, \'default\' will be used if absent',
                        default=None)
    parser.add_argument('--path', dest='corpora_path', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically splitted sample)')
    args = parser.parse_args(argv)

    samples = [int(e) for e in '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.split()]

    node = ConfiguredClusterNode(args.engine)

    # more or less copy-pasted from mmt evaluate:

    evaluator = Evaluator(node.engine, node)

    corpora = BilingualCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else BilingualCorpus.list(os.path.join(node.engine.data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    lines = 0
    for corpus in corpora:
        lines += corpus.count_lines()

    # end copy-paste

    print('sample bleu')

    for sample in samples:
        node.set('suffixarrays', 'sample', sample)
        node.apply_configs()

        scores = evaluator.evaluate(corpora=corpora, google_key='1234', heval_output=None,
                                    use_sessions=True, debug=False)

        engine_scores = scores['MMT']

        if isinstance(engine_scores, str):
            raise RuntimeError(engine_scores)

        bleu = engine_scores['bleu']
        print(sample, '%.2f' % (bleu * 100))
Exemplo n.º 9
0
    def encode(self, corpora, dest_folder):
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest_file = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang]).get_file(lang)

                self.encode_file(source, dest_file, delete_nl=True)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 10
0
    def encode(self, corpora, dest_folder):
        if not os.path.isdir(dest_folder):
            fileutils.makedirs(dest_folder, exist_ok=True)

        for corpus in corpora:
            for lang in corpus.langs:
                source = corpus.get_file(lang)
                dest_file = BilingualCorpus.make_parallel(corpus.name, dest_folder, [lang]).get_file(lang)

                self.encode_file(source, dest_file, delete_nl=True)

        return BilingualCorpus.list(dest_folder)
Exemplo n.º 11
0
    def clean(self, corpora, output_path):
        args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']

        input_paths = set([corpus.get_folder() for corpus in corpora])

        for root in input_paths:
            args.append(root)

        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.list(output_path)
Exemplo n.º 12
0
    def clean(self, corpora, output_path, log=None):
        if log is None:
            log = shell.DEVNULL

        args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']

        input_paths = set([corpus.get_folder() for corpus in corpora])

        for root in input_paths:
            args.append(root)

        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command, stdout=log, stderr=log)

        return BilingualCorpus.list(output_path)
Exemplo n.º 13
0
    def _step_tm_cleanup(self,
                         corpora,
                         skip=False,
                         logger=None,
                         delete_on_exit=True):
        # the folder where tm_cleanup results are to be stored
        folder = self._get_tempdir('clean_tms')

        # if skip is true, then we are in resume mode, so use the already existing results
        if skip:
            clean_tms = BilingualCorpus.list(folder)
        # else perform the cleaning on the corpora and use the clean corpora
        else:
            clean_tms = self._engine.cleaner.clean(corpora,
                                                   folder,
                                                   log=logger.stream)

        return clean_tms
Exemplo n.º 14
0
    def clean(self, corpora, output_path, log=None):
        if log is None:
            log = shell.DEVNULL

        # read memory size
        mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # e.g. 4015976448
        mem_mb = mem_bytes / (1024. ** 2)  # e.g. 3.74

        extended_heap_mb = int(mem_mb*90/100)

        args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']

        input_paths = set([corpus.get_folder() for corpus in corpora])

        for root in input_paths:
            args.append(root)

        command = mmt_javamain(self._java_mainclass, args=args, max_heap_mb=extended_heap_mb)
        shell.execute(command, stdout=log, stderr=log)

        return BilingualCorpus.list(output_path)
Exemplo n.º 15
0
    def tune(self, corpora=None, debug=False, context_enabled=True, random_seeds=False, max_iterations=25):
        if corpora is None:
            corpora = BilingualCorpus.list(os.path.join(self.engine.data_path, TrainingPreprocessor.DEV_FOLDER_NAME))

        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if not self.is_running():
            raise IllegalStateException('No MMT Server running, start the engine first')

        tokenizer = Tokenizer()

        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        source_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [source_lang])
                          for corpus in corpora]
        reference_corpora = [BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(), [target_lang])
                             for corpus in corpora]

        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(reference_corpora, tokenized_output)

            # Create merged corpus
            with cmdlogger.step('Merging corpus') as _:
                # source
                source_merged_corpus = os.path.join(working_dir, 'corpus.' + source_lang)

                with open(source_merged_corpus, 'wb') as out:
                    for corpus in source_corpora:
                        out.write(corpus.get_file(source_lang) + '\n')

                # target
                target_merged_corpus = os.path.join(working_dir, 'corpus.' + target_lang)
                fileutils.merge([corpus.get_file(target_lang) for corpus in reference_corpora], target_merged_corpus)

            # Run MERT algorithm
            with cmdlogger.step('Tuning') as _:
                # Start MERT
                decoder_flags = ['--port', str(self.api.port)]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [self._mert_script, source_merged_corpus, target_merged_corpus,
                               self._mert_i_script, runtime_moses_ini.name, '--threads',
                               str(multiprocessing.cpu_count()), '--mertdir', cli.BIN_DIR,
                               '--mertargs', '\'--binary --sctype BLEU\'', '--working-dir', mert_wd, '--nbest', '100',
                               '--decoder-flags', '"' + ' '.join(decoder_flags) + '"', '--nonorm', '--closest',
                               '--no-filter-phrase-table']

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(num=max_iterations))

                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command), stdout=log, stderr=log)

            # Read optimized configuration
            with cmdlogger.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [float(val) for val in tokens[1:]]
                        elif line.startswith('# BLEU'):
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                _ = self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir()
Exemplo n.º 16
0
    def clean(self, corpora, dest_folder, langs=None):
        if langs is None and len(corpora) > 0:
            langs = (corpora[0].langs[0], corpora[0].langs[1])

        self._pool_exec(self._clean_file, [(corpus, dest_folder, langs) for corpus in corpora])
        return BilingualCorpus.list(dest_folder)
Exemplo n.º 17
0
    def tune(self,
             corpora=None,
             debug=False,
             context_enabled=True,
             random_seeds=False,
             max_iterations=25,
             early_stopping_value=None):
        if corpora is None:
            corpora = BilingualCorpus.list(
                os.path.join(self.engine.data_path,
                             TrainingPreprocessor.DEV_FOLDER_NAME))

        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        corpora = [
            corpus for corpus in corpora
            if source_lang in corpus.langs and target_lang in corpus.langs
        ]
        if len(corpora) == 0:
            raise IllegalArgumentException(
                'No %s > %s corpora found into specified path' %
                (source_lang, target_lang))

        source_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [source_lang]) for corpus in corpora
        ]
        reference_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [target_lang]) for corpus in corpora
        ]

        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization
            tokenizer = Tokenizer(target_lang)
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(
                    reference_corpora, tokenized_output)

            # Create merged corpus
            with cmdlogger.step('Merging corpus') as _:
                # source
                source_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + source_lang)

                with open(source_merged_corpus, 'wb') as out:
                    for corpus in source_corpora:
                        out.write(corpus.get_file(source_lang) + '\n')

                # target
                target_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + target_lang)
                fileutils.merge([
                    corpus.get_file(target_lang)
                    for corpus in reference_corpora
                ], target_merged_corpus)

            # Run MERT algorithm
            with cmdlogger.step('Tuning') as _:
                # Start MERT
                decoder_flags = ['--port', str(self.api.port)]

                if self.api.root is not None:
                    decoder_flags += ['--root', self.api.root]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    command = [
                        self._mert_script, source_merged_corpus,
                        target_merged_corpus, self._mert_i_script,
                        runtime_moses_ini.name, '--threads',
                        str(multiprocessing.cpu_count()), '--mertdir',
                        cli.BIN_DIR, '--mertargs',
                        '\'--binary --sctype BLEU\'', '--working-dir', mert_wd,
                        '--nbest', '100', '--decoder-flags',
                        '"' + ' '.join(decoder_flags) + '"', '--nonorm',
                        '--closest', '--no-filter-phrase-table'
                    ]

                    if early_stopping_value is not None:
                        command += [
                            '--bleuscorer', self._scorer_script,
                            '--bleuscorer-flags "-nt" --early-stopping-value %d'
                            % early_stopping_value
                        ]

                    if not random_seeds:
                        command.append('--predictable-seeds')
                    if max_iterations > 0:
                        command.append('--maximum-iterations={num}'.format(
                            num=max_iterations))

                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command),
                                      stdout=log,
                                      stderr=log)

            # Read optimized configuration
            with cmdlogger.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [
                                float(val) for val in tokens[1:]
                            ]
                        elif line.startswith('# BLEU'):
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                _ = self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            if not debug:
                self.engine.clear_tempdir("tuning")
Exemplo n.º 18
0
def main_sweep(argv):
    parser = argparse.ArgumentParser(
        description=
        'Sweep SA sample size and measure BLEU scores at various settings.')
    parser.add_argument(
        '-e',
        '--engine',
        dest='engine',
        help='the engine name, \'default\' will be used if absent',
        default=None)
    parser.add_argument(
        '--path',
        dest='corpora_path',
        metavar='CORPORA',
        default=None,
        help=
        'the path to the test corpora (default is the automatically splitted sample)'
    )
    args = parser.parse_args(argv)

    samples = [
        int(e) for e in
        '10 20 50 70 80 90 100 110 120 150 200 350 500 800 1000 2000 5000'.
        split()
    ]

    injector = dependency.DependencyInjector()
    #injector.read_args(args)
    engine = MMTEngine(args.engine)
    injector.inject(engine)

    node = ClusterNode(engine, api_port=DEFAULT_MMT_API_PORT)

    # more or less copy-pasted from mmt evaluate:

    evaluator = Evaluator(node, google_key='1234', use_sessions=True)

    corpora = BilingualCorpus.list(args.corpora_path) if args.corpora_path is not None \
        else BilingualCorpus.list(os.path.join(node.engine.data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

    lines = 0
    for corpus in corpora:
        lines += corpus.count_lines()

    # end copy-paste

    print('sample bleu')

    for sample in samples:
        node.engine.set_config_option('suffixarrays', 'sample', sample)
        injector.read_config(
            node.engine.config
        )  # to get engine.set() to affect MosesFeatures -> moses.ini
        injector.inject(node.engine)
        node.engine.write_configs()
        node.restart()

        scores = evaluator.evaluate(corpora=corpora,
                                    heval_output=None,
                                    debug=False)

        engine_scores = [r for r in scores if r.id == 'MMT'][0]

        if engine_scores.error:
            raise RuntimeError(engine_scores.error)

        bleu = engine_scores.bleu
        print(sample, '%.2f' % (bleu * 100))
Exemplo n.º 19
0
    def clean(self, corpora, dest_folder):
        langs = (self._source_lang, self._target_lang)

        _pool_exec(self._clean_file, [(corpus, dest_folder, langs) for corpus in corpora])
        return BilingualCorpus.list(dest_folder)
Exemplo n.º 20
0
    def clean(self, corpora, dest_folder):
        langs = (self._source_lang, self._target_lang)

        _pool_exec(self._clean_file, [(corpus, dest_folder, langs) for corpus in corpora])
        return BilingualCorpus.list(dest_folder)