Example #1
    def _step_init(self,
                   bilingual_corpora,
                   monolingual_corpora,
                   skip=False,
                   logger=None,
                   delete_on_exit=False):
        training_folder = self._get_tempdir('training_corpora')

        # if skip is true, then we are in resume mode, so return the already existing results
        if skip:
            bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
                self._engine.source_lang,
                self._engine.target_lang,
                roots=training_folder)
        # otherwise perform the baseline domain extraction and domain mapping, and return the results
        else:
            domains = self._engine.db.insert(bilingual_corpora)

            bilingual_corpora = [
                domain.corpus.symlink(training_folder, name=str(domain.id))
                for domain in domains
            ]
            monolingual_corpora = [
                corpus.symlink(training_folder)
                for corpus in monolingual_corpora
            ]

        return bilingual_corpora, monolingual_corpora
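This step follows the skip/resume pattern used throughout the builder: on a fresh run it materializes its inputs (domain insertion plus symlinks into the training folder), while on resume (skip=True) it merely re-lists the results a previous attempt left on disk. A minimal standalone sketch of the pattern, with illustrative names that are not part of the MMT API:

    import os

    def step_symlink_corpora(files, folder, skip=False):
        # on resume, just re-list the links created by a previous attempt
        if skip:
            return [os.path.join(folder, name) for name in sorted(os.listdir(folder))]

        # on a fresh run, symlink every input file into the step folder
        links = []
        for path in files:
            link = os.path.join(folder, os.path.basename(path))
            os.symlink(os.path.abspath(path), link)
            links.append(link)
        return links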
Example #2
    def _step_preprocess(self,
                         bilingual_corpora,
                         monolingual_corpora,
                         _,
                         skip=False,
                         logger=None,
                         delete_on_exit=False):
        preprocessed_folder = self._get_tempdir('preprocessed')
        cleaned_folder = self._get_tempdir('clean_corpora')

        # if skip is true, then we are in resume mode, so return the already existing results
        if skip:
            processed_bicorpora, processed_monocorpora = BilingualCorpus.splitlist(
                self._engine.source_lang,
                self._engine.target_lang,
                roots=preprocessed_folder)
            cleaned_bicorpora = BilingualCorpus.list(cleaned_folder)
        else:
            processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                bilingual_corpora + monolingual_corpora,
                preprocessed_folder,
                (self._engine.data_path if self._split_trainingset else None),
                log=logger.stream)
            cleaned_bicorpora = self._engine.training_preprocessor.clean(
                processed_bicorpora, cleaned_folder)

        return processed_bicorpora, processed_monocorpora, cleaned_bicorpora
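Both steps rebuild their outputs with BilingualCorpus.splitlist, whose implementation is not shown here. Judging from the call sites, a plausible reading is that corpora providing files for both languages come back as bilingual and the rest as monolingual. A sketch under that assumption (file layout and helper name are illustrative):

    import glob
    import os
    from collections import defaultdict

    def split_corpora(source_lang, target_lang, root):
        # group files such as 'europarl.en' and 'europarl.it' by corpus name
        langs_by_name = defaultdict(set)
        for path in glob.glob(os.path.join(root, '*.*')):
            name, ext = os.path.splitext(os.path.basename(path))
            langs_by_name[name].add(ext.lstrip('.'))

        pair = {source_lang, target_lang}
        bilingual = sorted(n for n, ls in langs_by_name.items() if pair <= ls)
        monolingual = sorted(n for n, ls in langs_by_name.items() if not pair <= ls)
        return bilingual, monolingual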
Example #3
    def process(self, corpora, output_path, data_path=None):
        args = [
            '-s', self._source_lang, '-t', self._target_lang, '-v',
            self._vocabulary_path, '--output', output_path, '--input'
        ]

        input_paths = {corpus.get_folder() for corpus in corpora}

        for root in input_paths:
            args.append(root)

        if data_path is not None:
            args.append('--dev')
            args.append(
                os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
            args.append('--test')
            args.append(
                os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command,
                      stdin=shell.DEVNULL,
                      stdout=shell.DEVNULL,
                      stderr=shell.DEVNULL)

        return BilingualCorpus.splitlist(self._source_lang,
                                         self._target_lang,
                                         roots=output_path)
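mmt_javamain and shell.execute are MMT helpers. The same argv-building and silenced-execution pattern can be reproduced with the standard library alone; a sketch assuming a plain 'java -cp' invocation (MMT's real classpath handling may differ):

    import os
    import subprocess

    def run_java_main(main_class, classpath, args):
        command = ['java', '-cp', classpath, main_class] + list(args)
        with open(os.devnull, 'w') as devnull:
            # discard all I/O, as shell.execute does with shell.DEVNULL above
            subprocess.check_call(command, stdin=devnull, stdout=devnull, stderr=devnull)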
Example #4
    def clean(self, source, target, input_paths, output_path):
        args = ['-s', source, '-t', target, '--output', output_path, '--input']

        for root in input_paths:
            args.append(root)

        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.splitlist(source, target, roots=output_path)[0]
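Note the trailing [0]: splitlist returns a (bilingual, monolingual) pair, and clean keeps only the bilingual half, which is all a cleaning run should produce. A hypothetical call, with illustrative languages and paths:

    # 'preprocessor' stands in for a TrainingPreprocessor-like instance (hypothetical)
    cleaned_bicorpora = preprocessor.clean('en', 'it',
                                           input_paths=['/data/corpora'],
                                           output_path='/tmp/clean_corpora')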
Example #5
    def _make_training_folder(self, bilingual_corpora, monolingual_corpora, domains, folder):
        for corpus in bilingual_corpora:
            dest_corpus = BilingualCorpus.make_parallel(domains[corpus.name], folder, corpus.langs)

            for lang in corpus.langs:
                os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

        for corpus in monolingual_corpora:
            dest_corpus = BilingualCorpus.make_parallel(corpus.name, folder, corpus.langs)

            for lang in corpus.langs:
                os.symlink(corpus.get_file(lang), dest_corpus.get_file(lang))

        return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=folder)
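make_parallel and get_file suggest a one-file-per-language layout in which both sides of a corpus share its name. A sketch of that assumed layout (illustrative, not the MMT implementation):

    import os

    def parallel_files(name, folder, langs):
        # parallel_files('europarl', '/tmp/train', ['en', 'it'])
        # -> {'en': '/tmp/train/europarl.en', 'it': '/tmp/train/europarl.it'}
        return {lang: os.path.join(folder, '%s.%s' % (name, lang)) for lang in langs}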
Example #6
    def process(self, source, target, input_paths, output_path, data_path=None):
        args = ['-s', source, '-t', target, '--output', output_path, '--input']

        for root in input_paths:
            args.append(root)

        if data_path is not None:
            args.append('--dev')
            args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
            args.append('--test')
            args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.splitlist(source, target, roots=output_path)
Example #7
    def process(self, corpora, output_path, data_path=None):
        args = ['-s', self._source_lang, '-t', self._target_lang, '-v', self._vocabulary_path, '--output', output_path,
                '--input']

        input_paths = {corpus.get_folder() for corpus in corpora}

        for root in input_paths:
            args.append(root)

        if data_path is not None:
            args.append('--dev')
            args.append(os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME))
            args.append('--test')
            args.append(os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME))

        command = mmt_javamain(self._java_mainclass, args)
        shell.execute(command, stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
Example #8
    def _build(self, resume=False):

        self._temp_dir = self._engine.get_tempdir('training',
                                                  ensure=(not resume))
        # initialize the checkpoint manager
        self._checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
        self._passed_steps = []

        if resume:
            self.load_checkpoint()
        else:
            self.save_checkpoint()

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        # split the corpora found under the given roots into bilingual and monolingual lists
        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            source_lang, target_lang, roots=self._roots)
        # if no bilingual corpora are found, it is not possible to train the translation system
        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'your project does not include %s-%s data.' %
                (source_lang.upper(), target_lang.upper()))

        # recreate the engine folder from scratch if it does not exist
        # or if we are not resuming a previous training
        if not os.path.isdir(self._engine.path) or not resume:
            shutil.rmtree(self._engine.path, ignore_errors=True)
            os.makedirs(self._engine.path)

        # Check if all requirements are fulfilled before launching engine training
        self._check_constraints()

        # Create a new logger for the building activities, passing it
        # the number of steps to perform (plus one mandatory step)
        # and the name of the log file to create
        logger = _builder_logger(
            len(self._scheduled_steps) + 1,
            self._engine.get_logfile('training'))
        delete_on_exit = not self._debug
        # Start the engine building (training) phases
        try:
            # tell the logger that the engine training has started
            logger.start(self._engine, bilingual_corpora, monolingual_corpora)

            # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
            # Note: if resume is true, a step is only run if it did not complete in the previous attempt

            # run the tm_cleanup step on bilingual_corpora if required;
            # obtain the cleaned bicorpora
            cleaned_bicorpora = self._run_step('tm_cleanup',
                                               self._step_tm_cleanup,
                                               logger=logger,
                                               values=[bilingual_corpora],
                                               delete_on_exit=delete_on_exit)

            # run the __db_map step (always performed: the user cannot skip it)
            # on the cleaned bicorpora and the original monocorpora;
            # obtain base bicorpora and base monocorpora
            base_bicorpora, base_monocorpora = self._run_step(
                '__db_map',
                self._step_init,
                forced=True,
                values=[cleaned_bicorpora, monolingual_corpora],
                delete_on_exit=delete_on_exit)

            # run preprocess step if required.
            # Return processed bi and mono corpora and cleaned bicorpora
            processed_bicorpora, processed_monocorpora, cleaned_bicorpora = \
                self._run_step('preprocess',
                               self._step_preprocess,
                               logger=logger,
                               values=[base_bicorpora, base_monocorpora, base_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run the context_analyzer step on base_bicorpora if required.
            _ = self._run_step('context_analyzer',
                               self._step_context_analyzer,
                               logger=logger,
                               values=[base_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run the aligner step on cleaned_bicorpora if required.
            _ = self._run_step('aligner',
                               self._step_aligner,
                               logger=logger,
                               values=[cleaned_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run the tm step on cleaned_bicorpora if required.
            _ = self._run_step('tm',
                               self._step_tm,
                               logger=logger,
                               values=[cleaned_bicorpora],
                               delete_on_exit=delete_on_exit)

            # run lm step on the joint list of processed_bicorpora and processed_monocorpora
            _ = self._run_step(
                'lm',
                self._step_lm,
                logger=logger,
                values=[processed_bicorpora + processed_monocorpora],
                delete_on_exit=delete_on_exit)

            # Writing config files
            with logger.step('Writing config files') as _:
                self._engine.write_configs()

            # tell the logger that the engine training has completed
            logger.completed()

            # if this is not debug mode, then the training temporary folder must be deleted
            if not self._debug:
                self._engine.clear_tempdir('training')
        except:
            logger.error()
            raise
        finally:
            logger.close()
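load_checkpoint and save_checkpoint are not shown. Given the checkpoint.json path and the _passed_steps list above, a minimal JSON-based implementation of the same idea could look like this (assumed file shape, illustrative only):

    import json

    def save_checkpoint(path, passed_steps):
        # persist the names of the steps that have completed so far
        with open(path, 'w') as f:
            json.dump({'passed_steps': passed_steps}, f)

    def load_checkpoint(path):
        # restore the list so that completed steps can be skipped on resume
        with open(path) as f:
            return json.load(f).get('passed_steps', [])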
Example #9
    def build(self, roots, debug=False, steps=None, split_trainingset=True):
        self._temp_dir = self._engine.get_tempdir('training', ensure=True)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

        if steps is None:
            steps = self._engine.training_steps
        else:
            unknown_steps = [step for step in steps if step not in self._engine.training_steps]
            if len(unknown_steps) > 0:
                raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

        cmdlogger = _builder_logger(len(steps) + 1)
        cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora)

        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

        # Check disk space and memory constraints
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                      (recommended_mem / self.__GB, free_memory / self.__GB)
            if free_space_on_disk < recommended_disk:
                print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                      (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
            print

        try:
            unprocessed_bicorpora = bilingual_corpora
            unprocessed_monocorpora = monolingual_corpora

            # TM draft-translations cleanup
            if 'tm_cleanup' in steps:
                with cmdlogger.step('TMs clean-up') as _:
                    unprocessed_bicorpora = self._engine.cleaner.clean(
                        unprocessed_bicorpora, self._get_tempdir('clean_tms')
                    )

            cleaned_bicorpora = unprocessed_bicorpora
            processed_bicorpora = unprocessed_bicorpora
            processed_monocorpora = unprocessed_monocorpora

            # Preprocessing
            if 'preprocess' in steps:
                with cmdlogger.step('Corpora preprocessing') as _:
                    unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                        unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora')
                    )

                    processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                        unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                        (self._engine.data_path if split_trainingset else None)
                    )

                    cleaned_bicorpora = self._engine.training_preprocessor.clean(
                        processed_bicorpora, self._get_tempdir('clean_corpora')
                    )

            # Training Context Analyzer
            if 'context_analyzer' in steps:
                with cmdlogger.step('Context Analyzer training') as _:
                    log_file = self._engine.get_logfile('training.context')
                    self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file)

            # Aligner
            if 'aligner' in steps:
                with cmdlogger.step('Aligner training') as _:
                    log_file = self._engine.get_logfile('training.aligner')
                    working_dir = self._get_tempdir('aligner')

                    self._engine.aligner.build(cleaned_bicorpora, working_dir, log_file)

            # Training Translation Model
            if 'tm' in steps:
                with cmdlogger.step('Translation Model training') as _:
                    working_dir = self._get_tempdir('tm')
                    log_file = self._engine.get_logfile('training.tm')
                    self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log_file)

            # Training Adaptive Language Model
            if 'lm' in steps:
                with cmdlogger.step('Language Model training') as _:
                    working_dir = self._get_tempdir('lm')
                    log_file = self._engine.get_logfile('training.lm')
                    self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang,
                                          working_dir, log_file)

            # Writing config files
            with cmdlogger.step('Writing config files') as _:
                self._engine.write_configs()

            cmdlogger.completed()
        finally:
            if not debug:
                self._engine.clear_tempdir('training')
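The sizing heuristic above recommends 1 GB of RAM for every 350 MB of corpus on disk, and ten times the corpus size in free storage. A worked instance with illustrative numbers:

    MB = 1024 ** 2
    GB = 1024 ** 3

    corpus_size_on_disk = 700 * MB
    recommended_mem = GB * corpus_size_on_disk // (350 * MB)  # == 2 * GB of RAM
    recommended_disk = 10 * corpus_size_on_disk               # ~6.8 GB of storage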
Example #10
    def build(self, roots, debug=False, steps=None, split_trainingset=True):
        self._temp_dir = self._engine.get_tempdir('training', ensure=True)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            source_lang, target_lang, roots=roots)

        if len(bilingual_corpora) == 0:
            raise IllegalArgumentException(
                'your project does not include %s-%s data.' %
                (source_lang.upper(), target_lang.upper()))

        if steps is None:
            steps = self._engine.training_steps
        else:
            unknown_steps = [
                step for step in steps
                if step not in self._engine.training_steps
            ]
            if len(unknown_steps) > 0:
                raise IllegalArgumentException('Unknown training steps: ' +
                                               str(unknown_steps))

        shutil.rmtree(self._engine.path, ignore_errors=True)
        os.makedirs(self._engine.path)

        # Check disk space and memory constraints
        free_space_on_disk = fileutils.df(self._engine.path)[2]
        corpus_size_on_disk = 0
        for root in roots:
            corpus_size_on_disk += fileutils.du(root)
        free_memory = fileutils.free()

        recommended_mem = self.__GB * corpus_size_on_disk / (
            350 * self.__MB)  # 1G RAM every 350M on disk
        recommended_disk = 10 * corpus_size_on_disk

        if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
            if free_memory < recommended_mem:
                print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                      (recommended_mem / self.__GB, free_memory / self.__GB)
            if free_space_on_disk < recommended_disk:
                print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                      (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
            print

        logger = _builder_logger(
            len(steps) + 1, self._engine.get_logfile('training'))

        try:
            logger.start(self._engine, bilingual_corpora, monolingual_corpora)

            unprocessed_bicorpora = bilingual_corpora
            unprocessed_monocorpora = monolingual_corpora

            # TM draft-translations cleanup
            if 'tm_cleanup' in steps:
                with logger.step('TMs clean-up') as _:
                    unprocessed_bicorpora = self._engine.cleaner.clean(
                        unprocessed_bicorpora,
                        self._get_tempdir('clean_tms'),
                        log=logger.stream)

            cleaned_bicorpora = unprocessed_bicorpora
            processed_bicorpora = unprocessed_bicorpora
            processed_monocorpora = unprocessed_monocorpora

            # Preprocessing
            if 'preprocess' in steps:
                with logger.step('Corpora preprocessing') as _:
                    unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                        unprocessed_bicorpora,
                        unprocessed_monocorpora,
                        self._get_tempdir('training_corpora'),
                        log=logger.stream)

                    processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                        unprocessed_bicorpora + unprocessed_monocorpora,
                        self._get_tempdir('preprocessed'),
                        (self._engine.data_path
                         if split_trainingset else None),
                        log=logger.stream)

                    cleaned_bicorpora = self._engine.training_preprocessor.clean(
                        processed_bicorpora,
                        self._get_tempdir('clean_corpora'))

            # Training Context Analyzer
            if 'context_analyzer' in steps:
                with logger.step('Context Analyzer training') as _:
                    self._engine.analyzer.create_index(unprocessed_bicorpora,
                                                       log=logger.stream)

            # Aligner
            if 'aligner' in steps:
                with logger.step('Aligner training') as _:
                    working_dir = self._get_tempdir('aligner')
                    self._engine.aligner.build(cleaned_bicorpora,
                                               working_dir,
                                               log=logger.stream)

            # Training Translation Model
            if 'tm' in steps:
                with logger.step('Translation Model training') as _:
                    working_dir = self._get_tempdir('tm')
                    self._engine.pt.train(cleaned_bicorpora,
                                          self._engine.aligner,
                                          working_dir,
                                          log=logger.stream)

            # Training Adaptive Language Model
            if 'lm' in steps:
                with logger.step('Language Model training') as _:
                    working_dir = self._get_tempdir('lm')
                    self._engine.lm.train(processed_bicorpora +
                                          processed_monocorpora,
                                          target_lang,
                                          working_dir,
                                          log=logger.stream)

            # Writing config files
            with logger.step('Writing config files') as _:
                self._engine.write_configs()

            logger.completed()
        except:
            logger.error()
            raise
        finally:
            logger.close()
            if not debug:
                self._engine.clear_tempdir('training')
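Both build variants drive their phases through logger.step(...) used as a context manager. The MMT logger itself is not shown; a minimal context manager with the same shape, as an illustrative sketch:

    import time
    from contextlib import contextmanager

    @contextmanager
    def step(name):
        # announce the phase, time it, and report failures before re-raising
        start = time.time()
        print('> %s...' % name)
        try:
            yield
            print('> %s done in %.1fs' % (name, time.time() - start))
        except Exception:
            print('> %s FAILED' % name)
            raise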