Exemplo n.º 1
0
    def _preprocess(self, args, skip=False, log=None):
        """Populate ``args`` with the preprocessed bilingual/monolingual corpora.

        When ``skip`` is true the corpora are listed from the
        'preprocessed_corpora' temp folder through ``BilingualCorpus.splitlist``
        rather than being produced again. Otherwise the engine's training
        preprocessor is run over the bilingual and monolingual corpora carried
        by ``args``, writing into that same folder.

        The results are stored in ``args.processed_bilingual_corpora`` and
        ``args.processed_monolingual_corpora``.

        Raises CorpusNotFoundInFolderException when ``args`` carries no corpora
        at all and the step is not skipped.
        """
        out_dir = self._get_tempdir('preprocessed_corpora')

        if not skip:
            inputs = args.bilingual_corpora + args.monolingual_corpora
            if not inputs:
                message = (
                    "Could not find any valid %s -> %s segments in your input."
                    % (self._engine.source_lang, self._engine.target_lang))
                raise CorpusNotFoundInFolderException(message)

            # The engine data path is only forwarded when
            # self._split_trainingset is set.
            data_path = None
            if self._split_trainingset:
                data_path = self._engine.data_path

            preprocessor = self._engine.training_preprocessor
            outcome = preprocessor.process(
                inputs,
                out_dir,
                data_path=data_path,
                vb_path=self._engine.vocabulary_path,
                log=log)
        else:
            outcome = BilingualCorpus.splitlist(
                self._engine.source_lang,
                self._engine.target_lang,
                roots=out_dir)

        bilingual, monolingual = outcome
        args.processed_bilingual_corpora = bilingual
        args.processed_monolingual_corpora = monolingual
Exemplo n.º 2
0
    def _preprocess(self, args, skip=False, log=None):
        """Populate ``args`` with the preprocessed training/validation corpora.

        Everything lives under the 'preprocessed_corpora' temp folder, in the
        'train', 'validation' and 'extracted_validation' sub-folders.

        When ``skip`` is true, the train and validation corpora are simply
        listed from their folders. Otherwise ``args.corpora`` is run through
        the training preprocessor, the validation corpora are listed (from the
        extracted-validation folder when ``self._split_train`` is set, else
        from ``self._validation_path``) and preprocessed in turn.

        Results are stored in ``args.processed_train_corpora`` and
        ``args.processed_valid_corpora``.

        Raises CorpusNotFoundInFolderException when there are no input corpora
        or no validation corpora.
        """
        root = self._get_tempdir('preprocessed_corpora')
        train_dir = os.path.join(root, 'train')
        valid_dir = os.path.join(root, 'validation')
        extracted_valid_dir = os.path.join(root, 'extracted_validation')

        if skip:
            # Nothing to compute: pick up whatever is already in the folders.
            args.processed_train_corpora = BilingualCorpus.list(
                self.source_lang, self.target_lang, train_dir)
            args.processed_valid_corpora = BilingualCorpus.list(
                self.source_lang, self.target_lang, valid_dir)
            return

        if not args.corpora:
            raise CorpusNotFoundInFolderException(
                'Could not find any valid %s > %s segments in your input.'
                % (self.source_lang, self.target_lang))

        # Test/dev output paths are only passed on when self._split_train is set.
        if self._split_train:
            test_data_path = self._engine.test_data_path
            dev_data_path = extracted_valid_dir
        else:
            test_data_path = None
            dev_data_path = None

        args.processed_train_corpora = self._training_preprocessor.process(
            args.corpora,
            train_dir,
            log=log,
            test_data_path=test_data_path,
            dev_data_path=dev_data_path)

        # Fall back to the configured validation path when no dev path is used.
        validation_root = dev_data_path or self._validation_path
        validation_corpora = BilingualCorpus.list(
            self.source_lang, self.target_lang, validation_root)

        if not validation_corpora:
            raise CorpusNotFoundInFolderException(
                'Could not find any valid %s > %s segments for validation.'
                % (self.source_lang, self.target_lang))

        args.processed_valid_corpora = self._training_preprocessor.process(
            validation_corpora, valid_dir, log=log)
Exemplo n.º 3
0
    def _build(self, resume, listener):
        """Run every step of the training schedule, notifying ``listener``.

        Args:
            resume: when true, the schedule state is loaded from the
                'checkpoint.json' file in the training temp dir and the
                training log is opened in append mode; when false, a fresh
                checkpoint is stored, the engine folder is deleted and
                recreated, and the log is truncated. The engine folder is also
                recreated when it does not exist, regardless of ``resume``.
            listener: optional observer; when truthy it receives
                on_training_begin, on_hw_constraint_violated, on_step_begin,
                on_step_end and on_training_end calls. Step begin/end calls
                are only made for steps that are not hidden.

        Raises:
            CorpusNotFoundInFolderException: no bilingual corpora were found
                in ``self._roots``.
            Exception: anything raised during training is written to the
                training log and re-raised. KeyboardInterrupt and SystemExit
                propagate without being logged as unexpected.
        """
        self._temp_dir = self._engine.get_tempdir('training',
                                                  ensure=(not resume))

        checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
        if resume:
            self._schedule.load(checkpoint_path)
        else:
            self._schedule.store(checkpoint_path)

        source_lang = self._engine.source_lang
        target_lang = self._engine.target_lang

        # Separate bilingual and monolingual corpora, reading them from roots.
        bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(
            source_lang, target_lang, roots=self._roots)

        # Without bilingual corpora the translation system cannot be trained.
        if not bilingual_corpora:
            raise CorpusNotFoundInFolderException(
                'Could not find %s-%s corpora in path %s' %
                (source_lang.upper(), target_lang.upper(),
                 ', '.join(self._roots)))

        # Start from an empty engine folder unless we are resuming an
        # existing one.
        if not os.path.isdir(self._engine.path) or not resume:
            shutil.rmtree(self._engine.path, ignore_errors=True)
            os.makedirs(self._engine.path)

        # Route the logging output of the building activities to the training
        # log file; the same stream is handed to every step.
        log_file = self._engine.get_logfile('training', append=resume)
        log_stream = open(log_file, 'ab' if resume else 'wb')
        logging.basicConfig(
            format='%(asctime)-15s [%(levelname)s] - %(message)s',
            level=logging.DEBUG,
            stream=log_stream)
        logger = logging.getLogger('EngineBuilder')

        # Start the engine building (training) phases.
        try:
            logger.info(
                'Training started: engine=%s, bilingual=%d, monolingual=%d, langpair=%s-%s',
                self._engine.name, len(bilingual_corpora),
                len(monolingual_corpora), source_lang, target_lang)

            if listener:
                listener.on_training_begin(self._schedule.visible_steps(),
                                           self._engine, bilingual_corpora,
                                           monolingual_corpora)

            # A violated hardware constraint is not fatal: record it in the
            # log (so it is not lost when there is no listener) and go on.
            try:
                self._check_constraints()
            except EngineBuilder.HWConstraintViolated as e:
                logger.warning('Hardware constraint violated: %s', e.cause)
                if listener:
                    listener.on_hw_constraint_violated(e.cause)

            args = EngineBuilder.__Args()
            args.bilingual_corpora = bilingual_corpora
            args.monolingual_corpora = monolingual_corpora

            # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
            # Every step is invoked; steps the schedule marks as completed
            # are called with skip=True.
            for step_index, method in enumerate(self._schedule, 1):
                skip = self._schedule.is_completed(method.id)

                if listener and not method.is_hidden():
                    listener.on_step_begin(method.id, method.name)

                logger.info('Training step "%s" (%d/%d) started',
                            method.id, step_index, len(self._schedule))

                start_time = time.time()
                method(self,
                       args,
                       skip=skip,
                       log=log_stream,
                       delete_on_exit=self._delete_on_exit)
                elapsed_time = time.time() - start_time

                if listener and not method.is_hidden():
                    listener.on_step_end(method.id, method.name)

                logger.info('Training step "%s" completed in %d s',
                            method.id, int(elapsed_time))

                # Persist progress after each step so a later run can resume.
                self._schedule.step_completed(method.id)
                self._schedule.store(checkpoint_path)

            if listener:
                listener.on_training_end(self._engine)

            if self._delete_on_exit:
                self._engine.clear_tempdir('training')
        except Exception:
            # Only real errors are reported as unexpected; interpreter-exit
            # signals (KeyboardInterrupt, SystemExit) pass straight through.
            logger.exception('Unexpected exception')
            raise
        finally:
            log_stream.close()
Exemplo n.º 4
0
    def _build(self, resume):
        """Run every step of the training schedule, printing progress to stdout.

        This is Python 2 code (it relies on ``print`` statements).

        Args:
            resume: when true, the schedule state is loaded from the
                'checkpoint.json' file in the training temp dir and the
                training log is opened in append mode; when false, a fresh
                checkpoint is stored, the engine folder is deleted and
                recreated, and the log is truncated. The engine folder is also
                recreated when it does not exist, regardless of ``resume``.

        Raises:
            CorpusNotFoundInFolderException: no corpora were found in
                ``self.roots`` for the configured language pair.
            Exception: any error raised during training is written to the
                training log and re-raised.
        """
        self._temp_dir = self._engine.get_tempdir('training',
                                                  ensure=(not resume))

        # Load the previous schedule state when resuming, otherwise write a
        # fresh checkpoint.
        checkpoint_path = os.path.join(self._temp_dir, 'checkpoint.json')
        if resume:
            self._schedule.load(checkpoint_path)
        else:
            self._schedule.store(checkpoint_path)

        corpora = BilingualCorpus.list(self.source_lang, self.target_lang,
                                       self.roots)

        # Nothing to train on: fail before touching the engine folder.
        if len(corpora) == 0:
            raise CorpusNotFoundInFolderException(
                'Could not find %s > %s corpora in path %s' %
                (self.source_lang, self.target_lang, ', '.join(self.roots)))

        # if no old engines (i.e. engine folders) can be found, create a new one from scratch
        # if we are not trying to resume an old one, create from scratch anyway
        if not os.path.isdir(self._engine.path) or not resume:
            shutil.rmtree(self._engine.path, ignore_errors=True)
            os.makedirs(self._engine.path)

        # Route the logging output of the building activities to the training
        # log file (appended to when resuming, truncated otherwise); the same
        # stream is also handed to every step.
        log_file = self._engine.get_logfile('training', append=resume)
        log_stream = open(log_file, 'ab' if resume else 'wb')
        logging.basicConfig(
            format='%(asctime)-15s [%(levelname)s] - %(message)s',
            level=logging.DEBUG,
            stream=log_stream)
        logger = logging.getLogger('EngineBuilder')

        # Start the engine building (training) phases
        # steps_count is the number of visible steps (used for the console
        # counter); log_line_len is the width each progress line is padded to.
        steps_count = len(self._schedule.visible_steps())
        log_line_len = 70

        try:
            logger.log(
                logging.INFO,
                'Training started: engine=%s, corpora=%d, lang_pair=%s-%s' %
                (self._engine.name, len(corpora), self.source_lang,
                 self.target_lang))

            # Console banner summarising what is about to be trained.
            print '\n=========== TRAINING STARTED ===========\n'
            print 'ENGINE:  %s' % self._engine.name
            print 'CORPORA: %d corpora' % len(corpora)
            print 'LANGS:   %s > %s' % (self.source_lang, self.target_lang)
            print

            # Check if all requirements are fulfilled before actual engine training
            # A violated hardware constraint only produces a console warning
            # (ANSI-coloured); training goes on.
            try:
                self._check_constraints()
            except EngineBuilder.HWConstraintViolated as e:
                print '\033[91mWARNING\033[0m: %s\n' % e.cause

            args = EngineBuilder.__Args()
            args.corpora = corpora

            # ~~~~~~~~~~~~~~~~~~~~~ RUN ALL STEPS ~~~~~~~~~~~~~~~~~~~~~
            # Every step is invoked; steps the schedule marks as completed are
            # called with skip=True.

            # Console counter; only advanced for steps that are not hidden.
            step_index = 1

            for method in self._schedule:
                if not method.is_hidden():
                    # Python 2 print statement: the trailing comma suppresses
                    # the newline, so the 'DONE (in ...)' printed after the
                    # step ends up on the same console line.
                    print('INFO: (%d of %d) %s... ' %
                          (step_index, steps_count,
                           method.name)).ljust(log_line_len),

                skip = self._schedule.is_completed(method.id)
                # Stored on the instance; not read anywhere in this method.
                self._step_start_time = time.time()

                # NOTE(review): step_index counts visible steps only, while the
                # denominator here is len(self._schedule); the two may disagree
                # if the schedule contains hidden steps - confirm.
                logger.log(
                    logging.INFO, 'Training step "%s" (%d/%d) started' %
                    (method.id, step_index, len(self._schedule)))

                start_time = time.time()
                method(self,
                       args,
                       skip=skip,
                       log=log_stream,
                       delete_on_exit=self._delete_on_exit)
                elapsed_time_str = self._pretty_print_time(time.time() -
                                                           start_time)

                if not method.is_hidden():
                    step_index += 1
                    print 'DONE (in %s)' % elapsed_time_str

                logger.log(
                    logging.INFO, 'Training step "%s" completed in %s' %
                    (method.id, elapsed_time_str))

                # Persist progress after each step so a later run can resume.
                self._schedule.step_completed(method.id)
                self._schedule.store(checkpoint_path)

            print '\n=========== TRAINING SUCCESS ===========\n'
            print 'You can now start, stop or check the status of the server with command:'
            # The '-e <name>' option is only shown for engines not named 'default'.
            print '\t./mmt start|stop|status ' + ('' if self._engine.name
                                                  == 'default' else '-e %s' %
                                                  self._engine.name)
            print

            if self._delete_on_exit:
                self._engine.clear_tempdir('training')
        except Exception:
            # Record the traceback in the training log, then let the error
            # propagate to the caller.
            logger.exception('Unexpected exception')
            raise
        finally:
            # Always release the log file, whether training succeeded or not.
            log_stream.close()