def _check_constraints(self):
    free_space_on_disk = fileutils.df(self._engine.path)[2]

    corpus_size_on_disk = 0
    for root in self._roots:
        corpus_size_on_disk += fileutils.du(root)

    free_memory = fileutils.free()

    recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                  (recommended_mem / self.__GB, free_memory / self.__GB)
        if free_space_on_disk < recommended_disk:
            print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                  (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
        print
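
# A minimal worked example of the sizing heuristic above: 1G of RAM for
# every 350M of corpus on disk, and 10x the corpus size in free storage.
# The byte constants here are assumptions; __GB/__MB are defined elsewhere
# in the class and are presumed to be the usual binary sizes.
_MB = 1024 ** 2
_GB = 1024 ** 3

corpus_size_on_disk = 700 * _MB  # e.g. a 700M corpus

recommended_mem = _GB * corpus_size_on_disk / (350 * _MB)  # 2 * _GB -> 2G of RAM
recommended_disk = 10 * corpus_size_on_disk                # 7000M -> ~6.8G of storage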
def _check_constraints(self):
    free_space_on_disk = fileutils.df(self._engine.path)[2]

    corpus_size_on_disk = 0
    for root in self._roots:
        corpus_size_on_disk += fileutils.du(root)

    free_memory = fileutils.free()

    recommended_mem = self._GB * corpus_size_on_disk / (350 * self._MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            raise EngineBuilder.HWConstraintViolated(
                'more than %.fG of RAM recommended, only %.fG available' %
                (recommended_mem / self._GB, free_memory / self._GB))
        if free_space_on_disk < recommended_disk:
            raise EngineBuilder.HWConstraintViolated(
                'more than %.fG of storage recommended, only %.fG available' %
                (recommended_disk / self._GB, free_space_on_disk / self._GB))
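
# The raise above references EngineBuilder.HWConstraintViolated, which is
# not defined in this excerpt. A minimal sketch of how such a nested
# exception type could look; the class body is an assumption, only the
# name comes from the code above.
class EngineBuilder(object):
    class HWConstraintViolated(Exception):
        def __init__(self, cause):
            super(EngineBuilder.HWConstraintViolated, self).__init__(cause)
            self.cause = cause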
def build(self, roots, debug=False, steps=None, split_trainingset=True):
    self._temp_dir = self._engine.get_tempdir('training', ensure=True)

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    if steps is None:
        steps = self._engine.training_steps
    else:
        unknown_steps = [step for step in steps if step not in self._engine.training_steps]
        if len(unknown_steps) > 0:
            raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

    cmdlogger = _builder_logger(len(steps) + 1)
    cmdlogger.start(self._engine, bilingual_corpora, monolingual_corpora)

    shutil.rmtree(self._engine.path, ignore_errors=True)
    os.makedirs(self._engine.path)

    # Check disk space constraints
    free_space_on_disk = fileutils.df(self._engine.path)[2]
    corpus_size_on_disk = 0
    for root in roots:
        corpus_size_on_disk += fileutils.du(root)
    free_memory = fileutils.free()

    recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                  (recommended_mem / self.__GB, free_memory / self.__GB)
        if free_space_on_disk < recommended_disk:
            print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                  (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
        print

    try:
        unprocessed_bicorpora = bilingual_corpora
        unprocessed_monocorpora = monolingual_corpora

        # TM draft-translations cleanup
        if 'tm_cleanup' in steps:
            with cmdlogger.step('TMs clean-up') as _:
                unprocessed_bicorpora = self._engine.cleaner.clean(
                    unprocessed_bicorpora, self._get_tempdir('clean_tms'))

        cleaned_bicorpora = unprocessed_bicorpora
        processed_bicorpora = unprocessed_bicorpora
        processed_monocorpora = unprocessed_monocorpora

        # Preprocessing
        if 'preprocess' in steps:
            with cmdlogger.step('Corpora preprocessing') as _:
                unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                    unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora'))

                processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                    unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                    (self._engine.data_path if split_trainingset else None))

                cleaned_bicorpora = self._engine.training_preprocessor.clean(
                    processed_bicorpora, self._get_tempdir('clean_corpora'))

        # Training Context Analyzer
        if 'context_analyzer' in steps:
            with cmdlogger.step('Context Analyzer training') as _:
                log_file = self._engine.get_logfile('training.context')
                self._engine.analyzer.create_index(unprocessed_bicorpora, source_lang, log_file=log_file)

        # Aligner
        if 'aligner' in steps:
            with cmdlogger.step('Aligner training') as _:
                log_file = self._engine.get_logfile('training.aligner')
                working_dir = self._get_tempdir('aligner')
                self._engine.aligner.build(cleaned_bicorpora, working_dir, log_file)

        # Training Translation Model
        if 'tm' in steps:
            with cmdlogger.step('Translation Model training') as _:
                working_dir = self._get_tempdir('tm')
                log_file = self._engine.get_logfile('training.tm')
                self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log_file)

        # Training Adaptive Language Model
        if 'lm' in steps:
            with cmdlogger.step('Language Model training') as _:
                working_dir = self._get_tempdir('lm')
                log_file = self._engine.get_logfile('training.lm')
                self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang, working_dir, log_file)

        # Writing config file
        with cmdlogger.step('Writing config files') as _:
            self._engine.write_configs()

        cmdlogger.completed()
    finally:
        if not debug:
            self._engine.clear_tempdir('training')
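
# `build` drives a _builder_logger whose step() method is used as a context
# manager ("with cmdlogger.step(...) as _"). A minimal sketch of that shape,
# assuming the logger only prints progress to stdout; the real class is not
# shown in this excerpt and may do more (e.g. timing, log files).
from contextlib import contextmanager

class _builder_logger(object):
    def __init__(self, steps_count):
        self._steps_count = steps_count
        self._current_step = 0

    def start(self, engine, bilingual_corpora, monolingual_corpora):
        print '\n=========== TRAINING STARTED ===========\n'

    @contextmanager
    def step(self, step_name):
        # Announce the step, hand control back to the caller, then confirm.
        self._current_step += 1
        print 'INFO: (%d of %d) %s...' % (self._current_step, self._steps_count, step_name)
        yield self
        print 'INFO: %s DONE' % step_name

    def completed(self):
        print '\n=========== TRAINING SUCCESS ===========\n'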
def build(self, roots, debug=False, steps=None, split_trainingset=True):
    self._temp_dir = self._engine.get_tempdir('training', ensure=True)

    source_lang = self._engine.source_lang
    target_lang = self._engine.target_lang

    bilingual_corpora, monolingual_corpora = BilingualCorpus.splitlist(source_lang, target_lang, roots=roots)

    if len(bilingual_corpora) == 0:
        raise IllegalArgumentException(
            'your project does not include %s-%s data.' % (source_lang.upper(), target_lang.upper()))

    if steps is None:
        steps = self._engine.training_steps
    else:
        unknown_steps = [step for step in steps if step not in self._engine.training_steps]
        if len(unknown_steps) > 0:
            raise IllegalArgumentException('Unknown training steps: ' + str(unknown_steps))

    shutil.rmtree(self._engine.path, ignore_errors=True)
    os.makedirs(self._engine.path)

    # Check disk space constraints
    free_space_on_disk = fileutils.df(self._engine.path)[2]
    corpus_size_on_disk = 0
    for root in roots:
        corpus_size_on_disk += fileutils.du(root)
    free_memory = fileutils.free()

    recommended_mem = self.__GB * corpus_size_on_disk / (350 * self.__MB)  # 1G RAM every 350M on disk
    recommended_disk = 10 * corpus_size_on_disk

    if free_memory < recommended_mem or free_space_on_disk < recommended_disk:
        if free_memory < recommended_mem:
            print '> WARNING: more than %.fG of RAM recommended, only %.fG available' % \
                  (recommended_mem / self.__GB, free_memory / self.__GB)
        if free_space_on_disk < recommended_disk:
            print '> WARNING: more than %.fG of storage recommended, only %.fG available' % \
                  (recommended_disk / self.__GB, free_space_on_disk / self.__GB)
        print

    logger = _builder_logger(len(steps) + 1, self._engine.get_logfile('training'))

    try:
        logger.start(self._engine, bilingual_corpora, monolingual_corpora)

        unprocessed_bicorpora = bilingual_corpora
        unprocessed_monocorpora = monolingual_corpora

        # TM draft-translations cleanup
        if 'tm_cleanup' in steps:
            with logger.step('TMs clean-up') as _:
                unprocessed_bicorpora = self._engine.cleaner.clean(
                    unprocessed_bicorpora, self._get_tempdir('clean_tms'), log=logger.stream)

        cleaned_bicorpora = unprocessed_bicorpora
        processed_bicorpora = unprocessed_bicorpora
        processed_monocorpora = unprocessed_monocorpora

        # Preprocessing
        if 'preprocess' in steps:
            with logger.step('Corpora preprocessing') as _:
                unprocessed_bicorpora, unprocessed_monocorpora = self._engine.db.generate(
                    unprocessed_bicorpora, unprocessed_monocorpora, self._get_tempdir('training_corpora'),
                    log=logger.stream)

                processed_bicorpora, processed_monocorpora = self._engine.training_preprocessor.process(
                    unprocessed_bicorpora + unprocessed_monocorpora, self._get_tempdir('preprocessed'),
                    (self._engine.data_path if split_trainingset else None), log=logger.stream)

                cleaned_bicorpora = self._engine.training_preprocessor.clean(
                    processed_bicorpora, self._get_tempdir('clean_corpora'))

        # Training Context Analyzer
        if 'context_analyzer' in steps:
            with logger.step('Context Analyzer training') as _:
                self._engine.analyzer.create_index(unprocessed_bicorpora, log=logger.stream)

        # Aligner
        if 'aligner' in steps:
            with logger.step('Aligner training') as _:
                working_dir = self._get_tempdir('aligner')
                self._engine.aligner.build(cleaned_bicorpora, working_dir, log=logger.stream)

        # Training Translation Model
        if 'tm' in steps:
            with logger.step('Translation Model training') as _:
                working_dir = self._get_tempdir('tm')
                self._engine.pt.train(cleaned_bicorpora, self._engine.aligner, working_dir, log=logger.stream)

        # Training Adaptive Language Model
        if 'lm' in steps:
            with logger.step('Language Model training') as _:
                working_dir = self._get_tempdir('lm')
                self._engine.lm.train(processed_bicorpora + processed_monocorpora, target_lang, working_dir,
                                      log=logger.stream)

        # Writing config file
        with logger.step('Writing config files') as _:
            self._engine.write_configs()

        logger.completed()
    except:
        logger.error()
        raise
    finally:
        logger.close()

        if not debug:
            self._engine.clear_tempdir('training')
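
# A hypothetical usage sketch of the builder above. The EngineBuilder
# constructor signature and the corpora path are assumptions for
# illustration; `engine` stands for an already-configured engine instance.
builder = EngineBuilder(engine)  # hypothetical: engine created/configured elsewhere
builder.build(roots=['/path/to/training/corpora'],  # scanned via BilingualCorpus.splitlist
              debug=False,             # when True, the 'training' tempdir is kept for inspection
              steps=None,              # None -> run all engine.training_steps
              split_trainingset=True)  # also write a held-out split under engine.data_path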