def run(dataset):
    fs = FS.for_parse_projects(dataset)

    logger.info(f"Getting files from {fs.path_to_raw_dataset}")
    logger.info(f"Writing preprocessed files to {fs.path_to_parsed_dataset}")

    preprocessing_types_dict = {k: None for k in PrepParam}
    fs.save_pp_params(pp_params)
    fs.save_preprocessing_types(preprocessing_types_dict)

    params = []
    for train_test_valid, project in fs.get_raw_projects():
        params.append((fs.path_to_raw_dataset, fs.path_to_parsed_dataset,
                       train_test_valid, project, preprocessing_types_dict))

    files_total = len(params)
    current_file = 0
    start_time = time.time()
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in it:
            current_file += 1
            logger.info(f"Processed {current_file} out of {files_total} chunks")
            time_elapsed = time.time() - start_time
            # ETA = avg time per chunk * total chunks - time already elapsed
            logger.info(f"Time elapsed: {time_elapsed:.2f} s, "
                        f"estimated time until completion: "
                        f"{time_elapsed / current_file * files_total - time_elapsed:.2f} s")
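# A minimal sketch of the worker handed to imap_unordered above; the real
# preprocess_and_write is defined elsewhere in the repo, so this shape is an
# assumption, and the name _example_preprocess_and_write is hypothetical.
# Pool.imap_unordered passes each element of `params` as a single argument,
# so the worker receives one tuple and unpacks it itself.
def _example_preprocess_and_write(params):
    path_to_raw, path_to_parsed, train_test_valid, project, prep_types = params
    # Here the real worker would read the raw files of `project`, apply the
    # preprocessing selected by `prep_types`, and write the result under
    # the parsed-dataset path.
    logger.info(f"Would preprocess {project} ({train_test_valid}): "
                f"{path_to_raw} -> {path_to_parsed}, prep types: {prep_types}")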
def train_and_save_model(rnn_learner: RNN_Learner, fs: FS, training: LMTraining,
                         metric_list: List[str], cache: Cache,
                         use_subword_aware_metrics: bool):
    only_validation = False
    n = training.cycle.n
    if training.cycle.n == 0:
        # Zero epochs requested: save the current weights and run a single
        # validation-only pass instead of training.
        logger.info("Number of epochs specified is 0. Not training...")
        fs.save_best(rnn_learner)
        only_validation = True
        n = 1

    training_start_time = time()
    training_log_file = os.path.join(fs.path_to_model, 'training.log')
    logger.info(f"Starting training, check {training_log_file} for training progress")

    callbacks = []
    if training.early_stop:
        callbacks.append(EarlyStopping(rnn_learner,
                                       save_path=BEST_MODEL_NAME,
                                       best_loss_path=BEST_LOSS_FILENAME,
                                       best_acc_path=BEST_ACC_FILENAME,
                                       best_epoch_path=BEST_EPOCH_FILENAME,
                                       enc_path=ENCODER_NAME))

    validation_function = get_validation_function(cache, use_subword_aware_metrics,
                                                  rnn_learner.text_field)
    vals, ep_vals = rnn_learner.fit(lrs=training.lr, n_cycle=n, wds=training.wds,
                                    cycle_len=training.cycle.len,
                                    cycle_mult=training.cycle.mult,
                                    metrics=[getattr(metrics, x) for x in metric_list],
                                    get_ep_vals=True,
                                    file=open(training_log_file, 'w'),
                                    callbacks=callbacks,
                                    valid_func=validation_function,
                                    only_validation=only_validation)
    training_time_mins = int(time() - training_start_time) // 60
    # results.out: first line is the training time in minutes, followed by one
    # whitespace-separated row of metric values per epoch.
    with open(os.path.join(fs.path_to_model, 'results.out'), 'w') as f:
        f.write(str(training_time_mins) + "\n")
        for _, epoch_vals in ep_vals.items():
            f.write(" ".join(map(str, epoch_vals)) + "\n")
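# A companion sketch, not part of the original code: reading results.out back,
# assuming the layout written above (first line is the training time in
# minutes, then one whitespace-separated row of metric values per epoch).
# The name read_training_results is hypothetical.
def read_training_results(path: str):
    with open(path) as f:
        lines = f.read().splitlines()
    training_time_mins = int(lines[0])
    per_epoch_vals = [[float(v) for v in line.split()] for line in lines[1:]]
    return training_time_mins, per_epoch_vals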
def get_best_available_model(fs: FS, data: Data, arch: Arch):
    preloaded_text_field = fs.load_text_field()
    rnn_learner = create_nn_architecture(fs, data, arch, path=None,
                                         preloaded_text_field=preloaded_text_field)
    logger.info(rnn_learner)

    logger.info("Checking if there exists a model with the same architecture")
    model_loaded = fs.load_best(rnn_learner)
    if not model_loaded and fs.base_model_specified:
        logger.info(f'Trying to load base model: {fs.base_model_id}')
        try:
            fs.load_base_model(rnn_learner)
        except FileNotFoundError:
            # An explicitly specified base model that cannot be found is a hard error.
            raise FileNotFoundError(f"Base model {fs.base_model_id} was not found.")

    return rnn_learner, model_loaded
def __init__(self, dataset: str, repr: str, model: str, backwards: bool):
    fs = FS.for_lang_model(dataset, repr, model)
    text_field = fs.load_text_field()
    super().__init__(repr=repr, fs=fs, text_field=text_field,
                     config_class=LMTrainingConfig,
                     output_field=text_field,
                     n_predictions=10,
                     backwards=backwards)
def __init__(self, dataset: str, repr: str, model: str, backwards: bool,
             classifier_type: str):
    fs = FS.for_classifier(dataset, repr, model, PretrainingType.FULL,
                           classifier_type)
    text_field = fs.load_text_field()
    super().__init__(repr=repr, fs=fs, text_field=text_field,
                     config_class=ClassifierTrainingConfig,
                     output_field=LEVEL_LABEL,
                     n_predictions=6 if classifier_type == 'level' else 2,
                     backwards=backwards)
def run(dataset):
    fs = FS.for_parse_projects(dataset)

    logger.info(f"Getting files from {fs.path_to_raw_dataset}")
    logger.info(f"Writing preprocessed files to {fs.path_to_parsed_dataset}")

    preprocessing_types_dict = {k: None for k in PrepParam}
    fs.save_pp_params(pp_params)
    fs.save_preprocessing_types(preprocessing_types_dict)

    params = []
    for train_test_valid, project in fs.get_raw_projects():
        params.append((fs.path_to_raw_dataset, fs.path_to_parsed_dataset,
                       train_test_valid, project, preprocessing_types_dict))

    files_total = len(params)
    with Pool() as pool:
        it = pool.imap_unordered(preprocess_and_write, params)
        for _ in tqdm(it, total=files_total):
            pass
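# Hypothetical invocation sketch; the repo's actual CLI wiring lives elsewhere.
# `run` only needs the dataset name: FS.for_parse_projects derives both the raw
# and the parsed dataset paths from it. '<dataset-name>' is a placeholder.
if __name__ == '__main__':
    run('<dataset-name>')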
def run_on_device(config: Union[LMLRConfig, LMConfig], find_lr: bool,
                  force_rerun: bool) -> None:
    fs = FS.for_lang_model(config.data.dataset, config.data.repr, config.base_model)
    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    learner, model_trained = get_best_available_model(fs, config.data, config.arch)
    fs.save_vocab_data(learner.text_field, config.data.percent, config.data.start_from)
    if model_trained and not force_rerun:
        logger.info(f'Model {fs.path_to_model} already trained. Not rerunning training.')
        return
    elif model_trained:
        logger.info("Forcing rerun.")
    else:
        logger.info('Model with the same training config was not found.')

    config_manager.save_config(config.training_config, fs.path_to_model)
    if find_lr:
        find_and_plot_lr(learner, fs)
    else:
        train_and_save_model(learner, fs, config.training, config.metrics,
                             config.cache, config.use_subword_aware_metrics)
        model_loaded = fs.load_best(learner)
        if not model_loaded:
            raise AssertionError("The best model should have been trained and saved!")
        gen_text_path = os.path.join(fs.path_to_model, 'gen_text.out')
        run_and_display_tests(learner, config.arch, config.testing,
                              config.data.backwards, gen_text_path)
def run_on_device(config: ClassifierConfig, force_rerun: bool) -> None:
    base_model = config.base_model
    pretraining = config.pretraining_type

    PrepConfig.assert_classification_config(config.data.repr)

    if bool(base_model) != bool(pretraining):
        raise ValueError('Base model and pretraining_type params must be '
                         'both set or both unset!')

    fs = FS.for_classifier(config.data.dataset, config.data.repr,
                           base_model=base_model, pretraining=pretraining,
                           classification_type=config.classification_type)
    fs.create_path_to_model(config.data, config.training_config)
    attach_dataset_aware_handlers_to_loggers(fs.path_to_model, 'main.log')

    print_gpu_info()

    text_field = fs.load_text_field()
    rnn_learner = create_nn_architecture(fs, text_field, LEVEL_LABEL, config.data,
                                         config.arch, config.min_log_coverage_percent)
    logger.info(rnn_learner)

    same_model_exists = fs.best_model_exists(rnn_learner)
    if same_model_exists and not force_rerun:
        logger.info(f'Model {fs.path_to_classification_model} already trained. '
                    f'Not rerunning training. To retrain the model with these '
                    f'parameters, specify the --force-rerun flag.')
        return
    elif same_model_exists:
        logger.info(f"Model {fs.path_to_classification_model} already trained. "
                    f"Forcing rerun.")

    if pretraining == PretrainingType.FULL:
        try:
            logger.info(f'Trying to load base classifier: {base_model}')
            fs.load_base_model(rnn_learner)
            logger.info('Base classifier model is loaded.')
        except Exception as e:
            logger.warning(e)
            logger.warning('Base classifier model not loaded. Training from scratch.')
    elif pretraining == PretrainingType.ONLY_ENCODER:
        try:
            logger.info(f'Trying to load pretrained LM: {base_model}')
            # TODO: this is a dirty hack. Fix it.
            fs.lm_cl_pretraining = True
            fs.load_pretrained_langmodel(rnn_learner)
            logger.info("Using pretrained LM")
        except Exception as e:
            logger.warning(e)
            logger.warning('Pretrained LM not loaded. Training from scratch.')
    else:
        logger.info("No pretraining. Training classifier from scratch.")

    config_manager.save_config(config.training_config, fs.path_to_model)

    train(fs, rnn_learner, config.training, config.metrics)
    model = rnn_learner.model

    to_test_mode(model)
    sample_test_runs_file = os.path.join(fs.path_to_model, 'test_runs.out')
    n_predictions = 6 if config.classification_type == 'level' else 2
    show_tests(fs.test_path, model, text_field, sample_test_runs_file,
               config.data.backwards, n_predictions, config.testing.n_samples)
    logger.info("Classifier training finished successfully.")