def save(self, conf, params, problem, emb_matrix):
    if not os.path.exists(conf.cache_dir):
        os.makedirs(conf.cache_dir)
    shutil.copy(params.conf_path, os.path.join(conf.cache_dir, 'conf_cache.json'))

    if self.dictionary_invalid:
        # Export the problem (dictionaries etc.); on philly, stream it straight to HDFS.
        if conf.mode == 'philly' and conf.problem_path.startswith('/hdfs/'):
            with HDFSDirectTransferer(conf.problem_path, with_hdfs_command=True) as transferer:
                transferer.pkl_dump(problem.export_problem(conf.problem_path, ret_without_save=True))
        else:
            problem.export_problem(conf.problem_path)
        logging.info("[Cache] problem is saved to %s" % conf.problem_path)

        if emb_matrix is not None and conf.emb_pkl_path is not None:
            if conf.mode == 'philly' and conf.emb_pkl_path.startswith('/hdfs/'):
                with HDFSDirectTransferer(conf.emb_pkl_path, with_hdfs_command=True) as transferer:
                    transferer.pkl_dump(emb_matrix)
            else:
                dump_to_pkl(emb_matrix, conf.emb_pkl_path)
            logging.info("Embedding matrix saved to %s" % conf.emb_pkl_path)

    if self.encoding_invalid:
        # Nothing extra to persist for the encoding cache yet.
        pass
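# Usage note (illustrative): `save` above round-trips the problem and the embedding
# matrix through pickle via the project's `dump_to_pkl` / `load_from_pkl` utilities.
# The sketch below shows only the semantics this module assumes of those helpers;
# the `_example_`-prefixed names are hypothetical stand-ins, not the project's
# actual implementations.
import pickle as _example_pickle

def _example_dump_to_pkl(obj, pkl_path):
    # Serialize obj to pkl_path with the highest protocol, mirroring how `save`
    # persists the embedding matrix.
    with open(pkl_path, 'wb') as fout:
        _example_pickle.dump(obj, fout, protocol=_example_pickle.HIGHEST_PROTOCOL)

def _example_load_from_pkl(pkl_path):
    # Inverse operation, as used when reloading a cached embedding matrix.
    with open(pkl_path, 'rb') as fin:
        return _example_pickle.load(fin)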
def main(params):
    conf = ModelConf("train", params.conf_path, version, params, mode=params.mode)
    shutil.copy(params.conf_path, conf.save_base_dir)
    logging.info('Configuration file is backed up to %s' % conf.save_base_dir)

    if ProblemTypes[conf.problem_type] == ProblemTypes.sequence_tagging:
        problem = Problem(conf.problem_type, conf.input_types, conf.answer_column_name,
                          source_with_start=True, source_with_end=True, source_with_unk=True, source_with_pad=True,
                          target_with_start=True, target_with_end=True, target_with_unk=True, target_with_pad=True,
                          same_length=True, with_bos_eos=conf.add_start_end_for_seq, tagging_scheme=conf.tagging_scheme,
                          remove_stopwords=conf.remove_stopwords, DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)
    elif ProblemTypes[conf.problem_type] == ProblemTypes.classification \
            or ProblemTypes[conf.problem_type] == ProblemTypes.regression:
        problem = Problem(conf.problem_type, conf.input_types, conf.answer_column_name,
                          source_with_start=True, source_with_end=True, source_with_unk=True, source_with_pad=True,
                          target_with_start=False, target_with_end=False, target_with_unk=False, target_with_pad=False,
                          same_length=False, with_bos_eos=conf.add_start_end_for_seq,
                          remove_stopwords=conf.remove_stopwords, DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)
    elif ProblemTypes[conf.problem_type] == ProblemTypes.mrc:
        problem = Problem(conf.problem_type, conf.input_types, conf.answer_column_name,
                          source_with_start=True, source_with_end=True, source_with_unk=True, source_with_pad=True,
                          target_with_start=False, target_with_end=False, target_with_unk=False, target_with_pad=False,
                          same_length=False, with_bos_eos=False,
                          remove_stopwords=conf.remove_stopwords, DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)

    cache_load_flag = False
    if not conf.pretrained_model_path:
        # First-time training: load the cache if applicable.
        if conf.use_cache:
            cache_conf_path = os.path.join(conf.cache_dir, 'conf_cache.json')
            if os.path.isfile(cache_conf_path):
                params_cache = copy.deepcopy(params)
                try:
                    cache_conf = ModelConf('cache', cache_conf_path, version, params_cache)
                except Exception:
                    cache_conf = None
                if cache_conf is None or verify_cache(cache_conf, conf) is not True:
                    logging.info('Found cache that is ineffective')
                    if params.mode == 'philly' or params.force is True:
                        renew_option = 'yes'
                    else:
                        renew_option = input('There exists ineffective cache %s for old models. Input "yes" to renew cache and "no" to exit. (default:no): ' % os.path.abspath(conf.cache_dir))
                    if renew_option.lower() != 'yes':
                        exit(0)
                    else:
                        shutil.rmtree(conf.cache_dir)
                        time.sleep(2)  # sleep 2 seconds since the deletion is asynchronous
                        logging.info('Old cache is deleted')
                else:
                    logging.info('Found cache that is applicable to current configuration...')
            elif os.path.isdir(conf.cache_dir):
                renew_option = input('There exists ineffective cache %s for old models. Input "yes" to renew cache and "no" to exit. (default:no): ' % os.path.abspath(conf.cache_dir))
                if renew_option.lower() != 'yes':
                    exit(0)
                else:
                    shutil.rmtree(conf.cache_dir)
                    time.sleep(2)  # sleep 2 seconds since the deletion is asynchronous
                    logging.info('Old cache is deleted')

            if not os.path.exists(conf.cache_dir):
                os.makedirs(conf.cache_dir)
            shutil.copy(params.conf_path, os.path.join(conf.cache_dir, 'conf_cache.json'))

        # First-time training: load the problem from cache, then back the cache up to
        # model_save_dir/.necessary_cache/.
        if conf.use_cache and os.path.isfile(conf.problem_path):
            problem.load_problem(conf.problem_path)
            if conf.emb_pkl_path is not None:
                if os.path.isfile(conf.emb_pkl_path):
                    emb_matrix = np.array(load_from_pkl(conf.emb_pkl_path))
                    cache_load_flag = True
                else:
                    if params.mode == 'normal':
                        renew_option = input('The cache is invalid because the embedding matrix does not exist in the cache directory. Input "yes" to renew cache and "no" to exit. (default:no): ')
                        if renew_option.lower() != 'yes':
                            exit(0)
                    else:
                        # by default, renew the cache
                        renew_option = 'yes'
            else:
                emb_matrix = None
                cache_load_flag = True
        if cache_load_flag:
            logging.info("Cache loaded!")

        if cache_load_flag is False:
            logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
            if conf.pretrained_emb_path:
                emb_matrix = problem.build(conf.train_data_path, conf.file_columns, conf.input_types,
                                           conf.file_with_col_header, conf.answer_column_name,
                                           word2vec_path=conf.pretrained_emb_path,
                                           word_emb_dim=conf.pretrained_emb_dim,
                                           format=conf.pretrained_emb_type,
                                           file_type=conf.pretrained_emb_binary_or_text,
                                           involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                           show_progress=(params.mode == 'normal'),
                                           max_vocabulary=conf.max_vocabulary,
                                           word_frequency=conf.min_word_frequency)
            else:
                emb_matrix = problem.build(conf.train_data_path, conf.file_columns, conf.input_types,
                                           conf.file_with_col_header, conf.answer_column_name,
                                           word2vec_path=None, word_emb_dim=None, format=None, file_type=None,
                                           involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                           show_progress=(params.mode == 'normal'),
                                           max_vocabulary=conf.max_vocabulary,
                                           word_frequency=conf.min_word_frequency)

            if conf.mode == 'philly' and conf.problem_path.startswith('/hdfs/'):
                with HDFSDirectTransferer(conf.problem_path, with_hdfs_command=True) as transferer:
                    transferer.pkl_dump(problem.export_problem(conf.problem_path, ret_without_save=True))
            else:
                problem.export_problem(conf.problem_path)
            if conf.use_cache:
                logging.info("Cache saved to %s" % conf.problem_path)
                if emb_matrix is not None and conf.emb_pkl_path is not None:
                    if conf.mode == 'philly' and conf.emb_pkl_path.startswith('/hdfs/'):
                        with HDFSDirectTransferer(conf.emb_pkl_path, with_hdfs_command=True) as transferer:
                            transferer.pkl_dump(emb_matrix)
                    else:
                        dump_to_pkl(emb_matrix, conf.emb_pkl_path)
                    logging.info("Embedding matrix saved to %s" % conf.emb_pkl_path)
            else:
                logging.debug("Cache saved to %s" % conf.problem_path)

        # Back up problem.pkl to save_base_dir/.necessary_cache/. During the test phase,
        # the cache is loaded from save_base_dir/.necessary_cache/problem.pkl.
        cache_backup_path = os.path.join(conf.save_base_dir, 'necessary_cache/')
        logging.debug('Prepare dir: %s' % cache_backup_path)
        prepare_dir(cache_backup_path, True, allow_overwrite=True, clear_dir_if_exist=True)
        shutil.copy(conf.problem_path, cache_backup_path)
        logging.debug("Problem %s is backed up to %s" % (conf.problem_path, cache_backup_path))
        if problem.output_dict:
            logging.debug("Problem target cell dict: %s" % problem.output_dict.cell_id_map)

        if params.make_cache_only:
            logging.info("Finish building cache!")
            return

        vocab_info = dict()  # includes each input type's vocab_size & init_emb_matrix
        vocab_sizes = problem.get_vocab_sizes()
        for input_cluster in vocab_sizes:
            vocab_info[input_cluster] = dict()
            vocab_info[input_cluster]['vocab_size'] = vocab_sizes[input_cluster]
            # add extra info for char_emb
            if input_cluster.lower() == 'char':
                for key, value in conf.input_types[input_cluster].items():
                    if key != 'cols':
                        vocab_info[input_cluster][key] = value
            if input_cluster == 'word' and emb_matrix is not None:
                vocab_info[input_cluster]['init_weights'] = emb_matrix
            else:
                vocab_info[input_cluster]['init_weights'] = None

        lm = LearningMachine('train', conf, problem, vocab_info=vocab_info, initialize=True, use_gpu=conf.use_gpu)
    else:
        # When fine-tuning, load the previously saved problem.
        problem.load_problem(conf.saved_problem_path)
        lm = LearningMachine('train', conf, problem, vocab_info=None, initialize=False, use_gpu=conf.use_gpu)

    if len(conf.metrics_post_check) > 0:
        for metric_to_chk in conf.metrics_post_check:
            metric, target = metric_to_chk.split('@')
            if not problem.output_dict.has_cell(target):
                raise Exception("The target %s of %s does not exist in the training data." % (target, metric_to_chk))

    if conf.pretrained_model_path:
        logging.info('Loading the pretrained model: %s...' % conf.pretrained_model_path)
        lm.load_model(conf.pretrained_model_path)

    loss_conf = conf.loss
    loss_conf['output_layer_id'] = conf.output_layer_id
    loss_conf['answer_column_name'] = conf.answer_column_name
    loss_fn = Loss(**loss_conf)
    if conf.use_gpu is True:
        loss_fn.cuda()

    optimizer = eval(conf.optimizer_name)(lm.model.parameters(), **conf.optimizer_params)

    lm.train(optimizer, loss_fn)

    # Test with the best model saved during training.
    lm.load_model(conf.model_save_path)
    if conf.test_data_path is not None:
        test_path = conf.test_data_path
    elif conf.valid_data_path is not None:
        test_path = conf.valid_data_path
    logging.info('Testing the best model saved at %s, with %s' % (conf.model_save_path, test_path))
    if not test_path.endswith('pkl'):
        lm.test(loss_fn, test_path, predict_output_path=conf.predict_output_path)
    else:
        lm.test(loss_fn, test_path)
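# Entry-point sketch (hedged): `main` expects a `params` object carrying at least
# `conf_path`, `mode`, `force` and `make_cache_only`, which are the attributes read
# above. The argparse wiring below is an assumed minimal CLI, not necessarily the
# project's real command-line interface.
if __name__ == '__main__':
    import argparse
    _parser = argparse.ArgumentParser(description='Train a model from a JSON configuration.')
    _parser.add_argument('--conf_path', type=str, required=True,
                         help='path of the configuration file')
    _parser.add_argument('--mode', type=str, default='normal', choices=['normal', 'philly'],
                         help='run locally (normal) or on the philly cluster')
    _parser.add_argument('--force', action='store_true',
                         help='renew an ineffective cache without prompting')
    _parser.add_argument('--make_cache_only', action='store_true',
                         help='only build the preprocessing cache, then exit')
    main(_parser.parse_args())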
def evaluate(self, data, length, target, input_types, evaluator, loss_fn, pad_ids=None,
             cur_best_result=None, model_save_path=None, phase="", epoch=None,
             origin_data_path=None, predict_output_path=None):
    """ Evaluate the model on one dataset and optionally dump the predictions.

    Args:
        data: inputs, e.g.
            {
                'string1': {'word1': [...], 'postag_feature1': [...]},
                'string2': {'word1': [...], 'postag_feature1': [...]}
            }
        length: lengths of the inputs, e.g.
            {'string1': [...], 'string2': [...]}
        target: [...]
        input_types: e.g.
            {
                "word": {"cols": ["word1", "word2"], "dim": 300},
                "postag": {"cols": ["postag_feature1", "postag_feature2"], "dim": 20}
            }
        origin_data_path: path of the original data file; required when predict_output_path is set.
        predict_output_path: if set, the prediction results are written to this path.

    Returns:
        the (possibly renewed) best metric result.
    """
    # Prediction output needs the original data file to copy the samples from.
    assert not (predict_output_path and not origin_data_path)
    to_predict = bool(predict_output_path)

    logging.info("Starting %s ..." % phase)
    self.model.eval()
    with torch.no_grad():
        data_batches, length_batches, target_batches = \
            get_batches(self.problem, data, length, target, self.conf.batch_size_total,
                        input_types, pad_ids, permutate=False, transform_tensor=True)

        if ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
            streaming_recorder = StreamingRecorder(['prediction', 'pred_scores', 'pred_scores_all', 'target'])
        elif ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
            streaming_recorder = StreamingRecorder(['prediction', 'pred_scores', 'target'])
        elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
            streaming_recorder = StreamingRecorder(['prediction', 'target'])
        elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
            streaming_recorder = StreamingRecorder(['prediction', 'answer_text'])

        if to_predict:
            predict_stream_recorder = StreamingRecorder(self.conf.predict_fields)
            fin = open(origin_data_path, 'r', encoding='utf-8')
            if predict_output_path.startswith('/hdfs/'):
                direct_hdfs_path = convert_to_hdfspath(predict_output_path)
                local_tmp_path = convert_to_tmppath(predict_output_path)
                fout = open(local_tmp_path, 'w', encoding='utf-8')
            else:
                direct_hdfs_path = None
                fout = open(predict_output_path, 'w', encoding='utf-8')
            if self.conf.file_with_col_header:
                title_line = fin.readline()
                fout.write(title_line)

        # Pick any non-target key to look up sequence lengths for decoding.
        temp_key_list = list(length_batches[0].keys())
        if 'target' in temp_key_list:
            temp_key_list.remove('target')
        key_random = random.choice(temp_key_list)

        loss_recorder = StreamingRecorder(['loss'])
        if self.conf.mode == 'normal':
            progress = tqdm(range(len(target_batches)))
        elif self.conf.mode == 'philly':
            progress = range(len(target_batches))

        for i in progress:
            param_list, inputs_desc, length_desc = transform_params2tensors(data_batches[i], length_batches[i])
            logits_softmax = self.model(inputs_desc, length_desc, *param_list)

            if ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
                logits_softmax = list(logits_softmax.values())[0]
                # for the auc metric
                prediction_pos_scores = logits_softmax[:, self.conf.pos_label].cpu().data.numpy()
                if self.evaluator.has_auc_type_specific:
                    prediction_scores_all = logits_softmax.cpu().data.numpy()
                else:
                    prediction_scores_all = None
            else:
                prediction_pos_scores = None
                prediction_scores_all = None

            logits_softmax_flat = {}
            if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
                logits_softmax = list(logits_softmax.values())[0]
                # Transform output shapes for metric evaluation (e.g. the seq_tag_f1 metric).
                prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy()  # [batch_size, seq_len]
                streaming_recorder.record_one_row(
                    [self.problem.decode(prediction_indices,
                                         length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
                     prediction_pos_scores,
                     self.problem.decode(target_batches[i],
                                         length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())],
                    keep_dim=False)
                # PyTorch's CrossEntropyLoss only supports 2-D logits, so flatten the sequence dimension.
                logits_softmax_flat[self.conf.output_layer_id[0]] = \
                    logits_softmax.view(-1, logits_softmax.size(2))  # [batch_size * seq_len, # of tags]
                target_batches[i][self.conf.answer_column_name[0]] = \
                    target_batches[i][self.conf.answer_column_name[0]].reshape(-1)  # [batch_size * seq_len]

                if to_predict:
                    prediction_batch = self.problem.decode(prediction_indices, length_batches[i][key_random].numpy())
                    for prediction_sample in prediction_batch:
                        predict_stream_recorder.record('prediction', " ".join(prediction_sample))
            elif ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
                prediction_indices = logits_softmax.data.max(1)[1].cpu().numpy()
                # Class indices should not be decoded here!
                streaming_recorder.record_one_row(
                    [prediction_indices, prediction_pos_scores, prediction_scores_all,
                     target_batches[i][self.conf.answer_column_name[0]].numpy()])
                logits_softmax_flat[self.conf.output_layer_id[0]] = logits_softmax

                if to_predict:
                    for field in self.conf.predict_fields:
                        if field == 'prediction':
                            predict_stream_recorder.record(
                                field, self.problem.decode(prediction_indices, length_batches[i][key_random].numpy()))
                        elif field == 'confidence':
                            prediction_scores = logits_softmax.cpu().data.numpy()
                            for prediction_score, prediction_idx in zip(prediction_scores, prediction_indices):
                                predict_stream_recorder.record(field, prediction_score[prediction_idx])
                        elif field.startswith('confidence') and field.find('@') != -1:
                            # confidence@<label>: the probability of one specific label.
                            label_specified = field.split('@')[1]
                            label_specified_idx = self.problem.output_dict.id(label_specified)
                            confidence_specified = torch.index_select(
                                logits_softmax.cpu(), 1,
                                torch.tensor([label_specified_idx], dtype=torch.long)).squeeze(1)
                            predict_stream_recorder.record(field, confidence_specified.data.numpy())
            elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
                logits_softmax = list(logits_softmax.values())[0]
                temp_logits_softmax_flat = logits_softmax.squeeze(1)
                prediction_scores = temp_logits_softmax_flat.detach().cpu().numpy()
                streaming_recorder.record_one_row(
                    [prediction_scores, target_batches[i][self.conf.answer_column_name[0]].numpy()])
                logits_softmax_flat[self.conf.output_layer_id[0]] = temp_logits_softmax_flat
                if to_predict:
                    predict_stream_recorder.record_one_row([prediction_scores])
            elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
                for key, value in logits_softmax.items():
                    logits_softmax[key] = value.squeeze()
                passage_identifier = None
                for type_key in data_batches[i].keys():
                    if 'p' in type_key.lower():
                        passage_identifier = type_key
                        break
                if not passage_identifier:
                    raise Exception('The MRC task needs passage information.')
                prediction = self.problem.decode(logits_softmax, lengths=length_batches[i][passage_identifier],
                                                 batch_data=data_batches[i][passage_identifier])
                logits_softmax_flat = logits_softmax
                mrc_answer_target = None
                for single_target in target_batches[i]:
                    if isinstance(target_batches[i][single_target][0], str):
                        mrc_answer_target = target_batches[i][single_target]
                streaming_recorder.record_one_row([prediction, mrc_answer_target])
                if to_predict:
                    predict_stream_recorder.record_one_row([prediction])

            # When predicting, copy each original sample line and append the predicted fields.
            if to_predict:
                logits_softmax_len = len(list(logits_softmax.values())[0]) \
                    if ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc else len(logits_softmax)
                for sample_idx in range(logits_softmax_len):
                    # Skip malformed lines whose column count does not match the schema.
                    while True:
                        sample = fin.readline().rstrip()
                        line_split = list(filter(lambda x: len(x) > 0, sample.rstrip().split('\t')))
                        if self.problem.file_column_num is None or len(line_split) == self.problem.file_column_num:
                            break
                    fout.write("%s\t%s\n" % (sample, "\t".join(
                        [str(predict_stream_recorder.get(field)[sample_idx]) for field in self.conf.predict_fields])))
                predict_stream_recorder.clear_records()

            if self.use_gpu:
                for single_target in self.conf.answer_column_name:
                    if isinstance(target_batches[i][single_target], torch.Tensor):
                        target_batches[i][single_target] = transfer_to_gpu(target_batches[i][single_target])

            loss = loss_fn(logits_softmax_flat, target_batches[i])
            loss_recorder.record('loss', loss.item())

            # Release per-batch tensors early to keep memory bounded.
            del loss, logits_softmax, logits_softmax_flat
            del prediction_pos_scores
            if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging \
                    or ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
                del prediction_indices

        del data_batches, length_batches, target_batches

        if ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
            result = self.evaluator.evaluate(streaming_recorder.get('target'), streaming_recorder.get('prediction'),
                                             y_pred_pos_score=streaming_recorder.get('pred_scores'),
                                             y_pred_scores_all=streaming_recorder.get('pred_scores_all'),
                                             formatting=True)
        elif ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
            result = self.evaluator.evaluate(streaming_recorder.get('target'), streaming_recorder.get('prediction'),
                                             y_pred_pos_score=streaming_recorder.get('pred_scores'),
                                             formatting=True)
        elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
            result = self.evaluator.evaluate(streaming_recorder.get('target'), streaming_recorder.get('prediction'),
                                             y_pred_pos_score=None, formatting=True)
        elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
            result = self.evaluator.evaluate(streaming_recorder.get('answer_text'), streaming_recorder.get('prediction'),
                                             y_pred_pos_score=None, y_pred_scores_all=None, formatting=True)

        if epoch:
            logging.info("Epoch %d, %s %s loss: %f" % (epoch, phase, result, loss_recorder.get('loss', 'mean')))
        else:
            logging.info("%s %s loss: %f" % (phase, result, loss_recorder.get('loss', 'mean')))

        if phase == 'valid':
            cur_result = evaluator.get_first_metric_result()
            if self.evaluator.compare(cur_result, cur_best_result) == 1:
                logging.info('Cur result %f is better than previous best result %s, renew the best model now...'
                             % (cur_result, "%f" % cur_best_result if cur_best_result else "None"))
                if model_save_path is not None:
                    if self.conf.mode == 'philly' and model_save_path.startswith('/hdfs/'):
                        with HDFSDirectTransferer(model_save_path, with_hdfs_command=True) as transferer:
                            # For DataParallel models, save the wrapped module so the
                            # checkpoint stays loadable on a single device.
                            if isinstance(self.model, nn.DataParallel):
                                transferer.torch_save(self.model.module)
                            else:
                                transferer.torch_save(self.model)
                    else:
                        if not os.path.exists(os.path.dirname(model_save_path)):
                            os.makedirs(os.path.dirname(model_save_path))
                        if isinstance(self.model, nn.DataParallel):
                            torch.save(self.model.module, model_save_path, pickle_protocol=pkl.HIGHEST_PROTOCOL)
                        else:
                            torch.save(self.model, model_save_path, pickle_protocol=pkl.HIGHEST_PROTOCOL)
                    logging.info("Best model saved to %s" % model_save_path)
                cur_best_result = cur_result
            else:
                logging.info('Cur result %f is no better than previous best result %f' % (cur_result, cur_best_result))

    if to_predict:
        fin.close()
        fout.close()
        if direct_hdfs_path:
            move_from_local_to_hdfs(local_tmp_path, direct_hdfs_path)

    return cur_best_result
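# Recorder sketch (hedged): `evaluate` above leans on a column-wise recorder API
# (`record`, `record_one_row`, `get`, `clear_records`). The simplified class below
# illustrates those semantics under the assumption that the real StreamingRecorder
# accumulates per-field lists; it is an illustrative stand-in, not the project's
# implementation (e.g. `keep_dim` handling is reduced to a simple flatten).
class _ExampleStreamingRecorder(object):
    def __init__(self, field_names):
        # One growing list per field, kept in declaration order.
        self._records = {field: [] for field in field_names}

    def record(self, field, value):
        # Append a single value, or extend with a list/tuple of values.
        if isinstance(value, (list, tuple)):
            self._records[field].extend(value)
        else:
            self._records[field].append(value)

    def record_one_row(self, row, keep_dim=False):
        # `row` carries one entry per field, in the order the fields were declared.
        for field, value in zip(self._records, row):
            if keep_dim:
                self._records[field].append(value)
            else:
                self.record(field, value)

    def get(self, field, aggregation=None):
        # `get('loss', 'mean')` mirrors the loss averaging used in `evaluate`.
        if aggregation == 'mean':
            values = self._records[field]
            return sum(values) / len(values) if values else 0.0
        return self._records[field]

    def clear_records(self):
        for field in self._records:
            self._records[field] = []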