def feature_analysis(config):
    dataset = MarkerExpressionDataset(config)
    model = get_model(config)
    for marker in dataset.markers:
        all_features, all_labels, _ = dataset.get_all_data(
            marker, feature_selection=False, dup_reduce=True)
        model = base.clone(model)
        all_pred_labels = [0 for _ in range(len(all_labels))]
        all_pred_score = [0 for _ in range(len(all_labels))]
        for i in range(len(all_features)):
            train_features = all_features.copy()
            train_labels = all_labels.copy()
            del train_features[i]
            del train_labels[i]
            # model.fit(all_features, all_labels)
            model.fit(train_features, train_labels)
            all_pred_score[i] = model.predict_proba([all_features[i]])[0]
            all_pred_labels[i] = model.predict([all_features[i]])[0]
        tps = sum([
            y_true == y_pred
            for y_true, y_pred in zip(all_labels, all_pred_labels)
        ])
        acc = tps / len(all_features)
        results = eval_results(all_labels, all_pred_score, dataset.classes)
        print('marker %s: acc %1.2f' % (marker, 100 * acc))
        print(results)
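# --- Illustrative sketch (not part of the original project) ---
# feature_analysis() above hand-rolls a leave-one-out evaluation: for each
# sample, the model is refit on all remaining samples and scored on the
# held-out one. A minimal, self-contained equivalent using scikit-learn's
# cross-validation helpers; the toy iris data and LogisticRegression below
# are assumptions for illustration only.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, cross_val_predict


def leave_one_out_demo():
    X, y = load_iris(return_X_y=True)
    clf = LogisticRegression(max_iter=1000)
    # Refits the classifier once per held-out sample, mirroring the loop above.
    pred_labels = cross_val_predict(clf, X, y, cv=LeaveOneOut())
    pred_scores = cross_val_predict(clf, X, y, cv=LeaveOneOut(),
                                    method='predict_proba')
    acc = (pred_labels == y).mean()
    print('leave-one-out accuracy: %1.2f' % (100 * acc))
    return pred_scores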
def evaluate(model, eval_examples, eval_features, device, args, logger,
             use_squad_v2, eval_file):
    logger.info("***** Running predictions *****")
    logger.info(" Num orig examples = %d", len(eval_examples))
    logger.info(" Num split examples = %d", len(eval_features))
    logger.info(" Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    all_ling_features = torch.tensor([f.ling_features for f in eval_features],
                                     dtype=torch.float)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_ling_features, all_example_index)

    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, ling_features, example_indices in tqdm(
            eval_dataloader, desc="Evaluating"):
        if len(all_results) % 1000 == 0:
            logger.info("Processing example: %d" % (len(all_results)))
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        ling_features = ling_features.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask, ling_features)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))
    model.train()

    results = eval_results(eval_examples, eval_features, all_results,
                           eval_file, args.n_best_size,
                           args.max_answer_length, args.do_lower_case,
                           args.verbose_logging, args.version_2_with_negative,
                           args.null_score_diff_threshold)
    results_list = [('F1', results['F1']), ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)
    return results, all_results
def evaluate(args, model, tokenizer, prefix=""):
    eval_dataset = TopNDataset(args.topN_file, tokenizer, "dev.small",
                               args.msmarco_dir, args.collection_memmap_dir,
                               args.tokenize_dir, args.max_query_length,
                               args.max_seq_length)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=get_collate_function(
                                     args.mask_target))

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    attention_mask_after_softmax_layer_set = set(
        list(range(args.mask_layer_num)))
    logger.info("attention_mask_after_softmax_layer_set: {}".format(
        attention_mask_after_softmax_layer_set))

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    cnt = 0
    with open(args.output_score_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(
                eval_dataloader,
                desc=f"{args.mask_target}@{args.mask_layer_num}"):
            model.eval()
            batch = {k: v.to(args.device) for k, v in batch.items()}
            batch['attention_mask_after_softmax_layer_set'] = (
                attention_mask_after_softmax_layer_set)
            with torch.no_grad():
                outputs = model(**batch)
                scores = outputs[0].detach().cpu().numpy()
                for qid, pid, score in zip(qids, pids, scores[:, 1]):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")
            cnt += 1
            # if cnt > 1000:
            #     break

    generate_rank(args.output_score_path, args.output_rank_path)
    mrr = eval_results(args.output_rank_path)
    abs_output_rank_path = os.path.abspath(args.output_rank_path)
    mrr_ln_path = f"{abs_output_rank_path}.{mrr:.3f}"
    subprocess.check_call(["ln", "-s", abs_output_rank_path, mrr_ln_path])
    print(mrr)
def evaluate(args, model, mode, prefix):
    eval_output_dir = args.eval_save_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    eval_dataset = MSMARCODataset(mode, args.msmarco_dir,
                                  args.collection_memmap_dir,
                                  args.tokenize_dir, args.max_query_length,
                                  args.max_doc_length)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    collate_fn = get_collate_function(mode=mode)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 collate_fn=collate_fn)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    output_file_path = f"{eval_output_dir}/{prefix}.{mode}.score.tsv"
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, docids in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                outputs = model(**batch)
                scores = torch.diagonal(outputs[0]).detach().cpu().numpy()
                assert len(qids) == len(docids) == len(scores)
                for qid, docid, score in zip(qids, docids, scores):
                    outputfile.write(f"{qid}\t{docid}\t{score}\n")

    rank_output = f"{eval_output_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)

    if mode == "dev":
        mrr = eval_results(rank_output)
        return mrr
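# --- Illustrative sketch (not part of the original project) ---
# The evaluation above writes a "qid \t docid \t score" TSV and delegates the
# ranking step to generate_rank(). A minimal, self-contained stand-in for that
# step, assuming the same score-file layout and a "qid \t docid \t rank" output;
# the real generate_rank() implementation is not shown here.
from collections import defaultdict


def rank_scores_sketch(score_path, rank_path, topk=10):
    per_query = defaultdict(list)
    with open(score_path) as f:
        for line in f:
            qid, docid, score = line.rstrip("\n").split("\t")
            per_query[qid].append((docid, float(score)))
    with open(rank_path, "w") as out:
        for qid, pairs in per_query.items():
            # Higher score = better; keep only the top-k documents per query.
            pairs.sort(key=lambda x: x[1], reverse=True)
            for rank, (docid, _) in enumerate(pairs[:topk], start=1):
                out.write(f"{qid}\t{docid}\t{rank}\n")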
def evaluate(args, model, mode, prefix, eval_dataset=None):
    eval_output_dir = args.eval_save_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    if eval_dataset is None:
        eval_dataset = CLEARDataset(mode=mode, args=args)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    collate_fn = get_collate_function(mode=mode)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 pin_memory=True,
                                 collate_fn=collate_fn)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    output_file_path = f"{eval_output_dir}/{prefix}.{mode}.score.tsv"
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                scores = model(**batch)
                assert len(qids) == len(pids) == len(scores)
                for qid, pid, score in zip(qids, pids, scores):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")

    rank_output = f"{eval_output_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)

    if mode == "dev.small":
        mrr = eval_results(rank_output) * 6980 / args.num_eval_queries
        return mrr
def evaluate(args, model):
    eval_dataset = ProbDataset(
        f"{args.embd_root}/dev.small/{args.key}/{args.layer}",
        args.msmarco_dir, "dev.small", args.max_token_num)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 pin_memory=False,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=get_collate_function(),
                                 num_workers=args.data_num_workers)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    model.eval()
    output_score_path = f"{args.eval_output_dir}/layer_{args.layer}.score.tsv"
    output_rank_path = f"{args.eval_output_dir}/layer_{args.layer}.rank.tsv"
    with open(output_score_path, 'w') as outputfile:
        for batch_idx, (batch, qids, pids) in enumerate(
                tqdm(eval_dataloader, desc="Evaluating")):
            del batch['labels']
            batch = {k: v.to(args.device) for k, v in batch.items()}
            with torch.no_grad():
                softmax_logits = model(**batch)[0].detach().cpu().numpy()
                scores = softmax_logits[:, 1]
                for idx, (qid, pid, score) in enumerate(zip(qids, pids, scores)):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")

    generate_rank(output_score_path, output_rank_path)
    mrr = eval_results(output_rank_path)
    abs_output_rank_path = os.path.abspath(output_rank_path)
    mrr_ln_path = f"{abs_output_rank_path}.{mrr:.3f}"
    subprocess.check_call(["ln", "-s", abs_output_rank_path, mrr_ln_path])
def train_eval(config, exp_path):
    dataset = MarkerExpressionDataset(config)

    if dataset.data_clean is not None:
        with open(os.path.join(exp_path, 'dirty_data.txt'), 'w') as f:
            f.write('---data clean method: %s---\n' % dataset.data_clean)
            for marker, item in dataset.outlier_samples.items():
                f.write('marker %s:\n' % marker)
                for class_id in dataset.classes:
                    f.write('class %s:\n' % class_id)
                    for sample_id in item.keys():
                        if item[sample_id]['class'] == class_id:
                            f.write('\t%s\n' % sample_id)

    if dataset.feature_selection is not None or dataset.feature_transformation is not None:
        with open(
                os.path.join(exp_path,
                             'feature_selection_and_transformation.txt'),
                'w') as f:
            if dataset.feature_selection is not None:
                f.write('---feature selection method: %s---\n' %
                        dataset.feature_selection['method'])
                if 'kwargs' in dataset.feature_selection:
                    f.write('---feature selection kwargs: %s---\n' %
                            str(dataset.feature_selection['kwargs']))
            if dataset.feature_transformation is not None:
                f.write('---feature transformation method: %s---\n' %
                        dataset.feature_transformation['method'])
                if 'kwargs' in dataset.feature_transformation:
                    f.write('---feature transformation kwargs: %s---\n' %
                            str(dataset.feature_transformation['kwargs']))
            for marker in dataset.markers:
                f.write('marker %s:\n' % marker)
                if dataset.fs_metric_params is not None:
                    f.write(
                        '---feature selection and transformation kwargs: %s---\n'
                        % str(dataset.fs_metric_params[marker]))
                if dataset.feature_selection is not None:
                    features = dataset.features
                    feature_index = 0
                    f.write('---selected features---\n')
                    if dataset.feature_selection['method'] == 'custom':
                        support_flags = dataset.feature_selection['selection'][
                            marker]
                    else:
                        support_flags = dataset.feature_selector[
                            marker].get_support()
                    for flag in support_flags:
                        f.write('%s:\t%s\n' % (features[feature_index], flag))
                        feature_index = (feature_index + 1) % len(features)
                if dataset.feature_transformation is not None:
                    components = dataset.feature_transformer[
                        marker].components_
                    f.write('---feature transformation components---:\n%s' %
                            components.tolist())
                    # if 'feature_mean' in config:
                    #     feature_mean = config['feature_mean']
                    #     coefficients = np.abs(feature_mean*components.sum(axis=0)).\
                    #         reshape([len(dataset.features), -1]).sum(axis=0)
                    # else:
                    #     coefficients = np.abs(components.sum(axis=0)).reshape([len(dataset.features), -1]).sum(axis=0)
                    # coefficients = coefficients / coefficients.sum()
                    #
                    # f.write('---feature transformation coefficients---:\n%s' % coefficients.tolist())

    threshold = config.get('threshold', 'roc_optimal')
    metrics_names = ['sensitivity', 'specificity', 'roc_auc_score']
    metrics_avg_names = ['roc_auc_score_avg', 'roc_auc_score_avg_std']
    fig, ax = plt.subplots(9,
                           len(dataset.markers),
                           squeeze=False,
                           figsize=(6 * len(dataset.markers), 40))
    metrics_file = open(os.path.join(exp_path, 'metrics.txt'), 'w')
    metrics_fig_filename = os.path.join(exp_path, 'conf_mat.png')
    best_params = dict()
    all_marker_train_metrics = []
    all_marker_test_metrics = []
    for i, marker in enumerate(dataset.markers):
        model = get_model(config)
        if 'model_kwargs_search' in config:
            # parameter search
            print('parameter search for marker %s...' % marker)
            all_x, all_y, cv_index = dataset.get_all_data(marker)
            best_model = GridSearchCV(model,
                                      param_grid=config['model_kwargs_search'],
                                      cv=cv_index,
                                      scoring='roc_auc_ovr')
            best_model.fit(all_x, all_y)
            best_params[marker] = best_model.best_params_
            print('search done')
        else:
            best_model = model
            best_params[marker] = config['model_kwargs']

        # run train and test
        train_xs = []
        train_ys = []
        train_ys_score = []
        test_xs = []
        test_ys = []
        test_ys_score = []
        for fold_i, (train_x, train_y, test_x,
                     test_y) in enumerate(dataset.get_split_data(marker)):
            model = base.clone(model)
            model.set_params(**best_params[marker])
            model.fit(train_x, train_y)
            # model.classes_ = dataset.classes
            train_xs += train_x
            train_ys += train_y
            test_xs += test_x
            test_ys += test_y
            train_y_score = model.predict_proba(train_x).tolist()
            train_ys_score += train_y_score
            test_y_score = model.predict_proba(test_x).tolist()
            test_ys_score += test_y_score
            # model_filename = os.path.join(exp_path, 'model', '%s_%s_fold_%d.pkl'
            #                               % (config['model'], marker, fold_i))
            # maybe_create_path(os.path.dirname(model_filename))
            # with open(model_filename, 'wb') as f:
            #     pickle.dump(model, f)
        train_metrics = eval_results(train_ys,
                                     train_ys_score,
                                     labels=dataset.classes,
                                     average='macro',
                                     threshold=threshold,
                                     num_fold=dataset.num_fold)
        test_metrics = eval_results(test_ys,
                                    test_ys_score,
                                    labels=dataset.classes,
                                    average='macro',
                                    threshold=train_metrics['used_threshold'],
                                    num_fold=dataset.num_fold)
        all_marker_train_metrics.append(train_metrics)
        all_marker_test_metrics.append(test_metrics)

        # print metrics to console and file
        double_print('marker: %s' % marker, metrics_file)
        double_print('metrics on training set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * train_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            train_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, train_metrics[metrics_name]),
                metrics_file)
        double_print('metrics on test set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * test_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            test_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, test_metrics[metrics_name]),
                metrics_file)

        # generate figure
        current_ax = ax[0, i]
        dataset.plot_data_clean_distribution(current_ax, marker)
        current_ax.set_title('data cleaning on marker %s' % marker)

        current_ax = ax[1, i]
        contour_flag = len(train_xs[0]) == 2
        # dup_reduced = list(tuple(tuple([train_xs[j] + [train_ys[j]] for j in range(len(train_xs))])))
        # dup_reduced_train_xs = [item[:-1] for item in dup_reduced]
        # dup_reduced_train_ys = [item[-1] for item in dup_reduced]
        # dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        dup_reduced_train_xs = train_x + test_x
        dup_reduced_train_ys = train_y + test_y
        dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            dup_reduced_train_xs,
            ax=current_ax,
            t_sne=True,
            hue=dup_reduced_train_ys_str,
            hue_order=classes_str,
            style=dup_reduced_train_ys_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=best_model.predict)
        current_ax.set_title('%s trained on whole set' % marker)

        current_ax = ax[2, i]
        metrics.ConfusionMatrixDisplay(
            train_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on train set of all folds' % marker)

        current_ax = ax[3, i]
        for j in range(len(dataset.classes)):
            roc_curve = train_metrics['roc_curve'][j]
            roc_auc_score = train_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            sen = train_metrics['sensitivity'][j] / 100
            spe = train_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        current_ax = ax[4, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in train_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(train_metrics[metrics_name])
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, train_metrics[metrics_name]))
        additional_text.append(best_params[marker])
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

        current_ax = ax[5, i]
        contour_flag = len(train_xs[0]) == 2
        test_y_str = [str(item) for item in test_y]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            test_x,
            ax=current_ax,
            t_sne=True,
            hue=test_y_str,
            hue_order=classes_str,
            style=test_y_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=model.predict)
        current_ax.set_title('%s on test set of the last fold' % marker)

        current_ax = ax[6, i]
        metrics.ConfusionMatrixDisplay(
            test_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on test set of all folds' % marker)

        current_ax = ax[7, i]
        for j in range(len(dataset.classes)):
            roc_curve = test_metrics['roc_curve'][j]
            roc_auc_score = test_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            sen = test_metrics['sensitivity'][j] / 100
            spe = test_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        current_ax = ax[8, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in test_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(test_metrics[metrics_name])
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, test_metrics[metrics_name]))
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_train_metrics
        ]
        double_print(
            'overall train %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_test_metrics
        ]
        double_print(
            'overall test %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    metrics_file.close()
    save_yaml(os.path.join(exp_path, 'best_params.yaml'), best_params)
    fig.savefig(metrics_fig_filename, bbox_inches='tight', pad_inches=1)
parser.add_argument("--mask_methods",
                    type=str,
                    nargs="+",
                    default=["commas", "token_mask", "attention_mask", "None"])
parser.add_argument("--input_dir", type=str, default="./data/adversary")
parser.add_argument("--output_dir", type=str, default="./data/adversary")
args = parser.parse_args()

origin_scores = read_scores(f"{args.input_dir}/None.score.tsv")
for mask_method in args.mask_methods:
    new_scores = read_scores(f"{args.input_dir}/{mask_method}.score.tsv")
    for key, score in new_scores.items():
        if key in origin_scores:
            origin_scores[key] = score
    temp_score_path = f"{args.output_dir}/temp.{mask_method}.score.tsv"
    assert not os.path.exists(temp_score_path)
    with open(temp_score_path, "w") as outFile:
        for (qid, pid), score in origin_scores.items():
            outFile.write(f"{qid}\t{pid}\t{score}\n")
    output_rank_path = f"{args.output_dir}/{mask_method}.rank.tsv"
    generate_rank(temp_score_path, output_rank_path)
    subprocess.check_call(["rm", temp_score_path])
    mrr = eval_results(output_rank_path)
    abs_output_rank_path = os.path.abspath(output_rank_path)
    rank_with_mrr_path = f"{abs_output_rank_path}.{mrr:.3f}"
    if not os.path.exists(rank_with_mrr_path):
        subprocess.check_call(
            ["ln", "-s", abs_output_rank_path, rank_with_mrr_path])
    print(mask_method, "MRR@10:", mrr)
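# --- Illustrative sketch (not part of the original project) ---
# In the scripts above, eval_results() reduces a rank file to MRR@10. A
# self-contained stand-in for that reduction, assuming a "qid \t pid \t rank"
# rank file and MS MARCO-style qrels lines ("qid 0 pid relevance"); the real
# eval_results() implementation is not shown here.
def mrr_at_10_sketch(rank_path, qrels_path):
    relevant = set()
    with open(qrels_path) as f:
        for line in f:
            qid, _, pid, rel = line.split()
            if int(rel) > 0:
                relevant.add((qid, pid))
    reciprocal_ranks, seen_qids = {}, set()
    with open(rank_path) as f:
        for line in f:
            qid, pid, rank = line.split("\t")
            seen_qids.add(qid)
            if (qid, pid) in relevant and int(rank) <= 10:
                # Keep the best (smallest) rank of a relevant passage per query.
                reciprocal_ranks[qid] = max(reciprocal_ranks.get(qid, 0.0),
                                            1.0 / int(rank))
    return sum(reciprocal_ranks.values()) / max(len(seen_qids), 1)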