def load_train_dataset(dataset, embeddings):
    ling_feat_spmatrix, docids = load_ling_features(dataset, training_data_path)

    print('Number of documents with linguistic features: %i' % len(docids))

    data_root_dir = os.path.expanduser(training_data_path)
    csvdirname = os.path.join(data_root_dir, 'argument_data/%s-new-CSV/' % dataset)

    print('Loading train/test data from %s...' % csvdirname)

    person_train = []
    a1_train = []
    a2_train = []
    ids_train = []
    prefs_train = []
    X_a1 = []
    X_a2 = []

    for file_name in listdir(csvdirname):
        if file_name.split('.')[-1] != 'csv':
            print("Skipping files without .csv suffix: %s" % (csvdirname + '/' + file_name))
            continue

        Xa1, Xa2, labels, ids, turker_ids, a1, a2 = load_single_file_separate_args(
            csvdirname, file_name, word_to_indices_map, None)

        X_a1.extend(Xa1)
        X_a2.extend(Xa2)

        a1_train.extend(a1)
        a2_train.extend(a2)

        person_train.extend(turker_ids)
        prefs_train.extend(labels)
        ids_train.extend(ids)

    train_ids = np.array([ids_pair.split('_') for ids_pair in ids_train])

    print('No. documents in training set: %i'
          % len(np.unique([train_ids[:, 0], train_ids[:, 1]])))

    # a1_train and a2_train are replaced here by the indices of each pair member into the
    # list of documents with linguistic features
    a1_train = get_docidxs_from_ids(docids, train_ids[:, 0])
    a2_train = get_docidxs_from_ids(docids, train_ids[:, 1])

    items_feat, uids = concat_feature_sets((a1_train, a2_train), [X_a1, X_a2],
                                           ling_feat_spmatrix, embeddings)

    ndims = items_feat.shape[1]

    return items_feat, ling_feat_spmatrix.shape[1], word_to_indices_map, a1_train, \
        a2_train, prefs_train, ndims
def get_text_from_fold_regression(fold, dataset):
    X_test, _, ids_test, _, test_a = fold

    # Identify the arguments in the false pairs.
    X_test = np.array(X_test)

    _, docids = load_ling_features(dataset)
    testids_a = get_docidxs_from_ids(docids, ids_test)

    _, _, utexts = get_doc_token_seqs((testids_a), [X_test], [test_a])

    return testids_a, utexts
def load_test_dataset(output, embeddings):
    # Load the linguistic features
    print("Loading linguistic features from %s" % output)
    ling_feat_spmatrix, docids = load_ling_features(
        'new_test_data', output, '', output,
        model.features.shape[1] - len(embeddings[0]))

    print('Loaded libSVM data')

    X = []
    test_ids = []
    a = []

    for file_name in listdir(input_dir):
        if file_name.split('.')[-1] != 'csv':
            print("Skipping files without .csv suffix: %s" % (input_dir + '/' + file_name))
            continue

        data = pd.read_csv(os.path.join(input_dir, file_name), delimiter='\t', na_values=[])
        data = data.fillna('N/A')

        ids = data['#id'].values
        a1 = data['argument'].values

        a1_tokens = [vocabulary_embeddings_extractor.tokenize(a1_line) for a1_line in a1]
        # map tokens to vocabulary indices (index 2 = out-of-vocabulary) and prepend the start index 1
        a1_indices = [[word_to_indices_map.get(word, 2) for word in a1_tokens_line]
                      for a1_tokens_line in a1_tokens]
        Xa1 = np.array([[1] + a1_indices_line for a1_indices_line in a1_indices])

        # keep only arguments that also have linguistic features
        valid_args = np.in1d(ids, docids)
        a1 = a1[valid_args]
        Xa1 = Xa1[valid_args]
        ids = ids[valid_args]

        a.extend(a1)
        X.extend(Xa1)
        test_ids.extend(ids)

    # load the embeddings
    docid_to_idx_map = np.argsort(docids).flatten()

    test_items_feat, uids = concat_feature_sets(
        (test_ids), [X], ling_feat_spmatrix, embeddings, docid_to_idx_map)

    return test_items_feat, uids
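# Illustrative sketch of the token-to-index convention assumed by load_test_dataset above:
# each argument is tokenized, every token is looked up in word_to_indices_map with index 2
# reserved for out-of-vocabulary words, and index 1 is prepended as a start marker. The toy
# vocabulary and the simple whitespace tokenizer below are hypothetical stand-ins, not part
# of the real pipeline.
def _example_token_index_mapping():
    toy_word_to_indices_map = {'cats': 3, 'are': 4, 'great': 5}  # hypothetical vocabulary
    argument = 'cats are fluffy'

    tokens = argument.lower().split()  # stand-in for vocabulary_embeddings_extractor.tokenize
    indices = [toy_word_to_indices_map.get(word, 2) for word in tokens]

    # prepend the start index, as in the construction of Xa1 above
    print([1] + indices)  # -> [1, 3, 4, 2]  ('fluffy' is OOV, so it maps to 2)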
def get_text_from_fold(fold, dataset):
    X_test_a1, X_test_a2, _, ids_test, _, test_a1, test_a2 = fold

    test_a1 = np.array(test_a1).flatten()
    test_a2 = np.array(test_a2).flatten()

    # Identify the arguments in the false pairs.
    X_test_a1 = np.array(X_test_a1)
    X_test_a2 = np.array(X_test_a2)

    testids = np.array([ids_pair.split('_') for ids_pair in ids_test])

    _, docids = load_ling_features(dataset)
    testids_a1 = get_docidxs_from_ids(docids, testids[:, 0])
    testids_a2 = get_docidxs_from_ids(docids, testids[:, 1])

    _, _, utexts = get_doc_token_seqs(
        (testids_a1, testids_a2), [X_test_a1, X_test_a2], (test_a1, test_a2))

    return testids_a1, testids_a2, utexts
def compute_max_train_similarity(expt_settings, method, ls, docids, items_feat,
                                 similarities_all=None):
    '''
    Find the maximum cosine similarity for arguments in the dataset.
    Compute the mean/variance of the max similarity for correct/incorrect pairs.
    '''
    # Load the results for GPPL with Ling.
    expt_settings_1 = expt_settings.copy()
    expt_settings_1['method'] = method
    expt_settings_1['feature_type'] = 'ling'
    expt_settings_1['embeddings_type'] = ''

    data_root_dir = os.path.expanduser("~/data/personalised_argumentation/")
    resultsfile_template = 'habernal_%s_%s_%s_%s_acc%.2f_di%.2f'
    resultsdir_1 = get_results_dir(data_root_dir, resultsfile_template, expt_settings_1)

    # Load the results for GPPL with Glove.
    nFolds = len(list(folds.keys()))

    mean_false = 0
    mean_true = 0
    var_false = 0
    var_true = 0

    if similarities_all is None:
        # scale each feature dimension by its lengthscale before computing cosine similarities
        feats = items_feat / ls[None, :]
        similarities_all = cosine_similarity(feats, feats, dense_output=True)
        # matern_3_2_from_raw_vals(items_feat, ls, items_feat)

    total_count_true = 0
    total_count_false = 0

    for f in range(nFolds):
        fold = list(folds.keys())[f]

        if 'fold_order' in expt_settings_1 and expt_settings_1['fold_order'] is not None:
            f1 = np.argwhere(np.array(expt_settings_1['fold_order']) == fold)[0][0]
        else:
            f1 = f

        foldfile = resultsdir_1 + '/fold%i.pkl' % f1
        if os.path.isfile(foldfile):
            with open(foldfile, 'rb') as fh:
                data_1 = pickle.load(fh, encoding='latin1')

            # Load the ground truth classifications
            gold_disc_1, pred_disc_1, _, _, _, _, _, _, _ = get_fold_data(data_1, f1, expt_settings_1)

            # Identify the falsely classified pairs with Ling
            # gold_disc_1 = gold_disc_1[:, None]
            # gold_disc_2 = gold_disc_2[:, None]
            if expt_settings_1['method'] == 'SVM':
                pred_disc_1 = pred_disc_1[:, 1]

            pred_disc_1 = pred_disc_1.flatten()

            false_pairs_1 = pred_disc_1 != gold_disc_1
            true_pairs_1 = pred_disc_1 == gold_disc_1

            # Get the argument IDs for this fold
            X_test_a1, X_test_a2, _, ids_test, _, test_a1, test_a2 = folds.get(fold)["test"]
            test_a1 = np.array(test_a1)[:, None]
            test_a2 = np.array(test_a2)[:, None]
            testids = np.array([ids_pair.split('_') for ids_pair in ids_test])
            X_test_a1 = np.array(X_test_a1)
            X_test_a2 = np.array(X_test_a2)
            testids_a1 = get_docidxs_from_ids(docids, testids[:, 0])
            testids_a2 = get_docidxs_from_ids(docids, testids[:, 1])

            X_tr_a1, X_tr_a2, _, ids_tr, _, tr_a1, tr_a2 = folds.get(fold)["training"]
            tr_a1 = np.array(tr_a1)[:, None]
            tr_a2 = np.array(tr_a2)[:, None]
            trids = np.array([ids_pair.split('_') for ids_pair in ids_tr])
            X_tr_a1 = np.array(X_tr_a1)
            X_tr_a2 = np.array(X_tr_a2)

            _, docids = load_ling_features(expt_settings_1['dataset'])
            trids_a1 = get_docidxs_from_ids(docids, trids[:, 0])
            trids_a2 = get_docidxs_from_ids(docids, trids[:, 1])

            # for each test argument, take its maximum similarity to any training argument
            true_similarities = similarities_all[
                np.concatenate((testids_a1[true_pairs_1], testids_a2[true_pairs_1])), :][
                :, np.concatenate((trids_a1, trids_a2))]
            true_similarities = np.max(true_similarities, axis=1)

            false_similarities = similarities_all[
                np.concatenate((testids_a1[false_pairs_1], testids_a2[false_pairs_1])), :][
                :, np.concatenate((trids_a1, trids_a2))]
            false_similarities = np.max(false_similarities, axis=1)

            total_count_true += np.sum(true_pairs_1) * 2.0
            total_count_false += np.sum(false_pairs_1) * 2.0

            mean_total_sims_true = np.sum(true_similarities)
            mean_total_sims_false = np.sum(false_similarities)
            var_total_sims_true = np.var(true_similarities)
            var_total_sims_false = np.var(false_similarities)

            mean_false += mean_total_sims_false
            mean_true += mean_total_sims_true
            var_false += var_total_sims_false
            var_true += var_total_sims_true

            # print("mean total_similarity for correctly classified pairs: %f (STD %f)"
            #       % (mean_total_sims_true, np.sqrt(var_total_sims_true)))
            # print("mean total_similarity for incorrectly classified pairs: %f (STD %f)"
            #       % (mean_total_sims_false, np.sqrt(var_total_sims_false)))

        sys.stdout.write('.')
        sys.stdout.flush()

    mean_false /= total_count_false
    mean_true /= total_count_true
    var_false /= nFolds
    var_false = np.sqrt(var_false)
    var_true /= nFolds
    var_true = np.sqrt(var_true)

    print("For all folds: mean total_sim for correctly classified pairs: %f (STD %f)"
          % (mean_true, np.sqrt(var_true)))
    print("For all folds: mean total_sim for incorrectly classified pairs: %f (STD %f)"
          % (mean_false, np.sqrt(var_false)))

    return similarities_all
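# A minimal, self-contained sketch of the statistic computed by compute_max_train_similarity,
# using toy data: features are scaled by their per-dimension lengthscales, cosine similarities
# are taken between all items, and each test argument is summarised by its maximum similarity
# to any training argument. The feature values, lengthscales and index sets below are
# hypothetical and only serve to illustrate the indexing.
def _example_max_train_similarity():
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    toy_feat = np.array([[1.0, 0.0],
                         [0.8, 0.2],
                         [0.0, 1.0]])   # three toy arguments, two features
    toy_ls = np.array([2.0, 0.5])       # hypothetical lengthscales, one per feature

    scaled = toy_feat / toy_ls[None, :]
    sims = cosine_similarity(scaled, scaled, dense_output=True)

    test_idxs = np.array([0])           # pretend argument 0 is in the test fold
    train_idxs = np.array([1, 2])       # arguments 1 and 2 were seen in training

    # highest similarity of each test item to the training set
    max_sim_to_train = np.max(sims[test_idxs, :][:, train_idxs], axis=1)
    print(max_sim_to_train)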
                                'word_mean')

# step 2. Inspect the arguments that 'both' gets right and embeddings or ling alone gets wrong. Expect the results
# to be similar to the previous step.
print_where_one_right_two_wrong(expt_settings, 'embeddings', 'both', 'word_mean', 'word_mean')
print_where_one_right_two_wrong(expt_settings, 'ling', 'both', '', 'word_mean')
print_where_one_right_two_wrong(expt_settings, 'both', 'embeddings', 'word_mean', 'word_mean')
print_where_one_right_two_wrong(expt_settings, 'both', 'ling', 'word_mean', '')

# Step 3: Compare GPPL to SVM to see which handles outliers better given the same features
ling_feat_spmatrix, docids = load_ling_features(expt_settings['dataset'])

if 'ls' not in globals():
    ls = compute_lengthscale_heuristic('ling', '', None, ling_feat_spmatrix, docids, folds,
                                       None, multiply_heuristic_power=0.5)

items_feat = ling_feat_spmatrix.toarray()

if 'similarity' not in globals():
    similarity = None

similarity = compute_max_train_similarity(expt_settings,
def get_fold_data(data, f, expt_settings, flip_labels=False):
    # discrete labels are 0, 1 or 2
    try:
        if len(data[3][f]):
            gold_disc = np.array(data[3][f])
            pred_disc = np.array(data[1][f]) * 2
            if pred_disc.ndim == 1:
                pred_disc = pred_disc[:, np.newaxis]

            # if expt_settings['method'] == 'SVM':
            if flip_labels:
                pred_disc = 2 - pred_disc

            # probabilities
            gold_prob = gold_disc / 2.0
            pred_prob = np.array(data[0][f])
            if pred_prob.ndim == 1:
                pred_prob = pred_prob[:, np.newaxis]

            # if expt_settings['method'] == 'SVM':
            if flip_labels:
                pred_prob = 1 - pred_prob

            # scores used to rank
            if len(data[4]) > 0:
                gold_rank = np.array(data[4][f])
            else:
                gold_rank = None
            if len(data[2]) > 0:
                pred_rank = np.array(data[2][f])
                if pred_rank.ndim == 1:
                    pred_rank = pred_rank[:, np.newaxis]
                if flip_labels:
                    pred_rank = -pred_rank
            else:
                gold_rank = None
                pred_rank = None

            if len(data) > 8 and data[8] is not None and f in data[8] and data[8][f] is not None:
                pred_tr_disc = np.round(np.array(data[8][f])) * 2
                pred_tr_prob = np.round(np.array(data[8][f]))
                if flip_labels:
                    pred_tr_disc = 2 - pred_tr_disc
                    pred_tr_prob = 1 - pred_tr_prob
            else:
                pred_tr_disc = None
                pred_tr_prob = None
        else:
            raise Exception('Data not found')
    except:
        # fall back to the per-fold (new-style) layout, where data is already subset to this fold
        gold_disc = np.array(data[3])
        pred_disc = np.array(data[1]) * 2
        if pred_disc.ndim == 1:
            pred_disc = pred_disc[:, np.newaxis]

        # if expt_settings['method'] == 'SVM':
        #     pred_disc = 2 - pred_disc

        # probabilities
        gold_prob = gold_disc / 2.0
        pred_prob = np.array(data[0])
        if pred_prob.ndim == 1:
            pred_prob = pred_prob[:, np.newaxis]

        # if expt_settings['method'] == 'SVM':
        #     pred_prob = 1 - pred_prob

        # scores used to rank
        if data[4] is not None and len(data[4]) > 0:
            gold_rank = np.array(data[4])
        else:
            gold_rank = None
        if data[2] is not None and (len(data[2]) > 0 or data[2].item() is not None):
            pred_rank = np.array(data[2])
            if pred_rank.ndim == 1:
                pred_rank = pred_rank[:, np.newaxis]
        else:
            gold_rank = None
            pred_rank = None

        if len(data) > 8 and data[8] is not None:
            pred_tr_disc = np.round(np.array(data[8])) * 2
            pred_tr_prob = np.round(np.array(data[8]))
        else:
            pred_tr_disc = None
            pred_tr_prob = None

        if flip_labels:
            pred_disc = 2 - pred_disc
            pred_prob = 1 - pred_prob
            if pred_rank is not None:
                pred_rank = -pred_rank
            if pred_tr_disc is not None:
                pred_tr_disc = 2 - pred_tr_disc
            if pred_tr_prob is not None:
                pred_tr_prob = 1 - pred_tr_prob

    # any postprocessing e.g. to remove errors when saving data
    postprocced = False
    if pred_rank is not None and gold_rank is not None and pred_rank.shape[0] == 1052 \
            and gold_rank.shape[0] != 1052:
        # we predicted the whole dataset instead of the subset
        from tests import get_fold_regression_data

        if expt_settings['fold_order'] is not None:
            fold = expt_settings['fold_order'][f]
        else:
            fold = list(expt_settings['folds'].keys())[f]

        _, docids = load_ling_features(expt_settings['dataset'])
        _, _, _, item_idx_ranktest, _, _ = get_fold_regression_data(
            expt_settings['folds_regression'], fold, docids)

        pred_rank = pred_rank[item_idx_ranktest, :]
        postprocced = True
        print("Postprocessed: %i, %i" % (pred_rank.shape[0], gold_rank.shape[0]))

    # Considering only the labels where a confident prediction has been made... In this case the metrics should be
    # shown alongside coverage.
    # gold_disc = gold_disc[np.abs(pred_prob.flatten() - 0.5) > 0.3]
    # pred_disc = pred_disc[np.abs(pred_prob.flatten() - 0.5) > 0.3]
    #
    # gold_prob = gold_prob[np.abs(pred_prob.flatten() - 0.5) > 0.3]
    # pred_prob = pred_prob[np.abs(pred_prob.flatten() - 0.5) > 0.3]

    return gold_disc, pred_disc, gold_prob, pred_prob, gold_rank, pred_rank, pred_tr_disc, \
        pred_tr_prob, postprocced
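# Hypothetical sketch of the per-fold results layout consumed by get_fold_data, with the slot
# meanings inferred from the indexing above: data[0] holds predicted preference probabilities,
# data[1] predicted binary labels, data[2] predicted ranking scores, data[3] gold labels in
# {0, 1, 2} and data[4] gold ranking scores; real fold files may carry further slots, e.g.
# data[8] for training-set predictions. The toy values are made up for illustration only.
def _example_get_fold_data_call():
    import numpy as np

    data_f = [
        np.array([0.9, 0.2, 0.7]),    # predicted preference probabilities
        np.array([1, 0, 1]),          # predicted binary labels (rescaled to {0, 2} inside)
        np.array([0.5, -0.1, 0.3]),   # predicted ranking scores
        np.array([2, 0, 2]),          # gold discrete labels
        np.array([0.8, 0.1, 0.6]),    # gold ranking scores
    ]

    gold_disc, pred_disc, gold_prob, pred_prob, gold_rank, pred_rank, _, _, postprocced = \
        get_fold_data(data_f, 0, expt_settings={'fold_order': None})
    print(gold_disc, pred_disc.ravel(), postprocced)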
def compute_metrics(expt_settings, methods, datasets, feature_types, embeddings_types,
                    accuracy=1.0, di=0, npairs=0, tag='', remove_seen_from_mean=False,
                    max_no_folds=32, min_folds_desired=0, compute_tr_performance=False,
                    flip_labels=[]):

    expt_settings['acc'] = accuracy
    expt_settings['di'] = di

    row_index = np.zeros(len(methods) * len(datasets), dtype=object)
    columns = np.zeros(len(feature_types) * len(embeddings_types), dtype=object)

    row = 0

    if expt_settings['di'] == 0 or np.ceil(np.float(npairs) / np.float(expt_settings['di'])) == 0:
        AL_rounds = np.array([0]).astype(int)
    else:
        AL_rounds = np.arange(expt_settings['di'], npairs + expt_settings['di'],
                              expt_settings['di'], dtype=int)
        # np.arange(np.ceil(np.float(npairs) / np.float(expt_settings['di'])), dtype=int)

    if tag == '':
        ts = time.time()
        tag = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H-%M-%S')

    for d, dataset_next in enumerate(datasets):

        docids = None

        if expt_settings['dataset'] != dataset_next or expt_settings['folds'] is None:
            expt_settings['dataset'] = dataset_next
            expt_settings['folds'], expt_settings['folds_regression'], _, _, _ = \
                load_train_test_data(expt_settings['dataset'])

        for m, expt_settings['method'] in enumerate(methods):

            if d == 0 and m == 0:
                if expt_settings['di'] == 0:
                    results_shape = (len(methods) * len(datasets),
                                     len(feature_types) * len(embeddings_types),
                                     len(expt_settings['folds']) + 1, 1)
                else:
                    results_shape = (len(methods) * len(datasets),
                                     len(feature_types) * len(embeddings_types),
                                     len(expt_settings['folds']) + 1,
                                     int(npairs / expt_settings['di']))

                results_f1 = np.zeros(results_shape)
                results_acc = np.zeros(results_shape)
                results_logloss = np.zeros(results_shape)
                results_auc = np.zeros(results_shape)

                results_pearson = np.zeros(results_shape)
                results_spearman = np.zeros(results_shape)
                results_kendall = np.zeros(results_shape)

                tr_results_f1 = np.zeros(results_shape)
                tr_results_acc = np.zeros(results_shape)
                tr_results_logloss = np.zeros(results_shape)
                tr_results_auc = np.zeros(results_shape)

            row_index[row] = expt_settings['method'] + ', ' + expt_settings['dataset']

            col = 0

            for expt_settings['feature_type'] in feature_types:

                if expt_settings['feature_type'] == 'ling':
                    embeddings_to_use = ['']
                else:
                    embeddings_to_use = embeddings_types

                for expt_settings['embeddings_type'] in embeddings_to_use:

                    data, nFolds, resultsdir, resultsfile = load_results_data(
                        data_root_dir, resultsfile_template, expt_settings, max_no_folds)

                    min_folds = min_folds_desired

                    for f in range(nFolds):
                        print("Processing fold %i" % f)

                        if expt_settings['fold_order'] is None:
                            # fall back to the order on the current machine
                            fold = list(expt_settings['folds'].keys())[f]
                        else:
                            fold = expt_settings['fold_order'][f]
                            if fold[-2] == "'" and fold[0] == "'":
                                fold = fold[1:-2]
                            elif fold[-1] == "'" and fold[0] == "'":
                                fold = fold[1:-1]
                            expt_settings['fold_order'][f] = fold

                        # look for new-style data in separate files for each fold. Prefer new-style if both are found.
                        foldfile = resultsdir + '/fold%i.pkl' % f
                        if os.path.isfile(foldfile):
                            with open(foldfile, 'rb') as fh:
                                data_f = pickle.load(fh, encoding='latin1')
                        else:  # convert the old stuff to new stuff
                            if data is None:
                                min_folds = f + 1
                                print('Skipping fold with no data %i' % f)
                                print("Skipping results for %s, %s, %s, %s" % (
                                    expt_settings['method'], expt_settings['dataset'],
                                    expt_settings['feature_type'], expt_settings['embeddings_type']))
                                print("Skipped filename was: %s, old-style results file would be %s" % (
                                    foldfile, resultsfile))
                                continue

                            if not os.path.isdir(resultsdir):
                                os.mkdir(resultsdir)

                            data_f = []
                            for thing in data:
                                if f in thing:
                                    data_f.append(thing[f])
                                else:
                                    data_f.append(thing)
                            with open(foldfile, 'wb') as fh:
                                pickle.dump(data_f, fh)

                        gold_disc, pred_disc, gold_prob, pred_prob, gold_rank, pred_rank, pred_tr_disc, \
                            pred_tr_prob, postprocced = get_fold_data(data_f, f, expt_settings,
                                                                      flip_labels=m in flip_labels)

                        if postprocced:  # data was postprocessed and needs saving
                            with open(foldfile, 'wb') as fh:
                                pickle.dump(data_f, fh)

                        if pred_tr_disc is not None:
                            print(str(pred_tr_disc.shape) + ', ' + str(pred_prob.shape) + ', '
                                  + str(pred_tr_disc.shape[0] + pred_prob.shape[0]))

                        for AL_round, _ in enumerate(AL_rounds):
                            # print("fold %i " % f)
                            # print(AL_round)

                            if AL_round >= pred_disc.shape[1]:
                                continue

                            results_f1[row, col, f, AL_round] = f1_score(
                                gold_disc[gold_disc != 1],
                                pred_disc[gold_disc != 1, AL_round],
                                average='macro')  # skip the don't knows

                            results_acc[row, col, f, AL_round] = accuracy_score(
                                gold_disc[gold_disc != 1],
                                pred_disc[gold_disc != 1, AL_round])

                            results_logloss[row, col, f, AL_round] = log_loss(
                                gold_prob[gold_disc != 1],
                                pred_prob[gold_disc != 1, AL_round])

                            results_auc[row, col, f, AL_round] = roc_auc_score(
                                gold_prob[gold_disc != 1],
                                pred_prob[gold_disc != 1, AL_round])  # macro

                            if gold_rank is None and expt_settings['folds_regression'] is not None:
                                if docids is None:
                                    _, docids = load_ling_features(expt_settings['dataset'])
                                # ranking data was not saved in the original file;
                                # get it from expt_settings['folds_regression'] here
                                _, rankscores_test, _, _ = expt_settings['folds_regression'].get(fold)["test"]
                                gold_rank = np.array(rankscores_test)

                            if gold_rank is not None and pred_rank is not None:
                                results_pearson[row, col, f, AL_round] = pearsonr(
                                    gold_rank, pred_rank[:, AL_round])[0]
                                results_spearman[row, col, f, AL_round] = spearmanr(
                                    gold_rank, pred_rank[:, AL_round])[0]
                                results_kendall[row, col, f, AL_round] = kendalltau(
                                    gold_rank, pred_rank[:, AL_round])[0]

                            # (a worked toy example of this adjustment follows compute_metrics below)
                            def mean_unseen(result, remove_seen_from_mean):
                                if not remove_seen_from_mean:
                                    return result
                                N = len(gold_tr)
                                Nseen = (AL_round + 1) * expt_settings['di']
                                Nunseen = N - Nseen
                                return (result * N - Nseen) / Nunseen

                            if pred_tr_prob is not None and AL_round < pred_tr_disc.shape[1] \
                                    and compute_tr_performance:

                                _, _, gold_tr, _, _, _, _ = expt_settings['folds'].get(fold)["training"]
                                gold_tr = np.array(gold_tr)

                                if (gold_tr != 1).shape[0] != pred_tr_disc.shape[0]:
                                    print("Mismatch in fold %s! %i, %i" % (
                                        fold, (gold_tr != 1).shape[0], pred_tr_disc.shape[0]))

                                gold_tr_prob = gold_tr / 2.0

                                tr_results_f1[row, col, f, AL_round] = mean_unseen(
                                    f1_score(gold_tr[gold_tr != 1],
                                             pred_tr_disc[gold_tr != 1, AL_round],
                                             average='macro'),
                                    remove_seen_from_mean)  # skip the don't knows

                                tr_results_acc[row, col, f, AL_round] = mean_unseen(
                                    accuracy_score(gold_tr[gold_tr != 1],
                                                   pred_tr_disc[gold_tr != 1, AL_round]),
                                    remove_seen_from_mean)

                                tr_results_logloss[row, col, f, AL_round] = mean_unseen(
                                    log_loss(gold_tr_prob[gold_tr != 1],
                                             pred_tr_prob[gold_tr != 1, AL_round]),
                                    remove_seen_from_mean)

                                tr_results_auc[row, col, f, AL_round] = mean_unseen(
                                    roc_auc_score(gold_tr_prob[gold_tr != 1],
                                                  pred_tr_prob[gold_tr != 1, AL_round]),
                                    remove_seen_from_mean)

                            elif pred_tr_prob is not None and AL_round >= pred_tr_disc.shape[1]:
                                tr_results_f1[row, col, f, AL_round] = 1
                                tr_results_acc[row, col, f, AL_round] = 1
                                tr_results_auc[row, col, f, AL_round] = 1
                                tr_results_logloss[row, col, f, AL_round] = 0

                    for AL_round in range(results_f1.shape[3]):
                        # skip any folds that did not complete when taking the mean
                        foldrange = np.arange(min_folds, max_no_folds)
                        foldrange = foldrange[results_f1[row, col, foldrange, AL_round] != 0]

                        results_f1[row, col, -1, AL_round] = np.mean(
                            results_f1[row, col, foldrange, AL_round], axis=0)
                        results_acc[row, col, -1, AL_round] = np.mean(
                            results_acc[row, col, foldrange, AL_round], axis=0)
                        results_logloss[row, col, -1, AL_round] = np.mean(
                            results_logloss[row, col, foldrange, AL_round], axis=0)
                        results_auc[row, col, -1, AL_round] = np.mean(
                            results_auc[row, col, foldrange, AL_round], axis=0)

                        results_pearson[row, col, -1, AL_round] = np.mean(
                            results_pearson[row, col, foldrange, AL_round], axis=0)
                        results_spearman[row, col, -1, AL_round] = np.mean(
                            results_spearman[row, col, foldrange, AL_round], axis=0)
                        results_kendall[row, col, -1, AL_round] = np.mean(
                            results_kendall[row, col, foldrange, AL_round], axis=0)

                        tr_results_f1[row, col, -1, AL_round] = np.mean(
                            tr_results_f1[row, col, foldrange, AL_round], axis=0)
                        tr_results_acc[row, col, -1, AL_round] = np.mean(
                            tr_results_acc[row, col, foldrange, AL_round], axis=0)
                        tr_results_logloss[row, col, -1, AL_round] = np.mean(
                            tr_results_logloss[row, col, foldrange, AL_round], axis=0)
                        tr_results_auc[row, col, -1, AL_round] = np.mean(
                            tr_results_auc[row, col, foldrange, AL_round], axis=0)

                        print('p-values for %s, %s, %s, %s:' % (
                            expt_settings['dataset'], expt_settings['method'],
                            expt_settings['feature_type'], expt_settings['embeddings_type']))
                        print(wilcoxon(results_f1[0, 0, foldrange, AL_round],
                                       results_f1[row, col, foldrange, AL_round])[1])
                        print(wilcoxon(results_acc[0, 0, foldrange, AL_round],
                                       results_acc[row, col, foldrange, AL_round])[1])
                        print(wilcoxon(results_logloss[0, 0, foldrange, AL_round],
                                       results_logloss[row, col, foldrange, AL_round])[1])
                        print(wilcoxon(results_auc[0, 0, foldrange, AL_round],
                                       results_auc[row, col, foldrange, AL_round])[1])
                        print(wilcoxon(results_pearson[0, 0, foldrange, AL_round],
                                       results_pearson[row, col, foldrange, AL_round])[1])
                        print(wilcoxon(results_spearman[0, 0, foldrange, AL_round],
                                       results_spearman[row, col, foldrange, AL_round])[1])
                        print(wilcoxon(results_kendall[0, 0, foldrange, AL_round],
                                       results_kendall[row, col, foldrange, AL_round])[1])

                    if row == 0:  # set the column headers
                        columns[col] = expt_settings['feature_type'] + ', ' + expt_settings['embeddings_type']

                    col += 1

            row += 1

    combined_labels = []
    for row in row_index:
        for col in columns:
            combined_labels.append(str(row) + '_' + str(col))
    mean_results = []
    mean_results.append(collate_AL_results(AL_rounds, results_f1, combined_labels,
                                           "Macro-F1 scores for round %i: "))
    mean_results.append(collate_AL_results(AL_rounds, results_acc, combined_labels,
                                           "Accuracy (excl. don't knows), round %i:"))
    # for UKPConvArgStrict the don't knows are already omitted
    mean_results.append(collate_AL_results(AL_rounds, results_auc, combined_labels,
                                           "AUC ROC, round %i:"))
    # If AUC is higher than accuracy and F1 score, it suggests that the decision boundary is not calibrated, or that
    # accuracy may improve if we exclude data points close to the decision boundary.
    mean_results.append(collate_AL_results(AL_rounds, results_logloss, combined_labels,
                                           "Cross Entropy classification error, round %i: "))
    # (quality of the probability labels is taken into account)
    mean_results.append(collate_AL_results(AL_rounds, results_pearson, combined_labels,
                                           "Pearson's r for round %i: "))
    mean_results.append(collate_AL_results(AL_rounds, results_spearman, combined_labels,
                                           "Spearman's rho for round %i: "))
    mean_results.append(collate_AL_results(AL_rounds, results_kendall, combined_labels,
                                           "Kendall's tau for round %i: "))

    if np.any(tr_results_acc):
        mean_results.append(collate_AL_results(AL_rounds, tr_results_f1, combined_labels,
                                               "(TR) Macro-F1 scores for round %i: "))
        mean_results.append(collate_AL_results(AL_rounds, tr_results_acc, combined_labels,
                                               "(TR) Accuracy for round %i: "))
        mean_results.append(collate_AL_results(AL_rounds, tr_results_auc, combined_labels,
                                               "(TR) AUC ROC for round %i: "))
        mean_results.append(collate_AL_results(AL_rounds, tr_results_logloss, combined_labels,
                                               "(TR) Cross Entropy Error for round %i: "))

    # metricsfile = data_root_dir + 'outputdata/expt_root_dir' + \
    #     'metrics_%s.pkl' % (tag)
    # with open(metricsfile, 'w') as fh:
    #     pickle.dump((results_f1, results_acc, results_auc, results_logloss, results_pearson, results_spearman,
    #                  results_kendall), fh)

    # TODO: Correlations between reasons and features?
    # TODO: Correlations between reasons and latent argument features found using preference components?

    return results_f1, results_acc, results_auc, results_logloss, results_pearson, results_spearman, results_kendall, \
        tr_results_f1, tr_results_acc, tr_results_auc, tr_results_logloss, mean_results, combined_labels
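# Worked toy example of the mean_unseen adjustment used inside compute_metrics when
# remove_seen_from_mean is set: given a mean computed over all N training pairs, it removes the
# contribution of the Nseen pairs already queried during active learning, which (for scores such
# as accuracy or F1) amounts to treating each seen pair as scoring 1. The numbers below are made
# up purely for illustration.
def _example_mean_unseen():
    result = 0.9   # mean score over all N training pairs
    N = 100        # total number of training pairs
    Nseen = 20     # pairs already seen after this active-learning round
    Nunseen = N - Nseen

    # total score = 0.9 * 100 = 90; removing the 20 seen pairs (assumed score 1 each)
    # leaves 70 over 80 unseen pairs, i.e. 0.875
    print((result * N - Nseen) / Nunseen)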