def test_compare_to_mcnemar_on_2_models():
    """Cochran's Q on two models must equal the uncorrected McNemar test."""
    # Ground truth is all zeros, so every predicted 1 is a misclassification.
    y_true = np.array([0] * 100)
    # Model 1 is wrong on the first 16 samples only.
    ym1 = np.array([1] * 16 + [0] * 84)
    # Model 2 is wrong on samples 0-5 and on samples 20-21.
    ym2 = np.array([1] * 6 + [0] * 14 + [1] * 2 + [0] * 78)

    q, p = cochrans_q(y_true, ym1, ym2)

    contingency = mcnemar_table(y_true, ym1, ym2)
    mcn_q, mcn_p = mcnemar(contingency, corrected=False, exact=False)

    assert q == mcn_q
    assert p == mcn_p
def statistics():
    """Compare two normal/pneumonia classifiers with McNemar's test.

    Loads the class predictions stored for the InceptionV3 and ResNetV2
    models, prints the contingency table, the continuity-corrected McNemar
    statistic and p-value, and finally each model's accuracy on the test
    labels.

    The original author's note says to run ``combined_classify_to_csv()``
    first; presumably that is what writes the two CSV files read here.
    """

    def _read_class_preds(suffix):
        # The CSVs are read without a header row; column 3 is used as the
        # predicted class.
        frame = pd.read_csv(normpneum_bin_file_dir + suffix, header=None)
        return frame.to_numpy()[:, 3]

    # InceptionV3 predictions, then ResNetV2 predictions.
    normpneum_incv3_class_preds = _read_class_preds('_incv3.csv')
    normpneum_resnetv2_class_preds = _read_class_preds('_resnetv2.csv')

    # Test labels: index of the largest entry along the last axis.
    normpneum_test = np.argmax(normpneum_test_labels, axis=-1)

    # Contingency table of the two models against the test labels.
    tb = mcnemar_table(
        y_target=normpneum_test,
        y_model1=normpneum_incv3_class_preds,
        y_model2=normpneum_resnetv2_class_preds,
    )
    print(tb)

    # McNemar's test with continuity correction.
    chi2, p = mcnemar(ary=tb, corrected=True)
    print('chi-squared:', chi2)
    print('p-value:', p)

    accuracy_normpneum_incv3, accuracy_normpneum_resnetv2 = [
        accuracy_score(normpneum_test, class_preds)
        for class_preds in (normpneum_incv3_class_preds,
                            normpneum_resnetv2_class_preds)
    ]
    print(f"Test accuracy normpneum incv3: {accuracy_normpneum_incv3}")
    print(
        f"Test accuracy normpneum incresnetv2: {accuracy_normpneum_resnetv2}")
def run_mcnemar_test(y_test, model_1_class_predictions,
                     model_2_class_predictions, model_1_name, model_2_name):
    """
    Compares two models' class predictions with the corrected McNemar test.

    The chi-squared statistic and p-value are written to a one-row CSV and
    the contingency table is rendered as a checkerboard plot saved as a PNG.

    :param y_test: y_test series
    :param model_1_class_predictions: class predictions from model 1
    :param model_2_class_predictions: class predictions from model 2
    :param model_1_name: name of the first model
    :param model_2_name: name of the second model
    """
    file_stem = f'{model_1_name}_{model_2_name}_mcnemar_test'

    results_table = mcnemar_table(y_target=y_test,
                                  y_model1=model_1_class_predictions,
                                  y_model2=model_2_class_predictions)
    chi2, p = mcnemar(ary=results_table, corrected=True)

    # NOTE(review): the CSV is written relative to the current working
    # directory while the PNG goes under modeling/comparison_files -
    # confirm that this difference is intended.
    summary = pd.DataFrame({'chi2': [chi2], 'p': [p]})
    summary.to_csv(f'{file_stem}.csv')

    checkerboard_plot(
        results_table,
        figsize=(6, 6),
        fmt='%d',
        col_labels=[f'{model_2_name} wrong', f'{model_2_name} right'],
        row_labels=[f'{model_1_name} wrong', f'{model_1_name} right'])
    plt.tight_layout()
    png_path = os.path.join('modeling', 'comparison_files',
                            f'{file_stem}.png')
    plt.savefig(png_path)
    # Clear the current figure once it has been saved.
    plt.clf()
def summarize_feature_comparisons(
    base_clf: BaseEstimator,
    comparison_clfs: Dict[str, BaseEstimator],
    X_test,
    y_test,
):
    """Summarize how a base classifier compares with alternative classifiers.

    Runs Cochran's Q test over the predictions of all classifiers, then an
    exact McNemar test of the base classifier against each alternative.

    Parameters
    ----------
    base_clf : fitted estimator exposing ``predict``
    comparison_clfs : mapping of name -> fitted estimator exposing ``predict``
    X_test : features passed to every ``predict`` call
    y_test : target values the predictions are compared against

    Returns
    -------
    collections.OrderedDict
        ``cochrans_q`` and ``cochrans_q_pval`` followed by
        ``mcnemar_base vs <name>_chi2stat`` / ``..._pval`` for every
        comparison classifier, in the iteration order of ``comparison_clfs``.
    """
    from mlxtend.evaluate import mcnemar, cochrans_q, mcnemar_table

    summary_dict = collections.OrderedDict()
    mcnemar_tbs = {}

    # Predictions of the base classifier come first; one entry per
    # comparison classifier is appended below.
    base_y_predict = base_clf.predict(X_test)
    y_predictions = [base_y_predict]

    for name, clf in comparison_clfs.items():
        y_predict = clf.predict(X_test)

        # The contingency table is stored as returned; the previous
        # ``tb.values()`` call fails on the array that mcnemar_table returns.
        mcnemar_tbs[f"base vs {name}"] = mcnemar_table(
            y_test, base_y_predict, y_predict)

        y_predictions.append(y_predict)

    # First run Cochran's Q test across all classifiers at once.
    qstat, pval = cochrans_q(y_test, *y_predictions)
    summary_dict["cochrans_q"] = qstat
    summary_dict["cochrans_q_pval"] = pval

    # Then run the pairwise exact McNemar tests against the base classifier.
    for name, table in mcnemar_tbs.items():
        chi2stat, pval = mcnemar(table, exact=True)
        summary_dict[f"mcnemar_{name}_chi2stat"] = chi2stat
        summary_dict[f"mcnemar_{name}_pval"] = pval
    return summary_dict
def test_corrected_false():
    """Chi-squared McNemar test without continuity correction."""
    tb = np.array([[101, 121], [59, 33]])

    chi2, p = (21.355555555555554, 3.8151358651125936e-06)
    chi2p, pp = mcnemar(tb, corrected=False)

    # The precision is passed positionally: the tests in this file name it
    # both ``places`` and ``decimal``, and the third positional argument is
    # the precision in either spelling of ``assert_almost_equal``.
    assert_almost_equal(chi2, chi2p, 7)
    assert_almost_equal(p, pp, 7)
def test_defaults():
    """McNemar test called with its default arguments."""
    tb = np.array([[101, 121], [59, 33]])

    chi2, p = (20.672222222222221, 5.4500948254271171e-06)
    chi2p, pp = mcnemar(tb)

    # The precision is passed positionally: the tests in this file name it
    # both ``places`` and ``decimal``, and the third positional argument is
    # the precision in either spelling of ``assert_almost_equal``.
    assert_almost_equal(chi2, chi2p, 7)
    assert_almost_equal(p, pp, 7)
def test_exact_corrected():
    """Exact McNemar test called with ``corrected=False``."""
    tb = np.array([[101, 121], [59, 33]])

    expected_p = 4.4344492637555101e-06
    chi2p, pp = mcnemar(tb, exact=True, corrected=False)

    # Assert on the values returned by mcnemar(); the earlier version only
    # checked its own hard-coded expectations, which can never fail.
    assert chi2p is None
    # Precision passed positionally so it works whether the helper names
    # it ``places`` or ``decimal``.
    assert_almost_equal(expected_p, pp, 7)
    assert pp < 4.45e-06
def test_exact():
    """Exact (binomial) McNemar test on a fixed 2x2 table."""
    tb = np.array([[101, 121], [59, 33]])

    expected_p = 4.4344492637555101e-06
    chi2p, pp = mcnemar(tb, exact=True)

    # Assert on the values returned by mcnemar(); the earlier version only
    # checked its own hard-coded expectations, which can never fail.
    assert chi2p is None
    # Precision passed positionally so it works whether the helper names
    # it ``places`` or ``decimal``.
    assert_almost_equal(expected_p, pp, 7)
    assert pp < 4.45e-06
def svm_p_value(trainData, testData, input_pred):
    """Fit a LinearSVC and return its McNemar p-value against ``input_pred``.

    The classifier is fitted on the ``player_array`` / ``win`` entries of
    ``trainData`` and predicts for ``testData``.  Those predictions and
    ``input_pred`` are compared with the continuity-corrected McNemar test,
    using the ``win`` entries of ``testData`` as the target.
    """
    classifier = LinearSVC(max_iter=10000, verbose=50, C=0.1)

    def _as_array(frame, key):
        return np.array(list(frame[key]))

    features_train = _as_array(trainData, 'player_array')
    target_train = _as_array(trainData, 'win')
    features_test = _as_array(testData, 'player_array')
    target_test = _as_array(testData, 'win')

    classifier.fit(features_train, target_train)
    predictions = classifier.predict(features_test)

    table = mcnemar_table(y_target=target_test,
                          y_model1=input_pred,
                          y_model2=predictions)
    # Only the p-value is reported; the chi-squared statistic is discarded.
    _, p_value = mcnemar(ary=table, corrected=True)
    return p_value
def mlp_p_value(trainData, testData, input_pred):
    """Fit an MLPClassifier and return its McNemar p-value vs ``input_pred``.

    The network is fitted on the ``player_array`` / ``win`` entries of
    ``trainData`` and predicts for ``testData``.  Those predictions and
    ``input_pred`` are compared with the continuity-corrected McNemar test,
    using the ``win`` entries of ``testData`` as the target.
    """
    classifier = MLPClassifier(verbose=True,
                               max_iter=500,
                               tol=0.0005,
                               solver='adam',
                               alpha=0.0001,
                               activation='logistic',
                               hidden_layer_sizes=(50, 40))

    def _as_array(frame, key):
        return np.array(list(frame[key]))

    features_train = _as_array(trainData, 'player_array')
    target_train = _as_array(trainData, 'win')
    features_test = _as_array(testData, 'player_array')
    target_test = _as_array(testData, 'win')

    classifier.fit(features_train, target_train)
    predictions = classifier.predict(features_test)

    table = mcnemar_table(y_target=target_test,
                          y_model1=input_pred,
                          y_model2=predictions)
    # Only the p-value is reported; the chi-squared statistic is discarded.
    _, p_value = mcnemar(ary=table, corrected=True)
    return p_value
def log_p_value(trainData, testData, input_pred):
    """Fit a ``LogR`` model and return its McNemar p-value vs ``input_pred``.

    The model is fitted on the ``player_array`` / ``win`` entries of
    ``trainData`` and predicts for ``testData``.  Those predictions and
    ``input_pred`` are compared with the continuity-corrected McNemar test,
    using the ``win`` entries of ``testData`` as the target.
    """
    classifier = LogR(max_iter=500, solver='newton-cg', C=0.1)

    def _as_array(frame, key):
        return np.array(list(frame[key]))

    features_train = _as_array(trainData, 'player_array')
    target_train = _as_array(trainData, 'win')
    features_test = _as_array(testData, 'player_array')
    target_test = _as_array(testData, 'win')

    classifier.fit(features_train, target_train)
    predictions = classifier.predict(features_test)

    table = mcnemar_table(y_target=target_test,
                          y_model1=input_pred,
                          y_model2=predictions)
    # Only the p-value is reported; the chi-squared statistic is discarded.
    _, p_value = mcnemar(ary=table, corrected=True)
    return p_value
def mcnemar_test(target, model_1_pred, model_2_pred):
    """
    Returns the p-value of McNemar's test for two sets of predictions.

    A contingency table is built from the predictions. When the two
    off-diagonal cells (b + c) sum to less than 25, the exact test is
    requested from ``mcnemar``; otherwise its default chi-squared form is
    used. See https://en.wikipedia.org/wiki/McNemar%27s_test

    :param target: a numpy array that has the actual target values
    :param model_1_pred: a numpy array with the predictions of model 1
    :param model_2_pred: a numpy array with the predictions of model 2
    :return p_value: the p-value reported by the selected test
    """
    table = mcnemar_table(y_target=target,
                          y_model1=model_1_pred,
                          y_model2=model_2_pred)

    discordant_pairs = table[0, 1] + table[1, 0]
    use_exact = bool(discordant_pairs < 25)

    _, p_value = mcnemar(ary=table, exact=use_exact)
    return p_value
def main(file_1, file_2):
    """Log a McNemar comparison of two prediction files.

    Both files are read in lockstep. On every line the first
    whitespace-separated field is ignored and the remaining two are parsed
    as integers: ground truth, then prediction. The process exits with
    status 1 as soon as the two ground-truth values of a line pair differ.

    A hand-counted 2x2 agreement table, the ``mcnemar_table`` result, and the
    continuity-corrected chi-squared statistic and p-value are logged.
    """
    both_right = only_1_right = only_2_right = both_wrong = 0
    truths, preds_1, preds_2 = [], [], []

    with open(file_1) as f1, open(file_2) as f2:
        for row_1, row_2 in zip(f1, f2):
            truth_1, guess_1 = map(int, row_1.strip().split()[1:])
            truth_2, guess_2 = map(int, row_2.strip().split()[1:])
            if truth_1 != truth_2:
                logger.error('Files do not belong to the same dataset')
                sys.exit(1)

            truths.append(truth_1)
            preds_1.append(guess_1)
            preds_2.append(guess_2)

            hit_1 = guess_1 == truth_1
            hit_2 = guess_2 == truth_1
            if hit_1 and hit_2:
                both_right += 1
            elif hit_1:
                only_1_right += 1
            elif hit_2:
                only_2_right += 1
            else:
                both_wrong += 1

    # One entry was appended per processed line pair.
    nb_lines = len(truths)
    logger.info('Loaded {} lines..'.format(nb_lines))
    logger.info('| {} | {} |'.format(both_right, only_1_right))
    logger.info('| {} | {} |'.format(only_2_right, both_wrong))

    tb = mcnemar_table(y_target=np.array(truths),
                       y_model1=np.array(preds_1),
                       y_model2=np.array(preds_2))
    logger.info('\n {}'.format(tb))

    chi2, p = mcnemar(ary=tb, corrected=True)
    logger.info('chi-squared: {}'.format(chi2))
    logger.info('p-value: {}'.format(p))
def stat_test(df, classifier1, classifier2):
    """Print a McNemar comparison of two text classifiers on a held-out split.

    The ``Cleaned`` texts and ``Class`` labels of ``df`` are split 80:20
    (stratified, fixed seed). For each classifier a fresh TF-IDF vectorizer
    is fitted on the training texts and used to transform the test texts
    before predicting. The contingency table, chi-squared statistic and
    p-value of the continuity-corrected test are printed.
    """
    texts = df['Cleaned'].values
    classes = df['Class'].values

    # Only the text splits and the test labels are needed below.
    x_train, x_test, _, y_test = train_test_split(texts,
                                                  classes,
                                                  test_size=0.2,
                                                  random_state=1000,
                                                  stratify=classes)

    def _predict_on_test(classifier):
        # Each classifier gets its own vectorizer fitted on the train texts.
        vectorizer = TfidfVectorizer()
        vectorizer.fit(x_train)
        return classifier.predict(vectorizer.transform(x_test))

    y_pred_1 = _predict_on_test(classifier1)
    y_pred_2 = _predict_on_test(classifier2)

    contingency_table = mcnemar_table(y_target=y_test,
                                      y_model1=y_pred_1,
                                      y_model2=y_pred_2)
    print(contingency_table)

    chi2, p_val = mcnemar(ary=contingency_table, corrected=True)
    print('chi-squared:', chi2)
    print('p-value:', p_val)
def compute_stat_sig(systems_data, measure):
    """Mark which reference systems differ significantly from each system.

    For every system, its ``measure`` values are compared with those of
    ``sota`` and ``human`` using the continuity-corrected McNemar test
    against an all-ones target. When 0 <= p <= 0.05 the first letter of the
    reference system's name is recorded.

    Returns a mapping from system name to a comma-joined string of the
    recorded letters (empty when nothing was significant).
    """
    compared_systems = ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota"]
    reference_systems = ["sota", "human"]

    def _values(name):
        return [entry[measure] for entry in systems_data[name]]

    significance = defaultdict(list)
    for system in compared_systems:
        for other_system in reference_systems:
            if system == other_system:
                continue

            sys_data = _values(system)
            other_sys_data = _values(other_system)
            # Every sample's target is 1.
            target = np.array([1] * len(sys_data))

            tb_b = mcnemar_table(y_target=target,
                                 y_model1=np.array(sys_data),
                                 y_model2=np.array(other_sys_data))
            chi2, p_value = mcnemar(ary=tb_b, corrected=True)
            print(tb_b)
            print(
                f"mcnemar {system},{other_system}: chi2: {chi2}, p-value {p_value}"
            )

            if 0 <= p_value <= 0.05:
                significance[system].append(other_system[0])

        # Collapse the collected letters into one string per system.
        significance[system] = ",".join(significance[system])
    return significance
# KNN
knnModel = KNeighborsClassifier(n_neighbors=3)
knnModel.fit(X_train, y_train)
score = knnModel.score(X_test, y_test)
print(score)

# Create contingency table.
# Each model predicts the whole test set in a single call instead of one
# call per sample.
regression_predictions = regressionModel.predict(X_test)
knn_predictions = knnModel.predict(X_test)

# Cell layout: row 1 when the KNN prediction is wrong, column 1 when the
# regression prediction is wrong; [0, 0] counts the samples both got right.
contingency = np.array([[0, 0], [0, 0]])
for predicted_regression, predicted_knn, y in zip(regression_predictions,
                                                  knn_predictions, y_test):
    knn_wrong = int(predicted_knn != y)
    regression_wrong = int(predicted_regression != y)
    contingency[knn_wrong, regression_wrong] += 1
print(contingency)

# Calculate McNemar test
# NOTE(review): this unpacks a (statistic, pvalue) pair and formats both as
# floats - confirm the imported ``mcnemar`` returns a numeric statistic when
# exact=True.
statistic, pvalue = mcnemar(contingency, exact=True)
print('statistic=%.3f, p-value=%.3f' % (statistic, pvalue))

alpha = 0.05
if pvalue > alpha:
    print('Same proportions of errors (fail to reject H0)')
else:
    print('Different proportions of errors (reject H0)')
def train():
    """Train and evaluate the CUI and CUI+CCS networks fold by fold.

    The data returned by ``load_tensors()`` is cut into 5 parts. For each of
    the ``ARGS.kFold`` folds one part is held out for testing, both networks
    are trained on the remaining parts, and the following are computed:

    * precision at top-1/2/3 and recall at top-10/20/30 for both models,
    * a micro-averaged AUROC (see the note in the AUROC section),
    * a continuity-corrected McNemar test between the two models, appended
      to ``McNemar_report.txt``.

    Per-fold precision, recall and AUROC are printed at the end. Requires a
    CUDA device (device 0 is selected explicitly).
    """
    data_x1, data_x2, data_y = load_tensors()
    print("Data of size:", len(data_x1))
    print("Data 2 of size:", len(data_x2))

    # Split dataset into 5 sub-datasets.
    # NOTE(review): the number of parts is fixed at 5 while the fold loop
    # below runs ARGS.kFold times - confirm kFold is never larger than 5.
    splitted_x1 = list(split(data_x1, 5))
    splitted_x2 = list(split(data_x2, 5))
    splitted_y = list(split(data_y, 5))

    print("Available GPU :", torch.cuda.is_available())
    torch.cuda.set_device(0)
    k = ARGS.kFold

    # Per-fold scores
    precision_list = []
    recall_list = []
    AUC_list = []

    for ind_i in range(0, k):
        # Held-out part for this fold
        X_test1 = splitted_x1[ind_i]
        X_test2 = splitted_x2[ind_i]
        Y_test = splitted_y[ind_i]

        # Shallow copies of the part lists, so removing the held-out part
        # does not modify the lists being indexed by the fold loop.
        copysplitX1 = list(splitted_x1)
        copysplitX2 = list(splitted_x2)
        copysplitY = list(splitted_y)
        del copysplitX1[ind_i]
        del copysplitX2[ind_i]
        del copysplitY[ind_i]
        X_train1 = copysplitX1  # CUI + CCS
        X_train2 = copysplitX2  # CUI only
        Y_train = copysplitY

        modelCUI = Network(0).cuda()
        modelCCS = Network(1).cuda()

        # XAVIER Init
        modelCUI.apply(init_weights)
        modelCCS.apply(init_weights)

        with torch.cuda.device(0):
            # Hyperparameters
            epochs = ARGS.nEpochs
            batchsize = ARGS.batchSize
            learning_rate = ARGS.lr

            criterion = nn.BCEWithLogitsLoss()
            optimizer1 = optim.SGD(modelCUI.parameters(), lr=learning_rate)
            optimizer2 = optim.SGD(modelCCS.parameters(), lr=learning_rate)

            # Train loaders: the training parts are concatenated into one
            # tensor per input; both loaders share the same target tensor.
            numplist = np.array(X_train2)
            arrX = np.concatenate(numplist).tolist()
            tensor_x = torch.Tensor(arrX).cuda()
            numplist = np.array(Y_train)
            arrY = np.concatenate(numplist).tolist()
            tensor_y = torch.Tensor(arrY).cuda()
            print("Shape X:", np.shape(arrX), "Shape Y:", np.shape(arrY))
            dataset = dt.TensorDataset(tensor_x, tensor_y)
            train_loader1 = dt.DataLoader(dataset,
                                          batch_size=batchsize,
                                          shuffle=True)

            numplist = np.array(X_train1)
            arrX = np.concatenate(numplist).tolist()
            tensor_x = torch.Tensor(arrX).cuda()
            dataset = dt.TensorDataset(tensor_x, tensor_y)
            train_loader2 = dt.DataLoader(dataset,
                                          batch_size=batchsize,
                                          shuffle=True)

            # Test loaders (not shuffled, so both iterate samples in the
            # same order).
            tensor_x = torch.Tensor(np.array(X_test2).tolist()).cuda()
            tensor_y = torch.Tensor(np.array(Y_test).tolist()).cuda()
            dataset = dt.TensorDataset(tensor_x, tensor_y)
            test_loader1 = dt.DataLoader(dataset,
                                         batch_size=batchsize,
                                         shuffle=False)

            tensor_x = torch.Tensor(np.array(X_test1).tolist()).cuda()
            dataset = dt.TensorDataset(tensor_x, tensor_y)
            test_loader2 = dt.DataLoader(dataset,
                                         batch_size=batchsize,
                                         shuffle=False)

            # Training model CUI
            print("Training CUI model...")
            for epoch in range(epochs):
                for batch_idx, (data, target) in enumerate(train_loader1):
                    data, target = Variable(data), Variable(target)
                    optimizer1.zero_grad()
                    net_out = modelCUI(data)
                    loss = criterion(net_out, target)
                    loss.backward()
                    optimizer1.step()

            # Training model CUI+CCS
            print("Training CUI+CCS model...")
            for epoch in range(epochs):
                for batch_idx, (data, target) in enumerate(train_loader2):
                    data, target = Variable(data), Variable(target)
                    optimizer2.zero_grad()
                    net_out = modelCCS(data)
                    loss = criterion(net_out, target)
                    loss.backward()
                    optimizer2.step()

            # Testing and save score
            total = 0
            correct = 0
            modelCUI.eval()
            modelCCS.eval()
            P = list()
            R = list()
            test_loader_list = list([test_loader1, test_loader2])
            model_list = list([modelCUI, modelCCS])
            # One array of 0/1 outcomes per model, filled while computing
            # recall at 30 and later fed to the McNemar test.
            nemarlist = list([np.array([]), np.array([])])

            # Precisions at top-1, top-2 and top-3
            for model, test_loader in zip(model_list, test_loader_list):
                for i in range(1, 4):
                    for data in test_loader:
                        x, labels = data
                        # output is a tensor of size
                        # [BATCHSIZE][ARGS.numberOfOutputCodes]
                        outputs = model(Variable(x)).detach()
                        _, predicted = torch.topk(outputs.data, i)
                        for y_predlist, y in zip(predicted, labels):
                            for y_pred in y_predlist:
                                total += 1
                                if y[y_pred] == 1:
                                    correct += 1
                    precision = correct / total
                    P.append(precision)
                    correct = 0
                    total = 0

            for model, test_loader, mcnemar_idx in zip(model_list,
                                                       test_loader_list,
                                                       list([0, 1])):
                # Number of diagnostic for each sample (mean of 12 codes,
                # max of 30 codes, R@10 - R@20 - R@30 seems appropriate)
                total_true_list = list()
                for data in test_loader:
                    x, labels = data
                    for y in labels:
                        total_true = 0
                        for val in y:
                            if val == 1:
                                total_true += 1
                        total_true_list.append(total_true)

                # Recalls at top-10, top-20 and top-30
                for i in range(10, 40, 10):
                    total_true_list_cpy = list(total_true_list)
                    for data in test_loader:
                        x, labels = data
                        outputs = model(Variable(x)).detach()
                        _, predicted = torch.topk(outputs.data, i)
                        for y_predlist, y in zip(predicted, labels):
                            total += total_true_list_cpy.pop(0)
                            for y_pred in y_predlist:
                                if y[y_pred] == 1:
                                    correct += 1
                                    if i == 30:
                                        nemarlist[mcnemar_idx] = np.append(
                                            nemarlist[mcnemar_idx], 1)
                                else:
                                    if i == 30:
                                        if correct < total:
                                            nemarlist[mcnemar_idx] = np.append(
                                                nemarlist[mcnemar_idx], 0)
                                        else:
                                            # No more diagnoses are left to
                                            # be found, so the miss is not
                                            # counted as wrong.
                                            nemarlist[mcnemar_idx] = np.append(
                                                nemarlist[mcnemar_idx], 1)
                    recall = correct / total
                    R.append(recall)
                    correct = 0
                    total = 0

            precision_list.append(P)
            recall_list.append(R)

            # AUROC
            # NOTE(review): ``model`` and ``test_loader`` still hold the
            # last pair of the loop above (modelCCS / test_loader2), so the
            # AUROC is computed for that model only - confirm intended.
            YTRUE = None
            YPROBA = None
            for data in test_loader:
                x, labels = data
                x, labels = Variable(x), Variable(labels)
                outputs = model(x).detach().cpu().numpy()
                labels = labels.detach().cpu().numpy()
                for batch_true, batch_prob in zip(labels, outputs):
                    YTRUE = np.concatenate(
                        (YTRUE, [batch_true]),
                        axis=0) if YTRUE is not None else [batch_true]
                    YPROBA = np.concatenate(
                        (YPROBA, [batch_prob]),
                        axis=0) if YPROBA is not None else [batch_prob]
            ROC_avg_score = roc(YTRUE,
                                YPROBA,
                                average='micro',
                                multi_class='ovr')
            AUC_list.append(ROC_avg_score)

            # McNemar test between the two models; the target is all ones.
            nemar_true = np.ones(nemarlist[0].size)
            nemar_m1 = nemarlist[0]
            nemar_m2 = nemarlist[1]
            tb = mcnemar_table(y_target=nemar_true,
                               y_model1=nemar_m1,
                               y_model2=nemar_m2)
            chi2, p = mcnemar(ary=tb, corrected=True)

            # Append this fold's result; the context manager closes the
            # file even if a write fails.
            with open("McNemar_report.txt", "a") as filesave:
                filesave.write("\nMatrix: ")
                filesave.write(str(tb))
                filesave.write("\np-value and chi-squared:")
                filesave.write(str(p))
                filesave.write(" ")
                filesave.write(str(chi2))

    # Output score of each fold
    print("Scores for each fold:")
    print("Precision:", precision_list)
    print("Recall:", recall_list)
    print("AUROC:", AUC_list)
# Axis label, tick labels, legend and y-range for the accuracy plot.
ax.set_ylabel('Accuracy')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(loc='upper right')
ax.set_ylim([0.5, 0.68])

#autolabel(rects1)
#autolabel(rects2)
#autolabel(rects3)

# Show the figure, then write it to disk.
fig.tight_layout()
plt.show()
fig.savefig('fig2.png', dpi=500)

# Worked example of McNemar's test on a fixed 2x2 contingency table.
from statsmodels.stats.contingency_tables import mcnemar

table = [[4, 2], [1, 3]]
result = mcnemar(table, correction=False)
print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))

# Interpret the p-value at the 5% level.
alpha = 0.05
print('Same proportions of errors (fail to reject H0)'
      if result.pvalue > alpha else
      'Different proportions of errors (reject H0)')
def score(data_folder, out_folder, task, score_folder):
    """Aggregate crowd-sourced scores per dataset and write them to JSON.

    For each dataset the system outputs (``<data_folder>/<dataset>/
    <system>.json``), the annotation results (``<score_folder>/<task>/
    <dataset>.csv``) and the annotation inputs (``<out_folder>/<task>/
    mturk_<dataset>.csv``) are loaded. Datasets whose CSV files cannot be
    read are skipped with a message.

    * ``task == "fidelity_annotations"``: prints classification reports,
      per-system aggregates, a McNemar test between the two automatic
      measures and stores per-measure significance markers.
    * ``task == "fluency"``: prints per-system aggregates and pairwise
      Wilcoxon post-hoc p-values and stores significance markers.

    Finally Cohen's kappa between the first two annotations of every item is
    printed and the collected statistics are written to
    ``<data_folder>/<task>.json``.

    :raises KeyError: when a fidelity result row lacks an expected answer
        column.
    """
    data_folder = Path(data_folder)
    out_folder = Path(out_folder)
    score_folder = Path(score_folder)
    datasets = ["ldc", "viggo", "webnlg", "e2e"]
    systems = ["systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota", "human"]
    total_texts = 5

    # Reconciled cases are those where the expert annotators disagreed. They
    # discussed these and reached the following agreements.
    reconciled = {
        "Example 1": 0,
        "Example 2": 1,
    }

    stats = {}
    # First and second annotation of every item, across all datasets.
    first = []
    second = []

    for dataset in datasets:
        print(f"processing {dataset}")
        systems_data = {}
        for system in systems:
            with open(data_folder / dataset / f"{system}.json") as handle:
                systems_data[system] = json.load(handle)
        print(f"dataset: {dataset}")

        all_scored = defaultdict(list)
        score_file = score_folder / task / (f"{dataset}.csv")
        try:
            df = pd.read_csv(score_file)
        except (OSError, ValueError):
            # Missing, unreadable or unparsable file: skip this dataset.
            print(f"{score_file} not available.")
            continue
        scores = df.to_dict(orient="records")

        try:
            input_df = pd.read_csv(out_folder / task /
                                   (f"mturk_{dataset}.csv"))
        except (OSError, ValueError):
            print(f"ignoring {dataset}")
            continue
        input_data = input_df.to_dict(orient="records")

        if task == "fidelity_annotations":
            # Group the annotations by "<index>_<text>".
            for item in scores:
                for i in range(total_texts):
                    text = item[f"Input.text{i + 1}"]
                    index = item["Input.index"]
                    accurate = f"Answer.text{i + 1}_accurate.text{i + 1}_accurate"
                    key = f"{index}_{text}"
                    if accurate not in item:
                        # Fail with a clear message instead of dropping into
                        # an interactive debugger.
                        raise KeyError(
                            f"{score_file}: missing answer column {accurate!r}"
                        )
                    all_scored[key].append({"accurate": item[accurate]})

            fidelity_scores = []
            all_ser_scores = []
            all_sfc_scores = []
            true_scores_sfc = []
            true_scores_ser = []
            sfc_data = defaultdict(list)
            ser_data = defaultdict(list)

            # Items with fewer than two annotations do not contribute to
            # the agreement score.
            for annotations in all_scored.values():
                if len(annotations) >= 2:
                    first.append(annotations[0]["accurate"])
                    second.append(annotations[1]["accurate"])

            for item in input_data:
                for i in range(total_texts):
                    text_i = item[f"text{i + 1}"]
                    system = item[f"system{i + 1}"]
                    index = item["index"]
                    key = f"{index}_{text_i}"
                    if key not in all_scored:
                        continue

                    obj = systems_data[system][index]
                    mean_score = np.mean(
                        [int(x["accurate"]) for x in all_scored[key]])
                    # these have to be reconciled if disagreeing: take ceil
                    # or floor
                    sample_type = f'{"A_D" if obj["sfc_correct"] else "E_D"}'
                    if dataset != "ldc":
                        sample_type += f',{"A_H" if obj["ser_correct"] else "E_H"}'
                    fidelity_scores.append({
                        "ind": index,
                        "system": system,
                        "value": math.ceil(mean_score),
                        "sample_type": sample_type,
                        "text": text_i,
                        "data": item["data"],
                        "original_text":
                        obj["original_" +
                            dataset_fields[dataset]["text"].strip()],
                        "sfc_correct": obj["sfc_correct"],
                        "ser_correct": obj.get("ser_correct", ""),
                    })

                    if text_i in reconciled:
                        true_scores_sfc.append(reconciled[text_i])
                        true_scores_ser.append(reconciled[text_i])
                    else:
                        add_closest_score(mean_score, true_scores_sfc,
                                          obj["sfc_correct"])
                        if dataset != "ldc":
                            add_closest_score(mean_score, true_scores_ser,
                                              obj["ser_correct"])

                    all_sfc_scores.append(obj["sfc_correct"])
                    sfc_data[system].append(obj["sfc_correct"])
                    if dataset != "ldc":
                        all_ser_scores.append(obj["ser_correct"])
                        ser_data[system].append(obj["ser_correct"])

            if dataset != "ldc":
                c_report = classification_report(true_scores_ser,
                                                 all_ser_scores)
                stats[f"{dataset}_ser_report"] = classification_report(
                    true_scores_ser, all_ser_scores, output_dict=True)
                print("SER")
                print(c_report)

            c_report = classification_report(true_scores_sfc, all_sfc_scores)
            stats[f"{dataset}_sfc_report"] = classification_report(
                true_scores_sfc, all_sfc_scores, output_dict=True)
            print("SFC")
            print(c_report)

            mturk_df = pd.DataFrame(fidelity_scores)
            agg_stats = mturk_df.groupby(["system"]).agg(["mean", "count"])
            print(agg_stats)
            stats[f"{dataset}_score"] = agg_stats.to_dict()[("value", "mean")]
            stats[f"{dataset}_count"] = agg_stats.to_dict()[("value",
                                                             "count")]
            print(
                mturk_df.groupby(["system",
                                  "sample_type"]).agg(["mean", "count"]))

            if dataset != "ldc":
                tb_b = mcnemar_table(
                    y_target=np.array(true_scores_sfc),
                    y_model1=np.array(all_sfc_scores),
                    y_model2=np.array(all_ser_scores),
                )
                print(tb_b)
                chi2, p = mcnemar(ary=tb_b, corrected=True)
                print(f"mcnemar chi2: {chi2}, p-value {p}")

            for measure in ["sfc_correct", "ser_correct"]:
                if measure == "ser_correct" and dataset == "ldc":
                    continue
                # compute_stat_sig takes (systems_data, measure) and loops
                # over the systems itself.
                stats[f"{dataset}_significance_{measure}"] = compute_stat_sig(
                    systems_data, measure)

        elif task == "fluency":
            # Group the fluency answers by text.
            for item in scores:
                for i in range(total_texts):
                    field = f"Input.text{i + 1}"
                    answer_field = f"Answer.fluency{i + 1}"
                    all_scored[item[field]].append(item[answer_field])
            for x in all_scored:
                all_scored[x] = {
                    "average": np.mean(all_scored[x]),
                    "count": len(all_scored[x])
                }

            fluency_scores = defaultdict(list)
            for item in input_data:
                for i in range(total_texts):
                    if item[f"text{i + 1}"] in all_scored:
                        average = all_scored[item[f"text{i + 1}"]]["average"]
                        system = item[f"system{i + 1}"]
                        fluency_scores[system].append(average)

            fluency_df_values = []
            for system, values in fluency_scores.items():
                fluency_df_values.extend({
                    "system": system,
                    "value": value
                } for value in values)

            mturk_df = pd.DataFrame(fluency_df_values)
            agg_stats = mturk_df.groupby(["system"
                                          ]).agg(["mean", "count", "median"])
            print(agg_stats)
            stats[dataset] = agg_stats.to_dict()[("value", "mean")]

            test_stats = sp.posthoc_wilcoxon(mturk_df,
                                             val_col="value",
                                             group_col="system",
                                             sort=True,
                                             zero_method="zsplit")
            print(test_stats)

            significance = defaultdict(list)
            for system in [
                    "systemNoFcNoFs", "systemNoFc", "systemFcPost", "sota"
            ]:
                for other_system in ["sota", "human"]:
                    p_value = test_stats.loc[system, other_system]
                    if p_value <= 0.05 and p_value >= 0:
                        significance[system].append(other_system[0])
                significance[system] = ",".join(significance[system])
            stats[f"{dataset}_significance"] = significance

    print(cohen_kappa_score(first, second))
    with open(data_folder / f"{task}.json", "w") as handle:
        json.dump(stats, handle, indent=2)
l_rslt = [
    sr_multi_nb.to_numpy(),
    sr_svc.to_numpy(),
    sr_lsvc.to_numpy(),
    sr_rf.to_numpy(),
    sr_lr.to_numpy(),
    sr_ada.to_numpy()
]

# Pair each algorithm name with its predictions, then run the exact McNemar
# test on every unordered pair of algorithms.
l_pair = list(zip(l_algo, l_rslt))
l_mcnemar_rslt = []
for i, (k0, v0) in enumerate(l_pair):
    for k1, v1 in l_pair[i + 1:]:
        tb = mcnemar_table(y_target=y, y_model1=v0, y_model2=v1)
        chi2, p = mcnemar(ary=tb, exact=True)
        l_mcnemar_rslt.append((chi2, p))
        # With exact=True no chi-squared statistic is returned (chi2 is
        # None), which a float format cannot render.
        chi2_text = "n/a" if chi2 is None else "%0.3f" % chi2
        print("McNemar %s v %s: chi2 : %s, %0.3f" % (k0, k1, chi2_text, p))

for chi2, p in l_mcnemar_rslt:
    chi2_text = "n/a" if chi2 is None else "%0.5f" % chi2
    print("%s %0.5f" % (chi2_text, p))
# print(1) elif i < 0.522: i = 0 counter_0 += 1 condel_binary.append(i) # print(0) print('matthews_corr_coef (condel): ', matthews_corrcoef(true_class_binary, condel_binary)) ################################################################# '''SIFT''' sift_and_model = mcnemar_table(y_target=np.array(true_class_binary), y_model1=np.array(model_binary), y_model2=np.array(sift_binary)) print('model & sift: ', '\n', sift_and_model) chi2, p = mcnemar(ary=sift_and_model, corrected=True) print(' chi_squared: ', chi2) print(' p-value: ', p) brd = checkerboard_plot(sift_and_model, figsize=(2, 2), fmt='%d', col_labels=['model 2 wrong', 'model 2 right'], row_labels=['model 1 wrong', 'model 1 right']) plt.show() '''PPH2''' pph2_and_model = mcnemar_table(y_target=np.array(true_class_binary), y_model1=np.array(model_binary), y_model2=np.array(pph2_binary)) print('model & pph2: ', '\n', pph2_and_model)
if not src_sent and not ref_sent and not model_1_sent and not model_2_sent: break sent_pair = (lemmatize(src_sent, src_lemma_func), lemmatize(model_1_sent, tgt_lemma_func), lemmatize(model_2_sent, tgt_lemma_func), lemmatize(ref_sent, tgt_lemma_func), stemize(model_1_sent, tgt_stem_func), stemize(model_2_sent, tgt_stem_func), stemize(ref_sent, tgt_stem_func)) both_in_s, m1_in_s, m2_in_s, none_in_s = evaluate_sentence_pair( sent_pair, term_dict, term_patterns) both += both_in_s m1_not_m2 += m1_in_s m2_not_m1 += m2_in_s none += none_in_s print(both) print(m1_not_m2) print(m2_not_m1) print(none) from mlxtend.evaluate import mcnemar import numpy as np confusion = np.array([[both, m1_not_m2], [m2_not_m1, none]]) chi2, p = mcnemar(ary=confusion, exact=False) print("P value: %.10f" % p)
def main():
    """Evaluate classifiers on LDA topic distributions and save artifacts.

    Steps:

    1. Load the train/test CSVs and the pickled dictionary, corpus and LDA
       model.
    2. Turn every training document into a 10-value topic vector and run
       5-fold cross-validation for three linear classifiers, collecting
       binary F1 scores.
    3. Clean the test tweets, build their topic vectors, fit a logistic
       regression and a modified-Huber SGD classifier on the scaled test
       vectors and compare their predictions with the corrected McNemar
       test.
    4. Print the summary and pickle the training feature vectors and the
       Huber classifier fitted in the last cross-validation fold.
    """
    # Open needed files
    test_data = pd.read_csv('data/test_data.csv', encoding='ISO-8859-1')
    train_data = pd.read_csv('data/train_data.csv', encoding='ISO-8859-1')
    # Loaded as before, although nothing below reads it.
    train_bigram = pd.read_pickle('saved_pickles_models/bigram.pkl')
    train_id2word = pd.read_pickle('saved_pickles_models/id2word.pkl')
    train_corpus = pd.read_pickle('saved_pickles_models/corpus.pkl')
    model = pd.read_pickle('saved_pickles_models/lda_model2.model')

    scaler = StandardScaler()
    test_data_list = []
    feature_vectors = []
    test_vectors = []

    # Get distributions from every tweet in train_data
    print('Getting distribution...')
    for i in range(len(train_data)):
        train_top_topics = model.get_document_topics(train_corpus[i],
                                                     minimum_probability=0.0)
        # Keep the probability of each of the first 10 topics.
        feature_vectors.append(
            [train_top_topics[topic][1] for topic in range(10)])

    x = np.array(feature_vectors)
    y = np.array(train_data.relevant)

    kf = KFold(5, shuffle=True, random_state=42)
    log_res_train_f1, log_res_sgd_train_f1, mod_huber_train_f1 = [], [], []

    print('Starting classification algorithm calculations on training data...')
    for train_ind, val_ind in kf.split(x, y):
        x_train, y_train = x[train_ind], y[train_ind]
        x_val, y_val = x[val_ind], y[val_ind]

        # The scaler is fitted on the training part of the fold only.
        x_train_scale = scaler.fit_transform(x_train)
        x_val_scale = scaler.transform(x_val)

        # Logistic regression
        log_reg_train = LogisticRegression(class_weight='balanced',
                                           solver='newton-cg',
                                           fit_intercept=True).fit(
                                               x_train_scale, y_train)
        log_reg_train_y_pred = log_reg_train.predict(x_val_scale)
        log_res_train_f1.append(
            f1_score(y_val, log_reg_train_y_pred, average='binary'))

        # SGD with loss=log
        sgd = linear_model.SGDClassifier(max_iter=1000,
                                         tol=1e-3,
                                         loss='log',
                                         class_weight='balanced').fit(
                                             x_train_scale, y_train)
        sgd_y_pred = sgd.predict(x_val_scale)
        log_res_sgd_train_f1.append(
            f1_score(y_val, sgd_y_pred, average='binary'))

        # SGD with modified Huber loss
        sgd_huber = linear_model.SGDClassifier(max_iter=1000,
                                               tol=1e-3,
                                               alpha=20,
                                               loss='modified_huber',
                                               class_weight='balanced').fit(
                                                   x_train_scale, y_train)
        sgd_huber_y_pred = sgd_huber.predict(x_val_scale)
        mod_huber_train_f1.append(
            f1_score(y_val, sgd_huber_y_pred, average='binary'))

    print('Done with training data. Starting on testing data...\n')

    # Gather all test tweets and apply the cleaning and bigram functions
    print('Cleaning testing data...')
    for row in test_data['tweets']:
        test_data_list.append(clean_status(row))
    bigrams = get_bigram(test_data_list)
    test_bigram = [bigrams[entry] for entry in test_data_list]
    test_corpus = [train_id2word.doc2bow(tweets) for tweets in test_bigram]

    # Test model on testing data
    print('Starting classification algorithm calculations on testing data...')
    for i in range(len(test_data)):
        top_topics = model.get_document_topics(test_corpus[i],
                                               minimum_probability=0.0)
        test_vectors.append([top_topics[topic][1] for topic in range(10)])

    x_test = np.array(test_vectors)
    y_test = np.array(test_data.relevant)
    # NOTE(review): the scaler is re-fitted on the test vectors, and both
    # classifiers below are fitted and evaluated on the same test data -
    # confirm this is intended.
    x_fit = scaler.fit_transform(x_test)

    # Logistic regression.  Predictions use the same scaled features the
    # model was fitted on (previously the unscaled x_test was passed).
    log_reg_test = LogisticRegression(class_weight='balanced',
                                      solver='newton-cg',
                                      fit_intercept=True).fit(x_fit, y_test)
    y_pred_log_res_test = log_reg_test.predict(x_fit)

    # Modified Huber
    sgd_huber_test = linear_model.SGDClassifier(max_iter=1000,
                                                tol=1e-3,
                                                alpha=20,
                                                loss='modified_huber',
                                                class_weight='balanced',
                                                shuffle=True).fit(
                                                    x_fit, y_test)
    y_pred_huber_test = sgd_huber_test.predict(x_fit)

    # Compare both test-set predictions with the corrected McNemar test
    print('Calculating Summary...')
    y_model1 = y_pred_log_res_test
    y_model2 = y_pred_huber_test
    m_table = mcnemar_table(y_target=y_test,
                            y_model1=y_model1,
                            y_model2=y_model2)
    chi2, p = mcnemar(ary=m_table, corrected=True)

    print('\n')
    print('Results from using training data distribution:')
    print(
        f'Logistic Regression Val f1: {np.mean(log_res_train_f1):.3f} +- {np.std(log_res_train_f1):.3f}'
    )
    print(
        f'Logisitic Regression SGD Val f1: {np.mean(log_res_sgd_train_f1):.3f} +- {np.std(log_res_sgd_train_f1):.3f}'
    )
    print(
        f'SVM Huber Val f1: {np.mean(mod_huber_train_f1):.3f} +- {np.std(mod_huber_train_f1):.3f}'
    )

    print('\n')
    print('Results from using unseen test data:')
    print('Logistic regression Val f1: ' +
          str(f1_score(y_test, y_pred_log_res_test, average='binary')))
    # NOTE(review): this line is labelled "SGD" but reports the modified
    # Huber predictions - confirm the label.
    print('Logistic regression SGD f1: ' +
          str(f1_score(y_test, y_pred_huber_test, average='binary')))

    print('\n')
    print('Summary: ')
    print('ncmamor table: ', m_table)
    print('chi-squared: ', chi2)
    print('p-value: ', p)

    # Save feature vector and huber classifier for later use.  The context
    # managers close the files even if pickling fails.
    print('\n')
    print('Saving feature vector...')
    with open('saved_pickles_models/feature_vector.pkl', 'wb') as save_vector:
        pickle.dump(feature_vectors, save_vector)

    print('\n')
    print('Saving the huber classifier...')
    with open('saved_pickles_models/huber_classifier.pkl',
              'wb') as save_huber:
        pickle.dump(sgd_huber, save_huber)
    print('done')
def _train_and_register(mlflow_client, model, model_name, stage, X_train,
                        y_train, X_test, y_test):
    """Fit ``model``, log it in a new MLflow run and move it to ``stage``.

    The test-set accuracy is logged as the ``accuracy`` metric and the fitted
    model is registered under ``model_name``.

    Args:
        mlflow_client: ``MlflowClient`` used for the registry calls.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        model_name: Registered-model name to log the estimator under.
        stage: Registry stage the freshly created version is moved to.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for the metric.

    Returns:
        int: The registry version number created by this run.

    Raises:
        RuntimeError: If no version of ``model_name`` is linked to the run
            after logging.
    """
    with mlflow.start_run() as run:
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model,
                                 artifact_path="model",
                                 registered_model_name=model_name)

        # Registering under an existing name creates a *new* version, so the
        # version number is looked up from this run instead of assuming 1.
        created = [
            int(mv.version) for mv in mlflow_client.search_model_versions(
                f"run_id='{run.info.run_id}'") if mv.name == model_name
        ]
        if not created:
            raise RuntimeError(
                f"No registered version of '{model_name}' found for run "
                f"{run.info.run_id}")
        version = max(created)

        mlflow_client.transition_model_version_stage(name=model_name,
                                                     version=version,
                                                     stage=stage)
    return version


def main(
    mlflow_server: str,
    significance: float,
) -> None:
    """Train two classifiers, register them in MLflow and pick a champion.

    A Logistic Regression model is trained and placed in ``Production``; a
    Random Forest model is trained and placed in ``Staging``. Both are then
    re-loaded from the registry and compared with McNemar's test on the test
    split. The Random Forest replaces the Logistic Regression in
    ``Production`` only when the p-value is below ``significance`` and its
    accuracy is strictly higher.

    Args:
        mlflow_server: Tracking URI of the MLflow server.
        significance: Threshold the McNemar p-value is compared against.

    Note:
        ``random_seed`` is read from module scope; it is not defined here.
    """
    lr_name = "Logistic Regression"
    rf_name = "Random Forest"

    # Set the tracking URI first so every later call targets this server.
    mlflow.set_tracking_uri(mlflow_server)
    # Registry operations (stage transitions, download URIs) need a client.
    mlflow_client = MlflowClient()
    # Group all runs of this script under one experiment.
    mlflow.set_experiment("Model Comparison")

    # Synthetic binary classification data, split 80/20 into train and test.
    X, y = make_classification(
        n_samples=10000,
        n_classes=2,
        n_features=20,
        n_informative=9,
        random_state=random_seed,
    )
    # NOTE(review): the split itself is not seeded, so results vary per run.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        test_size=0.2)

    # Logistic Regression becomes the current production model ...
    lr_version = _train_and_register(mlflow_client, LogisticRegression(),
                                     lr_name, "Production", X_train, y_train,
                                     X_test, y_test)
    # ... and Random Forest is the challenger waiting in staging.
    rf_version = _train_and_register(mlflow_client, RandomForestClassifier(),
                                     rf_name, "Staging", X_train, y_train,
                                     X_test, y_test)

    # Load both models back from the registry so the comparison uses exactly
    # the versions that were just registered.
    lr_model = mlflow.sklearn.load_model(
        mlflow_client.get_model_version_download_uri(
            name=lr_name,
            version=lr_version,
        ))
    rf_model = mlflow.sklearn.load_model(
        mlflow_client.get_model_version_download_uri(
            name=rf_name,
            version=rf_version,
        ))

    y_pred_lr = lr_model.predict(X_test)
    y_pred_rf = rf_model.predict(X_test)

    contingency_table = mcnemar_table(y_test, y_pred_lr, y_pred_rf)
    _, p_value = mcnemar(contingency_table, corrected=True)

    if not p_value < significance:
        # The null hypothesis (both models perform alike) is not rejected.
        print(
            f"p-value {p_value} greater than significance level {significance}"
        )
        print("Keeping logistic regression model in production")
        return

    # The null hypothesis is rejected: the models differ, so accuracy decides
    # which one is kept in production.
    print(f"p-value {p_value} smaller than significance level {significance}")
    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)

    if accuracy_lr < accuracy_rf:
        print(f"Random Forest model's accuracy, {accuracy_rf}, is greater than "
              f"the Logistic Regression model's accuracy, {accuracy_lr}")
        print(
            "Archiving logistic regression model and moving random forest model to production"
        )
        mlflow_client.transition_model_version_stage(
            name=lr_name,
            version=lr_version,
            stage="Archived",
        )
        mlflow_client.transition_model_version_stage(
            name=rf_name,
            version=rf_version,
            stage="Production",
        )
    else:
        print(
            f"Random Forest model's accuracy, {accuracy_rf}, is less than or equal to "
            f"the Logistic Regression model's accuracy, {accuracy_lr}")
        print("Keeping logistic regression model in production")
df_result['bi_F1'].to_numpy(), df_result['unibi_F1'].to_numpy()) print("ANNOVA F1 : %0.5f, %0.5f" % result) # Coher Q analysis y_uni = sr_uni.to_numpy() y_bi = sr_bi.to_numpy() y_unibi = sr_unibi.to_numpy() q, p_value = cochrans_q(y, y_uni, y_bi, y_unibi) print("COHRAN Q-Test: q: %0.5f, p_value: %0.5f" % (q, p_value)) l_grams = ['uni', 'bi', 'unibi'] l_rslt = [y_uni, y_bi, y_unibi] l_pair = list(zip(l_grams, l_rslt)) l_mcnemar_rslt = [] for i, t0 in enumerate(l_pair): for j, t1 in enumerate(l_pair[i + 1:]): k0 = t0[0] k1 = t1[0] v0 = t0[1] v1 = t1[1] tb = mcnemar_table(y_target=y, y_model1=v0, y_model2=v1) chi2, p = mcnemar(ary=tb, corrected=True) l_mcnemar_rslt.append("{chi2:.5f}".format(chi2=chi2)) l_mcnemar_rslt.append("{p:.5f}".format(p=p)) print(f"McNemar %s v %s: chi2 : %0.5f, p_value: %0.5f" % (k0, k1, chi2, p)) print(" ".join(l_mcnemar_rslt))
import numpy as np
from mlxtend.evaluate import mcnemar

# Cell counts of the 2x2 contingency table, in row-major order.
# NOTE(review): presumably "c" = correct and "w" = wrong for the two compared
# results (cw = first correct / second wrong, and so on) — confirm.
cc, cw, wc, ww = 688, 127, 85, 204
cont_matrix = np.array([cc, cw, wc, ww]).reshape(2, 2)

# Run McNemar's test on the table; corrected=False is passed explicitly.
chi2, p = mcnemar(ary=cont_matrix, corrected=False)

# Report the test statistic and its p-value, one per line.
for label, value in (('chi-squared:', chi2), ('p-value:', p)):
    print(label, value)