import random
from multiprocessing import Pool
from operator import itemgetter

import jsonlines
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Local modules (import paths assumed; adjust to the project layout):
from tfidf import TFIDF
from vector_space import VectorSpace
from wmd import wordMoverDistance
from train_classifier import TrainClassifier
class featureCore:
    def __init__(self, task):
        self.task = task
        self.tfidf = TFIDF()
        self.vs = VectorSpace()
        self.wmd = wordMoverDistance()

    def get_tf_idf_score(self, list_of_defactoNlps):
        if self.task == 'classification':
            for model in list_of_defactoNlps:
                relevant_sentence, score = self.tfidf.apply_tf_idf(
                    model.claim, model.sentences)
                # Classification: score >= 0.2 --> verifiable, else NEI.
                if score >= 0.2:
                    model.method_name["tfidf"] = {"Classification": {"pred_label": 1}}
                else:
                    model.method_name["tfidf"] = {"Classification": {"pred_label": 0}}
        else:
            for model in list_of_defactoNlps:
                relevant_sentence, score = self.tfidf.apply_tf_idf(
                    model.claim, model.sentences)
                # Detection: score > 0.4 --> SUPPORTS,
                # 0.2 <= score <= 0.4 --> REFUTES, score < 0.2 --> NEI.
                if score >= 0.2:
                    if score > 0.4:  # SUPPORTS
                        model.method_name["tfidf"] = {"Detection": {"pred_label": 0}}
                    else:  # REFUTES
                        model.method_name["tfidf"] = {"Detection": {"pred_label": 1}}
                else:  # NOT ENOUGH INFO
                    model.method_name["tfidf"] = {"Detection": {"pred_label": 2}}
        return list_of_defactoNlps

    def get_vector_space_score(self, list_of_defactoNlps):
        if self.task == 'classification':
            for model in list_of_defactoNlps:
                relevant_sentence, vector_space_score = self.vs.apply_vector_space(
                    model.claim, model.sentences)
                if vector_space_score >= 0.2:
                    model.method_name["vspace"] = {"Classification": {"pred_label": 1}}
                else:
                    model.method_name["vspace"] = {"Classification": {"pred_label": 0}}
        else:
            for model in list_of_defactoNlps:
                relevant_sentence, vector_space_score = self.vs.apply_vector_space(
                    model.claim, model.sentences)
                # Detection: vector_space_score > 0.4 --> SUPPORTS,
                # 0.1 <= vector_space_score <= 0.4 --> REFUTES, below 0.1 --> NEI.
                if vector_space_score >= 0.1:
                    if vector_space_score > 0.4:  # SUPPORTS
                        model.method_name["vspace"] = {"Detection": {"pred_label": 0}}
                    else:  # REFUTES
                        model.method_name["vspace"] = {"Detection": {"pred_label": 1}}
                else:  # NOT ENOUGH INFO
                    model.method_name["vspace"] = {"Detection": {"pred_label": 2}}
        return list_of_defactoNlps

    def get_wmd_score(self, list_of_defactoNlps):
        # Word Mover's Distance: lower scores mean more similar sentences.
        if self.task == 'classification':
            for model in list_of_defactoNlps:
                relevant_sentence, wmd_score = self.wmd.compute_wm_distance(
                    model.claim, model.sentences)
                if wmd_score < 2.0:
                    model.method_name["wmd"] = {"Classification": {"pred_label": 1}}
                else:
                    model.method_name["wmd"] = {"Classification": {"pred_label": 0}}
        else:
            for model in list_of_defactoNlps:
                relevant_sentence, wmd_score = self.wmd.compute_wm_distance(
                    model.claim, model.sentences)
                # Detection: wmd_score < 1 --> SUPPORTS,
                # 1 <= wmd_score <= 2 --> REFUTES, above 2 --> NEI.
                if wmd_score <= 2:
                    if wmd_score < 1:  # SUPPORTS
                        model.method_name["wmd"] = {"Detection": {"pred_label": 0}}
                    else:  # REFUTES
                        model.method_name["wmd"] = {"Detection": {"pred_label": 1}}
                else:  # NOT ENOUGH INFO
                    model.method_name["wmd"] = {"Detection": {"pred_label": 2}}
        return list_of_defactoNlps
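# Usage sketch (illustrative, not part of the original source): featureCore
# expects objects exposing `claim` (str), `sentences` (evidence text), and a
# `method_name` dict that the scorers write predictions into. `_DemoModel`,
# `_demo_feature_core`, and the claim/sentence text are hypothetical stand-ins.
class _DemoModel:
    def __init__(self, claim, sentences):
        self.claim = claim
        self.sentences = sentences
        self.method_name = {}


def _demo_feature_core():
    core = featureCore(task='classification')
    model = _DemoModel("Berlin is the capital of Germany.",
                       "Berlin is the capital and largest city of Germany.")
    # Each scorer annotates the models in place and returns the list.
    scored = core.get_tf_idf_score([model])
    print(scored[0].method_name["tfidf"])  # e.g. {"Classification": {"pred_label": 1}}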
class SentenceClassifier:
    def __init__(self):
        self.tfidf = TFIDF()
        self.vs = VectorSpace()
        self.wmd = wordMoverDistance()  # used by store_tf_idf_results_wmd below
        self.count_avg_true_evidences = 0

    def store_tf_idf_results(self, example, approach):
        tmp_dict = {}
        scores = []
        false_negatives_scores = []
        top_k_sents = 5
        tmp_dict["id"] = example["id"]
        tmp_dict["true_label"] = example["label"]
        tmp_dict["claim"] = example["claim"]
        tmp_dict["true_evidences"] = example["true_evidences"]
        tmp_dict["actual_true_positives"] = len(example["true_evidences"])
        tmp_dict["total_sentences"] = len(example["relevant_sentences"])
        tmp_dict["actual_true_negatives"] = len(
            example["relevant_sentences"]) - len(example["true_evidences"])
        for evidence in example["relevant_sentences"]:
            if approach == "tfidf":
                _, similarity_score = self.tfidf.apply_tf_idf(
                    example["claim"], evidence["sentence"])
                if similarity_score > 0.05:
                    scores.append(similarity_score)
                else:
                    false_negatives_scores.append(similarity_score)
            elif approach == "vs":
                _, similarity_score = self.vs.apply_vector_space(
                    example["claim"], evidence["sentence"])
                if similarity_score > 0.1:
                    scores.append(similarity_score)
                else:
                    false_negatives_scores.append(similarity_score)
            else:
                print("no approach matched")
        # Take the top-k sentences by similarity score.
        sorted_indexes = np.argsort(scores)
        if len(scores) == 0:
            # No score cleared the threshold: there is no related sentence.
            tmp_dict["predicted_sentences_ids"] = [["null", "null"]]
            tmp_dict["predicted_sentences"] = ["null"]
            # Placeholder feature value meaning "no evidence found".
            tmp_dict["tf_idf_features"] = [10000] * top_k_sents
        elif len(scores) >= top_k_sents:
            tmp_dict["predicted_sentences"] = itemgetter(
                *sorted_indexes[-top_k_sents:])(example["relevant_sentences"])
            tmp_dict["predicted_sentences_ids"] = [
                [sent["id"], sent["line_num"]]
                for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["predicted_sentences"] = [
                sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["tf_idf_features"] = sorted(scores)[-top_k_sents:]
        else:
            # Fewer than k sentences survived: pad the feature vector with the
            # placeholder value so the classifier always sees k features.
            tmp_dict["predicted_sentences"] = itemgetter(*sorted_indexes)(
                example["relevant_sentences"])
            if len(sorted_indexes) == 1:
                # itemgetter with a single index returns the item, not a tuple.
                tmp_dict["predicted_sentences"] = [tmp_dict["predicted_sentences"]]
            tmp_dict["predicted_sentences_ids"] = [
                [sent["id"], sent["line_num"]]
                for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["predicted_sentences"] = [
                sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["tf_idf_features"] = sorted(scores) + (
                [10000] * (top_k_sents - len(scores)))
        tmp_dict["accuracy"], t_correct_evds = self.compute_score(
            tmp_dict["true_evidences"], tmp_dict["predicted_sentences_ids"])
        tmp_dict["predicted_true_positives"] = t_correct_evds
        tmp_dict["predicted_false_positives"] = len(scores) - t_correct_evds
        tmp_dict["predicted_true_negatives"] = len(false_negatives_scores)
        tmp_dict["predicted_false_negatives"] = tmp_dict[
            "predicted_true_negatives"] - len(scores) - t_correct_evds
        tmp_dict["Recall"] = self.handle_errors(
            tmp_dict["predicted_true_positives"],
            tmp_dict["actual_true_positives"])
        tmp_dict["Precision"] = self.handle_errors(
            tmp_dict["predicted_true_positives"],
            tmp_dict["predicted_false_positives"] + tmp_dict["predicted_true_positives"])
        tmp_dict["accuracy_formula"] = self.handle_errors(
            tmp_dict["predicted_true_positives"],
            tmp_dict["actual_true_positives"])
        tmp_dict["f1_score"] = self.handle_errors(
            2 * tmp_dict["Recall"] * tmp_dict["Precision"],
            tmp_dict["Recall"] + tmp_dict["Precision"])
        return tmp_dict

    def handle_errors(self, a, b):
        """Safe division: several of the metrics above can divide by zero."""
        try:
            return a / b
        except ZeroDivisionError:
            return 0

    def store_features(self, sub_sampled_data, features_path, approach):
        sub_sampled_data = [example for example in jsonlines.open(sub_sampled_data)]
        print("len of data ", len(sub_sampled_data))
        # The similarity scores of the top-5 sentences form the feature vector.
        approaches = [approach] * len(sub_sampled_data)
        print("approach inside store features ", len(approaches))
        pool = Pool(processes=10)
        accuracy = []
        precisions = []
        recalls = []
        f1_scores = []
        formula_acc = []
        count_accurate = 0
        with jsonlines.open(features_path, mode='w') as f:
            for tmp_dict in pool.starmap(self.store_tf_idf_results,
                                         zip(sub_sampled_data, approaches)):
                if tmp_dict["accuracy"] == 1.0:
                    count_accurate += 1
                accuracy.append(tmp_dict["accuracy"])
                precisions.append(tmp_dict["Precision"])
                recalls.append(tmp_dict["Recall"])
                f1_scores.append(tmp_dict["f1_score"])
                formula_acc.append(tmp_dict["accuracy_formula"])
                f.write(tmp_dict)
        pool.close()
        print("total accurate answers ", count_accurate)
        return accuracy, precisions, recalls, f1_scores, formula_acc

    def store_tf_idf_results_wmd(self, data, f):
        count = 0
        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []
        formula_acc = []
        k = 3
        for example in data:
            tmp_dict = {}
            scores = []
            false_negatives_scores = []
            tmp_dict["id"] = example["id"]
            tmp_dict["true_label"] = example["label"]
            tmp_dict["claim"] = example["claim"]
            tmp_dict["true_evidences"] = example["true_evidences"]
            tmp_dict["actual_true_positives"] = len(example["true_evidences"])
            tmp_dict["total_sentences"] = len(example["relevant_sentences"])
            tmp_dict["actual_true_negatives"] = len(
                example["relevant_sentences"]) - len(example["true_evidences"])
            for evidence in example["relevant_sentences"]:
                _, similarity_score = self.wmd.compute_wm_distance(
                    example["claim"], evidence["sentence"])
                # WMD is a distance: keep only finite scores below the threshold.
                if similarity_score != float("inf") and similarity_score < 1.5:
                    scores.append(similarity_score)
                else:
                    false_negatives_scores.append(similarity_score)
            # Take the top-k sentences; for a distance, smaller is better.
            sorted_indexes = np.argsort(scores)
            if len(scores) == 0:
                tmp_dict["predicted_sentences_ids"] = [["null", "null"]]
                tmp_dict["predicted_sentences"] = ["null"]
                # A distance of 4 marks sentences that are not similar at all.
                tmp_dict["tf_idf_features"] = [4] * k
            elif len(scores) >= k:
                tmp_dict["predicted_sentences"] = itemgetter(
                    *sorted_indexes[:k])(example["relevant_sentences"])
                tmp_dict["predicted_sentences_ids"] = [
                    [sent["id"], sent["line_num"]]
                    for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["predicted_sentences"] = [
                    sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["tf_idf_features"] = sorted(scores)[:k]
            else:
                # Fewer than k scores: pad with zeros to keep the width fixed.
                tmp_dict["predicted_sentences"] = itemgetter(*sorted_indexes)(
                    example["relevant_sentences"])
                if len(sorted_indexes) == 1:
                    tmp_dict["predicted_sentences"] = [tmp_dict["predicted_sentences"]]
                tmp_dict["predicted_sentences_ids"] = [
                    [sent["id"], sent["line_num"]]
                    for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["predicted_sentences"] = [
                    sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["tf_idf_features"] = sorted(scores) + ([0] * (k - len(scores)))
            tmp_dict["accuracy"], t_correct_evds = self.compute_score(
                tmp_dict["true_evidences"], tmp_dict["predicted_sentences_ids"])
            tmp_dict["predicted_true_positives"] = t_correct_evds
            tmp_dict["predicted_false_positives"] = len(scores) - t_correct_evds
            tmp_dict["predicted_true_negatives"] = len(false_negatives_scores)
            tmp_dict["predicted_false_negatives"] = tmp_dict[
                "predicted_true_negatives"] - len(scores) - t_correct_evds
            tmp_dict["Recall"] = self.handle_errors(
                tmp_dict["predicted_true_positives"],
                tmp_dict["actual_true_positives"])
            tmp_dict["Precision"] = self.handle_errors(
                tmp_dict["predicted_true_positives"],
                tmp_dict["predicted_false_positives"] + tmp_dict["predicted_true_positives"])
            tmp_dict["accuracy_formula"] = self.handle_errors(
                tmp_dict["predicted_true_positives"],
                tmp_dict["actual_true_positives"])
            tmp_dict["f1_score"] = self.handle_errors(
                2 * tmp_dict["Recall"] * tmp_dict["Precision"],
                tmp_dict["Recall"] + tmp_dict["Precision"])
            accuracies.append(tmp_dict["accuracy"])
            precisions.append(tmp_dict["Precision"])
            recalls.append(tmp_dict["Recall"])
            f1_scores.append(tmp_dict["f1_score"])
            formula_acc.append(tmp_dict["accuracy_formula"])
            f.write(tmp_dict)
            count += 1
        return accuracies, precisions, recalls, f1_scores, formula_acc

    def store_features_wmd(self, sub_sampled_data, features_path, approach):
        sub_sampled_data = [example for example in jsonlines.open(sub_sampled_data)]
        print("len of data ", len(sub_sampled_data))
        with jsonlines.open(features_path, mode='w') as f:
            accuracies, precisions, recalls, f1_scores, formula_acc = \
                self.store_tf_idf_results_wmd(sub_sampled_data, f)
        return accuracies, precisions, recalls, f1_scores, formula_acc

    def preprocess_data(self, dataset):
        """Preprocess the dataset: returns the features and labels that are
        used to train the classifier."""
        print("processing the dataset ")
        labels = []
        features = pd.DataFrame(columns=['s1', 's2', 's3', 's4', 's5'])
        with jsonlines.open(dataset, mode='r') as f:
            for index, example in enumerate(f):
                features.loc[index] = example['tf_idf_features']
                if example['true_label'] == 'SUPPORTS':
                    labels.append(0)
                elif example['true_label'] == 'REFUTES':
                    labels.append(1)
                else:  # NOT ENOUGH INFO
                    labels.append(2)
        labels = np.array(labels)
        print("feature shape ", len(features))
        print("labels shape ", labels.shape)
        return features, labels

    def compute_score(self, true_labels, pred_labels):
        """Returns the fraction and count of true evidences that were
        predicted correctly."""
        count = 0
        for true_evd_set in true_labels:
            if true_evd_set in pred_labels:
                count += 1
        return (count / len(true_labels)), count

    def evaluate_clf(self, x, y_true, model):
        """Evaluate the trained classifier."""
        y_pred = model.predict(x)
        print("score of tfidf ",
              precision_recall_fscore_support(y_true, y_pred, average='weighted'))
        print("accuracy score ", accuracy_score(y_true, y_pred))

    def train_clf(self, X, Y, path):
        print("inside classifier")
        clf = RandomForestClassifier()
        clf.fit(X, Y)
        joblib.dump(clf, path)
        print("model saved")
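# Pipeline sketch (illustrative, not part of the original source): ties the
# steps above together for the TF-IDF approach. `_demo_sentence_retrieval_pipeline`
# and the file paths are hypothetical placeholders; the real data lives under
# ./data/fever-full/.
def _demo_sentence_retrieval_pipeline():
    sc = SentenceClassifier()
    # 1. Score candidate sentences and persist top-k features per claim.
    accuracy, precisions, recalls, f1_scores, formula_acc = sc.store_features(
        "train_examples.jsonl", "train_features.jsonl", approach="tfidf")
    print("mean retrieval recall ", sum(recalls) / len(recalls))
    # 2. Turn the stored features into a matrix and train the random forest.
    X, Y = sc.preprocess_data("train_features.jsonl")
    sc.train_clf(X, Y, "rf_model.pkl")
    # 3. Evaluate the saved model on held-out features.
    X_dev, Y_dev = sc.preprocess_data("dev_features.jsonl")
    sc.evaluate_clf(X_dev, Y_dev, joblib.load("rf_model.pkl"))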
class featureCore:
    def __init__(self, task):
        self.task = task
        self.tfidf = TFIDF()
        self.vs = VectorSpace()
        self.wmd = wordMoverDistance()

    def get_tf_idf_score(self, list_of_defactoNlps):
        if self.task in ('bin-classification-fever', 'bin-classification-google'):
            for model in list_of_defactoNlps:
                relevant_sentence, score = self.tfidf.apply_tf_idf(
                    model.claim, model.sentences)
                # bin-classification-fever: 1 covers SUPPORTS/REFUTES, 0 is NEI.
                # bin-classification-google: 0 is SUPPORTS, 1 is REFUTES.
                if score >= 0.1:
                    model.method_name["tfidf"] = {self.task: {"pred_label": 0}}
                else:
                    model.method_name["tfidf"] = {self.task: {"pred_label": 1}}
        else:
            # Three-way classification for fever-3 and the google dataset.
            for model in list_of_defactoNlps:
                relevant_sentence, score = self.tfidf.apply_tf_idf(
                    model.claim, model.sentences)
                # score > 0.3 --> SUPPORTS, 0.2 <= score <= 0.3 --> REFUTES,
                # score < 0.2 --> NEI.
                if score >= 0.2:
                    if score > 0.3:  # SUPPORTS
                        model.method_name["tfidf"] = {self.task: {"pred_label": 0}}
                    else:  # REFUTES
                        model.method_name["tfidf"] = {self.task: {"pred_label": 1}}
                else:  # NOT ENOUGH INFO
                    model.method_name["tfidf"] = {self.task: {"pred_label": 2}}
        return list_of_defactoNlps

    def get_vector_space_score(self, list_of_defactoNlps):
        if self.task in ('bin-classification-fever', 'bin-classification-google'):
            for model in list_of_defactoNlps:
                relevant_sentence, vector_space_score = self.vs.apply_vector_space(
                    model.claim, model.sentences)
                # Same binary label conventions as in get_tf_idf_score.
                if vector_space_score >= 0.2:
                    model.method_name["vspace"] = {self.task: {"pred_label": 0}}
                else:
                    model.method_name["vspace"] = {self.task: {"pred_label": 1}}
        else:
            # Three-way classification for fever-3 and the google dataset.
            for model in list_of_defactoNlps:
                relevant_sentence, vector_space_score = self.vs.apply_vector_space(
                    model.claim, model.sentences)
                # vector_space_score > 0.3 --> SUPPORTS,
                # 0.2 <= vector_space_score <= 0.3 --> REFUTES, below 0.2 --> NEI.
                if vector_space_score >= 0.2:
                    if vector_space_score > 0.3:  # SUPPORTS
                        model.method_name["vspace"] = {self.task: {"pred_label": 0}}
                    else:  # REFUTES
                        model.method_name["vspace"] = {self.task: {"pred_label": 1}}
                else:  # NOT ENOUGH INFO
                    model.method_name["vspace"] = {self.task: {"pred_label": 2}}
        return list_of_defactoNlps

    def get_wmd_score(self, list_of_defactoNlps):
        # Word Mover's Distance: lower scores mean more similar sentences.
        if self.task in ('bin-classification-fever', 'bin-classification-google'):
            for model in list_of_defactoNlps:
                relevant_sentence, wmd_score = self.wmd.compute_wm_distance(
                    model.claim, model.sentences)
                # bin-classification-google: 0 is SUPPORTS, 1 is REFUTES.
                if wmd_score < 0.8:
                    model.method_name["wmd"] = {self.task: {"pred_label": 0}}
                else:
                    model.method_name["wmd"] = {self.task: {"pred_label": 1}}
        else:
            # Three-way classification for fever-3 and the google dataset.
            for model in list_of_defactoNlps:
                relevant_sentence, wmd_score = self.wmd.compute_wm_distance(
                    model.claim, model.sentences)
                # wmd_score < 1 --> SUPPORTS, 1 <= wmd_score <= 2.2 --> REFUTES,
                # above 2.2 --> NEI.
                if wmd_score <= 2.2:
                    if wmd_score < 1:  # SUPPORTS
                        model.method_name["wmd"] = {self.task: {"pred_label": 0}}
                    else:  # REFUTES
                        model.method_name["wmd"] = {self.task: {"pred_label": 1}}
                else:  # NOT ENOUGH INFO
                    model.method_name["wmd"] = {self.task: {"pred_label": 2}}
        return list_of_defactoNlps
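# Usage sketch (illustrative, not part of the original source): the task string
# passed to featureCore selects between the binary and three-way threshold
# schemes above. `_demo_task_dispatch`, the local `_Model` stand-in, and the
# claim/sentence text are hypothetical.
def _demo_task_dispatch():
    class _Model:  # hypothetical stand-in for a DeFacto NLP model object
        def __init__(self, claim, sentences):
            self.claim = claim
            self.sentences = sentences
            self.method_name = {}

    model = _Model("The Eiffel Tower is in Paris.",
                   "The Eiffel Tower is a wrought-iron tower in Paris, France.")
    # Binary task: pred_label is 0/1 (see the inline comments above for the
    # per-dataset meaning of each label).
    featureCore(task='bin-classification-google').get_wmd_score([model])
    print(model.method_name["wmd"])
    # Any other task string falls through to the three-way scheme (0/1/2).
    featureCore(task='tri-classification-fever').get_tf_idf_score([model])
    print(model.method_name["tfidf"])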
class SentenceClassifier:
    # Inference-time variant: unlike the class above, it runs on data without
    # gold evidence, so the true-label bookkeeping and the retrieval metrics
    # are omitted; only ids, claims, sentences, and features are stored.
    def __init__(self):
        self.tfidf = TFIDF()
        self.vs = VectorSpace()
        self.wmd = wordMoverDistance()
        self.count_avg_true_evidences = 0

    def store_tf_idf_results(self, example, approach):
        tmp_dict = {}
        scores = []
        top_k_sents = 5
        tmp_dict["id"] = example["id"]
        tmp_dict["claim"] = example["claim"]
        for evidence in example["relevant_sentences"]:
            if approach == "tfidf":
                _, similarity_score = self.tfidf.apply_tf_idf(
                    example["claim"], evidence["sentence"])
                if similarity_score > 0.2:
                    scores.append(similarity_score)
            elif approach == "vs":
                _, similarity_score = self.vs.apply_vector_space(
                    example["claim"], evidence["sentence"])
                if similarity_score > 0.2:
                    scores.append(similarity_score)
            else:
                print("no approach matched")
        # Take the top-k sentences by similarity score.
        sorted_indexes = np.argsort(scores)
        if len(scores) == 0:
            # No score cleared the threshold: there is no related sentence.
            tmp_dict["predicted_sentences_ids"] = [["null", "null"]]
            tmp_dict["predicted_sentences"] = ["null"]
            # Placeholder feature value meaning "no evidence found".
            tmp_dict["tf_idf_features"] = [10000] * top_k_sents
        elif len(scores) >= top_k_sents:
            tmp_dict["predicted_sentences"] = itemgetter(
                *sorted_indexes[-top_k_sents:])(example["relevant_sentences"])
            tmp_dict["predicted_sentences_ids"] = [
                [sent["id"], sent["line_num"]]
                for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["predicted_sentences"] = [
                sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["tf_idf_features"] = sorted(scores)[-top_k_sents:]
        else:
            # Fewer than k sentences survived: pad with the placeholder value.
            tmp_dict["predicted_sentences"] = itemgetter(*sorted_indexes)(
                example["relevant_sentences"])
            if len(sorted_indexes) == 1:
                # itemgetter with a single index returns the item, not a tuple.
                tmp_dict["predicted_sentences"] = [tmp_dict["predicted_sentences"]]
            tmp_dict["predicted_sentences_ids"] = [
                [sent["id"], sent["line_num"]]
                for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["predicted_sentences"] = [
                sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
            tmp_dict["tf_idf_features"] = sorted(scores) + (
                [10000] * (top_k_sents - len(scores)))
        return tmp_dict

    def handle_errors(self, a, b):
        """Safe division: metric computations can divide by zero."""
        try:
            return a / b
        except ZeroDivisionError:
            return 0

    def store_features(self, sub_sampled_data, features_path, approach):
        sub_sampled_data = [example for example in jsonlines.open(sub_sampled_data)]
        print("len of data ", len(sub_sampled_data))
        # The similarity scores of the top-5 sentences form the feature vector.
        approaches = [approach] * len(sub_sampled_data)
        print("approach inside store features ", len(approaches))
        pool = Pool(processes=10)
        with jsonlines.open(features_path, mode='w') as f:
            for tmp_dict in pool.starmap(self.store_tf_idf_results,
                                         zip(sub_sampled_data, approaches)):
                f.write(tmp_dict)
        pool.close()

    def store_tf_idf_results_wmd(self, data, f):
        count = 0
        k = 5
        for example in data:
            print("count ", count)
            tmp_dict = {}
            scores = []
            false_negatives_scores = []
            tmp_dict["id"] = example["id"]
            tmp_dict["claim"] = example["claim"]
            for evidence in example["relevant_sentences"]:
                _, similarity_score = self.wmd.compute_wm_distance(
                    example["claim"], evidence["sentence"])
                # WMD is a distance: keep only finite scores below the threshold.
                if similarity_score != float("inf") and similarity_score < 1.5:
                    scores.append(similarity_score)
                else:
                    false_negatives_scores.append(similarity_score)
            # Take the top-k sentences; for a distance, smaller is better.
            sorted_indexes = np.argsort(scores)
            if len(scores) == 0:
                tmp_dict["predicted_sentences_ids"] = [["null", "null"]]
                tmp_dict["predicted_sentences"] = ["null"]
                # A distance of 4 marks sentences that are not similar at all.
                tmp_dict["tf_idf_features"] = [4] * k
            elif len(scores) >= k:
                tmp_dict["predicted_sentences"] = itemgetter(
                    *sorted_indexes[:k])(example["relevant_sentences"])
                tmp_dict["predicted_sentences_ids"] = [
                    [sent["id"], sent["line_num"]]
                    for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["predicted_sentences"] = [
                    sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["tf_idf_features"] = sorted(scores)[:k]
            else:
                # Fewer than k scores: pad with zeros to keep the width fixed.
                tmp_dict["predicted_sentences"] = itemgetter(*sorted_indexes)(
                    example["relevant_sentences"])
                if len(sorted_indexes) == 1:
                    tmp_dict["predicted_sentences"] = [tmp_dict["predicted_sentences"]]
                tmp_dict["predicted_sentences_ids"] = [
                    [sent["id"], sent["line_num"]]
                    for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["predicted_sentences"] = [
                    sent["sentence"] for sent in tmp_dict["predicted_sentences"]]
                tmp_dict["tf_idf_features"] = sorted(scores) + ([0] * (k - len(scores)))
            f.write(tmp_dict)
            count += 1

    def store_features_wmd(self, sub_sampled_data, features_path, approach):
        sub_sampled_data = [example for example in jsonlines.open(sub_sampled_data)]
        print("len of data ", len(sub_sampled_data))
        with jsonlines.open(features_path, mode='w') as f:
            self.store_tf_idf_results_wmd(sub_sampled_data, f)

    def compute_score(self, true_labels, pred_labels):
        """Returns the fraction and count of true evidences that were
        predicted correctly."""
        count = 0
        for true_evd_set in true_labels:
            if true_evd_set in pred_labels:
                count += 1
        return (count / len(true_labels)), count
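# Worked example (illustrative, not part of the original source): compute_score
# counts how many gold evidence [page_id, line_num] pairs appear among the
# predicted ids. `_demo_compute_score` and the page ids are hypothetical; with
# two gold evidences and one of them retrieved, it returns (0.5, 1).
def _demo_compute_score():
    true_evidences = [["Page_A", 3], ["Page_B", 7]]
    predicted_ids = [["Page_A", 3], ["Page_C", 1], ["Page_D", 0]]
    fraction, count = SentenceClassifier().compute_score(true_evidences, predicted_ids)
    print(fraction, count)  # 0.5 1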
class TestModels:
    def __init__(self, approach, task):
        self.rf_classifier = TrainClassifier()
        self.approach = approach
        if task == "test":
            self.sr_output = ("./data/fever-full/classifier_results/"
                              "shared_dev_true_docs_" + str(approach)
                              + "_features_new_k_5_recall.jsonl")
        else:
            self.sr_output = ("./data/fever-full/classifier_results/"
                              "subsample_train_true_docs_" + str(approach)
                              + "_features_new_k_5_recall.jsonl")
        self.tfidf = TFIDF()
        self.vs = VectorSpace()
        self.wmd = wordMoverDistance()

    def load_sr_results(self, cc_results_path):
        """Load the sentence retrieval results from self.sr_output and write
        the per-sentence claim classification results to cc_results_path."""
        read_file = jsonlines.open(self.sr_output, mode="r")
        with jsonlines.open(cc_results_path, mode="w") as f:
            for example in read_file:
                tmp_dict = {}
                tmp_dict["id"] = example["id"]
                if example["true_label"] == "SUPPORTS":
                    tmp_dict["label"] = "SUPPORTS"
                elif example["true_label"] == "REFUTES":
                    tmp_dict["label"] = "REFUTES"
                else:
                    tmp_dict["label"] = "Not Enough Info"
                tmp_dict["claim"] = example["claim"]
                predicted_labels = []
                for sent in example["predicted_sentences"]:
                    if self.approach == "tfidf":
                        _, similarity_score = self.tfidf.apply_tf_idf(
                            example["claim"], sent)
                        if similarity_score >= 0.5:  # SUPPORTS
                            predicted_labels.append("SUPPORTS")
                        elif 0.15 <= similarity_score < 0.5:  # REFUTES
                            predicted_labels.append("REFUTES")
                        else:  # NOT ENOUGH INFO
                            predicted_labels.append("Not Enough Info")
                    elif self.approach == "vs":
                        _, similarity_score = self.vs.apply_vector_space(
                            example["claim"], sent)
                        if similarity_score >= 0.5:  # SUPPORTS
                            predicted_labels.append("SUPPORTS")
                        elif 0.15 <= similarity_score < 0.5:  # REFUTES
                            predicted_labels.append("REFUTES")
                        else:  # NOT ENOUGH INFO
                            predicted_labels.append("Not Enough Info")
                    elif self.approach == "wmd":
                        _, similarity_score = self.wmd.compute_wm_distance(
                            example["claim"], sent)
                        # WMD is a distance: lower means more similar.
                        if similarity_score <= 0.4:  # SUPPORTS
                            predicted_labels.append("SUPPORTS")
                        elif 0.4 < similarity_score <= 0.9:  # REFUTES
                            predicted_labels.append("REFUTES")
                        else:  # NOT ENOUGH INFO
                            predicted_labels.append("Not Enough Info")
                tmp_dict["claims_labels"] = predicted_labels
                tmp_dict["true_evidence"] = example["true_evidences"]
                tmp_dict["predicted_evidence"] = example["predicted_sentences_ids"]
                f.write(tmp_dict)

    def predict_final_label(self, cc_results_path, final_results_path):
        """Load the claim classification (CC) results from cc_results_path and
        write the final label for each claim, chosen by majority vote over the
        per-sentence labels, to final_results_path."""
        cc_results = jsonlines.open(cc_results_path, mode='r')
        possible_labels = ["Not Enough Info", "SUPPORTS", "REFUTES"]
        with jsonlines.open(final_results_path, mode='w') as f:
            for example in cc_results:
                tmp_dict = {}
                tmp_dict["id"] = example["id"]
                tmp_dict["label"] = example["label"]
                tmp_dict["claim"] = example["claim"]
                tmp_dict["evidence"] = example["true_evidence"]
                tmp_dict["predicted_evidence"] = example["predicted_evidence"]
                count_s_labels = 0
                count_r_labels = 0
                count_nei_labels = 0
                for cc_label in example["claims_labels"]:
                    if cc_label == "SUPPORTS":
                        count_s_labels += 1
                    elif cc_label == "REFUTES":
                        count_r_labels += 1
                    else:
                        count_nei_labels += 1
                if count_s_labels > count_r_labels:
                    tmp_dict["predicted_label"] = "SUPPORTS"
                elif count_r_labels > count_s_labels:
                    tmp_dict["predicted_label"] = "REFUTES"
                elif count_s_labels == count_r_labels and (
                        count_nei_labels == 0 or count_s_labels > 0):
                    # Tie between SUPPORTS and REFUTES: pick one at random.
                    tmp_dict["predicted_label"] = possible_labels[random.randint(1, 2)]
                else:
                    # Only NEI votes were cast.
                    tmp_dict["predicted_label"] = "Not Enough Info"
                f.write(tmp_dict)

    def compute_score(self, dataset):
        with jsonlines.open(dataset, mode='r') as f:
            true_labels = []
            pred_labels = []
            for example in f:
                true_labels.append(example["label"])
                pred_labels.append(example["predicted_label"])
        true_labels = np.array(true_labels)
        pred_labels = np.array(pred_labels)
        print("score of " + self.approach,
              precision_recall_fscore_support(true_labels, pred_labels,
                                              average='weighted'))
        print("accuracy score ", accuracy_score(true_labels, pred_labels))
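# End-to-end sketch (illustrative, not part of the original source): the three
# steps of the test-time pipeline. `_demo_test_models` and the result paths are
# hypothetical placeholders; self.sr_output must already contain the sentence
# retrieval output for the chosen approach.
def _demo_test_models():
    tester = TestModels(approach="tfidf", task="test")
    # 1. Label every retrieved sentence for each claim.
    tester.load_sr_results("cc_results.jsonl")
    # 2. Majority-vote the per-sentence labels into one label per claim.
    tester.predict_final_label("cc_results.jsonl", "final_results.jsonl")
    # 3. Report weighted precision/recall/F1 and accuracy against gold labels.
    tester.compute_score("final_results.jsonl")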