def kuhn_munkres_stats():
    d = read_clean_dataset()
    km = read_pickle_file(_feature_file_map['kuhn_munkres'])

    # Divide into the several stance subsets
    d['kuhn_munkres'] = km['Kuhn-Munkres']
    f = d[d.articleHeadlineStance == 'for']['kuhn_munkres']
    a = d[d.articleHeadlineStance == 'against']['kuhn_munkres']
    o = d[d.articleHeadlineStance == 'observing']['kuhn_munkres']

    # Test for normality (Shapiro-Wilk)
    _, pf = shapiro(f)
    _, po = shapiro(o)
    _, pa = shapiro(a)

    # None are normally distributed
    print(f"""Test for normality (K-M):
    1) For: {pf}
    2) Observing: {po}
    3) Against: {pa}""")

    # Calculate p-values (Mann-Whitney U)
    _, p_fa = mannwhitneyu(f, a)
    _, p_fo = mannwhitneyu(f, o)
    _, p_oa = mannwhitneyu(o, a)

    print(f"""P-values (K-M):
    1) For - Against: {p_fa}
    2) Observing - Against: {p_oa}
    3) For - Observing: {p_fo}""")
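
# The pickled 'Kuhn-Munkres' column is produced elsewhere in the pipeline. Below is a
# minimal, hypothetical sketch of how an alignment score based on the Kuhn-Munkres
# (Hungarian) algorithm could be computed with SciPy, assuming a precomputed
# word-to-word similarity matrix between claim and headline tokens.
import numpy as np
from scipy.optimize import linear_sum_assignment


def km_alignment_score(similarity_matrix):
    """Mean similarity of the optimal one-to-one word alignment (Hungarian algorithm)."""
    sim = np.asarray(similarity_matrix, dtype=float)
    rows, cols = linear_sum_assignment(-sim)  # maximise similarity = minimise its negative
    return sim[rows, cols].mean()

# Toy example:
#   sim = np.array([[0.9, 0.1, 0.0],
#                   [0.2, 0.8, 0.3],
#                   [0.0, 0.4, 0.7]])
#   km_alignment_score(sim)  # -> 0.8 (mean of 0.9, 0.8, 0.7)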
def root_dist_stats():
    d = read_clean_dataset()
    rd = read_pickle_file(_feature_file_map['root_dist'])

    # Divide into the several stance subsets, once per distance feature
    for feature in ['refute_dist', 'hedge_dist']:
        d[feature] = rd[feature]
        f = d[d.articleHeadlineStance == 'for'][feature]
        a = d[d.articleHeadlineStance == 'against'][feature]
        o = d[d.articleHeadlineStance == 'observing'][feature]

        # Test for normality (Shapiro-Wilk)
        _, pf_r = shapiro(f)
        _, po_r = shapiro(o)
        _, pa_r = shapiro(a)

        print(f"""Test for normality ({feature}):
        1) For: {pf_r}
        2) Observing: {po_r}
        3) Against: {pa_r}""")

        # Calculate p-values (Mann-Whitney U)
        _, p_fa = mannwhitneyu(f, a)
        _, p_fo = mannwhitneyu(f, o)
        _, p_oa = mannwhitneyu(o, a)

        print(f"""P-values ({feature}):
        1) For - Against: {p_fa}
        2) Observing - Against: {p_oa}
        3) For - Observing: {p_fo}""")
def word2vec_stats():
    d = read_clean_dataset()
    w2v = read_pickle_file(_feature_file_map['word2vec'])

    # Divide into the several stance subsets
    d['w2v'] = w2v.avg_similarity
    f = d[d.articleHeadlineStance == 'for']['w2v']
    a = d[d.articleHeadlineStance == 'against']['w2v']
    o = d[d.articleHeadlineStance == 'observing']['w2v']

    # Test for normality (Shapiro-Wilk)
    _, pf = shapiro(f)
    _, po = shapiro(o)
    _, pa = shapiro(a)

    # None are normally distributed
    print(f"""Test for normality (W2V):
    1) For: {pf}
    2) Observing: {po}
    3) Against: {pa}""")

    # Calculate p-values (Mann-Whitney U)
    _, p_fa = mannwhitneyu(f, a)
    _, p_fo = mannwhitneyu(f, o)
    _, p_oa = mannwhitneyu(o, a)

    print(f"""P-values (W2V):
    1) For - Against: {p_fa}
    2) Observing - Against: {p_oa}
    3) For - Observing: {p_fo}""")
def __init__(self, index=0, features=None, classifier="", settings=None,
             test="", hyperparameters_grid=None):
    # Avoid mutable default arguments for the feature list and settings dict
    self.id = index
    self.features = features if features is not None else []
    self.test = test
    self.classifier = classifier
    self.trainingSettings = settings if settings is not None else {}
    self.model = None
    self.hyperparameters_grid = hyperparameters_grid
    self.labels = read_clean_dataset()['articleHeadlineStance']
    self.featureMatrix = self.constructFeaturesMatrix()

    # Compute train and test splits (stratified to preserve the stance proportions)
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        self.featureMatrix, self.labels,
        test_size=0.2, random_state=0, stratify=self.labels)
    self.results = self.trainOnData()
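
    # A hypothetical sketch of how the hyperparameters_grid attribute could be used
    # during training with scikit-learn (trainOnData() is defined elsewhere in this
    # class and may do something different):
    #
    #   from sklearn.model_selection import GridSearchCV
    #
    #   def trainWithGridSearch(self, estimator):
    #       search = GridSearchCV(estimator, param_grid=self.hyperparameters_grid,
    #                             cv=5, scoring='accuracy')
    #       search.fit(self.X_train, self.y_train)
    #       return search.best_estimator_, search.best_params_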
        avg_similarities.append(avg_sim)
        prod_similarities.append(prod_sim)

        i += 1
        if i % 50 == 0:
            print(f'[{i}] Sim between {claim} ||||| {headline} --> ({avg_sim}/{prod_sim})')

    # After computing all similarities, add new columns to the dataframe
    d = pd.DataFrame()
    d['avg_similarity'] = avg_similarities
    d['prod_similarity'] = prod_similarities
    return d


if __name__ == '__main__':
    # Vector directory on my machine
    VECTOR_DIR = "../../../wse/vec"

    # Load the clean dataset
    df = read_clean_dataset()

    # Load the vectors (vectors are number 3 from https://fasttext.cc/docs/en/english-vectors.html)
    print('Loading vectors')
    nlp = spacy.load(VECTOR_DIR)
    print('Loaded vectors')

    similarity_df = claim_to_headline_sim(df, nlp)

    print('Saving features to', PICKLED_FEATURES_PATH)
    similarity_df.to_pickle(PICKLED_FEATURES_PATH + "word2vec.pkl")
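
# The body of claim_to_headline_sim() is truncated above. Below is a minimal,
# hypothetical sketch of how an average token-level similarity between a claim and a
# headline could be computed with the spaCy vectors loaded in __main__; it is not
# necessarily the exact computation behind the pickled avg_similarity feature.
def avg_token_similarity(nlp, claim, headline):
    """Mean cosine similarity over all claim-token / headline-token pairs with vectors."""
    claim_doc, headline_doc = nlp(claim), nlp(headline)
    sims = [c.similarity(h)
            for c in claim_doc if c.has_vector
            for h in headline_doc if h.has_vector]
    return sum(sims) / len(sims) if sims else 0.0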
    for word in words:
        if graph.has_node(word):
            min_dist = min(nx.shortest_path_length(graph, source=root, target=word), min_dist)
    return min_dist


# The dependency graph is a graph (tree) with words as nodes: if word A is dependent
# on word B in a sentence, then there is an edge from B to A.
def create_dependency_graph(sentence):
    '''Creates the dependency graph for the sentence using StanfordNLP'''
    edges = []
    root = ''
    for token in sentence.dependencies:
        dep = token[0].text.lower()
        if dep != 'root':
            edges.append((dep, token[2].text))
        else:
            root = token[2].text
    return nx.Graph(edges), root


dataset = read_clean_dataset()  # Read the dataset
dataset = apply_lower_case(dataset)
dataset = apply_strip(dataset)
dataset = extract_root_dist(dataset)

a = dataset[['refute_dist', 'hedge_dist']]
a.to_pickle(PICKLED_FEATURES_PATH + "root_dist.pkl")
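
# A small, illustrative usage sketch for create_dependency_graph(), assuming the
# StanfordNLP English models have been downloaded (stanfordnlp.download('en'));
# none of the names below are part of this script:
#
#   pipeline = stanfordnlp.Pipeline()
#   doc = pipeline("the spokesman reportedly denied the claim")
#   graph, root = create_dependency_graph(doc.sentences[0])
#   # root is the text of the ROOT token ("denied"); the distance of a hedging word
#   # such as "reportedly" from the root is then a shortest path in the graph:
#   nx.shortest_path_length(graph, source=root, target='reportedly')  # -> 1 with a standard parse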
def q_counts():
    # Summary statistics per Q-feature and stance: (mean, std, number_samples)
    f = {'q_ends': (0.00885, 0.09388, 1238),
         'q_contains': (0.022617, 0.14874, 1238)}
    o = {'q_ends': (0.090437, 0.286955, 962),
         'q_contains': (0.133056, 0.339812, 962)}
    a = {'q_ends': (0.025316, 0.157284, 395),
         'q_contains': (0.075949, 0.265253, 395)}

    d = read_clean_dataset()
    q = read_pickle_file(_feature_file_map['Q'])
    q['Stance'] = d.articleHeadlineStance

    # Run the t-tests from the summary statistics
    for feature in ['q_ends', 'q_contains']:
        mean_f, std_f, n_f = f[feature]
        mean_a, std_a, n_a = a[feature]
        mean_o, std_o, n_o = o[feature]

        # Run the actual tests
        _, p_fo = ttest_ind_from_stats(mean1=mean_f, std1=std_f, nobs1=n_f,
                                       mean2=mean_o, std2=std_o, nobs2=n_o)
        _, p_fa = ttest_ind_from_stats(mean1=mean_f, std1=std_f, nobs1=n_f,
                                       mean2=mean_a, std2=std_a, nobs2=n_a)
        _, p_ao = ttest_ind_from_stats(mean1=mean_a, std1=std_a, nobs1=n_a,
                                       mean2=mean_o, std2=std_o, nobs2=n_o)

        print(f"""P-values ({feature}):
        1) For - Against: {p_fa}
        2) Observing - Against: {p_ao}
        3) For - Observing: {p_fo}""")

        # Chi-square test for independence between the feature and the stance
        contingency_table = pd.crosstab(q['Stance'], q[feature], margins=False)
        chi2_stat, p_val, dof, ex = stats.chi2_contingency(contingency_table)

        print("\n")
        print(f"=== Chi2 Stat ({feature}) ===")
        print(chi2_stat)
        print("\n")
        print("=== Degrees of Freedom ===")
        print(dof)
        print("\n")
        print("=== P-Value ===")
        print(p_val)
        print("\n")
        print("=== Expected Frequencies (under independence) ===")
        print(ex)
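
# The (mean, std, n) triples in q_counts() are hard-coded. Below is a minimal sketch
# of how the same summary statistics could be recomputed directly from the pickled Q
# features, assuming q and its 'Stance' column are built exactly as in q_counts().
def q_summary_stats(q, feature):
    """Return {stance: (mean, std, n)} for the given Q-feature column."""
    grouped = q.groupby('Stance')[feature]
    return {stance: (vals.mean(), vals.std(), len(vals)) for stance, vals in grouped}

# Example:
#   q_summary_stats(q, 'q_ends')  # -> {'against': (...), 'for': (...), 'observing': (...)}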