def calculate_AMs(collocation_matrix):
    """Compute LLR and local-MI association measures for every cell of
    *collocation_matrix* and dump the two result dicts as JSON files.

    Relies on module-level globals: ``first_order_freqs``, ``freq_list``,
    ``rank_map``, ``wordcount_ratio``, ``config_data``, ``comp``, ``json``
    and the constants ``CORPUS_FREQUENCIES`` / ``PROJECT_PATH``.
    """
    LLR = {}
    locMI = {}
    # The total-frequency normaliser is constant for the whole run;
    # hoisted out of the inner loop (originally recomputed per cell).
    tf = sum(config_data[CORPUS_FREQUENCIES])
    for row in collocation_matrix:
        c1 = first_order_freqs[row][0]
        LLR[row] = []
        locMI[row] = []
        for col in collocation_matrix[row]:
            # Estimated corpus frequencies of the collocate and the pair.
            c2 = freq_list[rank_map[col]] * wordcount_ratio
            c12 = collocation_matrix[row][col] * first_order_freqs[row][1]
            if c12 > 0:
                LLR[row].append(comp.LLR(c1, c2, c12, tf))
                locMI[row].append(comp.localMI(c1, c2, c12, tf))
            else:
                # Zero co-occurrence: both measures are defined as 0.
                LLR[row].append(0)
                locMI[row].append(0)
    print("printing output files")
    with open(config_data[PROJECT_PATH] + "sentence_collocation_matrix_with_freqs_LLR.json",
              "w", encoding="utf-8") as f:
        json.dump(LLR, f)
    with open(config_data[PROJECT_PATH] + "sentence_collocation_matrix_with_freqs_locMI.json",
              "w", encoding="utf-8") as f:
        json.dump(locMI, f)
# Compute LLR / PMI / local-MI association scores for each syntactic pattern
# in `syn_list` (patterns whose word is missing from `f2_freqs` are skipped),
# then write one TSV file per measure.
# NOTE(review): near-duplicate of the following chunk — consider deduplicating.
# NOTE(review): `f1` and `tf` are module-level globals set elsewhere — the
# same f1 is reused for every pattern; verify that is intended.
LLR = {}
PMI = {}
locMI = {}
print(f2_freqs)
for synpat in syn_list:
    if syn_list[synpat]["word"] in f2_freqs:
        f2 = f2_freqs[syn_list[synpat]["word"]]
        f12 = syn_list[synpat]["f12"]
        args = (f1, f2, f12, tf)
        LLR.update({synpat: comp.LLR(*args)})
        PMI.update({synpat: comp.PMI(*args)})
        locMI.update({synpat: comp.localMI(*args)})
with open(config_data[PROJECT_PATH] + "syntactic_synpat_associations_LLR.tsv", "w", encoding="utf-8") as f:
    for line in LLR:
        f.write(line + "\t" + str(LLR[line]) + "\n")
with open(config_data[PROJECT_PATH] + "syntactic_synpat_associations_PMI.tsv", "w", encoding="utf-8") as f:
    for line in PMI:
        f.write(line + "\t" + str(PMI[line]) + "\n")
# NOTE(review): SOURCE is truncated mid-statement below — the locMI open()
# call has no remaining arguments or body in the visible chunk.
with open(config_data[PROJECT_PATH] + "syntactic_synpat_associations_locMI.tsv",
# Compute LLR / PMI / local-MI association scores per syntactic pattern in
# `syn_list` (skipping words absent from `f2_freqs`) and write one TSV per
# measure, followed by JSON output.
# NOTE(review): near-duplicate of the preceding chunk — consider deduplicating.
# NOTE(review): `f1` and `tf` are globals set elsewhere; the same f1 is
# applied to every pattern — confirm against the caller.
LLR = {}
PMI = {}
locMI = {}
print(f2_freqs)
for synpat in syn_list:
    if syn_list[synpat]["word"] in f2_freqs:
        f2 = f2_freqs[syn_list[synpat]["word"]]
        f12 = syn_list[synpat]["f12"]
        args = (f1, f2, f12, tf)
        LLR.update( { synpat : comp.LLR(*args) } )
        PMI.update( { synpat : comp.PMI(*args) } )
        locMI.update( { synpat : comp.localMI(*args) } )
with open(config_data[PROJECT_PATH]+"syntactic_synpat_associations_LLR.tsv", "w", encoding="utf-8") as f:
    for line in LLR:
        f.write(line +"\t"+str(LLR[line]) + "\n")
with open(config_data[PROJECT_PATH]+"syntactic_synpat_associations_PMI.tsv", "w", encoding="utf-8") as f:
    for line in PMI:
        f.write(line +"\t"+str(PMI[line]) + "\n")
with open(config_data[PROJECT_PATH]+"syntactic_synpat_associations_locMI.tsv", "w", encoding="utf-8") as f:
    for line in locMI:
        f.write(line +"\t"+str(locMI[line]) + "\n")
# NOTE(review): SOURCE is truncated here — this with-statement has no body
# in the visible chunk (presumably json.dump(LLR, f) followed).
with open(config_data[PROJECT_PATH]+"syntactic_synpat_associations_LLR.json", "w", encoding="utf-8") as f:
# NOTE(review): this chunk begins INSIDE a loop whose header (binding `row`
# over `data`) lies outside the visible SOURCE; the indentation below is
# reconstructed — confirm against the full file.
# For each row, fill five association-measure vector dicts (LLR, PMI,
# local MI, plus Laplace-smoothed PMI / local MI) over all columns whose
# corpus frequency exceeds F_THRESHOLD.
LLR_vectors.update( { row : {} } )
PMI_vectors.update( { row : {} } )
locMI_vectors.update( { row : {} } )
LPPMI_vectors.update( { row : {} } )
LPlocMI_vectors.update( { row : {} } )
if "full_count" in data[row]:
    f1 = data[row]["full_count"]
    for col in word_frequencies:
        if word_frequencies[col] > F_THRESHOLD:
            f2 = word_frequencies[col]
            if col in data[row]:
                f12 = data[row][col]
                LLR_vectors[row].update({ col : comp.LLR(f1, f2, f12, tf) } )
                # Laplace-smoothed (+1) variants normalise by tf + |vocabulary|.
                LPlocMI_vectors[row].update({ col : comp.localMI(f1+1, f2+1, f12+1, tf+len(word_frequencies)) } )
                LPPMI_vectors[row].update({ col : comp.PMI(f1+1, f2+1, f12+1, tf+len(word_frequencies)) } )
                locMI_vectors[row].update({ col : comp.localMI(f1, f2, f12, tf) } )
                PMI_vectors[row].update({ col : comp.PMI(f1, f2, f12, tf) } )
            else:
                # Unseen pair: smoothed measures use pseudo-count 1,
                # unsmoothed measures are defined as 0.
                LPPMI_vectors[row].update({ col : comp.PMI(f1+1, f2+1, 1, tf+len(word_frequencies)) })
                LPlocMI_vectors[row].update({ col : comp.localMI(f1+1, f2+1, 1, tf+len(word_frequencies)) } )
                LLR_vectors[row].update( { col : 0 } )
                PMI_vectors[row].update( { col : 0 } )
                locMI_vectors[row].update( { col : 0 } )
else:
    print("no count", row)
# NOTE(review): the write below presumably sits AFTER the enclosing row loop
# (module level) — TODO confirm placement in the full file.
with open(config_data[PROJECT_PATH]+"syntactic_LLR_vectors_"+str(F_THRESHOLD)+".json", "w", encoding="utf-8") as f:
    json.dump(LLR_vectors, f)
# Build second-order association matrices (LLR, PMI, local MI, plus
# Laplace-smoothed PMI / local MI) from the raw co-occurrence counts in
# `f_matrix`, then dump the LLR matrix to JSON.
# Relies on globals: f_matrix, full_counts, tf, wordlist, comp, config_data,
# PROJECT_PATH, F_THRESHOLD, json, and the five *_matrix dicts.
for i in f_matrix:
    LLR_matrix[i] = {}
    PMI_matrix[i] = {}
    locMI_matrix[i] = {}
    LPPMI_matrix[i] = {}
    LPlocMI_matrix[i] = {}
    # f1 depends only on the row — hoisted out of the inner loop
    # (originally re-fetched for every j).
    f1 = full_counts[i]
    for j in f_matrix[i]:
        f2 = full_counts[j]
        f12 = f_matrix[i][j]
        if f12 > 0:
            LLR_matrix[i][j] = comp.LLR(f1, f2, f12, tf)
            PMI_matrix[i][j] = comp.PMI(f1, f2, f12, tf)
            locMI_matrix[i][j] = comp.localMI(f1, f2, f12, tf)
            # Laplace-smoothed (+1) variants normalise by tf + |vocabulary|.
            LPPMI_matrix[i][j] = comp.PMI(f1 + 1, f2 + 1, f12 + 1, tf + len(wordlist))
            LPlocMI_matrix[i][j] = comp.localMI(f1 + 1, f2 + 1, f12 + 1, tf + len(wordlist))
        else:
            # Unseen pair: unsmoothed measures default to 0, smoothed
            # measures use pseudo-count 1.
            LLR_matrix[i][j] = 0
            PMI_matrix[i][j] = 0
            locMI_matrix[i][j] = 0
            LPPMI_matrix[i][j] = comp.PMI(f1 + 1, f2 + 1, 1, tf + len(wordlist))
            LPlocMI_matrix[i][j] = comp.localMI(f1 + 1, f2 + 1, 1, tf + len(wordlist))
print("writing output files")
with open(config_data[PROJECT_PATH] + "sentence_token_LLR_" + str(F_THRESHOLD) + ".json",
          "w", encoding="utf-8") as f:
    json.dump(LLR_matrix, f)
# Fill the five association-measure matrices (LLR, PMI, local MI, and the
# Laplace-smoothed PMI / local-MI variants) from the co-occurrence counts in
# `f_matrix`.  Relies on globals: full_counts, tf, wordlist, comp, and the
# five pre-declared *_matrix dicts.
# NOTE(review): duplicates the loop in the preceding chunk — consider
# extracting a shared helper.
for i in f_matrix:
    LLR_matrix[i] = {}
    PMI_matrix[i] = {}
    locMI_matrix[i] = {}
    LPPMI_matrix[i] = {}
    LPlocMI_matrix[i] = {}
    # f1 depends only on the row — hoisted out of the inner loop
    # (originally re-fetched for every j).
    f1 = full_counts[i]
    for j in f_matrix[i]:
        f2 = full_counts[j]
        f12 = f_matrix[i][j]
        if f12 > 0:
            LLR_matrix[i][j] = comp.LLR(f1, f2, f12, tf)
            PMI_matrix[i][j] = comp.PMI(f1, f2, f12, tf)
            locMI_matrix[i][j] = comp.localMI(f1, f2, f12, tf)
            # Laplace-smoothed (+1) variants normalise by tf + |vocabulary|.
            LPPMI_matrix[i][j] = comp.PMI(f1 + 1, f2 + 1, f12 + 1, tf + len(wordlist))
            LPlocMI_matrix[i][j] = comp.localMI(f1 + 1, f2 + 1, f12 + 1, tf + len(wordlist))
        else:
            # Unseen pair: unsmoothed measures default to 0, smoothed
            # measures use pseudo-count 1.
            LLR_matrix[i][j] = 0
            PMI_matrix[i][j] = 0
            locMI_matrix[i][j] = 0
            LPPMI_matrix[i][j] = comp.PMI(f1 + 1, f2 + 1, 1, tf + len(wordlist))
            LPlocMI_matrix[i][j] = comp.localMI(f1 + 1, f2 + 1, 1, tf + len(wordlist))
# Compute LLR and local-MI association measures for every cell of
# `collocation_matrix` and dump both result dicts as JSON.
# NOTE(review): duplicates the body of calculate_AMs() — consider calling it
# instead.  Relies on globals: first_order_freqs, freq_list, rank_map,
# wordcount_ratio, config_data, comp, json, CORPUS_FREQUENCIES, PROJECT_PATH.
LLR = {}
locMI = {}
# The total-frequency normaliser is constant for the whole run;
# hoisted out of the inner loop (originally recomputed per cell).
tf = sum(config_data[CORPUS_FREQUENCIES])
for row in collocation_matrix:
    c1 = first_order_freqs[row][0]
    LLR[row] = []
    locMI[row] = []
    for col in collocation_matrix[row]:
        # Estimated corpus frequencies of the collocate and the pair.
        c2 = freq_list[rank_map[col]] * wordcount_ratio
        c12 = collocation_matrix[row][col] * first_order_freqs[row][1]
        if c12 > 0:
            LLR[row].append(comp.LLR(c1, c2, c12, tf))
            locMI[row].append(comp.localMI(c1, c2, c12, tf))
        else:
            # Zero co-occurrence: both measures are defined as 0.
            LLR[row].append(0)
            locMI[row].append(0)
print("printing output files")
with open(config_data[PROJECT_PATH] + "sentence_collocation_matrix_with_freqs_LLR.json",
          "w", encoding="utf-8") as f:
    json.dump(LLR, f)
with open(config_data[PROJECT_PATH] + "sentence_collocation_matrix_with_freqs_locMI.json",
          "w", encoding="utf-8") as f:
    json.dump(locMI, f)