import csv
import json
import math
import os
import re
from collections import OrderedDict

import numpy as np
from gensim.models import Word2Vec

# The project-local names used below (tr, srr, ut, td, ts, utils,
# exp_loader_doc_graph, calculate_node_stats, calc_avg_word_freq and the
# stopword list `stop`) are assumed to be imported/defined elsewhere in
# this module.


def compute(score_lookup, word2vec_model, jate_terms_file, stopwords,
            jate_terms_folder, out_folder, append_label=None):
    kcr_lookup = score_lookup
    model = Word2Vec.load(word2vec_model)
    jate_term_base_scores = {c[0]: c[1]
                             for c in tr.jate_terms_iterator(jate_terms_file)}

    # Collect the normalized unigrams of every candidate term, dropping
    # stopwords and single-character fragments.
    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    # Look up a base score for each unigram (0.0 if absent), then normalize.
    sum_unigram_scores = {}
    for tu in term_unigrams:
        sum_unigram_scores[tu] = kcr_lookup.get(tu, 0.0)
    sum_unigram_scores = utils.normalize(sum_unigram_scores)

    jate_terms_components = td.generate_term_component_map(
        jate_term_base_scores, 5, model)

    # Re-rank every ATE output file in the folder and write the new scores.
    for file in os.listdir(jate_terms_folder):
        print("\t{}".format(file))
        jate_term_base_scores = {
            c[0]: c[1]
            for c in tr.jate_terms_iterator(jate_terms_folder + "/" + file)
        }
        term_rank_scores = ts.SemReRankScorer(
            sum_unigram_scores, jate_terms_components, jate_term_base_scores)
        out_file = out_folder + "/" + file
        if append_label is not None:
            out_file = out_file + "_" + append_label
        with open(out_file, 'w') as outfile:
            json.dump(list(term_rank_scores), outfile)
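
# A minimal usage sketch for compute(). The paths, the score lookup and the
# stopword list below are hypothetical placeholders, not values from this
# project:
#
#   from nltk.corpus import stopwords as nltk_stopwords
#   compute(kcr_scores,                      # {unigram: score} dict
#           "models/corpus.w2v",             # gensim Word2Vec model file
#           "output/ttf_terms.json",         # base ATE scores (JATE output)
#           set(nltk_stopwords.words("english")),
#           "output/ate_out",                # folder of ATE output files
#           "output/srr_out")                # folder for re-ranked scores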
def calc_word_freq(ttf_term_json, ttf_word_out_file):
    term_ttf_scores = {c[0]: c[1]
                       for c in srr.jate_terms_iterator(ttf_term_json)}

    # Aggregate each term's total term frequency onto its component words.
    word_freq = {}
    for t, ttf in term_ttf_scores.items():
        parts = t.split(" ")
        for p in parts:
            p = exp_loader_doc_graph.lemmatizer.lemmatize(p).strip().lower()
            if p in stop or len(p) < 2:
                continue
            if p in word_freq:
                word_freq[p] += ttf
            else:
                word_freq[p] = ttf

    sorted_w_ttf = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    with open(ttf_word_out_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for word, freq in sorted_w_ttf:
            writer.writerow([word, freq])
def run_random_baseline(word2vec_model, jate_terms_file, stopwords,
                        jate_terms_folder, out_folder):
    model = None
    if word2vec_model is not None:
        model = Word2Vec.load(word2vec_model)
    jate_term_base_scores = {c[0]: c[1]
                             for c in tr.jate_terms_iterator(jate_terms_file)}

    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    # Assign every unigram a random score instead of a computed one.
    sum_unigram_scores = {tu: 0.0 for tu in term_unigrams}
    sum_unigram_scores = utils.randomize(sum_unigram_scores)

    jate_terms_components = td.generate_term_component_map(
        jate_term_base_scores, 5, model)
    for file in os.listdir(jate_terms_folder):
        jate_term_base_scores = {
            c[0]: c[1]
            for c in tr.jate_terms_iterator(jate_terms_folder + "/" + file)
        }
        term_rank_scores = ts.SemReRankScorer(
            sum_unigram_scores, jate_terms_components, jate_term_base_scores)
        out_file = out_folder + "/" + file + "-random"
        with open(out_file, 'w') as outfile:
            json.dump(list(term_rank_scores), outfile)
def find_terms_with_infrequent_words(ttf_term_json, ttf_word_csv_file,
                                     min_ttf_word, out_file, gs_file=None):
    term_ttf_scores = {c[0]: c[1]
                       for c in srr.jate_terms_iterator(ttf_term_json)}

    # Load the per-word total term frequencies written by calc_word_freq.
    word_ttf_scores = {}
    with open(ttf_word_csv_file, newline='\n') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            word_ttf_scores[row[0]] = row[1]

    # Optionally load the gold-standard terms. gs_terms must stay None when
    # no gold-standard file is given (initializing it to [] would make the
    # selection below drop every term), so every infrequent term is kept.
    gs_terms = None
    if gs_file is not None:
        gs_terms = []
        with open(gs_file) as f:
            terms = f.readlines()
        for t in terms:
            t = exp_loader_doc_graph.lemmatizer.lemmatize(t).strip().lower()
            if len(t) >= 2:
                gs_terms.append(t)

    # A term qualifies when none of its content words reaches min_ttf_word.
    selected_terms = []
    for t, ttf in term_ttf_scores.items():
        parts = t.split(" ")
        all_words_infrequent = True
        for p in parts:
            p = exp_loader_doc_graph.lemmatizer.lemmatize(p).strip().lower()
            if p in stop or len(p) < 2:
                continue
            if p in word_ttf_scores:
                wtf = word_ttf_scores[p]
                if float(wtf) >= min_ttf_word:
                    all_words_infrequent = False
                    break
        if all_words_infrequent:
            if gs_terms is not None and t in gs_terms:
                selected_terms.append(t)
            elif gs_terms is None:
                selected_terms.append(t)

    selected_terms = sorted(selected_terms)
    with open(out_file, 'w') as the_file:
        for t in selected_terms:
            the_file.write(t + '\n')
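
# A sketch of how calc_word_freq (above) and find_terms_with_infrequent_words
# chain together: the first writes the per-word frequency CSV that the second
# reads. All file paths and the threshold are hypothetical placeholders:
#
#   calc_word_freq("output/ttf_terms.json", "output/word_ttf.csv")
#   find_terms_with_infrequent_words("output/ttf_terms.json",
#                                    "output/word_ttf.csv",
#                                    min_ttf_word=5,
#                                    out_file="output/rare_word_terms.txt",
#                                    gs_file="data/gold_standard.txt")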
def analyze_node_degree(folder, jate_terms_file, stopwords, out_folder,
                        folder_base_pattern):
    simTs = set()
    topNs = set()
    jate_term_base_scores = {c[0]: c[1]
                             for c in tr.jate_terms_iterator(jate_terms_file)}

    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    # Folder names end with "-<topN>-<simT>"; collect all observed settings.
    for file in os.listdir(folder):
        graph_type_folder = os.path.basename(file)
        parts = graph_type_folder.split("-")
        topN = parts[-2]
        simT = parts[-1]
        simTs.add(simT)
        topNs.add(topN)
    simTs = sorted(simTs)
    topNs = sorted(topNs)

    # Write a CSV per setting: all-node degrees, with term-node degrees in a
    # second column where available.
    for simT in simTs:
        print(simT)
        for topN in topNs:
            target_folder = (folder + "/" + folder_base_pattern + "-" +
                             topN + "-" + simT)
            print(target_folder)
            node_stats = calculate_node_stats(target_folder, term_unigrams)
            anynode_degrees = node_stats[0]
            termnode_degrees = node_stats[1]
            out_file = (out_folder + "/" + folder_base_pattern + "-" +
                        topN + "-" + simT + ".csv")
            with open(out_file, 'w') as f:
                for i in range(0, len(anynode_degrees)):
                    line = str(anynode_degrees[i])
                    if len(termnode_degrees) > i:
                        line += "," + str(termnode_degrees[i])
                    line += "\n"
                    f.write(line)
def analyse_threshold(word2vec_model, jate_terms_file, stopwords, out_file,
                      term_only):
    model = Word2Vec.load(word2vec_model)
    jate_term_base_scores = {c[0]: c[1]
                             for c in tr.jate_terms_iterator(jate_terms_file)}

    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    print(len(term_unigrams))
    with open(out_file, 'w') as f:
        f.write("UNIGRAM, 0.9, 0.8, 0.7, 0.6, 0.5\n")

        # For each unigram, count how many of its most similar words
        # (optionally restricted to term unigrams) pass each threshold.
        # most_similar returns neighbours sorted by descending similarity,
        # so we can stop at the first one below the threshold.
        simTs = [0.9, 0.8, 0.7, 0.6, 0.5]
        for unigram in term_unigrams:
            if unigram not in model.wv.vocab.keys():
                continue
            similar = model.wv.most_similar(positive=unigram, topn=100000)
            line = "\"" + unigram + "\","
            for simT in simTs:
                count = 0
                for item in similar:
                    if term_only and item[0] not in term_unigrams:
                        continue
                    if item[1] < simT:
                        break
                    count += 1
                line += str(count) + ","
            line = line + "\n"
            f.write(line)
def calc_movement_stats(base_ate_outlist_json, srr_ate_outlist_json,
                        term_filter_list_file, word_ttf_scores: dict):
    with open(term_filter_list_file) as f:
        filter_terms = f.readlines()

    base_ate_scores = {c[0]: c[1]
                       for c in srr.jate_terms_iterator(base_ate_outlist_json)}
    sorted_base_ate = sorted(base_ate_scores, key=base_ate_scores.get,
                             reverse=True)
    ssr_ate_scores = {c[0]: c[1]
                      for c in ut.semrerank_json_reader(srr_ate_outlist_json)}
    sorted_ssr_ate = sorted(ssr_ate_scores, key=ssr_ate_scores.get,
                            reverse=True)

    # One 0.05-wide bin per movement range, keyed by the bin's lower bound
    # ('-1.00' ... '0.95'). sum_avg_word_freq accumulates, per bin, the
    # average word frequency of each term that falls into that bin.
    distribution = {}
    sum_avg_word_freq = {}
    for adv in np.arange(-1.0, 1.0, 0.05):
        distribution[format(adv, '.2f')] = 0
        sum_avg_word_freq[format(adv, '.2f')] = 0

    count_advances = 0
    count_advanced_percentages = 0
    count_drops = 0
    count_drop_percentages = 0
    for f_t in filter_terms:
        f_t = f_t.strip()
        if f_t not in sorted_base_ate:
            continue
        avg_word_freq = calc_avg_word_freq(f_t, word_ttf_scores)
        ate_index = sorted_base_ate.index(f_t)
        ssr_index = sorted_ssr_ate.index(f_t)

        # Positive advance: the term moved up after re-ranking; negative: it
        # dropped. Normalized by the list length, so advance lies in [-1, 1].
        advance = (ate_index - ssr_index) / len(sorted_base_ate)
        if advance > 0:
            count_advanced_percentages += advance
            count_advances += 1
        elif advance < 0:
            count_drop_percentages += advance
            count_drops += 1

        # Map the movement onto its bin. This replaces a long hand-written
        # if/elif chain that used a non-existent '-1.0' key, mislabelled
        # several positive bins and skipped the [0.30, 0.35) range; round()
        # guards against floating-point boundary artefacts and min() clamps
        # advance == 1.0 into the top bin '0.95'.
        bin_index = math.floor(round(advance / 0.05, 9))
        bin_key = format(min(bin_index * 0.05, 0.95), '.2f')
        distribution[bin_key] += 1
        sum_avg_word_freq[bin_key] += avg_word_freq

    avg_adv = 0
    avg_drop = 0
    if count_advances > 0:
        avg_adv = count_advanced_percentages / count_advances
    if count_drops > 0:
        avg_drop = count_drop_percentages / count_drops
    print("avg advance,{},avg drop,{}".format(avg_adv, avg_drop))

    # Convert each bin's frequency sum into an average over its samples.
    avg_word_freq = {}
    for mv_range, sum_avg_w_f in sum_avg_word_freq.items():
        samples = distribution[mv_range]
        if samples > 0:
            avg_word_freq[mv_range] = sum_avg_w_f / samples
        else:
            avg_word_freq[mv_range] = 0
    return (OrderedDict(sorted(distribution.items())),
            OrderedDict(sorted(avg_word_freq.items())))
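
# A minimal sketch of inspecting the movement statistics. The paths are
# hypothetical, and word_ttf_scores is assumed to be a {word: frequency}
# dict such as one loaded from the CSV written by calc_word_freq:
#
#   dist, avg_freq = calc_movement_stats("output/ate_base.json",
#                                        "output/ate_srr.json",
#                                        "output/rare_word_terms.txt",
#                                        word_ttf_scores)
#   for mv_range, count in dist.items():
#       print("{},{},{}".format(mv_range, count, avg_freq[mv_range]))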