import copy
import math
import operator

import numpy as np
from os.path import basename, dirname, exists

import utils
from utils import cossim

# Assumed to be defined elsewhere in this package: read_caseolap_result,
# rank_phrase, doc_assignment, lemmatize_an_idea, nlp, vector_space, hier,
# pmode, ph_idf, doc_to_ph.


# For each category, add the N phrases whose embeddings are most similar to
# the category's own embedding.
def relevant_phs(embs, cates, N):
    for cate in cates:
        worst = -100
        bestw = [-100] * (N + 1)
        bestp = [''] * (N + 1)
        # cate_ph = cate[2:]
        cate_ph = cate
        for ph in embs:
            sim = utils.cossim(embs[cate_ph], embs[ph])
            if sim > worst:
                # insert into the running top-N list, shifting lower entries down
                for i in range(N):
                    if sim >= bestw[i]:
                        for j in range(N - 1, i - 1, -1):
                            bestw[j + 1] = bestw[j]
                            bestp[j + 1] = bestp[j]
                        bestw[i] = sim
                        bestp[i] = ph
                        worst = bestw[N - 1]
                        break
        for ph in bestp[:N]:
            cates[cate].add(ph)
    print('Top similar phrases found.')
    return cates
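
# Example (hypothetical data): a minimal sketch of calling relevant_phs,
# assuming `embs` maps each phrase, including every category name, to its
# vector, and `cates` maps category names to (possibly empty) phrase sets.
#
#   embs = utils.load_embeddings('embeddings.txt')
#   cates = {'data_mining': set(), 'computer_vision': set()}
#   cates = relevant_phs(embs, cates, N=10)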

# Interactive query: rank all target_type nodes by similarity to a typed-in
# source node. The module-level `pmode` selects the metric: 'COS' for cosine
# similarity, 'DOT' for dot product.
def distance_q(source_type, target_type, embs, e_size):
    target_pool = copy.copy(embs[target_type])
    # if mode == 'Het':
    #     target_pool = {}
    #     for n_type in ntypes:
    #         target_pool.update(embs[n_type])
    while True:
        n_name = input("Enter your node: ")
        if n_name in embs[source_type]:
            print('looking for ' + n_name + '...')
            t_emb = embs[source_type][n_name]
            sim_map = {}
            for key in target_pool:
                if pmode == 'COS':
                    sim_map[key] = utils.cossim(t_emb, target_pool[key])
                elif pmode == 'DOT':
                    sim_map[key] = utils.dot_product(t_emb, target_pool[key])
            sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True)
            print(sim_map[:10])
        else:
            print('name ' + n_name + ' is not found in ' + source_type)
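
# Example (hypothetical data): an interactive nearest-neighbor session over
# phrase vectors, assuming `embs` is nested as {node_type: {name: vector}}
# and the module-level `pmode` has been set to 'COS' or 'DOT'.
#
#   embs = {'p': utils.load_embeddings('embeddings.txt')}
#   distance_q('p', 'p', embs, e_size=100)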

# Compare a node's nearest neighbors across two embedding spaces trained in
# folders a_f and b_f.
def compare(a_f, b_f):
    emb_f = 'embeddings.txt'
    a_emb_f = '%s/%s' % (a_f, emb_f)
    b_emb_f = '%s/%s' % (b_f, emb_f)
    if not exists(a_emb_f) or not exists(b_emb_f):
        print('Embedding file not found')
        exit(1)
    embs_a = utils.load_embeddings(a_emb_f)
    embs_b = utils.load_embeddings(b_emb_f)
    embs_groups = [embs_a, embs_b]
    while True:
        n_name = input("Enter your node: ")
        if n_name not in embs_a or n_name not in embs_b:
            print('%s not found' % n_name)
            continue  # skip the lookup below, which would raise KeyError
        for embs in embs_groups:
            t_emb = embs[n_name]
            sim_map = {}
            for key in embs:
                sim_map[key] = utils.cossim(t_emb, embs[key])
            sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True)
            output_str = '\n'.join(
                [sim_map[i][0] + '\t' + str(sim_map[i][1]) for i in range(10)])
            print(output_str)
            print('group finished\n')
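
# Example (hypothetical paths): compare the same node's neighborhoods in two
# independently trained embedding spaces; each folder is assumed to contain
# an embeddings.txt file.
#
#   compare('runs/iter1', 'runs/iter2')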

# Label a child cluster by intersecting phrases similar to the current label
# with the parent's CaseOLAP ranking, excluding phrases the child already
# ranks on its own.
def label_emb_centric(folder, c_id):
    print('Start labeling for %s, %s ========================' % (folder, c_id))
    par_folder = dirname(folder)
    cur_label = basename(folder)
    p_case_f = '%s/caseolap.txt' % par_folder
    c_case_f = '%s/caseolap.txt' % folder
    emb_f = '%s/embeddings.txt' % par_folder
    # load word2vec phrase embeddings
    embs = utils.load_embeddings(emb_f)
    if cur_label not in embs:
        print('Error: no embedding for %s' % cur_label)
        exit(1)
    # top-N phrases most similar to the current label (same insertion scheme
    # as relevant_phs)
    N = 100
    worst = -100
    bestw = [-100] * (N + 1)
    bestp = [''] * (N + 1)
    for ph in embs:
        sim = utils.cossim(embs[cur_label], embs[ph])
        if sim > worst:
            for i in range(N):
                if sim >= bestw[i]:
                    for j in range(N - 1, i - 1, -1):
                        bestw[j + 1] = bestw[j]
                        bestp[j + 1] = bestp[j]
                    bestw[i] = sim
                    bestp[i] = ph
                    worst = bestw[N - 1]
                    break
    cands = [(bestp[idx], bestw[idx]) for idx in range(len(bestp))]
    phrase_map_p, cell_map_p, tmp = read_caseolap_result(p_case_f)
    parent_dist_ranking = cell_map_p[c_id]
    parent_dist_map = {ph: float(dist) for (ph, dist) in parent_dist_ranking}
    child_kl_ranking = rank_phrase(c_case_f)
    child_kl_map = {ph: dist for (ph, dist) in child_kl_ranking}
    # keep candidates that are distinctive in the parent cell but absent from
    # the child's own ranking
    label_cands = {}
    for (ph, score) in cands:
        if ph not in parent_dist_map:
            continue
        if ph in child_kl_map:
            continue
        label_cands[ph] = score
    ranked_list = sorted(label_cands.items(), key=operator.itemgetter(1), reverse=True)
    print(ranked_list)
    return ranked_list[0][0]
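
# Example (hypothetical path): pick a label for taxonomy node `folder`, whose
# parent folder is assumed to hold the caseolap.txt and embeddings.txt files
# produced by earlier pipeline steps; c_id is the cluster id in the parent.
#
#   label = label_emb_centric('taxonomy/root/cluster_3', '3')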

# Rank all target labels by similarity to a single document embedding
# (non-hierarchical case).
def classify_doc_real(t_emb, target_embs, pmode):
    sim_map = {}
    for key in target_embs:
        if pmode == 'COS':
            sim_map[key] = utils.cossim(t_emb, target_embs[key])
        elif pmode == 'DOT':
            sim_map[key] = utils.dot_product(t_emb, target_embs[key])
    sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True)
    return sim_map
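
# Example (hypothetical data): rank candidate labels for one document vector,
# assuming `embs['d']` and `embs['l']` hold document and label embeddings.
#
#   doc_vec = embs['d']['doc_42']
#   ranking = classify_doc_real(doc_vec, embs['l'], pmode='COS')
#   best_label = ranking[0][0]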

# TODO: find a more reasonable way to aggregate vectors
def spacyPhraseSim(p1, p2):
    processed1 = ' '.join(lemmatize_an_idea(p1))
    processed2 = ' '.join(lemmatize_an_idea(p2))
    tok1 = nlp(processed1)
    tok2 = nlp(processed2)
    # average per-token vectors (token.vector replaces the long-removed
    # token.repvec attribute)
    v1 = np.mean([t.vector for t in tok1], axis=0)
    v2 = np.mean([t.vector for t in tok2], axis=0)
    sim = cossim(v1, v2)
    return float(sim)

# One round of seed expansion. 'EMB' mode scores unseen phrases by embedding
# similarity to existing seeds; 'DIS' mode scores them by how concentrated
# their document occurrences are in a single label's documents.
def expan_round(embs, seeds_map, all_seeds, limit, cate_lim, mode='EMB', pd_map=None):
    target_type = 'p'
    multiplier = 5
    extended_seeds = set()
    candidates = {}
    if mode == 'EMB':
        for phrase in embs[target_type]:
            if phrase in all_seeds:
                continue
            t_emb = embs[target_type][phrase]
            rel_values = {}
            # flat comparison: best similarity to any seed of each label
            for label in seeds_map:
                max_sim = 0
                for seed in seeds_map[label]:
                    sim = multiplier * utils.cossim(t_emb, embs[target_type][seed])
                    if sim > max_sim:
                        max_sim = sim
                rel_values[label] = max_sim
            utils.softmax_for_map(rel_values)
            best_label = sorted(rel_values.items(), key=operator.itemgetter(1), reverse=True)[0][0]
            candidates[best_label + '@' + phrase] = rel_values[best_label]
    elif mode == 'DIS':
        pred_label, doc_score = doc_assignment(embs, 'd', 'l', mode='FLAT')
        top_labels = [w.path for w in hier.get_all_nodes()]
        print('Doc Assignment done...')
        label_to_idx = {}
        for idx, label in enumerate(top_labels):
            label_to_idx[label] = idx
        label_to_doc = {}
        for label in top_labels:
            label_to_doc[label] = set()
        for doc, score in doc_score.items():
            label_to_doc[pred_label[doc]].add(doc)
        cnt_vec = [0.0] * len(top_labels)
        for label in label_to_doc:
            cnt_vec[label_to_idx[label]] = len(label_to_doc[label])
        comp_vec = utils.l1_normalize(cnt_vec)
        uniform_vec = [1.0 / len(top_labels)] * len(top_labels)
        for phrase in embs['p']:
            if phrase in all_seeds:
                continue
            p_vec = [0.0] * len(top_labels)
            for doc in pd_map[phrase]:
                idx = label_to_idx[pred_label[doc]]
                p_vec[idx] += 1.0
            max_label_value = 0
            best_label = ''
            for label in top_labels:
                idx = label_to_idx[label]
                if p_vec[idx] > 0:
                    norm_value = p_vec[idx] / cnt_vec[idx]
                    if norm_value > max_label_value:
                        max_label_value = norm_value
                        best_label = label
            if sum(p_vec) == 0:
                print('ERROR: %s occurs in no assigned document' % phrase)
                continue
            p_vec = utils.l1_normalize(p_vec)
            # alternatives tried: kl_divergence(p_vec, comp_vec) and
            # 0.1 + 0.9 * kl_divergence(p_vec, uniform_vec)
            kl = utils.kl_divergence(p_vec, uniform_vec)
            candidates[best_label + '@' + phrase] = kl * max_label_value
    candidates = sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)
    # take the highest-scoring candidates, capped per label and overall
    added = 0
    added_cates = {}
    for (cand, score) in candidates:
        label, phrase = cand.split('@')
        if label not in added_cates:
            added_cates[label] = 0
        if added_cates[label] >= cate_lim:
            continue
        if len(seeds_map[label]) >= 3:
            continue
        extended_seeds.add(cand)
        added_cates[label] += 1
        added += 1
        if added > limit:
            break
    print('extended: ' + str(extended_seeds))
    return extended_seeds
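
# Example (hypothetical data): one expansion round that adds at most `limit`
# new seeds overall and `cate_lim` per label; in 'DIS' mode `pd_map` must map
# each phrase to the set of documents containing it.
#
#   seeds_map = {'/ml': {'machine_learning'}, '/db': {'database_systems'}}
#   all_seeds = set.union(*seeds_map.values())
#   new_seeds = expan_round(embs, seeds_map, all_seeds, limit=20, cate_lim=5)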

def gloveSim(tokens1, tokens2):
    vec1 = vector_space.vec_for_sentence(tokens1)
    vec2 = vector_space.vec_for_sentence(tokens2)
    # sentinel score for sentences with no in-vocabulary token
    if np.isnan(np.sum(vec1)) or np.isnan(np.sum(vec2)):
        return -5000
    return float(cossim(vec1, vec2))
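
# Example (hypothetical data): gloveSim expects pre-tokenized input and
# relies on the module-level `vector_space` GloVe wrapper assumed above.
#
#   gloveSim(['deep', 'learning'], ['neural', 'networks'])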

# Collect up to N representative phrases for cluster c_id, preferring the
# CaseOLAP ranking when available, then embeddings, then TF-IDF.
def get_rep(folder, c_id, N):
    print('Start getting representative phrases for %s, %s ========================'
          % (folder, c_id))
    par_folder = dirname(folder)
    cur_label = basename(folder)
    result_phrases = [cur_label]
    ph_f = '%s/caseolap.txt' % par_folder
    if exists(ph_f):
        kw_clus_f = '%s/cluster_keywords.txt' % par_folder
        kws = set()
        with open(kw_clus_f) as f:
            for line in f:
                clus_id, ph = line.strip('\r\n').split('\t')
                if clus_id == c_id:
                    kws.add(ph)
        emb_f = '%s/embeddings.txt' % par_folder
        embs = utils.load_embeddings(emb_f)
        phrase_map_p, cell_map_p, tmp = read_caseolap_result(ph_f)
        parent_dist_ranking = cell_map_p[c_id]
        # weight each CaseOLAP score by embedding similarity to the label
        ph_scores = {}
        for (ph, score) in parent_dist_ranking:
            if ph not in kws:
                continue
            emb_dist = utils.cossim(embs[ph], embs[cur_label])
            ph_scores[ph] = score * emb_dist
        ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True)
        for (ph, score) in ph_scores:
            if ph not in result_phrases and ph in kws:
                result_phrases.append(ph)
            if len(result_phrases) >= N:
                break
    elif ph_idf is None:
        print('looking at embeddings for %s' % folder)
        ph_f = '%s/embeddings.txt' % par_folder
        kw_f = '%s/keywords.txt' % par_folder
        keywords = set()
        with open(kw_f) as f:
            for line in f:
                keywords.add(line.strip('\r\n'))
        embs = utils.load_embeddings(ph_f)
        tmp_embs = {}
        for k in keywords:
            if k in embs:
                tmp_embs[k] = embs[k]
        # keep the label's own vector so the similarity lookup below cannot fail
        if cur_label in embs:
            tmp_embs[cur_label] = embs[cur_label]
        embs = tmp_embs
        # same running top-N insertion as in relevant_phs
        worst = -100
        bestw = [-100] * (N + 1)
        bestp = [''] * (N + 1)
        for ph in embs:
            sim = utils.cossim(embs[cur_label], embs[ph])
            if sim > worst:
                for i in range(N):
                    if sim >= bestw[i]:
                        for j in range(N - 1, i - 1, -1):
                            bestw[j + 1] = bestw[j]
                            bestp[j + 1] = bestp[j]
                        bestw[i] = sim
                        bestp[i] = ph
                        worst = bestw[N - 1]
                        break
        for ph in bestp[:N]:
            if ph not in result_phrases:
                result_phrases.append(ph)
    else:
        # use TF-IDF over the cluster's documents to generate phrases
        print('looking at tf-idf for %s' % folder)
        d_clus_f = '%s/paper_cluster.txt' % par_folder
        kw_clus_f = '%s/cluster_keywords.txt' % par_folder
        docs = []
        kws = set()
        with open(d_clus_f) as f:
            for line in f:
                doc_id, clus_id = line.strip('\r\n').split('\t')
                if clus_id == c_id:
                    docs.append(doc_id)
        with open(kw_clus_f) as f:
            for line in f:
                clus_id, ph = line.strip('\r\n').split('\t')
                if clus_id == c_id:
                    kws.add(ph)
        ph_scores = {x: 0 for x in ph_idf}
        for d in docs:
            if d in doc_to_ph:
                for ph in doc_to_ph[d]:
                    if ph in ph_scores:  # guard phrases missing from ph_idf
                        ph_scores[ph] += 1
        for ph in ph_scores:
            if ph_scores[ph] == 0:
                continue
            if ph not in kws:
                ph_scores[ph] = 0
                continue
            # log-scaled frequency times inverse document frequency
            ph_scores[ph] = 1 + math.log(ph_scores[ph])
            ph_scores[ph] *= ph_idf[ph]
        ph_scores = sorted(ph_scores.items(), key=operator.itemgetter(1), reverse=True)
        for (ph, score) in ph_scores:
            if ph not in result_phrases:
                result_phrases.append(ph)
            if len(result_phrases) > N:
                break
    return result_phrases
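
# Example (hypothetical path): fetch up to N representative phrases for
# cluster c_id; which branch runs depends on whether caseolap.txt exists and
# whether the module-level `ph_idf` table has been built.
#
#   reps = get_rep('taxonomy/root/cluster_3', '3', N=15)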

def spacySim(word1, word2):
    tok1 = nlp(word1)[0]
    tok2 = nlp(word2)[0]
    # token.vector replaces the long-removed token.repvec attribute
    sim = cossim(tok1.vector, tok2.vector)
    return float(sim)
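
# Example (hypothetical data): spaCy-based similarities for single words and
# multi-word phrases; `nlp` is assumed to be a loaded model with word vectors
# (e.g. en_core_web_md).
#
#   spacySim('network', 'graph')
#   spacyPhraseSim('support vector machine', 'kernel methods')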