Example #1
import utils


def relevant_phs(embs, cates, N):
    # For each category, keep the N phrases whose embeddings are most
    # similar to the category embedding, via insertion into a sorted list.
    for cate in cates:
        worst = -100
        bestw = [-100] * (N + 1)
        bestp = [''] * (N + 1)
        cate_ph = cate

        for ph in embs:
            sim = utils.cossim(embs[cate_ph], embs[ph])
            if sim > worst:
                for i in range(N):
                    if sim >= bestw[i]:
                        for j in range(N - 1, i - 1, -1):
                            bestw[j + 1] = bestw[j]
                            bestp[j + 1] = bestp[j]
                        bestw[i] = sim
                        bestp[i] = ph
                        worst = bestw[N - 1]
                        break

        for ph in bestp[:N]:
            cates[cate].add(ph)

    print('Top similar phrases found.')

    return cates
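
The hand-rolled insertion loop above maintains the sorted top-N list in place. A more compact equivalent can use heapq.nlargest; a minimal sketch, assuming the same utils.cossim helper and the embs/cates structures used above (relevant_phs_heapq is a hypothetical name):

import heapq

import utils


def relevant_phs_heapq(embs, cates, N):
    # Same selection: for each category, the N phrases whose embeddings
    # have the highest cosine similarity to the category embedding.
    for cate in cates:
        top = heapq.nlargest(
            N, embs, key=lambda ph: utils.cossim(embs[cate], embs[ph]))
        cates[cate].update(top)
    return cates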
Example #2
import copy
import operator

import utils


def distance_q(source_type, target_type, embs, e_size, pmode='COS'):
    # pmode='COS': use cosine similarity as the distance measure
    # pmode='DOT': use the dot product as the distance measure

    target_pool = copy.copy(embs[target_type])
    # if mode == 'Het':
    # 	target_pool = {}
    # 	for n_type in ntypes:
    # 		target_pool.update(embs[n_type])

    while True:
        n_name = input("Enter your node: ")
        if n_name in embs[source_type]:
            print('looking for ' + n_name + '...')
            t_emb = embs[source_type][n_name]
            sim_map = {}
            for key in target_pool:
                if pmode == 'COS':
                    sim_map[key] = utils.cossim(t_emb, target_pool[key])
                elif pmode == 'DOT':
                    sim_map[key] = utils.dot_product(t_emb, target_pool[key])
            sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True)
            print(sim_map[:10])
        else:
            print('name ' + n_name + ' is not found in ' + source_type)
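
These snippets lean on a project-local utils module whose implementation isn't shown. A minimal numpy-based sketch of the two similarity helpers, assuming embeddings are numpy arrays or plain lists of floats:

import numpy as np


def cossim(v1, v2):
    # Cosine similarity: inner product over the product of Euclidean norms.
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0


def dot_product(v1, v2):
    # Unnormalized variant: plain inner product.
    return float(np.dot(np.asarray(v1, dtype=float), np.asarray(v2, dtype=float)))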
Example #3

import operator
import sys
from os.path import exists

import utils


def compare(a_f, b_f):

    emb_f = 'embeddings.txt'
    a_emb_f = '%s/%s' % (a_f, emb_f)
    b_emb_f = '%s/%s' % (b_f, emb_f)

    if not exists(a_emb_f) or not exists(b_emb_f):
        print('Embedding file not found')
        sys.exit(1)

    embs_a = load_embeddings(a_emb_f)
    embs_b = load_embeddings(b_emb_f)

    embs_groups = [embs_a, embs_b]

    while True:
        n_name = input("Enter your node: ")
        if n_name not in embs_a or n_name not in embs_b:
            print('%s not found' % n_name)
            continue  # without this, the lookups below raise KeyError

        for embs in embs_groups:
            t_emb = embs[n_name]
            sim_map = {}
            for key in embs:
                sim_map[key] = utils.cossim(t_emb, embs[key])

            sim_map = sorted(sim_map.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
            output_str = '\n'.join(
                [sim_map[i][0] + '\t' + str(sim_map[i][1]) for i in range(10)])
            print(output_str)
            print('group finished\n')
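
load_embeddings is defined elsewhere in the repository. A plausible sketch for a word2vec-style embeddings.txt, one token per line followed by its vector components, skipping a count/dimension header line if present; the exact file layout is an assumption:

import numpy as np


def load_embeddings(emb_f):
    # token -> embedding vector
    embs = {}
    with open(emb_f) as f:
        for line in f:
            parts = line.rstrip('\r\n').split()
            if len(parts) <= 2:
                continue  # likely a '<vocab_size> <dim>' header line
            embs[parts[0]] = np.array([float(x) for x in parts[1:]])
    return embs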
Example #4
import operator
import sys
from os.path import basename, dirname

import utils


def label_emb_centric(folder, c_id):
    print('Start labeling for %s, %s ========================' % (folder, c_id))
    par_folder = dirname(folder)
    cur_label = basename(folder)
    p_case_f = '%s/caseolap.txt' % par_folder
    c_case_f = '%s/caseolap.txt' % folder
    emb_f = '%s/embeddings.txt' % par_folder

    # generate word2vec phrases
    embs = utils.load_embeddings(emb_f)
    if cur_label not in embs:
        print('Error: label %s not found in the embeddings' % cur_label)
        sys.exit(1)
    N = 100
    worst = -100
    bestw = [-100] * (N + 1)
    bestp = [''] * (N + 1)

    for ph in embs:
        sim = utils.cossim(embs[cur_label], embs[ph])
        if sim > worst:
            for i in range(N):
                if sim >= bestw[i]:
                    for j in range(N - 1, i - 1, -1):
                        bestw[j + 1] = bestw[j]
                        bestp[j + 1] = bestp[j]
                    bestw[i] = sim
                    bestp[i] = ph
                    worst = bestw[N - 1]
                    break

    cands = list(zip(bestp, bestw))

    phrase_map_p, cell_map_p, tmp = read_caseolap_result(p_case_f)
    parent_dist_ranking = cell_map_p[c_id]
    parent_dist_map = {ph: float(dist) for (ph, dist) in parent_dist_ranking}
    child_kl_ranking = rank_phrase(c_case_f)
    child_kl_map = {ph: dist for (ph, dist) in child_kl_ranking}
    min_score = 0.12
    label_cands = {}

    # for (ph, score) in parent_dist_ranking:
    for (ph, score) in cands:
        if ph not in parent_dist_map:
            continue

        if ph in child_kl_map:
            continue

        label_cands[ph] = score

    ranked_list = sorted(label_cands.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    print(ranked_list)

    return ranked_list[0][0]
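
A hypothetical invocation, assuming the directory layout the function implies (a child folder named after its label, with caseolap.txt and embeddings.txt in the parent); the path and cluster id below are illustrative only:

# './taxonomy/root/machine_learning' and cluster id '2' are made-up values.
best = label_emb_centric('./taxonomy/root/machine_learning', '2')
print('suggested label:', best)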
Example #5
import operator

import utils


def classify_doc_real(t_emb, target_embs, pmode):
    # Flat (non-hierarchical) assignment: score the document embedding
    # against every target embedding and return the full ranking.
    sim_map = {}
    for key in target_embs:
        if pmode == 'COS':
            sim_map[key] = utils.cossim(t_emb, target_embs[key])
        elif pmode == 'DOT':
            sim_map[key] = utils.dot_product(t_emb, target_embs[key])
    sim_map = sorted(sim_map.items(), key=operator.itemgetter(1), reverse=True)
    return sim_map
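
For instance, ranking one document embedding against a few class embeddings (toy vectors, assuming the cossim helper sketched earlier):

doc_emb = [0.2, 0.9, 0.1]
class_embs = {'sports': [0.1, 1.0, 0.0], 'politics': [0.9, 0.1, 0.2]}
ranking = classify_doc_real(doc_emb, class_embs, pmode='COS')
print(ranking[0][0])  # highest-scoring class; 'sports' for these vectors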
Example #6
import numpy as np


def spacyPhraseSim(p1, p2):
    # TODO: find a more reasonable way to aggregate vector
    processed1 = ' '.join(lemmatize_an_idea(p1))
    processed2 = ' '.join(lemmatize_an_idea(p2))
    tok1 = nlp(processed1)
    tok2 = nlp(processed2)

    # Average per-token vectors into one phrase vector.
    # (.vector is the current spaCy name for the old .repvec attribute.)
    v1 = np.mean([t.vector for t in tok1], axis=0)
    v2 = np.mean([t.vector for t in tok2], axis=0)
    sim = cossim(v1, v2)
    return float(sim)
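
In current spaCy releases the manual averaging is unnecessary: Doc.vector is already the mean of the token vectors, and Doc.similarity computes the cosine between them. A sketch, assuming a model with word vectors such as en_core_web_md is installed:

import spacy

nlp = spacy.load('en_core_web_md')  # needs a model that ships word vectors


def spacy_phrase_sim(p1, p2):
    # Doc.vector averages token vectors; similarity() is their cosine.
    return float(nlp(p1).similarity(nlp(p2)))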
Example #8
import operator

import utils


def expan_round(embs, seeds_map, all_seeds, limit, cate_lim, mode='EMB', pd_map=None):
	# One round of seed expansion: score every non-seed phrase, pick its
	# best label, then admit top candidates subject to per-label limits.
	target_type = 'p'

	multiplier = 5
	thre_softmax = 0.5

	extended_seeds = set()
	candidates = {}

	if mode == 'EMB':
		for phrase in embs[target_type]:
			if phrase in all_seeds:
				continue
			t_emb = embs[target_type][phrase]
			rel_values = {}
			# flat comparison
			for label in seeds_map:
				max_sim = 0
				for seed in seeds_map[label]:
					sim = multiplier * utils.cossim(t_emb, embs[target_type][seed])
					if sim > max_sim:
						max_sim = sim
				rel_values[label] = max_sim

			utils.softmax_for_map(rel_values)
			best_label = sorted(rel_values.items(), key=operator.itemgetter(1), reverse=True)[0][0]
			candidates[best_label + '@' + phrase] = rel_values[best_label]
	
	elif mode == 'DIS':
		pred_label, doc_score = doc_assignment(embs, 'd', 'l', mode='FLAT')
		top_labels = [w.path for w in hier.get_all_nodes()]
		print('Doc Assignment done...')

		label_to_idx = {}
		for idx, label in enumerate(top_labels):
			label_to_idx[label] = idx
		# print uniform_vec
		label_to_doc = {}
		
		for label in top_labels:
			label_to_doc[label] = set()
		for doc, score in doc_score.items():
			label_to_doc[pred_label[doc]].add(doc)
		cnt_vec = [0.0] * len(top_labels)
		for label in label_to_doc:
			cnt_vec[label_to_idx[label]] = len(label_to_doc[label])
		comp_vec = utils.l1_normalize(cnt_vec)

		uniform_vec = [1.0/len(top_labels)] * len(top_labels)
		# print cnt_vec
		# print comp_vec

		for phrase in embs['p']:
			if phrase in all_seeds:
				continue

			p_vec = [0.0] * len(top_labels)

			for doc in pd_map[phrase]:
				idx = label_to_idx[pred_label[doc]]
				p_vec[idx] += 1.0

			max_label_value = 0
			best_label = ''
			best_cnt = 0
			for label in top_labels:
				idx = label_to_idx[label]
				if p_vec[idx] > 0:
					norm_value = p_vec[idx] / cnt_vec[idx]
					if norm_value > max_label_value:
						max_label_value = norm_value
						best_label = label
						best_cnt = p_vec[idx]

			if sum(p_vec) == 0:
				print('Error: phrase %s appears in no assigned documents' % phrase)
				continue
			p_vec = utils.l1_normalize(p_vec)
			# KL divergence from uniform: larger when the phrase concentrates
			# in a few labels.
			kl = utils.kl_divergence(p_vec, uniform_vec)

			candidates[best_label + '@' + phrase] = kl * max_label_value

	candidates = sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)

	added = 0
	added_cates = {}
	for (cand, score) in candidates:
		label, phrase = cand.split('@')
		if label not in added_cates:
			added_cates[label] = 0
		if added_cates[label] >= cate_lim:
			continue
		if len(seeds_map[label]) >= 3:
			continue
		extended_seeds.add(cand)
		added_cates[label] += 1
		added += 1
		if added >= limit:
			break

	print('extended: ' + str(extended_seeds))
	return extended_seeds
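
Three more utils helpers are assumed above. Plausible minimal versions — guesses at the project's actual implementations, shown only to make the control flow concrete:

import math


def softmax_for_map(m):
    # In-place softmax over a dict's values (max-shifted for stability).
    mx = max(m.values())
    total = sum(math.exp(v - mx) for v in m.values())
    for k in m:
        m[k] = math.exp(m[k] - mx) / total


def l1_normalize(vec):
    # Scale entries to sum to 1; leave an all-zero vector unchanged.
    s = float(sum(vec))
    return [v / s for v in vec] if s else list(vec)


def kl_divergence(p, q):
    # KL(p || q); terms with p_i == 0 contribute nothing.
    return sum(pi * math.log(pi / qi) for pi, qi in zip(p, q) if pi > 0)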
Example #9
import numpy as np


def gloveSim(tokens1, tokens2):
    vec1 = vector_space.vec_for_sentence(tokens1)
    vec2 = vector_space.vec_for_sentence(tokens2)
    # NaNs mean at least one sentence had no in-vocabulary tokens;
    # return a sentinel score instead of propagating NaN.
    if np.isnan(np.sum(vec1)) or np.isnan(np.sum(vec2)):
        return -5000
    return float(cossim(vec1, vec2))
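
vector_space.vec_for_sentence isn't shown either. A plausible sketch averages the GloVe vectors of in-vocabulary tokens and yields NaNs when nothing is in vocabulary, which is what the guard above checks for; the glove_vectors dict and the 300-dimension size are assumptions:

import numpy as np

glove_vectors = {}  # token -> np.ndarray, loaded from a GloVe file elsewhere


def vec_for_sentence(tokens):
    # Mean of the vectors of known tokens; all-NaN if none are known.
    vecs = [glove_vectors[t] for t in tokens if t in glove_vectors]
    return np.mean(vecs, axis=0) if vecs else np.full(300, np.nan)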
Example #10

import math
import operator
from os.path import basename, dirname, exists

import utils


def get_rep(folder, c_id, N):
    print(
        'Start getting representative phrases for %s, %s ========================'
        % (folder, c_id))
    par_folder = dirname(folder)
    cur_label = basename(folder)

    result_phrases = [cur_label]

    ph_f = '%s/caseolap.txt' % par_folder
    if exists(ph_f):
        kw_clus_f = '%s/cluster_keywords.txt' % par_folder
        kws = set()
        with open(kw_clus_f) as f:
            for line in f:
                clus_id, ph = line.strip('\r\n').split('\t')
                if clus_id == c_id:
                    kws.add(ph)
        emb_f = '%s/embeddings.txt' % par_folder
        embs = utils.load_embeddings(emb_f)

        # print len(kws)
        phrase_map_p, cell_map_p, tmp = read_caseolap_result(ph_f)
        parent_dist_ranking = cell_map_p[c_id]

        ph_scores = {}
        for (ph, score) in parent_dist_ranking:
            if ph not in kws:
                continue
            emb_dist = utils.cossim(embs[ph], embs[cur_label])
            ph_scores[ph] = score * emb_dist

        ph_scores = sorted(ph_scores.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

        for (ph, score) in ph_scores:
            if ph not in result_phrases and ph in kws:
                result_phrases.append(ph)
            if len(result_phrases) >= N:
                break

    elif ph_idf is None:

        print('looking at embeddings for %s' % folder)

        ph_f = '%s/embeddings.txt' % par_folder
        kw_f = '%s/keywords.txt' % par_folder
        keywords = set()
        with open(kw_f) as f:
            for line in f:
                keywords.add(line.strip('\r\n'))

        embs = utils.load_embeddings(ph_f)
        tmp_embs = {}
        for k in keywords:
            if k in embs:
                tmp_embs[k] = embs[k]
        embs = tmp_embs

        worst = -100
        bestw = [-100] * (N + 1)
        bestp = [''] * (N + 1)

        for ph in embs:
            sim = utils.cossim(embs[cur_label], embs[ph])
            if sim > worst:
                for i in range(N):
                    if sim >= bestw[i]:
                        for j in range(N - 1, i - 1, -1):
                            bestw[j + 1] = bestw[j]
                            bestp[j + 1] = bestp[j]
                        bestw[i] = sim
                        bestp[i] = ph
                        worst = bestw[N - 1]
                        break

        for ph in bestp[:N]:
            if ph not in result_phrases:
                result_phrases.append(ph)
    else:
        # Using TF-IDF to generate
        print('looking at tf-idf for %s' % folder)
        d_clus_f = '%s/paper_cluster.txt' % par_folder
        kw_clus_f = '%s/cluster_keywords.txt' % par_folder
        docs = []
        kws = set()
        with open(d_clus_f) as f:
            for line in f:
                doc_id, clus_id = line.strip('\r\n').split('\t')
                if clus_id == c_id:
                    docs.append(doc_id)
        with open(kw_clus_f) as f:
            for line in f:
                clus_id, ph = line.strip('\r\n').split('\t')
                if clus_id == c_id:
                    kws.add(ph)
        ph_scores = {x: 0 for x in ph_idf}
        for d in docs:
            if d in doc_to_ph:
                for ph in doc_to_ph[d]:
                    ph_scores[ph] += 1

        for ph in ph_scores:
            if ph_scores[ph] == 0:
                continue
            if ph not in kws:
                ph_scores[ph] = 0
                continue
            ph_scores[ph] = 1 + math.log(ph_scores[ph])
            ph_scores[ph] *= ph_idf[ph]
        ph_scores = sorted(ph_scores.items(),
                           key=operator.itemgetter(1),
                           reverse=True)

        for (ph, score) in ph_scores:
            if ph not in result_phrases:
                result_phrases.append(ph)
            if len(result_phrases) > N:
                break

    # print result_phrases
    return result_phrases
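
A hypothetical call, mirroring the labeling example earlier; the path and cluster id are illustrative:

reps = get_rep('./taxonomy/root/machine_learning', '2', 10)
print(reps)  # the label itself first, then representative phrases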
Example #11
def spacySim(word1, word2):
    # Compare the single-token vectors of the two words.
    tok1 = nlp(word1)[0]
    tok2 = nlp(word2)[0]
    sim = cossim(tok1.vector, tok2.vector)
    return float(sim)
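
Assuming nlp comes from a vectors-equipped spaCy model and cossim is the helper sketched earlier, usage is direct:

print(spacySim('dog', 'cat'))     # semantically close words score high
print(spacySim('dog', 'sonata'))  # unrelated words score much lower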