Example #1
    def multilayer_noun_based_network(self):
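        # Build one weighted sentence-similarity network per inter-document factor in
        # self.inter_edge: sentence pairs with cosine similarity > 0 become edges, and
        # cross-document edges are scaled by the factor. Each network is then pruned once
        # per threshold in self.limiar_mln, and all (full, pruned) pairs are returned
        # together with the midpoint of the observed similarities as a global threshold.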
        print('MLN-Noun')
        network_size = len(self.document_data[0])
        document_sentences = self.document_data[0]
        only_auxiliar = Graph.Full(network_size)
        all_edges = only_auxiliar.get_edgelist()

        network_edges = []
        auxiliar_list = []
        weight_list = []
        for i in range(len(self.inter_edge)):
            weight_list.append([])


        for i in all_edges:
            index1 = i[0]
            index2 = i[1]
            similarity = cosineSimilarity(document_sentences[index1][0], document_sentences[index2][0])
            belong_same_document = document_sentences[index1][1] == document_sentences[index2][1]

            if similarity > 0:
                network_edges.append((index1, index2))
                auxiliar_list.append(similarity)

                if belong_same_document:
                    for index, j in enumerate(self.inter_edge):
                        weight_list[index].append(similarity)
                else:
                    for index, j in enumerate(self.inter_edge):
                        weight_list[index].append(similarity * j)  # self.inter_edge, e.g. [1.7, 1.9]

        networks = []


        for i in weight_list:
            for j in self.limiar_mln: # [0.1, 0.15, 0.2]
                network = Graph()
                network.add_vertices(network_size)
                network.add_edges(network_edges)
                network.es['weight'] = i
                auxiliar_network = self.remove_edges_for_mln(network, j)
                pair = (network, auxiliar_network)
                networks.append(pair)
        threshold = (max(auxiliar_list) + min(auxiliar_list)) / 2
        return (networks , threshold)
Example #2
    def getSupFeats(self):
        sup_feature_dict = {}
        sup1, sup2 = [ev.supArgs for ev in self.events]
        for c1, c2 in itertools.product(sup1.keys(), sup2.keys()):
            align = "%s-%s" % (c1, c2)
            sim = round(utils.cosineSimilarity(sup1[c1], sup2[c2]), 3)
            if sim:
                sup_feature_dict[align] = sim
        return sup_feature_dict
Example #3
    def getCfsimFeats(self, cf1, cf2, get_contributors=False, cf_threshold=0):
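        # Align every argument slot of cf1 with every slot of cf2, score each alignment by
        # the cosine similarity of its fillers, keep non-zero scores, and accumulate marginal
        # scores per slot ("c1-_", "_-c2"); optionally also collect the contributing terms.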
        cfsim_feature_dict, cfsim_contributors = defaultdict(float), {}
        for c1, c2 in itertools.product(cf1.args.keys(), cf2.args.keys()):
            align = "%s-%s" % (c1, c2)
            if get_contributors:
                align_sim, contributors = utils.cosineSimilarity(cf1.args[c1], cf2.args[c2], get_contributors=True, threshold=cf_threshold)
            else:
                align_sim = utils.cosineSimilarity(cf1.args[c1], cf2.args[c2], threshold=cf_threshold)

            align_sim = round(align_sim, 3)
            if align_sim:
                cfsim_feature_dict[align] = align_sim 
                cfsim_feature_dict["%s-_" % c1] += align_sim
                cfsim_feature_dict["_-%s" % c2] += align_sim
                if get_contributors:
                    cfsim_contributors[align] = contributors

        cfsim_scores = {align: round(cfsim, 3) for align, cfsim in cfsim_feature_dict.items()}

        return cfsim_scores, cfsim_contributors
Example #4
    def noun_based_network(self):
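        # Link every pair of sentences that share at least one element (via has_common_elements);
        # edge weights are the shared-element counts, and the returned threshold is the
        # midpoint of the pairwise cosine similarities.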
        #print "creando red de sustantivos"
        network_size = len(self.document_data[0])
        document_sentences = self.document_data[0]



        only_auxiliar = Graph.Full(network_size)
        all_edges  = only_auxiliar.get_edgelist()

        network = Graph()
        network.add_vertices(network_size)
        network_edges =[]
        weight_list = []
        cosine_sim_list = []
        for i in all_edges:
            index1 = i[0]
            index2 = i[1]
            common_elements = has_common_elements(document_sentences[index1][0], document_sentences[index2][0])
            if common_elements > 0:
                network_edges.append((index1, index2))
                weight_list.append(common_elements)
                cosine = cosineSimilarity(document_sentences[index1][0], document_sentences[index2][0])
                cosine_sim_list.append(cosine)

        network.add_edges(network_edges)
        network.es['weight'] = weight_list
        threshold = (max(cosine_sim_list) + min(cosine_sim_list)) / 2  # NOTE: problems for English
        # For MDS the threshold is None; for noun networks it must be computed here, during generation.
        return ([network], threshold)
Example #5
def search(query):
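    # Build a TF-IDF vector for the query over the inverted-index vocabulary, then score
    # every document whose postings contain a query term by cosine similarity against its
    # stored TF-IDF vector.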
    dataset = {}
    tfidf = [0] * len(inverdIndex)
    cos_sim = {}

    doc_listwords = []
    for word in utils.removeSymbols(query.lower()).split():
        if word not in stopwords and utils.isNotEmpty(word):
            doc_listwords.append(word)
    dataset = Counter(doc_listwords)

    for id, word in enumerate(inverdIndex.keys()):
        if word in dataset:
            tfidf[id] = dataset.get(word, 0) * inv_frec_vector[id]

    for word in dataset.keys():
        if word in inverdIndex:  # if the word is in the inverted index
            for key in inverdIndex.get(word):
                if key not in cos_sim:
                    cos_sim[key] = utils.cosineSimilarity(tfidf, allTfidf[key])
    return cos_sim
Example #6
with open("data.csv", "r") as ins:
    for line in ins:
        arr = line.split(",")
        data[arr[0].split('\'')[1]] = arr[1].split('\'')[1]

questions = list(data.keys())
docTFIDFs = json.load(open("stack-tfidf.json"))

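# Interactive loop: read a question, compute its TF-IDF vector against the stored
# questions, and report the stored question (and its answer) most cosine-similar to it.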
while True:
    query = input("Please enter question: ")
    print("\n")
    maxSimilarity = -1
    bestQuestion = ""
    print("-----------------------")
    print(
        "Calculating tfidf for the query with all questions as reference ...")
    queryTFIDF = utils.getTFIDF(query, questions)
    print(
        "Calling cosine similarity between all the questions to find best match ..."
    )
    for i in range(len(questions)):
        question = questions[i]
        similarity = utils.cosineSimilarity(queryTFIDF, docTFIDFs[question])
        if similarity > maxSimilarity:
            print(similarity)
            maxSimilarity = similarity
            bestQuestion = question
    print("Best question match : " + bestQuestion)
    print("Max similarity score : " + str(maxSimilarity))
    print("Best answer : " + data[bestQuestion])
Example #7
def get_max_similarity(sentence, extractos):
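    # Return the highest cosine similarity between the sentence and any of the extracts.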
    similarities = []
    for i in extractos:
        similarities.append(cosineSimilarity(sentence.split(), i.split()))
    return max(similarities)
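All of these snippets call a project-local cosineSimilarity helper whose implementation is not shown on this page, and the signatures differ from project to project (token lists, TF-IDF vectors, embedding vectors, extra threshold arguments). As a rough orientation only, a minimal bag-of-words sketch matching the token-list usage in Example #7 might look like this (the helper name and behavior here are assumptions, not the original code):

from collections import Counter
from math import sqrt

def cosineSimilarity(tokens1, tokens2):
    # Hypothetical sketch: term-frequency vectors over the two token lists.
    c1, c2 = Counter(tokens1), Counter(tokens2)
    dot = sum(c1[t] * c2[t] for t in c1.keys() & c2.keys())
    norm1 = sqrt(sum(v * v for v in c1.values()))
    norm2 = sqrt(sum(v * v for v in c2.values()))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0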
Example #8
    return res


feature = getFeatureVector(processed)

author = ['Bryant Zhou']
recommended = []

with open('processed.json') as f:
    paper2author = json.load(f)
    # print(paper2author)
    length = len(paper2author)
    newIdx = length
    paper2author[validateTitle] = {}
    paper2author[validateTitle]['author'] = author
    paper2author[validateTitle]['feature'] = feature
    paper2author[validateTitle]['processed'] = processed
    paper2author[validateTitle]['similarity'] = [float('inf')] * (length + 1)
    paper2author[validateTitle]['index'] = newIdx

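    # Compare the new paper's feature vector with every stored paper, record the cosine
    # similarity in both papers' similarity lists, and recommend every paper that is not
    # identical to the new one (similarity < 1).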
    for key in paper2author:
        print(paper2author[key]['feature'])
        print(f'new: {feature}')
        sim = cosineSimilarity(paper2author[key]['feature'], feature)
        paper2author[key]['similarity'].append(sim)
        idx = paper2author[key]['index']
        paper2author[validateTitle]['similarity'][idx] = sim
        if sim < 1:
            recommended.append((key, paper2author[key]['author']))

print(recommended)
Example #9
def computeScores(data, tag, vectors, ferr, params):
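	# For each document, build PCA and weighted-PCA subspaces plus (weighted) average vectors
	# from the candidate entities' embeddings, then score every mention candidate against them
	# and against its prominence-rank baseline, collecting per-query features and labels.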
	queryId2Mention = {}; mention2QueryId = {}; qid = 1
	isWeighted = params['weight']; isMeanCentered = params['meanCenter']; embeddingType = params['embeddingType']
	num_cands = params['numCands']; num_components = params['ncomp']
	tpca = 0.0; twpca = 0.0
	trueCandMentions = False

	key = []; cand_names = []; hard2beat_baseline = []; avg_baseline = []; wavg_baseline = []; agw_pca = []; agw_wpca = []; labels = []

	for doc_name in data:
		doc_candidates = []; doc_weight_array = []
		doc_entity_candidates = []
		for mention_dict in data[doc_name]:
			mention_name = mention_dict["mention"]
			if 'tabel' in tag:
				position = str(mention_dict["row"])+str(mention_dict["col"])
			else:
				position = mention_dict["posI"]
			true_entity_id = mention_dict["wikidata_id"]
			isDifficult = mention_dict["difficulty"]
			if str(true_entity_id) == '-1':
				ferr.write("[Wikipedia Page for True-Entity has no Wikidata Mapping]: Skip this mention: "+doc_name+" "+mention_name+"\n")
				continue

			if "candidates" in mention_dict:
				candidate_tuples = []
				temp_candidates = []; weight_array = []
				flag = -1; cand_pos = 1

				for cand in mention_dict["candidates"]:
					cand_name = cand[0]
					prominence_score = 1/float(cand_pos)
					try:
						entity_vector = vectors[cand_name]
					except KeyError:
						ferr.write("[Missing Embedding] Skipping candidates that do not have pre-trained entity embeddings: "+doc_name+" "+mention_name+" "+cand_name+"\n")
						continue

					# candidates used for constructing the grassmannian subspace
					candidate_tuples.append((cand_name, prominence_score))
					temp_candidates.append(cand_name); weight_array.append(prominence_score)

					# check if the true entity was found in the candidates
					#if trueCandMentions:
					if cand_name == true_entity_id:
						flag = 0

					cand_pos += 1
					# restricting the data to only top-num_cands candidates per mention
					if num_cands != -1 and len(temp_candidates) >= num_cands:
						break

				# if the true entity is not present in the candidates, ignore this mention
				if trueCandMentions and flag == -1:
					ferr.write("[Missing True Entity] Skipping mentions without a true entity in the candidates: "+doc_name+" "+mention_name+"\n")
					mention2QueryId[(doc_name,mention_name,position)] = (-1,-1)
					continue
				else:
					if flag == -1:
						mention2QueryId[(doc_name,mention_name,position)] = (-1,-1)
					else:
						if (doc_name,mention_name,position) not in mention2QueryId:
							mention2QueryId[(doc_name,mention_name,position)] = (qid,int(isDifficult))
							queryId2Mention[qid] = (doc_name,mention_name,position)
							qid+=1

					doc_candidates += temp_candidates; doc_weight_array += weight_array
					doc_entity_candidates.append((true_entity_id, mention_name, position, candidate_tuples))

			else: # if there are no candidates, ignore this mention
				ferr.write("[Missing Candidates] Skipping mentions with no candidates: "+doc_name+" "+mention_name+"\n")
				mention2QueryId[(doc_name,mention_name,position)] = (-1,-1)

		if len(doc_entity_candidates) == 0:
			ferr.write("[Skip Document] No true entity in the document"+doc_name+"\n")
			continue

		uniform_weights = list(np.ones(len(doc_candidates)))
		tpca_start = time.clock()
		subspace, sinV, _ = utils.constructRepresentation(doc_candidates, uniform_weights, vectors, 'pca', isMeanCentered, num_components, (doc_name,))
		tpca_end = time.clock()
		tpca += tpca_end - tpca_start
		avgSubspace = utils.constructRepresentation(doc_candidates, uniform_weights, vectors, 'avg', debugInfo=(doc_name,))

	
		twpca_start = time.clock()
		subspace_weighted, sinV_weighted, _ = utils.constructRepresentation(doc_candidates, doc_weight_array, vectors, 'wpca', isMeanCentered, num_components, (doc_name,))
		twpca_end = time.clock()
		twpca += twpca_end - twpca_start
		weighted_avgSubspace = utils.constructRepresentation(doc_candidates, doc_weight_array, vectors, 'avg', debugInfo=(doc_name,))

		for (true_entity, mention, position, candidates) in doc_entity_candidates:
			queryId, isDifficult = mention2QueryId[(doc_name,mention,position)]
			if queryId != -1:
				for candidate in candidates:
					candidate_id = candidate[0]
					simProminence = float(candidate[1])

					entity_vector = vectors[candidate_id]/np.linalg.norm(vectors[candidate_id])

					tpca_start = time.clock()	
					if isMeanCentered:
						simPCA = utils.computeVecSubspaceSimilarity(entity_vector - avgSubspace, subspace, sinV, isWeighted)
					else:
						simPCA = utils.computeVecSubspaceSimilarity(entity_vector, subspace, sinV, isWeighted)
					tpca_end = time.clock()
					tpca += tpca_end - tpca_start
					simAvg = utils.cosineSimilarity(entity_vector, avgSubspace)
					
					twpca_start = time.clock()
					if isMeanCentered:
						simWPCA = utils.computeVecSubspaceSimilarity(entity_vector - avgSubspace, subspace_weighted, sinV_weighted, isWeighted)
					else:
						simWPCA = utils.computeVecSubspaceSimilarity(entity_vector, subspace_weighted, sinV_weighted, isWeighted)
					twpca_end = time.clock()
					twpca += twpca_end - twpca_start
					simWAvg = utils.cosineSimilarity(entity_vector, weighted_avgSubspace)

					if candidate_id == true_entity:
						label = 1
					else:
						label = 0

					key.append("qid:"+str(queryId)); cand_names.append(candidate_id); hard2beat_baseline.append(simProminence); avg_baseline.append(simAvg); wavg_baseline.append(simWAvg); agw_pca.append(simPCA); agw_wpca.append(simWPCA); labels.append(label)
	return key, cand_names, hard2beat_baseline, avg_baseline, wavg_baseline, agw_pca, agw_wpca, labels, mention2QueryId, queryId2Mention, tpca, twpca
Example #10
# train the model and get the feature vector for each paper
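# NOTE: this uses the pre-4.0 gensim API; gensim 4 renames size -> vector_size and
# word vectors are read as model.wv[word] instead of model[word].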
model = models.Word2Vec(sentences, min_count=1, size=7, window=2)
for key in paper2author:
  vector = []
  for stemmedWord in paper2author[key]['processed']:
    vector.append(model[stemmedWord])
  paper2author[key]['feature'] = vector 
print('trained done')
# get cosSim for each pair
for key1 in paper2author:
  for key2 in paper2author:
    paper1, paper2 = paper2author[key1], paper2author[key2]
    index1, index2 = paper1['index'], paper2['index']
    if index1 >= index2: continue    #  only compare when index1 < index2 to minimize comparisons
    try: 
      cosSim = cosineSimilarity(paper1['feature'], paper2['feature'])
      paper1['similarity'][index2] = paper2['similarity'][index1] = cosSim
    except:  # some edge case has research paper title with only number
      continue
print('loop done')
model.save('model.bin')
for key in paper2author:
  for i, arr in enumerate(paper2author[key]['feature']):
    paper2author[key]['feature'][i] = paper2author[key]['feature'][i].tolist()
print(paper2author)
with open('processed.json', 'w') as f:
  json.dump(paper2author, f, indent=4)

###########  Ex: paper2author:  ###########
# { 'Machine Translation Demonstration': {'author': ['Ulrike Schwall'],
#                                         'processed': ['lmt', '-', 'machin', 'translat', 'demonstr'],