Example #1
from collections import OrderedDict


def getlcscost(n, m):
    """Similarity cost between strings n and m: LCSR divided by Editex distance.

    Relies on an external helper module `nlp` that provides lcs() and editex().
    """
    # Longest-common-subsequence ratio (LCSR).
    lcs_n_m = len(nlp.lcs(n, m))
    max_length = max(len(n), len(m))
    lcsr_n_m = float(lcs_n_m) / float(max_length)
    # Editex distance over the strings with duplicate characters dropped
    # (OrderedDict.fromkeys keeps the first occurrence of each character).
    edit_n_m = nlp.editex("".join(OrderedDict.fromkeys(n)),
                          "".join(OrderedDict.fromkeys(m)))
    if edit_n_m != 0:
        return lcsr_n_m / edit_n_m
    return 0
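For a self-contained illustration of this LCSR-over-edit-distance cost, here is a minimal sketch in which difflib's SequenceMatcher stands in for nlp.lcs and a plain Levenshtein distance stands in for nlp.editex; both stand-ins are assumptions for demonstration, not the real Editex metric:

# Minimal sketch of the LCSR / edit-distance cost with stand-in helpers.
from collections import OrderedDict
from difflib import SequenceMatcher


def _match_len(a, b):
    # Stand-in for nlp.lcs: total size of SequenceMatcher's matching blocks.
    return sum(block.size for block in SequenceMatcher(None, a, b).get_matching_blocks())


def _levenshtein(a, b):
    # Stand-in for nlp.editex: plain Levenshtein edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[len(b)]


def lcs_cost(n, m):
    lcsr = _match_len(n, m) / float(max(len(n), len(m)))
    dist = _levenshtein("".join(OrderedDict.fromkeys(n)),
                        "".join(OrderedDict.fromkeys(m)))
    return lcsr / dist if dist else 0


print(lcs_cost("gr8", "great"))  # ~0.13 with these stand-ins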
Example #2
    def run(self):
        print("Created random-walk thread")
        node_len = len(self.node_list)
        hit_matrix = np.zeros(node_len)
        r_matrix = np.zeros(node_len)
        cost_matrix = np.zeros(node_len)
        node_index = self.node_index
        for i in range(0, (STEPS_VALUE // 2) + 1):
            start_node_index = node_index
            source_node_index = node_index
            P = self.P_arr[i]
            # P = np.ma.masked_array(P, self.A_mask.mask)
            # np.ma.set_fill_value(P, 0.)
            # P = P.filled()
            hits = 0
            # Walk greedily until a clean WordNode is reached or MAX_HITS runs out.
            while (type(self.node_list[source_node_index]) is ContextNode) \
                    or (type(self.node_list[source_node_index]) is WordNode
                        and self.node_list[source_node_index].isNoisy) \
                    or (hits < MAX_HITS):
                hits += 1
                row_array = P[source_node_index, None, :]
                # Zero the start column so the walk cannot jump straight back.
                row_array[0, start_node_index] = 0
                source_node_index = np.argmax(row_array)
                if row_array[0, source_node_index] == 0:
                    break  # no probability mass left: dead end
                if (type(self.node_list[source_node_index]) is WordNode
                        and not self.node_list[source_node_index].isNoisy) \
                        or (hits >= MAX_HITS):
                    break
            r_matrix[source_node_index] += 1
            hit_matrix[source_node_index] = hits
        # Hit ratio per node; 0/0 entries become NaN and are zeroed below.
        with np.errstate(divide='ignore', invalid='ignore'):
            H_matrix = np.true_divide(hit_matrix, r_matrix)
        H_matrix[np.isnan(H_matrix)] = 0.
        # Normalize H so its entries sum to 1.
        total = H_matrix.sum()
        if total != 0:
            H_matrix = H_matrix / total

        # Collect the indices of the word nodes only.
        word_node_list = [i for i in range(len(self.node_list))
                          if type(self.node_list[i]) is WordNode]

        # Build the cost vector: hit ratio plus LCSR/Editex string similarity.
        for j in word_node_list:
            if node_index != j:
                n = str(self.node_list[node_index])
                m = str(self.node_list[j])
                lcs_n_m = len(nlp.lcs(n, m))
                max_length = max(len(n), len(m))
                lcsr_n_m = float(lcs_n_m) / float(max_length)
                edit_n_m = nlp.editex("".join(OrderedDict.fromkeys(n)),
                                      "".join(OrderedDict.fromkeys(m)))
                if edit_n_m != 0:
                    sim_cost_n_m = lcsr_n_m / edit_n_m
                    cost_matrix[j] = float(H_matrix[j] + sim_cost_n_m)

        # For a noisy word node, keep the MAX_WORDS best-scoring clean words.
        if type(self.node_list[node_index]) is WordNode and self.node_list[node_index].isNoisy:
            self.final_word_map[str(self.node_list[node_index])] = []
            ranked = np.asarray(np.argsort(cost_matrix)).reshape(-1)[::-1]
            for word_index in range(min(MAX_WORDS, len(ranked))):
                candidate = ranked[word_index]
                if type(self.node_list[candidate]) is WordNode and not self.node_list[candidate].isNoisy:
                    self.final_word_map[str(self.node_list[node_index])].append(
                        (str(self.node_list[candidate]), cost_matrix[candidate]))
        print(self.final_word_map)
        end_time = datetime.now()
        time_delta = end_time - self.start_time
        print("Total time taken: " + str(time_delta.seconds) + "s")
Example #3
def randomwalk(B, X, Y):
    """Random-walk implementation.

    Runs bounded greedy random walks over the graph and maps each noisy
    word node to its most similar clean word nodes.

    Arguments:
        B {networkx.Graph} -- the graph built over the word and context nodes
        X {list of WordNode} -- the word nodes
        Y {list of ContextNode} -- the context nodes
    """
    # Create the transition probabilities by row-normalizing the adjacency matrix.
    A = nx.to_numpy_array(B)
    node_list = list(B.nodes())
    node_len = len(node_list)
    hit_matrix = np.zeros((node_len, node_len))
    r_matrix = np.zeros((node_len, node_len))
    cost_matrix = np.zeros((node_len, node_len))
    for i in range(node_len):
        total = A[i].sum()
        if total != 0:
            A[i] = A[i] / total

    # Mask out entries that are zero in A so that powers of A cannot
    # introduce transitions the graph does not have.
    A_mask = np.ma.masked_where(A == 0., A)

    for node_index in range(node_len):
        if type(node_list[node_index]) is WordNode and node_list[node_index].isNoisy:
            for i in range(1, STEPS_VALUE + 1, 2):
                start_node_index = node_index
                source_node_index = node_index
                P = np.linalg.matrix_power(A, i)
                P = np.ma.masked_array(P, A_mask.mask)
                np.ma.set_fill_value(P, 0.)
                P = P.filled()
                hits = 0
                print("STEP " + str(i))
                print(node_list[start_node_index])
                # Walk greedily until a clean WordNode is reached or MAX_HITS runs out.
                while (type(node_list[source_node_index]) is ContextNode) \
                        or (type(node_list[source_node_index]) is WordNode
                            and node_list[source_node_index].isNoisy) \
                        or (hits < MAX_HITS):
                    hits += 1
                    row_array = P[source_node_index, None, :]
                    # Zero the start column so the walk cannot jump straight back.
                    row_array[0, start_node_index] = 0
                    source_node_index = np.argmax(row_array)
                    if row_array[0, source_node_index] == 0:
                        print("Nowhere to go")
                        break
                    print("->")
                    print(node_list[source_node_index])
                    if (type(node_list[source_node_index]) is WordNode
                            and not node_list[source_node_index].isNoisy) \
                            or (hits >= MAX_HITS):
                        break
                print("STEP done")
                r_matrix[start_node_index, source_node_index] += 1
                hit_matrix[start_node_index, source_node_index] = hits
    # Hit ratio per (start, end) pair; 0/0 entries become NaN and are zeroed.
    with np.errstate(divide='ignore', invalid='ignore'):
        H_matrix = np.true_divide(hit_matrix, r_matrix)
    H_matrix[np.isnan(H_matrix)] = 0.
    print("==========Final H Matrix===========")
    print(H_matrix)
    # Row-normalize H.
    for i in range(node_len):
        total = H_matrix[i].sum()
        if total != 0:
            H_matrix[i] = H_matrix[i] / total
    print(H_matrix)
    final_word_map = {}

    # Collect the indices of the word nodes only.
    word_node_list = [i for i in range(node_len)
                      if type(node_list[i]) is WordNode]

    # Build the cost matrix: hit ratio plus LCSR/Editex string similarity.
    for i in word_node_list:
        for j in word_node_list:
            if i != j:
                n = str(node_list[i])
                m = str(node_list[j])
                lcs_n_m = len(nlp.lcs(n, m))
                max_length = max(len(n), len(m))
                lcsr_n_m = float(lcs_n_m) / float(max_length)
                edit_n_m = nlp.editex("".join(OrderedDict.fromkeys(n)),
                                      "".join(OrderedDict.fromkeys(m)))
                if edit_n_m != 0:
                    sim_cost_n_m = lcsr_n_m / edit_n_m
                    print(sim_cost_n_m)
                    cost_matrix[i, j] = float(H_matrix[i, j] + sim_cost_n_m)
    print("===========Final Cost Matrix================")
    print(cost_matrix)

    # For each noisy word node, keep the MAX_WORDS best-scoring clean words.
    for node_index in range(node_len):
        if type(node_list[node_index]) is WordNode and node_list[node_index].isNoisy:
            final_word_map[str(node_list[node_index])] = []
            ranked = np.asarray(np.argsort(cost_matrix[node_index])).reshape(-1)[::-1]
            for word_index in range(min(MAX_WORDS, len(ranked))):
                candidate = ranked[word_index]
                if type(node_list[candidate]) is WordNode and not node_list[candidate].isNoisy:
                    final_word_map[str(node_list[node_index])].append(
                        (str(node_list[candidate]), cost_matrix[node_index, candidate]))
    print(final_word_map)
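The H matrix step in both walk variants divides hit counts by visit counts, where 0/0 entries come out as NaN and must be zeroed before normalizing; a standalone sketch of that normalization on made-up counts:

import numpy as np

hit_matrix = np.array([[4., 0.], [2., 6.]])
r_matrix = np.array([[2., 0.], [1., 3.]])

# 0/0 produces NaN; suppress the warning and zero those entries afterwards.
with np.errstate(divide='ignore', invalid='ignore'):
    H = np.true_divide(hit_matrix, r_matrix)
H[np.isnan(H)] = 0.

# Row-normalize so each row sums to 1 (rows that are all zero stay zero).
row_sums = H.sum(axis=1, keepdims=True)
H = np.divide(H, row_sums, out=np.zeros_like(H), where=row_sums != 0)
print(H)  # [[1. 0.] [0.5 0.5]]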