Example #1
import sys
import numpy as np

def custom_kernel(X, Y):
	#Gram-matrix callback: each instance carries, in its 0th feature, an index into
	#the global custom_data_points list, so the kernel can recover the original
	#(non-vector) data behind every training/test point.
	global custom_data_points

	retmat = np.zeros( (len(X), len(Y)) )
	for i in range(len(X)):
		for j in range(len(Y)):
			e1A, e2A, labelA, featureA = custom_data_points[ int(X[i][0]) ]
			e1B, e2B, labelB, featureB = custom_data_points[ int(Y[j][0]) ]
			obj1 = get_doc_obj(e1A, e2A)
			obj2 = get_doc_obj(e1B, e2B)

			sim = sdpKernel.SPDK(e1A, e2A, obj1, e1B, e2B, obj2)
			#alternative kernels, kept for reference:
			#sim = custom_linear_kernel(e1A, e2A, e1B, e2B, featureA, featureB)
			#sim = graphKernel(e1A, e2A, e1B, e2B)
			retmat[i,j] = sim
		### for j
		if (int(i*100.0/len(X)) + 1) % 10 == 0:	#parenthesized; the old "int(...)+1 % 10 == 0" could never be true
			print >>sys.stderr, "custom_kernel computation progress ", i*100.0/len(X), "%"
	### for i
	return retmat
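
A minimal usage sketch for the index trick above, assuming scikit-learn's SVC with a callable kernel and an already-populated custom_data_points list (the variable names and the SVC harness are illustrative, not shown in the source): each point is reduced to a one-column vector holding its index, and SVC calls custom_kernel to build the Gram matrix.

from sklearn.svm import SVC
import numpy as np

#assumes custom_data_points = [(e1, e2, label, features), ...] is already populated
train_idx = np.arange(len(custom_data_points), dtype=float).reshape(-1, 1)
labels    = [lbl for (_, _, lbl, _) in custom_data_points]

clf = SVC(kernel=custom_kernel)	#sklearn hands (X, Y) index matrices to the callable
clf.fit(train_idx, labels)	#Gram matrix is computed via sdpKernel.SPDK under the hood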
Example #2
import sys

def compute_metrics(test_points, classifier_predictions, relation_type, gold_relations):
	#test_points            = list of (entity1, entity2, true label, features) tuples
	#classifier_predictions = (corresponding) list of predicted labels
	global dump_predictions

	predictedRelations = set()
	actualRelations    = set()
	for (eid1, eid2, documentId, rellabel) in gold_relations:
		if rellabel == relation_type:
			actualRelations.add( (eid1, eid2, documentId) )

	classifier_predictions = apply_semantic_constraints(test_points, relation_type, classifier_predictions)

	ofh = open("tmp.predictions." + relation_type, "w")
	for idx, predicted_label in enumerate(classifier_predictions):
		e1, e2, true_label, features = test_points[idx]
		print >>ofh, true_label, predicted_label, "|", e1.get_display(), e2.get_display(), "|", get_doc_obj(e1,e2).rawText[e1.sentenceId]

		if predicted_label == relation_type:
			predictedRelations.add( (e1.entityId, e2.entityId, e1.documentId) )
			if e1.documentId not in dump_predictions: dump_predictions[e1.documentId] = []
			dump_predictions[e1.documentId].append( (e1, e2, relation_type) )
	ofh.close()

	correctRelations = predictedRelations.intersection(actualRelations)
	falseNegatives   = actualRelations - predictedRelations

	print "metrics for ", relation_type, " with ", len(test_points), " test points"
	print "metrics: actual ", len(actualRelations), actualRelations
	print "metrics: predicted", len(predictedRelations), predictedRelations
	print "metrics: false negatives", len(falseNegatives), falseNegatives
	print "metrics: correct ", len(correctRelations), correctRelations

	precision, recall, f1 = -1, -1, -1	#-1 flags "undefined" when a set below is empty
	if len(predictedRelations) > 0 and len(actualRelations) > 0:
		precision = len(correctRelations)*100.0/len(predictedRelations)
		recall    = len(correctRelations)*100.0/len(actualRelations)
	if len(correctRelations) > 0:	#precision and recall are both positive here, so the harmonic mean is safe
		f1 = 2.0*precision*recall/(precision + recall)

	print relation_type, "prec, recall, f1 ", precision, recall, f1, "\n_______________________________________________________________________"
	print >>sys.stderr, relation_type, "prec, recall, f1 ", precision, recall, f1, "\n_______________________________________________________________________"
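
compute_metrics reports precision and recall as percentages over (entityId, entityId, documentId) triples and combines them with the standard harmonic mean. A tiny worked example of that arithmetic, with made-up counts:

#made-up counts: 8 predicted pairs, 10 gold pairs, 6 in both
predicted, actual, correct = 8, 10, 6
precision = correct*100.0/predicted	#75.0
recall    = correct*100.0/actual	#60.0
f1        = 2.0*precision*recall/(precision + recall)	#66.67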
Example #3
import sys
import operator

def get_candidates(data_file, relation_type, gold_relations, feature_type="BOW"):
	#bag-of-words features -- and identifying information -- only for valid (based on entity type signature) candidate entity pairs
	pointSet = []

	for line in open(data_file).readlines():
		#if "Bag_Of_Words" not in line: continue #skip entity pairs without this feature
		if "SENTENCE_BOUNDARY" in line: continue #skip pairs spanning a sentence boundary

		#each line is a tab-separated list of "key: value" fields
		fields = {}
		for kv in line.split("\t"):
			kv = kv.strip()
			if len(kv) <= 0: continue

			tarr = kv.split(":")
			key  = tarr[0].strip()
			val  = ":".join(tarr[1:]).strip() #everything after the first : as is
			fields[key] = val

		###
		label = fields["RelationLabel"]
		e1 = clsEntity.createEntityFromString( fields["EntityArg1"] )
		e2 = clsEntity.createEntityFromString( fields["EntityArg2"] )
		documentId = e1.documentId

		#consistency check against the gold standard; gold tuples hold entity ids (cf. compute_metrics), not entity objects
		if (e1.entityId, e2.entityId, documentId, relation_type) in gold_relations:
			check_label = relation_type
		else:
			check_label = "NOT_RELATED"

		if check_label == relation_type and label != relation_type:
			print >>sys.stderr, "debug this ", e1.get_display(), e2.get_display(), documentId, relation_type
			print >>sys.stderr, "coming from ", data_file, line
			sys.exit(-1)

		#skip this point if the entity type signature rules the relation out
		if relation_type == "ANY_RELATION" and len(get_possible_relations(e1.entityType, e2.entityType)) <= 0:
			continue
		if relation_type != "ANY_RELATION" and not valid_relation_signature(relation_type, e1.entityType, e2.entityType):
			continue

		if feature_type == "BOW":
			features = get_regular_features(fields, e1, e2, relation_type) #bag of words and other such lexical features
		elif feature_type == "Parse_Tree":
			features = fields["Parse_Tree"] #the string parse tree as is
		else:
			print >>sys.stderr, "unknown feature_type ", feature_type #previously fell through with features undefined
			sys.exit(-1)
		####
		pointSet.append( (e1, e2, label, features) )
	##### for line in ...

	#dump candidates for debugging
	ofh = open("tmp.candidates." + data_file + "." + relation_type, "w")
	for (e1, e2, label, features) in sorted(pointSet, key=operator.itemgetter(2)): #keep it sorted on labels for easy analysis
		print >>ofh, label, e1.get_display(), e2.get_display(), e1.entityDescription, e2.entityDescription, "|", get_doc_obj(e1,e2).rawText[e1.sentenceId].replace("\n"," "), "|", features
	ofh.close()

	return pointSet
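
get_candidates assumes each input line is a tab-separated list of "Key: Value" fields where only the first colon separates key from value. A hypothetical line (the Bag_Of_Words field and its content are illustrative, not from the source) and the dictionary the inner loop builds from it:

line = "RelationLabel: LOCATED_IN\tEntityArg1: ...\tEntityArg2: ...\tBag_Of_Words: time: 10:30 am"

fields = {}
for kv in line.split("\t"):
	kv = kv.strip()
	if len(kv) <= 0: continue
	tarr = kv.split(":")
	fields[ tarr[0].strip() ] = ":".join(tarr[1:]).strip()

#fields["Bag_Of_Words"] == "time: 10:30 am" -- colons after the first survive intact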