def custom_kernel(X, Y):
    """Gram-matrix callback for an SVM with a custom (precomputed-by-index) kernel.

    Each row of X / Y carries a single feature: the index (stored as a
    float in position 0) of the original data point inside the global
    ``custom_data_points`` list, whose entries are
    (entity1, entity2, label, features) tuples.

    Returns a (len(X), len(Y)) numpy array where cell (i, j) is the
    SPDK similarity between the entity pairs referenced by X[i] and Y[j].
    """
    global custom_data_points
    retmat = np.zeros((len(X), len(Y)))
    for i in range(len(X)):
        # X[i][0] is a pointer into custom_data_points to recover the
        # original data point (the SVM only ever saw the index).
        e1A, e2A, labelA, featureA = custom_data_points[int(X[i][0])]
        # Loop-invariant for the inner loop: hoist the document lookup.
        obj1 = get_doc_obj(e1A, e2A)
        for j in range(len(Y)):
            e1B, e2B, labelB, featureB = custom_data_points[int(Y[j][0])]
            obj2 = get_doc_obj(e1B, e2B)
            retmat[i, j] = sdpKernel.SPDK(e1A, e2A, obj1, e1B, e2B, obj2)
        # Progress report roughly every 10% of rows.
        # BUG FIX: the original wrote  int(...)+1 % 10 == 0  which, by
        # operator precedence (% binds tighter than +), is int(...)+1 == 0
        # and never fires; parenthesize so the modulo applies last.
        if (int(i * 100.0 / len(X)) + 1) % 10 == 0:
            print >>sys.stderr, "custom_kernel computation progress ", i * 100.0 / len(X), "%"
    return retmat
def compute_metrics( test_points, classifier_predictions, relation_type , gold_relations): #test_points = list of tuples of (entity pair information, true labels, features) #classifier_predictions = (corresponding) list of predicted labels global dump_predictions predictedRelations= set([]) actualRelations = set([]) for (eid1,eid2,documentId, rellabel) in gold_relations: if rellabel== relation_type : actualRelations.add( (eid1,eid2,documentId) ) classifier_predictions = apply_semantic_constraints(test_points, relation_type, classifier_predictions) ofh= open("tmp.predictions."+relation_type, "w") ### idx = 0 for predicted_label in classifier_predictions: #open(svmlight_predictions).readlines() : e1, e2, true_label, features = test_points[idx] print >>ofh,true_label, predicted_label,"|", e1.get_display(),e2.get_display(),"|", get_doc_obj(e1,e2).rawText[e1.sentenceId] if predicted_label == relation_type: predictedRelations.add( (e1.entityId, e2.entityId, e1.documentId) ) if e1.documentId not in dump_predictions : dump_predictions[e1.documentId] = [] dump_predictions[e1.documentId].append( (e1, e2, relation_type) ) idx += 1 ## ofh.close() correctRelations= predictedRelations.intersection( actualRelations ) falseNegatives= actualRelations - predictedRelations print "metrics for ", relation_type, " with ", len(test_points) , " test points" print "metrics: actual ", len(actualRelations) , actualRelations print "metrics: predicted", len(predictedRelations), predictedRelations print "metrics: false negatives", len(falseNegatives), falseNegatives print "metrics: correct ", len(correctRelations), correctRelations precision, recall, f1 = -1,-1,-1 if len(predictedRelations) >0 and len(actualRelations)>0 : precision = len(correctRelations)*100.0/len(predictedRelations) recall = len(correctRelations)*100.0/ len(actualRelations) if len(correctRelations) >0 : f1 = 2.0*precision* recall / (precision+recall) print relation_type, "prec, recall, f1 ", precision, recall, f1, 
"\n_______________________________________________________________________" print >>sys.stderr, relation_type, "prec, recall, f1 ", precision, recall, f1, "\n_______________________________________________________________________"
def get_candidates(data_file, relation_type, gold_relations, feature_type="BOW"):
    """Read candidate entity pairs (one per tab-separated line) from data_file.

    Keeps only within-sentence pairs whose entity-type signature is
    valid for relation_type, attaching either lexical bag-of-words
    features (feature_type="BOW") or the raw parse-tree string
    (feature_type="Parse_Tree").

    Returns a list of (entity1, entity2, label, features) tuples; also
    dumps them, sorted by label, to tmp.candidates.<data_file>.<relation_type>
    for debugging.
    """
    pointSet = []
    for line in open(data_file).readlines():
        if "SENTENCE_BOUNDARY" in line:
            continue  # skip pairs that span a sentence boundary

        # Each line is "key : value" fields separated by tabs; the value
        # may itself contain ':' so only the first ':' splits key/value.
        fields = {}
        for kv in line.split("\t"):
            kv = kv.strip()
            if len(kv) <= 0:
                continue
            tarr = kv.split(":")
            key = tarr[0].strip()
            val = ":".join(tarr[1:]).strip()  # everything after the first ':' as is
            fields[key] = val

        label = fields["RelationLabel"]
        e1 = clsEntity.createEntityFromString(fields["EntityArg1"])
        e2 = clsEntity.createEntityFromString(fields["EntityArg2"])
        documentId = e1.documentId

        # BUG FIX: gold_relations holds entity *ids* (compute_metrics
        # unpacks (eid1, eid2, documentId, label) from it), so membership
        # must be tested with entityId -- the original compared the entity
        # objects themselves and could never match.
        # NOTE(review): confirm gold_relations tuples indeed carry ids.
        if (e1.entityId, e2.entityId, documentId, relation_type) in gold_relations:
            check_label = relation_type
        else:
            check_label = "NOT_RELATED"

        # Sanity check: the file's label must agree with the gold standard.
        if check_label == relation_type and label != relation_type:
            print >>sys.stderr, "debug this ", e1.get_display(), e2.get_display(), documentId, relation_type
            print >>sys.stderr, "coming from ", data_file, line
            sys.exit(-1)

        # Skip pairs whose entity types cannot participate in the relation.
        if relation_type == "ANY_RELATION" and len(get_possible_relations(e1.entityType, e2.entityType)) <= 0:
            continue
        if relation_type != "ANY_RELATION" and False == valid_relation_signature(relation_type, e1.entityType, e2.entityType):
            continue

        if feature_type == "BOW":
            # Bag of words and other such lexical features.
            features = get_regular_features(fields, e1, e2, relation_type)
        elif feature_type == "Parse_Tree":
            features = fields["Parse_Tree"]  # the string parse tree, as is

        pointSet.append((e1, e2, label, features))

    # Dump candidates for debugging, sorted on label for easy analysis.
    ofh = open("tmp.candidates." + data_file + "." + relation_type, "w")
    for (e1, e2, label, features) in sorted(pointSet, key=operator.itemgetter(2)):
        print >>ofh, label, e1.get_display(), e2.get_display(), e1.entityDescription, e2.entityDescription, "|", get_doc_obj(e1, e2).rawText[e1.sentenceId].replace("\n", " "), "|", features
    ofh.close()
    return pointSet