def createDics(self, bin_dataframe, pretrained_embeddings):
    """Populate the word, event and tag dictionaries from a data frame.

    New keys always receive the next free index (the dictionary size at
    the moment of insertion); keys that are already present keep the
    index they have.

    Args:
        bin_dataframe: pandas DataFrame whose rows are read positionally.
            Column 1 is parsed with ``utils.strToLst`` and iterated as
            words; column 0 is only consulted when column 1 is ``None``.
        pretrained_embeddings: Boolean flag. When true the word vocabulary
            is left untouched (neither ``"<unk>"`` nor corpus words are
            added). The event and tag dictionaries are filled regardless.

    Side effects:
        Mutates ``self.word_to_ix`` and ``self.event_to_ix`` in place and
        rebinds ``self.BIOset``, ``self.ECset``, ``self.tag_to_ix`` and
        ``self.ec_to_ix``.
    """
    rows = bin_dataframe.to_numpy()
    build_vocab = not pretrained_embeddings

    # ``setdefault`` keeps the index of a key that is already registered;
    # plain assignment would overwrite it with the current size and
    # produce duplicate indices when the dictionaries are pre-populated.
    if build_vocab:
        self.word_to_ix.setdefault("<unk>", len(self.word_to_ix))

    # Initialize the event dictionary.
    self.event_to_ix.setdefault("non-event", len(self.event_to_ix))
    self.event_to_ix.setdefault("event", len(self.event_to_ix))

    # Initialize the tags dictionary.
    self.tag_to_ix.setdefault("B-Other", len(self.tag_to_ix))
    self.tag_to_ix.setdefault("I-Other", len(self.tag_to_ix))

    for row in rows:
        if row[1] is None:
            # NOTE(review): pandas normally stores missing cells as NaN,
            # not None, so this branch may never run for frames produced
            # by read_csv -- confirm the intended condition.
            tag = utils.strToLst(row[0])['corrected_tags']
            self.tag_to_ix.setdefault(tag, len(self.tag_to_ix))
        elif build_vocab:
            for word in utils.strToLst(row[1]):
                self.word_to_ix.setdefault(word, len(self.word_to_ix))

    # Re-derive the segmentation (BIO) and entity-class views from the
    # tags collected above.
    self.BIOset, self.ECset = utils.getSortedTagsFromBIO(self.tag_to_ix)
    self.tag_to_ix = utils.getSegmentationDict(self.BIOset)
    self.ec_to_ix = utils.getSegmentationDict(self.ECset)
def __init__(self, tsv_file, isTrain, pretrained_embeddings=False,
             word_to_ix=None, tag_to_ix=None, event_to_ix=None, pad_length=0):
    """Load a tab-separated file and prepare the lookup dictionaries.

    Args:
        tsv_file (string): Path to the tab-separated file. It is read
            without a header row into the columns ``time`` and ``tweet``.
        isTrain (bool): When true, the dictionaries are built from the
            loaded data via ``createDics``.
        pretrained_embeddings (bool): Forwarded to ``createDics``.
        word_to_ix (dict, optional): Existing word -> index mapping. It is
            stored by reference, so entries added during training are
            visible to the caller. Defaults to a new empty dict.
        tag_to_ix (dict, optional): Existing tag -> index mapping from
            which the BIO and EC tag sets are derived. Defaults to a new
            empty dict.
        event_to_ix (dict, optional): Existing event -> index mapping,
            stored by reference. Defaults to a new empty dict.
        pad_length (int): Stored on the instance as ``pad_length``.
    """
    col_vector = ['time', 'tweet']
    self.bin_df = pd.read_csv(tsv_file, names=col_vector, encoding="utf-8",
                              engine='python', sep="\t")
    self.matches = []

    # ``{}`` defaults would be shared by every instance, and createDics
    # mutates these mappings in place, so state would leak between
    # datasets. Each call gets its own dict unless one is supplied.
    self.word_to_ix = {} if word_to_ix is None else word_to_ix
    self.tag_to_ix = {} if tag_to_ix is None else tag_to_ix
    self.event_to_ix = {} if event_to_ix is None else event_to_ix
    self.pad_length = pad_length

    # Split the supplied tags into segmentation (BIO) tags and entity
    # classes, then index each set.
    self.BIOset, self.ECset = utils.getSortedTagsFromBIO(self.tag_to_ix)
    self.tag_to_ix = utils.getSegmentationDict(self.BIOset)
    self.ec_to_ix = utils.getSegmentationDict(self.ECset)

    if isTrain:
        self.createDics(self.bin_df, pretrained_embeddings)

    self.preprocess(self.bin_df)
def __init__(self, tag_to_ix):
    """Derive the NER tag sets and reset every evaluation counter.

    Args:
        tag_to_ix: Tag mapping passed to ``utils.getSortedTagsFromBIO``;
            ``self.tag_to_ix`` ends up holding the segmentation-only
            mapping built from its first result.
    """
    self.tag_to_ix = tag_to_ix
    segmentation_tags, entity_classes = utils.getSortedTagsFromBIO(tag_to_ix)
    self.nerSegmentationTags = segmentation_tags
    self.ECset = entity_classes
    self.tag_to_ix = utils.getSegmentationDict(segmentation_tags)

    # Overall counters.
    self.totals = self.oks = 0

    # Segmentation-level true/false positives and false negatives.
    self.tpsNER = self.fpsNER = self.fnsNER = 0
    self.tpsNERMicro_no_other = 0
    self.fpsNERMicro_no_other = 0
    self.fnsNERMicro_no_other = 0

    # Per-class tables: each attribute gets its own dict keyed by the
    # entity classes, all starting at zero.
    self.tpsClassesNER = {ec: 0 for ec in entity_classes}
    self.fpsClassesNER = {ec: 0 for ec in entity_classes}
    self.fnsClassesNER = {ec: 0 for ec in entity_classes}
    self.precisionNER = {ec: 0 for ec in entity_classes}
    self.recallNER = {ec: 0 for ec in entity_classes}
    self.f1NER = {ec: 0 for ec in entity_classes}

    # Float accumulators.
    self.correct_predsNER = self.total_correctNER = self.total_predsNER = 0.
def add_test(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL):
    """Append predicted relations and NER chunks to the prediction logs.

    For every entry, the predicted relations (decoded with
    ``utils.transformToInitialInput`` and ``getTokenRelations``) are
    written as ``str(...)`` to ``pred_output/output_rel.txt`` and the set
    of predicted NER chunks to ``pred_output/output_ner.txt``, one line
    per entry. Both files are opened in append mode.

    Args:
        pred_batchesNER: Predicted NER sequences, one per entry.
        true_batchesNER: Gold NER sequences, indexed in parallel.
        pred_batchesREL: Predicted relation outputs, indexed in parallel.
        true_batchesREL: Gold relation outputs, indexed in parallel.

    Raises:
        ValueError: If ``self.ner_chunk_eval`` is neither
            ``"boundaries_type"`` nor ``"boundaries"``.
        FileNotFoundError: If the ``pred_output`` directory is missing
            (append mode does not create directories).
    """
    # The context manager guarantees both logs are flushed and closed even
    # when a helper raises part-way through.
    with open("pred_output/output_rel.txt", "a") as rel_out, \
            open("pred_output/output_ner.txt", "a") as ner_out:
        for batch_idx, predNER in enumerate(pred_batchesNER):
            trueNER = true_batchesNER[batch_idx]
            predRel = pred_batchesREL[batch_idx]
            trueRel = true_batchesREL[batch_idx]

            ptoken_ids, _, _, phead_ids, plabel_names = utils.transformToInitialInput(
                predRel, self.RELset)
            _, _, _, thead_ids, tlabel_names = utils.transformToInitialInput(
                trueRel, self.RELset)

            # NOTE(review): the gold-side values (trueRel, lab_chunks) are
            # computed but never written; the calls are kept because the
            # helpers are not visible here -- confirm they can be dropped.
            trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids)
            predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids)
            rel_out.write(str(predRel))

            tagsNER = utils.getSegmentationDict(self.nerSegmentationTags)
            if self.ner_chunk_eval == "boundaries_type":
                lab_chunks = set(get_chunks(trueNER, tagsNER))
                lab_pred_chunks = set(get_chunks(predNER, tagsNER))
            elif self.ner_chunk_eval == "boundaries":
                lab_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(trueNER, tagsNER))))
                lab_pred_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(predNER, tagsNER))))
            else:
                # Previously this fell through and failed later on an
                # unbound local variable.
                raise ValueError(
                    "unsupported ner_chunk_eval: %r" % (self.ner_chunk_eval,))

            ner_out.write(str(lab_pred_chunks))
            rel_out.write("\n")
            ner_out.write("\n")
def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL, true_batchesBIONER):
    """Accumulate NER and relation counts for a list of entries.

    Chunks are extracted once from the gold BIO tags; both the gold and
    the predicted class tags are then turned into chunks against that
    same chunk list via ``classesToChunks``. Relations are compared as
    the tuples produced by ``relationChunks``.

    Args:
        pred_batchesNER: Predicted NER class ids, one sequence per entry.
        true_batchesNER: Gold NER class ids, indexed in parallel.
        pred_batchesREL: Predicted relation outputs, indexed in parallel.
        true_batchesREL: Gold relation outputs, indexed in parallel.
        true_batchesBIONER: Gold BIO tags, indexed in parallel.

    Side effects:
        Increments the per-class tables ``tpsClassesNER``,
        ``fpsClassesNER``, ``fnsClassesNER``, ``tpsClassesREL``,
        ``fpsClassesREL``, ``fnsClassesREL`` and the running totals
        ``correct_preds*``, ``total_preds*``, ``total_correct*`` for both
        NER and REL.
    """
    for batch_idx, predNER in enumerate(pred_batchesNER):
        trueNER = true_batchesNER[batch_idx]
        predRel = pred_batchesREL[batch_idx]
        trueRel = true_batchesREL[batch_idx]
        trueBIONER = true_batchesBIONER[batch_idx]

        ptoken_ids, _, _, phead_ids, plabel_names = utils.transformToInitialInput(
            predRel, self.RELset)
        _, _, _, thead_ids, tlabel_names = utils.transformToInitialInput(
            trueRel, self.RELset)

        # Both relation lists are built against the token ids decoded
        # from the prediction.
        trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids)
        predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids)

        tagsNER = utils.getSegmentationDict(self.nerSegmentationTags)
        gold_boundaries = list(set(get_chunks(
            listOfTagsToids(trueBIONER, self.nerSegmentationTags), tagsNER)))

        trueNER_tags = listOfIdsToTags(trueNER, self.NERset)
        predNER_tags = listOfIdsToTags(predNER, self.NERset)
        lab_chunks = set(classesToChunks(trueNER_tags, gold_boundaries))
        lab_pred_chunks = set(classesToChunks(predNER_tags, gold_boundaries))
        lab_chunks_list = list(lab_chunks)
        lab_pred_chunks_list = list(lab_pred_chunks)

        # Membership is tested against the sets (O(1)) rather than the
        # lists derived from them; element 0 of a chunk is the key of the
        # per-class tables (presumably the entity type).
        for chunk in lab_pred_chunks_list:
            if chunk in lab_chunks:
                self.tpsClassesNER[chunk[0]] += 1
            else:
                self.fpsClassesNER[chunk[0]] += 1
        for chunk in lab_chunks_list:
            if chunk not in lab_pred_chunks:
                self.fnsClassesNER[chunk[0]] += 1

        relTrue = set(relationChunks(trueRel, lab_chunks_list,
                                     relationTuple=self.rel_chunk_eval))
        relPred = set(relationChunks(predRel, lab_pred_chunks_list,
                                     relationTuple=self.rel_chunk_eval))

        # Element 1 of a relation tuple is the key of the per-class
        # tables (presumably the relation label).
        for relation in relPred:
            if relation in relTrue:
                self.tpsClassesREL[relation[1]] += 1
            else:
                self.fpsClassesREL[relation[1]] += 1
        for relation in relTrue:
            if relation not in relPred:
                self.fnsClassesREL[relation[1]] += 1

        self.correct_predsNER += len(lab_chunks & lab_pred_chunks)
        self.total_predsNER += len(lab_pred_chunks)
        self.total_correctNER += len(lab_chunks)

        self.correct_predsREL += len(relTrue & relPred)
        self.total_predsREL += len(relPred)
        self.total_correctREL += len(relTrue)
def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL):
    """Accumulate NER and relation counts for a list of entries.

    ``self.ner_chunk_eval`` selects how NER chunks are compared:

    * ``"boundaries_type"``: chunks from ``get_chunks`` are compared as
      they are and counted per class (keyed by element 0 of the chunk).
    * ``"boundaries"``: chunks are first passed through
      ``keepOnlyChunkBoundaries`` and counted in the global
      ``tpsNER`` / ``fpsNER`` / ``fnsNER`` counters.

    When ``self.root_node`` equals ``True`` an extra ``(None, 0, 0)``
    chunk is appended to both chunk lists before relations are built.

    Args:
        pred_batchesNER: Predicted NER sequences, one per entry.
        true_batchesNER: Gold NER sequences, indexed in parallel.
        pred_batchesREL: Predicted relation outputs, indexed in parallel.
        true_batchesREL: Gold relation outputs, indexed in parallel.

    Raises:
        ValueError: If ``self.ner_chunk_eval`` is not one of the two
            supported modes.
    """
    for batch_idx, predNER in enumerate(pred_batchesNER):
        trueNER = true_batchesNER[batch_idx]
        predRel = pred_batchesREL[batch_idx]
        trueRel = true_batchesREL[batch_idx]

        ptoken_ids, _, _, phead_ids, plabel_names = utils.transformToInitialInput(
            predRel, self.RELset)
        _, _, _, thead_ids, tlabel_names = utils.transformToInitialInput(
            trueRel, self.RELset)

        # Both relation lists are built against the token ids decoded
        # from the prediction.
        trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids)
        predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids)

        tagsNER = utils.getSegmentationDict(self.nerSegmentationTags)
        mode = self.ner_chunk_eval

        if mode == "boundaries_type":
            lab_chunks = set(get_chunks(trueNER, tagsNER))
            lab_pred_chunks = set(get_chunks(predNER, tagsNER))
        elif mode == "boundaries":
            lab_chunks = set(
                keepOnlyChunkBoundaries(set(get_chunks(trueNER, tagsNER))))
            lab_pred_chunks = set(
                keepOnlyChunkBoundaries(set(get_chunks(predNER, tagsNER))))
        else:
            # Previously this fell through and failed later on an unbound
            # local variable.
            raise ValueError("unsupported ner_chunk_eval: %r" % (mode,))

        lab_chunks_list = list(lab_chunks)
        lab_pred_chunks_list = list(lab_pred_chunks)

        if mode == "boundaries_type":
            # Membership is tested against the sets (O(1)) rather than
            # the lists derived from them.
            for chunk in lab_pred_chunks_list:
                if chunk in lab_chunks:
                    self.tpsClassesNER[chunk[0]] += 1
                else:
                    self.fpsClassesNER[chunk[0]] += 1
            for chunk in lab_chunks_list:
                if chunk not in lab_pred_chunks:
                    self.fnsClassesNER[chunk[0]] += 1
        else:
            # "boundaries": the lists hold unique chunks, so the counts
            # are exactly the sizes of the set intersection/differences.
            self.tpsNER += len(lab_pred_chunks & lab_chunks)
            self.fpsNER += len(lab_pred_chunks - lab_chunks)
            self.fnsNER += len(lab_chunks - lab_pred_chunks)

        # Equality (not truthiness) is kept on purpose: root_node is set
        # outside this block and may not be a strict bool.
        if self.root_node == True:
            lab_chunks_list_with_ROOT = copy.deepcopy(lab_chunks_list)
            lab_chunks_list_with_ROOT.append((None, 0, 0))
            lab_pred_chunks_list_with_ROOT = copy.deepcopy(lab_pred_chunks_list)
            lab_pred_chunks_list_with_ROOT.append((None, 0, 0))

            relTrue = set(relationChunks(trueRel, lab_chunks_list_with_ROOT,
                                         relationTuple=self.rel_chunk_eval))
            relPred = set(relationChunks(predRel, lab_pred_chunks_list_with_ROOT,
                                         relationTuple=self.rel_chunk_eval))
        else:
            relTrue = set(relationChunks(trueRel, lab_chunks_list,
                                         relationTuple=self.rel_chunk_eval))
            relPred = set(relationChunks(predRel, lab_pred_chunks_list,
                                         relationTuple=self.rel_chunk_eval))

        # Element 1 of a relation tuple is the key of the per-class
        # tables (presumably the relation label).
        for relation in relPred:
            if relation in relTrue:
                self.tpsClassesREL[relation[1]] += 1
            else:
                self.fpsClassesREL[relation[1]] += 1
        for relation in relTrue:
            if relation not in relPred:
                self.fnsClassesREL[relation[1]] += 1

        self.correct_predsNER += len(lab_chunks & lab_pred_chunks)
        self.total_predsNER += len(lab_pred_chunks)
        self.total_correctNER += len(lab_chunks)

        self.correct_predsREL += len(relTrue & relPred)
        self.total_predsREL += len(relPred)
        self.total_correctREL += len(relTrue)
import utils