def createDics(self, bin_dataframe, pretrained_embeddings):
        """Populate the word, event and tag index dictionaries from a dataframe.

        The method registers the fixed entries (``"<unk>"`` when no pretrained
        embeddings are used, ``"non-event"``/``"event"``, ``"B-Other"``/
        ``"I-Other"``), then walks over the rows of ``bin_dataframe``:

        * rows whose second column is not ``None`` contribute the words
          returned by ``utils.strToLst`` for that column to ``self.word_to_ix``
          (skipped entirely when pretrained embeddings are used);
        * rows whose second column is ``None`` contribute the
          ``'corrected_tags'`` entry obtained from the first column to
          ``self.tag_to_ix``.

        Finally ``self.BIOset``, ``self.ECset``, ``self.tag_to_ix`` and
        ``self.ec_to_ix`` are rebuilt through ``utils.getSortedTagsFromBIO``
        and ``utils.getSegmentationDict``.

        Args:
            bin_dataframe: Object exposing ``to_numpy()`` whose rows are
                indexed positionally (column 0 and column 1 are read).
            pretrained_embeddings: Flag compared against ``True``/``False``;
                when it equals ``True`` no vocabulary is collected.
        """
        rows = bin_dataframe.to_numpy()

        # The flag cannot change while iterating, so it is compared once here
        # instead of once per row.  Equality (not truthiness) is kept on
        # purpose so that non-boolean arguments behave exactly as before.
        skip_vocabulary = pretrained_embeddings == True  # noqa: E712
        if pretrained_embeddings == False:  # noqa: E712
            self.word_to_ix["<unk>"] = len(self.word_to_ix)

        # Fixed entries of the event dictionary.
        for event_label in ("non-event", "event"):
            self.event_to_ix[event_label] = len(self.event_to_ix)

        # Fixed entries of the tag dictionary.
        for tag_label in ("B-Other", "I-Other"):
            self.tag_to_ix[tag_label] = len(self.tag_to_ix)

        for row in rows:
            if row[1] is None:
                # NOTE(review): pandas usually stores missing cells as NaN
                # rather than None, in which case this branch never runs --
                # confirm against the data source.
                tag = utils.strToLst(row[0])['corrected_tags']
                if tag not in self.tag_to_ix:
                    self.tag_to_ix[tag] = len(self.tag_to_ix)
            elif not skip_vocabulary:
                for word in utils.strToLst(row[1]):
                    if word not in self.word_to_ix:
                        self.word_to_ix[word] = len(self.word_to_ix)

        self.BIOset, self.ECset = utils.getSortedTagsFromBIO(self.tag_to_ix)
        self.tag_to_ix = utils.getSegmentationDict(self.BIOset)
        self.ec_to_ix = utils.getSegmentationDict(self.ECset)
    def __init__(self, tsv_file, isTrain, pretrained_embeddings=False,
                 word_to_ix=None, tag_to_ix=None, event_to_ix=None,
                 pad_length=0):
        """Load a tab-separated file and prepare the index dictionaries.

        Args:
            tsv_file (string): Path to the tab-separated file; its columns are
                named ``time`` and ``tweet``.
            isTrain (bool): When equal to ``True``, ``createDics`` is run on
                the loaded frame before preprocessing.
            pretrained_embeddings (bool): Forwarded to ``createDics``.
            word_to_ix (dict, optional): Word index to start from. The object
                passed in is stored as-is (not copied); a fresh dict is created
                when omitted.
            tag_to_ix (dict, optional): Tag index to start from; a fresh dict
                is created when omitted.
            event_to_ix (dict, optional): Event index to start from. Stored
                as-is; a fresh dict is created when omitted.
            pad_length (int): Stored as ``self.pad_length``.
        """
        col_vector = ['time', 'tweet']
        self.bin_df = pd.read_csv(tsv_file, names=col_vector, encoding="utf-8",
                                  engine='python', sep="\t")

        self.matches = []

        # ``None`` sentinels instead of ``{}`` defaults: a literal dict default
        # is created once and shared by every instance, and these dictionaries
        # are written to after construction.
        self.word_to_ix = {} if word_to_ix is None else word_to_ix
        self.tag_to_ix = {} if tag_to_ix is None else tag_to_ix
        self.event_to_ix = {} if event_to_ix is None else event_to_ix

        self.pad_length = pad_length

        self.BIOset, self.ECset = utils.getSortedTagsFromBIO(self.tag_to_ix)
        self.tag_to_ix = utils.getSegmentationDict(self.BIOset)
        self.ec_to_ix = utils.getSegmentationDict(self.ECset)

        # Equality test kept so non-boolean flag values select the same branch
        # as before.
        if isTrain == True:  # noqa: E712
            self.createDics(self.bin_df, pretrained_embeddings)

        self.preprocess(self.bin_df)
    def __init__(self, tag_to_ix):
        """Derive the tag lookup tables and reset every NER counter.

        ``tag_to_ix`` is handed to ``utils.getSortedTagsFromBIO``; the two
        values it returns are stored as ``self.nerSegmentationTags`` and
        ``self.ECset``.  ``self.tag_to_ix`` ends up holding the result of
        ``utils.getSegmentationDict`` applied to the segmentation tags.
        """
        self.tag_to_ix = tag_to_ix

        segmentation_tags, entity_classes = utils.getSortedTagsFromBIO(tag_to_ix)
        self.nerSegmentationTags = segmentation_tags
        self.ECset = entity_classes
        # Overwrites the raw mapping stored above.
        self.tag_to_ix = utils.getSegmentationDict(segmentation_tags)

        # Overall counters.
        self.totals = self.oks = 0

        # Aggregate true-positive / false-positive / false-negative counters.
        self.tpsNER = self.fpsNER = self.fnsNER = 0
        self.tpsNERMicro_no_other = 0
        self.fpsNERMicro_no_other = 0
        self.fnsNERMicro_no_other = 0

        # One independent zero-initialised dict per statistic, keyed by the
        # entries of ECset.
        per_class_attributes = (
            "tpsClassesNER",
            "fpsClassesNER",
            "fnsClassesNER",
            "precisionNER",
            "recallNER",
            "f1NER",
        )
        for attribute in per_class_attributes:
            setattr(self, attribute, dict.fromkeys(entity_classes, 0))

        # Float accumulators.
        self.correct_predsNER = self.total_correctNER = self.total_predsNER = 0.
    def add_test(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL):
        """Append the predicted relations and NER chunks of every batch to disk.

        For each batch index the string form of the predicted token relations
        is appended to ``pred_output/output_rel.txt`` and the string form of
        the predicted NER chunk set to ``pred_output/output_ner.txt``.  After
        the last batch a newline is written to both files.

        Args:
            pred_batchesNER: Indexable collection of predicted NER sequences;
                its length determines the number of batches processed.
            true_batchesNER: Gold NER sequences, indexed in parallel.
            pred_batchesREL: Predicted relation encodings, indexed in parallel.
            true_batchesREL: Gold relation encodings, indexed in parallel.

        Raises:
            ValueError: If ``self.ner_chunk_eval`` is neither
                ``"boundaries_type"`` nor ``"boundaries"``.
        """
        # Context managers guarantee both files are closed, also when a helper
        # raises halfway through.
        with open("pred_output/output_rel.txt", "a") as rel_out, \
                open("pred_output/output_ner.txt", "a") as ner_out:

            # Indexing (rather than zip) keeps the IndexError when the batch
            # collections differ in length.
            for batch_idx in range(len(pred_batchesNER)):
                predNER = pred_batchesNER[batch_idx]
                trueNER = true_batchesNER[batch_idx]

                predRel = pred_batchesREL[batch_idx]
                trueRel = true_batchesREL[batch_idx]

                ptoken_ids, _, _, phead_ids, plabel_names = utils.transformToInitialInput(
                    predRel, self.RELset)

                _, _, _, thead_ids, tlabel_names = utils.transformToInitialInput(
                    trueRel, self.RELset)

                # The gold-side results are not written anywhere; the calls are
                # kept because the helpers are not visible from here.
                trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids)

                predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids)

                rel_out.write(str(predRel))
                tagsNER = utils.getSegmentationDict(self.nerSegmentationTags)

                if self.ner_chunk_eval == "boundaries_type":
                    lab_chunks = set(get_chunks(trueNER, tagsNER))
                    lab_pred_chunks = set(get_chunks(predNER, tagsNER))
                elif self.ner_chunk_eval == "boundaries":
                    lab_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(trueNER, tagsNER))))
                    lab_pred_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(predNER, tagsNER))))
                else:
                    # Previously surfaced as an UnboundLocalError further down.
                    raise ValueError(
                        "unsupported ner_chunk_eval: %r" % (self.ner_chunk_eval,))

                ner_out.write(str(lab_pred_chunks))

            rel_out.write("\n")
            ner_out.write("\n")
示例#5
0
    def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL,true_batchesBIONER):
        """Accumulate NER and relation statistics for a list of batches.

        For every batch index:

        * relation encodings are converted with ``utils.transformToInitialInput``
          and ``getTokenRelations``;
        * chunks are extracted from ``true_batchesBIONER`` with ``get_chunks``,
          and that chunk list is passed to ``classesToChunks`` together with
          the gold tags and with the predicted tags;
        * per-class true-positive / false-positive / false-negative counters
          are updated (element 0 of an NER chunk and element 1 of a relation
          tuple are used as the class keys);
        * the running ``correct_preds*`` / ``total_preds*`` / ``total_correct*``
          totals are updated for NER and REL.

        Args:
            pred_batchesNER: Indexable collection of predicted NER id sequences;
                its length determines the number of batches processed.
            true_batchesNER: Gold NER id sequences, indexed in parallel.
            pred_batchesREL: Predicted relation encodings, indexed in parallel.
            true_batchesREL: Gold relation encodings, indexed in parallel.
            true_batchesBIONER: Gold BIO tag sequences, indexed in parallel.
        """
        # Indexing (rather than zip) keeps the IndexError when the batch
        # collections differ in length.
        for batch_idx in range(len(pred_batchesNER)):
            predNER = pred_batchesNER[batch_idx]
            trueNER = true_batchesNER[batch_idx]

            predRel = pred_batchesREL[batch_idx]
            trueRel = true_batchesREL[batch_idx]

            trueBIONER = true_batchesBIONER[batch_idx]

            ptoken_ids, _, _, phead_ids, plabel_names = utils.transformToInitialInput(
                predRel, self.RELset)

            _, _, _, thead_ids, tlabel_names = utils.transformToInitialInput(
                trueRel, self.RELset)

            trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids)
            predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids)

            tagsNER = utils.getSegmentationDict(self.nerSegmentationTags)

            # Chunks taken from the gold BIO tags; used for both the gold and
            # the predicted class tags below.
            gold_bio_ids = listOfTagsToids(trueBIONER, self.nerSegmentationTags)
            gold_boundaries = list(set(get_chunks(gold_bio_ids, tagsNER)))

            trueNER_tags = listOfIdsToTags(trueNER, self.NERset)
            predNER_tags = listOfIdsToTags(predNER, self.NERset)

            lab_chunks = set(classesToChunks(trueNER_tags, gold_boundaries))
            lab_pred_chunks = set(classesToChunks(predNER_tags, gold_boundaries))

            # List views are still needed as arguments to relationChunks.
            lab_chunks_list = list(lab_chunks)
            lab_pred_chunks_list = list(lab_pred_chunks)

            # Membership is tested against the sets (O(1)) instead of the
            # lists built from them; the outcome is identical.
            for chunk in lab_pred_chunks_list:
                if chunk in lab_chunks:
                    self.tpsClassesNER[chunk[0]] += 1
                else:
                    self.fpsClassesNER[chunk[0]] += 1

            for chunk in lab_chunks_list:
                if chunk not in lab_pred_chunks:
                    self.fnsClassesNER[chunk[0]] += 1

            relTrue = set(relationChunks(trueRel, lab_chunks_list,
                                         relationTuple=self.rel_chunk_eval))
            relPred = set(relationChunks(predRel, lab_pred_chunks_list,
                                         relationTuple=self.rel_chunk_eval))

            for relation in relPred:
                if relation in relTrue:
                    self.tpsClassesREL[relation[1]] += 1
                else:
                    self.fpsClassesREL[relation[1]] += 1

            for relation in relTrue:
                if relation not in relPred:
                    self.fnsClassesREL[relation[1]] += 1

            self.correct_predsNER += len(lab_chunks & lab_pred_chunks)
            self.total_predsNER += len(lab_pred_chunks)
            self.total_correctNER += len(lab_chunks)

            self.correct_predsREL += len(relTrue & relPred)
            self.total_predsREL += len(relPred)
            self.total_correctREL += len(relTrue)
示例#6
0
    def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL,
            true_batchesREL):
        """Accumulate NER and relation statistics for a list of batches.

        For every batch index the gold and predicted NER chunks are extracted
        with ``get_chunks`` (and reduced with ``keepOnlyChunkBoundaries`` when
        ``self.ner_chunk_eval == "boundaries"``), the relations are turned into
        chunk-level tuples with ``relationChunks``, and the counters are
        updated:

        * ``"boundaries_type"``: per-class ``tpsClassesNER`` / ``fpsClassesNER``
          / ``fnsClassesNER``, keyed by element 0 of each chunk;
        * ``"boundaries"``: the aggregate ``tpsNER`` / ``fpsNER`` / ``fnsNER``;
        * relations: per-class ``*ClassesREL`` counters, keyed by element 1 of
          each relation tuple;
        * the running ``correct_preds*`` / ``total_preds*`` / ``total_correct*``
          totals for NER and REL.

        When ``self.root_node`` equals ``True`` a ``(None, 0, 0)`` ROOT entry
        is appended to copies of both chunk lists before relations are built.

        Args:
            pred_batchesNER: Indexable collection of predicted NER sequences;
                its length determines the number of batches processed.
            true_batchesNER: Gold NER sequences, indexed in parallel.
            pred_batchesREL: Predicted relation encodings, indexed in parallel.
            true_batchesREL: Gold relation encodings, indexed in parallel.

        Raises:
            ValueError: If ``self.ner_chunk_eval`` is neither
                ``"boundaries_type"`` nor ``"boundaries"``.
        """
        # Indexing (rather than zip) keeps the IndexError when the batch
        # collections differ in length.
        for batch_idx in range(len(pred_batchesNER)):
            predNER = pred_batchesNER[batch_idx]
            trueNER = true_batchesNER[batch_idx]

            predRel = pred_batchesREL[batch_idx]
            trueRel = true_batchesREL[batch_idx]

            ptoken_ids, _, _, phead_ids, plabel_names = utils.transformToInitialInput(
                predRel, self.RELset)

            _, _, _, thead_ids, tlabel_names = utils.transformToInitialInput(
                trueRel, self.RELset)

            trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids)
            predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids)

            tagsNER = utils.getSegmentationDict(self.nerSegmentationTags)

            ner_mode = self.ner_chunk_eval
            if ner_mode == "boundaries_type":
                lab_chunks = set(get_chunks(trueNER, tagsNER))
                lab_pred_chunks = set(get_chunks(predNER, tagsNER))
            elif ner_mode == "boundaries":
                lab_chunks = set(
                    keepOnlyChunkBoundaries(set(get_chunks(trueNER, tagsNER))))
                lab_pred_chunks = set(
                    keepOnlyChunkBoundaries(set(get_chunks(predNER, tagsNER))))
            else:
                # Previously surfaced as an UnboundLocalError further down.
                raise ValueError(
                    "unsupported ner_chunk_eval: %r" % (ner_mode,))

            # List views are still needed as arguments to relationChunks.
            lab_chunks_list = list(lab_chunks)
            lab_pred_chunks_list = list(lab_pred_chunks)

            matched_chunks = len(lab_chunks & lab_pred_chunks)

            if ner_mode == "boundaries_type":
                # Membership is tested against the sets (O(1)) instead of the
                # lists built from them; the outcome is identical.
                for chunk in lab_pred_chunks_list:
                    if chunk in lab_chunks:
                        self.tpsClassesNER[chunk[0]] += 1
                    else:
                        self.fpsClassesNER[chunk[0]] += 1

                for chunk in lab_chunks_list:
                    if chunk not in lab_pred_chunks:
                        self.fnsClassesNER[chunk[0]] += 1
            else:
                # "boundaries": the chunks are unique set members, so the
                # aggregate counts follow directly from the set sizes.
                self.tpsNER += matched_chunks
                self.fpsNER += len(lab_pred_chunks) - matched_chunks
                self.fnsNER += len(lab_chunks) - matched_chunks

            # Equality test kept so non-boolean flag values select the same
            # branch as before.
            if self.root_node == True:  # noqa: E712
                lab_chunks_list_with_ROOT = copy.deepcopy(lab_chunks_list)
                lab_chunks_list_with_ROOT.append((None, 0, 0))

                lab_pred_chunks_list_with_ROOT = copy.deepcopy(
                    lab_pred_chunks_list)
                lab_pred_chunks_list_with_ROOT.append((None, 0, 0))

                true_rel_chunks = lab_chunks_list_with_ROOT
                pred_rel_chunks = lab_pred_chunks_list_with_ROOT
            else:
                true_rel_chunks = lab_chunks_list
                pred_rel_chunks = lab_pred_chunks_list

            relTrue = set(
                relationChunks(trueRel,
                               true_rel_chunks,
                               relationTuple=self.rel_chunk_eval))

            relPred = set(
                relationChunks(predRel,
                               pred_rel_chunks,
                               relationTuple=self.rel_chunk_eval))

            for relation in relPred:
                if relation in relTrue:
                    self.tpsClassesREL[relation[1]] += 1
                else:
                    self.fpsClassesREL[relation[1]] += 1

            for relation in relTrue:
                if relation not in relPred:
                    self.fnsClassesREL[relation[1]] += 1

            self.correct_predsNER += matched_chunks
            self.total_predsNER += len(lab_pred_chunks)
            self.total_correctNER += len(lab_chunks)

            self.correct_predsREL += len(relTrue & relPred)
            self.total_predsREL += len(relPred)
            self.total_correctREL += len(relTrue)