예제 #1
0
 def generate_data(self, corpus, modelname, pairtypes):
     """Collect features, labels and pairs for same-sentence entity pairs.

     Walks every document in ``corpus.documents`` and considers each ordered
     pair of its "goldstandard" entities. A pair is kept only when:

     * both entities carry the same sentence number (parsed from the last
       dot-separated component of ``sid`` with its first character dropped),
     * their ``start`` offsets differ and their ``end`` offsets differ, and
     * the first entity's type is listed in ``source_types`` and the second
       entity's type in ``target_types`` of
       ``config.pair_types[self.pairtype]``.

     For each kept pair, ``self.generate_features(sentence, pair)`` is called
     with the sentence of the first entity; its two return values are
     appended to ``self.features`` and ``self.labels`` and the pair itself to
     ``self.pairs``.

     Args:
         corpus: object whose ``documents`` maps a document id to a document
             providing ``get_entities(source)`` and ``get_sentence(sid)``.
         modelname: not used by this method; kept for interface compatibility.
         pairtypes: not used by this method; kept for interface compatibility.

     Returns:
         None. Results are accumulated on ``self``.
     """
     # TODO: remove old model
     for did in corpus.documents:
         document = corpus.documents[did]
         doc_entities = document.get_entities("goldstandard")
         for pair in itertools.permutations(doc_entities, 2):
             source, target = pair
             sn1 = int(source.sid.split(".")[-1][1:])
             sn2 = int(target.sid.split(".")[-1][1:])
             # Only same-sentence pairs are candidates; entities sharing a
             # start or an end offset are skipped as well.
             if (sn1 != sn2 or source.start == target.start
                     or source.end == target.end):
                 continue
             type_spec = config.pair_types[self.pairtype]
             if (source.type not in type_spec["source_types"]
                     or target.type not in type_spec["target_types"]):
                 continue
             # NOTE: an earlier version merged two sentences here when the
             # sentence numbers differed. The filter above already discards
             # such pairs, so that branch could never run (and it would have
             # shifted token orders in place); it has been removed.
             sentence = document.get_sentence(source.sid)
             f, label = self.generate_features(sentence, pair)
             self.features.append(f)
             self.labels.append(label)
             self.pairs.append(pair)
예제 #2
0
 def generate_data(self, corpus, modelname, pairtypes):
     """Append features, labels and pairs for candidate entity pairs.

     Iterates over every document id in ``corpus.documents`` and over each
     ordered pair of that document's "goldstandard" entities. Pairs whose
     sentence numbers differ, or whose ``start`` or ``end`` offsets coincide,
     are skipped. Remaining pairs are kept when the first entity's type is in
     ``source_types`` and the second's in ``target_types`` of
     ``config.seedev_types.pair_types[self.pairtype]``. For each kept pair the
     two values returned by ``self.generate_features(sentence, pair)`` are
     appended to ``self.features`` and ``self.labels``, and the pair is
     appended to ``self.pairs``.

     ``modelname`` and ``pairtypes`` are not referenced in the code shown
     here.
     """
     # TODO: remove old model
     # NOTE(review): pcount, truepcount and ns are never updated in the code
     # shown here; pcount is only read to build ``pid`` below.
     pcount = 0
     truepcount = 0
     ns = 0
     for did in corpus.documents:
         doc_entities = corpus.documents[did].get_entities("goldstandard")
         # NOTE(review): examplelines is assigned but not used below.
         examplelines = []
         # logging.info("{}".format(sentence.sid))
         # sentence_entities = sentence.entities.elist["goldstandard"]
         # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities), len(sentence.entities.elist["goldstandard"])))
         # permutations yields ordered pairs, so (a, b) and (b, a) are both
         # considered.
         for pair in itertools.permutations(doc_entities, 2):
             # Sentence number: last dot-separated component of the sid with
             # its first character dropped (presumably "s<N>" - TODO confirm).
             sn1 = int(pair[0].sid.split(".")[-1][1:])
             sn2 = int(pair[1].sid.split(".")[-1][1:])
             # if self.pairtype in corpus.type_sentences and pair[0].sid not in corpus.type_sentences[self.pairtype]:
             #     continue
             # Skip pairs from different sentences and pairs that share a
             # start or an end offset.
             if abs(sn2 - sn1) > 0 or pair[0].start == pair[1].start or pair[0].end == pair[1].end:
                 continue
             # if self.pairtype in ("Has_Sequence_Identical_To", "Is_Functionally_Equivalent_To") and pair[0].type != pair[1].type:
             #     continue
             #if pair[0].text == pair[1].text:
             #    continue
             # logging.info("{}=>{}|{}=>{}".format(pair[0].type, pair[1].type, pairtypes[0], pairtypes[1]))
             # Keep only pairs whose (first, second) entity types match the
             # configured source/target types for this pair type.
             if pair[0].type in config.seedev_types.pair_types[self.pairtype]["source_types"] and pair[1].type in config.seedev_types.pair_types[self.pairtype]["target_types"]:
             #if pair[0].type in config.event_types[self.pairtype]["source_types"] and pair[1].type in config.event_types[self.pairtype]["target_types"]:
                                     #pair[1].type in config.pair_types[self.pairtype]["source_types"] and pair[0].type in config.pair_types[self.pairtype]["target_types"]:
                 # logging.debug(pair)
                 #if pair[0].type not in config.pair_types[self.pairtype]["source_types"]:
                 #    pair = (pair[1], pair[0])
                 # NOTE(review): pid is built but not used in the code shown.
                 pid = did + ".p" + str(pcount)
                 # self.pairs[pid] = (e1id, e2id)
                 # NOTE(review): the filter above already skips every pair
                 # with sn1 != sn2, so this branch cannot run as written. If
                 # it did, it would shift the order of pair[1]'s tokens in
                 # place.
                 if sn1 != sn2:
                     sentence1 = corpus.documents[did].get_sentence(pair[0].sid)
                     sentence2 = corpus.documents[did].get_sentence(pair[1].sid)
                     sentence = Sentence(text = sentence1.text + " " + sentence2.text, offset=sentence1.offset)
                     sentence.tokens = sentence1.tokens + sentence2.tokens
                     for t in pair[1].tokens:
                         t.order += len(sentence1.tokens)
                 else:
                     sentence = corpus.documents[did].get_sentence(pair[0].sid)
                 # generate_features returns a (features, label) pair for the
                 # candidate; the three lists on self stay index-aligned.
                 f, label = self.generate_features(sentence, pair)
                 self.features.append(f)
                 self.labels.append(label)
                 self.pairs.append(pair)