def generate_data(self, corpus, modelname, pairtypes):  # TODO: remove old model
    """Build relation-classification examples from ordered entity pairs.

    For every ordered pair of goldstandard entities that occur in the same
    sentence of a document and whose types match the source/target types
    configured for ``self.pairtype`` in ``config.pair_types``, extract a
    feature vector and append it — together with its label and the pair
    itself — to ``self.features``, ``self.labels`` and ``self.pairs``.

    :param corpus: corpus object whose ``documents`` dict is scanned
    :param modelname: unused here; kept for interface compatibility
    :param pairtypes: unused here; kept for interface compatibility
    """
    pcount = 0  # running pair counter used to build a unique pid per pair
    for did in corpus.documents:
        doc_entities = corpus.documents[did].get_entities("goldstandard")
        for pair in itertools.permutations(doc_entities, 2):
            # Sentence index is the numeric suffix of the sid (e.g. "...s3" -> 3).
            sn1 = int(pair[0].sid.split(".")[-1][1:])
            sn2 = int(pair[1].sid.split(".")[-1][1:])
            # Keep only same-sentence pairs whose entities do not share a
            # start or an end offset.
            if abs(sn2 - sn1) > 0 or pair[0].start == pair[1].start or pair[0].end == pair[1].end:
                continue
            if pair[0].type in config.pair_types[self.pairtype]["source_types"] and \
                    pair[1].type in config.pair_types[self.pairtype]["target_types"]:
                pid = did + ".p" + str(pcount)
                # Fix: pcount was never incremented, so every pid in a
                # document collided as "<did>.p0".
                pcount += 1
                if sn1 != sn2:
                    # NOTE(review): unreachable under the same-sentence filter
                    # above (abs(sn2 - sn1) > 0 => continue); kept in case that
                    # filter is relaxed. Merges both sentences into one.
                    sentence1 = corpus.documents[did].get_sentence(pair[0].sid)
                    sentence2 = corpus.documents[did].get_sentence(pair[1].sid)
                    sentence = Sentence(text=sentence1.text + " " + sentence2.text,
                                        offset=sentence1.offset)
                    sentence.tokens = sentence1.tokens + sentence2.tokens
                    # NOTE(review): shifts only the second *entity's* tokens;
                    # presumably sentence2.tokens was intended — confirm.
                    for t in pair[1].tokens:
                        t.order += len(sentence1.tokens)
                else:
                    sentence = corpus.documents[did].get_sentence(pair[0].sid)
                f, label = self.generate_features(sentence, pair)
                self.features.append(f)
                self.labels.append(label)
                self.pairs.append(pair)
def generate_data(self, corpus, modelname, pairtypes):  # TODO: remove old model
    """Build relation-classification examples from ordered entity pairs.

    Same-sentence pairs of goldstandard entities whose types match the
    source/target types configured in
    ``config.seedev_types.pair_types[self.pairtype]`` are turned into
    feature vectors and appended to ``self.features``, ``self.labels``
    and ``self.pairs``.

    :param corpus: corpus object whose ``documents`` dict is scanned
    :param modelname: not used in this method
    :param pairtypes: not used in this method
    """
    pcount = 0  # NOTE(review): never incremented below — every pid is "<did>.p0"
    truepcount = 0  # NOTE(review): unused
    ns = 0  # NOTE(review): unused
    for did in corpus.documents:
        doc_entities = corpus.documents[did].get_entities("goldstandard")
        examplelines = []  # NOTE(review): unused
        for pair in itertools.permutations(doc_entities, 2):
            # Sentence index is the numeric suffix of the sid (e.g. "...s3" -> 3).
            sn1 = int(pair[0].sid.split(".")[-1][1:])
            sn2 = int(pair[1].sid.split(".")[-1][1:])
            # Skip cross-sentence pairs and pairs sharing a start or end offset.
            if abs(sn2 - sn1) > 0 or pair[0].start == pair[1].start or pair[0].end == pair[1].end:
                continue
            if pair[0].type in config.seedev_types.pair_types[self.pairtype]["source_types"] and pair[1].type in config.seedev_types.pair_types[self.pairtype]["target_types"]:
                pid = did + ".p" + str(pcount)  # NOTE(review): assigned but never used
                if sn1 != sn2:
                    # NOTE(review): unreachable — the filter above already
                    # skipped any pair with sn1 != sn2; kept as-is.
                    sentence1 = corpus.documents[did].get_sentence(pair[0].sid)
                    sentence2 = corpus.documents[did].get_sentence(pair[1].sid)
                    sentence = Sentence(text = sentence1.text + " " + sentence2.text, offset=sentence1.offset)
                    sentence.tokens = sentence1.tokens + sentence2.tokens
                    # NOTE(review): shifts only the second entity's tokens past
                    # sentence1; sentence2.tokens may have been intended — confirm.
                    for t in pair[1].tokens:
                        t.order += len(sentence1.tokens)
                else:
                    sentence = corpus.documents[did].get_sentence(pair[0].sid)
            f, label = self.generate_features(sentence, pair)
            self.features.append(f)
            self.labels.append(label)
            self.pairs.append(pair)