class TripleExtractor:
    """Thin wrapper around Stanford OpenIE: ASCII-clean text, run NER,
    and extract (subject, relation, object) triples."""

    def __init__(self):
        # Pattern matching runs of non-ASCII characters. Kept as a string
        # attribute for backward compatibility, but compiled once so
        # clean_text does not go through the regex cache on every call.
        self.regex_pattern = r'[^\x00-\x7F]+'
        self._non_ascii = re.compile(self.regex_pattern)
        self.openie = StanfordOpenIE()

    def clean_text(self, text):
        """Replace every run of non-ASCII characters in *text* with one space."""
        return self._non_ascii.sub(' ', text)

    def extract_entities(self, text):
        """Run named-entity recognition over *text*.

        NOTE(review): ``self.nlp`` is never assigned in ``__init__``, so this
        method raises AttributeError as written. A CoreNLP-style client
        exposing ``.ner()`` must be attached as ``self.nlp`` before use —
        confirm against callers.
        """
        return self.nlp.ner(text)

    def get_triples(self, text):
        """Return OpenIE triple annotations for *text*."""
        return self.openie.annotate(text)
class OpenIEBaselineModel:
    """Baseline that turns supporting-fact sentences into
    (title, sent_id, (subject, relation, object)) reasoning steps
    using Stanford OpenIE."""

    def __init__(self):
        from openie import StanfordOpenIE
        self.openie_client = StanfordOpenIE()
        self.spacy_nlp = spacy.load("en_core_web_sm")

    def predict(self, inst, supporting_facts):
        """Extract reasoning steps from the supporting facts of one instance.

        Parameters
        ----------
        inst : dict
            Must contain ``"context"``: an iterable of (title, sentences)
            pairs, where ``sentences`` is a list of sentence strings.
        supporting_facts : iterable of (title, sentence_index) pairs

        Returns
        -------
        list of (title, sentence_index, (subject, relation, object)) tuples.
        """
        ent2doc = dict(inst["context"])
        reasoning_steps = []
        for sup_ent, sup_sent_id in supporting_facts:
            # BUG FIX: the original guard used '>', which let
            # sup_sent_id == len(doc) through and crashed with IndexError
            # on the lookup below. '>=' skips all out-of-range indices.
            if sup_sent_id >= len(ent2doc[sup_ent]):
                continue
            sup_sent = ent2doc[sup_ent][sup_sent_id]
            for triplet in self.openie_client.annotate(sup_sent):
                # Crude coreference: a bare pronoun subject is assumed to
                # refer to the title entity of the supporting document.
                if triplet["subject"] in ["it", "they", "she", "he"]:
                    triplet["subject"] = sup_ent
                reasoning_steps += [(
                    sup_ent,
                    sup_sent_id,
                    (triplet["subject"], triplet["relation"], triplet["object"]),
                )]
        return reasoning_steps
class ExtractInformation:
    """Coreference-resolve raw text, extract entities, and build
    (subject, relation, object) triples.

    Triples come from two extractors — Stanford OpenIE and a spaCy
    dependency-pattern pass — and are merged, dropping OpenIE triples whose
    relation is near-duplicate (vector similarity > 0.6) of a spaCy triple
    with the same subject and a matching object.
    """

    # Ask spaCy to prefer the GPU when one is available.
    IS_GPU = True

    # Dictionary keys of the triple records produced by this class.
    SUBJECT = 'subject'
    SUBJECT_ENTITY = 'subject_entity'
    RELATION = 'relation'
    OBJECT = 'object'
    OBJECT_ENTITY = 'object_entity'
    ENTITY_NAME = 'name'
    ENTITY_TYPE = 'entity_type'
    ENTITY_SUBJECT_OTHER = 'subject_other'
    ENTITY_OBJECT_OTHER = 'object_other'

    def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
        print(os.path.dirname(spacy.__file__))
        if ExtractInformation.IS_GPU:
            spacy.prefer_gpu()
        self.modelSpacy = modelSpacy
        self.modelCoref = modelCoref
        self.stanfordClient = StanfordOpenIE()
        self.nlpCoref, self.nlpSpacy = self.initSpacy(modelSpacy, modelCoref)

    def initSpacy(self, modelSpacy, modelCoref):
        """Load the main spaCy pipeline plus a second pipeline carrying the
        neuralcoref component; returns (nlpCoref, nlpSpacy)."""
        nlpSpacy = spacy.load(modelSpacy)
        # NOTE(review): the coref pipeline hard-codes the 'en' model;
        # modelCoref is only used as the pipe *name* — confirm intended.
        nlpCoref = spacy.load('en')
        coref = neuralcoref.NeuralCoref(nlpCoref.vocab)
        nlpCoref.add_pipe(coref, name=modelCoref)
        return nlpCoref, nlpSpacy

    # Stage 1: replace pronouns with their antecedent noun, e.g.
    # "My sister has a dog. She loves him." ->
    # clusters [My sister: [My sister, She], a dog: [a dog, him]]
    def replacePronounsToNoun(self, nlp, inputText):
        """Return (has_coref, resolved_text) for *inputText*."""
        # todo unicode input Text
        # ouputText = unicode(inputText)
        ouputText = inputText
        doc = nlp(inputText)
        if (doc._.has_coref):
            ouputText = doc._.coref_resolved
        return doc._.has_coref, ouputText

    # Stage 2: extract entities
    def extractEntities(self, nlp, inputText):
        """Return a list of {name, entity_type} dicts for entities in *inputText*."""
        doc = nlp(inputText)
        entities = []
        for ent in doc.ents:
            entities.append({
                ExtractInformation.ENTITY_NAME: ent.text,
                ExtractInformation.ENTITY_TYPE: ent.label_
            })
        return entities

    # Stage 3: extract triples
    def extractTriple(self, inputText):
        """Run both extractors on coref-resolved text and merge their triples.

        OpenIE triples whose subject matches a spaCy triple, whose object
        matches (equal or substring), and whose relation is similar
        (> 0.6 vector similarity) are dropped as duplicates. Each surviving
        triple is annotated with the entities found in its subject/object.
        """
        hasCoref, inputText = self.replacePronounsToNoun(
            self.nlpCoref, inputText)
        # todo similarity relation
        tripleStanfords = self.extractTripleStanfordOpenIE(inputText)
        tripleSpacys = self.extractTripleSpacy(self.nlpSpacy, inputText)
        # BUG FIX: work on a copy. The original aliased tripleStanfords and
        # removed items from the very list being iterated, which silently
        # skipped the element following each removal.
        tripleTemps = list(tripleStanfords)
        for tripleStanford in tripleStanfords:
            subject1 = tripleStanford.get(ExtractInformation.SUBJECT)
            relation1 = tripleStanford.get(ExtractInformation.RELATION)
            object1 = tripleStanford.get(ExtractInformation.OBJECT)
            for tripleSpacy in tripleSpacys:
                subject2 = tripleSpacy.get(ExtractInformation.SUBJECT)
                relation2 = tripleSpacy.get(ExtractInformation.RELATION)
                object2 = tripleSpacy.get(ExtractInformation.OBJECT)
                if ((subject1 == subject2)):
                    if ((object1 == object2) or (object1 in object2)):
                        text1 = self.nlpSpacy(relation1)
                        text2 = self.nlpSpacy(relation2)
                        if (text1.similarity(text2) > 0.6):
                            tripleTemps.remove(tripleStanford)
                            break
        triples = tripleTemps + tripleSpacys
        # Attach NER spans found inside each triple's subject and object.
        for triple in triples:
            subjectEnts = self.nlpSpacy(triple.get(ExtractInformation.SUBJECT))
            triple[ExtractInformation.SUBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in subjectEnts.ents
            ]
            objectEnts = self.nlpSpacy(triple.get(ExtractInformation.OBJECT))
            triple[ExtractInformation.OBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in objectEnts.ents
            ]
        return triples

    def extractTripleStanfordOpenIE(self, inputText):
        """Return OpenIE triples for *inputText*; on any client error, log
        and return an empty list (best-effort by design)."""
        triples = []
        try:
            triples = self.stanfordClient.annotate(inputText)
        except Exception as exception:
            print("--- extract Triple Stanford OpenIE Error " + str(exception))
        return triples

    def extractTripleSpacy(self, nlp, inputText):
        """Extract (entity, head+prep, child) triples sentence by sentence
        using dependency structure (spaCy 2.x API: span.merge, sent.string)."""
        docSeparate = nlp(inputText)
        sentences = [sent.string.strip() for sent in docSeparate.sents]
        triples = []
        for sentence in sentences:
            doc = nlp(sentence)
            # Merge entities and noun chunks into single tokens so each
            # ent.root covers the whole phrase.
            spans = list(doc.ents) + list(doc.noun_chunks)
            for span in spans:
                span.merge()
            for ent in doc.ents:
                preps = [
                    prep for prep in ent.root.head.children
                    if prep.dep_ == "prep"
                ]
                for prep in preps:
                    for child in prep.children:
                        triples.append({
                            ExtractInformation.SUBJECT: ent.text,
                            ExtractInformation.RELATION: "{} {}".format(
                                ent.root.head, prep),
                            ExtractInformation.OBJECT: child.text
                        })
        return triples

    def trainAdditionalEntity(self,
                              train_data,
                              label,
                              nlp,
                              model=None,
                              n_iter=30):
        """Fine-tune (or create) the NER pipe of *nlp* to recognise *label*.

        Uses the spaCy 2.x training loop (nlp.update with raw texts and
        annotation dicts). Returns the updated pipeline.
        """
        if ("ner" not in nlp.pipe_names):
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner)
        else:
            ner = nlp.get_pipe("ner")
        ner.add_label(label)
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()
        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]
        # only train NER
        with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once", category=UserWarning, module='spacy')
            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            for itn in range(n_iter):
                random.shuffle(train_data)
                batches = minibatch(train_data, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=0.35,
                               losses=losses)
                print("Losses", losses)
        return nlp

    def saveModel(self, output_dir, nlp, new_model_name):
        """Persist *nlp* to *output_dir* (created if missing) under a new
        model name; no-op when output_dir is None."""
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.meta["name"] = new_model_name  # rename model
            nlp.to_disk(output_dir)
            print("Saved model to", output_dir)