def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
    """Create the Stanford OpenIE client and the spaCy pipelines.

    :param modelSpacy: spaCy model name, stored and forwarded to ``initSpacy``
    :param modelCoref: stored and forwarded to ``initSpacy`` as its second
        argument
    """
    # Print the directory the spaCy package was imported from.
    spacy_location = os.path.dirname(spacy.__file__)
    print(spacy_location)

    if ExtractInformation.IS_GPU:
        spacy.prefer_gpu()

    self.modelSpacy, self.modelCoref = modelSpacy, modelCoref
    self.stanfordClient = StanfordOpenIE()

    # initSpacy returns the pair (coref pipeline, plain spaCy pipeline).
    pipelines = self.initSpacy(modelSpacy, modelCoref)
    self.nlpCoref, self.nlpSpacy = pipelines
def process(self, context):
    """Extract at most one keyword-related verb triple per row.

    Every row of ``context['_data']`` is annotated with Stanford OpenIE.
    A triple is kept when subject, relation and object are all non-empty,
    ``self._keyword`` occurs in the subject or the object, and the first
    WordNet synset of the relation is a verb. Among the kept triples of a
    row, the one with the longest object is selected.

    :param context: mapping whose ``'_data'`` entry is a DataFrame that is
        read through its ``'content'`` and ``'date'`` columns
    :return: ``{'_data': df}`` where ``df`` holds the selected triples,
        de-duplicated on (subject, relation, object); the frame is also
        written to ``self._csv`` when that attribute is truthy
    """
    super().process()
    structured_tweets = []
    with contextlib.redirect_stdout(None):
        with StanfordOpenIE() as client:
            for _, item in context['_data'].iterrows():
                triples = []
                for triple in client.annotate(item['content']):
                    if (triple['subject'] == ''
                            or triple['relation'] == ''
                            or triple['object'] == ''):
                        continue
                    if (self._keyword not in triple['subject']
                            and self._keyword not in triple['object']):
                        continue
                    relation_pos = wn.synsets(triple['relation'])
                    if relation_pos and relation_pos[0].pos() == 'v':
                        triple['date'] = item['date']
                        triples.append(triple)
                if triples:
                    # The original called sorted() and threw the result
                    # away, so the last-seen triple was always picked.
                    # Sort in place so [-1] really is the longest object.
                    triples.sort(key=lambda x: len(x['object']))
                    structured_tweets.append(triples[-1])
    df = pd.DataFrame(data=structured_tweets)
    if not df.empty:
        # drop_duplicates(subset=...) raises KeyError on a frame without
        # those columns, which is what an empty result produces.
        df = df.drop_duplicates(subset=['subject', 'relation', 'object'])
    if self._csv:
        df.to_csv(self._csv, index=False)
    return {'_data': df}
def extraction(text):
    """
    Extract relation triples and named entities from a news item.

    The text is coreference-resolved, selected tokens are replaced by their
    lemma, and the result is annotated with Stanford OpenIE.

    :param text: str with input news content
    :return: tuple ``(triples, entities)`` where ``triples`` is a list of
             (head, relation, tail) tuples and ``entities`` is a
             de-duplicated list of entity texts whose label is whitelisted
    """
    nlp = spacy.load('en_core_web_lg')
    text_resolved = resolve_coreferences(nlp, text)
    doc = nlp(text_resolved)

    # NOTE(review): 100 and 87 are presumably spaCy's VERB and AUX
    # part-of-speech ids -- confirm against spacy.symbols.
    words = []
    for token in doc:
        use_lemma = token.pos in (100, 87)
        words.append(token.lemma_ if use_lemma else token.text)
    lemmatized_text = ' '.join(words)

    with StanfordOpenIE() as client:
        triples_dict = client.annotate(lemmatized_text)

    valid_ents = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT',
                  'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'DATE', 'TIME']
    entities = [ent.text for ent in doc.ents if ent.label_ in valid_ents]
    triples = [(d['subject'], d['relation'], d['object'])
               for d in triples_dict]
    return triples, list(set(entities))
def process_text(text, name):
    """Print the OpenIE triples of *text* and render them as a graph.

    Prints the set of the (up to) 15 most frequent relation strings, then
    every triple, and finally writes a graphviz rendering to
    ``<name>_graph.png``.

    :param text: text to annotate
    :param name: prefix of the generated image file name
    """
    with StanfordOpenIE() as client:
        triples = []
        relation_counts = Counter()
        for triple in client.annotate(text):
            relation_counts[triple['relation']] += 1
            triples.append(triple)

        terms = {relation for relation, _ in relation_counts.most_common(15)}
        print(terms)

        for found in triples:
            print(found)

        graph_image = name + '_graph.png'
        client.generate_graphviz_graph(text, graph_image)
        print('Graph generated: %s.' % graph_image)
def process_relation_extraction(text):
    """Annotate *text* with Stanford OpenIE and return the triples.

    :param text: text to annotate
    :return: ``pd.DataFrame`` built from the list of annotated triples
    """
    with StanfordOpenIE() as client:
        rows = list(client.annotate(text))
    return pd.DataFrame(rows)
def openie_subj(data):
    """
    Count repeated leading triple fields for every item of *data*.

    input:  data - column of pd.DataFrame() ('content'/'lead'/...)
    output: list with one OrderedDict per item, mapping the first value of
            each OpenIE relation dict (presumably the subject) to its
            number of occurrences, most frequent first; only values that
            occur more than once are kept
    """
    dependences = []
    with StanfordOpenIE() as client:
        for item in tqdm(data, total=len(data)):
            first_fields = [
                tuple(relation.values())[0]
                for relation in client.annotate(item)
            ]
            counts = Counter(first_fields)
            # most_common() sorts by count, descending, like the explicit
            # sorted(..., reverse=True) it replaces.
            repeated = OrderedDict(
                (key, val) for key, val in counts.most_common() if val > 1
            )
            dependences.append(repeated)
    return dependences
def information_extraction(text):
    """Return the first OpenIE triple found in *text*.

    :param text: text to annotate
    :return: the first element when the annotation result is a non-empty
        list, ``np.nan`` when it is an empty list, otherwise the raw
        annotation result
    """
    with StanfordOpenIE() as client:
        annotations = client.annotate(text)
        if not isinstance(annotations, list):
            return annotations
        if len(annotations) == 0:
            return np.nan
        return annotations[0]
def tweet_token(tweet):
    """Annotate *tweet* with Stanford OpenIE and serialise a triple to JSON.

    :param tweet: text to annotate
    :return: JSON string of the last triple produced by the annotator, or
        ``None`` when the annotator yields no triple
    """
    # Initialise up front: without this, a tweet with no triples hit an
    # UnboundLocalError at the return statement.
    triple_token = None
    with StanfordOpenIE() as client:
        text = tweet
        print('Text: %s.' % text)
        for triple in client.annotate(text):
            # convert token from coreNLP to JSON
            triple_token = json.dumps(triple)
    return triple_token
def stanford_openie():
    """Run the Stanford OpenIE demo on a fixed two-sentence text.

    Prints the text and every extracted triple, then writes a graphviz
    rendering of the text to ``graph.png``.
    """
    from openie import StanfordOpenIE

    sample = 'Barack Obama was born in Hawaii. Richard Manning wrote this sentence.'
    image_path = 'graph.png'
    with StanfordOpenIE() as client:
        text = sample
        print('Text: %s.' % text)
        for found in client.annotate(text):
            print('|-', found)

        client.generate_graphviz_graph(text, image_path)
        print('Graph generated: %s.' % image_path)
def load_ie():
    """
    Load the Stanford Open IE based triple extractor.

    Returns:
        function: bound ``annotate`` method of a freshly created
        ``StanfordOpenIE`` client
    """
    print("Loading Open IE Pipeline...")
    # Imported lazily, after the progress message has been printed.
    from openie import StanfordOpenIE

    return StanfordOpenIE().annotate
class TripleExtractor:
    """Thin wrapper around a Stanford OpenIE client with text cleaning."""

    def __init__(self):
        # Matches every run of characters outside the 7-bit ASCII range.
        self.regex_pattern = r'[^\x00-\x7F]+'
        self.openie = StanfordOpenIE()

    def clean_text(self, text):
        """Replace each run of non-ASCII characters in *text* with a space."""
        cleaned = re.sub(self.regex_pattern, ' ', text)
        return cleaned

    def extract_entities(self, text):
        """Return ``self.nlp.ner(text)``.

        NOTE(review): ``self.nlp`` is never assigned in this class, so the
        attribute has to be provided elsewhere -- confirm.
        """
        tagger = self.nlp
        return tagger.ner(text)

    def get_triples(self, text):
        """Return the OpenIE annotation of *text*."""
        return self.openie.annotate(text)
def SVO_extractor(self, data):
    """Collect subject/relation/object columns for every row header.

    :param data: DataFrame read through its ``'header'`` and ``'date'``
        columns
    :return: dict of parallel lists under the keys ``'s_pos'``, ``'v_pos'``,
        ``'o_pos'``, ``'label'`` (always 1) and ``'date'``
    """
    with StanfordOpenIE() as client:
        svo_pos = {
            column: []
            for column in ('s_pos', 'v_pos', 'o_pos', 'label', 'date')
        }
        for _, row in tqdm(data.iterrows()):
            try:
                for found in client.annotate(row['header']):
                    svo_pos['s_pos'].append(found['subject'])
                    svo_pos['v_pos'].append(found['relation'])
                    svo_pos['o_pos'].append(found['object'])
                    svo_pos['label'].append(1)
                    svo_pos['date'].append(row['date'])
            except AttributeError:
                # Rows that trigger an AttributeError are skipped silently.
                continue
    return svo_pos
def graph_annotations(text, properties, doc_key, graph_out_loc):
    """
    Render the OpenIE annotations of a text as a graphviz image, using
    philipperemy's openie wrapper.

    parameters:
        text, str: text to annotate and graph
        properties, dict: properties dict passed to the OpenIE client
        doc_key, str: name of the document, used to create the output
            file name
        graph_out_loc, str: directory in which to save the output file

    returns: None
    """
    save_name = f'{graph_out_loc}/{doc_key}_openie_graph.png'
    with StanfordOpenIE(properties=properties) as openie_client:
        render = openie_client.generate_graphviz_graph
        render(text, save_name)
def main():
    """
    Open Information Extraction example using spaCy.

    - sample text taken from https://en.wikipedia.org/wiki/World_War_II
    - output is compared with the StanfordOpenIE library,
      https://github.com/philipperemy/Stanford-OpenIE-Python
    """
    text = """The Empire of Japan aimed to dominate Asia and the Pacific and was already at war with the Republic of China in 1937, but the world war is generally said to have begun on 1 September 1939 with the invasion of Poland by Germany and subsequent declarations of war on Germany by France and the United Kingdom. From late 1939 to early 1941, in a series of campaigns and treaties, Germany conquered or controlled much of continental Europe, and formed the Axis alliance with Italy and Japan. Under the Molotov-Ribbentrop Pact of August 1939, Germany and the Soviet Union partitioned and annexed territories of their European neighbours, Poland, Finland, Romania and the Baltic states. The war continued primarily between the European Axis powers and the coalition of the United Kingdom and the British Commonwealth, with campaigns including the North Africa and East Africa campaigns, the aerial Battle of Britain, the Blitz bombing campaign, the Balkan Campaign as well as the long-running Battle of the Atlantic. In June 1941, the European Axis powers launched an invasion of the Soviet Union, opening the largest land theatre of war in history, which trapped the major part of the Axis' military forces into a war of attrition. 
In December 1941, Japan attacked the United States and European territories in the Pacific Ocean, and quickly conquered much of the Western Pacific."""

    nlp = spacy.load("en_core_web_sm")
    sentences = list(nlp(text).sents)

    # First pass: the spaCy-based extractor, one SVO triple per sentence.
    for sentence in sentences:
        svo = extract_svo(sentence, nlp)
        subject, verb, attribute = svo
        print("Subject: ", subject, "| Verb: ", verb, "| Obj: ", attribute)

    # Second pass: the same sentences through Stanford OpenIE.
    print("================ With StanfordOpenIe:")
    with StanfordOpenIE() as client:
        for sentence in sentences:
            for found in client.annotate(sentence.text):
                print('|-', found)
def run(self):
    """Attach Stanford OpenIE triples to every sentence of ``self.input``.

    Nothing is annotated unless ``self.config['extraction']['oie']`` is
    ``'stanford'``. In ``'qa'`` mode sentences are found under
    ``qa['utterances'][*]['sents']``; in ``'subtitle'`` mode under
    ``episode[*]['scene'][*]['sents']``. Each sentence dict gets a
    ``'triples'`` entry computed from its ``'statement'``.

    :return: ``self.input`` (annotated in place)
    """
    if self.config['extraction']['oie'] == 'stanford':
        with StanfordOpenIE() as client:
            mode = self.config['mode']
            if mode == 'qa':
                sentences = (
                    sent
                    for qa in self.input
                    for utter in qa['utterances']
                    for sent in utter['sents']
                )
            elif mode == 'subtitle':
                sentences = (
                    sent
                    for ep in self.input
                    for scene in ep
                    for u in scene['scene']
                    for sent in u['sents']
                )
            else:
                sentences = ()

            # The generators are lazy, so traversal and annotation
            # interleave exactly as in nested loops.
            for sent in sentences:
                sent['triples'] = client.annotate(sent['statement'])
        print('Stanford Open IE done..')
    return self.input
def start(new_book: str, book_file_name: str):
    """
    Extract the relations between the entities in the book, using the
    Python3 wrapper for Stanford OpenIE.

    Input:
        new_book: the pre-processed book text
        book_file_name: name of the book as stored on disk; used in the
            names of the generated files

    Returns:
        Nothing. Relations accepted by ``is_relation_good`` are written to
        ``h_entity_relation_list_<book_file_name>.txt`` and one graph
        image is generated per processed text block.
    """
    # Only the first 40,000 characters are used, split into four blocks of
    # 10,000 characters to keep each annotation call manageable.
    blocks = [new_book[x:x + 10000] for x in range(0, 40000, 10000)]

    relation_path = "h_entity_relation_list_" + book_file_name + '.txt'
    # A context manager guarantees the relation file is flushed and closed;
    # the original opened the file and never closed it.
    with open(relation_path, 'w+') as relation_file:
        relation_file.write(
            "This File contains Entity relations (of nouns) extracted"
            " from first 8,000 words from the book " + book_file_name + "\n")
        relation_file.write(
            "\n--------------------------------------------------\n\n\n")

        # passing text to StanfordOpenIE to process
        with StanfordOpenIE() as client:
            for text in blocks:
                for triple in client.annotate(text):
                    # Keep only relations that is_relation_good accepts;
                    # triples missing an expected key are skipped.
                    try:
                        if is_relation_good(triple):
                            relation_file.write("|- " + str(triple) + '\n')
                    except KeyError:
                        pass
                # Random suffix so successive blocks do not overwrite each
                # other's image.
                graph_image = ('h_entity_relation_graph_' + book_file_name
                               + '_' + str(random.randint(0, 100000))
                               + '_.png')
                client.generate_graphviz_graph(text, graph_image)
class OpenIEBaselineModel:
    """Baseline that turns supporting sentences into OpenIE triples."""

    # Subjects that are replaced by the supporting entity's name.
    _PRONOUN_SUBJECTS = ("it", "they", "she", "he")

    def __init__(self):
        from openie import StanfordOpenIE
        self.openie_client = StanfordOpenIE()
        self.spacy_nlp = spacy.load("en_core_web_sm")

    def predict(self, inst, supporting_facts):
        """Extract reasoning steps from the supporting sentences.

        :param inst: mapping whose ``"context"`` entry converts via
            ``dict()`` to ``{entity: [sentence, ...]}``
        :param supporting_facts: iterable of ``(entity, sentence_index)``
            pairs
        :return: list of ``(entity, sentence_index, (subject, relation,
            object))`` tuples; a subject that is one of "it", "they",
            "she", "he" is replaced by the entity
        """
        ent2doc = dict(inst["context"])
        reasoning_steps = []

        for sup_ent, sup_sent_id in supporting_facts:
            # ">=" rather than ">": an index equal to the sentence count
            # is already out of range and used to raise IndexError.
            if sup_sent_id >= len(ent2doc[sup_ent]):
                continue

            sup_sent = ent2doc[sup_ent][sup_sent_id]

            for triplet in self.openie_client.annotate(sup_sent):
                if triplet["subject"] in self._PRONOUN_SUBJECTS:
                    triplet["subject"] = sup_ent

                reasoning_steps.append((
                    sup_ent,
                    sup_sent_id,
                    (triplet["subject"], triplet["relation"],
                     triplet["object"]),
                ))

        return reasoning_steps
def get_triples(corpus=''):
    """Return the unique OpenIE triples of *corpus*, in first-seen order.

    Extraction is best effort: any error raised while creating the client
    or annotating yields an empty list.

    :param corpus: text to annotate
    :return: list of ``(subject, relation, object)`` tuples without
        duplicates
    """
    try:
        with StanfordOpenIE() as client:
            triples = [(t['subject'], t['relation'], t['object'])
                       for t in client.annotate(corpus)]
            # dict.fromkeys de-duplicates while preserving order.
            return list(dict.fromkeys(triples))
    except Exception:
        # "except Exception" instead of a bare "except" so that
        # KeyboardInterrupt and SystemExit still propagate.
        return []
from flask import Flask, jsonify, request, send_file
from openie import StanfordOpenIE
import math
import os

app = Flask(__name__)
# One OpenIE client is created at import time and shared by all requests.
client = StanfordOpenIE()


@app.route('/getGraph', methods=['POST'])
def get_image():
    """Render the OpenIE graph of the posted note and return it as a PNG.

    Expects a JSON object body with a string ``note`` field. Responds with
    HTTP 400 and a JSON error when the body is missing, is not a JSON
    object, or has no string ``note``.
    """
    # silent=True yields None instead of raising on a missing/invalid body,
    # so malformed requests get an explicit 400 rather than a 500.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('note'), str):
        return jsonify({'error': "JSON body with a string 'note' field is required"}), 400

    note = payload['note']
    # NOTE(review): every request writes the same file, so concurrent
    # requests can overwrite each other's image.
    graph_image = './graph.png'
    client.generate_graphviz_graph(note, graph_image)
    return send_file(graph_image, mimetype='image/png')


if __name__ == '__main__':
    # NOTE(review): debug=True together with host='0.0.0.0' exposes the
    # interactive debugger on every interface -- do not deploy like this.
    app.run(debug=True, port=int(os.environ.get('PORT', 9090)), host='0.0.0.0')
def svo(self):
    """Return the cached ``StanfordOpenIE`` client, creating it on first use."""
    if self._svo is not None:
        return self._svo
    # First access: build the client and remember it for later calls.
    self._svo = StanfordOpenIE()
    return self._svo
'--interval', type=int, nargs=2, required=True, help='Interval of lines to read. (default: %(default)s)') args = parser.parse_args() maybe_make_directory(args.output_dir) return args args = parse_args() with StanfordOpenIE() as client: with open(args.input_path, 'r', encoding='utf8') as file: corpus = file.read().replace('\n', ' ').replace('\r', '') triples_corpus = client.annotate(corpus[args.interval[0]:args.interval[1]]) print('Found %s triples in the corpus.' % len(triples_corpus)) basename = os.path.basename(args.input_path) filename = os.path.splitext(basename)[0] with open( os.path.join( args.output_dir, f"{filename}_{args.interval[0]}_{args.interval[1]}.txt"), 'w') as output_file: for triple in triples_corpus: output_file.write(str(triple) + "\n")
def stanfordOIE(texts):
    """Annotate every text with a single Stanford OpenIE client.

    :param texts: iterable of texts to annotate
    :return: list with one annotation result per input text, in order
    """
    with StanfordOpenIE() as client:
        return [client.annotate(text) for text in texts]
class ExtractInformation:
    """Triple extraction that combines Stanford OpenIE with spaCy.

    Pronouns are first resolved with a neuralcoref pipeline; triples are
    then extracted by both Stanford OpenIE and a spaCy preposition
    heuristic, merged, and decorated with the named entities found in
    their subject and object.
    """

    IS_GPU = True

    # Keys of the triple dictionaries.
    SUBJECT = 'subject'
    SUBJECT_ENTITY = 'subject_entity'
    RELATION = 'relation'
    OBJECT = 'object'
    OBJECT_ENTITY = 'object_entity'

    # Keys of the entity dictionaries.
    ENTITY_NAME = 'name'
    ENTITY_TYPE = 'entity_type'
    ENTITY_SUBJECT_OTHER = 'subject_other'
    ENTITY_OBJECT_OTHER = 'object_other'

    def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
        """Create the OpenIE client and load both spaCy pipelines.

        :param modelSpacy: spaCy model name forwarded to ``initSpacy``
        :param modelCoref: forwarded to ``initSpacy`` as its second argument
        """
        print(os.path.dirname(spacy.__file__))
        if ExtractInformation.IS_GPU:
            spacy.prefer_gpu()
        self.modelSpacy = modelSpacy
        self.modelCoref = modelCoref
        self.stanfordClient = StanfordOpenIE()
        self.nlpCoref, self.nlpSpacy = self.initSpacy(modelSpacy, modelCoref)

    def initSpacy(self, modelSpacy, modelCoref):
        """Load the spaCy pipelines.

        :param modelSpacy: name of the spaCy model to load
        :param modelCoref: name under which the neuralcoref component is
            added to the ``'en'`` pipeline
        :return: tuple ``(nlpCoref, nlpSpacy)``
        """
        nlpSpacy = spacy.load(modelSpacy)
        nlpCoref = spacy.load('en')
        coref = neuralcoref.NeuralCoref(nlpCoref.vocab)
        nlpCoref.add_pipe(coref, name=modelCoref)
        return nlpCoref, nlpSpacy

    # Stage 1: replace pronouns with nouns, example:
    # My sister has a dog. She loves him.
    # => Cluster: [My sister: [My sister, She], a dog: [a dog, him]]
    def replacePronounsToNoun(self, nlp, inputText):
        """Resolve coreferences in *inputText*.

        :param nlp: pipeline whose docs expose ``_.has_coref`` and
            ``_.coref_resolved``
        :param inputText: text to resolve
        :return: tuple ``(has_coref, text)`` where ``text`` is the resolved
            text when coreferences were found, else the input unchanged
        """
        # TODO: unicode input text
        outputText = inputText
        doc = nlp(inputText)
        if doc._.has_coref:
            outputText = doc._.coref_resolved
        return doc._.has_coref, outputText

    # Stage 2: extract entities
    def extractEntities(self, nlp, inputText):
        """Return the named entities of *inputText*.

        :return: list of dicts with the keys ``ENTITY_NAME`` (entity text)
            and ``ENTITY_TYPE`` (entity label)
        """
        doc = nlp(inputText)
        return [
            {
                ExtractInformation.ENTITY_NAME: ent.text,
                ExtractInformation.ENTITY_TYPE: ent.label_,
            }
            for ent in doc.ents
        ]

    # Stage 3: extract triples
    def extractTriple(self, inputText):
        """Extract triples from *inputText* with both extractors.

        A Stanford triple is dropped when a spaCy triple has the same
        subject, an equal or containing object, and a relation whose
        similarity to the Stanford relation exceeds 0.6.

        :param inputText: text to process
        :return: list of triple dicts (remaining Stanford triples followed
            by the spaCy triples), each extended with ``SUBJECT_ENTITY``
            and ``OBJECT_ENTITY`` lists of
            ``(text, start_char, end_char, label)`` tuples
        """
        hasCoref, inputText = self.replacePronounsToNoun(
            self.nlpCoref, inputText)
        # TODO: similarity relation
        tripleStanfords = self.extractTripleStanfordOpenIE(inputText)
        tripleSpacys = self.extractTripleSpacy(self.nlpSpacy, inputText)

        # Build a new list instead of removing from the list being
        # iterated: the original aliased both names to one list, so every
        # removal made the loop skip the following Stanford triple.
        keptStanfords = [
            tripleStanford for tripleStanford in tripleStanfords
            if not self._isCoveredBySpacy(tripleStanford, tripleSpacys)
        ]

        triples = keptStanfords + tripleSpacys
        for triple in triples:
            subjectEnts = self.nlpSpacy(triple.get(ExtractInformation.SUBJECT))
            triple[ExtractInformation.SUBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in subjectEnts.ents
            ]
            objectEnts = self.nlpSpacy(triple.get(ExtractInformation.OBJECT))
            triple[ExtractInformation.OBJECT_ENTITY] = [
                (e.text, e.start_char, e.end_char, e.label_)
                for e in objectEnts.ents
            ]
        return triples

    def _isCoveredBySpacy(self, tripleStanford, tripleSpacys):
        """Tell whether a spaCy triple duplicates the given Stanford triple."""
        subject1 = tripleStanford.get(ExtractInformation.SUBJECT)
        relation1 = tripleStanford.get(ExtractInformation.RELATION)
        object1 = tripleStanford.get(ExtractInformation.OBJECT)
        for tripleSpacy in tripleSpacys:
            if subject1 != tripleSpacy.get(ExtractInformation.SUBJECT):
                continue
            object2 = tripleSpacy.get(ExtractInformation.OBJECT)
            if not (object1 == object2 or object1 in object2):
                continue
            relation2 = tripleSpacy.get(ExtractInformation.RELATION)
            text1 = self.nlpSpacy(relation1)
            text2 = self.nlpSpacy(relation2)
            if text1.similarity(text2) > 0.6:
                return True
        return False

    def extractTripleStanfordOpenIE(self, inputText):
        """Annotate *inputText* with Stanford OpenIE.

        :return: the annotator's triples, or an empty list when the
            annotator raises (the error is printed)
        """
        triples = []
        try:
            triples = self.stanfordClient.annotate(inputText)
        except Exception as exception:
            print("--- extract Triple Stanford OpenIE Error " + str(exception))
        return triples

    def extractTripleSpacy(self, nlp, inputText):
        """Extract (entity, "head prep", child) triples with spaCy.

        For every entity of every sentence, each preposition attached to
        the head of the entity's root produces one triple per child of
        that preposition.

        :param nlp: spaCy pipeline used for sentence splitting and parsing
        :param inputText: text to process
        :return: list of dicts with the keys ``SUBJECT``, ``RELATION`` and
            ``OBJECT``
        """
        docSeparate = nlp(inputText)
        sentences = [sent.string.strip() for sent in docSeparate.sents]
        triples = []
        for sentence in sentences:
            doc = nlp(sentence)
            # Merge entities and noun chunks into single tokens first.
            spans = list(doc.ents) + list(doc.noun_chunks)
            for span in spans:
                span.merge()
            for ent in doc.ents:
                preps = [
                    prep for prep in ent.root.head.children
                    if prep.dep_ == "prep"
                ]
                for prep in preps:
                    for child in prep.children:
                        triples.append({
                            ExtractInformation.SUBJECT: ent.text,
                            ExtractInformation.RELATION:
                                "{} {}".format(ent.root.head, prep),
                            ExtractInformation.OBJECT: child.text,
                        })
        return triples

    def trainAdditionalEntity(self, train_data, label, nlp, model=None,
                              n_iter=30):
        """Train the NER component of *nlp* on an additional entity label.

        :param train_data: list of ``(text, annotations)`` pairs; shuffled
            in place on every iteration
        :param label: entity label added to the NER component
        :param nlp: spaCy pipeline to train
        :param model: when ``None`` training starts from scratch
            (``begin_training``), otherwise it is resumed
        :param n_iter: number of passes over the training data
        :return: the trained *nlp*
        """
        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner)
        else:
            ner = nlp.get_pipe("ner")
        ner.add_label(label)
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()

        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]
        # only train NER. Both context managers must be entered, hence the
        # comma: "A and B" evaluates to just one of the two objects, so the
        # disabled pipes were never restored when the block ended.
        with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once", category=UserWarning,
                                    module='spacy')
            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            for itn in range(n_iter):
                random.shuffle(train_data)
                batches = minibatch(train_data, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                               losses=losses)
                print("Losses", losses)
        return nlp

    def saveModel(self, output_dir, nlp, new_model_name):
        """Save *nlp* to *output_dir* under a new model name.

        Does nothing when *output_dir* is ``None``. The directory is
        created when it does not exist yet.
        """
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.meta["name"] = new_model_name  # rename model
            nlp.to_disk(output_dir)
            print("Saved model to", output_dir)
def __init__(self):
    """Create the Stanford OpenIE client and load the small English spaCy model."""
    # The openie package is imported lazily, when an instance is built.
    from openie import StanfordOpenIE

    openie_client = StanfordOpenIE()
    self.openie_client = openie_client
    self.spacy_nlp = spacy.load("en_core_web_sm")
def extract_triples(text):
    """Return the Stanford OpenIE triples of *text* as a list.

    :param text: text to annotate
    :return: list of the triples yielded by the annotator
    """
    with StanfordOpenIE() as client:
        found = list(client.annotate(text))
    return found
def __init__(self):
    """Store the non-ASCII matching pattern and create the OpenIE client."""
    # Matches every run of characters outside the 7-bit ASCII range.
    non_ascii_runs = r'[^\x00-\x7F]+'
    self.regex_pattern = non_ascii_runs
    self.openie = StanfordOpenIE()