def line_to_predictions(ner_fea, dec, json_eg, attr, content_type, word_embeddings,
                        trigger_generator, trigger_model, arg_generator):
    """
    :type word_embeddings: embeddings.word_embeddings.WordEmbedding
    :type trigger_generator: event.event_trigger.EventTriggerGenerator
    :type trigger_model: model.event_cnn.EventExtractionModel
    :type arg_generator: event.event_argument.EventArgumentGenerator
    """
    global spacy_en

    content = find(attr, json_eg)  # json_eg.get(attr)

    offset = 0
    all_predictions = []
    if content is not None:
        if isinstance(content, list):
            content = '\n'.join(content)

        for line in content.split('\n'):
            doc_ner_predictions = []
            sentences = get_sentences(line, content_type)
            if sentences is not None:
                for sent in sentences:
                    sent_predictions = decode_sentence(ner_fea, dec, content, sent, offset, content_type)
                    doc_ner_predictions.extend(sent_predictions)
                    all_predictions.extend(sent_predictions)

            if content_type == 'Post':
                doc = Document('dummy', line)
                for i, p in enumerate(doc_ner_predictions):
                    id = 'em-{}'.format(i)
                    doc.add_entity_mention(
                        EntityMention(id, IntPair(p['start'], p['end']), p['text'], p['label']))
                doc.annotate_sentences(spacy_en, word_embeddings)

                (trigger_examples, trigger_data, trigger_data_list, trigger_label) = \
                    generate_trigger_data_feature(trigger_generator, [doc])
                trigger_predictions = trigger_model.predict(trigger_data_list)

            offset += len(line) + 1  # +1 to account for newline

    # a list of dicts, one for each predicted NE mention
    if len(all_predictions) > 0:
        if 'extractions' not in json_eg:
            json_eg['extractions'] = {}
        json_eg['extractions'][attr] = all_predictions

    return json_eg
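# A hedged usage sketch for line_to_predictions. The attribute name
# 'description' and the input dict below are invented for illustration; the
# real attribute names and the ner_fea/dec/model objects are built elsewhere
# in the pipeline.
#
#   json_eg = {'description': 'Attackers exploited a flaw in the mail server.'}
#   json_eg = line_to_predictions(ner_fea, dec, json_eg, 'description', 'Blog',
#                                 word_embeddings, trigger_generator,
#                                 trigger_model, arg_generator)
#   # any predicted NE mentions are attached as a list of dicts (with the
#   # 'start', 'end', 'text', 'label' keys used above) at:
#   #   json_eg['extractions']['description']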
def process_times(cls, doc, document_node):
    """
    :type doc: text.text_theory.Document
    :type document_node: xml.etree.ElementTree.Element
    """
    for time_node in document_node.findall('timex2'):
        time_id = time_node.attrib['ID']
        all_mentions = time_node.findall('timex2_mention')
        for mention_node in all_mentions:
            mention_id = mention_node.attrib['ID']
            (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
            em = EntityMention(mention_id, IntPair(start, end), text, 'Time')
            doc.add_entity_mention(em)
def process_values(cls, doc, document_node):
    """
    :type doc: text.text_theory.Document
    :type document_node: xml.etree.ElementTree.Element
    """
    for value_node in document_node.findall('value'):
        value_id = value_node.attrib['ID']
        value_type = value_node.attrib['TYPE']
        all_mentions = value_node.findall('value_mention')
        for mention_node in all_mentions:
            mention_id = mention_node.attrib['ID']
            (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
            em = EntityMention(mention_id, IntPair(start, end), text, value_type)
            doc.add_entity_mention(em)
def process_entities(cls, doc, document_node):
    """
    :type doc: text.text_theory.Document
    :type document_node: xml.etree.ElementTree.Element
    """
    all_entities = document_node.findall('entity')
    for entity_node in all_entities:
        entity_id = entity_node.attrib['ID']
        entity_type = entity_node.attrib['TYPE']
        entity_subtype = entity_node.attrib['SUBTYPE']
        all_mentions = entity_node.findall('entity_mention')
        for mention_node in all_mentions:
            mention_id = mention_node.attrib['ID']
            head = mention_node.find('head')
            (text, start, end) = cls.process_xml_charseq(head[0])
            em = EntityMention(mention_id, IntPair(start, end), text,
                               entity_type + '.' + entity_subtype)
            doc.add_entity_mention(em)
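# For orientation, a sketch of the APF-style XML that process_entities,
# process_values, and process_times appear to assume. The exact charseq
# attribute names are an assumption here, and the offsets in the resulting
# EntityMention are whatever process_xml_charseq returns:
#
#   <entity ID="e1" TYPE="PER" SUBTYPE="Individual">
#     <entity_mention ID="e1-m1">
#       <head><charseq START="12" END="17">hacker</charseq></head>
#     </entity_mention>
#   </entity>
#
# Each mention then becomes e.g. EntityMention('e1-m1', IntPair(start, end),
# 'hacker', 'PER.Individual'); values and times follow the same pattern via
# their value_mention/timex2_mention nodes.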
def extract_sentence_annotation(text, offset):
    """offset: char offset thus far (excluding xml tags) from prior sentences."""
    start_tag = 0
    end_tag = -1
    raw_text = ''
    entity_mentions = []

    # ignore everything starting from 'REMOVED_URL'
    url_index = text.find(' REMOVED_URL', 0)
    if url_index != -1:
        text = text[0:url_index]

    start_tag = text.find('<ENAMEX', 0)
    while start_tag != -1:
        raw_text += text[end_tag+1:start_tag]
        end_tag = text.find('>', start_tag)
        entity_type = re.search(r' TYPE="(.*)"', text[start_tag:end_tag]).group(1)

        start_tag = text.find('</ENAMEX>', end_tag)
        mention_text = text[end_tag+1:start_tag]
        start = offset + len(raw_text)
        end = offset + len(raw_text) + len(mention_text)

        if '-' in mention_text and entity_type.endswith('DESC'):
            print(('Rejecting %s[%s], because spaCy will split the string into multiple tokens, '
                   'and DESC should always be just a single word'
                   % (entity_type, mention_text)).encode('utf-8'))
        else:
            (new_mention_text, prefix_length, suffix_length) = strip_mention_text(mention_text)
            if new_mention_text != mention_text:
                print(('Revising %s to %s' % (mention_text, new_mention_text)).encode('utf-8'))
            id = 'm-' + str(start+prefix_length) + '-' + str(end-suffix_length)
            entity_mentions.append(
                EntityMention(id, IntPair(start+prefix_length, end-suffix_length),
                              new_mention_text, entity_type))

        raw_text += mention_text
        end_tag = text.find('>', start_tag)
        start_tag = text.find('<ENAMEX', end_tag)

    raw_text += text[end_tag+1:]
    return (raw_text, entity_mentions)
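# A minimal, invented illustration of extract_sentence_annotation (assuming
# strip_mention_text leaves 'attacker' unchanged):
#
#   text = 'The <ENAMEX TYPE="PER">attacker</ENAMEX> struck again.'
#   (raw_text, mentions) = extract_sentence_annotation(text, 0)
#   # raw_text -> 'The attacker struck again.'
#   # mentions -> [EntityMention('m-4-12', IntPair(4, 12), 'attacker', 'PER')]
#
# Offsets count characters in the de-tagged text, so 'attacker' spans [4, 12).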
def _read_annotation_file(cls, infile, event_type, text):
    """
    :type infile: str
    :type event_type: str
    :type text: str
    :param text: this is the raw text corresponding to the annotation

    Returns: list[text.text_theory.Event]
    """
    docid = os.path.basename(infile)

    events = []
    """:type: list[text.text_theory.Event]"""
    negative_spans = []
    """:type: list[text.text_span.TextSpan]"""
    anchors_not_in_eventspans = []  # these might be in negative spans
    """:type: list[text.text_span.Anchor]"""

    with open(infile, 'r') as f:
        for line in f:
            tokens = line.strip().split()
            span_type = tokens[0]
            start = int(tokens[1])
            end = int(tokens[2]) + 1

            text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
            end = start + len(text_string)

            if '<' in text_string or '>' in text_string:
                print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                continue

            if span_type == event_type:
                id = '{}-e{}'.format(docid, len(events))
                event_span = EventSpan(id, IntPair(start, end), text_string, event_type)
                e = Event(id, event_type)
                e.add_event_span(event_span)
                events.append(e)
            elif '/' in span_type:  # this is an event argument
                em = EntityMention('dummy', IntPair(start, end), text_string, 'dummy')
                event_role = span_type.split('/')[1]
                e = cls._find_event_containing_span(events, start, end)
                if e is None:
                    print('Cannot find an event span for {} {} (start,end)=({},{}) "{}". Skipping.'.format(
                        event_type, docid, start, end, text_string))
                else:
                    arg_id = '{}-a{}'.format(e.id, e.number_of_arguments())
                    e.add_argument(EventArgument(arg_id, em, event_role))
            elif span_type == 'anchor':
                e = cls._find_event_containing_span(events, start, end)
                anchor = Anchor('dummy', IntPair(start, end), text_string, event_type)
                if e is None:  # it might be in a negative span
                    anchors_not_in_eventspans.append(anchor)
                else:
                    e.add_anchor(anchor)
            elif span_type == 'negative':
                negative_spans.append(TextSpan(IntPair(start, end), text_string))
            elif span_type == 'interesting':
                pass  # we discard these for now

    for anchor in anchors_not_in_eventspans:
        found = False
        for span in negative_spans:
            if span.start_char_offset() <= anchor.start_char_offset() and \
                    anchor.end_char_offset() <= span.end_char_offset():
                found = True
                break
        if not found:
            print('Cannot find an event nor negative span for anchor {} {} (start,end)=({},{}) "{}". Skipping.'.format(
                event_type, docid, anchor.start_char_offset(), anchor.end_char_offset(),
                anchor.text.replace(' ', '_')))

    # keep only events with at least one anchor
    return [event for event in events if event.number_of_anchors() > 0]
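# The annotation file read above holds one whitespace-separated record per
# line: a span type, a start offset, and an inclusive end offset into the raw
# text. An invented example for event_type='Attack':
#
#   Attack 100 180          (an event span)
#   anchor 120 126          (attached to the enclosing event span)
#   Attack/Victim 150 158   (an argument with role 'Victim')
#   negative 300 400        (a span known to contain no event)
#
# Only events that end up with at least one anchor are returned.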
ner_predictions.extend(decode_sentence(ner_fea, ner_decoder, content, sent, offset=0, content_type='Blog'))

for p in ner_predictions:
    print(p)

# create a document based on text content, add NER predictions as EntityMentions, then apply spaCy to
# perform sentence segmentation and tokenization, and use spaCy tokens to back the EntityMentions
doc = Document('dummy', content)
for i, p in enumerate(ner_predictions):
    id = 'em-{}'.format(i)
    doc.add_entity_mention(
        EntityMention(id, IntPair(p['start'], p['end']), p['text'], p['label']))
doc.annotate_sentences(spacy_en, word_embeddings)

event_domain = None
if params.get_string('domain') == 'cyber':
    # initialize a particular event domain, which stores info on the event types and event roles
    event_domain = CyberDomain()

arg_generator = EventArgumentGenerator(event_domain, params)
trigger_generator = EventTriggerGenerator(event_domain, params)

(trigger_examples, trigger_data, trigger_data_list, trigger_label) = \
    generate_trigger_data_feature(trigger_generator, [doc])

print('==== Loading Trigger model ====')
trigger_model = load_trigger_model(params.get_string('event_model_dir'))
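# Mirroring line_to_predictions above, the loaded model can then presumably be
# applied to the generated feature list:
#
#   trigger_predictions = trigger_model.predict(trigger_data_list)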