Example #1
File: decoder.py  Project: BBN-E/Hume
def line_to_predictions(ner_fea, dec, json_eg, attr, content_type,
                        word_embeddings, trigger_generator, trigger_model,
                        arg_generator):
    """
    :type word_embeddings: embeddings.word_embeddings.WordEmbedding
    :type trigger_generator: event.event_trigger.EventTriggerGenerator
    :type trigger_model: model.event_cnn.EventExtractionModel
    :type arg_generator: event.event_argument.EventArgumentGenerator
    """
    global spacy_en

    content = find(attr, json_eg)  # json_eg.get(attr)

    offset = 0
    all_predictions = []

    if content is not None:
        if isinstance(content, list):
            content = '\n'.join(content)
        for line in content.split('\n'):
            doc_ner_predictions = []
            sentences = get_sentences(line, content_type)
            if sentences is not None:
                for sent in sentences:
                    sent_predictions = decode_sentence(ner_fea, dec, content,
                                                       sent, offset,
                                                       content_type)
                    doc_ner_predictions.extend(sent_predictions)
                    all_predictions.extend(sent_predictions)

            if content_type == 'Post':
                doc = Document('dummy', line)
                for i, p in enumerate(doc_ner_predictions):
                    mention_id = 'em-{}'.format(i)
                    doc.add_entity_mention(
                        EntityMention(mention_id, IntPair(p['start'], p['end']),
                                      p['text'], p['label']))
                doc.annotate_sentences(spacy_en, word_embeddings)

                (trigger_examples, trigger_data, trigger_data_list,
                 trigger_label) = generate_trigger_data_feature(
                     trigger_generator, [doc])
                trigger_predictions = trigger_model.predict(trigger_data_list)

            offset += len(line) + 1  # +1 to account for newline

    # a list of dict, one for each predicted NE mention
    if len(all_predictions) > 0:
        if not "extractions" in json_eg:
            json_eg["extractions"] = {}
        json_eg['extractions'][attr] = all_predictions

    return json_eg
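
A minimal sketch of how this entry point might be driven, assuming a JSON-lines input file; the file name, the 'text' attribute, and the pre-built model objects are illustrative assumptions, not taken from the project:

import json

# Hypothetical driver loop. Assumes ner_fea, dec, word_embeddings,
# trigger_generator, trigger_model and arg_generator were constructed
# beforehand, e.g. as in Example #7.
with open('input.jsonl', 'r') as f:
    for raw in f:
        json_eg = json.loads(raw)
        json_eg = line_to_predictions(
            ner_fea, dec, json_eg,
            attr='text',            # assumed attribute holding the content
            content_type='Post',    # 'Post' also exercises the trigger branch
            word_embeddings=word_embeddings,
            trigger_generator=trigger_generator,
            trigger_model=trigger_model,
            arg_generator=arg_generator)
        # predicted NE mentions now sit under json_eg['extractions']['text']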
Example #2
    @classmethod
    def process_times(cls, doc, document_node):
        """
        :type doc: text.text_theory.Document
        :type document_node: xml.etree.ElementTree.Element
        """
        for time_node in document_node.findall('timex2'):
            time_id = time_node.attrib['ID']

            all_mentions = time_node.findall('timex2_mention')
            for mention_node in all_mentions:
                mention_id = mention_node.attrib['ID']
                (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
                em = EntityMention(mention_id, IntPair(start, end), text, 'Time')
                doc.add_entity_mention(em)
Example #3
    @classmethod
    def process_values(cls, doc, document_node):
        """
        :type doc: text.text_theory.Document
        :type document_node: xml.etree.ElementTree.Element
        """
        for value_node in document_node.findall('value'):
            value_id = value_node.attrib['ID']
            value_type = value_node.attrib['TYPE']

            all_mentions = value_node.findall('value_mention')
            for mention_node in all_mentions:
                mention_id = mention_node.attrib['ID']
                (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
                em = EntityMention(mention_id, IntPair(start, end), text, value_type)
                doc.add_entity_mention(em)
Example #4
    @classmethod
    def process_entities(cls, doc, document_node):
        """
        :type doc: text.text_theory.Document
        :type document_node: xml.etree.ElementTree.Element
        """
        all_entities = document_node.findall('entity')
        for entity_node in all_entities:
            entity_id = entity_node.attrib['ID']
            entity_type = entity_node.attrib['TYPE']
            entity_subtype = entity_node.attrib['SUBTYPE']

            all_mentions = entity_node.findall('entity_mention')
            for mention_node in all_mentions:
                mention_id = mention_node.attrib['ID']
                head = mention_node.find('head')
                (text, start, end) = cls.process_xml_charseq(head[0])
                em = EntityMention(mention_id, IntPair(start, end), text, entity_type+'.'+entity_subtype)
                doc.add_entity_mention(em)
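
Examples #2 through #4 all walk the same ACE/APF-style XML: a container node per time/value/entity, nested mention nodes, and a charseq leaf holding the mention text and offsets. A hedged sketch of input that would satisfy process_entities; the START/END attribute names and the enclosing parser class are assumptions, since process_xml_charseq is not shown:

import xml.etree.ElementTree as ET

# A Document to receive the mentions, constructed as in Example #1:
doc = Document('doc1', 'Yesterday John spoke.')

# Illustrative APF-style fragment; "John" spans characters 10-13 (inclusive):
document_node = ET.fromstring('''
<document>
  <entity ID="doc1-E1" TYPE="PER" SUBTYPE="Individual">
    <entity_mention ID="doc1-E1-M1">
      <head><charseq START="10" END="13">John</charseq></head>
    </entity_mention>
  </entity>
</document>
''')

# AceParser stands in for the unnamed class that defines these classmethods:
AceParser.process_entities(doc, document_node)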
Example #5
File: idt.py  Project: BBN-E/Hume
def extract_sentence_annotation(text, offset):
    """offset: char offset thus far (excluding xml tags) from prior sentences."""

    start_tag = 0
    end_tag = -1
    raw_text = ''
    entity_mentions = []

    # ignore everything starting from 'REMOVED_URL'
    url_index = text.find(' REMOVED_URL', 0)
    if url_index != -1:
        text = text[0:url_index]

    start_tag = text.find('<ENAMEX', 0)
    while start_tag != -1:
        raw_text += text[end_tag+1 : start_tag]

        end_tag = text.find('>', start_tag)
        entity_type = re.search(r' TYPE="(.*)"', text[start_tag:end_tag]).group(1)

        start_tag = text.find('</ENAMEX>', end_tag)
        mention_text = text[end_tag+1 : start_tag]

        start = offset+len(raw_text)
        end = offset+len(raw_text)+len(mention_text)
        if '-' in mention_text and entity_type.endswith('DESC'):
            print('Rejecting %s[%s], because Spacy will split the string into multiple tokens, and DESC should always be just a single word' % (entity_type, mention_text))
        else:
            (new_mention_text, prefix_length, suffix_length) = strip_mention_text(mention_text)
            if new_mention_text != mention_text:
                print('Revising %s to %s' % (mention_text, new_mention_text))
            mention_id = 'm-{}-{}'.format(start + prefix_length, end - suffix_length)
            entity_mentions.append(EntityMention(mention_id, IntPair(start + prefix_length, end - suffix_length), new_mention_text, entity_type))

        raw_text += mention_text

        end_tag = text.find('>', start_tag)
        start_tag = text.find('<ENAMEX', end_tag)

    raw_text += text[end_tag+1:]

    return (raw_text, entity_mentions)
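
A quick usage sketch: the function strips the ENAMEX tags, returning the raw text plus EntityMention objects whose offsets index into the stripped text (shifted by offset). This assumes strip_mention_text, which is not shown, returns 'Baghdad' unchanged with zero prefix/suffix lengths:

tagged = 'Troops entered <ENAMEX TYPE="GPE">Baghdad</ENAMEX> on Monday.'
raw_text, mentions = extract_sentence_annotation(tagged, 0)
# raw_text == 'Troops entered Baghdad on Monday.'
# mentions == [EntityMention('m-15-22', IntPair(15, 22), 'Baghdad', 'GPE')]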
Example #6
File: spannotator.py  Project: BBN-E/Hume
    @classmethod
    def _read_annotation_file(cls, infile, event_type, text):
        """
        :type infile: str
        :type event_type: str
        :type text: str
        Returns:
            list[text.text_theory.Event]
        :param text: this is the raw text corresponding to the annotation
        """
        docid = os.path.basename(infile)

        events = []
        """:type: list[text.text_theory.Event]"""
        negative_spans = []
        """:type: list[text.text_span.TextSpan]"""
        anchors_not_in_eventspans = []      # these might be in negative spans
        """:type: list[text.text_span.Anchor]"""
        with open(infile, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                span_type = tokens[0]
                start = int(tokens[1])
                end = int(tokens[2]) + 1
                text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
                end = start + len(text_string)

                if '<' in text_string or '>' in text_string:
                    print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                    continue

                if span_type == event_type:
                    event_id = '{}-e{}'.format(docid, len(events))
                    event_span = EventSpan(event_id, IntPair(start, end), text_string, event_type)
                    e = Event(event_id, event_type)
                    e.add_event_span(event_span)
                    events.append(e)
                elif '/' in span_type:  # this is an event argument
                    em = EntityMention('dummy', IntPair(start, end), text_string, 'dummy')
                    event_role = span_type.split('/')[1]
                    e = cls._find_event_containing_span(events, start, end)
                    if e is None:
                        print('Cannot find an event span for {} {} (start,end)=({},{}) "{}". Skipping.'.format(event_type, docid, start, end, text_string))
                    else:
                        arg_id = '{}-a{}'.format(e.id, e.number_of_arguments())
                        e.add_argument(EventArgument(arg_id, em, event_role))
                elif span_type == 'anchor':
                    e = cls._find_event_containing_span(events, start, end)
                    anchor = Anchor('dummy', IntPair(start, end), text_string, event_type)
                    if e is None:
                        # it might be in a negative span
                        anchors_not_in_eventspans.append(anchor)
                    else:
                        e.add_anchor(anchor)
                elif span_type == 'negative':
                    negative_spans.append(TextSpan(IntPair(start, end), text_string))
                elif span_type == 'interesting':
                    pass                # we discard these for now

        for anchor in anchors_not_in_eventspans:
            found = False
            for span in negative_spans:
                if span.start_char_offset() <= anchor.start_char_offset() and anchor.end_char_offset() <= span.end_char_offset():
                    found = True
                    break
            if not found:
                print('Cannot find an event or negative span for anchor {} {} (start,end)=({},{}) "{}". Skipping.'.format(
                    event_type, docid, anchor.start_char_offset(), anchor.end_char_offset(), anchor.text.replace(' ', '_')))

        # keep only events with anchor
        return [event for event in events if event.number_of_anchors() > 0]
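
The annotation format this reader expects can be inferred from the branches above: one whitespace-separated record per line, a span type followed by a start offset and an inclusive end offset (the +1 converts it to exclusive). A hypothetical file for event_type='Attack' might contain:

    Attack 120 245
    Attack/Attacker 130 142
    anchor 150 156
    negative 300 420
    interesting 500 560

Argument lines follow the '<event type>/<role>' convention, and only events that end up with at least one anchor survive the final filter.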
Example #7
File: decode_text.py  Project: BBN-E/Hume
                            ner_decoder,
                            content,
                            sent,
                            offset=0,
                            content_type='Blog'))

    for p in ner_predictions:
        print(p)

    # create a document based on text content, add NER predictions as EntityMentions, then apply Spacy to
    # perform sentence segmentation and tokenization, and use Spacy tokens to back the EntityMentions
    doc = Document('dummy', content)
    for i, p in enumerate(ner_predictions):
        mention_id = 'em-{}'.format(i)
        doc.add_entity_mention(
            EntityMention(mention_id, IntPair(p['start'], p['end']), p['text'],
                          p['label']))
    doc.annotate_sentences(spacy_en, word_embeddings)

    event_domain = None
    if params.get_string('domain') == 'cyber':
        # initialize a particular event domain, which stores info on the event types and event roles
        event_domain = CyberDomain()

    arg_generator = EventArgumentGenerator(event_domain, params)
    trigger_generator = EventTriggerGenerator(event_domain, params)

    (trigger_examples, trigger_data, trigger_data_list,
     trigger_label) = generate_trigger_data_feature(trigger_generator, [doc])

    print('==== Loading Trigger model ====')
    trigger_model = load_trigger_model(params.get_string('event_model_dir'))
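
The excerpt ends right after the model is loaded; judging from the parallel 'Post' branch in Example #1, the step that presumably follows is prediction over the generated feature list (a sketch, not part of this excerpt):

trigger_predictions = trigger_model.predict(trigger_data_list)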