示例#1
0
 def _iterate_mention_arguments_as_spans(
     sms: List[SituationMention]
 ) -> Iterable[Union[CementSpan, CementEntityMention]]:
     """Yield every argument of the given SituationMentions as a span.

     Each argument is resolved, in priority order, from a referenced
     SituationMention, a raw token sequence, or a referenced
     EntityMention; arguments carrying no span information are logged
     and skipped.  The argument's role is attached to the yielded span
     via ``attrs``.
     """
     for mention in sms:
         for arg in mention.argumentList:
             if arg.situationMentionId is not None:
                 # argument points at another SituationMention — use its tokens
                 referenced: SituationMention = doc.comm.situationMentionForUUID[
                     arg.situationMentionId.uuidString]
                 resolved = CementSpan.from_token_ref_sequence(
                     referenced.tokens, document=doc)
             elif arg.tokens is not None:
                 resolved = CementSpan.from_token_ref_sequence(
                     arg.tokens, document=doc)
             elif arg.entityMentionId is not None:
                 # argument points at an EntityMention in the communication
                 resolved = CementEntityMention.from_entity_mention(
                     mention=doc.comm.entityMentionForUUID[
                         arg.entityMentionId.uuidString],
                     document=doc)
             else:
                 logger.info(
                     f'MentionArgument={arg} - does not have any span information.'
                 )
                 continue
             resolved.attrs.add('role', arg.role)
             yield resolved
def bnb_json_to_concrete(documents: Iterable[Dict]) -> Iterable[CementDocument]:
    """Convert BNB-style JSON objects into `CementDocument`s, one per input.

    The event type is the WordNet lemma of the first trigger token; any
    lemma containing 'loss' is collapsed to the single predicate 'loss'
    (special handling for the `tax-loss` predicate).
    """
    lemmatizer = WordNetLemmatizer()
    for _, json_doc in tqdm(enumerate(documents)):
        cdoc = CementDocument.from_tokens(
            tokens={'passage': json_doc['sentences']},
            doc_id=json_doc['doc_key'])
        # predicate = lemma of the first trigger token, lower-cased
        predicate = lemmatizer.lemmatize(
            word=json_doc['trigger']['text'][0].lower())
        if 'loss' in predicate:
            predicate = 'loss'
        mentions = []
        for role, spans in json_doc['arguments'].items():
            if len(spans) > 1:
                logger.info(f'Arg has more than one span: {role} - {spans}')
            mentions.extend(
                CementEntityMention(start=entry['span'][0],
                                    end=entry['span'][1],
                                    document=cdoc,
                                    role=role)
                for entry in spans)
        cdoc.add_event_mention(
            trigger=CementSpan(start=json_doc['trigger']['span'][0],
                               end=json_doc['trigger']['span'][1],
                               document=cdoc),
            arguments=mentions,
            event_type=predicate,
        )
        yield cdoc
def rams_json_to_concrete(documents: Iterable[Dict], has_additional_mention: bool = True) -> Iterable[CementDocument]:
    """Convert RAMS JSON objects into `CementDocument`s.

    The last entry returned by `get_rams_event` supplies the trigger
    offsets; every entry whose role is not 'TRIGGER' becomes an event
    argument.  When `has_additional_mention` is set, predicted entity
    mentions are added alongside the gold event arguments.
    """
    for _, json_doc in tqdm(enumerate(documents)):
        cdoc = CementDocument.from_tokens(tokens={'passage': json_doc['sentences']},
                                          doc_id=json_doc['doc_key'])
        spans, raw_event_type = get_rams_event(json_doc)
        trigger = CementSpan(start=spans[-1][0], end=spans[-1][1], document=cdoc)
        cdoc.add_event_mention(
            trigger=trigger,
            arguments=[
                CementEntityMention(start=s, end=e, document=cdoc, role=r)
                for s, e, r in spans if r != 'TRIGGER'
            ],
            event_type=ontology['mappings']['events'][raw_event_type],
        )
        if has_additional_mention:
            # predicted (non-gold) mentions on top of the event arguments
            for mention in get_predicted_mentions(json_doc):
                cdoc.add_entity_mention(
                    mention=CementEntityMention(start=mention[0],
                                                end=mention[-1],
                                                document=cdoc))
        yield cdoc
def gvdb_json_to_concrete(documents: Iterable[Dict]) -> Iterable[CementDocument]:
    """Convert GVDB JSON objects into `CementDocument`s with one
    trigger-less 'Shooting' event holding every annotated span as an
    argument."""
    for _, json_doc in tqdm(enumerate(documents)):
        cdoc = CementDocument.from_tokens(tokens={'passage': json_doc['full_text']},
                                          doc_id=json_doc['doc_key'])
        # `end - 1`: source spans appear to use exclusive end offsets,
        # converted here to inclusive — TODO confirm against the dataset
        mentions = [
            CementEntityMention(start=s, end=e - 1, document=cdoc, role=r)
            for s, e, r, _, _ in json_doc['spans']
        ]
        cdoc.add_event_mention(arguments=mentions, event_type='Shooting')
        yield cdoc
 def _get_events(jdoc: Dict, cdoc: CementDocument) -> Iterable[Tuple[str, CementSpan, List[CementSpan]]]:
     """Yield ``(event_type, trigger_span, argument_spans)`` per gold event.

     Triggers are indexed by their (start, end) offsets so that the gold
     argument links, which reference trigger offsets, attach to the
     correct event entry.
     """
     by_trigger: Dict[Tuple[int, int], List] = {}
     for trig in jdoc['evt_triggers']:
         start = trig[0]
         end = trig[1]
         raw_type = trig[2][0][0]
         # normalize the 'unspecified' subtype marker to 'n/a'
         if 'unspecified' in raw_type:
             raw_type = raw_type.replace('unspecified', 'n/a')
         by_trigger[(start, end)] = [
             ontology['mappings']['events'][raw_type],
             CementSpan(start=start, end=end, document=cdoc),
         ]
     for link in jdoc['gold_evt_links']:
         # link = [trigger offsets, argument offsets, raw role]
         by_trigger[(link[0][0], link[0][1])].append(
             CementEntityMention(start=link[1][0],
                                 end=link[1][1],
                                 role=ontology['mappings']['args'][link[2]],
                                 document=cdoc))
     for entry in by_trigger.values():
         yield entry[0], entry[1], entry[2:]
示例#6
0
def process_mentions(doc_stream: Iterable[CementDocument],
                     predictor: Predictor) -> Iterable[CementDocument]:
    """Annotate entity mentions and event triggers with their head token.

    Entity-mention heads are written back to the underlying Concrete
    communication; trigger heads are stored in the document key-value
    map under the 'trigger'/'head' namespace keyed by the
    SituationMention UUID.  Yields each (mutated) document.
    """
    for document in doc_stream:
        # entity mentions: compute head offset and persist it on the comm
        for raw_em in document.iterate_entity_mentions():
            mention = CementEntityMention.from_entity_mention(mention=raw_em,
                                                              document=document)
            offset = find_mention_head(mention=mention, predictor=predictor)
            mention.attrs.head = mention.start + offset
            mention.write_em_head_to_comm()
        # triggers: only situation mentions that carry a token sequence
        for situation in document.iterate_situation_mentions():
            if situation.tokens is None:
                continue
            trigger = CementSpan.from_token_ref_sequence(
                token_ref_sequence=situation.tokens, document=document)
            offset = find_mention_head(mention=trigger, predictor=predictor)
            trigger.write_span_kv(value=str(trigger.start + offset),
                                  suffix='head',
                                  key=situation.uuid.uuidString,
                                  key_prefix='trigger')
        yield document
def collect_raw_document_data(
        doc: CementDocument) -> Dict[str, Union[Counter, Set, List]]:
    """Summarize one document: token/sentence/mention counts plus the
    observed label ontology (NER types, event types & roles, relation
    types).

    Returns a dict with 'doc_key', 'num_sentences', 'counter' (a Counter
    keyed by str or tuple), and 'ontology' (sets of observed labels plus
    a per-event-type role map).
    """
    stats = Counter()
    ner_types = set()
    evt_types = set()
    roles_by_event: defaultdict = defaultdict(set)
    rel_types = set()

    num_sents = doc.num_sentences()
    stats['Tokens'] += len(doc)
    for sent_id in range(num_sents):
        stats[('Sentence', sent_id)] += len(doc.get_sentence(sent_id=sent_id))
        stats['Sentence'] += 1

    # entity mentions
    ems = list(doc.iterate_entity_mentions())
    stats['EntityMention'] += len(ems)
    for em in ems:
        cem = CementEntityMention.from_entity_mention(mention=em, document=doc)
        stats[('EntityMention', 'tokens')] += len(cem)
        stats[('NER', cem.attrs.entity_type)] += 1
        ner_types.add(cem.attrs.entity_type)

    # event mentions: count args, types, and which roles each type uses
    events = list(doc.iterate_event_mentions())
    stats['EventMention'] += len(events)
    for event in events:
        stats[('EventMention', 'args')] += len(event.argumentList)
        stats[('EventType', event.situationKind)] += 1
        evt_types.add(event.situationKind)
        seen_roles = set()
        for arg in event.argumentList:
            seen_roles.add(arg.role)
            stats[('EventRole', arg.role)] += 1
        roles_by_event[event.situationKind].update(seen_roles)

    # relation mentions
    relations = list(doc.iterate_relation_mentions())
    stats['RelationMention'] += len(relations)
    for relation in relations:
        stats[('RelationType', relation.situationKind)] += 1
        rel_types.add(relation.situationKind)

    return {
        'doc_key': doc.comm.id,
        'num_sentences': num_sents,
        'counter': stats,
        'ontology': {
            'ner': ner_types,
            'event_types': evt_types,
            'event_roles': roles_by_event,
            'relation_types': rel_types,
        },
    }
示例#8
0
    def extract_instances_from_doc(
            self,
            doc: CementDocument,
            cache_target: Optional[str] = None) -> Iterable[Instance]:
        """Yield one `Instance` per event mention in `doc`.

        In sentence mode (`self._sentence_mode`) the text sequence and the
        cached context array are per sentence, and trigger/argument offsets
        are converted to sentence-local indices; otherwise the whole
        document is used.  Unless `self._gold_mentions_only` is set, entity
        mentions that are not gold arguments are added with the 'None' role.

        :param doc: source document, also used to resolve mention UUIDs.
        :param cache_target: key passed through to the cached-array readers.
        """
        def _iterate_mention_arguments_as_spans(
            sms: List[SituationMention]
        ) -> Iterable[Union[CementSpan, CementEntityMention]]:
            # Resolve each argument to a span — a referenced
            # SituationMention's tokens, raw tokens, or a referenced
            # EntityMention, in that priority order; arguments with no
            # span info are logged and skipped.  Role is attached via
            # `span.attrs`.
            for sm in sms:
                for arg in sm.argumentList:
                    if arg.situationMentionId is not None:
                        ref_sm: SituationMention = doc.comm.situationMentionForUUID[
                            arg.situationMentionId.uuidString]
                        span = CementSpan.from_token_ref_sequence(
                            ref_sm.tokens, document=doc)
                    elif arg.tokens is not None:
                        span = CementSpan.from_token_ref_sequence(arg.tokens,
                                                                  document=doc)
                    elif arg.entityMentionId is not None:
                        span = CementEntityMention.from_entity_mention(
                            mention=doc.comm.entityMentionForUUID[
                                arg.entityMentionId.uuidString],
                            document=doc)
                    else:
                        logger.info(
                            f'MentionArgument={arg} - does not have any span information.'
                        )
                        continue
                    span.attrs.add('role', arg.role)
                    yield span

        entity_mentions: List[CementEntityMention] = [
            CementEntityMention.from_entity_mention(mention, doc)
            for mention in doc.iterate_entity_mentions()
        ]
        event_mentions: List[SituationMention] = list(
            doc.iterate_event_mentions())

        if self._sentence_mode:
            # map sentence-local (start, end) -> mention, dropping mentions
            # that cross sentence boundaries
            local_mention_indices: Dict[Tuple[int, int],
                                        Union[CementSpan,
                                              CementEntityMention]] = {}
            for em in entity_mentions:
                sent_ids, indices = zip(*em.to_local_indices())
                # NOTE(review): only sent_ids[0] vs sent_ids[1] is checked;
                # presumably to_local_indices yields one (sent_id, index)
                # pair per span endpoint — confirm, since a per-token result
                # would require comparing sent_ids[0] != sent_ids[-1].
                if sent_ids[0] != sent_ids[1]:
                    logger.info(
                        f'Mention span crosses sentence boundary: {sent_ids}')
                    continue
                else:
                    local_mention_indices[(indices[0], indices[-1])] = em
            # NOTE(review): groupby only merges *consecutive* items and the
            # dict iterates in insertion order, not sorted by key; also the
            # key lambda returns t[0][0] (the first element of the
            # (start, end) tuple) which is then used as `sent_id` below —
            # verify this value really is the sentence id and that items
            # arrive already grouped, otherwise later fragments overwrite
            # earlier ones in this dict comprehension.
            sent_to_ems: Dict[int, Dict[Tuple[int, int], Union[
                CementSpan, CementEntityMention]]] = {
                    sent_id: {k: v
                              for k, v in mention_group}
                    for sent_id, mention_group in groupby(
                        local_mention_indices.items(), key=lambda t: t[0][0])
                }
            for event in event_mentions:
                # if this event has a trigger
                assert event.tokens is not None, 'This event does not have a trigger'
                trigger_span = CementSpan.from_token_ref_sequence(event.tokens,
                                                                  document=doc)
                sent_id, (trigger_start,
                          trigger_end) = trigger_span.to_local_indices()
                # per-sentence contextualized representation from the cache
                sequence_array: np.ndarray = self._access_cache_by_key(
                    key=[doc.comm.id, str(sent_id)], target=cache_target)
                sequence = [doc.get_sentence(sent_id=sent_id)]
                event_type: str = event.situationKind
                argument_mentions: Dict[Tuple[int, int], Tuple[str, Union[
                    CementSpan, CementEntityMention]]] = {}
                for arg in _iterate_mention_arguments_as_spans([event]):
                    sid, arg_span = arg.to_local_indices()
                    assert sid == sent_id, f'Arguments cross sentences - cannot process with sentence mode.'
                    argument_mentions[arg_span] = (arg.attrs.role, arg)
                if not self._gold_mentions_only:
                    # pad with non-argument candidate mentions ('None' role)
                    for em_indices, em in sent_to_ems[sent_id].items():
                        if em_indices not in argument_mentions:
                            argument_mentions[em_indices] = ('None', em)

                yield self.text_to_instance(doc=doc,
                                            sequence=sequence,
                                            sequence_array=sequence_array,
                                            event_type=event_type,
                                            trigger=(trigger_start,
                                                     trigger_end),
                                            mention_spans=argument_mentions)
        else:
            # document mode: one sequence and one cached array for the
            # whole document
            sequence_array: np.ndarray = self._read_context_array_from_cache(
                key=[doc.comm.id], target=cache_target)
            sequence = doc.iterate_sentences()
            for event in event_mentions:
                if event.tokens is not None:
                    trigger_span = CementSpan.from_token_ref_sequence(
                        event.tokens, document=doc)
                else:  # use the whole document as the trigger
                    trigger_span = CementSpan(start=0,
                                              end=len(doc) - 1,
                                              document=doc)
                event_type: str = event.situationKind
                argument_mentions: Dict[Tuple[int, int], Tuple[str, Union[
                    CementSpan, CementEntityMention]]] = {
                        arg.to_index_tuple(): (arg.attrs.role, arg)
                        for arg in _iterate_mention_arguments_as_spans([event])
                    }
                if not self._gold_mentions_only:
                    # pad with non-argument candidate mentions ('None' role)
                    for em in entity_mentions:
                        em_indices = em.to_index_tuple()
                        if em_indices not in argument_mentions:
                            argument_mentions[em_indices] = ('None', em)

                yield self.text_to_instance(
                    doc=doc,
                    sequence=sequence,
                    sequence_array=sequence_array,
                    event_type=event_type,
                    trigger=trigger_span.to_index_tuple(),
                    mention_spans=argument_mentions)
示例#9
0
def to_cement_doc_stream(
        json_stream: Iterable[Dict]) -> Iterable[CementDocument]:
    """Convert DyGIE-style JSON objects into `CementDocument`s.

    Each JSON object must provide 'sentences' (token lists) and
    'doc_key'; 'ner', 'events' and 'relations' are optional per-sentence
    annotation layers.  For every layer, the created mention UUIDs are
    written to the document key-value store under a per-sentence key so
    later passes can iterate mentions sentence by sentence; a meta flag
    ('<layer>-iterator') advertises that the layer was indexed.

    Yields one `CementDocument` per input JSON object.
    """
    for json_obj in json_stream:
        # create a `CementDocument`
        doc = CementDocument.from_tokens(
            tokens={'passage': json_obj['sentences']},
            doc_id=json_obj['doc_key'])

        # extract entity mentions (EMD or NER)
        doc.write_kv_map(prefix='meta',
                         key='ner-iterator',
                         suffix='sentence',
                         value='True')
        for line_id, ems in enumerate(json_obj['ner']):
            uuids = []
            for em in ems:
                cem = CementEntityMention(start=em[0],
                                          end=em[1],
                                          entity_type=em[2],
                                          document=doc)
                em_id = doc.add_entity_mention(mention=cem)
                uuids.append(em_id.uuidString)
            doc.write_kv_map(prefix='ner',
                             key=str(line_id),
                             suffix='sentence',
                             value=','.join(uuids))

        # extract event mentions
        if 'events' in json_obj:
            # BUG FIX: this branch previously wrote the meta key
            # 'relations-iterator'; readers look up 'events-iterator'
            # for the event layer (the two keys were swapped).
            doc.write_kv_map(prefix='meta',
                             key='events-iterator',
                             suffix='sentence',
                             value='True')
            for line_id, events in enumerate(json_obj['events']):
                uuids = []
                for event in events:
                    # event[0] appears to be (trigger_token, event_type)
                    # — a single-token trigger, hence start == end.
                    trigger = CementSpan(start=event[0][0],
                                         end=event[0][0],
                                         document=doc)
                    arguments = [
                        CementEntityMention(start=start,
                                            end=end,
                                            role=role,
                                            document=doc)
                        for start, end, role in event[1:]
                    ]
                    sm_id = doc.add_event_mention(trigger=trigger,
                                                  arguments=arguments,
                                                  event_type=event[0][1])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='event',
                                 key=str(line_id),
                                 suffix='sentence',
                                 value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have events.')

        # extract relation mentions
        if 'relations' in json_obj:
            # BUG FIX: mirrored swap — this branch previously wrote
            # 'events-iterator' for the relation layer.
            doc.write_kv_map(prefix='meta',
                             key='relations-iterator',
                             suffix='sentence',
                             value='True')
            for line_id, relations in enumerate(json_obj['relations']):
                uuids = []
                for relation in relations:
                    # relation = [sub_start, sub_end, obj_start, obj_end, type]
                    sub = CementEntityMention(start=relation[0],
                                              end=relation[1],
                                              document=doc)
                    obj = CementEntityMention(start=relation[2],
                                              end=relation[3],
                                              document=doc)
                    sm_id = doc.add_relation_mention(arguments=[sub, obj],
                                                     relation_type=relation[4])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='relation',
                                 key=str(line_id),
                                 suffix='sentence',
                                 value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have relations.')

        yield doc