def bnb_json_to_concrete(documents: Iterable[Dict]) -> Iterable[CementDocument]:
    """Lazily convert JSON records into `CementDocument`s with one event each.

    Every record is read through these keys:

    * ``sentences`` / ``doc_key`` -- tokens and id handed to
      `CementDocument.from_tokens`.
    * ``trigger`` -- a mapping with ``text`` (only its first element is used)
      and ``span`` (elements 0 and 1 are used as start and end).
    * ``arguments`` -- a mapping from role name to a list of mappings that
      each carry a ``span``.

    :param documents: iterable of JSON-like dicts shaped as described above.
    :return: a generator yielding one `CementDocument` per input record.
    """
    lemmatizer = WordNetLemmatizer()
    for _, record in tqdm(enumerate(documents)):
        document = CementDocument.from_tokens(
            tokens={'passage': record['sentences']},
            doc_id=record['doc_key'],
        )
        trigger_info = record['trigger']

        # The event type is the lemma of the lower-cased first trigger token.
        # Special handling for the `tax-loss` predicate: any lemma containing
        # 'loss' collapses to plain 'loss'.
        lemma = lemmatizer.lemmatize(word=trigger_info['text'][0].lower())
        event_type = 'loss' if 'loss' in lemma else lemma

        mentions = []
        for role, spans in record['arguments'].items():
            if len(spans) > 1:
                logger.info(f'Arg has more than one span: {role} - {spans}')
            # Every span listed under a role becomes its own mention.
            mentions.extend(
                CementEntityMention(start=span_info['span'][0],
                                    end=span_info['span'][1],
                                    document=document,
                                    role=role)
                for span_info in spans
            )

        trigger_span = CementSpan(start=trigger_info['span'][0],
                                  end=trigger_info['span'][1],
                                  document=document)
        document.add_event_mention(
            trigger=trigger_span,
            arguments=mentions,
            event_type=event_type,
        )
        yield document
def rams_json_to_concrete(documents: Iterable[Dict], has_additional_mention: bool = True) -> Iterable[CementDocument]:
    """Lazily convert JSON records into `CementDocument`s with one event each.

    The event is obtained from `get_rams_event`, which is unpacked here as
    ``(raw_args, event_type)`` where every entry of ``raw_args`` unpacks to
    ``(start, end, role)``.

    :param documents: iterable of JSON-like dicts providing at least
        ``sentences`` and ``doc_key``.
    :param has_additional_mention: when true, the spans returned by
        `get_predicted_mentions` are also added as entity mentions.
    :return: a generator yielding one `CementDocument` per input record.
    """
    for _, record in tqdm(enumerate(documents)):
        document = CementDocument.from_tokens(
            tokens={'passage': record['sentences']},
            doc_id=record['doc_key'],
        )
        raw_args, event_type = get_rams_event(record)

        # The last raw entry supplies the trigger span.
        trigger_entry = raw_args[-1]
        trigger = CementSpan(start=trigger_entry[0], end=trigger_entry[1], document=document)

        # Everything not labelled 'TRIGGER' is an argument of the event.
        participants = []
        for start, end, role in raw_args:
            if role == 'TRIGGER':
                continue
            participants.append(
                CementEntityMention(start=start, end=end, document=document, role=role)
            )

        document.add_event_mention(
            trigger=trigger,
            arguments=participants,
            # Raw event type is normalised through the ontology mapping.
            event_type=ontology['mappings']['events'][event_type],
        )

        if has_additional_mention:
            for predicted in get_predicted_mentions(record):
                # First and last elements of the entry are used as start/end.
                extra = CementEntityMention(start=predicted[0],
                                            end=predicted[-1],
                                            document=document)
                document.add_entity_mention(mention=extra)
        yield document
def gvdb_json_to_concrete(documents: Iterable[Dict]) -> Iterable[CementDocument]:
    """Lazily convert JSON records into `CementDocument`s with one event each.

    Each record supplies ``full_text`` (used as the document tokens),
    ``doc_key`` and ``spans``; every span entry must unpack to exactly five
    values, of which the first three are ``(start, end, role)``.  The single
    event added per document has no trigger and the fixed type ``'Shooting'``.

    :param documents: iterable of JSON-like dicts shaped as described above.
    :return: a generator yielding one `CementDocument` per input record.
    """
    for _index, record in tqdm(enumerate(documents)):
        document = CementDocument.from_tokens(
            tokens={'passage': record['full_text']},
            doc_id=record['doc_key'],
        )

        mentions = []
        for start, end, role, _, _ in record['spans']:
            # The stored end is shifted down by one -- presumably converting an
            # exclusive end offset to an inclusive one; confirm against the data.
            mentions.append(
                CementEntityMention(start=start, end=end - 1, document=document, role=role)
            )

        document.add_event_mention(
            arguments=mentions,
            event_type='Shooting',
        )
        yield document
 def _get_events(jdoc: Dict, cdoc: CementDocument) -> Iterable[Tuple[str, CementSpan, List[CementSpan]]]:
     """Yield ``(event_type, trigger_span, argument_mentions)`` for each trigger.

     Triggers are read from ``jdoc['evt_triggers']`` and arguments from
     ``jdoc['gold_evt_links']``; a link is attached to the trigger whose
     ``(start, end)`` pair equals the link's first element.

     :param jdoc: JSON-like dict holding ``evt_triggers`` and ``gold_evt_links``.
     :param cdoc: document that the created spans and mentions refer to.
     :return: a generator of 3-tuples, one per distinct trigger span.
     :raises KeyError: if a link points at a span with no recorded trigger, or
         a type/role is missing from the ontology mapping.
     """
     by_trigger: Dict[Tuple[int, int], List] = {}
     for trig in jdoc['evt_triggers']:
         start, end = trig[0], trig[1]
         label = trig[2][0][0]
         # The ontology mapping is looked up with 'n/a' in place of 'unspecified'.
         if 'unspecified' in label:
             label = label.replace('unspecified', 'n/a')
         # Slot 0: normalised event type, slot 1: trigger span, slots 2+: arguments.
         by_trigger[(start, end)] = [
             ontology['mappings']['events'][label],
             CementSpan(start=start, end=end, document=cdoc),
         ]

     for link in jdoc['gold_evt_links']:
         trigger_key = (link[0][0], link[0][1])
         bucket = by_trigger[trigger_key]
         bucket.append(
             CementEntityMention(
                 start=link[1][0],
                 end=link[1][1],
                 role=ontology['mappings']['args'][link[2]],
                 document=cdoc,
             )
         )

     for etype, trigger_span, *arg_mentions in by_trigger.values():
         yield etype, trigger_span, arg_mentions
# Example #5
# 0
def to_cement_doc_stream(
        json_stream: Iterable[Dict]) -> Iterable[CementDocument]:
    """Lazily convert sentence-aligned JSON records into `CementDocument`s.

    Each record must provide ``sentences``, ``doc_key`` and ``ner``; ``events``
    and ``relations`` are optional and their absence is only logged.  The three
    annotation fields are iterated per sentence, with entries read as follows:

    * ``ner``: ``[start, end, entity_type]``
    * ``events``: ``[[trigger_index, event_type], [start, end, role], ...]``;
      the trigger span starts and ends on ``trigger_index``.
    * ``relations``: ``[start_1, end_1, start_2, end_2, relation_type]``

    For every sentence the UUIDs of the created mentions are stored as a
    comma-joined string in the document's key-value map under the prefix
    ``ner`` / ``event`` / ``relation``, keyed by the sentence index.  A
    ``meta`` entry ``<kind>-iterator`` is written for each annotation kind
    that was processed.

    Fix relative to the previous revision: the events branch used to write
    ``relations-iterator`` and the relations branch ``events-iterator``; each
    branch now writes the key that matches the annotations it stores.

    :param json_stream: iterable of JSON-like dicts shaped as described above.
    :return: a generator yielding one `CementDocument` per input record.
    :raises KeyError: if ``sentences``, ``doc_key`` or ``ner`` is missing.
    """
    for json_obj in json_stream:
        # create a `CementDocument`
        doc = CementDocument.from_tokens(
            tokens={'passage': json_obj['sentences']},
            doc_id=json_obj['doc_key'])

        # extract entity mentions (EMD or NER)
        doc.write_kv_map(prefix='meta',
                         key='ner-iterator',
                         suffix='sentence',
                         value='True')
        for line_id, ems in enumerate(json_obj['ner']):
            uuids = []
            for em in ems:
                cem = CementEntityMention(start=em[0],
                                          end=em[1],
                                          entity_type=em[2],
                                          document=doc)
                em_id = doc.add_entity_mention(mention=cem)
                uuids.append(em_id.uuidString)
            doc.write_kv_map(prefix='ner',
                             key=str(line_id),
                             suffix='sentence',
                             value=','.join(uuids))

        # extract event mentions
        if 'events' in json_obj:
            doc.write_kv_map(prefix='meta',
                             key='events-iterator',
                             suffix='sentence',
                             value='True')
            for line_id, events in enumerate(json_obj['events']):
                uuids = []
                for event in events:
                    # event[0] is [trigger_index, event_type]; the trigger span
                    # begins and ends on that single index.
                    trigger = CementSpan(start=event[0][0],
                                         end=event[0][0],
                                         document=doc)
                    # Remaining entries are the event's arguments.
                    arguments = [
                        CementEntityMention(start=start,
                                            end=end,
                                            role=role,
                                            document=doc)
                        for start, end, role in event[1:]
                    ]
                    sm_id = doc.add_event_mention(trigger=trigger,
                                                  arguments=arguments,
                                                  event_type=event[0][1])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='event',
                                 key=str(line_id),
                                 suffix='sentence',
                                 value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have events.')

        # extract relation mentions
        if 'relations' in json_obj:
            doc.write_kv_map(prefix='meta',
                             key='relations-iterator',
                             suffix='sentence',
                             value='True')
            for line_id, relations in enumerate(json_obj['relations']):
                uuids = []
                for relation in relations:
                    # First span pair is passed first, second span pair second.
                    sub = CementEntityMention(start=relation[0],
                                              end=relation[1],
                                              document=doc)
                    obj = CementEntityMention(start=relation[2],
                                              end=relation[3],
                                              document=doc)
                    sm_id = doc.add_relation_mention(arguments=[sub, obj],
                                                     relation_type=relation[4])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='relation',
                                 key=str(line_id),
                                 suffix='sentence',
                                 value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have relations.')

        yield doc