def _iterate_mention_arguments_as_spans(
        sms: List[SituationMention]
) -> Iterable[Union[CementSpan, CementEntityMention]]:
    """Resolve each argument of the given SituationMentions to a span.

    Arguments without any span information are logged and skipped; each
    yielded span carries its argument role in ``span.attrs``.

    NOTE(review): ``doc`` is a free variable here — this function appears to
    be a module-level copy of the closure nested inside
    ``extract_instances_from_doc``; confirm it is only called where ``doc``
    is in scope.
    """
    for sm in sms:
        for arg in sm.argumentList:
            if arg.situationMentionId is not None:
                # argument refers to another SituationMention: use its tokens
                ref_sm: SituationMention = doc.comm.situationMentionForUUID[
                    arg.situationMentionId.uuidString]
                span = CementSpan.from_token_ref_sequence(
                    ref_sm.tokens, document=doc)
            elif arg.tokens is not None:
                # argument carries its own token sequence
                span = CementSpan.from_token_ref_sequence(arg.tokens,
                                                          document=doc)
            elif arg.entityMentionId is not None:
                # argument refers to an EntityMention
                span = CementEntityMention.from_entity_mention(
                    mention=doc.comm.entityMentionForUUID[
                        arg.entityMentionId.uuidString],
                    document=doc)
            else:
                logger.info(
                    f'MentionArgument={arg} - does not have any span information.'
                )
                continue
            # attach the role so downstream consumers can read it
            span.attrs.add('role', arg.role)
            yield span
def bnb_json_to_concrete(documents: Iterable[Dict]) -> Iterable[CementDocument]:
    """Convert BNB-style JSON documents into `CementDocument`s.

    Each JSON object becomes one document with a single event mention whose
    type is the WordNet lemma of the first trigger token; any lemma that
    contains 'loss' is collapsed to the single type 'loss' (special handling
    for the `tax-loss` predicate).
    """
    lemmatizer = WordNetLemmatizer()
    for idx, json_doc in tqdm(enumerate(documents)):
        cdoc = CementDocument.from_tokens(
            tokens={'passage': json_doc['sentences']},
            doc_id=json_doc['doc_key'])

        # Event type = lemma of the first trigger token, lowercased.
        lemma = lemmatizer.lemmatize(word=json_doc['trigger']['text'][0].lower())
        # special handling for `tax-loss` predicate
        event_type = 'loss' if 'loss' in lemma else lemma

        arguments = []
        for role, spans in json_doc['arguments'].items():
            if len(spans) > 1:
                logger.info(f'Arg has more than one span: {role} - {spans}')
            arguments.extend(
                CementEntityMention(start=span['span'][0],
                                    end=span['span'][1],
                                    document=cdoc,
                                    role=role)
                for span in spans)

        cdoc.add_event_mention(
            trigger=CementSpan(start=json_doc['trigger']['span'][0],
                               end=json_doc['trigger']['span'][1],
                               document=cdoc),
            arguments=arguments,
            event_type=event_type,
        )
        yield cdoc
def find_mention_head(mention: CementSpan, predictor: Predictor) -> Optional[int]:
    """Locate the syntactic head token of a mention via a dependency parse.

    Returns the offset (within the mention) of the token the parser tags as
    'root', 0 when no root is tagged, or None when the parser's tokenization
    disagrees with the mention's own tokens.
    """
    tokens = mention.get_tokens()
    parse = predictor.predict(sentence=tokens)
    parsed_words = parse['words']
    # sanity check: the parser must not have re-tokenized the mention
    mismatch = (len(tokens) != len(parsed_words)
                or any(a != b for a, b in zip(tokens, parsed_words)))
    if mismatch:
        logger.warning(f'Tokenizations do not match: {tokens} - {parse["words"]}')
        return None
    for offset, _token in enumerate(tokens):
        if parse['predicted_dependencies'][offset] == 'root':
            logger.info(f'Mention: {mention} - has head: {parse["words"][offset]}')
            return offset
    # no explicit root found: fall back to the first token
    return 0
def _get_events(jdoc: Dict, cdoc: CementDocument) -> Iterable[Tuple[str, CementSpan, List[CementSpan]]]:
    """Yield (event_type, trigger_span, argument_spans) triples from a RAMS-style doc.

    Triggers are collected first, keyed by their (start, end) token offsets;
    gold argument links are then attached to the matching trigger entry.
    """
    events: Dict[Tuple[int, int], List] = {}
    for trigger in jdoc['evt_triggers']:
        start, end = trigger[0], trigger[1]
        raw_type = trigger[2][0][0]
        # the source data uses 'unspecified' where the ontology expects 'n/a'
        if 'unspecified' in raw_type:
            raw_type = raw_type.replace('unspecified', 'n/a')
        events[(start, end)] = [
            ontology['mappings']['events'][raw_type],
            CementSpan(start=start, end=end, document=cdoc),
        ]
    for link in jdoc['gold_evt_links']:
        trigger_key = (link[0][0], link[0][1])
        events[trigger_key].append(
            CementEntityMention(start=link[1][0],
                                end=link[1][1],
                                role=ontology['mappings']['args'][link[2]],
                                document=cdoc))
    for entry in events.values():
        # entry = [event_type, trigger_span, *argument_spans]
        yield entry[0], entry[1], entry[2:]
def rams_json_to_concrete(documents: Iterable[Dict], has_additional_mention: bool = True) -> Iterable[CementDocument]:
    """Convert RAMS JSON documents into `CementDocument`s.

    One event mention is added per document from `get_rams_event`; when
    `has_additional_mention` is True, predicted entity mentions from
    `get_predicted_mentions` are added as well.
    """
    for idx, json_doc in tqdm(enumerate(documents)):
        cdoc = CementDocument.from_tokens(
            tokens={'passage': json_doc['sentences']},
            doc_id=json_doc['doc_key'])
        raw_args, event_type = get_rams_event(json_doc)
        # the last raw argument holds the trigger offsets
        trigger = CementSpan(start=raw_args[-1][0],
                             end=raw_args[-1][1],
                             document=cdoc)
        arguments = [
            CementEntityMention(start=s, end=e, document=cdoc, role=r)
            for s, e, r in raw_args
            if r != 'TRIGGER'
        ]
        cdoc.add_event_mention(
            trigger=trigger,
            arguments=arguments,
            event_type=ontology['mappings']['events'][event_type],
        )
        if has_additional_mention:
            for m in get_predicted_mentions(json_doc):
                cdoc.add_entity_mention(
                    mention=CementEntityMention(start=m[0], end=m[-1],
                                                document=cdoc))
        yield cdoc
def process_mentions(doc_stream: Iterable[CementDocument], predictor: Predictor) -> Iterable[CementDocument]:
    """Annotate entity mentions and situation-mention triggers with head tokens.

    For every entity mention, and every situation mention that has trigger
    tokens, the syntactic head is located with `find_mention_head` and the
    result is written back to the underlying communication.

    Fix: `find_mention_head` returns None when the dependency parser's
    tokenization disagrees with the mention's tokens; the original code then
    crashed on `start + None`. Mentions with no resolvable head are now
    skipped (the mismatch is already logged by `find_mention_head`).
    """
    for doc in doc_stream:
        # process EntityMention
        for em in doc.iterate_entity_mentions():
            cem = CementEntityMention.from_entity_mention(mention=em, document=doc)
            head_token_offset = find_mention_head(mention=cem, predictor=predictor)
            if head_token_offset is None:
                # tokenization mismatch — leave this mention without head info
                continue
            cem.attrs.head = cem.start + head_token_offset
            cem.write_em_head_to_comm()
        # process trigger
        for sm in doc.iterate_situation_mentions():
            if sm.tokens is None:
                continue
            trigger_span = CementSpan.from_token_ref_sequence(
                token_ref_sequence=sm.tokens, document=doc)
            head_token_offset = find_mention_head(mention=trigger_span,
                                                  predictor=predictor)
            if head_token_offset is None:
                continue
            trigger_span.write_span_kv(
                value=str(trigger_span.start + head_token_offset),
                suffix='head',
                key=sm.uuid.uuidString,
                key_prefix='trigger')
        yield doc
def extract_instances_from_doc(
        self, doc: CementDocument,
        cache_target: Optional[str] = None) -> Iterable[Instance]:
    """Yield one `Instance` per event mention in `doc`.

    Two modes, selected by ``self._sentence_mode``:
      * sentence mode — the sequence is the sentence containing the event
        trigger, and every argument is asserted to lie in that sentence;
      * document mode — the sequence is the whole document.
    When ``self._gold_mentions_only`` is False, entity mentions that are not
    already gold arguments are added as candidates with the role 'None'.
    """
    def _iterate_mention_arguments_as_spans(
            sms: List[SituationMention]
    ) -> Iterable[Union[CementSpan, CementEntityMention]]:
        # Resolve each argument to a span (role attached via span.attrs);
        # arguments with no span information are logged and skipped.
        for sm in sms:
            for arg in sm.argumentList:
                if arg.situationMentionId is not None:
                    # argument refers to another SituationMention
                    ref_sm: SituationMention = doc.comm.situationMentionForUUID[
                        arg.situationMentionId.uuidString]
                    span = CementSpan.from_token_ref_sequence(
                        ref_sm.tokens, document=doc)
                elif arg.tokens is not None:
                    # argument carries its own token sequence
                    span = CementSpan.from_token_ref_sequence(arg.tokens,
                                                              document=doc)
                elif arg.entityMentionId is not None:
                    # argument refers to an EntityMention
                    span = CementEntityMention.from_entity_mention(
                        mention=doc.comm.entityMentionForUUID[
                            arg.entityMentionId.uuidString],
                        document=doc)
                else:
                    logger.info(
                        f'MentionArgument={arg} - does not have any span information.'
                    )
                    continue
                span.attrs.add('role', arg.role)
                yield span

    entity_mentions: List[CementEntityMention] = [
        CementEntityMention.from_entity_mention(mention, doc)
        for mention in doc.iterate_entity_mentions()
    ]
    event_mentions: List[SituationMention] = list(
        doc.iterate_event_mentions())
    if self._sentence_mode:
        # Index single-sentence entity mentions by local (start, end) indices.
        local_mention_indices: Dict[Tuple[int, int],
                                    Union[CementSpan, CementEntityMention]] = {}
        for em in entity_mentions:
            sent_ids, indices = zip(*em.to_local_indices())
            if sent_ids[0] != sent_ids[1]:
                # cross-sentence mentions cannot be used in sentence mode
                logger.info(
                    f'Mention span crosses sentence boundary: {sent_ids}')
                continue
            else:
                # NOTE(review): the key drops the sentence id, so mentions in
                # different sentences with identical local indices overwrite
                # each other — confirm this is acceptable.
                local_mention_indices[(indices[0], indices[-1])] = em
        # NOTE(review): `groupby` merges only *consecutive* items, and the key
        # `t[0][0]` is the local start token index (bound to `sent_id` below),
        # not a sentence id — verify this grouping behaves as intended.
        sent_to_ems: Dict[int, Dict[Tuple[int, int], Union[
            CementSpan, CementEntityMention]]] = {
                sent_id: {k: v for k, v in mention_group}
                for sent_id, mention_group in groupby(
                    local_mention_indices.items(), key=lambda t: t[0][0])
            }
        for event in event_mentions:
            # if this event has a trigger
            assert event.tokens is not None, 'This event does not have a trigger'
            trigger_span = CementSpan.from_token_ref_sequence(event.tokens,
                                                              document=doc)
            sent_id, (trigger_start, trigger_end) = trigger_span.to_local_indices()
            # cached array for the trigger's sentence (presumably precomputed
            # encoder output — confirm against `_access_cache_by_key`)
            sequence_array: np.ndarray = self._access_cache_by_key(
                key=[doc.comm.id, str(sent_id)], target=cache_target)
            sequence = [doc.get_sentence(sent_id=sent_id)]
            event_type: str = event.situationKind
            # local (start, end) -> (role, span) for the gold arguments
            argument_mentions: Dict[Tuple[int, int], Tuple[str, Union[
                CementSpan, CementEntityMention]]] = {}
            for arg in _iterate_mention_arguments_as_spans([event]):
                sid, arg_span = arg.to_local_indices()
                assert sid == sent_id, f'Arguments cross sentences - cannot process with sentence mode.'
                argument_mentions[arg_span] = (arg.attrs.role, arg)
            if not self._gold_mentions_only:
                # add remaining candidate mentions with the null role
                for em_indices, em in sent_to_ems[sent_id].items():
                    if em_indices not in argument_mentions:
                        argument_mentions[em_indices] = ('None', em)
            yield self.text_to_instance(doc=doc,
                                        sequence=sequence,
                                        sequence_array=sequence_array,
                                        event_type=event_type,
                                        trigger=(trigger_start, trigger_end),
                                        mention_spans=argument_mentions)
    else:
        # document mode: one cached array and the full sentence iterator
        sequence_array: np.ndarray = self._read_context_array_from_cache(
            key=[doc.comm.id], target=cache_target)
        sequence = doc.iterate_sentences()
        for event in event_mentions:
            if event.tokens is not None:
                trigger_span = CementSpan.from_token_ref_sequence(
                    event.tokens, document=doc)
            else:
                # use the whole document as the trigger
                trigger_span = CementSpan(start=0, end=len(doc) - 1,
                                          document=doc)
            event_type: str = event.situationKind
            # document-level (start, end) -> (role, span) for gold arguments
            argument_mentions: Dict[Tuple[int, int], Tuple[str, Union[
                CementSpan, CementEntityMention]]] = {
                    arg.to_index_tuple(): (arg.attrs.role, arg)
                    for arg in _iterate_mention_arguments_as_spans([event])
                }
            if not self._gold_mentions_only:
                for em in entity_mentions:
                    em_indices = em.to_index_tuple()
                    if em_indices not in argument_mentions:
                        argument_mentions[em_indices] = ('None', em)
            yield self.text_to_instance(
                doc=doc,
                sequence=sequence,
                sequence_array=sequence_array,
                event_type=event_type,
                trigger=trigger_span.to_index_tuple(),
                mention_spans=argument_mentions)
def to_cement_doc_stream(
        json_stream: Iterable[Dict]) -> Iterable[CementDocument]:
    """Convert DyGIE-style JSON objects into `CementDocument`s.

    Each JSON object must carry 'sentences', 'doc_key' and 'ner'; 'events'
    and 'relations' are optional. Created mention UUIDs are recorded in
    per-sentence kv-maps, and 'meta' flags announce which iterators exist.

    Fix: the 'meta' iterator flags were swapped — the events section wrote
    'relations-iterator' and the relations section wrote 'events-iterator'.
    Each flag now matches the section that writes it.
    """
    for json_obj in json_stream:
        # create a `CementDocument`
        doc = CementDocument.from_tokens(
            tokens={'passage': json_obj['sentences']},
            doc_id=json_obj['doc_key'])

        # extract entity mentions (EMD or NER)
        doc.write_kv_map(prefix='meta', key='ner-iterator',
                         suffix='sentence', value='True')
        for line_id, ems in enumerate(json_obj['ner']):
            uuids = []
            for em in ems:
                cem = CementEntityMention(start=em[0], end=em[1],
                                          entity_type=em[2], document=doc)
                em_id = doc.add_entity_mention(mention=cem)
                uuids.append(em_id.uuidString)
            doc.write_kv_map(prefix='ner', key=str(line_id),
                             suffix='sentence', value=','.join(uuids))

        # extract event mentions
        if 'events' in json_obj:
            # fixed: previously wrote 'relations-iterator' here
            doc.write_kv_map(prefix='meta', key='events-iterator',
                             suffix='sentence', value='True')
            for line_id, events in enumerate(json_obj['events']):
                uuids = []
                for event in events:
                    # event[0] holds (trigger_token, event_type); start == end
                    # makes the trigger a single-token span — confirm format.
                    trigger = CementSpan(start=event[0][0], end=event[0][0],
                                         document=doc)
                    arguments = [
                        CementEntityMention(start=start, end=end, role=role,
                                            document=doc)
                        for start, end, role in event[1:]
                    ]
                    sm_id = doc.add_event_mention(trigger=trigger,
                                                  arguments=arguments,
                                                  event_type=event[0][1])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='event', key=str(line_id),
                                 suffix='sentence', value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have events.')

        # extract relation mentions
        if 'relations' in json_obj:
            # fixed: previously wrote 'events-iterator' here
            doc.write_kv_map(prefix='meta', key='relations-iterator',
                             suffix='sentence', value='True')
            for line_id, relations in enumerate(json_obj['relations']):
                uuids = []
                for relation in relations:
                    sub = CementEntityMention(start=relation[0],
                                              end=relation[1], document=doc)
                    obj = CementEntityMention(start=relation[2],
                                              end=relation[3], document=doc)
                    sm_id = doc.add_relation_mention(
                        arguments=[sub, obj], relation_type=relation[4])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='relation', key=str(line_id),
                                 suffix='sentence', value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have relations.')

        yield doc