from typing import List, Optional, Tuple

from concrete import (Communication, EntityMention, MentionArgument,
                      SituationMention, TokenRefSequence)


def add_event_to_comm(mentions: List[Tuple[int, int, str]],
                      event_type: str,
                      comm: Communication):
    # This function assumes that the comm only has one sentence.
    # `augf` (a UUID generator) and `ontology` (the event-type mapping) are
    # module-level objects in the original source.
    tokenization_id = comm.sectionList[0].sentenceList[0].tokenization.uuid
    situation_mention = SituationMention(
        uuid=augf.next(),
        situationType='EVENT',
        situationKind=ontology['mappings']['events'][event_type],
        argumentList=[])
    comm.situationMentionSetList[0].mentionList.append(situation_mention)
    for start, end, role in mentions:
        # Reuse an existing mention with the same token span if there is one.
        new_entity_mention: Optional[EntityMention] = \
            check_duplicate_mention(comm, (start, end))
        if new_entity_mention is None:
            new_entity_mention = EntityMention(
                uuid=augf.next(),
                tokens=TokenRefSequence(
                    tokenIndexList=list(range(start, end)),
                    tokenizationId=tokenization_id))
            comm.entityMentionSetList[0].mentionList.append(new_entity_mention)
        # Attach the argument to the SituationMention created above (the
        # original indexed mentionList[0], which is only correct when the
        # mention set was empty beforehand).
        situation_mention.argumentList.append(
            MentionArgument(role=role,
                            entityMentionId=new_entity_mention.uuid))
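# Not part of the original snippet: a minimal sketch of the
# check_duplicate_mention helper that add_event_to_comm (and
# add_additional_mentions_to_comm below) assume. It returns the existing
# EntityMention whose tokens cover exactly the half-open span (start, end),
# or None if no such mention exists.
def check_duplicate_mention(comm: Communication,
                            span: Tuple[int, int]) -> Optional[EntityMention]:
    wanted = list(range(span[0], span[1]))
    for mention in comm.entityMentionSetList[0].mentionList:
        if mention.tokens is not None and \
                mention.tokens.tokenIndexList == wanted:
            return mention
    return None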
from concrete import (AnnotationMetadata, Communication, EntityMention,
                      EntityMentionSet, MentionArgument, Property, Section,
                      Sentence, SituationMention, SituationMentionSet,
                      TextSpan, Token, TokenList, Tokenization,
                      TokenizationKind, TokenRefSequence)
from concrete.util import generate_UUID
from concrete.util.references import add_references_to_communication


def _comm_with_properties(num_properties):
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[Token(tokenIndex=0,
                                      text='text',
                                      textSpan=TextSpan(start=0, ending=1))])
    tokn = Tokenization(uuid=generate_UUID(),
                        metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST,
                        tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])
    trfs = TokenRefSequence(tokenizationId=tokn.uuid,
                            tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(),
                       entityType='entityType',
                       text='text',
                       tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(),
                           metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = [Property(value='Property%d' % i,
                      metadata=meta_prop,
                      polarity=4.0)
             for i in range(num_properties)]
    am = MentionArgument(role='role',
                         entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(),
                          tokens=trfs,
                          argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(),
                              metadata=meta_sms,
                              mentionList=[sm])
    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(),
                         id='id',
                         text='text',
                         type='type',
                         metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm
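# Hypothetical usage (not from the original source): the fixture above is
# meant for tests that need a Communication wired up with back-references,
# e.g. checking that argument properties survive construction.
def test_comm_with_properties():
    comm = _comm_with_properties(2)
    arg = comm.situationMentionSetList[0].mentionList[0].argumentList[0]
    assert [p.value for p in arg.propertyList] == ['Property0', 'Property1']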
def add_additional_mentions_to_comm(mentions: List[Tuple[int, int]],
                                    comm: Communication):
    # Like add_event_to_comm, this assumes the comm has a single sentence.
    tokenization_id = comm.sectionList[0].sentenceList[0].tokenization.uuid
    for start, end in mentions:
        # Skip spans that already have a mention.
        if check_duplicate_mention(comm, (start, end)) is not None:
            continue
        new_entity_mention = EntityMention(
            uuid=augf.next(),
            tokens=TokenRefSequence(
                tokenIndexList=list(range(start, end)),
                tokenizationId=tokenization_id))
        comm.entityMentionSetList[0].mentionList.append(new_entity_mention)
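# Hypothetical example (event type and role names invented for
# illustration): register an event over a one-sentence comm, then add a
# non-argument mention. Spans are half-open token ranges [start, end).
add_event_to_comm([(0, 2, 'agent'), (5, 6, 'patient')], 'attack', comm)
add_additional_mentions_to_comm([(3, 4)], comm)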
import logging
import time

import nltk

from concrete import (AnnotationMetadata, Entity, EntityMention,
                      EntityMentionSet, EntitySet, TokenRefSequence)
from concrete.util import AnalyticUUIDGeneratorFactory


# Method of an annotator class; shown with the module-level imports it needs.
def annotate(self, communication):
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    entities = {}
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            tokens = [x.text
                      for x in sentence.tokenization.tokenList.tokenList]
            tagging = sentence.tokenization.tokenTaggingList[-1]
            tags = [x.tag for x in tagging.taggedTokenList]
            for subtree in nltk.ne_chunk(list(zip(tokens, tags))).subtrees():
                if subtree.label() != "S":
                    name = " ".join(x[0] for x in subtree.leaves())
                    logging.info('Found named entity "%s"', name)
                    key = (name, subtree.label())
                    # The original looked this up with entities.get(name, ...),
                    # which never matched the (name, label) keys it stored;
                    # fixed to accumulate under the same key.
                    entities[key] = entities.get(key, []) + [
                        EntityMention(
                            uuid=next(aug),
                            entityType=subtree.label(),
                            # Token indices are not recovered from the chunk
                            # tree, so the index list is left empty as in the
                            # original.
                            tokens=TokenRefSequence(
                                tokenIndexList=[],
                                tokenizationId=sentence.tokenization.uuid))]
    communication.entitySetList.append(
        EntitySet(uuid=next(aug),
                  metadata=AnnotationMetadata(timestamp=int(time.time()),
                                              tool="nltk"),
                  entityList=[Entity(uuid=next(aug),
                                     mentionIdList=[x.uuid for x in v],
                                     canonicalName=k[0],
                                     type=k[1])
                              for k, v in entities.items()]))
    communication.entityMentionSetList.append(
        EntityMentionSet(uuid=next(aug),
                         metadata=AnnotationMetadata(timestamp=int(time.time()),
                                                     tool="nltk"),
                         mentionList=sum(entities.values(), [])))
    return communication
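# Hypothetical driver (the annotator class name is assumed): read a
# tokenized, POS-tagged communication, run the NLTK NE chunker over it, and
# write the result back out.
from concrete.util import (read_communication_from_file,
                           write_communication_to_file)

comm = read_communication_from_file('input.comm')
comm = NltkAnnotator().annotate(comm)
write_communication_to_file(comm, 'output.comm')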
import time

from concrete import (AnnotationMetadata, Entity, EntityMention,
                      EntityMentionSet, EntitySet, TaggedToken,
                      TokenRefSequence, TokenTagging)
from concrete.util import generate_UUID


def update_concrete(comm, prediction):
    # `prediction` holds one BIO tag per character of each token's text;
    # PRED_TAG is a module-level constant naming the tagging type.
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    # The original shadowed the token index `i` with this
                    # inner loop variable; renamed to `j` to fix that bug.
                    for j, tag in enumerate(
                            prediction[start:start + len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if j != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            e_type, p_type = (ne_type.split('.')
                                              if '.' in ne_type
                                              else (ne_type, 'NAM'))
                            e_mention = EntityMention(
                                uuid=generate_UUID(),
                                tokens=entity_tokens,
                                entityType=e_type,
                                phraseType=p_type,
                                text=''.join(ne_text))
                            mention_list.append(e_mention)
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    # Close the entity immediately if it does not continue
                    # past this token.
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = (ne_type.split('.')
                                          if '.' in ne_type
                                          else (ne_type, 'NAM'))
                        e_mention = EntityMention(
                            uuid=generate_UUID(),
                            tokens=entity_tokens,
                            entityType=e_type,
                            phraseType=p_type,
                            text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            pner_tokentagging = TokenTagging(
                taggingType=PRED_TAG,
                taggedTokenList=pred_ner_tags,
                metadata=AnnotationMetadata(tool=toolname,
                                            timestamp=timestamp),
                uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    entity_list = [Entity(uuid=generate_UUID(),
                          type=mention.entityType,
                          canonicalName=mention.text,
                          mentionIdList=[mention.uuid])
                   for mention in mention_list]
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        mentionList=mention_list)
    entity_set = EntitySet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        entityList=entity_list,
        mentionSetId=entity_mention_set.uuid)
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
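# Hypothetical usage (model object and file names invented; read/write
# helpers imported as in the driver above): `prediction` is the flat list
# of character-level BIO tags the indexing above implies, e.g.
# 'B-GPE.NAM', 'I-GPE.NAM', 'O', one tag per character of token text.
comm = read_communication_from_file('input.comm')
prediction = ner_model.predict(comm)  # ner_model is a hypothetical tagger
update_concrete(comm, prediction)
write_communication_to_file(comm, 'output.comm')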