def comm_with_other_tags(*additional_tagging_types):
    """Return a small fixture Communication whose every tokenization carries
    an 'upper' and a 'lower' TokenTagging, plus one synthetic tagging for
    each name passed in *additional_tagging_types*.
    """
    comm = create_comm('quick', '''\
The quick brown fox jumped
over the lazy dog .

Or did she ?
''')

    def _make_tagging(tool, tagging_type, tag_fn, tokens):
        # One TokenTagging covering every token, tags produced by tag_fn.
        return TokenTagging(
            uuid=generate_UUID(),
            metadata=AnnotationMetadata(tool=tool, timestamp=1),
            taggingType=tagging_type,
            taggedTokenList=[
                TaggedToken(tokenIndex=tok.tokenIndex, tag=tag_fn(tok))
                for tok in tokens
            ],
        )

    for section in comm.sectionList:
        for sentence in section.sentenceList:
            tokens = sentence.tokenization.tokenList.tokenList
            taggings = [
                _make_tagging(u'tool', u'upper',
                              lambda tok: tok.text.upper(), tokens),
                _make_tagging(u'tool', u'lower',
                              lambda tok: tok.text.lower(), tokens),
            ]
            # Synthetic taggings: tag text encodes "<type>_<index>/<slot>".
            for slot, tagging_type in enumerate(additional_tagging_types):
                taggings.append(_make_tagging(
                    u'tool/{}'.format(slot),
                    tagging_type,
                    lambda tok, tt=tagging_type, s=slot:
                        '{}_{}/{}'.format(tt, tok.tokenIndex, s),
                    tokens))
            # Replace (not extend) the tagging list on each tokenization.
            sentence.tokenization.tokenTaggingList = taggings
    return comm
def comm_with_other_tags(*additional_tagging_types):
    # Build a small fixture Communication, then attach token taggings to
    # every tokenization: an 'upper' and a 'lower' tagging, plus one
    # synthetic tagging per entry in *additional_tagging_types*.
    comm = create_comm('quick', '''\
The quick brown fox jumped
over the lazy dog .

Or did she ?
''')
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            # Replace (not extend) the tagging list on each tokenization.
            sentence.tokenization.tokenTaggingList = [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'upper',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.upper(),
                        )
                        for token
                        in sentence.tokenization.tokenList.tokenList
                    ],
                ),
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'lower',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.lower(),
                        )
                        for token
                        in sentence.tokenization.tokenList.tokenList
                    ],
                ),
            ] + [
                # One extra tagging per requested type; each tag encodes
                # "<type>_<tokenIndex>/<position in argument list>".
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool/{}'.format(i),
                        timestamp=1,
                    ),
                    taggingType=tagging_type,
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag='{}_{}/{}'.format(tagging_type,
                                                  token.tokenIndex, i),
                        )
                        for token
                        in sentence.tokenization.tokenList.tokenList
                    ],
                )
                for (i, tagging_type) in enumerate(additional_tagging_types)
            ]
    return comm
def test_validate_minimal_communication_with_uuid():
    """A Communication with only id, metadata, type, and uuid validates."""
    communication = Communication()
    communication.id = "myID"
    communication.metadata = AnnotationMetadata(tool="TEST",
                                                timestamp=int(time.time()))
    communication.type = "Test Communication"
    communication.uuid = generate_UUID()
    assert validate_communication(communication)
def test_validate_minimal_communication_with_uuid():
    # A Communication needs only id, metadata, type, and uuid set for
    # validate_communication() to accept it.
    comm = Communication()
    comm.id = "myID"
    comm.metadata = AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    comm.type = "Test Communication"
    comm.uuid = generate_UUID()
    assert validate_communication(comm)
def search(self, query):
    """Run *query* against the index and return a concrete SearchResult.

    Side effect: overwrites ``query.labels`` with the top ``self.k_query``
    index features (ranked by the returned log distribution) before the
    result object is assembled.
    """
    logger.info("Received SearchQuery: '%s'" % query)
    search_result_items = []
    # Optional per-label weights; only used when the caller supplied labels.
    weights = (None if query.labels is None or len(query.labels) == 0
               else [float(e) for e in query.labels])
    entities, log_distribution = self.index.query(query.terms, query.k,
                                                  weights=weights)
    # Rank index features by their score in the log distribution,
    # highest first, and echo the best k_query back via query.labels.
    lm = sorted(self.index.feat2id.items(),
                key=lambda x: log_distribution[self.index.feat2id[x[0]]],
                reverse=True)
    query.labels = [e[0] for e in lm[:self.k_query]]
    for guid, score in entities:
        search_result_item = SearchResultItem()
        # presumably guid2sent maps entity guid -> [(uuid, sentence tokens)];
        # verify against index construction.
        uuid_sentences = self.index.guid2sent[guid]
        ss = np.empty((len(uuid_sentences), ))
        for idx, (uuid, sent) in enumerate(uuid_sentences):
            ss[idx] = lm_score(log_distribution, self.index.feat2id, sent)
        # Indices of the k_rationale best-scoring sentences, best first
        # (reverse slice over argsort's ascending order).
        sorted_idi = np.argsort(ss)[-1:-self.k_rationale - 1:-1]
        sents = [' '.join(uuid_sentences[e][1]) for e in sorted_idi]
        uuidi = [uuid_sentences[e][0] for e in sorted_idi]
        # NOTE(review): the rationale sentences are packed into
        # communicationId, newline-separated after the guid — confirm
        # downstream consumers expect this format.
        search_result_item.communicationId = guid + '\n' + '\n'.join(sents)
        search_result_item.sentenceId = None
        search_result_item.score = score
        entity = Entity()
        entity.uuid = generate_UUID()
        entity.id = guid
        # Wrap each rationale sentence uuid string in a concrete UUID object.
        uuidList = []
        for single_uuid in uuidi:
            uuidObj = UUID()
            uuidObj.uuidString = single_uuid
            uuidList.append(uuidObj)
        entity.mentionIdList = uuidList
        search_result_item.entity = entity
        search_result_items.append(search_result_item)
    search_result = SearchResult()
    search_result.uuid = generate_UUID()
    search_result.searchResultItems = search_result_items
    search_result.searchQuery = query
    logger.info("Returned SearchResult with %d SearchResultItems\n"
                % len(search_result.searchResultItems))
    return search_result
def _comm_with_properties(num_properties):
    """Assemble a fully cross-referenced one-token Communication whose
    single SituationMention argument carries *num_properties* Property
    objects.
    """
    stamp = 17

    # One-token tokenization wrapped in a sentence and section.
    token = Token(tokenIndex=0, text='text',
                  textSpan=TextSpan(start=0, ending=1))
    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='tokn-tool', timestamp=stamp),
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=TokenList(tokenList=[token]))
    sent = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    sec = Section(uuid=generate_UUID(), kind='kind', label='label',
                  sentenceList=[sent])

    # Entity mention anchored on that single token.
    ref_seq = TokenRefSequence(tokenizationId=tokenization.uuid,
                               tokenIndexList=[0], anchorTokenIndex=0)
    mention = EntityMention(uuid=generate_UUID(), entityType='entityType',
                            text='text', tokens=ref_seq)
    mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='ems-tool', timestamp=stamp),
        mentionList=[mention])

    # Situation mention whose argument has the requested property count;
    # all properties share one metadata object.
    prop_meta = AnnotationMetadata(tool='Annotator1', timestamp=stamp)
    prop_list = [Property(value="Property%d" % n, metadata=prop_meta,
                          polarity=4.0)
                 for n in range(num_properties)]
    argument = MentionArgument(role='role', entityMentionId=mention.uuid,
                               propertyList=prop_list)
    situation = SituationMention(uuid=generate_UUID(), tokens=ref_seq,
                                 argumentList=[argument])
    situation_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool='sms-tool', timestamp=stamp),
        mentionList=[situation])

    comm = Communication(
        uuid=generate_UUID(), id='id', text='text', type='type',
        metadata=AnnotationMetadata(tool='tool', timestamp=stamp),
        sectionList=[sec],
        situationMentionSetList=[situation_set],
        entityMentionSetList=[mention_set])
    add_references_to_communication(comm)
    return comm
def _comm_with_properties(num_properties):
    """Build a one-token Communication whose SituationMention argument
    carries *num_properties* Property objects, with references resolved
    via add_references_to_communication().
    """
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[Token(tokenIndex=0, text='text',
                                      textSpan=TextSpan(start=0, ending=1))])
    tokn = Tokenization(uuid=generate_UUID(), metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST, tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(), kind='kind', label='label',
                      sentenceList=[sentence])
    # Token reference covering the single token above.
    trfs = TokenRefSequence(tokenizationId=tokn.uuid, tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(), entityType='entityType',
                       text='text', tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(), metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    # The property list under test; all share one metadata object.
    props = list(
        Property(
            value="Property%d" % i,
            metadata=meta_prop,
            polarity=4.0) for i in range(num_properties))
    am = MentionArgument(role='role', entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(), tokens=trfs,
                          argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(), metadata=meta_sms,
                              mentionList=[sm])
    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(), id='id', text='text',
                         type='type', metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm
def set_tokentaggings_of_type_v(tokenization, taggingType, prediction,
                                toolname):
    """Append a new TokenTagging of *taggingType* to *tokenization*.

    *prediction* supplies one predicted tag per character of each token's
    text; the len(tk.text) tags belonging to a token are joined with
    spaces into that token's tag string.

    Raises AssertionError if the prediction length does not line up
    exactly with the tokens.
    """
    # BUG FIX: long() is a Python-2-only builtin; int() is behaviorally
    # identical here (Python 2 auto-promotes to long on overflow) and
    # works on Python 3.
    timestamp = int(time.time() * 1e6)
    tokens = tokenization.tokenList.tokenList
    new_pred = []
    start = 0
    for i, tk in enumerate(tokens):
        # Consume len(tk.text) prediction entries for this token.
        tg = ' '.join(prediction[start:start + len(tk.text)])
        new_pred.append(TaggedToken(tokenIndex=i, tag=tg))
        start += len(tk.text)
    assert len(new_pred) == len(tokens)
    # Every prediction entry must have been consumed.
    assert start == len(prediction)
    new_tokentagging = TokenTagging(
        taggingType=taggingType,
        taggedTokenList=new_pred,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        uuid=generate_UUID())
    tokenization.tokenTaggingList.append(new_tokentagging)
def set_tokentaggings_of_type_v(tokenization, taggingType, prediction,
                                toolname):
    """Append a TokenTagging of *taggingType* built from *prediction*.

    Each token consumes len(tk.text) entries of *prediction* (one tag per
    character of the token text), joined with spaces into the token's tag.

    Raises AssertionError when the prediction length does not match the
    total character count of the tokens.
    """
    # BUG FIX: replaced Python-2-only long() with int(); identical
    # behavior on Python 2 (ints auto-promote) and valid on Python 3.
    timestamp = int(time.time() * 1e6)
    tokens = tokenization.tokenList.tokenList
    new_pred = []
    start = 0
    for i, tk in enumerate(tokens):
        tg = ' '.join(prediction[start:start + len(tk.text)])
        new_pred.append(TaggedToken(tokenIndex=i, tag=tg))
        start += len(tk.text)
    assert len(new_pred) == len(tokens)
    # All prediction entries must be accounted for.
    assert start == len(prediction)
    new_tokentagging = TokenTagging(taggingType=taggingType,
                                    taggedTokenList=new_pred,
                                    metadata=AnnotationMetadata(
                                        tool=toolname,
                                        timestamp=timestamp),
                                    uuid=generate_UUID())
    tokenization.tokenTaggingList.append(new_tokentagging)
def test_generate_UUID():
    """generate_UUID() output is assignable as a Communication uuid."""
    communication = Communication()
    communication.uuid = generate_UUID()
def update_concrete(comm, prediction):
    """Attach predicted NER taggings, entity mentions, and entities to *comm*.

    *prediction* appears to hold one BIO tag per character of each token's
    text (len(tk.text) entries are consumed per token) -- TODO confirm
    against the caller.
    """
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            # State for the named-entity span currently being built.
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    # NOTE(review): this inner loop rebinds the outer loop
                    # variable `i`; after it runs, `i` (used below for
                    # token_idx_list and TaggedToken.tokenIndex) is an
                    # offset within this token's tag slice, not a token
                    # index. Looks like a bug -- confirm intent.
                    for i, tag in enumerate(
                            prediction[start:start + len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if i != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            # Span ended: emit an EntityMention for it.
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            # ne_type is "TYPE.PHRASE" or bare TYPE
                            # (phrase type defaults to 'NAM').
                            e_type, p_type = ne_type.split(
                                '.') if '.' in ne_type else (ne_type, 'NAM')
                            e_mention = EntityMention(uuid=generate_UUID(),
                                                      tokens=entity_tokens,
                                                      entityType=e_type,
                                                      phraseType=p_type,
                                                      text=''.join(ne_text))
                            mention_list.append(e_mention)
                            # Reset span state.
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    # A new entity starts somewhere in this token's tags.
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    # Close a single-token entity immediately when the last
                    # tag of this token does not continue the span.
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = ne_type.split(
                            '.') if '.' in ne_type else (ne_type, 'NAM')
                        e_mention = EntityMention(uuid=generate_UUID(),
                                                  tokens=entity_tokens,
                                                  entityType=e_type,
                                                  phraseType=p_type,
                                                  text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            # Record the raw predicted tags on this tokenization.
            pner_tokentagging = TokenTagging(taggingType=PRED_TAG,
                                             taggedTokenList=pred_ner_tags,
                                             metadata=AnnotationMetadata(
                                                 tool=toolname,
                                                 timestamp=timestamp),
                                             uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    # One singleton Entity per collected mention.
    entity_list = [
        Entity(uuid=generate_UUID(),
               type=mention.entityType,
               canonicalName=mention.text,
               mentionIdList=[mention.uuid]) for mention in mention_list
    ]
    entity_mention_set = EntityMentionSet(uuid=generate_UUID(),
                                          metadata=AnnotationMetadata(
                                              tool=toolname,
                                              timestamp=timestamp),
                                          mentionList=mention_list)
    entity_set = EntitySet(uuid=generate_UUID(),
                           metadata=AnnotationMetadata(tool=toolname,
                                                       timestamp=timestamp),
                           entityList=entity_list,
                           mentionSetId=entity_mention_set.uuid)
    # Replace (not extend) the communication-level sets.
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
def update_concrete(comm, prediction):
    """Add predicted NER taggings, entity mentions, and entities to *comm*.

    *prediction* seems to carry one BIO tag per character of each token's
    text (len(tk.text) entries consumed per token) -- TODO confirm.
    """
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            # Current named-entity span state.
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    # NOTE(review): inner loop rebinds the outer `i`, so
                    # subsequent uses of `i` (token_idx_list append and
                    # TaggedToken.tokenIndex) hold a slice offset rather
                    # than a token index. Suspected bug -- verify.
                    for i, tag in enumerate(
                            prediction[start:start + len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if i != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            # Span ended: emit an EntityMention.
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            # "TYPE.PHRASE" or bare TYPE ('NAM' default).
                            e_type, p_type = ne_type.split(
                                '.') if '.' in ne_type else (ne_type, 'NAM')
                            e_mention = EntityMention(uuid=generate_UUID(),
                                                      tokens=entity_tokens,
                                                      entityType=e_type,
                                                      phraseType=p_type,
                                                      text=''.join(ne_text))
                            mention_list.append(e_mention)
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    # A new entity begins within this token's tags.
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    # Single-token entity: close it right away if the last
                    # tag of this token does not continue the span.
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = ne_type.split(
                            '.') if '.' in ne_type else (ne_type, 'NAM')
                        e_mention = EntityMention(uuid=generate_UUID(),
                                                  tokens=entity_tokens,
                                                  entityType=e_type,
                                                  phraseType=p_type,
                                                  text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            # Store the raw predicted tags on the tokenization.
            pner_tokentagging = TokenTagging(
                taggingType=PRED_TAG,
                taggedTokenList=pred_ner_tags,
                metadata=AnnotationMetadata(tool=toolname,
                                            timestamp=timestamp),
                uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    # One singleton Entity per mention.
    entity_list = [Entity(uuid=generate_UUID(),
                          type=mention.entityType,
                          canonicalName=mention.text,
                          mentionIdList=[mention.uuid])
                   for mention in mention_list]
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        mentionList=mention_list)
    entity_set = EntitySet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        entityList=entity_list,
        mentionSetId=entity_mention_set.uuid)
    # Replace (not extend) the communication-level sets.
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]