def comm_with_other_tags(*additional_tagging_types):
    comm = create_comm('quick', '''\
The quick brown fox jumped over the lazy dog .
Or did she ?
''')
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            sentence.tokenization.tokenTaggingList = [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'upper',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.upper(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'lower',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.lower(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
            ] + [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool/{}'.format(i),
                        timestamp=1,
                    ),
                    taggingType=tagging_type,
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag='{}_{}/{}'.format(tagging_type, token.tokenIndex, i),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                )
                for (i, tagging_type) in enumerate(additional_tagging_types)
            ]
    return comm
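# Usage sketch (not part of the original tests): comm_with_other_tags builds a
# small Communication whose every tokenization carries an 'upper' and a 'lower'
# tagging plus one tagging per extra type passed in. The helper
# get_tagged_tokens (used in the tests below) can then read a tagging back out.
def example_comm_with_other_tags_usage():
    comm = comm_with_other_tags(u'NUMERAL')
    first_tokenization = comm.sectionList[0].sentenceList[0].tokenization
    upper_tags = [t.tag for t in get_tagged_tokens(first_tokenization, u'upper')]
    # create_comm tokenizes on whitespace, so the first token is 'The'.
    assert upper_tags[0] == 'THE'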
def test_get_tagged_tokens_non_unique_tagging(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
    )
    with raises(Exception):
        get_tagged_tokens(tokenization, 'NUMERAL')
def tokenization(request):
    return Tokenization(
        tokenTaggingList=[
            TokenTagging(
                metadata=AnnotationMetadata(tool='x'),
                taggingType='?',
                taggedTokenList=[
                    TaggedToken(tokenIndex=0, tag='?'),
                    TaggedToken(tokenIndex=1, tag='?'),
                    TaggedToken(tokenIndex=2, tag='?'),
                ],
            ),
            TokenTagging(
                metadata=AnnotationMetadata(tool='x'),
                taggingType='POS',
                taggedTokenList=[
                    TaggedToken(tokenIndex=0, tag='N'),
                    TaggedToken(tokenIndex=1, tag='N'),
                    TaggedToken(tokenIndex=2, tag='X'),
                ],
            ),
            TokenTagging(
                metadata=AnnotationMetadata(tool='y'),
                taggingType='NUMERAL',
                taggedTokenList=[
                    TaggedToken(tokenIndex=0, tag='N'),
                    TaggedToken(tokenIndex=1, tag='N'),
                    TaggedToken(tokenIndex=2, tag='Y'),
                ],
            ),
            TokenTagging(
                metadata=AnnotationMetadata(tool='y'),
                taggingType='LEMMA',
                taggedTokenList=[
                    TaggedToken(tokenIndex=0, tag='mambo'),
                    TaggedToken(tokenIndex=1, tag='number'),
                    TaggedToken(tokenIndex=2, tag='4'),
                ],
            ),
        ],
    )
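# Hypothetical companion test (not in the original file): with exactly one 'POS'
# tagging present in the fixture above, get_tagged_tokens returns that tagging's
# tokens in index order.
def test_get_tagged_tokens_pos_example(tokenization):
    assert ['N', 'N', 'X'] == [t.tag for t in get_tagged_tokens(tokenization, 'POS')]
    assert [0, 1, 2] == [t.tokenIndex for t in get_tagged_tokens(tokenization, 'POS')]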
def test_get_token_taggings_non_unique_tagging(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
    )
    assert [
        [(0, 'N'), (1, 'N'), (2, 'Y')],
        [(0, 'N'), (1, 'Y'), (2, 'Y')],
    ] == [
        [(t.tokenIndex, t.tag) for t in tt.taggedTokenList]
        for tt in get_token_taggings(tokenization, 'NUMERAL')
    ]
def test_get_tagged_tokens_non_unique_tagging_specify_tool(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            metadata=AnnotationMetadata(tool='z'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
    )
    assert ['N', 'N', 'Y'] == list(
        map(lambda t: t.tag,
            get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
    assert [0, 1, 2] == list(
        map(lambda t: t.tokenIndex,
            get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
def set_tokentaggings_of_type_v(tokenization, taggingType, prediction, toolname):
    timestamp = int(time.time() * 1e6)
    tokens = tokenization.tokenList.tokenList
    new_pred = []
    start = 0
    for i, tk in enumerate(tokens):
        # 'prediction' holds one tag per character; join the tags covering this
        # token's characters into a single space-separated tag string.
        tg = ' '.join(prediction[start:start + len(tk.text)])
        new_pred.append(TaggedToken(tokenIndex=i, tag=tg))
        start += len(tk.text)

    assert len(new_pred) == len(tokens)
    assert start == len(prediction)

    new_tokentagging = TokenTagging(
        taggingType=taggingType,
        taggedTokenList=new_pred,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        uuid=generate_UUID())
    tokenization.tokenTaggingList.append(new_tokentagging)
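# Usage sketch (names assumed): 'char_tags' holds one tag per character of the
# sentence, so its length must equal the total character count of the tokens.
# set_tokentaggings_of_type_v then attaches a tagging whose per-token tag is the
# space-joined run of character tags covering that token.
def example_set_tokentaggings_usage(tokenization, char_tags):
    total_chars = sum(len(tok.text) for tok in tokenization.tokenList.tokenList)
    assert len(char_tags) == total_chars
    set_tokentaggings_of_type_v(tokenization, 'NER', char_tags, 'example_tagger')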
def annotate(self, communication):
    text = communication.text
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            tokens = [
                x.text for x in sentence.tokenization.tokenList.tokenList
            ]
            sentence.tokenization.tokenTaggingList.append(
                TokenTagging(
                    uuid=aug.next(),
                    metadata=AnnotationMetadata(
                        timestamp=int(time.time()), tool="nltk"),
                    taggedTokenList=[],
                    taggingType="Penn Treebank"))
            for i, (tok, tag) in enumerate(nltk.pos_tag(tokens)):
                logging.info("Tagged %s as %s", tok, tag)
                sentence.tokenization.tokenTaggingList[-1].taggedTokenList.append(
                    TaggedToken(tokenIndex=i, tag=tag))
    return communication
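# Usage sketch (class name hypothetical; annotate() above is assumed to be one
# of its methods): read a Communication, POS-tag it with NLTK, and write it
# back out with concrete.util's file helpers.
def example_annotate_usage(input_path, output_path):
    comm = read_communication_from_file(input_path)   # from concrete.util
    annotated = NLTKPosAnnotator().annotate(comm)     # hypothetical class owning annotate()
    write_communication_to_file(annotated, output_path)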
def update_concrete(comm, prediction):
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                # 'prediction' holds one BIO tag per character; this token is
                # covered by the slice prediction[start:start + len(tk.text)].
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    for j, tag in enumerate(prediction[start:start + len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            # The open mention ends here; if it reached into this
                            # token, include this token before closing it.
                            if j != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            e_type, p_type = ne_type.split(
                                '.') if '.' in ne_type else (ne_type, 'NAM')
                            e_mention = EntityMention(uuid=generate_UUID(),
                                                      tokens=entity_tokens,
                                                      entityType=e_type,
                                                      phraseType=p_type,
                                                      text=''.join(ne_text))
                            mention_list.append(e_mention)
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                    else:
                        # The mention continues through this whole token.
                        token_idx_list.append(i)
                        ne_text.append(tk.text)
                if not in_NE and 'B-' in pred_tags:
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        # The mention does not continue past this token; close it.
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = ne_type.split(
                            '.') if '.' in ne_type else (ne_type, 'NAM')
                        e_mention = EntityMention(uuid=generate_UUID(),
                                                  tokens=entity_tokens,
                                                  entityType=e_type,
                                                  phraseType=p_type,
                                                  text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            pner_tokentagging = TokenTagging(
                taggingType=PRED_TAG,
                taggedTokenList=pred_ner_tags,
                metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
                uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    entity_list = [
        Entity(uuid=generate_UUID(),
               type=mention.entityType,
               canonicalName=mention.text,
               mentionIdList=[mention.uuid]) for mention in mention_list
    ]
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        mentionList=mention_list)
    entity_set = EntitySet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        entityList=entity_list,
        mentionSetId=entity_mention_set.uuid)
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
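# Usage sketch: 'prediction' is a flat list of character-level BIO tags covering
# every character of every token in the Communication. update_concrete mutates
# the Communication in place, adding the predicted tagging plus entity mention
# and entity sets.
def example_update_concrete_usage(comm, prediction):
    n_chars = sum(len(tok.text)
                  for section in comm.sectionList
                  for sentence in section.sentenceList
                  for tok in sentence.tokenization.tokenList.tokenList)
    assert len(prediction) == n_chars
    update_concrete(comm, prediction)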