Example #1
def comm_with_other_tags(*additional_tagging_types):
    comm = create_comm(
        'quick', '''\
The quick brown fox jumped
over the lazy dog .

Or did she ?
''')
    # Give every sentence an 'upper' and a 'lower' tagging, plus one tagging
    # per additional requested type.
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            sentence.tokenization.tokenTaggingList = [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'upper',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.upper(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'lower',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.lower(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
            ] + [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool/{}'.format(i),
                        timestamp=1,
                    ),
                    taggingType=tagging_type,
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag='{}_{}/{}'.format(tagging_type,
                                                  token.tokenIndex, i),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ) for (i, tagging_type) in enumerate(additional_tagging_types)
            ]
    return comm
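
A minimal usage sketch for the helper above, assuming it and its concrete-python dependencies (create_comm, generate_UUID, TokenTagging, TaggedToken, AnnotationMetadata) are in scope; the extra tagging types 'pos' and 'chunk' are illustrative, not from the original code:

comm = comm_with_other_tags('pos', 'chunk')
for section in comm.sectionList:
    for sentence in section.sentenceList:
        for tagging in sentence.tokenization.tokenTaggingList:
            # Expect 'upper' and 'lower' first, then 'pos' and 'chunk'.
            print(tagging.taggingType,
                  [t.tag for t in tagging.taggedTokenList])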
Example #2
def test_get_tagged_tokens_non_unique_tagging(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ), )
    with raises(Exception):
        get_tagged_tokens(tokenization, 'NUMERAL')
Example #3
def tokenization(request):
    return Tokenization(tokenTaggingList=[
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='?',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='?'),
                TaggedToken(tokenIndex=1, tag='?'),
                TaggedToken(tokenIndex=2, tag='?'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='POS',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='X'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='LEMMA',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='mambo'),
                TaggedToken(tokenIndex=1, tag='number'),
                TaggedToken(tokenIndex=2, tag='4'),
            ],
        ),
    ], )
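
For context, a sketch of how a test might consume this pytest fixture. It assumes the function above is registered as a fixture (e.g. decorated with @pytest.fixture in the test module or conftest.py) and that get_tagged_tokens is the same helper used in the surrounding tests; in concrete-python it is provided by concrete.util.tokenization:

from concrete.util.tokenization import get_tagged_tokens

def test_get_tagged_tokens_pos(tokenization):
    # The fixture holds exactly one 'POS' tagging, so no tool filter is needed.
    assert ['N', 'N', 'X'] == [
        t.tag for t in get_tagged_tokens(tokenization, 'POS')
    ]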
Example #4
def test_get_token_taggings_non_unique_tagging(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ), )
    assert [
        [(0, 'N'), (1, 'N'), (2, 'Y')],
        [(0, 'N'), (1, 'Y'), (2, 'Y')],
    ] == [[(t.tokenIndex, t.tag) for t in tt.taggedTokenList]
          for tt in get_token_taggings(tokenization, 'NUMERAL')]
Example #5
def test_get_tagged_tokens_non_unique_tagging_specify_tool(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            metadata=AnnotationMetadata(tool='z'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ), )
    assert ['N', 'N', 'Y'] == list(
        map(lambda t: t.tag,
            get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
    assert [0, 1, 2] == list(
        map(lambda t: t.tokenIndex,
            get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
Example #6
def set_tokentaggings_of_type_v(tokenization, taggingType, prediction,
                                toolname):
    timestamp = int(time.time() * 1e6)
    tokens = tokenization.tokenList.tokenList
    new_pred = []
    start = 0
    for i, tk in enumerate(tokens):
        # One predicted tag per character of the token, joined with spaces.
        tg = ' '.join(prediction[start:start + len(tk.text)])
        #print tk.text, tg
        new_pred.append(TaggedToken(tokenIndex=i, tag=tg))
        start += len(tk.text)
    assert len(new_pred) == len(tokens)
    #print start, len(prediction)
    assert start == len(prediction)
    new_tokentagging = TokenTagging(taggingType=taggingType,
                                    taggedTokenList=new_pred,
                                    metadata=AnnotationMetadata(
                                        tool=toolname, timestamp=timestamp),
                                    uuid=generate_UUID())
    tokenization.tokenTaggingList.append(new_tokentagging)
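
A hedged sketch of invoking the helper above; tokenization is assumed to be a Tokenization whose tokenList and tokenTaggingList are populated (unlike the fixture in Example #3), and the all-'O' prediction list and tool name are illustrative, chosen only to satisfy the expectation of one tag per character of each token:

tokens = tokenization.tokenList.tokenList
prediction = ['O'] * sum(len(tk.text) for tk in tokens)
set_tokentaggings_of_type_v(tokenization, 'NER', prediction, 'example-tool')
# The new tagging is appended last and carries the requested type.
assert tokenization.tokenTaggingList[-1].taggingType == 'NER'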
Example #7
    def annotate(self, communication):
        text = communication.text
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()
        for section in communication.sectionList:
            for sentence in section.sentenceList:
                tokens = [
                    x.text for x in sentence.tokenization.tokenList.tokenList
                ]
                sentence.tokenization.tokenTaggingList.append(
                    TokenTagging(
                        uuid=next(aug),
                        metadata=AnnotationMetadata(
                            timestamp=int(time.time()), tool="nltk"),
                        taggedTokenList=[],
                        taggingType="Penn Treebank"))
                for i, (tok, tag) in enumerate(nltk.pos_tag(tokens)):
                    logging.info("Tagged %s as %s", tok, tag)
                    sentence.tokenization.tokenTaggingList[
                        -1].taggedTokenList.append(
                            TaggedToken(tokenIndex=i, tag=tag))

        return communication
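
A short sketch of reading the tags back after the method above runs; annotator is a hypothetical instance of the class defining annotate(), and comm a tokenized Communication:

comm = annotator.annotate(comm)
for section in comm.sectionList:
    for sentence in section.sentenceList:
        # annotate() appends the Penn Treebank tagging last.
        penn = sentence.tokenization.tokenTaggingList[-1]
        print([(t.tokenIndex, t.tag) for t in penn.taggedTokenList])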
Example #8
def update_concrete(comm, prediction):
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    # j is the character offset within this token's tags;
                    # i remains the token index from the enclosing loop.
                    for j, tag in enumerate(prediction[start:start +
                                                       len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if j != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            e_type, p_type = ne_type.split(
                                '.') if '.' in ne_type else (ne_type, 'NAM')
                            #print token_idx_list, ne_text, e_type, p_type
                            e_mention = EntityMention(uuid=generate_UUID(),
                                                      tokens=entity_tokens,
                                                      entityType=e_type,
                                                      phraseType=p_type,
                                                      text=''.join(ne_text))
                            mention_list.append(e_mention)
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    #print 'not in NE,', prediction[start:start+len(tk.text)]
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        #print tag
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    #print token_idx_list, ne_text
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = ne_type.split(
                            '.') if '.' in ne_type else (ne_type, 'NAM')
                        e_mention = EntityMention(uuid=generate_UUID(),
                                                  tokens=entity_tokens,
                                                  entityType=e_type,
                                                  phraseType=p_type,
                                                  text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            pner_tokentagging = TokenTagging(taggingType=PRED_TAG,
                                             taggedTokenList=pred_ner_tags,
                                             metadata=AnnotationMetadata(
                                                 tool=toolname,
                                                 timestamp=timestamp),
                                             uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    entity_list = [
        Entity(uuid=generate_UUID(),
               type=mention.entityType,
               canonicalName=mention.text,
               mentionIdList=[mention.uuid]) for mention in mention_list
    ]
    entity_mention_set = EntityMentionSet(uuid=generate_UUID(),
                                          metadata=AnnotationMetadata(
                                              tool=toolname,
                                              timestamp=timestamp),
                                          mentionList=mention_list)
    entity_set = EntitySet(uuid=generate_UUID(),
                           metadata=AnnotationMetadata(tool=toolname,
                                                       timestamp=timestamp),
                           entityList=entity_list,
                           mentionSetId=entity_mention_set.uuid)
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
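
A brief sketch of inspecting what update_concrete attaches; comm and prediction are assumed to be a tokenized Communication and a character-level tag list, as in Example #6, with the helper's concrete-python dependencies in scope:

update_concrete(comm, prediction)
print(len(comm.entityMentionSetList[0].mentionList), 'entity mentions')
print(len(comm.entitySetList[0].entityList), 'entities')
# Each sentence also gains a token tagging whose type is PRED_TAG.
for tagging in comm.sectionList[0].sentenceList[0].tokenization.tokenTaggingList:
    print(tagging.taggingType)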