from typing import List, Optional, Tuple

from concrete import (Communication, EntityMention, MentionArgument,
                      SituationMention, TokenRefSequence)


# Assumes module-level context: `augf` (a UUID generator), `ontology` (the
# event-type mapping), and `check_duplicate_mention` (which returns an
# existing EntityMention for a token span, or None).
def add_event_to_comm(mentions: List[Tuple[int, int, str]],
                      event_type: str,
                      comm: Communication):
    # This function assumes that the comm only has one sentence.
    tokenization_id = comm.sectionList[0].sentenceList[0].tokenization.uuid
    # Hold a reference to the new SituationMention rather than re-indexing
    # the mention list, which may already contain other mentions.
    situation_mention = SituationMention(
        uuid=augf.next(),
        situationType='EVENT',
        situationKind=ontology['mappings']['events'][event_type],
        argumentList=[])
    comm.situationMentionSetList[0].mentionList.append(situation_mention)
    for start, end, role in mentions:
        entity_mention: Optional[EntityMention] = check_duplicate_mention(
            comm, (start, end))
        if entity_mention is None:
            entity_mention = EntityMention(
                uuid=augf.next(),
                tokens=TokenRefSequence(
                    tokenIndexList=list(range(start, end)),
                    tokenizationId=tokenization_id))
            # Only newly created mentions are appended; duplicates are
            # reused instead of being added to the set a second time.
            comm.entityMentionSetList[0].mentionList.append(entity_mention)
        situation_mention.argumentList.append(
            MentionArgument(role=role,
                            entityMentionId=entity_mention.uuid))
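A minimal usage sketch for the function above, assuming the same module-level augf/ontology context and a one-sentence Communication comm; the span, role, and event-type strings are illustrative only:

# Mark tokens 0-2 as the 'Attacker' argument of a new event mention; the
# event-type key must exist in ontology['mappings']['events'].
add_event_to_comm(mentions=[(0, 2, 'Attacker')],
                  event_type='Conflict.Attack',
                  comm=comm)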
Example #2
from concrete import (AnnotationMetadata, Communication, EntityMention,
                      EntityMentionSet, MentionArgument, Property, Section,
                      Sentence, SituationMention, SituationMentionSet,
                      TextSpan, Token, Tokenization, TokenizationKind,
                      TokenList, TokenRefSequence)
from concrete.util.concrete_uuid import generate_UUID
from concrete.util.references import add_references_to_communication


def _comm_with_properties(num_properties):
    """Build a minimal one-token Communication whose single situation-mention
    argument carries num_properties Property objects."""
    ts = 17  # fixed timestamp keeps the fixture deterministic
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[
        Token(tokenIndex=0, text='text', textSpan=TextSpan(start=0, ending=1))
    ])
    tokn = Tokenization(uuid=generate_UUID(),
                        metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST,
                        tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])
    trfs = TokenRefSequence(tokenizationId=tokn.uuid,
                            tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(),
                       entityType='entityType',
                       text='text',
                       tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(),
                           metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = [
        Property(value='Property%d' % i, metadata=meta_prop, polarity=4.0)
        for i in range(num_properties)
    ]
    am = MentionArgument(role='role',
                         entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(), tokens=trfs, argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(),
                              metadata=meta_sms,
                              mentionList=[sm])
    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(),
                         id='id',
                         text='text',
                         type='type',
                         metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm
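A quick check of the fixture's shape; the assertions just restate what the builder above constructs:

comm = _comm_with_properties(3)
arg = comm.situationMentionSetList[0].mentionList[0].argumentList[0]
assert len(arg.propertyList) == 3
assert arg.propertyList[0].value == 'Property0'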
Example #3
# Assumes the same module-level context as add_event_to_comm above:
# `augf` (UUID generator) and `check_duplicate_mention`.
def add_additional_mentions_to_comm(mentions: List[Tuple[int, int]],
                                    comm: Communication):
    # As above, the comm is assumed to contain exactly one sentence.
    tokenization_id = comm.sectionList[0].sentenceList[0].tokenization.uuid
    for start, end in mentions:
        # Skip spans that already have an EntityMention.
        if check_duplicate_mention(comm, (start, end)) is not None:
            continue

        comm.entityMentionSetList[0].mentionList.append(
            EntityMention(
                uuid=augf.next(),
                tokens=TokenRefSequence(
                    tokenIndexList=list(range(start, end)),
                    tokenizationId=tokenization_id)))
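Usage mirrors add_event_to_comm, but with bare (start, end) spans; the indices below are illustrative only:

add_additional_mentions_to_comm(mentions=[(3, 5), (7, 8)], comm=comm)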
Example #4
    def annotate(self, communication):
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()
        entities = {}
        for section in communication.sectionList:
            for sentence in section.sentenceList:
                tokens = [
                    x.text for x in sentence.tokenization.tokenList.tokenList
                ]
                tags = [
                    x.tag for x in
                    sentence.tokenization.tokenTaggingList[-1].taggedTokenList
                ]
                # ne_chunk needs a concrete sequence, not a zip iterator.
                chunks = nltk.ne_chunk(list(zip(tokens, tags)))
                for subtree in chunks.subtrees():
                    if subtree.label() != "S":  # skip the root of the tree
                        name = " ".join(x[0] for x in subtree.leaves())
                        logging.info('Found named entity "%s"', name)
                        # Key on (name, label) consistently for both lookup
                        # and insertion.
                        key = (name, subtree.label())
                        entities.setdefault(key, []).append(
                            EntityMention(
                                uuid=next(aug),
                                entityType=subtree.label(),
                                # Token indices are not recovered from the
                                # chunk tree, so the sequence is left empty.
                                tokens=TokenRefSequence(
                                    tokenIndexList=[],
                                    tokenizationId=sentence.tokenization.uuid)))

        communication.entitySetList.append(
            EntitySet(uuid=next(aug),
                      metadata=AnnotationMetadata(timestamp=int(time.time()),
                                                  tool="nltk"),
                      entityList=[
                          Entity(uuid=next(aug),
                                 mentionIdList=[x.uuid for x in v],
                                 canonicalName=k[0],
                                 type=k[1]) for k, v in entities.items()
                      ]))

        communication.entityMentionSetList.append(
            EntityMentionSet(uuid=next(aug),
                             metadata=AnnotationMetadata(
                                 timestamp=int(time.time()), tool="nltk"),
                             mentionList=sum(entities.values(), [])))

        return communication
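A sketch of how the method above might be driven, assuming it lives on a hypothetical NltkAnnotator class and that the input Communication already carries tokenizations whose last TokenTagging holds POS tags; the file paths are illustrative:

from concrete.util.file_io import (read_communication_from_file,
                                   write_communication_to_file)

comm = read_communication_from_file('input.concrete')  # illustrative path
comm = NltkAnnotator().annotate(comm)  # hypothetical class name
write_communication_to_file(comm, 'output.concrete')  # illustrative path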
Example #5
import time

from concrete import (AnnotationMetadata, Entity, EntityMention,
                      EntityMentionSet, EntitySet, TaggedToken,
                      TokenRefSequence, TokenTagging)
from concrete.util.concrete_uuid import generate_UUID


def update_concrete(comm, prediction):
    """Decode BIO tags in `prediction` into EntityMentions and attach them,
    plus a per-token TokenTagging, to `comm`.

    Assumes a module-level PRED_TAG constant naming the tagging type, and
    that `prediction` holds one tag per character of each token's text, so
    each token consumes len(tk.text) tags.
    """
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    # Close the open entity at the first tag that no longer
                    # continues it.  (Note the inner index is `j`, not `i`:
                    # reusing `i` would clobber the token index used below.)
                    for j, tag in enumerate(
                            prediction[start:start + len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if j != 0:
                                # The entity extends partway into this token,
                                # so the token still belongs to it.
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            e_type, p_type = (ne_type.split('.')
                                              if '.' in ne_type
                                              else (ne_type, 'NAM'))
                            mention_list.append(
                                EntityMention(uuid=generate_UUID(),
                                              tokens=entity_tokens,
                                              entityType=e_type,
                                              phraseType=p_type,
                                              text=''.join(ne_text)))
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    # A new entity starts somewhere in this token.
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    # Single-token entity: the token's last character tag
                    # does not continue the entity, so close it immediately.
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = (ne_type.split('.')
                                          if '.' in ne_type
                                          else (ne_type, 'NAM'))
                        mention_list.append(
                            EntityMention(uuid=generate_UUID(),
                                          tokens=entity_tokens,
                                          entityType=e_type,
                                          phraseType=p_type,
                                          text=''.join(ne_text)))
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            # Record the raw per-token tag strings as a new TokenTagging.
            tknzation.tokenTaggingList.append(
                TokenTagging(taggingType=PRED_TAG,
                             taggedTokenList=pred_ner_tags,
                             metadata=AnnotationMetadata(tool=toolname,
                                                         timestamp=timestamp),
                             uuid=generate_UUID()))
    # One singleton Entity per mention.
    entity_list = [
        Entity(uuid=generate_UUID(),
               type=mention.entityType,
               canonicalName=mention.text,
               mentionIdList=[mention.uuid]) for mention in mention_list
    ]
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        mentionList=mention_list)
    entity_set = EntitySet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        entityList=entity_list,
        mentionSetId=entity_mention_set.uuid)
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
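A minimal driving sketch for update_concrete, assuming PRED_TAG is defined and that the tagger emits one BIO tag per character; predict_chars and the file paths are hypothetical stand-ins:

from concrete.util.file_io import (read_communication_from_file,
                                   write_communication_to_file)

comm = read_communication_from_file('doc.concrete')  # illustrative path
# predict_chars is a hypothetical stand-in for whatever model produces
# per-character BIO tags, e.g. ['B-PER', 'I-PER', 'O', ...].
prediction = predict_chars(comm.text)
update_concrete(comm, prediction)
write_communication_to_file(comm, 'doc.tagged.concrete')  # illustrative path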