Example #1
def comm_with_other_tags(*additional_tagging_types):
    comm = create_comm(
        'quick', '''\
The quick brown fox jumped
over the lazy dog .

Or did she ?
''')
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            sentence.tokenization.tokenTaggingList = [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'upper',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.upper(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'lower',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.lower(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
            ] + [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool/{}'.format(i),
                        timestamp=1,
                    ),
                    taggingType=tagging_type,
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag='{}_{}/{}'.format(tagging_type,
                                                  token.tokenIndex, i),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ) for (i, tagging_type) in enumerate(additional_tagging_types)
            ]
    return comm
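
A minimal usage sketch for the helper above (assuming the same concrete-python imports as in the snippet: create_comm, TokenTagging, TaggedToken, AnnotationMetadata, and generate_UUID):

# Hypothetical usage: request two extra tagging layers in addition to the
# built-in 'upper' and 'lower' layers.
comm = comm_with_other_tags('ner', 'pos')
for section in comm.sectionList:
    for sentence in section.sentenceList:
        # Each sentence now carries four TokenTagging objects.
        print([tt.taggingType for tt in sentence.tokenization.tokenTaggingList])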
Example #2
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
       The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = Communication(id=comm_id,
                         metadata=AnnotationMetadata(tool=toolname,
                                                     timestamp=timestamp),
                         type=toolname,
                         uuid=aug.next())

    tokenization = Tokenization(kind=TokenizationKind.TOKEN_LIST,
                                metadata=AnnotationMetadata(
                                    tool=toolname, timestamp=timestamp),
                                tokenList=TokenList(tokenList=[]),
                                uuid=aug.next())
    token_string_list = sentence_string.split()
    for i, token_string in enumerate(token_string_list):
        tokenization.tokenList.tokenList.append(
            Token(text=token_string, tokenIndex=i))

    sentence = Sentence(textSpan=TextSpan(0, len(sentence_string)),
                        tokenization=tokenization,
                        uuid=aug.next())

    section = Section(kind="SectionKind",
                      sentenceList=[sentence],
                      textSpan=TextSpan(0, len(sentence_string)),
                      uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm
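
A short usage sketch, assuming validate_communication is imported from concrete.validate as in the later examples:

from concrete.validate import validate_communication

# Build a one-section, one-sentence Communication and check that it validates.
comm = create_simple_comm('demo-comm-1', 'Hello concrete world .')
assert validate_communication(comm)
assert len(comm.sectionList) == 1
assert len(comm.sectionList[0].sentenceList) == 1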
Example #3
 def search(self, query):
     augf = AnalyticUUIDGeneratorFactory()
     aug = augf.create()
     results = []
     for query1 in return_search_results(query.rawQuery):
         query1 = SearchQuery(type=SearchType.SENTENCES,
                              terms=query1.split(" "),
                              rawQuery=query1,
                              k=500)
         result = self.other.search(query1)
         # logging.info(result.searchResultItems)
         results.extend(result.searchResultItems)
     # results = SearchResult(searchResultItems=results, searchQuery=query)
     # logging.info(len(results))
     resultsDict = {}
     for result in results:
         resultsDict[result.sentenceId.uuidString] = result
     results = list(resultsDict.values())
     # results = results[:10] # comment out on full run
     comm_ids_list, temp = get_comm_ids(results)
     dictUUID = fetch_dataset(comm_ids_list, temp)
     inv_map = {v: k for k, v in dictUUID.items()}
     toHannah = []
     for uuid in dictUUID:
         toHannah.append([query.rawQuery, dictUUID[uuid]])
     resultItemRet = SearchResult(uuid=aug.next(),
                                  searchQuery=query,
                                  searchResultItems=results,
                                  metadata=AnnotationMetadata(
                                      tool="search",
                                      timestamp=int(time.time())),
                                  lang="eng")
     with open("./trained_model.p", "rb") as model_file:
         model = pickle.load(model_file)
     pre = Preprocess()
     feature_matrix = pre.process_run(toHannah)
     dictRanks = pre_ranking(feature_matrix, model, toHannah, inv_map)
     results = rerank(dictRanks, resultItemRet)
     resultArr = results.searchResultItems
     resultArr = sorted(resultArr,
                        key=lambda result: result.score,
                        reverse=True)
     for item in resultArr:
         logging.info(item.score)
     resultItemRet = SearchResult(uuid=aug.next(),
                                  searchQuery=query,
                                  searchResultItems=resultArr,
                                  metadata=AnnotationMetadata(
                                      tool="search",
                                      timestamp=int(time.time())),
                                  lang="eng")
     return resultItemRet
Example #4
def _comm_with_properties(num_properties):
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[
        Token(tokenIndex=0, text='text', textSpan=TextSpan(start=0, ending=1))
    ])
    tokn = Tokenization(uuid=generate_UUID(),
                        metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST,
                        tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])
    trfs = TokenRefSequence(tokenizationId=tokn.uuid,
                            tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(),
                       entityType='entityType',
                       text='text',
                       tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(),
                           metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = list(
        Property(value="Property%d" % i, metadata=meta_prop, polarity=4.0)
        for i in range(num_properties))
    am = MentionArgument(role='role',
                         entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(), tokens=trfs, argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(),
                              metadata=meta_sms,
                              mentionList=[sm])
    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(),
                         id='id',
                         text='text',
                         type='type',
                         metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm
Example #5
def test_validate_minimal_communication_with_uuid():
    comm = Communication()
    comm.id = "myID"
    comm.metadata = AnnotationMetadata(tool="TEST", timestamp=int(time.time()))
    comm.type = "Test Communication"
    comm.uuid = generate_UUID()
    assert validate_communication(comm)
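
The imports this test relies on are not shown in the snippet; a plausible set, assuming the usual concrete-python layout (generate_UUID in concrete.util.concrete_uuid, validate_communication in concrete.validate):

import time

from concrete import AnnotationMetadata, Communication
from concrete.util.concrete_uuid import generate_UUID
from concrete.validate import validate_communication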
Example #6
def test_repr_on_tokenization():
    tokenization = Tokenization(
        metadata=AnnotationMetadata(
            tool="test", timestamp=int(time.time())),
        uuid=UUID(uuidString='01234567-0123-4567-89ab-cdef89abcdef')
    )
    tokenization.__repr__()
Example #7
def create_sentence(sen_text, sen_start, sen_end, aug, metadata_tool,
                    metadata_timestamp, annotation_level):
    '''
    Create sentence from provided text and metadata.
    Lower-level routine (called indirectly by create_comm).
    '''

    sections = (annotation_level is not None) and (annotation_level != AL_NONE)
    sentences = sections and (annotation_level != AL_SECTION)
    tokens = sentences and (annotation_level != AL_SENTENCE)

    return Sentence(
        uuid=aug.next(),
        textSpan=TextSpan(sen_start, sen_end),
        tokenization=Tokenization(
            uuid=aug.next(),
            kind=TokenizationKind.TOKEN_LIST,
            metadata=AnnotationMetadata(
                tool=metadata_tool,
                timestamp=metadata_timestamp,
            ),
            tokenList=TokenList(tokenList=[
                Token(
                    tokenIndex=i,
                    text=tok_text,
                ) for (i, tok_text) in enumerate(sen_text.split())
            ]),
        ) if tokens else None,
    )
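
A usage sketch under the same assumptions as the other examples (an AnalyticUUIDGeneratorFactory for UUIDs and the AL_* annotation-level constants defined alongside create_comm):

import time

from concrete.util.concrete_uuid import AnalyticUUIDGeneratorFactory

aug = AnalyticUUIDGeneratorFactory().create()
text = 'The quick brown fox .'
# Build a fully tokenized Sentence spanning the whole string.
sent = create_sentence(text, 0, len(text), aug, 'demo-tool',
                       int(time.time()), AL_TOKEN)
print([t.text for t in sent.tokenization.tokenList.tokenList])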
Example #8
    def annotate(self, communication):
        text = communication.text
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()
        entities = {}
        for section in communication.sectionList:
            for sentence in section.sentenceList:
                tokens = [
                    x.text for x in sentence.tokenization.tokenList.tokenList
                ]
                tags = [
                    x.tag for x in
                    sentence.tokenization.tokenTaggingList[-1].taggedTokenList
                ]
                for subtree in nltk.ne_chunk(list(zip(tokens, tags))).subtrees():
                    if subtree.label() != "S":
                        name = " ".join([x[0] for x in subtree.leaves()])
                        logging.info("Found named entity \"%s\"", name)
                        key = (name, subtree.label())
                        entities[key] = entities.get(key, []) + [
                            EntityMention(
                                uuid=aug.next(),
                                entityType=subtree.label(),
                                tokens=TokenRefSequence(
                                    tokenIndexList=[],
                                    tokenizationId=sentence.tokenization.uuid))
                        ]

        communication.entitySetList.append(
            EntitySet(uuid=aug.next(),
                      metadata=AnnotationMetadata(timestamp=int(time.time()),
                                                  tool="nltk"),
                      entityList=[
                          Entity(uuid=aug.next(),
                                 mentionIdList=[x.uuid for x in v],
                                 canonicalName=k[0],
                                 type=k[1]) for k, v in entities.items()
                      ]))

        communication.entityMentionSetList.append(
            EntityMentionSet(uuid=aug.next(),
                             metadata=AnnotationMetadata(timestamp=int(
                                 time.time()),
                                                         tool="nltk"),
                             mentionList=sum(entities.values(), [])))

        return communication
Example #9
 def search(self, search_query):
     return SearchResult(
         uuid=UUID(uuidString='12345678-1234-5678-1234-567812345678'),
         searchResultItems=[
             SearchResultItem(communicationId=term, score=42.)
             for term in search_query.terms
         ],
         metadata=AnnotationMetadata(tool=self.METADATA_TOOL,
                                     timestamp=int(time())))
Example #10
 def search(self, query):
     augf = AnalyticUUIDGeneratorFactory()
     aug = augf.create()
     return SearchResult(uuid=aug.next(),
                         searchQuery=query,
                         searchResultItems=[],
                         metadata=AnnotationMetadata(tool="stub search",
                                                     timestamp=int(
                                                         time.time())),
                         lang="eng")
Example #11
def tokenization(request):
    return Tokenization(tokenTaggingList=[
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='?',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='?'),
                TaggedToken(tokenIndex=1, tag='?'),
                TaggedToken(tokenIndex=2, tag='?'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='POS',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='X'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='LEMMA',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='mambo'),
                TaggedToken(tokenIndex=1, tag='number'),
                TaggedToken(tokenIndex=2, tag='4'),
            ],
        ),
    ], )
Example #12
def json_to_concrete(doc: Dict) -> Communication:
    metadata = AnnotationMetadata(
        tool="BlingBLing",
        timestamp=int(datetime.datetime.now().timestamp())
    )
    comm: Communication = Communication(
        uuid=augf.next(),
        id=doc['doc_key'],
        type="aida",
        metadata=metadata,
        lidList=[LanguageIdentification(
            uuid=augf.next(),
            metadata=metadata,
            languageToProbabilityMap={doc['language_id']: 1.0}
        )],
        sectionList=[Section(
            uuid=augf.next(),
            kind="passage",
            sentenceList=[
                Sentence(
                    uuid=augf.next(),
                    tokenization=Tokenization(
                        uuid=augf.next(),
                        kind=TokenizationKind.TOKEN_LIST,
                        metadata=metadata,
                        tokenList=TokenList(
                            tokenList=[
                                Token(
                                    tokenIndex=i,
                                    text=t
                                )
                                for i, t in enumerate(get_flatten_sentence(doc))
                            ]
                        )
                    )
                )
            ]
        )],
        entityMentionSetList=[EntityMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )],
        situationMentionSetList=[SituationMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )]
    )

    return comm
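
json_to_concrete references a module-level augf that the snippet does not define; judging by the augf.next() calls it must be the generator produced by an AnalyticUUIDGeneratorFactory, not the factory itself. get_flatten_sentence is likewise an external helper left undefined here. A minimal sketch of the assumed UUID setup (hypothetical, following the pattern of the other examples):

from concrete.util.concrete_uuid import AnalyticUUIDGeneratorFactory

# Assumed module-level generator used by json_to_concrete for fresh UUIDs.
augf = AnalyticUUIDGeneratorFactory().create()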
Example #13
 def annotate(self, communication):
     text = ""
     for section in communication.sectionList:
         if section.kind == "content":
             text += communication.text[section.textSpan.start:section.textSpan.ending]
     scores = {languages.get(iso639_1_code=k).iso639_3_code: math.exp(v)
               for k, v in self.classifier.classify(text).items()}
     logging.info(str(scores))
     augf = AnalyticUUIDGeneratorFactory(communication)
     aug = augf.create()
     lid = LanguageIdentification(uuid=aug.next(),
                                  languageToProbabilityMap=scores,
                                  metadata=AnnotationMetadata(tool="valid", timestamp=int(time.time()), kBest=len(scores)),
     )
     communication.lidList.append(lid)
     return communication
Example #14
def test_get_tagged_tokens_non_unique_tagging_specify_tool(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            metadata=AnnotationMetadata(tool='z'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ), )
    assert ['N', 'N', 'Y'] == list(
        map(lambda t: t.tag,
            get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
    assert [0, 1, 2] == list(
        map(lambda t: t.tokenIndex,
            get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
Example #15
 def annotate(self, communication):
     print(communication.id)
     augf = AnalyticUUIDGeneratorFactory(communication)
     aug = augf.create()
     for section in communication.sectionList:
         for sentence in section.sentenceList:
             text = communication.text[sentence.textSpan.start:sentence.textSpan.ending]
             sentence.tokenization = Tokenization(
                 uuid=aug.next(),
                 kind=TokenizationKind.TOKEN_LIST,
                 tokenList=TokenList(tokenList=[]),
                 tokenTaggingList=[],
                 metadata=AnnotationMetadata(timestamp=int(time.time()),
                                             tool="nltk"))
             for i, token in enumerate(nltk.word_tokenize(text)):
                 logging.info("Found token %s", token)
                 sentence.tokenization.tokenList.tokenList.append(Token(tokenIndex=i, text=token))
     return communication
Example #16
def capture_tweet_lid(twitter_dict):
    """
    Attempts to capture the 'lang' field in the twitter API, if it
    exists.

    Returns a LanguageIdentification object, or None if the
    field is not present in the tweet json.
    """
    if u'lang' in twitter_dict:
        amd = AnnotationMetadata(tool="Twitter LID",
                                 timestamp=int(time.time()),
                                 kBest=1)
        kvs = {}
        kvs[twitter_lid_to_iso639_3(twitter_dict[u'lang'])] = 1.0
        return LanguageIdentification(metadata=amd,
                                      languageToProbabilityMap=kvs)
    else:
        return None
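
A usage sketch with a hypothetical tweet dict; twitter_lid_to_iso639_3 is assumed to be concrete-python's Twitter language-code helper, as the snippet implies:

# A tweet with a 'lang' field yields a single LanguageIdentification.
lid = capture_tweet_lid({u'lang': u'en', u'text': u'hello'})
print(lid.languageToProbabilityMap)  # e.g. {'eng': 1.0}

# Without 'lang', the helper returns None.
assert capture_tweet_lid({u'text': u'hello'}) is None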
Example #17
def create_comm(comm_id,
                text='',
                comm_type='article',
                section_kind='passage',
                metadata_tool='concrete-python',
                metadata_timestamp=None,
                annotation_level=AL_TOKEN):
    '''
    Create a simple, valid Communication from text.
    By default the text will be split by double-newlines into sections
    and then by single newlines into sentences within those sections.

    annotation_level controls the amount of annotation that is added:
      AL_NONE      add no optional annotations (not even sections)
      AL_SECTION   add sections but not sentences
      AL_SENTENCE  add sentences but not tokens
      AL_TOKEN     add all annotations, up to tokens (the default)

    If metadata_timestamp is None, the current time will be used.
    '''

    if metadata_timestamp is None:
        metadata_timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    sections = (annotation_level is not None) and (annotation_level != AL_NONE)

    return Communication(
        id=comm_id,
        uuid=aug.next(),
        type=comm_type,
        text=text,
        metadata=AnnotationMetadata(
            tool=metadata_tool,
            timestamp=metadata_timestamp,
        ),
        sectionList=([
            create_section(sec_text, sec_start, sec_end, section_kind, aug,
                           metadata_tool, metadata_timestamp, annotation_level)
            for (sec_text, sec_start, sec_end) in _split(text, '\n\n')
        ] if text.strip() else []) if sections else None,
    )
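
A usage sketch for create_comm (this is the helper concrete-python ships as concrete.util.simple_comm.create_comm; treat the exact import path as an assumption):

# Two sections (split on the blank line); the second has two sentences,
# and AL_TOKEN (the default) adds whitespace tokenization.
comm = create_comm('doc-1',
                   'The quick brown fox jumped .\n\n'
                   'Over the lazy dog .\nOr did she ?\n')
for section in comm.sectionList:
    for sentence in section.sentenceList:
        print([t.text for t in sentence.tokenization.tokenList.tokenList])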
Example #18
 def search(self, query):
     augf = AnalyticUUIDGeneratorFactory()
     aug = augf.create()
     k_returned = query.k
     terms = query.terms
     # implement boolean search here
     query_matches = {}
     documents = []
     with gzip.open("/mnt/index/index.gz", 'rt', encoding='utf-8') as index:
         first_line = index.readline()
         documents = first_line.split()
         query_matches = self.docsByTerm(terms, index)
     results = self.returnDocList(query_matches, documents, k_returned)
     return SearchResult(uuid=aug.next(),
                         searchQuery=query,
                         searchResultItems=results,
                         metadata=AnnotationMetadata(tool="stub search",
                                                     timestamp=int(
                                                         time.time())),
                         lang="eng")
Example #19
def set_tokentaggings_of_type_v(tokenization, taggingType, prediction,
                                toolname):
    timestamp = int(time.time() * 1e6)
    tokens = tokenization.tokenList.tokenList
    new_pred = []
    start = 0
    for i, tk in enumerate(tokens):
        tg = ' '.join(prediction[start:start + len(tk.text)])
        #print tk.text, tg
        new_pred.append(TaggedToken(tokenIndex=i, tag=tg))
        start += len(tk.text)
    assert len(new_pred) == len(tokens)
    #print start, len(prediction)
    assert start == len(prediction)
    new_tokentagging = TokenTagging(taggingType=taggingType,
                                    taggedTokenList=new_pred,
                                    metadata=AnnotationMetadata(
                                        tool=toolname, timestamp=timestamp),
                                    uuid=generate_UUID())
    tokenization.tokenTaggingList.append(new_tokentagging)
Example #20
 def search(self, query):
     augf = AnalyticUUIDGeneratorFactory()
     aug = augf.create()
     terms = query.terms
     num_docs = query.k
     query_matches = {}
     documents = []
     results = []
     with gzip.open("/mnt/index/index.gz", 'rt', encoding='utf-8') as index:
         first_line = index.readline()
         documents = first_line.split()
         query_matches = self.tfidfByDoc(terms, index)
     results = self.returnDocList(query_matches, documents, num_docs)
     # begin weighted search here
     return SearchResult(uuid=aug.next(),
                         searchQuery=query,
                         searchResultItems=results,
                         metadata=AnnotationMetadata(
                             tool="stub search",
                             timestamp=int(time.time())),
                         lang="eng")
Example #21
    def annotate(self, communication):
        text = communication.text
        augf = AnalyticUUIDGeneratorFactory(communication)
        aug = augf.create()
        for section in communication.sectionList:
            for sentence in section.sentenceList:
                tokens = [
                    x.text for x in sentence.tokenization.tokenList.tokenList
                ]
                sentence.tokenization.tokenTaggingList.append(
                    TokenTagging(uuid=aug.next(),
                                 metadata=AnnotationMetadata(timestamp=int(
                                     time.time()),
                                                             tool="nltk"),
                                 taggedTokenList=[],
                                 taggingType="Penn Treebank"))
                for i, (tok, tag) in enumerate(nltk.pos_tag(tokens)):
                    logging.info("Tagged %s as %s", tok, tag)
                    sentence.tokenization.tokenTaggingList[
                        -1].taggedTokenList.append(
                            TaggedToken(tokenIndex=i, tag=tag))

        return communication
Example #22
    def index():
        text = request.forms.get('text')
        transport = TTransport.TFramedTransport(
            TSocket.TSocket(options.annotator_host, options.annotator_port))
        protocol = TCompactProtocol.TCompactProtocol(transport)
        client = Annotator.Client(protocol)
        transport.open()
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(
            id="",
            text=text,
            uuid=aug.next(),
            type="user-supplied input",
            metadata=AnnotationMetadata(timestamp=int(time.time()),
                                        tool="stdin"),
            sectionList=[
                Section(uuid=aug.next(),
                        sentenceList=[],
                        kind="paragraph",
                        textSpan=TextSpan(start=0, ending=len(text)))
            ],
            entitySetList=[],
            entityMentionSetList=[],
        )

        new_c = client.annotate(c)
        form = '''<form action="/" method="post">
        Enter or paste some text: <input name="text" type="text" />
        <input value="Submit" type="submit" />
        </form>
        '''
        return form + "\n".join(["<h3>%s</h3>" % text] + [
            "\n".join([
                "<br>%s %s" % (e.type, e.canonicalName) for e in es.entityList
            ]) for es in new_c.entitySetList
        ])
Example #23
def json_tweet_object_to_Communication(tweet):
    """
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None
    tweet_time = unix_time(datetime.strptime(tweet_info.createdAt,
                                             CREATED_AT_FORMAT))
    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=aug.next(),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = aug.next()
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
Example #24
    ofd = CommunicationWriterTGZ(options.output)
    with reader(gzip.open(options.input)) as ifd:
        for i, line in enumerate(ifd):
            toks = line.strip().split("\t")
            if len(toks) != 3:
                continue            
            cid, label, text = toks
            g = ugf.create()
            t = int(time())
            comm = Communication(
                id=cid,
                uuid=g.next(),
                type="Text document",
                text=text,
                communicationTaggingList=[
                    CommunicationTagging(uuid=g.next(),
                                         metadata=AnnotationMetadata(
                                             tool="Gold labeling",
                                             timestamp=t,
                                             kBest=1),
                                         taggingType=options.tag_type,
                                         tagList=[label],
                                         confidenceList=[1.0])
                ],
                metadata=AnnotationMetadata(tool="text_to_concrete.py ingester",
                                            timestamp=t, kBest=1),
                sectionList=[
                    Section(uuid=g.next(),
                            textSpan=TextSpan(start=0, ending=len(text)),
                            kind="content")
                ])
            ofd.write(comm)
    ofd.close()
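
To read the archive back, concrete-python provides CommunicationReader in concrete.util.file_io; a minimal sketch (the file name is hypothetical):

from concrete.util.file_io import CommunicationReader

# CommunicationReader yields (Communication, filename) pairs from the .tar.gz.
for comm, name in CommunicationReader('output.tgz'):
    print(comm.id, comm.communicationTaggingList[0].tagList)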
Example #25
 def __init__(self, tool=None):
     self.metadata = AnnotationMetadata(
         tool=HasMetadata.gen_tool(tool),
         timestamp=0,
     )
Example #26
    parser.add_argument("-p", "--port", dest="port", type=int, default=9090)
    parser.add_argument("-H", "--host", dest="host", default="localhost")
    options = parser.parse_args()

    # Make socket
    transport = TSocket.TSocket(options.host, options.port)

    # Buffering is critical. Raw sockets are very slow
    transport = TTransport.TBufferedTransport(transport)

    # Wrap in a protocol
    protocol = TCompactProtocol.TCompactProtocol(transport)

    # Create a client to use the protocol encoder
    client = Annotator.Client(protocol)
    
    # Connect!
    transport.open()

    while True:
        s = input("Write some text > ")
        if re.match(r"^\s*$", s):
            break
        else:
            augf = AnalyticUUIDGeneratorFactory()
            aug = augf.create()
            c = Communication(
                id="", text=s, uuid=aug.next(), type="tweet",
                metadata=AnnotationMetadata(timestamp=0, tool="stdin"),
                lidList=[])

            new_c = client.annotate(c)
            print(new_c)
Example #27
def update_concrete(comm, prediction):
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    # use j for the within-token offset so it does not shadow
                    # the outer token index i
                    for j, tag in enumerate(prediction[start:start +
                                                       len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if j != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            e_type, p_type = ne_type.split(
                                '.') if '.' in ne_type else (ne_type, 'NAM')
                            #print token_idx_list, ne_text, e_type, p_type
                            e_mention = EntityMention(uuid=generate_UUID(),
                                                      tokens=entity_tokens,
                                                      entityType=e_type,
                                                      phraseType=p_type,
                                                      text=''.join(ne_text))
                            mention_list.append(e_mention)
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    #print 'not in NE,', prediction[start:start+len(tk.text)]
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        #print tag
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    #print token_idx_list, ne_text
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = ne_type.split(
                            '.') if '.' in ne_type else (ne_type, 'NAM')
                        e_mention = EntityMention(uuid=generate_UUID(),
                                                  tokens=entity_tokens,
                                                  entityType=e_type,
                                                  phraseType=p_type,
                                                  text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            pner_tokentagging = TokenTagging(taggingType=PRED_TAG,
                                             taggedTokenList=pred_ner_tags,
                                             metadata=AnnotationMetadata(
                                                 tool=toolname,
                                                 timestamp=timestamp),
                                             uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    entity_list = [
        Entity(uuid=generate_UUID(),
               type=mention.entityType,
               canonicalName=mention.text,
               mentionIdList=[mention.uuid]) for mention in mention_list
    ]
    entity_mention_set = EntityMentionSet(uuid=generate_UUID(),
                                          metadata=AnnotationMetadata(
                                              tool=toolname,
                                              timestamp=timestamp),
                                          mentionList=mention_list)
    entity_set = EntitySet(uuid=generate_UUID(),
                           metadata=AnnotationMetadata(tool=toolname,
                                                       timestamp=timestamp),
                           entityList=entity_list,
                           mentionSetId=entity_mention_set.uuid)
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
Example #28
 def getMetadata(self):
     metadata = AnnotationMetadata(tool=self.METADATA_TOOL,
                                   timestamp=int(time()))
     return metadata
Example #29
    # Connect!
    transport.open()

    while True:
        s = input("Write some text > ")
        if re.match(r"^\s*$", s):
            break
        else:
            augf = AnalyticUUIDGeneratorFactory()
            aug = augf.create()
            c = Communication(
                id="",
                text=s,
                uuid=aug.next(),
                type="user-supplied input",
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="stdin"),
                sectionList=[
                    Section(uuid=aug.next(),
                            sentenceList=[],
                            kind="paragraph",
                            textSpan=TextSpan(start=0, ending=len(s)))
                ],
                entitySetList=[],
                entityMentionSetList=[],
            )

            new_c = client.annotate(c)
            for es in new_c.entitySetList:
                for e in es.entityList:
                    print "%s %s" % (e.type, e.canonicalName)