def comm_with_other_tags(*additional_tagging_types):
    comm = create_comm('quick', '''\
The quick brown fox jumped
over the lazy dog .

Or did she ?
''')
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            sentence.tokenization.tokenTaggingList = [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'upper',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.upper(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool',
                        timestamp=1,
                    ),
                    taggingType=u'lower',
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag=token.text.lower(),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                ),
            ] + [
                TokenTagging(
                    uuid=generate_UUID(),
                    metadata=AnnotationMetadata(
                        tool=u'tool/{}'.format(i),
                        timestamp=1,
                    ),
                    taggingType=tagging_type,
                    taggedTokenList=[
                        TaggedToken(
                            tokenIndex=token.tokenIndex,
                            tag='{}_{}/{}'.format(tagging_type,
                                                  token.tokenIndex, i),
                        )
                        for token in sentence.tokenization.tokenList.tokenList
                    ],
                )
                for (i, tagging_type) in enumerate(additional_tagging_types)
            ]
    return comm
def create_simple_comm(comm_id, sentence_string="Super simple sentence ."):
    """Create a simple (valid) Communication suitable for testing purposes

    The Communication will have a single Section containing a single
    Sentence.

    Args:

    - `comm_id`: A string specifying a Communication ID
    - `sentence_string`: A string to be used for the sentence text.
      The string will be whitespace-tokenized.

    Returns:

    - A Concrete Communication object
    """
    logging.warning('create_simple_comm will be removed in a future'
                    ' release, please use create_comm instead')

    toolname = "TEST"
    timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    comm = Communication(
        id=comm_id,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        type=toolname,
        uuid=aug.next())

    tokenization = Tokenization(
        kind=TokenizationKind.TOKEN_LIST,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        tokenList=TokenList(tokenList=[]),
        uuid=aug.next())
    token_string_list = sentence_string.split()
    for i, token_string in enumerate(token_string_list):
        tokenization.tokenList.tokenList.append(
            Token(text=token_string, tokenIndex=i))

    sentence = Sentence(
        textSpan=TextSpan(0, len(sentence_string)),
        tokenization=tokenization,
        uuid=aug.next())

    section = Section(
        kind="SectionKind",
        sentenceList=[sentence],
        textSpan=TextSpan(0, len(sentence_string)),
        uuid=aug.next())

    comm.sectionList = [section]
    comm.text = sentence_string

    return comm
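# Usage sketch (illustrative, not part of the library): with the defaults
# above, create_simple_comm yields one Section with one whitespace-tokenized
# Sentence. The helper name below is hypothetical; the assertions only
# restate what the function builds.
def _example_create_simple_comm():
    example = create_simple_comm('example-id')
    assert example.text == 'Super simple sentence .'
    assert len(example.sectionList) == 1
    tokens = (example.sectionList[0].sentenceList[0]
              .tokenization.tokenList.tokenList)
    assert [t.text for t in tokens] == ['Super', 'simple', 'sentence', '.']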
def search(self, query):
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    results = []
    for query1 in return_search_results(query.rawQuery):
        query1 = SearchQuery(type=SearchType.SENTENCES,
                             terms=query1.split(" "),
                             rawQuery=query1,
                             k=500)
        result = self.other.search(query1)
        # logging.info(result.searchResultItems)
        results.extend(result.searchResultItems)
    # results = SearchResult(searchResultItems=results, searchQuery=query)
    # logging.info(len(results))
    resultsDict = {}
    for result in results:
        resultsDict[result.sentenceId.uuidString] = result
    results = []
    for key in resultsDict:
        results.append(resultsDict[key])
    # results = results[:10]  # comment out on full run
    comm_ids_list, temp = get_comm_ids(results)
    dictUUID = fetch_dataset(comm_ids_list, temp)
    inv_map = {v: k for k, v in dictUUID.items()}
    toHannah = []
    for uuid in dictUUID:
        toHannah.append([query.rawQuery, dictUUID[uuid]])
    resultItemRet = SearchResult(uuid=aug.next(),
                                 searchQuery=query,
                                 searchResultItems=results,
                                 metadata=AnnotationMetadata(
                                     tool="search",
                                     timestamp=int(time.time())),
                                 lang="eng")
    model = pickle.load(open("./trained_model.p", "rb"))
    pre = Preprocess()
    feature_matrix = pre.process_run(toHannah)
    dictRanks = pre_ranking(feature_matrix, model, toHannah, inv_map)
    results = rerank(dictRanks, resultItemRet)
    resultArr = results.searchResultItems
    resultArr = sorted(resultArr, key=lambda result: result.score,
                       reverse=True)
    for item in resultArr:
        logging.info(item.score)
    resultItemRet = SearchResult(uuid=aug.next(),
                                 searchQuery=query,
                                 searchResultItems=resultArr,
                                 metadata=AnnotationMetadata(
                                     tool="search",
                                     timestamp=int(time.time())),
                                 lang="eng")
    return resultItemRet
def _comm_with_properties(num_properties):
    ts = 17
    meta_tokn = AnnotationMetadata(tool='tokn-tool', timestamp=ts)
    toks = TokenList(tokenList=[
        Token(tokenIndex=0,
              text='text',
              textSpan=TextSpan(start=0, ending=1))
    ])
    tokn = Tokenization(uuid=generate_UUID(),
                        metadata=meta_tokn,
                        kind=TokenizationKind.TOKEN_LIST,
                        tokenList=toks)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokn)
    section = Section(uuid=generate_UUID(),
                      kind='kind',
                      label='label',
                      sentenceList=[sentence])
    trfs = TokenRefSequence(tokenizationId=tokn.uuid,
                            tokenIndexList=[0],
                            anchorTokenIndex=0)
    em = EntityMention(uuid=generate_UUID(),
                       entityType='entityType',
                       text='text',
                       tokens=trfs)
    meta_ems = AnnotationMetadata(tool='ems-tool', timestamp=ts)
    ems = EntityMentionSet(uuid=generate_UUID(),
                           metadata=meta_ems,
                           mentionList=[em])
    meta_prop = AnnotationMetadata(tool='Annotator1', timestamp=ts)
    props = list(
        Property(value="Property%d" % i, metadata=meta_prop, polarity=4.0)
        for i in range(num_properties))
    am = MentionArgument(role='role',
                         entityMentionId=em.uuid,
                         propertyList=props)
    sm = SituationMention(uuid=generate_UUID(),
                          tokens=trfs,
                          argumentList=[am])
    meta_sms = AnnotationMetadata(tool='sms-tool', timestamp=ts)
    sms = SituationMentionSet(uuid=generate_UUID(),
                              metadata=meta_sms,
                              mentionList=[sm])
    meta_comm = AnnotationMetadata(tool='tool', timestamp=ts)
    comm = Communication(uuid=generate_UUID(),
                         id='id',
                         text='text',
                         type='type',
                         metadata=meta_comm,
                         sectionList=[section],
                         situationMentionSetList=[sms],
                         entityMentionSetList=[ems])
    add_references_to_communication(comm)
    return comm
def test_validate_minimal_communication_with_uuid():
    comm = Communication()
    comm.id = "myID"
    comm.metadata = AnnotationMetadata(
        tool="TEST", timestamp=int(time.time()))
    comm.type = "Test Communication"
    comm.uuid = generate_UUID()
    assert validate_communication(comm)
def test_repr_on_tokenization():
    tokenization = Tokenization(
        metadata=AnnotationMetadata(
            tool="test",
            timestamp=int(time.time())),
        uuid=UUID(uuidString='01234567-0123-4567-89ab-cdef89abcdef')
    )
    tokenization.__repr__()
def create_sentence(sen_text, sen_start, sen_end,
                    aug, metadata_tool, metadata_timestamp,
                    annotation_level):
    '''
    Create sentence from provided text and metadata.
    Lower-level routine (called indirectly by create_comm).
    '''

    sections = (annotation_level is not None) and (annotation_level !=
                                                   AL_NONE)
    sentences = sections and (annotation_level != AL_SECTION)
    tokens = sentences and (annotation_level != AL_SENTENCE)

    return Sentence(
        uuid=aug.next(),
        textSpan=TextSpan(sen_start, sen_end),
        tokenization=Tokenization(
            uuid=aug.next(),
            kind=TokenizationKind.TOKEN_LIST,
            metadata=AnnotationMetadata(
                tool=metadata_tool,
                timestamp=metadata_timestamp,
            ),
            tokenList=TokenList(tokenList=[
                Token(
                    tokenIndex=i,
                    text=tok_text,
                )
                for (i, tok_text) in enumerate(sen_text.split())
            ]),
        ) if tokens else None,
    )
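# Usage sketch (illustrative): build a tokenized Sentence covering the span
# [0, 11) of a hypothetical document. The generator `aug` comes from
# AnalyticUUIDGeneratorFactory, as in the other helpers in this module; the
# example function name is hypothetical.
def _example_create_sentence():
    aug = AnalyticUUIDGeneratorFactory().create()
    sent = create_sentence('Hello world', 0, 11, aug,
                           'example-tool', 1, AL_TOKEN)
    assert [t.text for t in sent.tokenization.tokenList.tokenList] == \
        ['Hello', 'world']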
def annotate(self, communication):
    text = communication.text
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    entities = {}
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            tokens = [
                x.text for x in sentence.tokenization.tokenList.tokenList
            ]
            tags = [
                x.tag
                for x in sentence.tokenization.tokenTaggingList[-1].taggedTokenList
            ]
            # nltk.ne_chunk expects a list of (token, POS tag) pairs
            for subtree in nltk.ne_chunk(list(zip(tokens, tags))).subtrees():
                if subtree.label() != "S":
                    name = " ".join([x[0] for x in subtree.leaves()])
                    logging.info("Found named entity \"%s\"", name)
                    # key mentions by (name, label) so repeated mentions of
                    # the same entity accumulate under one key
                    key = (name, subtree.label())
                    entities[key] = entities.get(key, []) + [
                        EntityMention(
                            uuid=aug.next(),
                            entityType=subtree.label(),
                            tokens=TokenRefSequence(
                                tokenIndexList=[],
                                tokenizationId=sentence.tokenization.uuid))
                    ]
    communication.entitySetList.append(
        EntitySet(uuid=aug.next(),
                  metadata=AnnotationMetadata(timestamp=int(time.time()),
                                              tool="nltk"),
                  entityList=[
                      Entity(uuid=aug.next(),
                             mentionIdList=[x.uuid for x in v],
                             canonicalName=k[0],
                             type=k[1])
                      for k, v in entities.items()
                  ]))
    communication.entityMentionSetList.append(
        EntityMentionSet(uuid=aug.next(),
                         metadata=AnnotationMetadata(
                             timestamp=int(time.time()),
                             tool="nltk"),
                         mentionList=sum(entities.values(), [])))
    return communication
def search(self, search_query):
    return SearchResult(
        uuid=UUID(uuidString='12345678-1234-5678-1234-567812345678'),
        searchResultItems=[
            SearchResultItem(communicationId=term, score=42.)
            for term in search_query.terms
        ],
        metadata=AnnotationMetadata(tool=self.METADATA_TOOL,
                                    timestamp=int(time())))
def search(self, query):
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    return SearchResult(uuid=aug.next(),
                        searchQuery=query,
                        searchResultItems=[],
                        metadata=AnnotationMetadata(
                            tool="stub search",
                            timestamp=int(time.time())),
                        lang="eng")
def tokenization(request):
    return Tokenization(tokenTaggingList=[
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='?',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='?'),
                TaggedToken(tokenIndex=1, tag='?'),
                TaggedToken(tokenIndex=2, tag='?'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='x'),
            taggingType='POS',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='X'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='N'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
        TokenTagging(
            metadata=AnnotationMetadata(tool='y'),
            taggingType='LEMMA',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='mambo'),
                TaggedToken(tokenIndex=1, tag='number'),
                TaggedToken(tokenIndex=2, tag='4'),
            ],
        ),
    ])
def json_to_concrete(doc: Dict) -> Communication:
    metadata = AnnotationMetadata(
        tool="BlingBLing",
        timestamp=int(datetime.datetime.now().timestamp())
    )
    comm: Communication = Communication(
        uuid=augf.next(),
        id=doc['doc_key'],
        type="aida",
        metadata=metadata,
        lidList=[LanguageIdentification(
            uuid=augf.next(),
            metadata=metadata,
            languageToProbabilityMap={doc['language_id']: 1.0}
        )],
        sectionList=[Section(
            uuid=augf.next(),
            kind="passage",
            sentenceList=[
                Sentence(
                    uuid=augf.next(),
                    tokenization=Tokenization(
                        uuid=augf.next(),
                        kind=TokenizationKind.TOKEN_LIST,
                        metadata=metadata,
                        tokenList=TokenList(
                            tokenList=[
                                Token(tokenIndex=i, text=t)
                                for i, t in enumerate(
                                    get_flatten_sentence(doc))
                            ]
                        )
                    )
                )
            ]
        )],
        entityMentionSetList=[EntityMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )],
        situationMentionSetList=[SituationMentionSet(
            uuid=augf.next(),
            metadata=metadata,
            mentionList=[]
        )]
    )
    return comm
def annotate(self, communication): text = "" for section in communication.sectionList: if section.kind == "content": text += communication.text[section.textSpan.start:section.textSpan.ending] scores = {languages.get(iso639_1_code=k).iso639_3_code : math.exp(v) for k, v in self.classifier.classify(text).iteritems()} logging.info(str(scores)) augf = AnalyticUUIDGeneratorFactory(communication) aug = augf.create() lid = LanguageIdentification(uuid=aug.next(), languageToProbabilityMap=scores, metadata=AnnotationMetadata(tool="valid", timestamp=int(time.time()), kBest=len(scores)), ) communication.lidList.append(lid) return communication
def test_get_tagged_tokens_non_unique_tagging_specify_tool(tokenization):
    tokenization.tokenTaggingList.append(
        TokenTagging(
            metadata=AnnotationMetadata(tool='z'),
            taggingType='NUMERAL',
            taggedTokenList=[
                TaggedToken(tokenIndex=0, tag='N'),
                TaggedToken(tokenIndex=1, tag='Y'),
                TaggedToken(tokenIndex=2, tag='Y'),
            ],
        ),
    )
    assert ['N', 'N', 'Y'] == list(map(
        lambda t: t.tag,
        get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
    assert [0, 1, 2] == list(map(
        lambda t: t.tokenIndex,
        get_tagged_tokens(tokenization, 'NUMERAL', tool='y')))
def annotate(self, communication):
    print(communication.id)
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            text = communication.text[sentence.textSpan.start:
                                      sentence.textSpan.ending]
            sentence.tokenization = Tokenization(
                uuid=aug.next(),
                kind=TokenizationKind.TOKEN_LIST,
                tokenList=TokenList(tokenList=[]),
                tokenTaggingList=[],
                metadata=AnnotationMetadata(timestamp=int(time.time()),
                                            tool="nltk"))
            for i, token in enumerate(nltk.word_tokenize(text)):
                logging.info("Found token %s", token)
                sentence.tokenization.tokenList.tokenList.append(
                    Token(tokenIndex=i, text=token))
    return communication
def capture_tweet_lid(twitter_dict):
    """
    Attempts to capture the 'lang' field in the twitter API, if it exists.

    Returns a LanguageIdentification object, or None if the field is not
    present in the tweet json.
    """
    if u'lang' in twitter_dict:
        amd = AnnotationMetadata(tool="Twitter LID",
                                 timestamp=int(time.time()),
                                 kBest=1)
        kvs = {}
        kvs[twitter_lid_to_iso639_3(twitter_dict[u'lang'])] = 1.0
        return LanguageIdentification(metadata=amd,
                                      languageToProbabilityMap=kvs)
    else:
        return None
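# Usage sketch (illustrative): a tweet dict carrying a 'lang' code maps to a
# single-entry probability map; a dict without 'lang' yields None. The ISO
# conversion relies on twitter_lid_to_iso639_3 from this module; the example
# function name is hypothetical.
def _example_capture_tweet_lid():
    lid = capture_tweet_lid({u'lang': u'en'})
    assert lid is not None
    assert list(lid.languageToProbabilityMap.values()) == [1.0]
    assert capture_tweet_lid({u'text': u'no lang field'}) is None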
def create_comm(comm_id, text='',
                comm_type='article', section_kind='passage',
                metadata_tool='concrete-python',
                metadata_timestamp=None,
                annotation_level=AL_TOKEN):
    '''
    Create a simple, valid Communication from text.
    By default the text will be split by double-newlines into sections
    and then by single newlines into sentences within those sections.

    annotation_level controls the amount of annotation that is added:

      AL_NONE      add no optional annotations (not even sections)
      AL_SECTION   add sections but not sentences
      AL_SENTENCE  add sentences but not tokens
      AL_TOKEN     add all annotations, up to tokens (the default)

    If metadata_timestamp is None, the current time will be used.
    '''

    if metadata_timestamp is None:
        metadata_timestamp = int(time.time())

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()

    sections = (annotation_level is not None) and (annotation_level !=
                                                   AL_NONE)

    return Communication(
        id=comm_id,
        uuid=aug.next(),
        type=comm_type,
        text=text,
        metadata=AnnotationMetadata(
            tool=metadata_tool,
            timestamp=metadata_timestamp,
        ),
        sectionList=([
            create_section(sec_text, sec_start, sec_end, section_kind,
                           aug, metadata_tool, metadata_timestamp,
                           annotation_level)
            for (sec_text, sec_start, sec_end) in _split(text, '\n\n')
        ] if text.strip() else []) if sections else None,
    )
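# Usage sketch (illustrative): per the docstring above, blank lines separate
# sections and single newlines separate sentences, while AL_NONE suppresses
# even the section list. This relies on the helpers in this module (_split,
# create_section, the AL_* constants); the example function name is
# hypothetical.
def _example_create_comm():
    comm = create_comm('doc-1', 'One two .\nThree four .\n\nFive six .')
    assert len(comm.sectionList) == 2
    assert len(comm.sectionList[0].sentenceList) == 2
    bare = create_comm('doc-2', 'One two .', annotation_level=AL_NONE)
    assert bare.sectionList is None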
def search(self, query):
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    k_returned = query.k
    terms = query.terms

    # implement boolean search here
    query_matches = {}
    documents = []
    with gzip.open("/mnt/index/index.gz", 'rt', encoding='utf-8') as index:
        first_line = index.readline()
        documents = first_line.split()
        query_matches = self.docsByTerm(terms, index)
    results = self.returnDocList(query_matches, documents, k_returned)

    return SearchResult(uuid=aug.next(),
                        searchQuery=query,
                        searchResultItems=results,
                        metadata=AnnotationMetadata(
                            tool="stub search",
                            timestamp=int(time.time())),
                        lang="eng")
def set_tokentaggings_of_type_v(tokenization, taggingType, prediction,
                                toolname):
    timestamp = int(time.time() * 1e6)
    tokens = tokenization.tokenList.tokenList
    new_pred = []
    start = 0
    for i, tk in enumerate(tokens):
        tg = ' '.join(prediction[start:start + len(tk.text)])
        # print tk.text, tg
        new_pred.append(TaggedToken(tokenIndex=i, tag=tg))
        start += len(tk.text)

    assert len(new_pred) == len(tokens)
    # print start, len(prediction)
    assert start == len(prediction)
    new_tokentagging = TokenTagging(
        taggingType=taggingType,
        taggedTokenList=new_pred,
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        uuid=generate_UUID())
    tokenization.tokenTaggingList.append(new_tokentagging)
def search(self, query):
    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    terms = query.terms
    num_docs = query.k
    query_matches = {}
    documents = []
    results = []
    with gzip.open("/mnt/index/index.gz", 'rt', encoding='utf-8') as index:
        first_line = index.readline()
        documents = first_line.split()
        query_matches = self.tfidfByDoc(terms, index)
    results = self.returnDocList(query_matches, documents, num_docs)

    # begin weighted search here
    return SearchResult(uuid=aug.next(),
                        searchQuery=query,
                        searchResultItems=results,
                        metadata=AnnotationMetadata(
                            tool="stub search",
                            timestamp=int(time.time())),
                        lang="eng")
def annotate(self, communication):
    text = communication.text
    augf = AnalyticUUIDGeneratorFactory(communication)
    aug = augf.create()
    for section in communication.sectionList:
        for sentence in section.sentenceList:
            tokens = [
                x.text for x in sentence.tokenization.tokenList.tokenList
            ]
            sentence.tokenization.tokenTaggingList.append(
                TokenTagging(uuid=aug.next(),
                             metadata=AnnotationMetadata(
                                 timestamp=int(time.time()),
                                 tool="nltk"),
                             taggedTokenList=[],
                             taggingType="Penn Treebank"))
            for i, (tok, tag) in enumerate(nltk.pos_tag(tokens)):
                logging.info("Tagged %s as %s", tok, tag)
                sentence.tokenization.tokenTaggingList[-1].taggedTokenList.append(
                    TaggedToken(tokenIndex=i, tag=tag))
    return communication
def index():
    text = request.forms.get('text')

    transport = TTransport.TFramedTransport(
        TSocket.TSocket(options.annotator_host, options.annotator_port))
    protocol = TCompactProtocol.TCompactProtocol(transport)
    client = Annotator.Client(protocol)
    transport.open()

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    c = Communication(
        id="",
        text=text,
        uuid=aug.next(),
        type="user-supplied input",
        metadata=AnnotationMetadata(timestamp=int(time.time()),
                                    tool="stdin"),
        sectionList=[
            Section(uuid=aug.next(),
                    sentenceList=[],
                    kind="paragraph",
                    textSpan=TextSpan(start=0, ending=len(text)))
        ],
        entitySetList=[],
        entityMentionSetList=[],
    )
    new_c = client.annotate(c)

    form = '''<form action="/" method="post">
Enter or paste some text:
<input name="text" type="text" />
<input value="Submit" type="submit" />
</form>
'''
    return form + "\n".join(["<h3>%s</h3>" % text] + [
        "\n".join([
            "<br>%s %s" % (e.type, e.canonicalName)
            for e in es.entityList
        ])
        for es in new_c.entitySetList
    ])
def json_tweet_object_to_Communication(tweet):
    """
    Convert a parsed JSON tweet object to a Concrete Communication.
    """
    tweet_info = json_tweet_object_to_TweetInfo(tweet)

    augf = AnalyticUUIDGeneratorFactory()
    aug = augf.create()
    if 'id_str' in tweet:
        tweet_id = tweet['id_str']
    else:
        logging.warning('Tweet has no id_str, leaving communication id blank')
        tweet_id = None
    tweet_time = unix_time(datetime.strptime(tweet_info.createdAt,
                                             CREATED_AT_FORMAT))
    comm = Communication(
        communicationMetadata=CommunicationMetadata(
            tweetInfo=tweet_info),
        metadata=AnnotationMetadata(
            tool=TOOL_NAME,
            timestamp=int(time.time())),
        originalText=tweet_info.text,
        text=tweet_info.text,
        type=TWEET_TYPE,
        uuid=aug.next(),
        startTime=tweet_time,
        endTime=tweet_time,
        id=tweet_id
    )

    # either this, or pass in gen as parameter to fx
    # latter is more annoying to test but slightly cleaner
    if tweet_info.lid is not None:
        tweet_info.lid.uuid = aug.next()
        lidList = [tweet_info.lid]
        comm.lidList = lidList
    return comm
ofd = CommunicationWriterTGZ(options.output)
with reader(gzip.open(options.input)) as ifd:
    for i, line in enumerate(ifd):
        toks = line.strip().split("\t")
        if len(toks) != 3:
            continue
        cid, label, text = toks
        g = ugf.create()
        t = int(time())
        comm = Communication(
            id=cid,
            uuid=g.next(),
            type="Text document",
            text=text,
            communicationTaggingList=[
                CommunicationTagging(
                    uuid=g.next(),
                    metadata=AnnotationMetadata(
                        tool="Gold labeling",
                        timestamp=t,
                        kBest=1,
                    ),
                    taggingType=options.tag_type,
                    tagList=[label],
                    confidenceList=[1.0],
                )
            ],
            metadata=AnnotationMetadata(
                tool="text_to_concrete.py ingester",
                timestamp=t,
                kBest=1),
            sectionList=[
                Section(uuid=g.next(),
                        textSpan=TextSpan(start=0, ending=len(text)),
                        kind="content",
                        )
            ])
        ofd.write(comm)
ofd.close()
def __init__(self, tool=None):
    self.metadata = AnnotationMetadata(
        tool=HasMetadata.gen_tool(tool),
        timestamp=0,
    )
parser.add_argument("-p", "--port", dest="port", type=int, default=9090) parser.add_argument("-H", "--host", dest="host", default="localhost") options = parser.parse_args() # Make socket transport = TSocket.TSocket(options.host, options.port) # Buffering is critical. Raw sockets are very slow transport = TTransport.TBufferedTransport(transport) # Wrap in a protocol protocol = TCompactProtocol.TCompactProtocol(transport) # Create a client to use the protocol encoder client = Annotator.Client(protocol) # Connect! transport.open() while True: s = raw_input("Write some text > ") if re.match(r"^\s*$", s): break else: augf = AnalyticUUIDGeneratorFactory() aug = augf.create() c = Communication(id="", text=s, uuid=aug.next(), type="tweet", metadata=AnnotationMetadata(timestamp=0, tool="stdin"), lidList=[]) new_c = client.annotate(c) print new_c
def update_concrete(comm, prediction):
    toolname = 'Violet_NER_annotator'
    timestamp = int(time.time())
    mention_list = []
    for section in comm.sectionList:
        for sentence in section.sentenceList:
            start = 0
            pred_ner_tags = []
            tknzation = sentence.tokenization
            in_NE = False
            ne_type = ''
            tokenization_id = None
            token_idx_list = []
            ne_text = []
            for i, tk in enumerate(tknzation.tokenList.tokenList):
                pred_tags = ' '.join(prediction[start:start + len(tk.text)])
                if in_NE:
                    for i, tag in enumerate(
                            prediction[start:start + len(tk.text)]):
                        if tag != 'I-' + ne_type:
                            if i != 0:
                                token_idx_list.append(i)
                                ne_text.append(tk.text)
                            entity_tokens = TokenRefSequence(
                                tokenizationId=tokenization_id,
                                tokenIndexList=token_idx_list)
                            e_type, p_type = ne_type.split(
                                '.') if '.' in ne_type else (ne_type, 'NAM')
                            # print token_idx_list, ne_text, e_type, p_type
                            e_mention = EntityMention(
                                uuid=generate_UUID(),
                                tokens=entity_tokens,
                                entityType=e_type,
                                phraseType=p_type,
                                text=''.join(ne_text))
                            mention_list.append(e_mention)
                            tokenization_id = None
                            token_idx_list = []
                            ne_text = []
                            ne_type = ''
                            in_NE = False
                            break
                if not in_NE and 'B-' in pred_tags:
                    # print 'not in NE,', prediction[start:start+len(tk.text)]
                    in_NE = True
                    for tag in prediction[start:start + len(tk.text)]:
                        # print tag
                        if tag.startswith('B-'):
                            ne_type = tag.split('-')[1]
                            tokenization_id = tknzation.uuid
                            token_idx_list.append(i)
                            ne_text.append(tk.text)
                            break
                    # print token_idx_list, ne_text
                    if prediction[start + len(tk.text) - 1] != 'I-' + ne_type:
                        entity_tokens = TokenRefSequence(
                            tokenizationId=tokenization_id,
                            tokenIndexList=token_idx_list)
                        e_type, p_type = ne_type.split(
                            '.') if '.' in ne_type else (ne_type, 'NAM')
                        e_mention = EntityMention(
                            uuid=generate_UUID(),
                            tokens=entity_tokens,
                            entityType=e_type,
                            phraseType=p_type,
                            text=''.join(ne_text))
                        mention_list.append(e_mention)
                        tokenization_id = None
                        token_idx_list = []
                        ne_text = []
                        ne_type = ''
                        in_NE = False
                start += len(tk.text)
                pred_ner_tags.append(TaggedToken(tokenIndex=i, tag=pred_tags))
            pner_tokentagging = TokenTagging(
                taggingType=PRED_TAG,
                taggedTokenList=pred_ner_tags,
                metadata=AnnotationMetadata(tool=toolname,
                                            timestamp=timestamp),
                uuid=generate_UUID())
            tknzation.tokenTaggingList.append(pner_tokentagging)
    entity_list = [
        Entity(uuid=generate_UUID(),
               type=mention.entityType,
               canonicalName=mention.text,
               mentionIdList=[mention.uuid])
        for mention in mention_list
    ]
    entity_mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        mentionList=mention_list)
    entity_set = EntitySet(
        uuid=generate_UUID(),
        metadata=AnnotationMetadata(tool=toolname, timestamp=timestamp),
        entityList=entity_list,
        mentionSetId=entity_mention_set.uuid)
    comm.entityMentionSetList = [entity_mention_set]
    comm.entitySetList = [entity_set]
def getMetadata(self):
    metadata = AnnotationMetadata(tool=self.METADATA_TOOL,
                                  timestamp=int(time()))
    return metadata
# Connect!
transport.open()

while True:
    s = raw_input("Write some text > ")
    if re.match(r"^\s*$", s):
        break
    else:
        augf = AnalyticUUIDGeneratorFactory()
        aug = augf.create()
        c = Communication(
            id="",
            text=s,
            uuid=aug.next(),
            type="user-supplied input",
            metadata=AnnotationMetadata(timestamp=int(time.time()),
                                        tool="stdin"),
            sectionList=[
                Section(uuid=aug.next(),
                        sentenceList=[],
                        kind="paragraph",
                        textSpan=TextSpan(start=0, ending=len(s)))
            ],
            entitySetList=[],
            entityMentionSetList=[],
        )
        new_c = client.annotate(c)
        for es in new_c.entitySetList:
            for e in es.entityList:
                print "%s %s" % (e.type, e.canonicalName)