def _next_from_stream(self):
    """Read the next Communication from the open stream.

    A fresh ``Communication`` is populated from ``self.protocol``; when
    ``self._add_references`` is true, ``add_references_to_communication``
    is applied to it before it is handed back.

    Returns:
        A ``(communication, self._source_filename)`` tuple.

    Raises:
        StopIteration: if reading hits ``EOFError``; ``self.transport``
            is closed before the exception is raised.
    """
    try:
        communication = Communication()
        communication.read(self.protocol)
        if self._add_references:
            add_references_to_communication(communication)
    except EOFError:
        # End of stream: release the transport and end the iteration.
        self.transport.close()
        raise StopIteration
    else:
        return (communication, self._source_filename)
def _next_from_zip(self): if self.zip_infolist_index >= len(self.zip_infolist): raise StopIteration zipinfo = self.zip_infolist[self.zip_infolist_index] self.zip_infolist_index += 1 comm = TSerialization.deserialize( Communication(), self.zip.open(zipinfo).read(), protocol_factory=factory.protocolFactory) if self._add_references: add_references_to_communication(comm) return (comm, zipinfo.filename)
def read_communication_from_buffer(buf, add_references=True):
    """Deserialize a Communication from an in-memory binary string.

    Args:
        buf: binary string holding the serialized Communication.
        add_references: when true (the default), run
            ``add_references_to_communication`` on the result before
            returning it.

    Returns:
        The deserialized ``Communication``.
    """
    protocol = factory.createProtocol(TMemoryBuffer(buf))
    communication = Communication()
    communication.read(protocol)
    if not add_references:
        return communication
    add_references_to_communication(communication)
    return communication
def _comm_with_properties(num_properties):
    """Build a minimal Communication whose argument has N properties.

    The Communication holds one section with one sentence and a single
    token, one entity mention, and one situation mention.  The situation
    mention's only argument refers to the entity mention and carries
    ``num_properties`` ``Property`` objects named ``Property0``,
    ``Property1``, ...; they all share one ``AnnotationMetadata``.
    ``add_references_to_communication`` is applied before returning.

    Args:
        num_properties: how many ``Property`` objects to attach.

    Returns:
        The constructed ``Communication``.
    """
    timestamp = 17

    # Section -> sentence -> tokenization with a single token.
    tokenization_meta = AnnotationMetadata(tool='tokn-tool',
                                           timestamp=timestamp)
    only_token = Token(tokenIndex=0, text='text',
                       textSpan=TextSpan(start=0, ending=1))
    token_list = TokenList(tokenList=[only_token])
    tokenization = Tokenization(uuid=generate_UUID(),
                                metadata=tokenization_meta,
                                kind=TokenizationKind.TOKEN_LIST,
                                tokenList=token_list)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    section = Section(uuid=generate_UUID(), kind='kind', label='label',
                      sentenceList=[sentence])

    # One entity mention anchored on that token.
    token_refs = TokenRefSequence(tokenizationId=tokenization.uuid,
                                  tokenIndexList=[0],
                                  anchorTokenIndex=0)
    entity_mention = EntityMention(uuid=generate_UUID(),
                                   entityType='entityType',
                                   text='text',
                                   tokens=token_refs)
    entity_set_meta = AnnotationMetadata(tool='ems-tool',
                                         timestamp=timestamp)
    entity_mention_set = EntityMentionSet(uuid=generate_UUID(),
                                          metadata=entity_set_meta,
                                          mentionList=[entity_mention])

    # One situation mention whose argument carries the properties.
    property_meta = AnnotationMetadata(tool='Annotator1',
                                       timestamp=timestamp)
    properties = [
        Property(value="Property%d" % index,
                 metadata=property_meta,
                 polarity=4.0)
        for index in range(num_properties)
    ]
    argument = MentionArgument(role='role',
                               entityMentionId=entity_mention.uuid,
                               propertyList=properties)
    situation_mention = SituationMention(uuid=generate_UUID(),
                                         tokens=token_refs,
                                         argumentList=[argument])
    situation_set_meta = AnnotationMetadata(tool='sms-tool',
                                            timestamp=timestamp)
    situation_mention_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=situation_set_meta,
        mentionList=[situation_mention])

    communication_meta = AnnotationMetadata(tool='tool',
                                            timestamp=timestamp)
    communication = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=communication_meta,
        sectionList=[section],
        situationMentionSetList=[situation_mention_set],
        entityMentionSetList=[entity_mention_set])
    add_references_to_communication(communication)
    return communication
def read_communication_from_file(communication_filename, add_references=True):
    """Load a Communication from the named file.

    Args:
        communication_filename: String with the filename to read.
        add_references: when true (the default), run
            ``add_references_to_communication`` on the result before
            returning it.

    Returns:
        A Concrete `Communication` object, as filled in by
        ``read_thrift_from_file``.
    """
    communication = read_thrift_from_file(Communication(),
                                          communication_filename)
    if not add_references:
        return communication
    add_references_to_communication(communication)
    return communication
def _comm_with_properties(num_properties):
    """Create a one-token Communication with a property-bearing argument.

    Structure produced: a single section/sentence/tokenization holding
    one token, an entity mention set with one mention, and a situation
    mention set with one mention.  That situation mention has a single
    ``MentionArgument`` pointing at the entity mention, with
    ``num_properties`` ``Property`` values (``Property0`` ...) that all
    share the same metadata object.  References are added via
    ``add_references_to_communication`` before the result is returned.

    Args:
        num_properties: number of ``Property`` objects on the argument.

    Returns:
        The constructed ``Communication``.
    """
    ts = 17

    def metadata_for(tool):
        # Every annotation layer uses the same fixed timestamp.
        return AnnotationMetadata(tool=tool, timestamp=ts)

    tokn_metadata = metadata_for('tokn-tool')
    tokens = TokenList(tokenList=[
        Token(tokenIndex=0,
              text='text',
              textSpan=TextSpan(start=0, ending=1)),
    ])
    tokenization = Tokenization(
        uuid=generate_UUID(),
        metadata=tokn_metadata,
        kind=TokenizationKind.TOKEN_LIST,
        tokenList=tokens)
    sentence = Sentence(uuid=generate_UUID(), tokenization=tokenization)
    section = Section(
        uuid=generate_UUID(),
        kind='kind',
        label='label',
        sentenceList=[sentence])

    token_ref_sequence = TokenRefSequence(
        tokenizationId=tokenization.uuid,
        tokenIndexList=[0],
        anchorTokenIndex=0)
    mention = EntityMention(
        uuid=generate_UUID(),
        entityType='entityType',
        text='text',
        tokens=token_ref_sequence)
    ems_metadata = metadata_for('ems-tool')
    mention_set = EntityMentionSet(
        uuid=generate_UUID(),
        metadata=ems_metadata,
        mentionList=[mention])

    # A single metadata instance is shared by every generated Property.
    prop_metadata = metadata_for('Annotator1')
    property_list = []
    for i in range(num_properties):
        property_list.append(
            Property(value="Property%d" % i,
                     metadata=prop_metadata,
                     polarity=4.0))
    mention_argument = MentionArgument(
        role='role',
        entityMentionId=mention.uuid,
        propertyList=property_list)
    sit_mention = SituationMention(
        uuid=generate_UUID(),
        tokens=token_ref_sequence,
        argumentList=[mention_argument])
    sms_metadata = metadata_for('sms-tool')
    sit_mention_set = SituationMentionSet(
        uuid=generate_UUID(),
        metadata=sms_metadata,
        mentionList=[sit_mention])

    comm_metadata = metadata_for('tool')
    result = Communication(
        uuid=generate_UUID(),
        id='id',
        text='text',
        type='type',
        metadata=comm_metadata,
        sectionList=[section],
        situationMentionSetList=[sit_mention_set],
        entityMentionSetList=[mention_set])
    add_references_to_communication(result)
    return result
def _next_from_tar(self): while True: tarinfo = self.tar.next() if tarinfo is None: raise StopIteration if not tarinfo.isfile(): # Ignore directories continue filename = os.path.split(tarinfo.name)[-1] if filename[0] is '.' and filename[1] is '_': # Ignore attribute files created by OS X tar continue comm = TSerialization.deserialize( Communication(), self.tar.extractfile(tarinfo).read(), protocol_factory=factory.protocolFactory) if self._add_references: add_references_to_communication(comm) # hack to keep memory usage O(1) # (...but the real hack is tarfile :) self.tar.members = [] return (comm, tarinfo.name)