def multi_token_match(stream_item, aligner_data):
    '''
    iterate through tokens looking for near-exact matches to strings
    in si.ratings...mentions
    '''
    sentences = stream_item.body.sentences.get(aligner_data['tagger_id'])
    if not sentences:
        return
    ## construct a list of tuples, where the first part of each tuple
    ## is a tuple of cleansed strings, and the second part is the
    ## Token object from which it came.
    tokens = map(lambda tok: (cleanse(tok.token.decode('utf8')).split(' '), tok),
                 itertools.chain(*[sent.tokens for sent in sentences]))

    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator,
                              target=rating.target)

                num_tokens_matched = 0
                for tok in look_ahead_match(rating, tokens):
                    if aligner_data.get('update_labels'):
                        tok.labels.pop(annotator_id, None)
                    add_annotation(tok, label)
                    num_tokens_matched += 1

                if num_tokens_matched == 0:
                    logger.critical('failed multi_token_match %r:\n mentions: %r\n tokens: %r\n clean_html=%r',
                                    stream_item.abs_url, rating.mentions, tokens,
                                    stream_item.body.clean_html)
                else:
                    logger.debug('matched %d tokens for %r',
                                 num_tokens_matched, rating.target.target_id)
def multi_token_match(stream_item, aligner_data):
    '''
    iterate through tokens looking for near-exact matches to strings
    in si.ratings...mentions
    '''
    sentences = stream_item.body.sentences.get(aligner_data['tagger_id'])
    if not sentences:
        return
    ## construct a list of tuples, where the first part of each tuple
    ## is a tuple of cleansed strings, and the second part is the
    ## Token object from which it came.
    tokens = map(
        lambda tok: (cleanse(tok.token.decode('utf8')).split(' '), tok),
        itertools.chain(*[sent.tokens for sent in sentences]))

    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator,
                              target=rating.target)

                num_tokens_matched = 0
                for tok in look_ahead_match(rating, tokens):
                    if aligner_data.get('update_labels'):
                        tok.labels.pop(annotator_id, None)
                    add_annotation(tok, label)
                    num_tokens_matched += 1

                if num_tokens_matched == 0:
                    logger.critical(
                        'failed multi_token_match %r:\n mentions: %r\n tokens: %r\n clean_html=%r',
                        stream_item.abs_url, rating.mentions, tokens,
                        stream_item.body.clean_html)
                else:
                    logger.debug('matched %d tokens for %r',
                                 num_tokens_matched, rating.target.target_id)
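## The matching above depends on cleanse() normalizing both the rating
## mentions and the token text before comparison.  The real cleanse() is
## defined elsewhere in this package; the helper below is only a hedged
## sketch of the kind of normalization assumed here (lowercasing, stripping
## punctuation, collapsing whitespace), not the actual implementation.
import re
import string

def _cleanse_sketch(text):
    'illustrative only: lowercase, strip punctuation, collapse whitespace'
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    return ' '.join(text.split())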
def _make_stream_item(cls, path, metadata, abs_url, entities):
    '''
    build a StreamItem from a file on disk, attaching a doc-level
    Rating for each of the given entities
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.
    creation_time = os.path.getctime(path)

    ## make stream item
    stream_item = streamcorpus.make_stream_item(
        creation_time, abs_url)
    stream_item.source = metadata.get('source')

    ## build a ContentItem for the body
    body = streamcorpus.ContentItem()
    body.media_type = magic.from_file(path, mime=True)

    logger.info('opening %r', path)
    with open(path) as f:
        body.raw = f.read()

    ## attach the content_item to the stream_item
    stream_item.body = body

    ## annotations
    anno = streamcorpus.Annotator()
    anno.annotator_id = metadata['annotator_id']
    anno.annotation_time = stream_item.stream_time

    num_ratings = 0
    for entity, is_profile in entities:
        num_ratings += 1

        ## pull out target id and mention tokens
        target_id = str(entity['target_id'])

        ## build a Label for the doc-level label:
        rating = streamcorpus.Rating()
        rating.annotator = anno
        rating.target = streamcorpus.Target(target_id = target_id)
        rating.contains_mention = True
        if is_profile:
            rating.flags = [streamcorpus.FlagType.PROFILE]

        ## parse slots in yaml file
        slots = cls._parse_slots(entity['slots'])

        ## heuristically split the slots string on white space and
        ## use each token as a separate mention.
        rating.mentions = [cleanse(unicode(slot[1], 'utf-8')) for slot in slots]

        ## put this one label in the array of labels
        streamcorpus.add_annotation(stream_item, rating)

    ## provide this stream_item to the pipeline
    logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                num_ratings, stream_item.abs_url)
    return stream_item
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'

    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
                Token(token='This'),
                Token(token='-LRB-big-RRB- dog'),
                Token(token='Jake'),
                Token(token='has'),
                Token(token='no'),
                Token(token=u'\u1F601'.encode('utf8')),
                Token(token='...'),
                Token(token='Teeth'),
                ])]

    rating = Rating(annotator=Annotator(annotator_id=annotator_id),
                    target=Target(target_id=target_id),
                    mentions=['Big dog! Jake... ', u'\u1F601 Teeth'.encode('utf8')],
                    )
    add_annotation(si, rating)

    aligner_data = dict(
        tagger_id = tagger_id,
        annotator_id = annotator_id,
        )

    multi_token_match(si, aligner_data)

    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'

    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
            Token(token='This'),
            Token(token='-LRB-big-RRB- dog'),
            Token(token='Jake'),
            Token(token='has'),
            Token(token='no'),
            Token(token=u'\u1F601'.encode('utf8')),
            Token(token='...'),
            Token(token='Teeth'),
        ])
    ]

    rating = Rating(
        annotator=Annotator(annotator_id=annotator_id),
        target=Target(target_id=target_id),
        mentions=['Big dog! Jake... ', u'\u1F601 Teeth'.encode('utf8')],
    )
    add_annotation(si, rating)

    aligner_data = dict(
        tagger_id=tagger_id,
        annotator_id=annotator_id,
    )

    multi_token_match(si, aligner_data)

    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
def make_sentences(self, stream_item):
    'assemble Sentence and Token objects'
    self.make_label_index(stream_item)
    sentences = []
    token_num = 0
    new_mention_id = 0
    for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
        assert isinstance(sent_str, unicode)
        sent = Sentence()
        sentence_pos = 0
        for start, end in self.word_tokenizer.span_tokenize(sent_str):
            try:
                token_str = sent_str[start:end].encode('utf8')
            except Exception, exc:
                logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                start, end, exc_info=True)
                sys.exit('failed to cope with %r in %r' % (sent_str[start:end], sent_str))
            tok = Token(
                token_num=token_num,
                token=token_str,
                sentence_pos=sentence_pos,
            )
            tok.offsets[OffsetType.BYTES] = Offset(
                type=OffsetType.BYTES,
                first=sent_start + start,
                length = end - start,
            )
            ## whitespace tokenizer will never get a token
            ## boundary in the middle of an 'author' label
            try:
                #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                label = self.label_index.find_le(sent_start + start)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                if off.first + off.length > sent_start + start:
                    logger.info('overlapping label: %r' % label.target.target_id)
                    ## overlaps
                    streamcorpus.add_annotation(tok, label)
                    assert label.annotator.annotator_id in tok.labels
                    logger.info('adding label to tok: %r has %r',
                                tok.token, label.target.target_id)

                    if label in self.label_to_mention_id:
                        mention_id = self.label_to_mention_id[label]
                    else:
                        mention_id = new_mention_id
                        new_mention_id += 1
                        self.label_to_mention_id[label] = mention_id

                    tok.mention_id = mention_id

            token_num += 1
            sentence_pos += 1
            sent.tokens.append(tok)
        sentences.append(sent)
    return sentences
def _offset_labels(stream_item, aligner_data, offset_type='BYTES'):
    ## get a set of tokens -- must have OffsetType.<offset_type> type offsets.
    offset_type = OffsetType._NAMES_TO_VALUES[offset_type]
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## These next few steps are probably the most
    ## memory intensive, because they fully
    ## instantiate all the tokens.
    token_collection = SortedCollection(
        itertools.chain(*[sent.tokens for sent in sentences]),
        key=lambda tok: tok.offsets[offset_type].first
        )

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:
            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop( offset_type )

            assert label_off.length == len(label_off.value)
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:label_off.first+label_off.length]))
            #print 'tc %d %r' % (len(token_collection), token_collection._keys)
            #print 'label_off.first=%d, length=%d, value=%r' % (label_off.first, label_off.length, label_off.value)
            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)
            #print "find_le: ", token_collection.find_le(label_off.first)
            toks = list(toks)
            #print 'aligned tokens', toks
            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                assert tok.token is not None, tok.token
                if not tok.token in label_off.value:
                    sys.exit('%r not in %r' % \
                                 ([(t.offsets[offset_type].first, t.token) for t in toks],
                                  label_off.value))
def _offset_labels(stream_item, aligner_data, offset_type='BYTES'):
    ## get a set of tokens -- must have OffsetType.<offset_type> type offsets.
    offset_type = OffsetType._NAMES_TO_VALUES[offset_type]
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## These next few steps are probably the most
    ## memory intensive, because they fully
    ## instantiate all the tokens.
    token_collection = SortedCollection(
        itertools.chain(*[sent.tokens for sent in sentences]),
        key=lambda tok: tok.offsets[offset_type].first)

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:
            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(offset_type)

            assert label_off.length == len(label_off.value)
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:label_off.first+label_off.length]))
            #print 'tc %d %r' % (len(token_collection), token_collection._keys)
            #print 'label_off.first=%d, length=%d, value=%r' % (label_off.first, label_off.length, label_off.value)
            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)
            #print "find_le: ", token_collection.find_le(label_off.first)
            toks = list(toks)
            #print 'aligned tokens', toks
            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                assert tok.token is not None, tok.token
                if not tok.token in label_off.value:
                    sys.exit('%r not in %r' % \
                                 ([(t.offsets[offset_type].first, t.token) for t in toks],
                                  label_off.value))
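## _offset_labels() assumes a SortedCollection keyed on each token's first
## offset, with a find_range(lo, hi) method returning the tokens whose key
## falls in the half-open interval [lo, hi).  SortedCollection is the
## well-known sorted-list recipe; find_range is a local extension.  The
## function below is only a hedged sketch of the lookup it is assumed to
## perform, not the actual implementation.
import bisect

def _find_range_sketch(keys, items, lo, hi):
    'return the items whose key is in [lo, hi); keys must be sorted ascending'
    i = bisect.bisect_left(keys, lo)
    j = bisect.bisect_left(keys, hi)
    return items[i:j]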
def names_in_chains(stream_item, aligner_data):
    '''
    Convert doc-level Rating object into a Label, and add that Label
    to all Token in all coref chains identified by
    aligner_data["chain_selector"]

    :param stream_item: document that has a doc-level Rating to
      translate into token-level Labels.

    :param aligner_data: dict containing:
      chain_selector: ALL or ANY
      annotator_id: string to find at stream_item.Ratings[i].annotator.annotator_id

    If chain_selector==ALL, then only apply Label to chains in which
    all of the Rating.mentions strings appear as substrings within at
    least one of the Token.token strings.

    If chain_selector==ANY, then apply Label to chains in which any
    of the Rating.mentions strings appear as a substring within at
    least one of the Token.token strings.

    If chain_selector==ANY_MULTI_TOKEN, then apply Label to chains in
    which all the names in any of the Rating.mentions strings appear
    as a substring within at least one of the Token.token strings.
    '''
    chain_selector = aligner_data.get('chain_selector', '')

    assert chain_selector in _CHAIN_SELECTORS, \
        'chain_selector: %r not in %r' % (chain_selector, _CHAIN_SELECTORS.keys())

    ## convert chain_selector to a function
    chain_selector = _CHAIN_SELECTORS[chain_selector]

    ## make inverted index equiv_id --> (names, tokens)
    equiv_ids = make_chains_with_names( stream_item.body.sentences )

    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator,
                              target=rating.target)

                for eqid, (chain_mentions, chain_tokens) in equiv_ids.items():
                    if chain_selector(rating.mentions, chain_mentions):
                        ## apply the label
                        for tok in chain_tokens:
                            add_annotation(tok, label)
def get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir):
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')
    path = get_john_smith_tagged_by_lingpipe_path(test_data_dir)
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)
    o_chunk.flush()
    return fh.getvalue()
def get_john_smith_tagged_by_lingpipe_without_labels_data():
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')
    path = get_john_smith_tagged_by_lingpipe_path()
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)
    o_chunk.flush()
    return fh.getvalue()
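## The bytes returned above can be handed straight back to Chunk for reading.
## A minimal usage sketch; the data= keyword on Chunk is assumed to be
## available here, as in recent streamcorpus releases.
def _count_stream_items_sketch(chunk_bytes):
    'illustrative only: count the StreamItems in an in-memory chunk'
    return sum(1 for si in Chunk(data=chunk_bytes, mode='rb'))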
def names_in_chains(stream_item, aligner_data):
    '''
    Convert doc-level Rating object into a Label, and add that Label
    to all Token in all coref chains identified by
    aligner_data["chain_selector"]

    :param stream_item: document that has a doc-level Rating to
      translate into token-level Labels.

    :param aligner_data: dict containing:
      chain_selector: ALL or ANY
      annotator_id: string to find at stream_item.Ratings[i].annotator.annotator_id

    If chain_selector==ALL, then only apply Label to chains in which
    all of the Rating.mentions strings appear as substrings within at
    least one of the Token.token strings.

    If chain_selector==ANY, then apply Label to chains in which any
    of the Rating.mentions strings appear as a substring within at
    least one of the Token.token strings.

    If chain_selector==ANY_MULTI_TOKEN, then apply Label to chains in
    which all the names in any of the Rating.mentions strings appear
    as a substring within at least one of the Token.token strings.
    '''
    chain_selector = aligner_data.get('chain_selector', '')

    assert chain_selector in _CHAIN_SELECTORS, \
        'chain_selector: %r not in %r' % (chain_selector, _CHAIN_SELECTORS.keys())

    ## convert chain_selector to a function
    chain_selector = _CHAIN_SELECTORS[chain_selector]

    ## make inverted index equiv_id --> (names, tokens)
    equiv_ids = make_chains_with_names(stream_item.body.sentences)

    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator,
                              target=rating.target)

                for eqid, (chain_mentions, chain_tokens) in equiv_ids.items():
                    if chain_selector(rating.mentions, chain_mentions):
                        ## apply the label
                        for tok in chain_tokens:
                            add_annotation(tok, label)
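## names_in_chains() looks up its selector in a _CHAIN_SELECTORS mapping
## defined elsewhere in this module.  The helpers below are only a hedged
## sketch of the ALL/ANY semantics described in the docstring above, not the
## actual implementation (ANY_MULTI_TOKEN is omitted for brevity).
def _all_selector_sketch(rating_mentions, chain_mentions):
    'every rating mention appears as a substring of some chain string'
    return all(any(m in c for c in chain_mentions) for m in rating_mentions)

def _any_selector_sketch(rating_mentions, chain_mentions):
    'at least one rating mention appears as a substring of some chain string'
    return any(m in c for c in chain_mentions for m in rating_mentions)

_CHAIN_SELECTORS_SKETCH = {'ALL': _all_selector_sketch,
                           'ANY': _any_selector_sketch}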
def __call__(self, stream_item, context):
    '''
    Act as an incremental transform in the kba.pipeline
    '''
    ## right now, we only do clean_html
    assert self.config.get('require_clean_html', True)

    if stream_item.body and stream_item.body.clean_html:
        labels = self.make_labels(stream_item.body.clean_html,
                                  stream_item.body.clean_visible)
        if labels:
            if self.offset_type == OffsetType.LINES:
                ## for LINES-type labels, must replace clean_html
                ## with a new one that has newlines inserted
                stream_item.body.clean_html = self.clean_html

            ## Remove any previous author labels
            stream_item.body.labels['author'] = []

            ## also add the new labels
            add_annotation(stream_item.body, *labels)

    return stream_item
def line_offset_labels(stream_item, aligner_data):
    ## get a set of tokens -- must have OffsetType.LINES in them.
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:
            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(OffsetType.LINES)

            assert label_off.length == len(label_off.value.split('\n'))
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:
            #        label_off.first+label_off.length]))

            ## These next few steps are probably the most
            ## memory intensive, because they fully
            ## instantiate all the tokens.
            token_collection = SortedCollection(
                itertools.chain(*[sent.tokens for sent in sentences]),
                key=lambda tok: tok.offsets[OffsetType.LINES].first
                )
            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)
            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                if not tok.token or tok.token not in label_off.value:
                    sys.exit('%r not in %r' % \
                                 ([(t.offsets[OffsetType.LINES].first, t.token) for t in toks],
                                  label_off.value))
def line_offset_labels(stream_item, aligner_data):
    ## get a set of tokens -- must have OffsetType.LINES in them.
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:
            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(OffsetType.LINES)

            assert label_off.length == len(label_off.value.split('\n'))
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:
            #        label_off.first+label_off.length]))

            ## These next few steps are probably the most
            ## memory intensive, because they fully
            ## instantiate all the tokens.
            token_collection = SortedCollection(
                itertools.chain(*[sent.tokens for sent in sentences]),
                key=lambda tok: tok.offsets[OffsetType.LINES].first)
            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)
            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                if not tok.token or tok.token not in label_off.value:
                    sys.exit('%r not in %r' % \
                                 ([(t.offsets[OffsetType.LINES].first, t.token) for t in toks],
                                  label_off.value))
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not path_to_original.startswith('/'):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):

        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join(
                    'john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException('wrong stream_time construction: %r-->%r != %r'\
                    % (creation_time, stream_item.stream_time.epoch_ticks, correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en', name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id = str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
def make_sentences(self, stream_item):
    'assemble Sentence and Token objects'
    self.make_label_index(stream_item)
    sentences = []
    token_num = 0
    new_mention_id = 0
    for sent_start, sent_end, sent_str in self._sentences(
            stream_item.body.clean_visible):
        assert isinstance(sent_str, unicode)
        sent = Sentence()
        sentence_pos = 0
        for start, end in self.word_tokenizer.span_tokenize(sent_str):
            try:
                token_str = sent_str[start:end].encode('utf8')
            except Exception, exc:
                logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                start, end, exc_info=True)
                sys.exit('failed to cope with %r in %r' % (
                    sent_str[start:end], sent_str))
            tok = Token(
                token_num=token_num,
                token=token_str,
                sentence_pos=sentence_pos,
            )
            tok.offsets[OffsetType.BYTES] = Offset(
                type=OffsetType.BYTES,
                first=sent_start + start,
                length=end - start,
            )
            ## whitespace tokenizer will never get a token
            ## boundary in the middle of an 'author' label
            try:
                #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                label = self.label_index.find_le(sent_start + start)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                if off.first + off.length > sent_start + start:
                    logger.info('overlapping label: %r' % label.target.target_id)
                    ## overlaps
                    streamcorpus.add_annotation(tok, label)
                    assert label.annotator.annotator_id in tok.labels
                    logger.info('adding label to tok: %r has %r',
                                tok.token, label.target.target_id)

                    if label in self.label_to_mention_id:
                        mention_id = self.label_to_mention_id[label]
                    else:
                        mention_id = new_mention_id
                        new_mention_id += 1
                        self.label_to_mention_id[label] = mention_id

                    tok.mention_id = mention_id

            token_num += 1
            sentence_pos += 1
            sent.tokens.append(tok)
        sentences.append(sent)
    return sentences
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not os.path.isabs(path_to_original):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):

        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join('john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException('wrong stream_time construction: %r-->%r != %r'\
                    % (creation_time, stream_item.stream_time.epoch_ticks, correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en', name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(
                target_id=str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
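## A hedged usage sketch: stream the generated items into a chunk file on
## disk using streamcorpus.Chunk.  The output path is made up for
## illustration only.
def _write_john_smith_chunk_sketch(path_to_original, out_path='john-smith.sc'):
    o_chunk = streamcorpus.Chunk(path=out_path, mode='wb')
    for si in generate_john_smith_chunk(path_to_original):
        o_chunk.add(si)
    o_chunk.close()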
def _make_stream_item(cls, path, metadata, abs_url, entities):
    '''
    build a StreamItem from a file on disk, attaching a doc-level
    Rating for each of the given entities
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.
    creation_time = os.path.getctime(path)

    ## make stream item
    stream_item = streamcorpus.make_stream_item(creation_time, abs_url)
    stream_item.source = metadata.get('source')

    ## build a ContentItem for the body
    body = streamcorpus.ContentItem()
    body.media_type = magic.from_file(path, mime=True)

    logger.info('opening %r', path)
    with open(path) as f:
        body.raw = f.read()

    ## attach the content_item to the stream_item
    stream_item.body = body

    ## annotations
    anno = streamcorpus.Annotator()
    anno.annotator_id = metadata['annotator_id']
    anno.annotation_time = stream_item.stream_time

    num_ratings = 0
    for entity, is_profile in entities:
        num_ratings += 1

        ## pull out target id and mention tokens
        target_id = str(entity['target_id'])

        ## build a Label for the doc-level label:
        rating = streamcorpus.Rating()
        rating.annotator = anno
        rating.target = streamcorpus.Target(target_id=target_id)
        rating.contains_mention = True
        if is_profile:
            rating.flags = [streamcorpus.FlagType.PROFILE]

        ## parse slots in yaml file
        slots = cls._parse_slots(entity['slots'])

        ## heuristically split the slots string on white space and
        ## use each token as a separate mention.
        rating.mentions = [
            cleanse(unicode(slot[1], 'utf-8')) for slot in slots
        ]

        ## put this one label in the array of labels
        streamcorpus.add_annotation(stream_item, rating)

    ## provide this stream_item to the pipeline
    logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                num_ratings, stream_item.abs_url)
    return stream_item
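## _parse_slots() is defined elsewhere on this class; the code above only
## assumes it returns (slot_name, value) pairs whose values are utf-8 byte
## strings.  The entity below is a purely hypothetical sketch of what the
## yaml-derived input might look like -- the target_id and slot names are
## made up for illustration.
_EXAMPLE_ENTITY_SKETCH = {
    'target_id': 'https://kb.example.org/entity/42',
    'slots': {'canonical_name': 'John Smith',
              'alias': 'J. Smith'},
}
## ...which _parse_slots() might flatten into pairs such as:
##   [('canonical_name', 'John Smith'), ('alias', 'J. Smith')]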
def _make_stream_item(self, dir_path, fname):
    ## could use dirpath as the label.  Instead, we illustrate
    ## using a TSV file to lookup the ground truth using the fname.
    assert fname in self.ground_truth, (dir_path, fname)

    ## "mention" is the name string from the text
    ## "target_id" is the label
    mention, target_id = self.ground_truth[fname]

    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = "1998-12-31T23:59:59.999999Z"

    stream_item = streamcorpus.make_stream_item(
        creation_time,
        ## make up an abs_url
        os.path.join("john-smith-corpus", target_id, fname),
    )

    ## These docs came from the authors of the paper cited above.
    stream_item.source = "bagga-and-baldwin"

    ## build a ContentItem for the body
    body = streamcorpus.ContentItem()
    raw_string = open(os.path.join(dir_path, fname)).read()
    ## We know that this is already clean and has nothing
    ## tricky in it, because we manually cleansed it.  To
    ## illustrate how we stick all strings into thrift, we
    ## convert this to unicode (which introduces no changes)
    ## and then encode it as utf-8, which also introduces no
    ## changes.  Thrift stores strings as 8-bit character
    ## strings.
    # http://www.mail-archive.com/[email protected]/msg00210.html
    body.clean_visible = unicode(raw_string).encode("utf8")

    ## attach the content_item to the stream_item
    stream_item.body = body

    stream_item.body.language = streamcorpus.Language(code="en", name="ENGLISH")

    ## The authors also annotated the corpus
    anno = streamcorpus.Annotator()
    anno.annotator_id = "bagga-and-baldwin"
    anno.annotation_time = stream_item.stream_time

    ## build a Label for the doc-level label:
    rating = streamcorpus.Rating()
    rating.annotator = anno
    rating.target = streamcorpus.Target(target_id=target_id)
    rating.contains_mention = True

    ## heuristically split the mentions string on white space and
    ## use each token as a separate mention.  For other corpora,
    ## this might need to be more sophisticated.
    rating.mentions = map(cleanse, mention.decode("utf8").split())

    ## put this one label in the array of labels
    streamcorpus.add_annotation(stream_item, rating)

    ## provide this stream_item to the pipeline
    return stream_item
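## _make_stream_item() above looks up (mention, target_id) in
## self.ground_truth keyed by file name.  Below is a hedged sketch of how
## such a mapping might be loaded from a TSV file; the column order
## (fname, mention, target_id) is an assumption, not part of the code above.
import csv

def _load_ground_truth_sketch(tsv_path):
    'illustrative only: build {fname: (mention, target_id)} from a TSV file'
    ground_truth = {}
    with open(tsv_path, 'rb') as fh:
        for row in csv.reader(fh, delimiter='\t'):
            fname, mention, target_id = row[:3]
            ground_truth[fname] = (mention, target_id)
    return ground_truth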