def _make_stream_item(entry):
    """Given a single spinn3r feed entry, produce a single StreamItem.

    Returns ``None`` if a complete item can't be constructed.

    """
    # get standard metadata, assuming it's present...
    if not hasattr(entry, 'permalink_entry'):
        return None
    pe = entry.permalink_entry

    # ...and create a streamitem...
    si = streamcorpus.make_stream_item(
        pe.date_found[:-1] + '.0Z',
        pe.canonical_link.href.encode('utf8'))
    if not si.stream_time:
        logger.debug('failed to generate stream_time from {0!r}'.format(
            pe.date_found))
        return None
    if not si.abs_url:
        logger.debug('failed to generate abs_url from {0!r}'.format(
            pe.canonical_link.href))
        return None

    # ...filling in the actual data
    si.body = _make_content_item(
        pe.content,
        alternate_data=entry.feed_entry.content.data)
    if not si.body:
        return None
    if not si.body.raw:
        return None

    if pe.content_extract.data:
        si.other_content['extract'] = _make_content_item(pe.content_extract)

    si.other_content['title'] = streamcorpus.ContentItem(
        raw=pe.title.encode('utf8'),
        media_type=pe.content_extract.mime_type,
        encoding='UTF-8')
    si.other_content['feed_entry_title'] = streamcorpus.ContentItem(
        raw=entry.feed_entry.title.encode('utf8'),
        media_type=entry.feed_entry.content.mime_type,
        encoding='UTF-8')

    if entry.feed_entry.content.data:
        si.other_content['feed_entry'] = _make_content_item(
            entry.feed_entry.content)

    si.source_metadata['lang'] = pe.lang[0].code
    si.source_metadata['author'] = json.dumps(
        dict(
            name=pe.author[0].name,
            email=pe.author[0].email,
            link=pe.author[0].link[0].href,
        ))
    si.source = entry.source.publisher_type
    return si
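
# Illustrative sketch (hypothetical helper, not part of the original module):
# the two guards above mirror what streamcorpus.make_stream_item() needs in
# order to succeed -- a parseable zulu timestamp and a non-empty URL.  The
# timestamp format matches the '<date>.0Z' form built above; the URL is
# made up.
def _example_stream_item_preconditions():
    si = streamcorpus.make_stream_item('2012-01-01T00:00:00.0Z',
                                       'http://example.com/article')
    assert si.stream_time is not None
    assert si.abs_url
    return si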
class from_serifxml(Configured):
    '''Read a Serif XML intermediate file as the input to the pipeline.

    This is a specialized reader for unusual circumstances; you will
    still need to run :class:`~streamcorpus_pipeline._serif.serif`
    with special settings to complete the tagging.

    This expects to find serifxml flat files in a directory and
    creates a :class:`~streamcorpus.Tagging` with
    :attr:`~streamcorpus.Tagging.raw_tagging` holding the serifxml
    string.  This :class:`~streamcorpus.Tagging` is stored in
    :attr:`~streamcorpus.StreamItem.body.taggings`.  This also fills
    in the :attr:`~streamcorpus.ContentItem.raw` field.

    This has one configuration option, which can usually be left at
    its default value:

    .. code-block:: yaml

        streamcorpus_pipeline:
          from_serifxml:
            tagger_id: serif

    `tagger_id` is the tagger name in the generated
    :class:`~streamcorpus.StreamItem`.

    To obtain :attr:`~streamcorpus.StreamItem.body.sentences`, one
    must run Serif in the special `read_serifxml` mode:

    .. code-block:: yaml

        streamcorpus_pipeline:
          third_dir_path: /third
          tmp_dir_path: tmp
          reader: from_serifxml
          incremental_transforms:
          - language
          - guess_media_type
          - clean_html
          - clean_visible
          - title
          batch_transforms:
          - serif
          language:
            force:
              name: English
              code: en
          guess_media_type:
            fallback_media_type: text/plain
          serif:
            path_in_third: serif/serif-latest
            serif_exe: bin/x86_64/Serif
            par: streamcorpus_read_serifxml
            par_additions:
              streamcorpus_read_serifxml:
              - "# example additional line"
          writer: to_local_chunks
          to_local_chunks:
            output_type: otherdir
            output_path: test_output
            output_name: "%(input_fname)s"

    '''
    config_name = 'from_serifxml'
    default_config = {
        'tagger_id': 'serif',
    }

    def __call__(self, i_str):
        # Read in the entire contents as text; we will need to
        # save it away later
        with open(i_str, 'r') as f:
            serifxml = f.read()

        fname = os.path.basename(i_str)
        stream_time = None
        date_m = date_in_file_name_re.match(fname)
        if date_m:
            year = int(date_m.group('year'))
            month = int(date_m.group('month'))
            day = int(date_m.group('day'))
            try:
                stream_time = streamcorpus.make_stream_time(
                    zulu_timestamp='%d-%02d-%02dT00:00:01.000000Z'
                    % (year, month, day))
            except Exception, exc:
                logger.info(
                    'trapped failed parsing of file name to make stream_time',
                    exc_info=True)
                stream_time = None

        if not stream_time:
            ## fall back to using the present moment on this system
            epoch_ticks = time.time()  ### NOT IN THE SERIFXML FILE
            stream_time = streamcorpus.make_stream_time(
                epoch_ticks=epoch_ticks)

        # Parse the XML
        root = etree.fromstring(serifxml)

        # Get some key parts
        doc_id = root.xpath('string(/SerifXML/Document/@docid)')
        source = root.xpath('string(/SerifXML/Document/@source_type)')
        raw = root.xpath('string(/SerifXML/Document/OriginalText/Contents)')

        # Build the stream item
        tagging = streamcorpus.Tagging(
            tagger_id=self.config['tagger_id'],
            raw_tagging=serifxml,
        )
        body = streamcorpus.ContentItem(
            raw=raw,
            taggings={
                self.config['tagger_id']: tagging,
            },
        )
        si = streamcorpus.StreamItem(
            version=streamcorpus.Versions.v0_3_0,
            doc_id=doc_id,
            abs_url=fname,
            source=source,
            body=body,
            stream_id='%d-%s' % (stream_time.epoch_ticks, doc_id),
            stream_time=stream_time,
        )
        yield si
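
# A minimal sketch (hypothetical, not part of the original reader) of the
# XPath extraction used in __call__ above, run against a tiny hand-written
# serifxml-like fragment; real Serif output is much larger.
def _example_serifxml_xpath():
    from lxml import etree
    xml = ('<SerifXML><Document docid="doc-1" source_type="news">'
           '<OriginalText><Contents>Hello world.</Contents></OriginalText>'
           '</Document></SerifXML>')
    root = etree.fromstring(xml)
    doc_id = root.xpath('string(/SerifXML/Document/@docid)')
    source = root.xpath('string(/SerifXML/Document/@source_type)')
    raw = root.xpath('string(/SerifXML/Document/OriginalText/Contents)')
    return doc_id, source, raw   # ('doc-1', 'news', 'Hello world.')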
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not os.path.isabs(path_to_original):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):
        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join('john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException(
                    'wrong stream_time construction: %r-->%r != %r'
                    % (creation_time, stream_item.stream_time.epoch_ticks,
                       correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()

            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en',
                                                              name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Rating for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id=str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
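
# Hypothetical sanity-check sketch (not part of the original reader): the
# fixed corpus creation time above corresponds to the epoch value that the
# loop compares against.
def _example_john_smith_creation_time():
    st = streamcorpus.make_stream_time(
        zulu_timestamp='1998-12-31T23:59:59.999999Z')
    assert int(st.epoch_ticks) == 915148799
    return st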
def _make_stream_item(cls, path, metadata, abs_url, entities):
    '''Build a StreamItem from the file at `path`, attaching one
    doc-level rating per entry in `entities`.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.
    creation_time = os.path.getctime(path)

    ## make stream item
    stream_item = streamcorpus.make_stream_item(creation_time, abs_url)
    stream_item.source = metadata.get('source')

    ## build a ContentItem for the body
    body = streamcorpus.ContentItem()
    body.media_type = magic.from_file(path, mime=True)

    logger.info('opening %r', path)
    with open(path) as f:
        body.raw = f.read()

    ## attach the content_item to the stream_item
    stream_item.body = body

    ## annotations
    anno = streamcorpus.Annotator()
    anno.annotator_id = metadata['annotator_id']
    anno.annotation_time = stream_item.stream_time

    num_ratings = 0
    for entity, is_profile in entities:
        num_ratings += 1

        ## pull out target id and mention tokens
        target_id = str(entity['target_id'])

        ## build a Rating for the doc-level label:
        rating = streamcorpus.Rating()
        rating.annotator = anno
        rating.target = streamcorpus.Target(target_id=target_id)
        rating.contains_mention = True

        if is_profile:
            rating.flags = [streamcorpus.FlagType.PROFILE]

        ## parse slots in yaml file
        slots = cls._parse_slots(entity['slots'])

        ## heuristically split the slots string on white space and
        ## use each token as a separate mention.
        rating.mentions = [cleanse(unicode(slot[1], 'utf-8'))
                           for slot in slots]

        ## put this one label in the array of labels
        streamcorpus.add_annotation(stream_item, rating)

    ## provide this stream_item to the pipeline
    logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                num_ratings, stream_item.abs_url)

    return stream_item
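
# Hypothetical sketch (not part of the original module): a doc-level Rating
# flagged as a profile document, mirroring the flags logic above.  The
# annotator id and target id are made up.
def _example_profile_rating():
    anno = streamcorpus.Annotator(annotator_id='example-annotator')
    return streamcorpus.Rating(
        annotator=anno,
        target=streamcorpus.Target(target_id='example-target-1'),
        contains_mention=True,
        flags=[streamcorpus.FlagType.PROFILE])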
def __call__(self, si, context=None):
    if si.version == streamcorpus.Versions.v0_3_0:
        return si

    if not hasattr(si, 'version'):
        raise NotImplementedError(
            'upgrade_streamcorpus_v0_3_0 does not support upgrading '
            'from v0_1_0; see "_upgrade_streamcorpus.py"')

    si3 = streamcorpus.make_stream_item(
        zulu_timestamp=si.stream_time.zulu_timestamp,
        abs_url=si.abs_url)

    if si3.stream_id != si.stream_id:
        si3.external_ids['kba-2013'] = {si3.stream_id: si.stream_id}

    ## copy everything
    for attr in [
            'original_url', 'ratings', 'schost', 'source',
            'source_metadata',
    ]:
        setattr(si3, attr, copy.deepcopy(getattr(si, attr)))

    si3.body = streamcorpus.ContentItem()

    for name, ci in si.other_content.items():
        ci3 = streamcorpus.ContentItem()
        si3.other_content[name] = ci3
        for attr in content_item_attrs:
            setattr(ci3, attr, copy.deepcopy(getattr(ci, attr)))
        upgrade_labels(ci, ci3)

    for attr in content_item_attrs:
        setattr(si3.body, attr, copy.deepcopy(getattr(si.body, attr)))
    upgrade_labels(si.body, si3.body)

    ## fix the body.sentences['lingpipe'] mention_id ranges
    next_global_mention_id = 0
    ## mapping from (sentence_id, mention_id) --> global_mention_id
    mention_ids = {}
    si3.body.sentences['lingpipe'] = []
    for sentence_id, sentence in enumerate(
            si.body.sentences.get('lingpipe', [])):
        new_sent = streamcorpus.Sentence()
        si3.body.sentences['lingpipe'].append(new_sent)
        for token_id, token in enumerate(sentence.tokens):
            new_token = streamcorpus.Token()
            new_sent.tokens.append(new_token)
            for attr in [
                    'token_num', 'token', 'offsets', 'sentence_pos',
                    'lemma', 'pos', 'entity_type', 'mention_id',
                    'equiv_id', 'parent_id', 'dependency_path',
            ]:
                setattr(new_token, attr, copy.deepcopy(getattr(token, attr)))
            upgrade_labels(token, new_token)

            if token.mention_id not in [-1, None]:
                key = (sentence_id, token.mention_id)
                if key in mention_ids:
                    new_mention_id = mention_ids[key]
                else:
                    new_mention_id = next_global_mention_id
                    next_global_mention_id += 1
                    ## save it for later
                    mention_ids[key] = new_mention_id

                new_token.mention_id = new_mention_id
                logger.debug('new_mention_id = %d' % new_mention_id)

                if token.entity_type in [3, 4]:
                    ## convert FEMALE/MALE_PRONOUN
                    new_token.mention_type = streamcorpus.MentionType.PRO
                    new_token.entity_type = streamcorpus.EntityType.PER

                    if token.entity_type == 3:
                        gender_value = 1
                    else:
                        gender_value = 0
                    attr = streamcorpus.Attribute(
                        attribute_type=streamcorpus.AttributeType.PER_AGE,
                        evidence=token.token,
                        value=str(gender_value),
                        sentence_id=sentence_id,
                        mention_id=token.mention_id)
                    if 'lingpipe' not in si3.body.attributes:
                        si3.body.attributes['lingpipe'] = []
                    si3.body.attributes['lingpipe'].append(attr)
                else:
                    new_token.mention_type = streamcorpus.MentionType.NAME

    ## return our newly manufactured v0_3_0 StreamItem
    return si3
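
# Illustrative sketch (hypothetical, plain Python): the per-sentence
# (sentence_id, mention_id) pairs handled above collapse into one global
# numbering; a repeated pair reuses the id that was already assigned.
def _example_global_mention_ids():
    per_sentence = [(0, 0), (0, 1), (1, 0), (1, 1), (1, 0)]
    mention_ids = {}
    next_global_mention_id = 0
    remapped = []
    for key in per_sentence:
        if key not in mention_ids:
            mention_ids[key] = next_global_mention_id
            next_global_mention_id += 1
        remapped.append(mention_ids[key])
    return remapped   # [0, 1, 2, 3, 2]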
    The ContentItem is created with raw data set to ``node.data``,
    decompressed if the node's encoding is 'zlib', and UTF-8
    normalized, with a MIME type from ``node.mime_type``.

    ``node``
      the actual node from the spinn3r protobuf data
    ``mime_type``
      string MIME type to use (defaults to ``node.mime_type``)
    ``alternate_data``
      alternate (compressed) data to use, if ``node.data`` is missing
      or can't be decompressed

    """
    raw = node.data
    if getattr(node, 'encoding', None) == 'zlib':
        try:
            raw = zlib.decompress(node.data)
        except Exception, exc:
            if alternate_data is not None:
                try:
                    raw = zlib.decompress(alternate_data)
                except Exception, eee:
                    raise exc  # the original exception
            else:
                raise
    if mime_type is None:
        mime_type = node.mime_type
    raw = raw.decode('utf8').encode('utf8')
    return streamcorpus.ContentItem(raw=raw, media_type=mime_type)
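
# Illustrative sketch (hypothetical helper): the decompression branch above
# simply reverses zlib.compress on the node payload; a round trip looks
# like this.
def _example_zlib_round_trip():
    original = u'hello world'.encode('utf8')
    compressed = zlib.compress(original)
    assert zlib.decompress(compressed) == original
    return compressed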