def _make_stream_item(entry):
    """Given a single spinn3r feed entry, produce a single StreamItem.

    Returns 'None' if a complete item can't be constructed.

    """
    # get standard metadata, assuming it's present...
    if not hasattr(entry, 'permalink_entry'):
        return None
    pe = entry.permalink_entry

    # ...and create a streamitem...
    si = streamcorpus.make_stream_item(pe.date_found[:-1] + '.0Z',
                                       pe.canonical_link.href.encode('utf8'))
    if not si.stream_time:
        logger.debug('failed to generate stream_time from {0!r}'.format(
            pe.date_found))
        return None
    if not si.abs_url:
        logger.debug('failed to generate abs_url from {0!r}'.format(
            pe.canonical_link.href))
        return None

    # ...filling in the actual data
    si.body = _make_content_item(pe.content,
                                 alternate_data=entry.feed_entry.content.data)
    if not si.body: return None
    if not si.body.raw: return None

    if pe.content_extract.data:
        si.other_content['extract'] = _make_content_item(pe.content_extract)
    si.other_content['title'] = streamcorpus.ContentItem(
        raw=pe.title.encode('utf8'),
        media_type=pe.content_extract.mime_type,
        encoding='UTF-8')
    si.other_content['feed_entry_title'] = streamcorpus.ContentItem(
        raw=entry.feed_entry.title.encode('utf8'),
        media_type=entry.feed_entry.content.mime_type,
        encoding='UTF-8')
    if entry.feed_entry.content.data:
        si.other_content['feed_entry'] = _make_content_item(
            entry.feed_entry.content)
    si.source_metadata['lang'] = pe.lang[0].code
    si.source_metadata['author'] = json.dumps(
        dict(
            name=pe.author[0].name,
            email=pe.author[0].email,
            link=pe.author[0].link[0].href,
        ))
    si.source = entry.source.publisher_type
    return si
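## Hedged usage sketch (the helper below is illustrative, not part of
## the original example): given an iterable of spinn3r protobuf
## entries, map _make_stream_item over it and drop entries that could
## not be turned into complete StreamItems.
def _stream_items_from_entries(entries):
    for entry in entries:
        si = _make_stream_item(entry)
        if si is not None:
            yield si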
Example #2
class from_serifxml(Configured):
    '''Read a Serif XML intermediate file as the input to the pipeline.

    This is a specialized reader for unusual circumstances; you will
    still need to run :class:`~streamcorpus_pipeline._serif.serif`
    with special settings to complete the tagging.  This expects to
    find serifxml flat files in a directory and creates a
    :class:`~streamcorpus.Tagging` with
    :attr:`~streamcorpus.Tagging.raw_tagging` holding the serifxml
    string.  This :class:`~streamcorpus.Tagging` is stored in
    :attr:`~streamcorpus.StreamItem.body.taggings`.

    This also fills in the :attr:`~streamcorpus.ContentItem.raw` field.

    This has one configuration option, which can usually be left at
    its default value:

    .. code-block:: yaml

        streamcorpus_pipeline:
          from_serifxml:
            tagger_id: serif

    `tagger_id` is the tagger name in the generated
    :class:`~streamcorpus.StreamItem`.

    To obtain :attr:`~streamcorpus.StreamItem.body.sentences`, one
    must run Serif in the special `read_serifxml` mode:

    .. code-block:: yaml

        streamcorpus_pipeline:
          third_dir_path: /third
          tmp_dir_path: tmp
          reader: from_serifxml
          incremental_transforms:
          - language
          - guess_media_type
          - clean_html
          - clean_visible
          - title
          batch_transforms:
          - serif
          language:
            force:
              name: English
              code: en
          guess_media_type:
            fallback_media_type: text/plain
          serif:
            path_in_third: serif/serif-latest
            serif_exe: bin/x86_64/Serif
            par: streamcorpus_read_serifxml
            par_additions:
              streamcorpus_read_serifxml:
              - "# example additional line"
          writer: to_local_chunks
          to_local_chunks:
            output_type: otherdir
            output_path: test_output
            output_name: "%(input_fname)s"

    '''
    config_name = 'from_serifxml'
    default_config = {
        'tagger_id': 'serif',
    }

    def __call__(self, i_str):
        # Read in the entire contents as text; we will need to
        # save it away later
        with open(i_str, 'r') as f:
            serifxml = f.read()

        fname = os.path.basename(i_str)
        stream_time = None
        date_m = date_in_file_name_re.match(fname)
        if date_m:
            year = int(date_m.group('year'))
            month = int(date_m.group('month'))
            day = int(date_m.group('day'))
            try:
                stream_time = streamcorpus.make_stream_time(
                    zulu_timestamp='%d-%02d-%02dT00:00:01.000000Z' %
                    (year, month, day))
            except Exception, exc:
                logger.info(
                    'trapped failed parsing of file name to make stream_time',
                    exc_info=True)
                stream_time = None

        if not stream_time:
            ## fall back to using the present moment on this system
            epoch_ticks = time.time()  ### NOT IN THE SERIFXML FILE
            stream_time = streamcorpus.make_stream_time(
                epoch_ticks=epoch_ticks)

        # Parse the XML
        root = etree.fromstring(serifxml)

        # Get some key parts
        doc_id = root.xpath('string(/SerifXML/Document/@docid)')
        source = root.xpath('string(/SerifXML/Document/@source_type)')
        raw = root.xpath('string(/SerifXML/Document/OriginalText/Contents)')

        # Build the streamitem
        tagging = streamcorpus.Tagging(
            tagger_id=self.config['tagger_id'],
            raw_tagging=serifxml,
        )
        body = streamcorpus.ContentItem(
            raw=raw,
            taggings={
                self.config['tagger_id']: tagging,
            },
        )
        si = streamcorpus.StreamItem(
            version=streamcorpus.Versions.v0_3_0,
            doc_id=doc_id,
            abs_url=fname,
            source=source,
            body=body,
            stream_id='%d-%s' % (stream_time.epoch_ticks, doc_id),
            stream_time=stream_time,
        )
        yield si
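## Hedged usage sketch (the `reader` and `paths` names are
## illustrative, not from the original example): driving the stage
## directly, each serifxml path yields StreamItems whose
## body.taggings['serif'] (the default tagger_id) holds the raw
## serifxml string.
def _log_serifxml_items(reader, paths):
    for path in paths:
        for si in reader(path):
            logger.info('read %s (abs_url=%r, %d bytes of serifxml)',
                        si.stream_id, si.abs_url,
                        len(si.body.taggings['serif'].raw_tagging))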
Example #3
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not os.path.isabs(path_to_original):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):

        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join('john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException(
                    'wrong stream_time construction: %r-->%r != %r'
                    % (creation_time, stream_item.stream_time.epoch_ticks,
                       correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en',
                                                              name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(
                target_id=str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
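## Hedged usage sketch (the output path is illustrative, not from the
## original example): the generator above can be drained into an
## on-disk streamcorpus Chunk file.
def _write_john_smith_chunk(path_to_original, o_path='john-smith.sc'):
    o_chunk = streamcorpus.Chunk(path=o_path, mode='wb')
    for si in generate_john_smith_chunk(path_to_original):
        o_chunk.add(si)
    o_chunk.close()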
Example #4
    @classmethod
    def _make_stream_item(cls, path, metadata, abs_url, entities):
        '''Build a StreamItem from the flat file at ``path``.

        ``metadata`` supplies the source and annotator_id, ``abs_url``
        becomes the item's absolute URL, and ``entities`` yields
        (entity, is_profile) pairs that become document-level Ratings.
        '''
        ## Every StreamItem has a stream_time property.  It usually comes
        ## from the document creation time.
        creation_time = os.path.getctime(path)

        ## make stream item
        stream_item = streamcorpus.make_stream_item(creation_time, abs_url)

        stream_item.source = metadata.get('source')

        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        body.media_type = magic.from_file(path, mime=True)

        logger.info('opening %r', path)
        with open(path) as f:
            body.raw = f.read()

        ## attach the content_item to the stream_item
        stream_item.body = body

        ## annotations
        anno = streamcorpus.Annotator()
        anno.annotator_id = metadata['annotator_id']
        anno.annotation_time = stream_item.stream_time

        num_ratings = 0
        for entity, is_profile in entities:
            num_ratings += 1

            ## pull out target id and mention tokens
            target_id = str(entity['target_id'])

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id=target_id)
            rating.contains_mention = True

            if is_profile:
                rating.flags = [streamcorpus.FlagType.PROFILE]

            ## parse slots in yaml file
            slots = cls._parse_slots(entity['slots'])

            ## heuristically split the slots string on white space and
            ## use each token as a separate mention.
            rating.mentions = [
                cleanse(unicode(slot[1], 'utf-8')) for slot in slots
            ]

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

        ## provide this stream_item to the pipeline
        logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                    num_ratings, stream_item.abs_url)
        return stream_item
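## Hedged follow-up sketch (the helper name is illustrative, not from
## the original example): the document-level Ratings added above land
## in stream_item.ratings, keyed by the annotator_id from the
## metadata, so profile-flagged ratings can be counted like this.
def _count_profile_ratings(stream_item, annotator_id):
    n = 0
    for rating in stream_item.ratings.get(annotator_id, []):
        if rating.flags and streamcorpus.FlagType.PROFILE in rating.flags:
            n += 1
    return n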
Example #5
    def __call__(self, si, context=None):
        if not hasattr(si, 'version'):
            ## items with no version field at all are v0_1_0
            raise NotImplementedError(
                'upgrade_streamcorpus_v0_3_0 does not support upgrading '
                'from v0_1_0; see "_upgrade_streamcorpus.py"')

        if si.version == streamcorpus.Versions.v0_3_0:
            ## already at the target version; nothing to do
            return si

        si3 = streamcorpus.make_stream_item(
            zulu_timestamp=si.stream_time.zulu_timestamp, abs_url=si.abs_url)

        if si3.stream_id != si.stream_id:
            si3.external_ids['kba-2013'] = {si3.stream_id: si.stream_id}

        ## copy everything
        for attr in [
                'original_url',
                'ratings',
                'schost',
                'source',
                'source_metadata',
        ]:
            setattr(si3, attr, copy.deepcopy(getattr(si, attr)))

        si3.body = streamcorpus.ContentItem()

        for name, ci in si.other_content.items():
            ci3 = streamcorpus.ContentItem()
            si3.other_content[name] = ci3
            for attr in content_item_attrs:
                setattr(ci3, attr, copy.deepcopy(getattr(ci, attr)))
            upgrade_labels(ci, ci3)

        for attr in content_item_attrs:
            setattr(si3.body, attr, copy.deepcopy(getattr(si.body, attr)))

        upgrade_labels(si.body, si3.body)

        ## fix the body.sentences['lingpipe'] mention_id ranges
        next_global_mention_id = 0
        ## mapping from (sentence_id, mention_id) --> global_mention_id
        mention_ids = {}
        si3.body.sentences['lingpipe'] = []
        for sentence_id, sentence in enumerate(
                si.body.sentences.get('lingpipe', [])):
            new_sent = streamcorpus.Sentence()
            si3.body.sentences['lingpipe'].append(new_sent)

            for token_id, token in enumerate(sentence.tokens):

                new_token = streamcorpus.Token()
                new_sent.tokens.append(new_token)

                for attr in [
                        'token_num', 'token', 'offsets', 'sentence_pos',
                        'lemma', 'pos', 'entity_type', 'mention_id',
                        'equiv_id', 'parent_id', 'dependency_path'
                ]:
                    setattr(new_token, attr,
                            copy.deepcopy(getattr(token, attr)))

                upgrade_labels(token, new_token)

                if token.mention_id not in [-1, None]:
                    key = (sentence_id, token.mention_id)
                    if key in mention_ids:
                        new_mention_id = mention_ids[key]

                    else:
                        new_mention_id = next_global_mention_id
                        next_global_mention_id += 1

                        ## save it for later
                        mention_ids[key] = new_mention_id

                    new_token.mention_id = new_mention_id
                    logger.debug('new_mention_id = %d', new_mention_id)

                    if token.entity_type in [3, 4]:
                        ## convert FEMALE/MALE_PRONOUN
                        new_token.mention_type = streamcorpus.MentionType.PRO
                        new_token.entity_type = streamcorpus.EntityType.PER

                        if token.entity_type == 3:
                            gender_value = 1
                        else:
                            gender_value = 0

                        attr = streamcorpus.Attribute(
                            attribute_type=streamcorpus.AttributeType.PER_AGE,
                            evidence=token.token,
                            value=str(gender_value),
                            sentence_id=sentence_id,
                            mention_id=token.mention_id)

                        if 'lingpipe' not in si3.body.attributes:
                            si3.body.attributes['lingpipe'] = []
                        si3.body.attributes['lingpipe'].append(attr)

                    else:
                        new_token.mention_type = streamcorpus.MentionType.NAME

        ## return our newly manufactured v0_3_0 StreamItem
        return si3
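## Hedged, standalone illustration (not from the original example) of
## the (sentence_id, mention_id) -> global mention id remapping used
## above: a local mention id that repeats across sentences gets a new
## global id, while repeats within the same sentence share one.
def _remap_mention_ids(pairs):
    mention_ids = {}
    next_global_mention_id = 0
    remapped = []
    for sentence_id, mention_id in pairs:
        key = (sentence_id, mention_id)
        if key not in mention_ids:
            mention_ids[key] = next_global_mention_id
            next_global_mention_id += 1
        remapped.append(mention_ids[key])
    return remapped

## e.g. _remap_mention_ids([(0, 0), (0, 0), (1, 0)]) == [0, 0, 1]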
def _make_content_item(node, mime_type=None, alternate_data=None):
    """Build a ContentItem from a single spinn3r protobuf content node.

    The ContentItem is created with raw data set to ``node.data``,
    decompressed if the node's encoding is 'zlib', and UTF-8
    normalized, with a MIME type from ``node.mime_type``.

    ``node``
      the actual node from the spinn3r protobuf data
    ``mime_type``
      string MIME type to use (defaults to ``node.mime_type``)
    ``alternate_data``
      alternate (compressed) data to use, if ``node.data`` is missing
      or can't be decompressed

    """
    raw = node.data
    if getattr(node, 'encoding', None) == 'zlib':
        try:
            raw = zlib.decompress(node.data)
        except Exception, exc:
            if alternate_data is not None:
                try:
                    raw = zlib.decompress(alternate_data)
                except Exception:
                    ## the alternate data failed too; surface the
                    ## original decompression error
                    raise exc
            else:
                ## nothing to fall back to
                raise
    if mime_type is None:
        mime_type = node.mime_type
    raw = raw.decode('utf8').encode('utf8')
    return streamcorpus.ContentItem(raw=raw, media_type=mime_type)
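## Hedged, self-contained illustration (the fake node class and demo
## function are not part of the original module) of the zlib fallback
## above: when the node's own data will not decompress, alternate_data
## is tried before giving up.
class _FakeSpinn3rNode(object):
    def __init__(self, data, encoding=None, mime_type='text/plain'):
        self.data = data
        self.encoding = encoding
        self.mime_type = mime_type

def _demo_zlib_fallback():
    good = zlib.compress(u'hello'.encode('utf8'))
    node = _FakeSpinn3rNode(data='not zlib at all', encoding='zlib')
    ci = _make_content_item(node, alternate_data=good)
    assert ci.raw.decode('utf8') == u'hello'
    assert ci.media_type == 'text/plain'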