def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = os.path.join('/tmp', str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    dpath = os.path.dirname(__file__)
    ipath = os.path.join(dpath, _TEST_DATA_ROOT, 'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc')

    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels({
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': ['BYTES'],
    })
    for si in Chunk(path=ipath):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    o_chunk.close()
    return tpath
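
A minimal consumer sketch (assuming the same streamcorpus Chunk reader used above; the loop body is illustrative):

## hypothetical caller of the helper above
tpath = make_hyperlink_labeled_test_chunk()
for si in Chunk(path=tpath):
    print('%s: %d label sets' % (si.stream_id, len(si.body.labels)))
os.unlink(tpath)  ## remove the temporary chunk when done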
Example #2
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = '''
Flying dogs are amazing.
The flight of the super dog Sam Vroomvroom is often cited as the first such flying dog.
'''

    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        canonical_selector=topic_name.lower(),  # this is the key that makes it appear for a profile with this title
        offsets={
            OffsetType.CHARS:
            Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
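
A sketch of reading the selector back out of the chunk written above ('out.sc' stands in for whatever out_path was passed on the command line):

for si in Chunk('out.sc', mode='rb'):
    for sel in si.body.selectors['other']:
        print(sel.canonical_selector)  ## 'the flight of the super dog sam vroomvroom'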
Example #3
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = str(tmpdir.join(str(uuid.uuid1()) + '.sc'))
    o_chunk = Chunk(tpath, mode='wb')

    ipath = get_test_chunk_path()

    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    o_chunk.close()
    return tpath
def test_matcher():

    config = dict(
        ## command to run
        fpat_path="cat"
    )

    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")

    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()

    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")

    SIs = list(ch)

    ## verify the si has expected things
    for si in SIs:
        assert len(si.body.labels) == 1

    for i in range(2):
        print SIs[i].ratings
def test_get_name_info(tmpdir):

    path = str(tmpdir.join('test_path'))
    c = Chunk(path, mode='wb')
    c.add(make_stream_item(28491, 'abs_url'))
    c.close()

    name_info = get_name_info(path, i_str='foo')
    assert name_info['date_now'] == name_info['date_time_now'][:10]
    assert name_info['date_now'] + '-' + name_info['time_now'] == name_info['date_time_now']
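
The two assertions pin down the implied layout (values illustrative, assuming the date renders as YYYY-MM-DD): date_now like '2013-04-18', time_now like '18-18-20', and date_time_now as their hyphenated concatenation '2013-04-18-18-18-20'.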
Example #6
class factorie(FactorieBase):
    '''
    incremental transform
    '''
    def __init__(self, config):
        super(factorie, self).__init__(config)

        self.toFactoriePipeName = None
        self.fromFactoriePipeName = None
        self.pipeToFactorie = None
        self.pipeFromFactorie = None
        self.taggedChunkIter = None

    def start(self):
        self.toFactoriePipeName = os.tmpnam()
        self.fromFactoriePipeName = os.tmpnam()
        os.mkfifo(self.toFactoriePipeName)
        os.mkfifo(self.fromFactoriePipeName)
        logger.debug('made fifos %r %r', self.toFactoriePipeName, self.fromFactoriePipeName)

        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)

    def close(self):
        self.pipeToFactorie.close()
        self.taggedChunkIter = None
        self.pipeFromFactorie = None
        os.unlink(self.toFactoriePipeName)
        os.unlink(self.fromFactoriePipeName)
        if self.process:
            self.process.terminate()
            self.process = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def __call__(self, stream_item, context):
        if not self.process:
            self.start()
        logger.debug('pushing stream item to factorie')
        self.pipeToFactorie.add(stream_item)
        self.pipeToFactorie.flush()
        nc = self.taggedChunkIter.next()
        logger.debug('got item from factorie')
        return nc
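
A usage sketch for the incremental transform (assuming a concrete FactorieBase with a working call_factorie; the config dict and input path are hypothetical). Since the class is a context manager, the fifos and the factorie subprocess get cleaned up even if tagging fails partway:

with factorie({}) as tagger:
    for si in Chunk(path='input.sc'):
        tagged_si = tagger(si, {})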
Example #7
def _aligner_core(t_path1, aligner, aligner_data):
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
        aligner(si, aligner_data)
        t_chunk2.add(si)
    t_chunk1.close()
    t_chunk2.close()

    if aligner_data.get('cleanup_tmp_files', True):
        logger.info('atomic rename: %r --> %r', t_path2, t_path1)
        os.rename(t_path2, t_path1)
        logger.debug('done renaming')
    else:
        # for development, leave intermediate tmp file
        shutil.copy(t_path2, t_path1)
        logger.info('copied %r -> %r', t_path2, t_path1)
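
Any callable with the signature aligner(si, aligner_data) works here; it is expected to mutate the StreamItem in place. A minimal hypothetical aligner:

def stamp_aligner(si, aligner_data):
    ## illustrative only: record which aligner touched this item
    si.source_metadata['aligned_by'] = aligner_data.get('name', 'unknown')

_aligner_core('/tmp/some_chunk.sc', stamp_aligner, {'name': 'stamp'})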
def get_john_smith_tagged_by_lingpipe_without_labels_data():
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')

    path = get_john_smith_tagged_by_lingpipe_path()
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)

    o_chunk.flush()
    return fh.getvalue()
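
The return value is the serialized chunk as a byte string. A sketch of reading it back through a file-like object (StringIO, matching the Python 2 idiom above) to confirm the per-token labels really were cleared:

data = get_john_smith_tagged_by_lingpipe_without_labels_data()
for si in Chunk(file_obj=StringIO(data), mode='rb'):
    for sentence in si.body.sentences['lingpipe']:
        assert all(not token.labels for token in sentence.tokens)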
Example #10
def get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir):
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')

    path = get_john_smith_tagged_by_lingpipe_path(test_data_dir)
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)

    o_chunk.flush()
    return fh.getvalue()
Example #11
    def __call__(self, chunk_path):
        '''
        batch-type transform stage: reads a chunk from chunk_path, and
        replaces it with a new chunk at the same path
        '''
        ## make a new output chunk at a temporary path
        tmp_chunk_path = chunk_path + '_'
        t_chunk = Chunk(path=tmp_chunk_path, mode='wb')

        for num, si in enumerate(Chunk(path=chunk_path)):
            if num < self.config['max_items']:
                t_chunk.add(si)
            else:
                break

        ## flush to disk
        t_chunk.close()

        ## atomic rename new chunk file into place
        os.rename(tmp_chunk_path, chunk_path)
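
A usage sketch (truncate_chunk is a hypothetical stand-in class; imagine the __call__ above pasted into its body, since the stage only relies on self.config['max_items']):

class truncate_chunk(object):
    def __init__(self, config):
        self.config = config
    ## ... __call__ as defined above ...

stage = truncate_chunk({'max_items': 100})
stage('/tmp/some_chunk.sc')  ## rewrites the chunk in place, keeping at most 100 items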
    #si.body.taggings['serif'] = Tagging(
    #    tagger_id = 'serif',
    #    raw_tagging = serifxml,
    #    tagger_config = 'streamcorpus-all.par',
    #    tagger_version = '6.0.1',
    #    generation_time = make_stream_time('2013-04-18T18:18:20.000000Z'),
    #    )

    ## To properly represent a tagger's output in a StreamItem, you
    ## should populate these fields:
    #si.body.sentences[tagger_id] = [Sentence(....) for ... in sentence_builder]
    ## Serif can convert serifxml into this structure.

    ## now that you have populated this StreamItem, add it to the
    ## chunk file, and go to the next StreamItem
    ch.add(si)

    print 'added StreamItem.stream_id = %s from date_hour = %s' % (
        si.stream_id, get_date_hour(si))

## after adding all the StreamItems, close the chunk:
ch.close()

## Typically, chunk files should be limited to about 500 documents or
## smaller.  There are several nice pythonic techniques for making
## many chunk files, ask us for examples to suit your circumstances.

## Typically, all of the StreamItems in a chunk file have stream_times
## from the same hour in history.  That is, if you call
## get_date_hour(si) you should get the same string for every
## StreamItem in the chunk file.
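
## One such technique, sketched below (the helper name is hypothetical; it
## assumes os, uuid, Chunk, and get_date_hour from above): start a new chunk
## file whenever the hour changes or the current file reaches 500 items.
def write_hourly_chunks(stream_items, out_dir, max_per_chunk=500):
    ch = None
    cur_hour = None
    count = 0
    for si in stream_items:
        hour = get_date_hour(si)
        if ch is None or hour != cur_hour or count >= max_per_chunk:
            if ch is not None:
                ch.close()
            cur_hour = hour
            count = 0
            ch = Chunk(os.path.join(out_dir, '%s-%s.sc' % (hour, uuid.uuid4())),
                       mode='wb')
        ch.add(si)
        count += 1
    if ch is not None:
        ch.close()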

        si.source_metadata['lang'] = pe.lang[0].code
        si.source_metadata['author'] = json.dumps(dict(
            name=pe.author[0].name,
            email=pe.author[0].email,
            link=pe.author[0].link[0].href,
        ))
        si.source = entry.source.publisher_type

        yield si


if __name__ == '__main__':
    import sys
    #from _handle_unconvertible_spinn3r import handle_unconvertible_spinn3r as hus
    #map(hus, _generate_stream_items(sys.stdin.read()))

    o_chunk = Chunk('/tmp/foo.sc', mode='wb')
    for si in _generate_stream_items(sys.stdin.read()):
        print '---post smoosh raw: %s --' % si.stream_id
        print si.body.raw
        print si.stream_id

        if si.stream_id == '1345928297-da71cfa833ce8218684b6dab152dd69b':
            o_chunk.add( si )

    o_chunk.close()
def align_chunk_with_ner(tmp_ner_path, i_chunk, tmp_done_path):
    '''
    iterate through the i_chunk and tmp_ner_path to generate a new
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = i_chunk.__iter__()
    ner = ''
    stream_id = None

    all_ner = xml.dom.minidom.parse(open(tmp_ner_path))

    for raw_ner in all_ner.getElementsByTagName('FILENAME'):
        
        stream_item = input_iter.next()
        ## get stream_id out of the XML
        stream_id = raw_ner.attributes.get('docid').value
        assert stream_id and stream_id == stream_item.stream_id, \
            '%s != %s\nner=%r' % (stream_id, stream_item.stream_id, ner)

        tagger_id = 'lingpipe'
        tagging = Tagging()
        tagging.tagger_id = tagger_id
        ## get this one file out of its FILENAME tags
        tagged_doc = list(lingpipe.files(raw_ner.toxml()))[0][1]
        tagging.raw_tagging = tagged_doc
        tagging.generation_time = streamcorpus.make_stream_time()
        stream_item.body.taggings[tagger_id] = tagging

        sentences = list(lingpipe.sentences(tagged_doc))

        ## make JS labels on individual tokens
        assert stream_item.ratings[0].mentions, stream_item.stream_id
        john_smith_label = Label()
        john_smith_label.annotator = stream_item.ratings[0].annotator
        john_smith_label.target_id = stream_item.ratings[0].target_id

        # first map all corefchains to their words
        equiv_ids = collections.defaultdict(set)
        for sent in sentences:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    equiv_ids[tok.equiv_id].add(cleanse(tok.token))

        ## find all the chains that are John Smith
        johnsmiths = set()
        for equiv_id, names in equiv_ids.items():
            ## detect 'smith' in 'smithye'
            _names = cleanse(' '.join(names))
            if 'john' in _names and 'smith' in _names:
                johnsmiths.add(equiv_id)

        print 'found %d john smith coref chains' % len(johnsmiths)
        ## now apply the label
        for sent in sentences:
            for tok in sent.tokens:
                if tok.equiv_id in johnsmiths:
                    tok.labels = [john_smith_label]

        stream_item.body.sentences[tagger_id] = sentences
        
        o_chunk.add(stream_item)

    ## put the o_chunk bytes into the specified file
    open(tmp_done_path, 'wb').write(str(o_chunk))
    ## replace this with log.info()
    print 'created %s' % tmp_done_path
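
A usage sketch (paths hypothetical; lingpipe must already have written its FILENAME-tagged XML to the ner path, one element per StreamItem, in chunk order):

i_chunk = Chunk(path='/tmp/js-0.sc', mode='rb')
align_chunk_with_ner('/tmp/js-0-ner.xml', i_chunk, '/tmp/js-0-done.sc')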