Example #1
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = tmpdir.join(str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    ipath = get_test_chunk_path()

    hl = hyperlink_labels(config={
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    })
    cv = make_clean_visible(config={})
    for si in Chunk(path=ipath, message=streamcorpus.StreamItem_v0_2_0):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    o_chunk.close()
    return tpath
Example #2
def make_hyperlink_labeled_test_chunk():
    '''
    returns a path to a temporary chunk that has been hyperlink labeled
    '''
    tpath = os.path.join('/tmp', str(uuid.uuid1()) + '.sc')
    o_chunk = Chunk(tpath, mode='wb')

    dpath = os.path.dirname(__file__)
    ipath = os.path.join( dpath, _TEST_DATA_ROOT, 'test/WEBLOG-100-fd5f05c8a680faa2bf8c55413e949bbf.sc' )

    cv = _init_stage('clean_visible', {})
    hl = hyperlink_labels(
        {'require_abs_url': True, 
         'all_domains': True,
         'offset_types': ['BYTES']}
        )
    for si in Chunk(path=ipath):
        ## clear out existing labels and tokens
        si.body.labels = {}
        si.body.sentences = {}
        context = {}
        hl(si, context)
        cv(si, context)
        o_chunk.add(si)

    o_chunk.close()
    return tpath
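Both versions of the helper above return the path of the labeled chunk on disk. Below is a minimal sketch, not part of the original examples, of how such a helper might be exercised in a test; it assumes the same Chunk API used throughout, and the function name check_labeled_chunk is hypothetical.

def check_labeled_chunk():
    ## build the labeled chunk and read it back from disk
    tpath = make_hyperlink_labeled_test_chunk()
    found_labels = False
    for si in Chunk(path=tpath, mode='rb'):
        if si.body.labels:
            found_labels = True
            break
    assert found_labels, 'expected at least one hyperlink-labeled StreamItem'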
Example #3
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('out_path')
    args = parser.parse_args()

    si = make_stream_item(1, 'http://crazydog.com')
    si.body.raw = '''
Flying dogs are amazing.
The flight of the super dog Sam Vroomvroom is often cited as the first such flying dog.
'''

    topic_name = 'The flight of the super dog Sam Vroomvroom'
    sel = Selector(
        selector_type=SelectorType.TOPIC.value,
        raw_selector=topic_name,
        canonical_selector=topic_name.lower(),  # this is the key for making it appear for a profile of this title
        offsets={
            OffsetType.CHARS:
            Offset(
                type=OffsetType.CHARS,
                first=si.body.raw.find('The'),
                length=len(topic_name),
            )
        },
    )
    si.body.selectors['other'] = [sel]

    chunk = Chunk(args.out_path, mode='wb')
    chunk.add(si)
    chunk.close()
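main() above only writes the chunk. The sketch below, which is illustrative rather than part of the example, reads the chunk back and recovers the selected span from the recorded character offset; inspect_selectors is a hypothetical name, and the Chunk, Selector, and OffsetType objects are assumed to be the same ones used above.

def inspect_selectors(out_path):
    for si in Chunk(out_path, mode='rb'):
        for sel in si.body.selectors.get('other', []):
            off = sel.offsets[OffsetType.CHARS]
            ## slice the raw text with the stored char offset
            span = si.body.raw[off.first:off.first + off.length]
            print '%s -> %r' % (sel.canonical_selector, span)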
Example #4
def test_matcher():

    config = dict(
        ## command to run
        fpat_path="cat"
    )

    fm = fpat_matcher(config)

    si1 = make_stream_item(None, "http://example.com")
    si1.body = ContentItem(clean_visible="hello! This is a test of matching Bob.")

    si2 = make_stream_item(None, "http://example.com")
    si2.body = ContentItem(clean_visible="hello! This is a test of matching Sally.")

    chunk_path = "/tmp/%s" % uuid.uuid1()

    ch = Chunk(chunk_path, mode="wb")
    ch.add(si1)
    ch.add(si1)
    ch.add(si2)
    ch.close()

    fm(chunk_path)

    ch = Chunk(chunk_path, mode="rb")

    SIs = list(ch)

    ## verify the si has expected things
    for si in SIs:
        assert len(si.body.labels) == 1

    for i in range(2):
        print SIs[i].ratings
Example #5
class factorie(FactorieBase):
    '''
    incremental transform
    '''
    def __init__(self, config):
        super(factorie, self).__init__(config)

        self.toFactoriePipeName = None
        self.fromFactoriePipeName = None
        self.pipeToFactorie = None
        self.pipeFromFactorie = None
        self.taggedChunkIter = None

    def start(self):
        self.toFactoriePipeName = os.tmpnam()
        self.fromFactoriePipeName = os.tmpnam()
        os.mkfifo(self.toFactoriePipeName)
        os.mkfifo(self.fromFactoriePipeName)
        logger.debug('made fifos %r %r', self.toFactoriePipeName, self.fromFactoriePipeName)

        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)

        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)

    def close(self):
        self.pipeToFactorie.close()
        self.taggedChunkIter = None
        self.pipeFromFactorie = None
        os.unlink(self.toFactoriePipeName)
        os.unlink(self.fromFactoriePipeName)
        if self.process:
            self.process.terminate()
            self.process = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False

    def __call__(self, stream_item, context):
        if not self.process:
            self.start()
        logger.debug('pushing stream item to factorie')
        self.pipeToFactorie.add(stream_item)
        self.pipeToFactorie.flush()
        nc = self.taggedChunkIter.next()
        logger.debug('got item from factorie')
        return nc
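Because the class defines __enter__ and __exit__, one assumed usage pattern (not shown in the source) is to drive it as a context manager, so the FIFOs and the external factorie process are cleaned up even if tagging fails part way; tag_chunk is a hypothetical helper built on the same Chunk API as the other examples.

def tag_chunk(in_path, out_path, config):
    o_chunk = Chunk(out_path, mode='wb')
    with factorie(config) as tagger:
        for si in Chunk(path=in_path, mode='rb'):
            ## each call pushes one item through the FIFO pair and blocks
            ## until the tagged item comes back from the factorie process
            o_chunk.add(tagger(si, {}))
    o_chunk.close()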
Example #6
def _aligner_core(t_path1, aligner, aligner_data):
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
        aligner(si, aligner_data)
        t_chunk2.add(si)
    t_chunk1.close()
    t_chunk2.close()

    if aligner_data.get('cleanup_tmp_files', True):
        logger.info('atomic rename: %r --> %r', t_path2, t_path1)
        os.rename(t_path2, t_path1)
        logger.debug('done renaming')
    else:
        # for development, leave intermediate tmp file
        shutil.copy(t_path2, t_path1)
        logger.info('copied %r -> %r', t_path2, t_path1)
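The aligner argument is any callable taking (si, aligner_data) and mutating the StreamItem in place. The stub below is purely illustrative (the field value and annotator_id key are assumptions, not from the source) and only shows the expected signature.

def noop_aligner(si, aligner_data):
    ## a real aligner would reconcile token/label offsets here; this one
    ## just marks the item so the in-place rewrite is observable
    si.source_metadata['aligned_by'] = aligner_data.get('annotator_id', 'unknown')

## _aligner_core('/tmp/some_chunk.sc', noop_aligner, {'cleanup_tmp_files': True})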
Example #8
    def __call__(self, chunk_path):
        '''
        batch-type transform stage: reads a chunk from chunk_path, and
        replaces it with a new chunk at the same path
        '''
        ## make a new output chunk at a temporary path
        tmp_chunk_path = chunk_path + '_'
        t_chunk = Chunk(path=tmp_chunk_path, mode='wb')

        for num, si in enumerate(Chunk(path=chunk_path)):
            if num < self.config['max_items']:
                t_chunk.add(si)
            else:
                break

        ## flush to disk
        t_chunk.close()

        ## atomic rename new chunk file into place
        os.rename(tmp_chunk_path, chunk_path)
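The __call__ above clearly lives on a batch-stage object that carries a max_items setting in its config. Below is a hedged sketch of what the enclosing class and its invocation might look like; the class name truncate_items and the config layout are assumptions for illustration.

class truncate_items(object):
    def __init__(self, config):
        self.config = config

    def __call__(self, chunk_path):
        ## copy at most max_items StreamItems into a temp chunk,
        ## then atomically rename it over the original path
        tmp_chunk_path = chunk_path + '_'
        t_chunk = Chunk(path=tmp_chunk_path, mode='wb')
        for num, si in enumerate(Chunk(path=chunk_path)):
            if num < self.config['max_items']:
                t_chunk.add(si)
            else:
                break
        t_chunk.close()
        os.rename(tmp_chunk_path, chunk_path)

## stage = truncate_items({'max_items': 500})
## stage('/tmp/some_chunk.sc')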
    #    )

    ## To properly represent a tagger's output in a StreamItem, you
    ## should populate these fields:
    #si.body.sentences[tagger_id] = [Sentence(....) for ... in sentence_builder]
    ## Serif can convert serifxml into this structure.

    ## now that you have populated this StreamItem, add it to the
    ## chunk file, and go to the next StreamItem
    ch.add(si)

    print 'added StreamItem.stream_id = %s from date_hour = %s' % (
        si.stream_id, get_date_hour(si))

## after adding all the StreamItems, close the chunk:
ch.close()

## Typically, chunk files should be limited to about 500 documents or
## smaller.  There are several nice pythonic techniques for making
## many chunk files, ask us for examples to suit your circumstances.

## Typically, all of the StreamItems in a chunk file have stream_times
## from the same hour in history.  That is, if you call
## get_date_hour(si) you should get the same string for every
## StreamItem in the chunk file.

## Organizing a large number of documents to meet these requirements
## can take some work.  Post an issue ticket if you want to discuss
## your needs.

print 'saved a file to %s with md5 sum: %s' % (output_path, ch.md5_hexdigest)
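The comments above describe two conventions: keep chunk files to roughly 500 StreamItems, and keep every StreamItem in a chunk within the same date_hour. The sketch below is one possible way to honor both, assuming the same Chunk and get_date_hour helpers used above; the rolling logic and the file naming are illustrative, not prescribed by the source.

def write_rolled_chunks(stream_items, out_dir, max_per_chunk=500):
    chunk, current_hour, count = None, None, 0
    for si in stream_items:
        hour = get_date_hour(si)
        ## start a new chunk whenever the hour changes or the current
        ## chunk reaches max_per_chunk items
        if chunk is None or hour != current_hour or count >= max_per_chunk:
            if chunk is not None:
                chunk.close()
            path = os.path.join(out_dir, '%s-%s.sc' % (hour, uuid.uuid1()))
            chunk = Chunk(path, mode='wb')
            current_hour, count = hour, 0
        chunk.add(si)
        count += 1
    if chunk is not None:
        chunk.close()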
                )

        si.source_metadata['lang'] = pe.lang[0].code
        si.source_metadata['author'] = json.dumps(dict(
            name=pe.author[0].name,
            email=pe.author[0].email,
            link=pe.author[0].link[0].href,
        ))
        si.source = entry.source.publisher_type

        yield si


if __name__ == '__main__':
    #import sys
    #from _handle_unconvertible_spinn3r import handle_unconvertible_spinn3r as hus
    #map(hus, _generate_stream_items(sys.stdin.read()))

    o_chunk = Chunk('/tmp/foo.sc', mode='wb')
    for si in _generate_stream_items(sys.stdin.read()):
        print '---post smoosh raw: %s --' % si.stream_id
        print si.body.raw
        print si.stream_id

        if si.stream_id == '1345928297-da71cfa833ce8218684b6dab152dd69b':
            o_chunk.add( si )

    o_chunk.close()