import logging
import uuid

import kvlayer
import streamcorpus
import yakonfig

# The reader/writer stages under test live in streamcorpus_pipeline (import
# path assumed from the package layout):
from streamcorpus_pipeline._kvlayer import from_kvlayer, to_kvlayer
# get_test_v0_3_0_chunk_path is a test-suite helper (exact location assumed):
from streamcorpus_pipeline.tests._test_data import get_test_v0_3_0_chunk_path

logger = logging.getLogger(__name__)


def test_kvlayer_simple(configurator, tmpdir):
    si = streamcorpus.make_stream_item('2000-01-01T12:34:00.000123Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'stream_items': 2})
        print(repr(list(kvlclient.scan_keys('stream_items'))))
        for (k, v) in kvlclient.get(
                'stream_items',
                (uuid.UUID(int=946730040),
                 uuid.UUID(hex='985c1e3ed73256cd9a399919fe93cf76'))):
            assert v is not None

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
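
# The two magic values in the kvlclient.get() call above come from the stream
# item itself: 946730040 is 2000-01-01T12:34:00Z as integer epoch ticks, and
# the hex string is the item's doc_id, which streamcorpus derives as the md5
# hex digest of abs_url. A minimal sketch of that key derivation (helper name
# is hypothetical; the key layout is taken from the test above):
def stream_item_key(si):
    return (uuid.UUID(int=int(si.stream_time.epoch_ticks)),
            uuid.UUID(hex=si.doc_id))
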
def chunks(configurator, test_data_dir, overlay={}):
    with configurator(overlay):
        path = get_test_v0_3_0_chunk_path(test_data_dir)
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'to_kvlayer')
        writer = to_kvlayer(config)

        ## name_info and i_str are not used by the writer
        i_str = ''
        name_info = {}
        writer(path, name_info, i_str)

        client = kvlayer.client()
        client.setup_namespace({'stream_items': 2,
                                'stream_items_doc_id_epoch_ticks': 2,
                                'stream_items_with_source': 2})
        yield path, client
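
# chunks() is written as a generator so it can back a pytest fixture that
# loads the test chunk once and hands (path, client) to each test; a sketch,
# assuming pytest (the fixture name is hypothetical):
import pytest

@pytest.fixture
def loaded_chunks(configurator, test_data_dir):
    for value in chunks(configurator, test_data_dir):
        yield value
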
def test_kvlayer_negative(configurator, tmpdir):
    si = streamcorpus.make_stream_item('1969-07-20T20:18:00.000000Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
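
# "negative" refers to the timestamp: 1969-07-20 predates the Unix epoch, so
# the item's epoch_ticks is negative, and the write/read round trip has to
# preserve the signed value in its key encoding. A quick check (sketch):
si = streamcorpus.make_stream_item('1969-07-20T20:18:00.000000Z',
                                   'test://test.stream.item/')
assert si.stream_time.epoch_ticks < 0
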
def test_kvlayer_extractor_and_loader(config):
    path = get_test_v0_3_0_chunk_path()
    loader = to_kvlayer(config)
    
    ## name_info and i_str are not used by the loader
    i_str = ''
    name_info = {}
    loader(path, name_info, i_str)

    ## check that index table was created
    all_doc_ids = set()
    all_epoch_ticks = set()
    for (doc_id, epoch_ticks), empty_data in loader.client.scan(
            'stream_items_doc_id_epoch_ticks'):
        all_doc_ids.add(doc_id)
        all_epoch_ticks.add(epoch_ticks)
    all_doc_ids = sorted(all_doc_ids)
    all_epoch_ticks = sorted(all_epoch_ticks)
    logger.info('%d doc_ids', len(all_doc_ids))

    ## make an extractor
    extractor = from_kvlayer(config)

    ## test it with different i_str inputs:
    for i_str in ['',
                  '0,,%d,' % 10**10,
                  '%d,%s,%d,%s' % (all_epoch_ticks[0], all_doc_ids[0],
                                   all_epoch_ticks[-1], all_doc_ids[-1])]:
        stream_ids = sorted(si.stream_id for si in extractor(i_str))
        _input_chunk_ids = [si.stream_id for si in streamcorpus.Chunk(path)]
        input_chunk_ids = sorted(set(_input_chunk_ids))
        logger.info('%d inserts, %d unique',
                    len(_input_chunk_ids), len(input_chunk_ids))
        assert len(input_chunk_ids) == len(stream_ids)
        assert input_chunk_ids == stream_ids
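
# The i_str values exercised above encode a from_kvlayer scan range as
# "start_ticks,start_doc_id,end_ticks,end_doc_id" (a reading taken from the
# loop itself, not from library docs): '' scans the whole stream_items table,
# '0,,10000000000,' leaves both doc_id bounds open, and the last form runs
# from the smallest to the largest key the loader wrote, so all three are
# expected to return every loaded item.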