import logging
import uuid
from contextlib import contextmanager

import kvlayer
import streamcorpus
import yakonfig
from streamcorpus_pipeline._kvlayer import from_kvlayer, to_kvlayer

logger = logging.getLogger(__name__)

## `configurator`, `tmpdir`, `test_data_dir`, `config`, and
## get_test_v0_3_0_chunk_path are assumed to come from this test
## suite's shared fixtures and helpers


def test_kvlayer_simple(configurator, tmpdir):
    si = streamcorpus.make_stream_item('2000-01-01T12:34:00.000123Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        ## verify the item was stored under its
        ## (epoch_ticks, doc_id) uuid key pair
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'stream_items': 2})
        print repr(list(kvlclient.scan_keys('stream_items')))
        for (k, v) in kvlclient.get(
                'stream_items',
                (uuid.UUID(int=946730040),
                 uuid.UUID(hex='985c1e3ed73256cd9a399919fe93cf76'))):
            assert v is not None

        ## read the item back out and compare it to the original
        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
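
## a minimal sketch of where the magic key pair above comes from,
## assuming stream_items is keyed on (uuid(epoch_ticks), uuid(doc_id)):
## 2000-01-01T12:34:00Z is 946730040 epoch seconds, and streamcorpus
## derives doc_id from the md5 hex digest of abs_url (this helper is
## hypothetical, not part of the pipeline)
def stream_item_key(si):
    return (uuid.UUID(int=int(si.stream_time.epoch_ticks)),
            uuid.UUID(hex=si.doc_id))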
@contextmanager
def chunks(configurator, test_data_dir, overlay={}):
    ## used as a context manager by tests, hence the decorator:
    ## writes the test chunk into kvlayer, then yields the chunk
    ## path plus a client with the namespace already set up
    with configurator(overlay):
        path = get_test_v0_3_0_chunk_path(test_data_dir)
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'to_kvlayer')
        writer = to_kvlayer(config)
        ## name_info and i_str are not used by the writer
        i_str = ''
        name_info = {}
        writer(path, name_info, i_str)
        client = kvlayer.client()
        client.setup_namespace({'stream_items': 2,
                                'stream_items_doc_id_epoch_ticks': 2,
                                'stream_items_with_source': 2})
        yield path, client
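
## a minimal sketch of consuming chunks() (hypothetical test, not in
## the original suite): the yielded client can scan the tables directly
def test_chunks_sketch(configurator, test_data_dir):
    with chunks(configurator, test_data_dir) as (path, client):
        assert len(list(client.scan_keys('stream_items'))) > 0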
def test_kvlayer_negative(configurator, tmpdir):
    ## a stream time before the unix epoch exercises negative
    ## epoch_ticks in the key encoding
    si = streamcorpus.make_stream_item('1969-07-20T20:18:00.000000Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
def test_kvlayer_extractor_and_loader(config):
    path = get_test_v0_3_0_chunk_path()
    loader = to_kvlayer(config)
    ## name_info and i_str are not used by the loader
    i_str = ''
    name_info = {}
    loader(path, name_info, i_str)

    ## check that the index table was created
    all_doc_ids = set()
    all_epoch_ticks = set()
    for (doc_id, epoch_ticks), empty_data in loader.client.scan(
            'stream_items_doc_id_epoch_ticks'):
        all_doc_ids.add(doc_id)
        all_epoch_ticks.add(epoch_ticks)
    all_doc_ids = sorted(all_doc_ids)
    all_epoch_ticks = sorted(all_epoch_ticks)
    logger.info('%d doc_ids', len(all_doc_ids))

    ## make an extractor
    extractor = from_kvlayer(config)

    ## test it with different i_str inputs: empty (scan everything),
    ## an open-ended range, and an exact
    ## epoch_ticks,doc_id,epoch_ticks,doc_id range
    for i_str in ['',
                  '0,,%d,' % 10**10,
                  '%d,%s,%d,%s' % (all_epoch_ticks[0], all_doc_ids[0],
                                   all_epoch_ticks[-1], all_doc_ids[-1])]:
        stream_ids = []
        for si in extractor(i_str):
            stream_ids.append(si.stream_id)
        _input_chunk_ids = [si.stream_id for si in streamcorpus.Chunk(path)]
        input_chunk_ids = list(set(_input_chunk_ids))
        logger.info('%d inserts, %d unique',
                    len(_input_chunk_ids), len(input_chunk_ids))
        input_chunk_ids.sort()
        stream_ids.sort()
        assert len(input_chunk_ids) == len(stream_ids)
        assert input_chunk_ids == stream_ids
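
## a minimal sketch of building the from_kvlayer i_str used above
## (hypothetical helper): four comma-separated fields,
## start_ticks,start_doc_id,end_ticks,end_doc_id, where an empty
## field leaves that end of the range open, e.g.
## make_i_str(0, end_ticks=10**10) == '0,,10000000000,'
def make_i_str(start_ticks='', start_doc_id='', end_ticks='', end_doc_id=''):
    return '%s,%s,%s,%s' % (start_ticks, start_doc_id,
                            end_ticks, end_doc_id)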