def keys_and_values():
    ## t_path, logger, and inverted_keys come from the enclosing scope
    total_mb = 0.
    for si in streamcorpus.Chunk(t_path):
        ## two-part key: (stream time, doc_id)
        key1 = uuid.UUID(int=si.stream_time.epoch_ticks)
        key2 = uuid.UUID(hex=si.doc_id)
        data = streamcorpus.serialize(si)
        errors, data = streamcorpus.compress_and_encrypt(data)
        assert not errors, errors
        total_mb += float(len(data)) / 2**20
        logger.info('%r, %r --> %d, %.3f', key1, key2, len(data), total_mb)
        yield (key1, key2), data
        ## record the reversed key pair for the inverted index
        inverted_keys.append(((key2, key1), r''))
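As a quick illustration of how the two-part key is built from a StreamItem's timestamp and document id, here is a self-contained sketch; both values below are invented for the example:

import uuid

epoch_ticks = 1327100000                        ## hypothetical seconds-since-epoch timestamp
doc_id = '6e2b5b3a8a7c4f0e9d1c2b3a4f5e6d7c'     ## hypothetical 32-character hex document id

key1 = uuid.UUID(int=epoch_ticks)   ## integer is zero-padded into 128 bits, so UUID order matches time order
key2 = uuid.UUID(hex=doc_id)
print(key1, key2)

Yielding (key1, key2) puts time first, so scanning on the first key dimension walks stream items in time order, while the appended ((key2, key1), '') pairs appear intended as an inverted mapping for lookups by document id.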
def keys_and_values():
    ## t_path and the indexes dict come from the enclosing scope
    for si in streamcorpus.Chunk(t_path):
        key1 = uuid.UUID(int=si.stream_time.epoch_ticks)
        key2 = uuid.UUID(hex=si.doc_id)
        data = streamcorpus.serialize(si)
        errors, data = streamcorpus.compress_and_encrypt(data)
        assert not errors, errors
        yield (key1, key2), data
        ## accumulate key/value pairs for each configured index table
        for ndx in indexes:
            if ndx == 'doc_id_epoch_ticks':
                kvp = ((key2, key1), r'')
            elif ndx == 'with_source':
                ## si.source can be None but we can't write None blobs to kvlayer
                if si.source:
                    kvp = ((key1, key2), si.source)
                else:
                    continue
            else:
                assert False, ('invalid index type ' + ndx)
            indexes[ndx].append(kvp)
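The generator and the per-index lists are presumably drained by the enclosing writer. The sketch below shows one way that could look, assuming a kvlayer-style client whose put() accepts a table name followed by ((key1, key2), value) tuples; the client construction and the table names are assumptions, not confirmed API:

import kvlayer   ## assumes kvlayer has already been configured (e.g. via yakonfig)

client = kvlayer.client()
client.put('stream_items', *list(keys_and_values()))   ## hypothetical table name
for ndx, kvps in indexes.items():
    if kvps:
        client.put('stream_items_' + ndx, *kvps)        ## hypothetical per-index tables

Note that keys_and_values() must be consumed completely (hence the list(...)) before the index lists are written, because entries are only appended to indexes while the generator runs.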