def main(input_fnames, output_fname):
    """Merge several MTBL files into one Snappy-compressed MTBL file.

    input_fnames  -- iterable of paths to the MTBL files to read.
    output_fname  -- path of the MTBL file to write.

    Entries are combined through an mtbl merger constructed with the
    module-level merge_func (defined elsewhere in this file).
    """
    merger = mtbl.merger(merge_func)
    destination = mtbl.writer(
        output_fname,
        compression=mtbl.COMPRESSION_SNAPPY,
    )

    # Register every input file as a source of the merger.
    for path in input_fnames:
        source = mtbl.reader(path)
        merger.add_reader(source)

    # Let the merger push its merged output into the writer in one call.
    merger.write(destination)
def main(fname, num_keys):
    """Generate num_keys random entries and write them to the MTBL file fname.

    Each key is a random integer in [0, sys.maxint] rendered as a
    zero-padded decimal of at least 20 digits; each value is one random
    lowercase ASCII letter repeated 1 to 50 times.  Entries are fed to an
    mtbl sorter (built with the module-level merge_func) and then written
    out Snappy-compressed.  Progress and a final summary go to stderr.
    """
    def report(template, entries, nbytes, seconds, rate_base):
        # Fill the four %s slots of template: entry count, megabytes,
        # elapsed seconds and entries-per-second, all locale-formatted.
        sys.stderr.write(template % (
            locale.format('%d', entries, grouping=True),
            locale.format('%d', nbytes / megabyte, grouping=True),
            locale.format('%f', seconds, grouping=True),
            locale.format('%d', rate_base / seconds, grouping=True)))

    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    started = time.time()
    previous_report = started
    total_bytes = 0
    count = 0

    while count < num_keys:
        count += 1
        key = '%020d' % random.randint(0, sys.maxint)
        letter = random.choice(string.ascii_lowercase)
        val = letter * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)

        # Only every report_interval-th entry triggers a progress line.
        if (count % report_interval) != 0:
            continue
        now = time.time()
        interval_secs = now - previous_report
        previous_report = now
        report(
            'generated %s entries (%s MB) in %s seconds, %s entries/second\n',
            count, total_bytes, interval_secs, report_interval)

    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)

    # The summary covers generation plus the final write.
    total_secs = time.time() - started
    report(
        'wrote %s total entries (%s MB) in %s seconds, %s entries/second\n',
        count, total_bytes, total_secs, count)
def main(output_fname, input_fnames):
    """Merge several MTBL files into output_fname, copying entry by entry.

    output_fname  -- path of the Snappy-compressed MTBL file to write.
    input_fnames  -- iterable of paths to the MTBL files to read.

    The merged stream is produced by an mtbl merger constructed with the
    module-level merge_func (defined elsewhere in this file); every
    (key, value) pair it yields is stored in the writer, which is closed
    explicitly at the end.
    """
    merger = mtbl.merger(merge_func)
    destination = mtbl.writer(
        output_fname,
        compression=mtbl.COMPRESSION_SNAPPY,
    )

    # Register every input file as a source of the merger.
    for path in input_fnames:
        source = mtbl.reader(path)
        merger.add_reader(source)

    # Copy the merged entries into the output one at a time.
    for key, value in merger.iteritems():
        destination[key] = value

    destination.close()
def main(txt_fname, mtbl_fname):
    """Extract the words of a Project Gutenberg text into an MTBL file.

    txt_fname   -- path of the Project Gutenberg plain-text file to read.
    mtbl_fname  -- path of the Snappy-compressed MTBL file to write.

    Everything up to and including the '*** START OF THIS PROJECT
    GUTENBERG EBOOK' line, plus the five lines after it, is skipped.
    Reading stops at the first end-of-book marker line.  Every
    whitespace-separated token in between is stripped of surrounding
    punctuation, lower-cased and stored in an mtbl sorter with the value
    varint_encode(1); repeated words are presumably combined by the
    module-level merge_func (defined elsewhere -- confirm there).

    Raises ValueError if the start marker is never found.  In that case
    no sorter or writer is created, so no output file is left behind.
    """
    start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK'
    end_markers = (
        'End of the Project Gutenberg EBook',
        '*** END OF THIS PROJECT GUTENBERG EBOOK',
    )

    # The context manager closes the input even if parsing fails.
    with open(txt_fname) as txt:
        # trim header
        # readline() returns '' only at end of file, so using it as the
        # sentinel ends the scan there instead of looping forever when the
        # marker is missing.
        for line in iter(txt.readline, ''):
            if line.startswith(start_marker):
                break
        else:
            raise ValueError(
                'start marker %r not found in %s' % (start_marker, txt_fname))

        # Skip the five lines that follow the start marker.
        for _ in range(5):
            txt.readline()

        # Created only once the input is known to be usable.
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        for line in txt:
            if line.startswith(end_markers):
                break
            for tok in line.strip().split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
def main(txt_fname, mtbl_fname):
    """Extract the words of a Project Gutenberg text into an MTBL file.

    txt_fname   -- path of the Project Gutenberg plain-text file to read.
    mtbl_fname  -- path of the Snappy-compressed MTBL file to write.

    Lines up to and including the "*** START OF THIS PROJECT GUTENBERG
    EBOOK" line are discarded, as are the five lines after it.  Reading
    stops at the first end-of-book marker line.  Each whitespace-separated
    token in between is stripped of surrounding punctuation, lower-cased
    and stored in an mtbl sorter with the value varint_encode(1); repeated
    words are presumably combined by the module-level merge_func (defined
    elsewhere -- confirm there).

    Raises ValueError if the start marker is never found.  In that case
    no sorter or writer is created, so no output file is left behind.
    """
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_markers = (
        "End of the Project Gutenberg EBook",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
    )

    # The context manager closes the input even if parsing fails.
    with open(txt_fname) as txt:
        # trim header
        # readline() returns "" only at end of file; stopping on that
        # sentinel prevents an endless loop when the marker is absent.
        header_found = False
        for line in iter(txt.readline, ""):
            if line.startswith(start_marker):
                header_found = True
                break
        if not header_found:
            raise ValueError(
                "start marker %r not found in %s" % (start_marker, txt_fname)
            )

        # Skip the five lines that follow the start marker.
        for _ in range(5):
            txt.readline()

        # Created only once the input is known to be usable.
        sorter = mtbl.sorter(merge_func)
        writer = mtbl.writer(mtbl_fname, compression=mtbl.COMPRESSION_SNAPPY)

        for line in txt:
            if line.startswith(end_markers):
                break
            for tok in line.strip().split():
                word = tok.strip(string.punctuation).lower()
                sorter[word] = mtbl.varint_encode(1)

    sorter.write(writer)
def main(fname, num_keys):
    """Write a random subset of sequential keys to the MTBL file fname.

    The counter runs from 1 to num_keys.  Each counter value is kept only
    when random.random() returns at least 0.5; a kept value becomes a key
    rendered as a zero-padded decimal of at least 10 digits, with a value
    made of one random lowercase ASCII letter repeated 1 to 50 times.
    Entries go straight to a Snappy-compressed mtbl writer.  Progress and
    a final summary (counting written entries only) go to stderr.
    """
    def report(template, entries, nbytes, seconds, rate_base):
        # Fill the four %s slots of template: entry count, megabytes,
        # elapsed seconds and entries-per-second, all locale-formatted.
        sys.stderr.write(template % (
            locale.format('%d', entries, grouping=True),
            locale.format('%d', nbytes / megabyte, grouping=True),
            locale.format('%f', seconds, grouping=True),
            locale.format('%d', rate_base / seconds, grouping=True)))

    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    started = time.time()
    previous_report = started
    total_bytes = 0
    count = 0   # candidate keys considered so far
    total = 0   # entries actually written

    while count < num_keys:
        count += 1

        if random.random() >= 0.5:
            total += 1
            key = '%010d' % count
            letter = random.choice(string.ascii_lowercase)
            val = letter * random.randint(1, 50)
            writer[key] = val
            total_bytes += len(key) + len(val)

        # Progress is paced by candidates considered, not entries written.
        if (count % report_interval) != 0:
            continue
        now = time.time()
        interval_secs = now - previous_report
        previous_report = now
        report(
            'wrote %s entries (%s MB) in %s seconds, %s entries/second\n',
            total, total_bytes, interval_secs, report_interval)

    total_secs = time.time() - started
    report(
        'wrote %s total entries (%s MB) in %s seconds, %s entries/second\n',
        total, total_bytes, total_secs, total)
def main(fname, num_keys):
    """Fill the MTBL file fname with num_keys randomly keyed entries.

    Keys are random integers in [0, sys.maxint] formatted as zero-padded
    decimals of at least 20 digits; values are one random lowercase ASCII
    letter repeated 1 to 50 times.  All entries are handed to an mtbl
    sorter (built with the module-level merge_func) before being written
    Snappy-compressed.  Progress lines and a closing summary are printed
    to stderr.
    """
    progress_line = (
        'generated %s entries (%s MB) in %s seconds, %s entries/second\n')
    summary_line = (
        'wrote %s total entries (%s MB) in %s seconds, %s entries/second\n')

    sorter = mtbl.sorter(merge_func)
    writer = mtbl.writer(fname, compression=mtbl.COMPRESSION_SNAPPY)

    t_start = time.time()
    t_last = t_start
    total_bytes = 0
    count = 0

    while count < num_keys:
        count += 1
        key = '%020d' % random.randint(0, sys.maxint)
        val = random.choice(string.ascii_lowercase) * random.randint(1, 50)
        sorter[key] = val
        total_bytes += len(key) + len(val)

        if (count % report_interval) == 0:
            # Measure the time spent on the most recent batch only.
            t_now = time.time()
            batch_secs = t_now - t_last
            t_last = t_now
            fields = (
                locale.format('%d', count, grouping=True),
                locale.format('%d', total_bytes / megabyte, grouping=True),
                locale.format('%f', batch_secs, grouping=True),
                locale.format(
                    '%d', report_interval / batch_secs, grouping=True),
            )
            sys.stderr.write(progress_line % fields)

    sys.stderr.write('writing to output file %s\n' % fname)
    sorter.write(writer)

    # The overall rate includes the time spent in sorter.write().
    total_secs = time.time() - t_start
    fields = (
        locale.format('%d', count, grouping=True),
        locale.format('%d', total_bytes / megabyte, grouping=True),
        locale.format('%f', total_secs, grouping=True),
        locale.format('%d', count / total_secs, grouping=True),
    )
    sys.stderr.write(summary_line % fields)