def test_sort(freezed_random, chunk_size, workers_cnt):
    """Sorting a shuffled range of newline-terminated numbers restores order.

    `freezed_random` is a seeded RNG fixture, so the shuffle is deterministic;
    `chunk_size` and `workers_cnt` are parametrized fixtures exercising the
    external-sort configuration space.
    """
    sorted_data = [f"{n}\n" for n in range(100, 200)]
    unsorted_data = sorted_data.copy()
    freezed_random.shuffle(unsorted_data)

    reader = io.BytesIO(''.join(unsorted_data).encode())
    writer = io.BytesIO()
    es.sort(reader, writer, chunk_size=chunk_size, workers_cnt=workers_cnt)

    # Rewind the output buffer before comparing against the expected bytes.
    writer.seek(0)
    assert writer.read() == ''.join(sorted_data).encode()
class UrlSerializer(es.Serializer):
    """Writes each parsed URL back out as one newline-terminated byte string."""

    def write(self, item):
        # urlparse was fed bytes on the read side, so geturl() yields bytes.
        self._writer.write(item.geturl() + b"\n")


class UrlDeserializer(es.Deserializer):
    """Reads the input line by line, parsing each line into a Url tuple."""

    def read(self):
        line = self._reader.readline()
        if not line:
            # Empty read means end of stream.
            return None
        return Url._make(urllib.parse.urlparse(line.strip()))


logging.basicConfig(
    level=logging.DEBUG,
    format='[%(levelname)-8s] %(asctime)-15s (%(name)s): %(message)s',
)

with open('/home/user/urls.txt', 'rb') as unsorted_file, \
        open('/home/user/urls.sorted.txt', 'wb') as sorted_file:
    es.sort(
        unsorted_file,
        sorted_file,
        chunk_size=1_000_000,
        Serializer=UrlSerializer,
        Deserializer=UrlDeserializer,
        workers_cnt=4,
    )
# NOTE(review): `write` takes `self` — this looks like the tail of a
# CSVSerializer class whose header lies outside this view (es.sort below
# passes Serializer=CSVSerializer); confirm against the full file.
def write(self, item):
    """Serialize one CSV row by delegating to the wrapped csv.writer."""
    return self._writer.writerow(item)


class CSVDeserializer(es.Deserializer):
    """Deserializer yielding parsed CSV rows from a binary reader."""

    def __init__(self, reader):
        # The csv module needs a text stream, so wrap the binary reader.
        super().__init__(csv.reader(io.TextIOWrapper(reader)))

    def read(self):
        # NOTE(review): raises StopIteration at EOF instead of returning
        # None — presumably es.sort tolerates that; verify the contract.
        return next(self._reader)


logging.basicConfig(
    level=logging.DEBUG,
    format='[%(levelname)-8s] %(asctime)-15s (%(name)s): %(message)s',
)

with open('unsorted.txt', 'rb') as unsorted_file, open('sorted.txt', 'wb') as sorted_file:
    # The CSV header must not take part in the sort; copy it through first.
    sorted_file.write(unsorted_file.readline())
    es.sort(
        reader=unsorted_file,
        writer=sorted_file,
        chunk_size=10_000_000,
        Serializer=CSVSerializer,
        Deserializer=CSVDeserializer,
        workers_cnt=4,
    )
# Command-line entry point: parse options, configure logging, and run the
# external sort streaming from --infile to --outfile.
parser = argparse.ArgumentParser(description='External sort.')
parser.add_argument('-l', '--loglevel', dest='loglevel',
                    choices=['debug', 'info', 'warning', 'error'], default='info',
                    help='logging level')
parser.add_argument('-i', '--infile', dest='infile', required=True,
                    help='input file to be sorted')
parser.add_argument('-o', '--outfile', dest='outfile', required=True,
                    help='file the result will be saved in')
parser.add_argument('-b', '--chunk_size', dest='chunk_size', type=int, required=True,
                    help='number of elements that will be sorted in the main memory')
# Fixed help texts: "will consumed" -> "will be consumed".
parser.add_argument('-m', '--chunk_mem', dest='chunk_mem', type=int,
                    help='max memory size that will be consumed by one worker')
parser.add_argument('-t', '--total_mem', dest='total_mem', type=int,
                    help='max total memory size that will be consumed by all the workers')
parser.add_argument('-w', '--workers', dest='workers', type=int,
                    help='number of workers sorting will be performed in (default: number of cpu cores)')
parser.add_argument('--tmp_dir', dest='tmp_dir',
                    help='directory temporary files will be created in')
parser.add_argument('-v', '--version', action='version', version=ext_sort.__version__)

args = parser.parse_args()

# LOGGER_FORMAT is defined elsewhere in this module.
logging.basicConfig(level=getattr(logging, args.loglevel.upper()), format=LOGGER_FORMAT)

# Binary mode on both ends: ext_sort.sort works on byte streams.
with open(args.infile, 'rb') as reader, open(args.outfile, 'wb') as writer:
    ext_sort.sort(
        reader,
        writer,
        chunk_size=args.chunk_size,
        chunk_mem=args.chunk_mem,
        total_mem=args.total_mem,
        workers_cnt=args.workers,
        tmp_dir=args.tmp_dir,
    )