Пример #1
0
def test_sort(freezed_random, chunk_size, workers_cnt):
    """Check that ``es.sort`` turns shuffled numeric lines back into order.

    The input is the lines ``"100\\n"`` .. ``"199\\n"`` shuffled with the
    ``freezed_random`` fixture; the output stream must contain the same
    lines in their original ascending order.
    """
    ordered_lines = [f"{number}\n" for number in range(100, 200)]

    # Shuffle a copy so the ordered list stays available as the expectation.
    shuffled_lines = list(ordered_lines)
    freezed_random.shuffle(shuffled_lines)

    source = io.BytesIO(''.join(shuffled_lines).encode())
    sink = io.BytesIO()

    es.sort(source, sink, chunk_size=chunk_size, workers_cnt=workers_cnt)

    # getvalue() returns the whole buffer regardless of the stream position.
    assert sink.getvalue() == ''.join(ordered_lines).encode()
Пример #2
0
class UrlSerializer(es.Serializer):
    """Serializer that writes every item as one newline-terminated URL."""

    def write(self, item):
        """Write ``item.geturl()`` followed by ``\\n`` to ``self._writer``.

        NOTE(review): bytes ``%``-formatting only accepts bytes-like values,
        so ``item.geturl()`` presumably returns ``bytes`` here -- confirm
        against the ``Url`` type used by the deserializer.
        """
        url = item.geturl()
        self._writer.write(b"%s\n" % url)


class UrlDeserializer(es.Deserializer):
    """Deserializer that turns each input line into a ``Url`` item."""

    def read(self):
        """Read one line and return it parsed as a ``Url``.

        Returns ``None`` when ``readline()`` yields an empty value, i.e. the
        reader has no more data.
        """
        raw_line = self._reader.readline()
        if raw_line:
            # Drop surrounding whitespace (including the newline) before parsing.
            components = urllib.parse.urlparse(raw_line.strip())
            return Url._make(components)

        return None


# Log everything from DEBUG up, showing level, timestamp, logger name and message.
logging.basicConfig(
    format='[%(levelname)-8s] %(asctime)-15s (%(name)s): %(message)s',
    level=logging.DEBUG,
)

# Both files are opened in binary mode; the URL (de)serializer classes
# translate between raw lines and items.
with open('/home/user/urls.txt', 'rb') as unsorted_file, \
        open('/home/user/urls.sorted.txt', 'wb') as sorted_file:
    es.sort(
        unsorted_file,
        sorted_file,
        Serializer=UrlSerializer,
        Deserializer=UrlDeserializer,
        chunk_size=1_000_000,
        workers_cnt=4,
    )
    def write(self, item):
        """Write ``item`` through ``self._writer.writerow`` and return its result."""
        outcome = self._writer.writerow(item)
        return outcome


class CSVDeserializer(es.Deserializer):
    """Deserializer that yields items as rows parsed by ``csv.reader``."""

    def __init__(self, reader):
        """Wrap ``reader`` in a text layer and pass a ``csv.reader`` to the base.

        NOTE(review): the base class presumably stores its argument as
        ``self._reader``, which ``read`` relies on -- confirm.
        """
        text_stream = io.TextIOWrapper(reader)
        super().__init__(csv.reader(text_stream))

    def read(self):
        """Return the next row from ``self._reader``.

        ``next`` is called without a default, so ``StopIteration`` propagates
        once the rows are exhausted.
        """
        row = next(self._reader)
        return row


# Log everything from DEBUG up, showing level, timestamp, logger name and message.
logging.basicConfig(
    format='[%(levelname)-8s] %(asctime)-15s (%(name)s): %(message)s',
    level=logging.DEBUG,
)

with open('unsorted.txt', 'rb') as unsorted_file:
    with open('sorted.txt', 'wb') as sorted_file:
        # Copy the first line (the CSV header) to the output untouched, so
        # only the remaining lines are handed to the sorter.
        sorted_file.write(unsorted_file.readline())

        es.sort(
            reader=unsorted_file,
            writer=sorted_file,
            Serializer=CSVSerializer,
            Deserializer=CSVDeserializer,
            chunk_size=10_000_000,
            workers_cnt=4,
        )
Пример #4
0
# Command-line interface: input/output files and chunk size are required;
# logging level, memory limits, worker count and temp directory are optional.
parser = argparse.ArgumentParser(description='External sort.')
parser.add_argument(
    '-l', '--loglevel',
    dest='loglevel',
    choices=['debug', 'info', 'warning', 'error'],
    default='info',
    help='logging level',
)
parser.add_argument(
    '-i', '--infile',
    dest='infile',
    required=True,
    help='input file to be sorted',
)
parser.add_argument(
    '-o', '--outfile',
    dest='outfile',
    required=True,
    help='file the result will be saved in',
)
parser.add_argument(
    '-b', '--chunk_size',
    dest='chunk_size',
    type=int,
    required=True,
    help='number of elements that will be sorted in the main memory',
)
parser.add_argument(
    '-m', '--chunk_mem',
    dest='chunk_mem',
    type=int,
    help='max memory size that will consumed by one worker',
)
parser.add_argument(
    '-t', '--total_mem',
    dest='total_mem',
    type=int,
    help='max total memory size that will consumed by all the workers',
)
parser.add_argument(
    '-w', '--workers',
    dest='workers',
    type=int,
    help='number of workers sorting will be performed in (default: number of cpu cores)',
)
parser.add_argument(
    '--tmp_dir',
    dest='tmp_dir',
    help='directory temporary files will be created in',
)
# The built-in "version" action prints the package version and exits.
parser.add_argument(
    '-v', '--version',
    action='version',
    version=ext_sort.__version__,
)

args = parser.parse_args()

# The --loglevel choice ('debug', 'info', ...) maps onto the upper-case
# constant of the same name in the logging module.
logging.basicConfig(
    format=LOGGER_FORMAT,
    level=getattr(logging, args.loglevel.upper()),
)


# Both files are opened in binary mode and passed straight to the sorter
# together with the tuning options from the command line.
with open(args.infile, 'rb') as reader:
    with open(args.outfile, 'wb') as writer:
        ext_sort.sort(
            reader,
            writer,
            chunk_size=args.chunk_size,
            chunk_mem=args.chunk_mem,
            total_mem=args.total_mem,
            workers_cnt=args.workers,
            tmp_dir=args.tmp_dir,
        )