def test_serializer_is_pickable():
    """Pickle/depickle the serializer to make sure it can be passed
    as a parameter cross process boundaries when using futures"""
    s = PyArrowSerializer()
    deserialized_s = pickle.loads(pickle.dumps(s))

    expected = [{'a': np.asarray([1, 2], dtype=np.uint64)}]
    actual = deserialized_s.deserialize(deserialized_s.serialize(expected))
    np.testing.assert_array_equal(actual[0]['a'], expected[0]['a'])
Example 2
def _create_worker_pool(pool_type, workers_count, profiling_enabled, pyarrow_serialize):
    """Different worker pool implementation (in process none or thread-pool, out of process pool)"""
    if pool_type == WorkerPoolType.THREAD:
        worker_pool = ThreadPool(workers_count, profiling_enabled=profiling_enabled)
    elif pool_type == WorkerPoolType.PROCESS:
        worker_pool = ProcessPool(workers_count,
                                  serializer=PyArrowSerializer() if pyarrow_serialize else PickleSerializer())
    elif pool_type == WorkerPoolType.NONE:
        worker_pool = DummyPool()
    else:
        raise ValueError('Supported pool types are thread, process or dummy. Got {}.'.format(pool_type))
    return worker_pool
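
A minimal usage sketch for the helper above. It assumes WorkerPoolType and the pool classes are importable from the surrounding module; the argument values are illustrative only:

# Hypothetical call mirroring the signature above (not a verified public API):
worker_pool = _create_worker_pool(WorkerPoolType.PROCESS,
                                  workers_count=4,
                                  profiling_enabled=False,
                                  pyarrow_serialize=True)  # selects PyArrowSerializer for the process pool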
Example 3
    def __init__(self, workers_count, pyarrow_serialize=False):
        """Initializes a ProcessPool.

        This pool is different from standard Python pool implementations in that the workers are spawned
        without using fork. Some issues were observed with the JVM-based HDFS driver when the process was forked
        (HDFS could not be accessed from the forked worker if the driver had already been used in the parent process).

        :param workers_count: Number of processes to be spawned
        :param pyarrow_serialize: Use ``pyarrow.serialize`` serialization if True. ``pyarrow.serialize`` is much faster
          than pickling, but integer types (int8, uint8, etc.) are not yet handled by pyarrow serialization, so all
          integer types are currently converted to 'int'
        """
        self._workers = []
        self._ventilator_send = None
        self._control_sender = None
        self.workers_count = workers_count
        self._results_receiver_poller = None

        self._ventilated_items = 0
        self._ventilated_items_processed = 0
        self._ventilator = None
        self._serializer = PyArrowSerializer() if pyarrow_serialize else PickleSerializer()
Example 4
    def __init__(self, workers_count, pyarrow_serialize=False):
        """Initializes a ProcessPool.

        This pool is different from standard Python pool implementations in that the workers are spawned
        without using fork. Some issues were observed with the JVM-based HDFS driver when the process was forked
        (HDFS could not be accessed from the forked worker if the driver had already been used in the parent process).

        :param workers_count: Number of processes to be spawned
        :param pyarrow_serialize: Use ``pyarrow.serialize`` serialization if True. ``pyarrow.serialize`` is much faster
          than pickling, but does not support ``Decimal`` data types, converts int64 into int32 (and probably modifies
          some other types). We can not use this serialization by default, but allow users who know what they are
          doing to switch it on.
        """
        self._workers = []
        self._ventilator_send = None
        self._control_sender = None
        self._workers_count = workers_count
        self._results_receiver_poller = None

        self._ventilated_items = 0
        self._ventilated_items_processed = 0
        self._ventilator = None
        self._serializer = PyArrowSerializer() if pyarrow_serialize else None
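
A construction sketch based on the two docstrings above. It assumes ProcessPool is importable from the surrounding package; the worker counts are illustrative:

pool = ProcessPool(workers_count=4)                                # default serialization
fast_pool = ProcessPool(workers_count=4, pyarrow_serialize=True)   # faster, with the type caveats noted above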
def test_decimal():
    s = PyArrowSerializer()
    expected = [{'a': Decimal('1.2')}]
    actual = s.deserialize(s.serialize(expected))
    np.testing.assert_array_equal(actual[0]['a'], expected[0]['a'])

    expected = [{'a': [Decimal('1.2')]}]
    actual = s.deserialize(s.serialize(expected))
    np.testing.assert_array_equal(actual[0]['a'], expected[0]['a'])
def test_all_matrix_types():
    s = PyArrowSerializer()
    # The serializer is used with lists of dictionaries, or lists of dictionaries of dictionaries (ngram)
    serialized_values = [
        (np.int8, -127),
        (np.uint8, 255),
        (np.int16, -2**15),
        (np.uint16, 2**16 - 1),
        (np.int32, -2**31),
        (np.uint32, 2**32 - 1),
        (np.float16, 1.2),
        (np.float32, 1.2),
        (np.float64, 1.2),
        (np.string_, 'abc'),
        (np.unicode_, u'אבג'),
        (np.int64, -2**63),
        (np.uint64, 2**64 - 1),
    ]

    for type_factory, value in serialized_values:
        desired = [{'value': np.asarray(4 * [value], dtype=type_factory)}]
        actual = s.deserialize(s.serialize(desired))
        assert actual[0]['value'].dtype == desired[0]['value'].dtype
        np.testing.assert_array_equal(actual[0]['value'], desired[0]['value'])
Example 7
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread',
                workers_count=10,
                pyarrow_serialize=False,
                results_queue_size=50,
                shuffle_row_groups=True,
                shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None,
                shard_count=None,
                cache_type='null',
                cache_location=None,
                cache_size_limit=None,
                cache_row_size_estimate=None,
                cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                reader_engine='reader_v1',
                reader_engine_params=None):
    """
    Creates an instance of Reader for reading Petastorm datasets. A Petastorm dataset is a dataset generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset` context manager as explained
    `here <https://petastorm.readthedocs.io/en/latest/readme_include.html#generating-a-dataset>`_.

    See :func:`~petastorm.make_batch_reader` to read from a Parquet store that was not generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``
        or ``'s3://bucket/mydataset'``.
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
            or an NGram object, in which case an NGram of the specified fields is returned.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This is only used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
        thread reader pool type.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example, if you specify 2, each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a single row and must return a boolean value indicating whether to include it in the results.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store the entire dataset (or a partition of a dataset if shard_count is used). The default is a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2'). 'reader_v1'
        (the default value) selects a stable reader implementation.
    :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
        implementation constructor chosen by ``reader_engine`` argument.  You should not use this parameter unless
        you are fine-tuning the reader.
    :return: A :class:`Reader` object
    """

    if dataset_url is None or not isinstance(dataset_url, six.string_types):
        raise ValueError("""dataset_url must be a string""")

    dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
    logger.debug('dataset_url: %s', dataset_url)

    resolver = FilesystemResolver(dataset_url, hdfs_driver=hdfs_driver)
    filesystem = resolver.filesystem()
    dataset_path = resolver.get_dataset_path()

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit,
                               cache_row_size_estimate, **cache_extra_settings
                               or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    # Fail if this is a non-petastorm dataset. Typically, a Parquet store will have hundreds of thousands of rows in
    # a single rowgroup. Using PyDictReaderWorker or ReaderV2 implementations is very inefficient as they process
    # data on a row-by-row basis. ArrowReaderWorker (used by make_batch_reader) is much more efficient in these cases.
    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url)
    except PetastormMetadataError:
        raise RuntimeError(
            'Currently make_reader supports reading only Petastorm datasets. '
            'To read from a non-Petastorm Parquet store use make_batch_reader')

    if reader_engine == 'reader_v1':
        if reader_pool_type == 'thread':
            reader_pool = ThreadPool(workers_count, results_queue_size)
        elif reader_pool_type == 'process':
            if pyarrow_serialize:
                serializer = PyArrowSerializer()
            else:
                serializer = PickleSerializer()
            reader_pool = ProcessPool(workers_count, serializer)
        elif reader_pool_type == 'dummy':
            reader_pool = DummyPool()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # Create a dictionary with all Reader parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'reader_pool': reader_pool,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        try:
            return Reader(filesystem,
                          dataset_path,
                          worker_class=PyDictReaderWorker,
                          **kwargs)
        except PetastormMetadataError as e:
            logger.error('Unexpected exception: %s', str(e))
            raise RuntimeError(
                'make_reader has failed. If you were trying to open a Parquet store that was not '
                'created using Petastorm materialize_dataset and it contains only scalar columns, '
                'you may use make_batch_reader to read it.\n'
                'Inner exception: {}'.format(e))

    elif reader_engine == 'experimental_reader_v2':
        if reader_pool_type == 'thread':
            decoder_pool = ThreadPoolExecutor(workers_count)
        elif reader_pool_type == 'process':
            decoder_pool = ProcessPoolExecutor(workers_count)
        elif reader_pool_type == 'dummy':
            decoder_pool = SameThreadExecutor()
        else:
            raise ValueError(
                'Unknown reader_pool_type: {}'.format(reader_pool_type))

        # TODO(yevgeni): once ReaderV2 is ready to be out of experimental status, we should extend
        # the make_reader interfaces to take shuffling buffer parameters explicitly
        shuffling_queue = RandomShufflingBuffer(
            1000, 800) if shuffle_row_groups else NoopShufflingBuffer()

        # Create a dictionary with all ReaderV2 parameters, so we can merge with reader_engine_params if specified
        kwargs = {
            'schema_fields': schema_fields,
            'predicate': predicate,
            'rowgroup_selector': rowgroup_selector,
            'num_epochs': num_epochs,
            'cur_shard': cur_shard,
            'shard_count': shard_count,
            'cache': cache,
            'decoder_pool': decoder_pool,
            'shuffling_queue': shuffling_queue,
            'shuffle_row_groups': shuffle_row_groups,
            'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
        }

        if reader_engine_params:
            kwargs.update(reader_engine_params)

        return ReaderV2(dataset_url, **kwargs)

    else:
        raise ValueError(
            'Unexpected value of reader_engine argument \'{}\'. '
            'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\''.format(reader_engine))
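
A hedged usage sketch of make_reader as documented above; the import path and dataset URL are assumptions, not taken from this snippet:

from petastorm import make_reader  # assumed import location

with make_reader('file:///tmp/mydataset',   # hypothetical dataset URL
                 reader_pool_type='thread',
                 workers_count=10,
                 num_epochs=1) as reader:
    for row in reader:
        pass  # each row is a decoded record from the Petastorm dataset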
Example 8
def make_reader(dataset_url,
                schema_fields=None,
                reader_pool_type='thread',
                workers_count=10,
                pyarrow_serialize=False,
                results_queue_size=50,
                shuffle_row_groups=True,
                shuffle_row_drop_partitions=1,
                predicate=None,
                rowgroup_selector=None,
                num_epochs=1,
                cur_shard=None,
                shard_count=None,
                cache_type='null',
                cache_location=None,
                cache_size_limit=None,
                cache_row_size_estimate=None,
                cache_extra_settings=None,
                hdfs_driver='libhdfs3',
                transform_spec=None,
                filters=None,
                s3_config_kwargs=None,
                zmq_copy_buffers=True):
    """
    Creates an instance of Reader for reading Petastorm datasets. A Petastorm dataset is a dataset generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset` context manager as explained
    `here <https://petastorm.readthedocs.io/en/latest/readme_include.html#generating-a-dataset>`_.

    See :func:`~petastorm.make_batch_reader` to read from a Parquet store that was not generated using
    :func:`~petastorm.etl.dataset_metadata.materialize_dataset`.

    :param dataset_url: a filepath or a url to a parquet directory,
        e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/parquet8'``, or ``'file:///tmp/mydataset'``,
        or ``'s3://bucket/mydataset'``, or ``'gs://bucket/mydataset'``.
    :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
            or an NGram object, in which case an NGram of the specified fields is returned.
    :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
        denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    :param workers_count: An int for the number of workers to use in the reader pool. This is only used for the
        thread or process pool. Defaults to 10
    :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
        Defaults to False.
    :param results_queue_size: Size of the results queue to store prefetched row-groups. Currently only applicable to
        thread reader pool type.
    :param shuffle_row_groups: Whether to shuffle row groups (the order in which full row groups are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a row group into for increased shuffling in exchange for worse performance (extra reads).
        For example, if you specify 2, each row group read will drop half of the rows within every row group and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
        will be passed a single row and must return a boolean value indicating whether to include it in the results.
    :param rowgroup_selector: instance of row group selector object to select row groups to be read
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number. Each node reading a shard should
        pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
        Defaults to None
    :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
        either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store the entire dataset (or a partition of a dataset if shard_count is used). The default is a null cache.
    :param cache_location: A string denoting the location or path of the cache.
    :param cache_size_limit: An int specifying the size limit of the cache in bytes
    :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
    :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
        libhdfs (java through JNI) or libhdfs3 (C++)
    :param transform_spec: An instance of :class:`~petastorm.transform.TransformSpec` object defining how a record
        is transformed after it is loaded and decoded. The transformation occurs on a worker thread/process (depends
        on the ``reader_pool_type`` value).
    :param filters: (List[Tuple] or List[List[Tuple]]): Standard PyArrow filters.
        These will be applied when loading the parquet file with PyArrow. More information
        here: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
    :param s3_config_kwargs: dict of parameters passed to ``botocore.client.Config``
    :param zmq_copy_buffers: A bool indicating whether to use 0mq copy buffers with ProcessPool.
    :return: A :class:`Reader` object
    """
    dataset_url = normalize_dir_url(dataset_url)

    filesystem, dataset_path = get_filesystem_and_path_or_paths(
        dataset_url, hdfs_driver, s3_config_kwargs=s3_config_kwargs)

    if cache_type is None or cache_type == 'null':
        cache = NullCache()
    elif cache_type == 'local-disk':
        cache = LocalDiskCache(cache_location, cache_size_limit,
                               cache_row_size_estimate, **cache_extra_settings
                               or {})
    else:
        raise ValueError('Unknown cache_type: {}'.format(cache_type))

    try:
        dataset_metadata.get_schema_from_dataset_url(dataset_url,
                                                     hdfs_driver=hdfs_driver)
    except PetastormMetadataError:
        raise RuntimeError(
            'Currently make_reader supports reading only Petastorm datasets. '
            'To read from a non-Petastorm Parquet store use make_batch_reader')

    if reader_pool_type == 'thread':
        reader_pool = ThreadPool(workers_count, results_queue_size)
    elif reader_pool_type == 'process':
        if pyarrow_serialize:
            serializer = PyArrowSerializer()
        else:
            serializer = PickleSerializer()
        reader_pool = ProcessPool(workers_count,
                                  serializer,
                                  zmq_copy_buffers=zmq_copy_buffers)
    elif reader_pool_type == 'dummy':
        reader_pool = DummyPool()
    else:
        raise ValueError(
            'Unknown reader_pool_type: {}'.format(reader_pool_type))

    kwargs = {
        'schema_fields': schema_fields,
        'reader_pool': reader_pool,
        'shuffle_row_groups': shuffle_row_groups,
        'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
        'predicate': predicate,
        'rowgroup_selector': rowgroup_selector,
        'num_epochs': num_epochs,
        'cur_shard': cur_shard,
        'shard_count': shard_count,
        'cache': cache,
        'transform_spec': transform_spec,
        'filters': filters
    }

    try:
        return Reader(filesystem,
                      dataset_path,
                      worker_class=PyDictReaderWorker,
                      is_batched_reader=False,
                      **kwargs)
    except PetastormMetadataError as e:
        logger.error('Unexpected exception: %s', str(e))
        raise RuntimeError(
            'make_reader has failed. If you were trying to open a Parquet store that was not '
            'created using Petastorm materialize_dataset and it contains only scalar columns, '
            'you may use make_batch_reader to read it.\n'
            'Inner exception: {}'.format(e))
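
A hedged sketch exercising some of the newer parameters documented above (a process pool with zmq copy buffers and PyArrow filters); the dataset URL and the column name used in the filter are hypothetical:

from petastorm import make_reader  # assumed import location

with make_reader('file:///tmp/mydataset',
                 reader_pool_type='process',
                 pyarrow_serialize=False,                   # pickle-based serialization for the process pool
                 zmq_copy_buffers=True,
                 filters=[('partition_key', '=', '0')]) as reader:  # hypothetical partition column
    sample = next(reader)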
Example 9
class ProcessPool(object):
    def __init__(self, workers_count, pyarrow_serialize=False):
        """Initializes a ProcessPool.

        This pool is different from standard Python pool implementations in that the workers are spawned
        without using fork. Some issues were observed with the JVM-based HDFS driver when the process was forked
        (HDFS could not be accessed from the forked worker if the driver had already been used in the parent process).

        :param workers_count: Number of processes to be spawned
        :param pyarrow_serialize: Use ``pyarrow.serialize`` serialization if True. ``pyarrow.serialize`` is much faster
          than pickling, but integer types (int8, uint8, etc.) are not yet handled by pyarrow serialization, so all
          integer types are currently converted to 'int'
        """
        self._workers = []
        self._ventilator_send = None
        self._control_sender = None
        self.workers_count = workers_count
        self._results_receiver_poller = None

        self._ventilated_items = 0
        self._ventilated_items_processed = 0
        self._ventilator = None
        self._serializer = PyArrowSerializer() if pyarrow_serialize else PickleSerializer()

    def _create_local_socket_on_random_port(self, context, socket_type):
        """Creates a zmq socket on a random port.

        :param context: zmq context
        :param socket_type: zmq socket type
        :return: A tuple: ``(zmq_socket, endpoint_address)``
        """
        LOCALHOST = 'tcp://127.0.0.1'
        socket = context.socket(socket_type)

        # There are race conditions where the socket can close when messages are still trying to be sent by zmq.
        # This can end up causing zmq to block indefinitely when sending objects or shutting down. Having the socket
        # linger on close helps prevent this.
        socket.linger = _SOCKET_LINGER_MS

        port = socket.bind_to_random_port(LOCALHOST)
        return socket, '{}:{}'.format(LOCALHOST, port)

    def start(self, worker_class, worker_setup_args=None, ventilator=None):
        """Starts worker processes.

        Will block until all processes subscribe to the worker queue (the messages are distributed by zmq on write,
        so if only one out of many workers is up at the time of 'ventilation', the initial load won't be balanced
        between workers). If the workers can not be started in a timely fashion, an exception will be raised.

        :param worker_class: The worker class. The class will be instantiated in the worker process. The
            class must implement :class:`.WorkerBase` protocol.
        :param worker_setup_args: Argument that will be passed to 'args' property of the instantiated
            :class:`.WorkerBase`.
        :param ventilator: Optional ventilator to handle ventilating items to the process pool. Process pool needs
            to know about the ventilator to know if it has completed ventilating items.
        :return: ``None``
        """
        # Initialize a zeromq context
        self._context = zmq.Context()

        # Ventilator socket used to send out tasks to workers
        self._ventilator_send, worker_receiver_socket = self._create_local_socket_on_random_port(self._context,
                                                                                                 zmq.PUSH)

        # Control socket is used to signal termination of the pool
        self._control_sender, control_socket = self._create_local_socket_on_random_port(self._context, zmq.PUB)
        self._results_receiver, results_sender_socket = self._create_local_socket_on_random_port(self._context,
                                                                                                 zmq.PULL)

        # We need poller to be able to read results from workers in a non-blocking manner
        self._results_receiver_poller = zmq.Poller()
        self._results_receiver_poller.register(self._results_receiver, zmq.POLLIN)

        # Monitors will be used to count number of workers created.
        # We will block till all of them are ready to accept messages
        monitor_sockets = [
            self._ventilator_send.get_monitor_socket(zmq.constants.EVENT_ACCEPTED),
            self._control_sender.get_monitor_socket(zmq.constants.EVENT_ACCEPTED),
            self._results_receiver.get_monitor_socket(zmq.constants.EVENT_ACCEPTED),
        ]

        # Start a bunch of processes
        self._workers = [
            exec_in_new_process(_worker_bootstrap, worker_class, worker_id, control_socket, worker_receiver_socket,
                                results_sender_socket, self._serializer, worker_setup_args)
            for worker_id in range(self.workers_count)]

        # Block until we have all workers up. Will raise an error if fails to start in a timely fashion
        self._wait_for_workers_to_start(monitor_sockets)

        if ventilator:
            self._ventilator = ventilator
            self._ventilator.start()

    def _wait_for_workers_to_start(self, monitor_sockets):
        """Waits for all workers to start."""
        now = time()
        for monitor_socket in monitor_sockets:
            started_count = 0
            while started_count < self.workers_count and time() < now + _WORKERS_STARTED_TIMEOUT_S:
                _keep_retrying_while_zmq_again(_KEEP_TRYING_WHILE_ZMQ_AGAIN_IS_RAIZED_TIMEOUT_S,
                                               lambda sock=monitor_socket: monitor.recv_monitor_message(
                                                   sock, flags=zmq.constants.NOBLOCK))
                started_count += 1

            if started_count < self.workers_count:
                raise RuntimeError(
                    'Workers were not able to start within timeout {} s ({} have started)'.format(
                        _WORKERS_STARTED_TIMEOUT_S,
                        started_count))

    def ventilate(self, *args, **kargs):
        """Sends a work item to a worker process. Will result in worker.process(...) call with arbitrary arguments."""
        self._ventilated_items += 1
        logger.debug('ventilate called. total ventilated items count %d', self._ventilated_items)
        # There is a race condition when sending objects to zmq that if all workers have been killed, sending objects
        # can block indefinitely. By using NOBLOCK, an exception is thrown stating that all resources have been
        # exhausted which the user can decide how to handle instead of just having the process hang.
        _keep_retrying_while_zmq_again(_KEEP_TRYING_WHILE_ZMQ_AGAIN_IS_RAIZED_TIMEOUT_S,
                                       lambda: self._ventilator_send.send_pyobj((args, kargs),
                                                                                flags=zmq.constants.NOBLOCK))

    def get_results(self):
        """Returns results from worker pool

        :param timeout: If None, will block forever, otherwise will raise :class:`.TimeoutWaitingForResultError`
            exception if no data received within the timeout (in seconds)
        :return: arguments passed to ``publish_func(...)`` by a worker. If no more results are anticipated,
            :class:`.EmptyResultError` is raised.
        """

        while True:
            # If there is no more work to do, raise an EmptyResultError
            logger.debug('ventilated_items=%d ventilated_items_processed=%d ventilator.completed=%s',
                         self._ventilated_items, self._ventilated_items_processed,
                         str(self._ventilator.completed()) if self._ventilator else 'N/A')
            if self._ventilated_items == self._ventilated_items_processed:
                # We also need to check if we are using a ventilator and if it is completed
                if not self._ventilator or self._ventilator.completed():
                    logger.debug('ventilator reported it has completed. Reporting end of results')
                    raise EmptyResultError()

            logger.debug('get_results polling on the next result')
            socks = self._results_receiver_poller.poll(_VERIFY_END_OF_VENTILATION_PERIOD * 1e3)
            if not socks:
                continue
            # Result message is a tuple containing data payload and possible exception (or None).
            # By specifying pyarrow_serialize=True, we may choose to use pyarrow serializer which is faster, but
            # does not support all data types correctly.
            fast_serialized, pickle_serialized = self._results_receiver.recv_multipart(copy=False)
            pickle_serialized = pickle.loads(pickle_serialized)

            if pickle_serialized:
                logger.debug('get_results a pickled message %s', type(pickle_serialized))
                if isinstance(pickle_serialized, VentilatedItemProcessedMessage):
                    self._ventilated_items_processed += 1
                    if self._ventilator:
                        self._ventilator.processed_item()
                elif isinstance(pickle_serialized, Exception):
                    self.stop()
                    self.join()
                    raise pickle_serialized
            else:
                logger.debug('get_results received new results')
                deserialized_result = self._serializer.deserialize(fast_serialized.buffer)
                return deserialized_result

    def stop(self):
        """Stops all workers (non-blocking)"""
        logger.debug('stopping')
        if self._ventilator:
            self._ventilator.stop()
        self._control_sender.send_string(_CONTROL_FINISHED)

    def join(self):
        """Blocks until all workers are terminated."""

        logger.debug('joining')

        # Slow joiner problem with zeromq means that not all workers are guaranteed to have gotten
        # the stop event. Therefore we will keep sending it until all workers are stopped to prevent
        # a deadlock.
        while any([w.poll() is None for w in self._workers]):
            self.stop()
            sleep(.1)

        for w in self._workers:
            w.wait()
        self._ventilator_send.close()
        self._control_sender.close()
        self._results_receiver.close()
        self._context.destroy()

    @property
    def diagnostics(self):
        # items_produced is updated only when VentilatedItemProcessedMessage is received. This happens only on the
        # next call to get_results, so its value may lag.
        return {
            'items_consumed': self._ventilated_items,
            'items_produced': self._ventilated_items_processed,
            'items_inprocess': self._ventilated_items - self._ventilated_items_processed,
        }
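
A hedged lifecycle sketch for the pool above. MyWorker is a hypothetical class implementing the WorkerBase protocol, and the EmptyResultError name is taken from the get_results docstring:

pool = ProcessPool(workers_count=2)
pool.start(MyWorker)                  # spawns workers and blocks until they subscribe
pool.ventilate('some_work', arg=42)   # becomes MyWorker.process('some_work', arg=42) in a worker
try:
    while True:
        result = pool.get_results()   # blocks until a worker publishes a result
except EmptyResultError:
    pass                              # no more results are anticipated
finally:
    pool.stop()
    pool.join()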
def test_nominal():
    s = PyArrowSerializer()
    expected = [{'a': np.asarray([1, 2], dtype=np.uint64)}]
    actual = s.deserialize(s.serialize(expected))
    np.testing.assert_array_equal(actual[0]['a'], expected[0]['a'])
def make_carbon_reader(dataset_url,
                       key=None,
                       secret=None,
                       endpoint=None,
                       proxy=None,
                       proxy_port=None,
                       schema_fields=None,
                       reader_pool_type='thread', workers_count=10, pyarrow_serialize=False, results_queue_size=100,
                       shuffle_blocklets=True, shuffle_row_drop_partitions=1,
                       predicate=None,
                       blocklet_selector=None,
                       num_epochs=1,
                       cur_shard=None, shard_count=None,
                       cache_type='null', cache_location=None, cache_size_limit=None,
                       cache_row_size_estimate=None, cache_extra_settings=None,
                       hdfs_driver='libhdfs3',
                       reader_engine='reader_v1', reader_engine_params=None,
                       transform_spec=None):
  """
  Creates an instance of Reader for reading Pycarbon datasets. A Pycarbon dataset is a dataset generated using
  the :func:`~pycarbon.etl.carbon_dataset_metadata.materialize_dataset_carbon` context manager.

  See :func:`~pycarbon.make_batch_carbon_reader` to read from a Carbon store that was not generated using
  :func:`~pycarbon.etl.carbon_dataset_metadata.materialize_dataset_carbon`.

  :param dataset_url: a filepath or a url to a carbon directory,
      e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/carbon8'``, or ``'file:///tmp/mydataset'``
      or ``'s3://bucket/mydataset'``.
  :param key: access key
  :param secret: secret key
  :param endpoint: endpoint_url
  :param proxy: proxy
  :param proxy_port:  proxy_port
  :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
          or an NGram object, in which case an NGram of the specified fields is returned.
  :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
      denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    TODO: process support
  :param workers_count: An int for the number of workers to use in the reader pool. This is only used for the
      thread or process pool. Defaults to 10
  :param pyarrow_serialize: Whether to use pyarrow for serialization. Currently only applicable to process pool.
      Defaults to False.
  :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
      thread reader pool type.
  :param shuffle_blocklets: Whether to shuffle blocklets (the order in which full blocklets are read)
  :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
      break up a blocklet into for increased shuffling in exchange for worse performance (extra reads).
      For example, if you specify 2, each blocklet read will drop half of the rows within every blocklet and
      read the remaining rows in separate reads. It is recommended to keep this number below the regular row
      group size in order to not waste reads which drop all rows.
  :param predicate: instance of :class:`.PredicateBase` object to filter rows to be returned by reader. The predicate
      will be passed a single row and must return a boolean value indicating whether to include it in the results.
  :param blocklet_selector: instance of blocklet selector object to select blocklet to be read
    TODO: blocklet_selector
  :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
      ``None`` will result in an infinite number of epochs.
  :param cur_shard: An int denoting the current shard number. Each node reading a shard should
      pass in a unique shard number in the range [0, shard_count). shard_count must be supplied as well.
      Defaults to None
  :param shard_count: An int denoting the number of shards to break this dataset into. Defaults to None
    TODO: cur_shard & shard_count
  :param cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk'] to
      either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
      to the main data store is either slow or expensive and the local machine has large enough storage
      to store the entire dataset (or a partition of a dataset if shard_count is used). The default is a null cache.
  :param cache_location: A string denoting the location or path of the cache.
  :param cache_size_limit: An int specifying the size limit of the cache in bytes
  :param cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
  :param cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
  :param hdfs_driver: A string denoting the hdfs driver to use (if using a dataset on hdfs). Current choices are
      libhdfs (java through JNI) or libhdfs3 (C++)
  :param reader_engine: Multiple engine implementations exist ('reader_v1' and 'experimental_reader_v2'). 'reader_v1'
      (the default value) selects a stable reader implementation.
    TODO: experimental_reader_v2 for carbon
  :param reader_engine_params: For advanced usage: a dictionary with arguments passed directly to a reader
      implementation constructor chosen by ``reader_engine`` argument.  You should not use this parameter unless
      you are fine-tuning the reader.
  :param transform_spec: An instance of :class:`~petastorm.transform.TransformSpec` object defining how a record
      is transformed after it is loaded and decoded. The transformation occurs on a worker thread/process (depends
      on the ``reader_pool_type`` value).
  :return: A :class:`Reader` object
  """

  if dataset_url is None or not isinstance(dataset_url, six.string_types):
    raise ValueError("""dataset_url must be a string""")

  dataset_url = dataset_url[:-1] if dataset_url[-1] == '/' else dataset_url
  logger.debug('dataset_url: %s', dataset_url)

  resolver = CarbonFilesystemResolver(dataset_url,
                                      key=key,
                                      secret=secret,
                                      endpoint=endpoint,
                                      proxy=proxy,
                                      proxy_port=proxy_port,
                                      hdfs_driver=hdfs_driver)
  filesystem = resolver.filesystem()

  if cache_type is None or cache_type == 'null':
    cache = NullCache()
  elif cache_type == 'local-disk':
    cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate, **cache_extra_settings or {})
  elif cache_type == 'memory-cache':
    cache = LocalMemoryCache(cache_size_limit)
  else:
    raise ValueError('Unknown cache_type: {}'.format(cache_type))

  # Fail if this is a non-pycarbon dataset. Typically, a Carbon store will have hundreds of thousands of rows in a
  # single blocklet. Using PyDictCarbonReaderWorker or ReaderV2 implementations is very inefficient as they process
  # data on a row-by-row basis. ArrowCarbonReaderWorker (used by make_batch_carbon_reader) is much more efficient in
  # these cases.
  try:
    infer_or_load_unischema_carbon(CarbonDataset(dataset_url,
                                                 key=key,
                                                 secret=secret,
                                                 endpoint=endpoint,
                                                 proxy=proxy,
                                                 proxy_port=proxy_port,
                                                 filesystem=filesystem))
  except PycarbonMetadataError:
    raise RuntimeError('Currently make_carbon_reader supports reading only Pycarbon datasets. '
                       'To read from a non-Pycarbon Carbon store use make_batch_carbon_reader')

  if reader_engine == 'reader_v1':
    if reader_pool_type == 'thread':
      reader_pool = ThreadPool(workers_count, results_queue_size)
    elif reader_pool_type == 'process':
      if pyarrow_serialize:
        serializer = PyArrowSerializer()
      else:
        serializer = PickleSerializer()
      reader_pool = ProcessPool(workers_count, serializer)
    elif reader_pool_type == 'dummy':
      reader_pool = DummyPool()
    else:
      raise ValueError('Unknown reader_pool_type: {}'.format(reader_pool_type))

    # Create a dictionary with all reader parameters, so we can merge with reader_engine_params if specified
    kwargs = {
      'key': key,
      'secret': secret,
      'endpoint': endpoint,
      'proxy': proxy,
      'proxy_port': proxy_port,
      'schema_fields': schema_fields,
      'reader_pool': reader_pool,
      'shuffle_blocklets': shuffle_blocklets,
      'shuffle_row_drop_partitions': shuffle_row_drop_partitions,
      'predicate': predicate,
      'blocklet_selector': blocklet_selector,
      'num_epochs': num_epochs,
      'cur_shard': cur_shard,
      'shard_count': shard_count,
      'cache': cache,
      'transform_spec': transform_spec,
    }

    if reader_engine_params:
      kwargs.update(reader_engine_params)

    try:
      return CarbonDataReader(filesystem, dataset_url,
                              worker_class=PyDictCarbonReaderWorker,
                              **kwargs)
    except PycarbonMetadataError as e:
      logger.error('Unexpected exception: %s', str(e))
      raise RuntimeError('make_carbon_reader has failed. If you were trying to open a Carbon store that was not '
                         'created using Pycarbon materialize_dataset_carbon and it contains only scalar columns, '
                         'you may use make_batch_carbon_reader to read it.\n'
                         'Inner exception: {}'.format(e))

  elif reader_engine == 'experimental_reader_v2':
    raise NotImplementedError('not support experimental_reader_v2 reader engine now.')
  else:
    raise ValueError('Unexpected value of reader_engine argument \'{}\'. '
                     'Supported reader_engine values are \'reader_v1\' and \'experimental_reader_v2\''.format(
                         reader_engine))
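
A hedged usage sketch of make_carbon_reader as documented above; the import path, dataset URL, and omitted credentials are assumptions:

from pycarbon.reader import make_carbon_reader  # assumed import location

with make_carbon_reader('file:///tmp/mycarbondataset',  # hypothetical dataset URL
                        num_epochs=1,
                        workers_count=10) as reader:
    for row in reader:
        pass  # each row is a decoded record from the Pycarbon dataset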