Example No. 1
def test_carbondataset_dataset_url_not_exist(carbon_obs_dataset):
    # local
    with pytest.raises(Exception):
        CarbonDataset(LOCAL_FILE_PREFIX + "/not_exist_dir")

    # obs
    with pytest.raises(Exception):
        CarbonDataset(carbon_obs_dataset.not_exist_url,
                      key=pytest.config.getoption("--access_key"),
                      secret=pytest.config.getoption("--secret_key"),
                      endpoint=pytest.config.getoption("--end_point"))
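
The OBS variants above pull credentials from the command line through the old ``pytest.config`` global (removed in newer pytest releases, so the suite presumably pins an older version). Below is a minimal sketch of how those options could be registered in a ``conftest.py``; only the option names come from the tests above, everything else is an assumption about the surrounding test suite:

# conftest.py -- hypothetical sketch; only the option names are taken from the tests above.
def pytest_addoption(parser):
    # Register the credential options the tests read via pytest.config.getoption(...)
    parser.addoption("--access_key", action="store", default=None,
                     help="Access key for the OBS/S3-compatible object store")
    parser.addoption("--secret_key", action="store", default=None,
                     help="Secret key for the OBS/S3-compatible object store")
    parser.addoption("--end_point", action="store", default=None,
                     help="Endpoint URL of the OBS/S3-compatible object store")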
Example No. 2
def get_schema_from_dataset_url_carbon(dataset_url,
                                       key=None,
                                       secret=None,
                                       endpoint=None,
                                       proxy=None,
                                       proxy_port=None,
                                       filesystem=None):
    """Returns a :class:`petastorm.unischema.Unischema` object loaded from a dataset specified by a url.

  :param dataset_url: A dataset URL
  :param key: access key for the object store
  :param secret: secret key for the object store
  :param endpoint: endpoint URL of the object store
  :param proxy: proxy host
  :param proxy_port: proxy port
  :param filesystem: an optional pyarrow filesystem object to use when opening the dataset
  :return: A :class:`petastorm.unischema.Unischema` object
  """

    # Get a unischema stored in the dataset metadata.
    stored_schema = get_schema_carbon(
        CarbonDataset(dataset_url,
                      key=key,
                      secret=secret,
                      endpoint=endpoint,
                      proxy=proxy,
                      proxy_port=proxy_port,
                      filesystem=filesystem))

    return stored_schema
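
A hedged usage sketch for get_schema_from_dataset_url_carbon: the import path and the dataset location below are placeholders (assumptions), while the ``fields`` mapping is part of the ``petastorm.unischema.Unischema`` interface that the function returns:

# Hypothetical usage -- the module path and URL are assumptions, not taken from the snippet above.
# from pycarbon.carbon_dataset_metadata import get_schema_from_dataset_url_carbon  # assumed path

schema = get_schema_from_dataset_url_carbon('file:///tmp/my_carbon_dataset')

# A Unischema is a named collection of typed fields.
for name, field in schema.fields.items():
    print(name, field.numpy_dtype, field.nullable)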
Example No. 3
def test_normalize_shuffle_partitions(carbon_synthetic_dataset):
  dataset = CarbonDataset(carbon_synthetic_dataset.path)
  row_drop_partitions = CarbonDataReader._normalize_shuffle_options(1, dataset)
  assert row_drop_partitions == 1

  row_drop_partitions = CarbonDataReader._normalize_shuffle_options(100, dataset)
  assert row_drop_partitions == 100
Example No. 4
def test_create_carbondataset_obs(carbon_obs_dataset):
    carbondataset_1 = CarbonDataset(
        carbon_obs_dataset.url,
        key=pytest.config.getoption("--access_key"),
        secret=pytest.config.getoption("--secret_key"),
        endpoint=pytest.config.getoption("--end_point"))

    carbondataset_2 = CarbonDataset(
        carbon_obs_dataset.url,
        key=pytest.config.getoption("--access_key"),
        secret=pytest.config.getoption("--secret_key"),
        endpoint=pytest.config.getoption("--end_point"),
        proxy=proxy,
        proxy_port="8080")

    assert len(carbondataset_1.pieces) == len(carbondataset_2.pieces)
    assert carbondataset_1.pieces
Example No. 5
def test_invalid_carbondataset_obs_parameters(carbon_obs_dataset):
    with pytest.raises(ValueError):
        CarbonDataset(carbon_obs_dataset.url)

    with pytest.raises(ValueError):
        CarbonDataset(carbon_obs_dataset.url,
                      key=pytest.config.getoption("--access_key"),
                      secret=pytest.config.getoption("--secret_key"),
                      endpoint=pytest.config.getoption("--end_point"),
                      proxy=proxy)

    with pytest.raises(ValueError):
        CarbonDataset(carbon_obs_dataset.url,
                      key=pytest.config.getoption("--access_key"),
                      secret=pytest.config.getoption("--secret_key"),
                      endpoint=pytest.config.getoption("--end_point"),
                      proxy_port="8080")
Example No. 6
@contextmanager  # from contextlib; the body yields exactly once, as a context manager requires
def materialize_dataset_carbon(spark,
                               dataset_url,
                               schema,
                               blocklet_size_mb=None,
                               use_summary_metadata=False,
                               pyarrow_filesystem=None):
    """
  A Context Manager which handles all the initialization and finalization necessary
  to generate metadata for a pycarbon dataset. This should be used around your
  spark logic to materialize a dataset (specifically the writing of carbon output).

  Note: Any blocklet indexing should happen outside the materialize_dataset_carbon block

  Example:

  >>> spark = SparkSession.builder...
  >>> ds_url = 'hdfs:///path/to/my/dataset'
  >>> with materialize_dataset_carbon(spark, ds_url, MyUnischema, 64):
  >>>   spark.sparkContext.parallelize(range(0, 10)).
  >>>     ...
  >>>     .write.save(path=ds_url, format='carbon')

  A user may provide their own pyarrow filesystem object via the ``pyarrow_filesystem`` argument (otherwise,
  pycarbon will create a default one based on the URL).

  The following example shows how a custom pyarrow HDFS filesystem, instantiated using the ``libhdfs`` driver, can be
  used during Pycarbon dataset generation:

  >>> resolver=FilesystemResolver(dataset_url, spark.sparkContext._jsc.hadoopConfiguration(),
  >>>                             hdfs_driver='libhdfs')
  >>> with materialize_dataset_carbon(..., pyarrow_filesystem=resolver.filesystem()):
  >>>     ...


  :param spark: The spark session you are using
  :param dataset_url: The dataset url to output your dataset to (e.g. ``hdfs:///path/to/dataset``)
  :param schema: The :class:`petastorm.unischema.Unischema` definition of your dataset
  :param blocklet_size_mb: The carbon blocklet size to use for your dataset
  :param use_summary_metadata: Whether to use the carbon summary metadata for blocklet indexing or a custom
    indexing method. The custom indexing method is more scalable for very large datasets.
  :param pyarrow_filesystem: A pyarrow filesystem object to be used when saving Pycarbon specific metadata to the
    Carbon store.

  """

    # Configure the spark session for writing carbon output before running the wrapped job
    spark_config = {}
    _init_spark(spark, spark_config, blocklet_size_mb, use_summary_metadata)
    yield

    # After job completes, add the unischema metadata and check for the metadata summary file
    if pyarrow_filesystem is None:
        resolver = FilesystemResolver(
            dataset_url, spark.sparkContext._jsc.hadoopConfiguration())
        # filesystem = resolver.filesystem()
        dataset_path = resolver.get_dataset_path()
    else:
        # filesystem = pyarrow_filesystem
        dataset_path = urlparse(dataset_url).path

    carbon_dataset = CarbonDataset(dataset_path)
    _generate_unischema_metadata_carbon(carbon_dataset, schema)
    if not use_summary_metadata:
        _generate_num_blocklets_per_file_carbon(carbon_dataset,
                                                spark.sparkContext)

    _cleanup_spark(spark, spark_config, blocklet_size_mb)
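
The docstring's doctest sketches the call pattern; the snippet below fills in the pieces it leaves implicit, as a hedged, self-contained sketch. The Unischema, codec, and dict_to_spark_row pieces are standard petastorm APIs; the import path for materialize_dataset_carbon, the dataset location, and the availability of the CarbonData Spark datasource (``format='carbon'``) on the classpath are assumptions:

# Hedged end-to-end sketch mirroring the docstring example above.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from petastorm.codecs import ScalarCodec, NdarrayCodec
from petastorm.unischema import Unischema, UnischemaField, dict_to_spark_row
# from pycarbon.etl.carbon_dataset_metadata import materialize_dataset_carbon  # assumed path

MySchema = Unischema('MySchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('vector', np.float32, (8,), NdarrayCodec(), False),
])

spark = SparkSession.builder.master('local[2]').getOrCreate()
ds_url = 'file:///tmp/my_carbon_dataset'  # placeholder location

with materialize_dataset_carbon(spark, ds_url, MySchema, blocklet_size_mb=64):
    rows_rdd = (spark.sparkContext.parallelize(range(10))
                .map(lambda i: {'id': i, 'vector': np.random.rand(8).astype(np.float32)})
                .map(lambda d: dict_to_spark_row(MySchema, d)))
    spark.createDataFrame(rows_rdd, MySchema.as_spark_schema()) \
        .write.save(path=ds_url, format='carbon')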
Example No. 7
def test_create_carbondataset_local(carbon_synthetic_dataset):
    carbondataset = CarbonDataset(carbon_synthetic_dataset.url)
    assert len(carbondataset.pieces) == 2
Example No. 8
    def __init__(self,
                 pyarrow_filesystem,
                 dataset_path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 schema_fields=None,
                 shuffle_blocklets=True,
                 shuffle_row_drop_partitions=1,
                 predicate=None,
                 blocklet_selector=None,
                 reader_pool=None,
                 num_epochs=1,
                 cur_shard=None,
                 shard_count=None,
                 cache=None,
                 worker_class=None,
                 transform_spec=None):
        """Initializes a reader object.

    :param pyarrow_filesystem: An instance of ``pyarrow.FileSystem`` that will be used. If not specified,
        then a default one will be selected based on the url (only for ``hdfs://`` or ``file://``; for
        ``s3://`` support, use ``make_reader``). The default hdfs driver is ``libhdfs3``. If you want
        to use ``libhdfs``, use
        ``pyarrow_filesystem=pyarrow.hdfs.connect('hdfs:///some/path', driver='libhdfs')``.
    :param dataset_path: filepath to a carbon directory on the specified filesystem.
        e.g. ``'/user/yevgeni/carbon8'``, or ``'/tmp/mydataset'``.
    :param key: access key for the object store
    :param secret: secret key for the object store
    :param endpoint: endpoint URL of the object store
    :param proxy: proxy host
    :param proxy_port: proxy port
    :param schema_fields: Either a list of unischema fields to subset, ``None`` to read all fields,
        or an NGram object, in which case an NGram of the specified properties is returned.
    :param shuffle_blocklets: Whether to shuffle blocklets (the order in which full blocklets are read)
    :param shuffle_row_drop_partitions: This is a positive integer which determines how many partitions to
        break up a blocklet into for increased shuffling in exchange for worse performance (extra reads).
        For example, if you specify 2, each blocklet read will drop half of the rows within every blocklet and
        read the remaining rows in separate reads. It is recommended to keep this number below the regular row
        group size in order to not waste reads which drop all rows.
    :param predicate: instance of predicate object to filter rows to be returned by reader.
    :param blocklet_selector: instance of blocklet selector object to select blocklets to be read
    :param reader_pool: parallelization pool. ``ThreadPool(10)`` (10 threads) is used by default.
        This pool is a custom implementation used to parallelize reading data from the dataset.
        Any object from workers_pool package can be used
        (e.g. :class:`petastorm.workers_pool.process_pool.ProcessPool`).
    :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
        ``None`` will result in an infinite number of epochs.
    :param cur_shard: An int denoting the current shard number used. Each reader instance should
        pass in a unique shard number in the range ``[0, shard_count)``.
        ``shard_count`` must be supplied as well. Defaults to None
    :param shard_count: An int denoting the number of shard partitions there are. Defaults to None
    :param cache: An object conforming to :class:`.CacheBase` interface. Before loading blocklets from a carbon
        file the Reader will attempt to load these values from cache. Caching is useful when communication
        to the main data store is either slow or expensive and the local machine has large enough storage
        to store the entire dataset (or a partition of the dataset if shards are used).
        By default, use the :class:`.NullCache` implementation.

    :param worker_class: This is the class that will be instantiated on a different thread/process. Its
        responsibility is to load and filter the data.
    """

        # 1. Open the carbon storage (dataset) & get a list of all blocklets
        # 2. Filter blocklets
        # 3. Create a blocklet ventilator object
        # 4. Start workers pool
        if not (isinstance(schema_fields, collections.Iterable)
                or isinstance(schema_fields, NGram) or schema_fields is None):
            raise ValueError(
                """Fields must be either None, an iterable collection of Unischema fields or an NGram
            object.""")

        self.ngram = schema_fields if isinstance(schema_fields,
                                                 NGram) else None

        # By default, use original method of working with list of dictionaries and not arrow tables
        worker_class = worker_class or PyDictCarbonReaderWorker
        self._results_queue_reader = worker_class.new_results_queue_reader()

        if self.ngram and not self.ngram.timestamp_overlap and shuffle_row_drop_partitions > 1:
            raise NotImplementedError(
                'Using timestamp_overlap=False is not implemented with'
                ' shuffle_options.shuffle_row_drop_partitions > 1')

        self.cache = cache or NullCache()

        self._workers_pool = reader_pool or ThreadPool(10)
        # 1. Resolve dataset path (hdfs://, file://) and open the carbon storage (dataset)
        self.carbon_dataset = CarbonDataset(dataset_path,
                                            key=key,
                                            secret=secret,
                                            endpoint=endpoint,
                                            proxy=proxy,
                                            proxy_port=proxy_port,
                                            filesystem=pyarrow_filesystem)
        stored_schema = infer_or_load_unischema_carbon(self.carbon_dataset)

        # Make a schema view (a view is a Unischema containing only a subset of fields).
        # Will raise an exception if invalid schema fields are in schema_fields
        fields = schema_fields if isinstance(schema_fields,
                                             collections.Iterable) else None
        storage_schema = stored_schema.create_schema_view(
            fields) if fields else stored_schema
        if transform_spec:
            self.schema = transform_schema(storage_schema, transform_spec)
        else:
            self.schema = storage_schema

        # 2. Filter blocklets
        filtered_blocklet_indexes = list(range(len(
            self.carbon_dataset.pieces)))
        worker_predicate = predicate

        # 3. Create a blocklet ventilator object
        normalized_shuffle_row_drop_partitions = \
          self._normalize_shuffle_options(shuffle_row_drop_partitions, self.carbon_dataset)
        self.ventilator = self._create_ventilator(
            filtered_blocklet_indexes, shuffle_blocklets,
            normalized_shuffle_row_drop_partitions, num_epochs,
            worker_predicate,
            self._workers_pool.workers_count + _VENTILATE_EXTRA_BLOCKLETS)

        # 4. Start workers pool
        self._workers_pool.start(
            worker_class,
            (pyarrow_filesystem, dataset_path, storage_schema, self.ngram,
             self.carbon_dataset.pieces, cache, transform_spec),
            ventilator=self.ventilator)
        logger.debug('Workers pool started')

        self.last_row_consumed = False
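
For context, a minimal construction sketch for the reader that owns this ``__init__`` (referred to as CarbonDataReader in Example No. 3). The filesystem handling reuses the FilesystemResolver pattern from Example No. 6; the import paths, and the assumption that the reader is iterable like its petastorm counterpart, are not confirmed by the snippet:

# Hypothetical sketch -- class/module names follow Example No. 3 and are assumptions beyond that;
# most applications would go through a make_reader-style factory rather than the constructor.
# from pycarbon.carbon_reader import CarbonDataReader  # assumed path
from petastorm.fs_utils import FilesystemResolver     # pycarbon may ship its own equivalent

dataset_url = 'file:///tmp/my_carbon_dataset'  # placeholder
resolver = FilesystemResolver(dataset_url)
reader = CarbonDataReader(pyarrow_filesystem=resolver.filesystem(),
                          dataset_path=resolver.get_dataset_path(),
                          num_epochs=1)
for row in reader:          # assumes the reader is iterable, like petastorm's Reader
    pass                    # each row follows the stored Unischema (or the schema_fields subset)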