def tensorflow_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset/'):
    # Example: tf_tensors will return tensors with dataset data
    with make_carbon_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    with make_reader(dataset_url, is_batch=False) as reader:
        tensor = make_tensor(reader)
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    # Example: use tf.data.Dataset API
    with make_carbon_reader(dataset_url) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)

    with make_reader(dataset_url, is_batch=False) as reader:
        dataset = make_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            sample = sess.run(tensor)
            print(sample.id)
def test_full_pytorch_example(large_mock_mnist_data, tmpdir):
    # First, generate mock dataset
    dataset_url = 'file://{}'.format(tmpdir)
    mnist_data_to_pycarbon_dataset(tmpdir,
                                   dataset_url,
                                   mnist_data=large_mock_mnist_data,
                                   spark_master='local[1]',
                                   carbon_files_count=1)

    # Next, run a round of training using the PyTorch DataLoader adapter
    from petastorm.pytorch import DataLoader

    torch.manual_seed(1)
    device = torch.device('cpu')
    model = pytorch_example.Net().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    transform = TransformSpec(pytorch_example._transform_row,
                              removed_fields=['idx'])

    with DataLoader(make_carbon_reader('{}/train'.format(dataset_url),
                                       reader_pool_type='thread',
                                       num_epochs=1,
                                       transform_spec=transform),
                    batch_size=32) as train_loader:
        pytorch_example.train(model, device, train_loader, 10, optimizer, 1)

    with DataLoader(make_carbon_reader('{}/test'.format(dataset_url),
                                       reader_pool_type='thread',
                                       num_epochs=1,
                                       transform_spec=transform),
                    batch_size=100) as test_loader:
        pytorch_example.evaluation(model, device, test_loader)
def test_predicate_on_dataset(tmpdir):
  TestSchema = Unischema('TestSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
  ])

  def test_row_generator(x):
    """Returns a single entry in the generated dataset."""
    return {'id': x,
            'test_field': x * x}

  blocklet_size_mb = 256
  dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

  spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
  sc = spark.sparkContext

  rows_count = 10
  with materialize_dataset_carbon(spark, dataset_url, TestSchema, blocklet_size_mb):
    rows_rdd = sc.parallelize(range(rows_count)) \
      .map(test_row_generator) \
      .map(lambda x: dict_to_spark_row(TestSchema, x))

    spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
      .write \
      .save(path=dataset_url, format='carbon')

  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
    assert next(reader).id == 3
  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
    with pytest.raises(StopIteration):
      # Predicate should have selected none, so a StopIteration should be raised.
      next(reader)
Example #4
def _read_from_tf_tensors(synthetic_dataset,
                          count,
                          shuffling_queue_capacity,
                          min_after_dequeue,
                          ngram,
                          workers_count=10):
    """Used by several test cases. Reads a 'count' rows using reader.

  The reader is configured without row-group shuffling and guarantees deterministic order of rows up to the
  results queue TF shuffling which is controlled by 'shuffling_queue_capacity', 'min_after_dequeue' arguments.

  The function returns a tuple with: (actual data read from the dataset, a TF tensor returned by the reader)
  """

    schema_fields = (NON_NULLABLE_FIELDS if ngram is None else ngram)

    with make_carbon_reader(schema_fields=schema_fields,
                            dataset_url=synthetic_dataset.url,
                            reader_pool_type='thread',
                            shuffle_blocklets=False,
                            workers_count=workers_count) as reader:
        row_tensors = tf_tensors(
            reader,
            shuffling_queue_capacity=shuffling_queue_capacity,
            min_after_dequeue=min_after_dequeue)

        with _tf_session() as sess:
            rows_data = [sess.run(row_tensors) for _ in range(count)]

    return rows_data, row_tensors
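For context, a minimal sketch of how this helper might be invoked from a test. The fixture name carbon_synthetic_dataset and the helper _assert_expected_rows_data are assumed from the neighbouring examples, and passing 0 to disable the TF shuffling queue is only one possible configuration.

def test_read_thirty_rows_without_tf_shuffling(carbon_synthetic_dataset):
    # Sketch only: a shuffling_queue_capacity of 0 disables the results-queue
    # TF shuffling, so the 30 rows come back in a deterministic order.
    rows_data, _ = _read_from_tf_tensors(carbon_synthetic_dataset,
                                         count=30,
                                         shuffling_queue_capacity=0,
                                         min_after_dequeue=0,
                                         ngram=None)
    _assert_expected_rows_data(carbon_synthetic_dataset.data, rows_data)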
def test_carbon_reader(carbon_synthetic_dataset):
  with make_carbon_reader(carbon_synthetic_dataset.url, num_epochs=1) as reader:
    i = 0
    for sample in reader:
      print(sample.id)
      i += 1
    assert i == _ROWS_COUNT
def test_invalid_carbon_reader_predicate_parameters(carbon_synthetic_dataset):
  with make_carbon_reader(carbon_synthetic_dataset.url,
                          cache_type="memory-cache",
                          predicate=in_lambda(['id2'], lambda id2: True)) as reader:
    with pytest.raises(RuntimeError):
      next(reader)

  with make_carbon_reader(carbon_synthetic_dataset.url,
                          predicate=in_lambda([], lambda x: False)) as reader:
    with pytest.raises(ValueError):
      next(reader)

  with make_carbon_reader(carbon_synthetic_dataset.url,
                          predicate=in_lambda(['not_exist_col'], lambda x: False)) as reader:
    with pytest.raises(ValueError):
      next(reader)
def test_read_mnist_dataset(generate_mnist_dataset):
    # Verify both datasets via a reader
    for dset in SMALL_MOCK_IMAGE_COUNT.keys():
        with make_carbon_reader('file://{}/{}'.format(generate_mnist_dataset,
                                                      dset),
                                reader_pool_type='thread',
                                num_epochs=1) as reader:
            assert sum(1 for _ in reader) == SMALL_MOCK_IMAGE_COUNT[dset]
Example #8
def test_bound_size_of_output_queue_size_reader(carbon_synthetic_dataset):
    """This test is timing sensitive so it might become flaky"""
    TIME_TO_GET_TO_STATIONARY_STATE = 0.5

    with make_carbon_reader(carbon_synthetic_dataset.url) as reader:
        next(reader)
        sleep(TIME_TO_GET_TO_STATIONARY_STATE)
        assert reader.diagnostics['output_queue_size'] > 0
Example #9
def pytorch_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset'):
    with DataLoader(make_carbon_reader(dataset_url)) as train_loader:
        sample = next(iter(train_loader))
        print(sample['id'])

    with make_data_loader(make_reader(dataset_url,
                                      is_batch=False)) as train_loader:
        sample = next(iter(train_loader))
        print(sample['id'])
Example #10
def test_generate(pycarbon_dataset):
    # Read from it using a plain reader
    with make_carbon_reader(pycarbon_dataset.url) as reader:
        all_samples = list(reader)
    assert all_samples

    with make_reader(pycarbon_dataset.url, is_batch=False) as reader:
        all_samples = list(reader)
    assert all_samples
Example #11
def test_multithreaded_reads(carbon_synthetic_dataset):
  with make_carbon_reader(carbon_synthetic_dataset.url, workers_count=5, num_epochs=1) as reader:
    with ThreadPoolExecutor(max_workers=10) as executor:
      def read_one_row():
        return next(reader)

      futures = [executor.submit(read_one_row) for _ in range(100)]
      results = [f.result() for f in futures]
      assert len(results) == len(carbon_synthetic_dataset.data)
      assert set(r.id for r in results) == set(d['id'] for d in carbon_synthetic_dataset.data)
Example #12
def python_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset'):
    with make_carbon_reader(dataset_url) as reader:
        # Pure python
        for sample in reader:
            print(sample.id)

    with make_reader(dataset_url, is_batch=False) as reader:
        # Pure python
        for sample in reader:
            print(sample.id)
Example #13
def test_predicate_on_single_column(carbon_synthetic_dataset):
  reader = make_carbon_reader(carbon_synthetic_dataset.url,
                              schema_fields=[TestSchema.id2],
                              predicate=in_lambda(['id2'], lambda id2: True),
                              reader_pool_type='thread')
  counter = 0
  for row in reader:
    counter += 1
    actual = dict(row._asdict())
    assert actual['id2'] < 2
  assert counter == len(carbon_synthetic_dataset.data)
Example #14
  def __init__(self, dataset_path, data_name='data', label_name='softmax_label'):
    self.path = dataset_path
    self._provide_data = []
    self._provide_label = []

    reader = make_carbon_reader(dataset_path, num_epochs=1)
    self.iter = iter(reader)
    next_iter = next(self.iter)
    data = nd.array(next_iter.image).reshape(1, 1, 28, 28) / 255
    label = nd.array([next_iter.digit]).reshape(1, )
    self._provide_data = [mx.io.DataDesc(data_name, data.shape, data.dtype)]
    self._provide_label = [mx.io.DataDesc(label_name, label.shape, label.dtype)]
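The constructor above is only a fragment of a larger MXNet data iterator. Below is a minimal sketch of how it could sit inside an mx.io.DataIter subclass; the class name MnistCarbonIter and the next() body are hypothetical and only illustrate how _provide_data, _provide_label and self.iter would typically be consumed.

import mxnet as mx
from mxnet import nd


class MnistCarbonIter(mx.io.DataIter):  # hypothetical wrapper around the __init__ shown above

    # ... the __init__ from the example above goes here ...

    @property
    def provide_data(self):
        return self._provide_data

    @property
    def provide_label(self):
        return self._provide_label

    def next(self):
        # Pull the next record from the pycarbon reader; StopIteration ends the epoch.
        row = next(self.iter)
        data = nd.array(row.image).reshape(1, 1, 28, 28) / 255
        label = nd.array([row.digit]).reshape(1, )
        return mx.io.DataBatch(data=[data], label=[label])

Note that the original __init__ already consumes the first record to infer shapes, so a full implementation would typically buffer that record (or re-create the reader) so it is not lost during iteration.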
def just_read(dataset_url='file:///tmp/benchmark_dataset'):
    with make_carbon_reader(dataset_url,
                            num_epochs=1,
                            workers_count=16,
                            schema_fields=["id", "value1"]) as train_reader:
        i = 0
        for schema_view in train_reader:
            assert len(schema_view) == 2
            assert schema_view._fields == ('id', 'value1')
            i += 1
        assert i == ROW_COUNT
        return i
Example #16
def just_read(dataset_url='file:///tmp/benchmark_dataset'):
    result = list()
    with make_carbon_reader(dataset_url,
                            num_epochs=1,
                            workers_count=10,
                            shuffle_row_drop_partitions=5) as train_reader:
        i = 0
        for schema_view in train_reader:
            result.append(schema_view.id)
            i += 1
        print(i)
        return result
Example #17
def test_simple_read_tensorflow(carbon_synthetic_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
  data"""
    with make_carbon_reader(
            schema_fields=NON_NULLABLE_FIELDS,
            dataset_url=carbon_synthetic_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        with _tf_session() as sess:
            rows_data = [sess.run(row_tensors) for _ in range(30)]

    # Make sure we have static shape info for all fields
    _assert_all_tensors_have_shape(row_tensors)
    _assert_expected_rows_data(carbon_synthetic_dataset.data, rows_data)
def test_invalid_cache_parameter(carbon_synthetic_dataset,
                                 carbon_scalar_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url,
                            cache_type='memory-cache',
                            shuffle_row_drop_partitions=5) as reader:
        with pytest.raises(RuntimeError):
            next(reader)

    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  cache_type='memory-cache',
                                  shuffle_row_drop_partitions=5) as reader:
        with pytest.raises(RuntimeError):
            next(reader)
Example #19
def just_read(dataset_url='file:///tmp/benchmark_dataset', num_epochs=1):
    with make_carbon_reader(dataset_url,
                            num_epochs=num_epochs) as train_reader:
        i = 0
        start = time.time()
        for schema_view in train_reader:
            print(schema_view.id)
            i += 1
            if i % ROW_COUNT == 0:
                end = time.time()
                print("time is " + str(end - start))
                start = end
        assert i == ROW_COUNT * num_epochs
        return i
Example #20
def just_read(dataset_url='file:///tmp/benchmark_dataset'):
    values = [5]
    predicate = in_set(values, "id")

    with make_carbon_reader(dataset_url,
                            num_epochs=1,
                            workers_count=16,
                            predicate=predicate) as train_reader:
        i = 0
        for schema_view in train_reader:
            assert schema_view.id == 5
            i += 1
        assert i == 1
        return i
def just_read_obs(key=None,
                  secret=None,
                  endpoint=None,
                  bucketname='modelarts-carbon',
                  prefix='test/benchmark_dataset',
                  download_path='/tmp/download/',
                  num_epochs=1):

    path = download_files_from_obs_concurrently(key, secret, endpoint,
                                                bucketname, prefix,
                                                download_path)

    with make_carbon_reader(path, num_epochs=num_epochs) as train_reader:
        i = 0
        for schema_view in train_reader:
            print(schema_view)
            i += 1

        assert i == ROW_COUNT * num_epochs
        return i
Example #22
def test_dynamic_batch_size_of_carbon_reader(carbon_synthetic_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url,
                            num_epochs=None) as reader:
        batch_size = tf.data.Dataset.range(
            1, 10).make_one_shot_iterator().get_next()

        dataset = make_pycarbon_dataset(reader) \
          .batch(batch_size=batch_size)

        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer

        tensor = iterator.get_next()

        with tf.train.MonitoredTrainingSession() as sess:
            sess.run(init_op)
            sample = sess.run(tensor)
            assert 1 == len(sample.id)

            sess.run(init_op)
            sample = sess.run(tensor)
            assert 2 == len(sample.id)
Example #23
def test_dataset_carbon_reader(carbon_synthetic_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url,
                            num_epochs=1) as reader:
        dataset = make_pycarbon_dataset(reader) \
          .batch(batch_size=1)

        iterator = dataset.make_one_shot_iterator()

        tensor = iterator.get_next()

        with tf.Session() as sess:
            sess.run([
                tf.local_variables_initializer(),
                tf.global_variables_initializer(),
            ])
            i = 0
            try:
                while True:
                    sess.run(tensor)
                    i += 1
            except tf.errors.OutOfRangeError:
                print("Finish! the number is " + str(i))

            assert i == _ROWS_COUNT
Example #24
def make_reader(dataset_url=None,
                workers_count=10,
                results_queue_size=100,
                num_epochs=1,
                obs_client=None,
                shuffle=True,
                schema_fields=None,
                is_batch=True,
                reader_pool_type='thread',
                data_format='carbon',
                cache_properties={'cache_type': None, 'cache_location': None, 'cache_size_limit': None,
                                  'cache_row_size_estimate': None, 'cache_extra_settings': None},
                **properties
                ):
  """
  A unified API for datasets in different data formats.

  :param dataset_url: a filepath or a URL to a carbon directory,
      e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/carbon8'``, or ``'file:///tmp/mydataset'``
      or ``'s3a://bucket/mydataset'``.
  :param data_format: dataset data format (default: 'carbon')
  :param is_batch: whether to return batches of records (True) or single records (False). Defaults to True.
  :param obs_client: an ObsClient object providing the access key, secret key and endpoint URL, or None to read
      without OBS credentials
  :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
          an NGram object, then it will return an NGram of the specified fields.
  :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
      denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    TODO: process support
  :param workers_count: An int for the number of workers to use in the reader pool. This is only used for the
      thread or process pool. Defaults to 10.
  :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
      thread reader pool type.
  :param shuffle: Whether to shuffle partitions (the order in which full partitions are read)
  :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
      ``None`` will result in an infinite number of epochs.
  :param cache_properties: a dict of cache parameters
    cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk', 'memory-cache']
      to either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
      to the main data store is either slow or expensive and the local machine has large enough storage
      to store the entire dataset. The default is a null cache.
    cache_location: A string denoting the location or path of the cache.
    cache_size_limit: An int specifying the size limit of the cache in bytes
    cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
  :param **properties: additional parameters passed through to the underlying reader as keyword arguments
  :return: A :class:`Reader` object
  """

  if is_batch is True:
    if data_format == 'carbon':
      if isinstance(obs_client, ObsClient):
        if obs_client.is_secure is True:
          endpoint = "https://" + obs_client.server
        else:
          endpoint = "http://" + obs_client.server
        return make_batch_carbon_reader(dataset_url,
                                        key=obs_client.securityProvider.access_key_id,
                                        secret=obs_client.securityProvider.secret_access_key,
                                        endpoint=endpoint,
                                        proxy=obs_client.proxy_host,
                                        proxy_port=obs_client.proxy_port,
                                        schema_fields=schema_fields,
                                        reader_pool_type=reader_pool_type,
                                        workers_count=workers_count,
                                        results_queue_size=results_queue_size,
                                        shuffle_blocklets=shuffle,
                                        num_epochs=num_epochs,
                                        cache_type=cache_properties['cache_type'],
                                        cache_location=cache_properties['cache_location'],
                                        cache_size_limit=cache_properties['cache_size_limit'],
                                        cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                        cache_extra_settings=cache_properties['cache_extra_settings'],
                                        **properties)
      elif obs_client is None:
        return make_batch_carbon_reader(dataset_url,
                                        schema_fields=schema_fields,
                                        reader_pool_type=reader_pool_type,
                                        workers_count=workers_count,
                                        results_queue_size=results_queue_size,
                                        shuffle_blocklets=shuffle,
                                        num_epochs=num_epochs,
                                        cache_type=cache_properties['cache_type'],
                                        cache_location=cache_properties['cache_location'],
                                        cache_size_limit=cache_properties['cache_size_limit'],
                                        cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                        cache_extra_settings=cache_properties['cache_extra_settings'],
                                        **properties)
      else:
        raise ValueError("""obs_client should be a ObsClient object or None""")
    else:
      raise NotImplementedError("""not support other data format datset""")

  elif is_batch is False:
    if data_format == 'carbon':
      if isinstance(obs_client, ObsClient):
        if obs_client.is_secure is True:
          endpoint = "https://" + obs_client.server
        else:
          endpoint = "http://" + obs_client.server
        return make_carbon_reader(dataset_url,
                                  key=obs_client.securityProvider.access_key_id,
                                  secret=obs_client.securityProvider.secret_access_key,
                                  endpoint=endpoint,
                                  proxy=obs_client.proxy_host,
                                  proxy_port=obs_client.proxy_port,
                                  schema_fields=schema_fields,
                                  reader_pool_type=reader_pool_type,
                                  workers_count=workers_count,
                                  results_queue_size=results_queue_size,
                                  shuffle_blocklets=shuffle,
                                  num_epochs=num_epochs,
                                  cache_type=cache_properties['cache_type'],
                                  cache_location=cache_properties['cache_location'],
                                  cache_size_limit=cache_properties['cache_size_limit'],
                                  cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                  cache_extra_settings=cache_properties['cache_extra_settings'],
                                  **properties)
      elif obs_client is None:
        return make_carbon_reader(dataset_url,
                                  schema_fields=schema_fields,
                                  reader_pool_type=reader_pool_type,
                                  workers_count=workers_count,
                                  results_queue_size=results_queue_size,
                                  shuffle_blocklets=shuffle,
                                  num_epochs=num_epochs,
                                  cache_type=cache_properties['cache_type'],
                                  cache_location=cache_properties['cache_location'],
                                  cache_size_limit=cache_properties['cache_size_limit'],
                                  cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                  cache_extra_settings=cache_properties['cache_extra_settings'],
                                  **properties)
      else:
        raise ValueError("""obs_client should be a ObsClient object or None""")
    else:
      raise NotImplementedError("""not support other data format datset""")

  else:
    raise ValueError("""the value of is_batch is invalid, it should be set True or False""")
Example #25
if pytest.config.getoption("--pyspark-python") is not None and \
        pytest.config.getoption("--pyspark-driver-python") is not None:
    os.environ['PYSPARK_PYTHON'] = pytest.config.getoption("--pyspark-python")
    os.environ['PYSPARK_DRIVER_PYTHON'] = pytest.config.getoption(
        "--pyspark-driver-python")
elif 'PYSPARK_PYTHON' in os.environ.keys(
) and 'PYSPARK_DRIVER_PYTHON' in os.environ.keys():
    pass
else:
    raise ValueError(
        "please set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON variables, "
        "using cmd line "
        "--pyspark-python=PYSPARK_PYTHON_PATH --pyspark-driver-python=PYSPARK_DRIVER_PYTHON_PATH "
        "or set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON in system env")

# pylint: disable=unnecessary-lambda
READER_FACTORIES = [
    make_carbon_reader,
    lambda url, **kwargs: make_carbon_reader(url, **kwargs),
    make_batch_carbon_reader,
    lambda url, **kwargs: make_batch_carbon_reader(url, **kwargs),
]


@pytest.mark.parametrize('reader_factory', READER_FACTORIES)
def test_dataset_url_must_be_string(reader_factory):
    with pytest.raises(ValueError):
        reader_factory(None)

    with pytest.raises(ValueError):
        reader_factory(123)

    with pytest.raises(ValueError):
        reader_factory([])
Example #26
def test_invalid_reader_engine(carbon_synthetic_dataset):
    with pytest.raises(ValueError, match='Supported reader_engine values'):
        make_carbon_reader(carbon_synthetic_dataset.url,
                           reader_engine='bogus reader engine')
Example #27
def test_reader_engine_v2_not_supported(carbon_synthetic_dataset):
    with pytest.raises(NotImplementedError):
        make_carbon_reader(carbon_synthetic_dataset.url,
                           reader_engine='experimental_reader_v2')
Example #28
def test_diagnostics_reader_v1(carbon_synthetic_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url) as reader:
        next(reader)
        diags = reader.diagnostics
        # Hard to make a meaningful assert on the content of the diags without potentially introducing a race
        assert 'output_queue_size' in diags
Example #29
def test_make_carbon_reader_fails_loading_non_unischema_dataset(carbon_many_columns_non_unischema_dataset):
  with pytest.raises(RuntimeError, match='use make_batch_carbon_reader'):
    make_carbon_reader(carbon_many_columns_non_unischema_dataset.url)
Example #30
if pytest.config.getoption("--pyspark-python") is not None and \
    pytest.config.getoption("--pyspark-driver-python") is not None:
  os.environ['PYSPARK_PYTHON'] = pytest.config.getoption("--pyspark-python")
  os.environ['PYSPARK_DRIVER_PYTHON'] = pytest.config.getoption("--pyspark-driver-python")
elif 'PYSPARK_PYTHON' in os.environ.keys() and 'PYSPARK_DRIVER_PYTHON' in os.environ.keys():
  pass
else:
  raise ValueError("please set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON variables, "
                   "using cmd line "
                   "--pyspark-python=PYSPARK_PYTHON_PATH --pyspark-driver-python=PYSPARK_DRIVER_PYTHON_PATH "
                   "or set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON in system env")

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = [
  lambda url, **kwargs: make_carbon_reader(url, reader_pool_type='thread', **kwargs),
]

SCALAR_FIELDS = [f for f in TestSchema.fields.values() if isinstance(f.codec, ScalarCodec)]

SCALAR_ONLY_READER_FACTORIES = [
  lambda url, **kwargs: make_batch_carbon_reader(url, reader_pool_type='thread', **kwargs),
]


def _check_simple_reader(reader, expected_data, expected_rows_count=None, check_types=True, limit_checked_rows=None):
  # Read a bunch of entries from the dataset and compare the data to reference
  def _type(v):
    if isinstance(v, np.ndarray):
      if v.dtype.str.startswith('|S'):
        return '|S'