def test_predicate_on_dataset(tmpdir):
  TestSchema = Unischema('TestSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
    UnischemaField('test_field', np.int32, (), ScalarCodec(IntegerType()), False),
  ])

  def test_row_generator(x):
    """Returns a single entry in the generated dataset."""
    return {'id': x,
            'test_field': x * x}

  blocklet_size_mb = 256
  dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

  spark = SparkSession.builder.config('spark.driver.memory', '2g').master('local[2]').getOrCreate()
  sc = spark.sparkContext

  rows_count = 10
  with materialize_dataset_carbon(spark, dataset_url, TestSchema, blocklet_size_mb):
    rows_rdd = sc.parallelize(range(rows_count)) \
      .map(test_row_generator) \
      .map(lambda x: dict_to_spark_row(TestSchema, x))

    spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
      .write \
      .save(path=dataset_url, format='carbon')

  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
    assert next(reader).id == 3
  with make_carbon_reader(dataset_url, predicate=in_lambda(['id'], lambda x: x == '3')) as reader:
    with pytest.raises(StopIteration):
      # Predicate should have selected none, so a StopIteration should be raised.
      next(reader)
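# Minimal sketch of the in_lambda contract used throughout these examples,
# assuming the standard petastorm.predicates import path. The first argument
# names the fields the predicate needs; the second is a callable that receives
# those fields as keyword arguments. Readers evaluate the resulting predicate
# through its do_include() method (used directly in the examples below) and
# yield only the rows for which it returns a truthy value.
from petastorm.predicates import in_lambda

even_id_predicate = in_lambda(['id'], lambda id: id % 2 == 0)
assert even_id_predicate.do_include({'id': 4})
assert not even_id_predicate.do_include({'id': 3})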
def test_some_processing_functions(synthetic_dataset, reader_factory):
    """Try several ``tf.data.Dataset`` dataset operations on make_petastorm_dataset"""

    # reader1 will have a single row with id=1, reader2: a single row with id=2

    # Using functools.partial(operator.eq, 1), which is equivalent to lambda x: x == 1, because standard
    # python pickle cannot pickle lambdas
    with reader_factory(synthetic_dataset.url,
                        predicate=in_lambda(['id'],
                                            functools.partial(operator.eq,
                                                              1))) as reader1:
        with reader_factory(synthetic_dataset.url,
                            predicate=in_lambda(['id'],
                                                functools.partial(
                                                    operator.eq,
                                                    2))) as reader2:
            dataset = make_petastorm_dataset(reader1) \
                .prefetch(10) \
                .concatenate(make_petastorm_dataset(reader2)) \
                .map(lambda x: x.id) \
                .batch(2)

            next_sample = dataset.make_one_shot_iterator().get_next()

            with tf.Session() as sess:
                # 'actual' is expected to be content of id column of a concatenated dataset
                actual = sess.run(next_sample)
                np.testing.assert_array_equal(actual, [1, 2])
def test_invalid_batch_carbon_reader_predicate_parameters(carbon_scalar_dataset):
  with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                cache_type="memory-cache",
                                predicate=in_lambda(['id2'], lambda id2: True)) as reader:
    with pytest.raises(RuntimeError):
      next(reader)

  with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                predicate=in_lambda([], lambda x: False)) as reader:
    with pytest.raises(ValueError):
      next(reader)

  with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                predicate=in_lambda(['not_exist_col'], lambda x: False)) as reader:
    with pytest.raises(ValueError):
      next(reader)
    def read(self,
             table: DataFrameMetadata,
             columns: List[str] = None,
             predicate_func=None) -> Iterator[Batch]:
        """
        Reads the table and return a batch iterator for the
        tuples that passes the predicate func.

        Argument:
            table: table metadata object to write into
            columns List[str]: A list of column names to be
                considered in predicate_func
            predicate_func: customized predicate function returns bool

        Return:
            Iterator of Batch read.
        """
        predicate = None
        if predicate_func and columns:
            predicate = in_lambda(columns, predicate_func)

        # ToDo: Handle the sharding logic. We might have to maintain a
        # context for deciding which shard to read
        petastorm_reader = PetastormReader(self._spark_url(table),
                                           predicate=predicate)
        for batch in petastorm_reader.read():
            yield batch
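# Hypothetical call pattern for the read() method above; the engine and table
# objects are placeholders, not part of the original snippet. Only rows for
# which predicate_func returns True over the listed columns end up in the
# yielded batches.
#
#   for batch in storage_engine.read(table_metadata,
#                                    columns=['id'],
#                                    predicate_func=lambda id: id % 2 == 0):
#       process(batch)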
def test_custom_function(all_values):
    for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
        test_predicate = in_lambda(
            ['volume_guids'],
            lambda volume_guids, val=value: val in volume_guids)
        included = test_predicate.do_include({'volume_guids': all_values})
        assert included == (value in all_values)
def test_real_reader(synthetic_dataset):
    readers = [make_reader(synthetic_dataset.url, predicate=in_lambda(['id'], lambda id: id % 2 == 0), num_epochs=None,
                           reader_pool_type='dummy'),
               make_reader(synthetic_dataset.url, predicate=in_lambda(['id'], lambda id: id % 2 == 1), num_epochs=None,
                           reader_pool_type='dummy')]
    results = [0, 0]
    num_of_reads = 300
    with WeightedSamplingReader(readers, [0.5, 0.5]) as mixer:
        # Piggyback on this test to verify container interface of the WeightedSamplingReader
        for i, sample in enumerate(mixer):
            next_id = sample.id % 2
            results[next_id] += 1
            if i >= num_of_reads:
                break

    np.testing.assert_allclose(results, [num_of_reads * 0.5, num_of_reads * 0.5], atol=num_of_reads / 10)
def test_predicate_on_partitioned_dataset(tmpdir):
    """
    Generates a partitioned dataset and ensures that readers evaluate the type of the partition
    column according to the type given in the Unischema.
    """
    TestSchema = Unischema('TestSchema', [
        UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('id2', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('test_field', np.int32,
                       (), ScalarCodec(IntegerType()), False),
    ])

    def test_row_generator(x):
        """Returns a single entry in the generated dataset."""
        return {'id': x, 'id2': x + 1, 'test_field': x * x}

    rowgroup_size_mb = 256
    dataset_url = "file://{0}/partitioned_test_dataset".format(tmpdir)

    spark = SparkSession.builder.config('spark.driver.memory',
                                        '2g').master('local[2]').getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    with materialize_dataset(spark, dataset_url, TestSchema, rowgroup_size_mb):

        rows_rdd = sc.parallelize(range(rows_count))\
            .map(test_row_generator)\
            .map(lambda x: dict_to_spark_row(TestSchema, x))

        spark.createDataFrame(rows_rdd, TestSchema.as_spark_schema()) \
            .write \
            .partitionBy('id', 'id2') \
            .parquet(dataset_url)

    with make_reader(dataset_url,
                     predicate=in_lambda(['id'], lambda x: x == 3)) as reader:
        assert next(reader).id == 3
    with make_reader(dataset_url,
                     predicate=in_lambda(['id2'], lambda x: x == 5)) as reader:
        assert next(reader).id2 == 5
    with make_reader(dataset_url,
                     predicate=in_lambda(['id'],
                                         lambda x: x == '3')) as reader:
        with pytest.raises(StopIteration):
            # Predicate should have selected none, so a StopIteration should be raised.
            next(reader)
    def test_custom_function(self):
        for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
            test_predicate = in_lambda(
                ['volume_guids'],
                lambda volume_guids, val=value: val in volume_guids)
            included = test_predicate.do_include(
                {'volume_guids': PredicatesTest.all_values})
            self.assertEqual(included, value in PredicatesTest.all_values)
def test_predicate_on_single_column(carbon_synthetic_dataset):
  reader = make_carbon_reader(carbon_synthetic_dataset.url,
                              schema_fields=[TestSchema.id2],
                              predicate=in_lambda(['id2'], lambda id2: True),
                              reader_pool_type='thread')
  counter = 0
  for row in reader:
    counter += 1
    actual = dict(row._asdict())
    assert actual['id2'] < 2
  assert counter == len(carbon_synthetic_dataset.data)
def test_custom_function_with_state(all_values):
    counter = [0]

    def pred_func(volume_guids, cntr):
        cntr[0] += 1
        return volume_guids in all_values

    test_predicate = in_lambda(['volume_guids'], pred_func, counter)
    for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
        included = test_predicate.do_include({'volume_guids': value})
        assert included == (value in all_values)
    assert counter[0] == 5
    def test_custom_function_with_state(self):
        counter = [0]

        def pred_func(volume_guids, cntr):
            cntr[0] += 1
            return volume_guids in PredicatesTest.all_values

        test_predicate = in_lambda(['volume_guids'], pred_func, counter)
        for value in ['guid_2', 'guid_1', 'guid_5', 'guid_XXX', 'guid_XX']:
            included = test_predicate.do_include({'volume_guids': value})
            self.assertEqual(included, value in PredicatesTest.all_values)
        self.assertEqual(counter[0], 5)
def test_transform_function_with_predicate_batched(scalar_dataset):
    def double_float64(sample):
        assert all(sample['id'] % 2 == 0)
        sample['float64'] *= 2
        return sample

    with make_batch_reader(scalar_dataset.url, transform_spec=TransformSpec(double_float64),
                           predicate=in_lambda(['id'], lambda id: id % 2 == 0)) as reader:
        actual = next(reader)
        for actual_id, actual_float64 in zip(actual.id, actual.float64):
            assert actual_id % 2 == 0
            original_sample = next(d for d in scalar_dataset.data if d['id'] == actual_id)
            expected_matrix = original_sample['float64'] * 2
            np.testing.assert_equal(expected_matrix, actual_float64)
def test_transform_function_with_predicate(synthetic_dataset, reader_factory):
    """Make sure we apply transform only after we apply the predicate"""

    with reader_factory(
            synthetic_dataset.url,
            schema_fields=[TestSchema.id, TestSchema.id2],
            predicate=in_lambda(['id2'], lambda id2: id2 == 1),
            transform_spec=TransformSpec(removed_fields=['id2'])) as reader:
        rows = list(reader)
        assert 'id2' not in rows[0]._fields
        actual_ids = np.asarray(list(row.id for row in rows))
        assert actual_ids.size > 0
        # In the test data id2 = id % 2, which means we expect only odd ids to remain after
        # we apply lambda id2: id2 == 1 predicate.
        assert np.all(actual_ids % 2 == 1)
def test_transform_function_returns_a_new_dict_with_predicate(
        synthetic_dataset, reader_factory):
    def transform(sample):
        return {'id': sample['id'], 'id2': -1}

    with reader_factory(
            synthetic_dataset.url,
            schema_fields=[TestSchema.id, TestSchema.id2],
            predicate=in_lambda(['id2'], lambda id2: id2 == 1),
            transform_spec=TransformSpec(func=transform)) as reader:
        rows = list(reader)
        actual_ids = np.asarray(list(row.id for row in rows))
        assert actual_ids.size > 0
        # In the test data id2 = id % 2, which means we expect only odd ids to remain after
        # we apply lambda id2: id2 == 1 predicate.
        assert np.all(actual_ids % 2 == 1)

        transformed_ids = np.asarray(list(row.id2 for row in rows))
        assert np.all(transformed_ids == -1)