def test_should_fail_if_reading_out_of_context_manager(synthetic_dataset):
    with make_reader(synthetic_dataset.url, workers_count=1) as reader:
        next(reader)

    with pytest.raises(RuntimeError, match='Trying to read a sample.*'):
        next(reader)
def test_simple_read_with_pyarrow_serialize(synthetic_dataset):
    """Same as test_simple_read, but don't check type correctness as pyarrow_serialize messes up integer types"""
    with make_reader(synthetic_dataset.url, reader_pool_type='process', workers_count=1,
                     pyarrow_serialize=True) as reader:
        _check_simple_reader(reader, synthetic_dataset.data, check_types=False)
from pyspark.sql.types import LongType, ShortType, StringType

from petastorm import make_reader
from petastorm.codecs import ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.reader import ReaderV2
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.selectors import SingleIndexSelector
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tests.test_end_to_end_predicates_impl import \
    PartitionKeyInSetPredicate, EqualPredicate
from petastorm.unischema import UnischemaField, Unischema

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_engine='experimental_reader_v2', **kwargs),
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', pyarrow_serialize=False, **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', workers_count=1,
                                      pyarrow_serialize=True, **kwargs),
import numpy as np

from petastorm import make_reader, TransformSpec, make_batch_reader
from petastorm.pytorch import _sanitize_pytorch_types, DataLoader, BatchedDataLoader, decimal_friendly_collate
from petastorm.tests.test_common import TestSchema

ALL_DATA_LOADERS = [DataLoader, BatchedDataLoader]

BATCHABLE_FIELDS = set(TestSchema.fields.values()) - \
    {TestSchema.matrix_nullable, TestSchema.string_array_nullable,
     TestSchema.matrix_string, TestSchema.empty_matrix_string, TestSchema.integer_nullable}

TORCH_BATCHABLE_FIELDS = BATCHABLE_FIELDS - \
    {TestSchema.decimal, TestSchema.partition_key}

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', workers_count=1, **kwargs),
]


def _check_simple_reader(loader, expected_data, expected_fields):
    # Read a bunch of entries from the dataset and compare the data to reference
    def _type(v):
        return v.dtype if isinstance(v, np.ndarray) else type(v)
def reader_throughput(dataset_url, field_regex=None, warmup_cycles_count=300, measure_cycles_count=1000,
                      pool_type=WorkerPoolType.THREAD, loaders_count=3, profile_threads=False,
                      read_method=ReadMethod.PYTHON, shuffling_queue_size=500, min_after_dequeue=400,
                      reader_extra_args=None, spawn_new_process=True):
    """Constructs a Reader instance and uses it to perform throughput measurements.

    The function will spawn a new process if ``spawn_new_process`` is set. This is needed to make memory footprint
    measurements accurate.

    :param dataset_url: A url of the dataset to be used for measurements.
    :param field_regex: A list of regular expressions. Only fields that match one of the regex patterns will be
      used during the benchmark.
    :param warmup_cycles_count: Number of warmup cycles. During warmup cycles no measurements are recorded.
    :param measure_cycles_count: Number of measurement cycles. Only time elapsed during measurement cycles is used
      in throughput calculations.
    :param pool_type: :class:`WorkerPoolType` enum value.
    :param loaders_count: Number of worker threads (the same thread is used for IO and decoding).
    :param profile_threads: Enables profiling threads. Will print result when thread pool is shut down.
    :param read_method: An enum :class:`ReadMethod` that defines whether a :class:`petastorm.reader.Reader` will be
      used.
    :param shuffling_queue_size: Maximum number of elements in the shuffling queue.
    :param min_after_dequeue: Minimum number of elements in the shuffling queue before entries can be read from it.
    :param reader_extra_args: Extra arguments that would be passed to the Reader constructor.
    :param spawn_new_process: This function will respawn itself in a new process if the argument is True. Spawning
      a new process is needed to get an accurate memory footprint.

    :return: An instance of the ``BenchmarkResult`` namedtuple with the results of the benchmark. The namedtuple has
      the following fields: `time_mean`, `samples_per_second`, `memory_info` and `cpu`.
    """
    if not reader_extra_args:
        reader_extra_args = dict()

    if spawn_new_process:
        args = copy.deepcopy(locals())
        args['spawn_new_process'] = False
        executor = ProcessPoolExecutor(1)
        future = executor.submit(reader_throughput, **args)
        return future.result()

    logger.info('Arguments: %s', locals())

    if 'schema_fields' not in reader_extra_args:
        unischema_fields = match_unischema_fields(get_schema_from_dataset_url(dataset_url), field_regex)
        reader_extra_args['schema_fields'] = unischema_fields

    logger.info('Fields used in the benchmark: %s', str(reader_extra_args['schema_fields']))

    with make_reader(dataset_url,
                     num_epochs=None,
                     reader_pool_type=str(pool_type), workers_count=loaders_count,
                     **reader_extra_args) as reader:

        if read_method == ReadMethod.PYTHON:
            result = _time_warmup_and_work(reader, warmup_cycles_count, measure_cycles_count)
        elif read_method == ReadMethod.TF:
            result = _time_warmup_and_work_tf(reader, warmup_cycles_count, measure_cycles_count,
                                              shuffling_queue_size, min_after_dequeue)
        else:
            raise RuntimeError('Unexpected read_method value: %s' % str(read_method))

    return result
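# A minimal usage sketch for reader_throughput() defined above. The dataset path and all numeric
# values are illustrative placeholders, not settings from the original benchmark module.
if __name__ == '__main__':
    benchmark_result = reader_throughput('file:///tmp/some_petastorm_dataset',
                                         warmup_cycles_count=10,
                                         measure_cycles_count=100,
                                         loaders_count=2,
                                         spawn_new_process=False)
    # time_mean and samples_per_second are fields of the returned BenchmarkResult namedtuple
    print('Mean cycle time: {}, samples/sec: {}'.format(benchmark_result.time_mean,
                                                        benchmark_result.samples_per_second))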
def test_diagnostics_reader_v1(synthetic_dataset):
    with make_reader(synthetic_dataset.url) as reader:
        next(reader)
        diags = reader.diagnostics
        # Hard to make a meaningful assert on the content of the diags without potentially introducing a race
        assert 'output_queue_size' in diags
import numpy as np
import pytest
import six
import tensorflow as tf

from petastorm import make_reader, make_batch_reader
from petastorm.ngram import NGram
from petastorm.predicates import in_lambda
from petastorm.tests.test_common import TestSchema
from petastorm.tf_utils import make_petastorm_dataset

_EXCLUDE_FIELDS = set(TestSchema.fields.values()) \
    - {TestSchema.matrix_nullable, TestSchema.string_array_nullable, TestSchema.decimal}

MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, **_merge_params({'reader_pool_type': 'dummy',
                                                            'schema_fields': _EXCLUDE_FIELDS}, kwargs)),
]

ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, **_merge_params({'reader_pool_type': 'thread', 'workers_count': 1,
                                                            'schema_fields': _EXCLUDE_FIELDS}, kwargs)),
    lambda url, **kwargs: make_reader(url, **_merge_params({'reader_pool_type': 'process', 'workers_count': 1,
                                                            'schema_fields': _EXCLUDE_FIELDS}, kwargs)),
]


def _merge_params(base, overwrite):
    """Merges two dictionaries so that values from ``overwrite`` take precedence over values from the ``base``
    dictionary. Neither input dictionary is modified.
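# A minimal sketch of a merge helper matching the (truncated) _merge_params docstring above. This
# illustrates the documented contract only; it is not necessarily the implementation used in the tests.
def _merge_params_sketch(base, overwrite):
    merged = dict(base)       # copy, so neither input dictionary is modified
    merged.update(overwrite)  # values from ``overwrite`` win on key collisions
    return merged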
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='Petastorm MNIST Example')
    default_dataset_url = 'file://{}'.format(DEFAULT_MNIST_DATA_PATH)
    parser.add_argument('--dataset-url', type=str,
                        default=default_dataset_url, metavar='S',
                        help='hdfs:// or file:/// URL to the MNIST petastorm dataset '
                             '(default: %s)' % default_dataset_url)
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--all-epochs', action='store_true', default=False,
                        help='train all epochs before testing accuracy/loss')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device('cuda' if use_cuda else 'cpu')

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # Configure loop and Reader epoch for illustrative purposes.
    # Typical training usage would use the `all_epochs` approach.
    #
    if args.all_epochs:
        # Run training across all the epochs before testing for accuracy
        loop_epochs = 1
        reader_epochs = args.epochs
    else:
        # Test training accuracy after each epoch
        loop_epochs = args.epochs
        reader_epochs = 1

    transform = TransformSpec(_transform_row, removed_fields=['idx'])

    # Instantiate each petastorm Reader with a single thread, shuffle enabled, and appropriate epoch setting
    for epoch in range(1, loop_epochs + 1):
        with DataLoader(make_reader('{}/train'.format(args.dataset_url), num_epochs=reader_epochs,
                                    transform_spec=transform),
                        batch_size=args.batch_size) as train_loader:
            train(model, device, train_loader, args.log_interval, optimizer, epoch)
        with DataLoader(make_reader('{}/test'.format(args.dataset_url), num_epochs=reader_epochs,
                                    transform_spec=transform),
                        batch_size=args.test_batch_size) as test_loader:
            test(model, device, test_loader)
def test_predicate_on_partition_filters_out_everything(synthetic_dataset, reader_factory):
    with pytest.warns(UserWarning, match='No matching data is available for loading'):
        # This predicate should filter out all rowgroups. A user warning should be emitted in that case.
        make_reader(synthetic_dataset.url, reader_pool_type='dummy',
                    predicate=PartitionKeyInSetPredicate({'non existing value'}))
def main():
    parser = argparse.ArgumentParser(description='Petastorm/Sagemaker/Tensorflow MNIST Example')

    # Data, model, and output directories.
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ.get('SM_HOSTS')))
    parser.add_argument('--current-host', type=str, default=os.environ.get('SM_CURRENT_HOST'))
    parser.add_argument('--dataset-url', type=str, metavar='S',
                        help='S3:// URL to the MNIST petastorm dataset')
    parser.add_argument('--training_steps', type=int, default=300)
    parser.add_argument('--evaluation_steps', type=int, default=10)
    parser.add_argument('--log_step_count_steps', type=int, default=100)
    parser.add_argument('--save_checkpoints_steps', type=int, default=500)
    parser.add_argument('--save_summary_steps', type=int, default=50)
    parser.add_argument('--throttle_secs', type=int, default=10)
    parser.add_argument('--prefetch_size', type=int, default=16)
    parser.add_argument('--num_parallel_batches', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=256)

    args = parser.parse_args()

    tf.logging.set_verbosity(tf.logging.DEBUG)

    # TF 1.13 and 1.14 handle logging a bit differently, so wrap the logging setup in a try/except block
    try:
        tf_logger = tf_logging._get_logger()
        handler = tf_logger.handlers[0]
        handler.setFormatter(_logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    except Exception:
        pass

    # In 1.14, multi-worker synchronous training can be achieved using CollectiveAllReduceStrategy, per
    # https://github.com/tensorflow/tensorflow/issues/23664
    # Without providing train_distribute, I believe asynchronous training is done
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=args.save_checkpoints_steps,
        log_step_count_steps=args.log_step_count_steps,
        save_summary_steps=args.save_summary_steps,
    )

    model_dir_parent_path = args.model_dir[:-5]
    model_dir_parent = model_dir_parent_path.split("/")[-2]
    print(f"Launch tensorboard by running the following in terminal:\n"
          f"aws s3 sync {model_dir_parent_path} ~/Downloads/{model_dir_parent} && "
          f"tensorboard --logdir=~/Downloads/{model_dir_parent}")

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=args.model_dir,
                                       params={"batch_size": args.batch_size},
                                       config=run_config)

    workers = json.loads(os.environ['SM_HOSTS'])
    worker_index = workers.index(os.environ['SM_CURRENT_HOST'])
    nr_workers = len(workers)
    print(f"Inside training script on worker with (0-based) index {worker_index} out of {nr_workers - 1}.")

    with make_reader(os.path.join(args.dataset_url, 'train'), num_epochs=None,
                     cur_shard=worker_index, shard_count=nr_workers,
                     workers_count=nr_workers) as train_reader:
        with make_reader(os.path.join(args.dataset_url, 'test'), num_epochs=None,
                         cur_shard=0, shard_count=1) as eval_reader:
            train_fn = lambda: _input_fn(reader=train_reader,
                                         batch_size=args.batch_size,
                                         num_parallel_batches=args.num_parallel_batches)
            eval_fn = lambda: _input_fn(reader=eval_reader,
                                        batch_size=args.batch_size,
                                        num_parallel_batches=args.num_parallel_batches)

            train_spec = tf.estimator.TrainSpec(input_fn=train_fn,
                                                max_steps=args.training_steps)
            eval_spec = tf.estimator.EvalSpec(input_fn=eval_fn,
                                              throttle_secs=args.throttle_secs,
                                              steps=args.evaluation_steps)
            tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def pytorch_hello_world(dataset_url='file:///tmp/hello_world_dataset'):
    with DataLoader(make_reader(dataset_url)) as train_loader:
        sample = next(iter(train_loader))
        print(sample['id'])
def test_no_metadata(self):
    self.vanish_metadata()
    with self.assertRaises(RuntimeError) as e:
        make_reader(self._dataset_url, reader_pool_type='dummy')
    self.assertTrue('make_reader supports reading only Petastorm datasets' in str(e.exception))
    self.restore_metadata()
import numpy as np
import pytest
import tensorflow as tf
from tensorflow.python.framework.errors_impl import OutOfRangeError

from petastorm import make_reader
from petastorm.ngram import NGram
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.tests.conftest import SyntheticDataset, maybe_cached_dataset
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tf_utils import tf_tensors

# Tests in this module will run once for each entry in the READER_FACTORIES
# pylint: disable=unnecessary-lambda
READER_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', workers_count=1, **kwargs),
    lambda url, **kwargs: make_reader(url, reader_engine='experimental_reader_v2', reader_pool_type='dummy',
                                      reader_engine_params={'loader_pool': SameThreadExecutor()}, **kwargs),
]


@pytest.fixture(scope="session")
def dataset_num_files_1(request, tmpdir_factory):
    def _dataset_generator():
        path = tmpdir_factory.mktemp("data").strpath
def test_schema_mismatch(synthetic_dataset):
    readers = [make_reader(synthetic_dataset.url, schema_fields=['id'], workers_count=1),
               make_reader(synthetic_dataset.url, schema_fields=['image_png'], workers_count=1)]
    with pytest.raises(ValueError, match='.*should have the same schema.*'):
        WeightedSamplingReader(readers, [0.5, 0.5])
def train_and_test(dataset_url, training_iterations, batch_size, evaluation_interval):
    """
    Train a model for ``training_iterations`` iterations with a batch size of ``batch_size``, printing accuracy
    every ``evaluation_interval`` iterations.
    :param dataset_url: The MNIST dataset url.
    :param training_iterations: The training iterations to train for.
    :param batch_size: The batch size for training.
    :param evaluation_interval: The interval used to print the accuracy.
    :return:
    """
    with make_reader(os.path.join(dataset_url, 'train'), num_epochs=None) as train_reader:
        with make_reader(os.path.join(dataset_url, 'test'), num_epochs=None) as test_reader:
            train_readout = tf_tensors(train_reader)
            train_image = tf.cast(tf.reshape(train_readout.image, [784]), tf.float32)
            train_label = train_readout.digit
            batch_image, batch_label = tf.train.batch(
                [train_image, train_label], batch_size=batch_size
            )

            W = tf.Variable(tf.zeros([784, 10]))
            b = tf.Variable(tf.zeros([10]))
            y = tf.matmul(batch_image, W) + b

            # The raw formulation of cross-entropy,
            #
            #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)), reduction_indices=[1]))
            #
            # can be numerically unstable.
            #
            # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
            # outputs of 'y', and then average across the batch.
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=batch_label, logits=y)

            train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

            correct_prediction = tf.equal(tf.argmax(y, 1), batch_label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            test_readout = tf_tensors(test_reader)
            test_image = tf.cast(tf.reshape(test_readout.image, [784]), tf.float32)
            test_label = test_readout.digit
            test_batch_image, test_batch_label = tf.train.batch(
                [test_image, test_label], batch_size=batch_size
            )

            # Train
            print('Training model for {0} training iterations with batch size {1} and evaluation interval {2}'.format(
                training_iterations, batch_size, evaluation_interval
            ))
            with tf.Session() as sess:
                sess.run([
                    tf.local_variables_initializer(),
                    tf.global_variables_initializer(),
                ])
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    for i in range(training_iterations):
                        if coord.should_stop():
                            break

                        sess.run(train_step)

                        if (i % evaluation_interval) == 0 or i == (training_iterations - 1):
                            feed_batch_image, feed_batch_label = sess.run([test_batch_image, test_batch_label])
                            print('After {0} training iterations, the accuracy of the model is: {1:.2f}'.format(
                                i,
                                sess.run(accuracy, feed_dict={
                                    batch_image: feed_batch_image, batch_label: feed_batch_label
                                })))
                finally:
                    coord.request_stop()
                    coord.join(threads)
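# An invocation sketch for train_and_test() above. The dataset URL and the numbers are
# illustrative placeholders, not values taken from the original example.
if __name__ == '__main__':
    train_and_test(dataset_url='file:///tmp/mnist',
                   training_iterations=100,
                   batch_size=100,
                   evaluation_interval=10)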
def test_too_many_shards(synthetic_dataset, reader_factory):
    with pytest.raises(NoDataAvailableError, match='Number of row-groups in the dataset'):
        # If the number of shards is greater than the number of rowgroups, users might be surprised if a reader
        # does not produce any error, hence we raise an explicit exception
        make_reader(synthetic_dataset.url, reader_pool_type='dummy', cur_shard=0, shard_count=10000000)
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from time import sleep

import pyarrow.parquet as pq
import pytest

from petastorm import make_reader
from petastorm.reader import Reader

# pylint: disable=unnecessary-lambda
READER_FACTORIES = [
    make_reader,
    lambda url, **kwargs: make_reader(url, reader_engine='experimental_reader_v2', **kwargs),
]


@pytest.mark.parametrize('reader_factory', READER_FACTORIES)
def test_dataset_url_must_be_string(reader_factory):
    with pytest.raises(ValueError):
        reader_factory(None)

    with pytest.raises(ValueError):
        reader_factory(123)

    with pytest.raises(ValueError):
        reader_factory([])
def test_basic_pytorch_dataloader(synthetic_dataset):
    loader = DataLoader(make_reader(synthetic_dataset.url, reader_pool_type='dummy'), collate_fn=_noop_collate)
    for item in loader:
        assert len(item) == 1
def test_invalid_reader_engine(synthetic_dataset, reader_factory):
    with pytest.raises(ValueError, match='Supported reader_engine values'):
        make_reader(synthetic_dataset.url, reader_engine='bogus reader engine')
def test_pytorch_dataloader_context(synthetic_dataset):
    with DataLoader(make_reader(synthetic_dataset.url, reader_pool_type='dummy'), collate_fn=_noop_collate) as loader:
        for item in loader:
            assert len(item) == 1
# Must import pyarrow before torch. See: https://github.com/uber/petastorm/blob/master/docs/troubleshoot.rst
import pyarrow  # noqa: F401 pylint: disable=W0611
import torch

from concurrent.futures import ProcessPoolExecutor

from petastorm import make_reader
from petastorm.pytorch import _sanitize_pytorch_types, DataLoader, decimal_friendly_collate
from petastorm.reader import ReaderV2
from petastorm.tests.test_common import TestSchema

BATCHABLE_FIELDS = set(TestSchema.fields.values()) - \
    {TestSchema.matrix_nullable, TestSchema.string_array_nullable,
     TestSchema.matrix_string, TestSchema.empty_matrix_string}

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: ReaderV2(url, **kwargs)
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', pyarrow_serialize=False, **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', workers_count=1,
                                      pyarrow_serialize=True, **kwargs),
    lambda url, **kwargs: ReaderV2(url, decoder_pool=ProcessPoolExecutor(10), **kwargs)
def test_reader_engine_v2_with_transform_is_not_supported(synthetic_dataset, reader_factory):
    with pytest.raises(NotImplementedError):
        make_reader(synthetic_dataset.url,
                    reader_engine='experimental_reader_v2',
                    transform_spec=TransformSpec(lambda x: x))
def test_no_metadata(self):
    self.vanish_metadata()
    with self.assertRaises(PetastormMetadataError) as e:
        make_reader(self._dataset_url, reader_pool_type='dummy')
    self.assertTrue('Could not find _common_metadata file' in str(e.exception))
    self.restore_metadata()
def test_generate(petastorm_dataset):
    # Read from it using a plain reader
    with make_reader(petastorm_dataset.url) as reader:
        all_samples = list(reader)
    assert all_samples
from petastorm import make_reader, make_batch_reader, TransformSpec
from petastorm.codecs import ScalarCodec
from petastorm.etl.dataset_metadata import materialize_dataset
from petastorm.predicates import in_lambda
from petastorm.reader import ReaderV2
from petastorm.reader_impl.same_thread_executor import SameThreadExecutor
from petastorm.selectors import SingleIndexSelector
from petastorm.tests.test_common import create_test_dataset, TestSchema
from petastorm.tests.test_end_to_end_predicates_impl import \
    PartitionKeyInSetPredicate, EqualPredicate, VectorizedEqualPredicate
from petastorm.unischema import UnischemaField, Unischema

# pylint: disable=unnecessary-lambda
MINIMAL_READER_FLAVOR_FACTORIES = [
    lambda url, **kwargs: make_reader(url, reader_pool_type='dummy', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_engine='experimental_reader_v2', **kwargs),
]

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = MINIMAL_READER_FLAVOR_FACTORIES + [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', **kwargs),
    lambda url, **kwargs: make_reader(url, reader_pool_type='process', workers_count=2, **kwargs),
    lambda url, **kwargs: make_reader(url, workers_count=2, reader_engine='experimental_reader_v2', **kwargs),
]

SCALAR_FIELDS = [f for f in TestSchema.fields.values() if isinstance(f.codec, ScalarCodec)]

SCALAR_ONLY_READER_FACTORIES = [
def _check_reader(path, rowgroup_selector=None):
    # Just check that you can open and read from a reader successfully
    with make_reader('file://{}'.format(path), reader_pool_type='dummy',
                     rowgroup_selector=rowgroup_selector) as reader:
        [next(reader) for _ in range(10)]
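# A hedged example of passing a rowgroup selector to the helper above. It assumes the dataset at the
# given (hypothetical) path was materialized with a rowgroup index named 'id'; the index name and the
# values are placeholders chosen only for illustration.
from petastorm.selectors import SingleIndexSelector

_check_reader('/tmp/some_dataset', rowgroup_selector=SingleIndexSelector('id', [2, 18]))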
def test_make_reader_fails_loading_non_petastrom_dataset(scalar_dataset):
    with pytest.raises(RuntimeError, match='use make_batch_reader'):
        make_reader(scalar_dataset.url)
import tenacity
from pyarrow.hdfs import HadoopFileSystem

from petastorm import make_reader


# define how to decorate the open method
def retry_open(decorated_open, retry):
    def open(self, path, mode='rb', buffer_size=None, replication=None, default_block_size=None):
        print('opening {}'.format(path))
        return retry.call(decorated_open, self, path, mode=mode, buffer_size=buffer_size,
                          replication=replication, default_block_size=default_block_size)
    return open


# decorate open
retry = tenacity.Retrying()
HadoopFileSystem.open = retry_open(HadoopFileSystem.open, retry)

file = 'hdfs://ip-10-1-1-36.example.com/user/spark/petastorm_dataset.parquet'
with make_reader(file, hdfs_driver='libhdfs', pyarrow_serialize=True) as train_reader:
    pass
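# A variation on the retry wiring above: tenacity.Retrying can also be configured with an explicit
# stop condition and backoff instead of the defaults. The stop/wait values below are illustrative
# assumptions, not settings from the original snippet.
retry_with_backoff = tenacity.Retrying(stop=tenacity.stop_after_attempt(5),
                                       wait=tenacity.wait_exponential(multiplier=1, max=30))
HadoopFileSystem.open = retry_open(HadoopFileSystem.open, retry_with_backoff)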