def tensorflow_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
  # Example: tf_tensors will return tensors with dataset data
  with make_batch_carbon_reader(dataset_url) as reader:
    tensor = tf_tensors(reader)
    with tf.Session() as sess:
      # Because we are using make_batch_carbon_reader(), each read returns a batch of rows instead of a single row
      batched_sample = sess.run(tensor)
      print("id batch: {0}".format(batched_sample.id))

  with make_reader(dataset_url) as reader:
    tensor = make_tensor(reader)
    with tf.Session() as sess:
      # make_reader() defaults to batch mode, so here too each read returns a batch of rows instead of a single row
      batched_sample = sess.run(tensor)
      print("id batch: {0}".format(batched_sample.id))

  # Example: use tf.data.Dataset API
  with make_batch_carbon_reader(dataset_url) as reader:
    dataset = make_pycarbon_dataset(reader)
    iterator = dataset.make_one_shot_iterator()
    tensor = iterator.get_next()
    with tf.Session() as sess:
      batched_sample = sess.run(tensor)
      print("id batch: {0}".format(batched_sample.id))

  with make_reader(dataset_url) as reader:
    dataset = make_dataset(reader)
    iterator = dataset.make_one_shot_iterator()
    tensor = iterator.get_next()
    with tf.Session() as sess:
      batched_sample = sess.run(tensor)
      print("id batch: {0}".format(batched_sample.id))

def tensorflow_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset/'):
  # Example: tf_tensors will return tensors with dataset data
  with make_carbon_reader(dataset_url) as reader:
    tensor = tf_tensors(reader)
    with tf.Session() as sess:
      sample = sess.run(tensor)
      print(sample.id)

  with make_reader(dataset_url, is_batch=False) as reader:
    tensor = make_tensor(reader)
    with tf.Session() as sess:
      sample = sess.run(tensor)
      print(sample.id)

  # Example: use tf.data.Dataset API
  with make_carbon_reader(dataset_url) as reader:
    dataset = make_pycarbon_dataset(reader)
    iterator = dataset.make_one_shot_iterator()
    tensor = iterator.get_next()
    with tf.Session() as sess:
      sample = sess.run(tensor)
      print(sample.id)

  with make_reader(dataset_url, is_batch=False) as reader:
    dataset = make_dataset(reader)
    iterator = dataset.make_one_shot_iterator()
    tensor = iterator.get_next()
    with tf.Session() as sess:
      sample = sess.run(tensor)
      print(sample.id)

def pytorch_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset'):
  with DataLoader(make_reader(dataset_url, is_batch=False)) as train_loader:
    sample = next(iter(train_loader))
    print(sample['id'])

  with make_data_loader(make_reader(dataset_url, is_batch=False)) as train_loader:
    sample = next(iter(train_loader))
    print(sample['id'])

def test_generate(pycarbon_dataset):
  # Read from it using a plain reader
  with make_reader(pycarbon_dataset.url) as reader:
    all_samples = list(reader)
  assert all_samples

  with make_reader(pycarbon_dataset.url, is_batch=False) as reader:
    all_samples = list(reader)
  assert all_samples

def pytorch_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
  with DataLoader(make_reader(dataset_url)) as train_loader:
    sample = next(iter(train_loader))
    # make_reader() defaults to batch mode, so each read returns a batch of rows instead of a single row
    print("id batch: {0}".format(sample['id']))

  with make_data_loader(make_reader(dataset_url)) as train_loader:
    sample = next(iter(train_loader))
    # make_reader() defaults to batch mode, so each read returns a batch of rows instead of a single row
    print("id batch: {0}".format(sample['id']))

def python_hello_world(dataset_url='file:///tmp/carbon_pycarbon_dataset'):
  with make_reader(dataset_url) as reader:
    # Pure python
    for sample in reader:
      print(sample.id)

  with make_reader(dataset_url, is_batch=False) as reader:
    # Pure python
    for sample in reader:
      print(sample.id)

def test_unified_read_mnist_dataset(generate_mnist_dataset):
  # Verify both datasets via a reader
  for dset in SMALL_MOCK_IMAGE_COUNT.keys():
    with make_reader('file://{}/{}'.format(generate_mnist_dataset, dset),
                     is_batch=False,
                     reader_pool_type='thread',
                     num_epochs=1) as reader:
      assert sum(1 for _ in reader) == SMALL_MOCK_IMAGE_COUNT[dset]

def python_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
  # Reading data from a non-Pycarbon Carbon dataset via pure Python
  with make_reader(dataset_url, schema_fields=["id", "value1", "value2"]) as reader:
    for schema_view in reader:
      # make_reader() returns batches of rows instead of individual rows
      print("Batched read:\nid: {0} value1: {1} value2: {2}".format(
          schema_view.id, schema_view.value1, schema_view.value2))

def test_batch_carbon_reader(carbon_synthetic_dataset):
  with make_reader(carbon_synthetic_dataset.url, num_epochs=1) as reader:
    i = 0
    for sample in reader:
      for ele in sample.id:
        print(ele)
        i += 1
    assert i == _ROWS_COUNT

def test_make_reader(carbon_synthetic_dataset):
  with make_reader(carbon_synthetic_dataset.url, is_batch=False, num_epochs=1) as reader:
    i = 0
    for sample in reader:
      print(sample.id)
      i += 1
    assert i == _ROWS_COUNT

def test_generate(external_dataset):
  # Read from it using a plain reader
  with make_batch_carbon_reader(external_dataset.url) as reader:
    all_samples = list(reader)
  assert all_samples

  with make_reader(external_dataset.url) as reader:
    all_samples = list(reader)
  assert all_samples

def test_make_reader_of_obs(carbon_obs_dataset):
  with make_reader(carbon_obs_dataset.url,
                   key=pytest.config.getoption("--access_key"),
                   secret=pytest.config.getoption("--secret_key"),
                   endpoint=pytest.config.getoption("--end_point")) as reader:
    i = 0
    for sample in reader:
      i += len(sample.id)
    assert i == _ROWS_COUNT

def just_unified_read_batch(dataset_url=LOCAL_FILE_PREFIX + EXAMPLES_MANIFEST_PATH +
                            'binary1558365345315_record_exist.manifest'):
  for num_epochs in [1, 4, 8]:
    with make_reader(dataset_url, num_epochs=num_epochs) as train_reader:
      i = 0
      for schema_view in train_reader:
        for j in range(len(schema_view.name)):
          print(schema_view.name[j])
          i += 1
      print(i)
      assert 20 * num_epochs == i

def just_unified_read(dataset_url='file:///tmp/benchmark_dataset'):
  with make_reader(dataset_url,
                   is_batch=False,
                   num_epochs=1,
                   workers_count=16,
                   schema_fields=["id", "value1"]) as train_reader:
    i = 0
    for schema_view in train_reader:
      assert len(schema_view) == 2
      assert schema_view._fields == ('id', 'value1')
      i += 1
    assert i == ROW_COUNT
  return i

def just_unified_read_batch(dataset_url='file:///tmp/benchmark_external_dataset', num_epochs=1):
  with make_reader(dataset_url, num_epochs=num_epochs) as train_reader:
    i = 0
    start = time.time()
    for schema_view in train_reader:
      for j in range(len(schema_view.id)):
        print(schema_view.id[j])
        i += 1
        if i % ROW_COUNT == 0:
          end = time.time()
          print("time is " + str(end - start))
          start = end
    assert i == ROW_COUNT * num_epochs
  return i

def test_full_pytorch_example_unified(large_mock_mnist_data, tmpdir):
  # First, generate a mock dataset
  dataset_url = 'file://{}'.format(tmpdir)
  mnist_data_to_pycarbon_dataset(tmpdir, dataset_url,
                                 mnist_data=large_mock_mnist_data,
                                 spark_master='local[1]',
                                 carbon_files_count=1)

  # Next, run a round of training using the PyTorch adapting data loader
  from pycarbon.reader import make_data_loader

  torch.manual_seed(1)
  device = torch.device('cpu')
  model = pytorch_example.Net().to(device)
  optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
  transform = TransformSpec(pytorch_example._transform_row, removed_fields=['idx'])

  with make_data_loader(make_reader('{}/train'.format(dataset_url),
                                    is_batch=False,
                                    reader_pool_type='thread',
                                    num_epochs=1,
                                    transform_spec=transform),
                        batch_size=32) as train_loader:
    pytorch_example_unified.train(model, device, train_loader, 10, optimizer, 1)

  with make_data_loader(make_reader('{}/test'.format(dataset_url),
                                    is_batch=False,
                                    reader_pool_type='thread',
                                    num_epochs=1,
                                    transform_spec=transform),
                        batch_size=100) as test_loader:
    pytorch_example_unified.evaluation(model, device, test_loader)

def just_unified_read(dataset_url='file:///tmp/benchmark_dataset', num_epochs=1):
  with make_reader(dataset_url, is_batch=False, num_epochs=num_epochs) as train_reader:
    i = 0
    start = time.time()
    for schema_view in train_reader:
      print(schema_view.id)
      i += 1
      if i % ROW_COUNT == 0:
        end = time.time()
        print("time is " + str(end - start))
        start = end
    assert i == ROW_COUNT * num_epochs
  return i

def test_multithreaded_reads(carbon_synthetic_dataset):
  with make_reader(carbon_synthetic_dataset.url, is_batch=False, workers_count=5, num_epochs=1) as reader:
    with ThreadPoolExecutor(max_workers=10) as executor:

      def read_one_row():
        return next(reader)

      futures = [executor.submit(read_one_row) for _ in range(100)]
      results = [f.result() for f in futures]
      assert len(results) == len(carbon_synthetic_dataset.data)
      assert set(r.id for r in results) == set(d['id'] for d in carbon_synthetic_dataset.data)

def just_unified_read_batch(dataset_url='file:///tmp/benchmark_external_dataset'):
  properties = {
      "shuffle_row_drop_partitions": 5,
  }
  with make_reader(dataset_url, num_epochs=1, workers_count=10, **properties) as train_reader:
    result = list()
    i = 0
    for schema_view in train_reader:
      i += len(schema_view.id)
      for ele in schema_view.id:
        result.append(ele)
  return result

def just_unified_read_batch_obs(key=None, secret=None, endpoint=None,
                                bucketname='modelarts-carbon',
                                prefix='test/benchmark_external_dataset',
                                download_path='/tmp/download_unified/',
                                num_epochs=1):
  path = download_files_from_obs_concurrently(key, secret, endpoint,
                                              bucketname, prefix, download_path)
  with make_reader(path, num_epochs=num_epochs) as train_reader:
    i = 0
    for schema_view in train_reader:
      i += len(schema_view.id)
    assert i == ROW_COUNT * num_epochs
  return i

def just_unified_read_batch_obs(dataset_url, key, secret, endpoint):
  obs_client = ObsClient(
      access_key_id=key,
      secret_access_key=secret,
      server=endpoint
  )
  for num_epochs in [1, 4, 8]:
    with make_reader(dataset_url, obs_client=obs_client,
                     num_epochs=num_epochs, workers_count=16) as train_reader:
      i = 0
      for schema_view in train_reader:
        for j in range(len(schema_view.name)):
          print(schema_view.name[j])
          i += 1
      print(i)
      assert 20 * num_epochs == i

def just_unified_read(dataset_url='file:///tmp/benchmark_dataset'):
  result = list()
  properties = {
      "shuffle_row_drop_partitions": 5,
  }
  with make_reader(dataset_url, is_batch=False, num_epochs=1,
                   workers_count=10, **properties) as train_reader:
    i = 0
    for schema_view in train_reader:
      result.append(schema_view.id)
      i += 1
    print(i)
  return result

def __init__(self, dataset_path, data_name='data', label_name='softmax_label'):
  self.path = dataset_path
  self._provide_data = []
  self._provide_label = []

  # Open the dataset in row (non-batch) mode and pull one sample to infer
  # the data/label shapes and dtypes that MXNet's DataIter protocol requires.
  reader = make_reader(dataset_path, is_batch=False, num_epochs=1)
  self.iter = iter(reader)
  next_iter = next(self.iter)
  data = nd.array(next_iter.image).reshape(1, 1, 28, 28) / 255
  label = nd.array([next_iter.digit]).reshape(1, )
  self._provide_data = [mx.io.DataDesc(data_name, data.shape, data.dtype)]
  self._provide_label = [mx.io.DataDesc(label_name, label.shape, label.dtype)]

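# The __init__ above only sets up the reader and the DataDesc metadata; the rest
# of the mx.io.DataIter protocol is not shown. A minimal sketch of how the same
# class could consume the stashed iterator, assuming rows carry `image` and
# `digit` fields as in the MNIST examples (the method bodies are illustrative
# assumptions, not the project's code):
@property
def provide_data(self):
  return self._provide_data

@property
def provide_label(self):
  return self._provide_label

def next(self):
  # Wrap the next row in a DataBatch; StopIteration from the underlying
  # reader iterator ends the epoch, exactly as DataIter expects.
  sample = next(self.iter)
  data = nd.array(sample.image).reshape(1, 1, 28, 28) / 255
  label = nd.array([sample.digit]).reshape(1, )
  return mx.io.DataBatch([data], [label])
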
def just_unified_read(dataset_url='file:///tmp/benchmark_dataset'):
  values = [5]
  predicate = in_set(values, "id")
  properties = {
      "predicate": predicate,
  }
  with make_reader(dataset_url, is_batch=False, num_epochs=1,
                   workers_count=16, **properties) as train_reader:
    i = 0
    for schema_view in train_reader:
      assert schema_view.id == 5
      i += 1
    assert i == 1
  return i

def just_unified_read_batch_obs(dataset_url="s3a://modelarts-carbon/test/benchmark_external_dataset/",
                                key=None, secret=None, endpoint=None, num_epochs=1):
  obs_client = ObsClient(access_key_id=key, secret_access_key=secret,
                         server=endpoint, long_conn_mode=True)
  with make_reader(dataset_url, obs_client=obs_client,
                   num_epochs=num_epochs, workers_count=16) as train_reader:
    i = 0
    for schema_view in train_reader:
      i += len(schema_view.id)
    assert i == ROW_COUNT * num_epochs
  return i

def test_generate(dataset):
  # Read from it using a plain reader
  with make_reader(dataset.url) as reader:
    all_samples = list(reader)
  assert all_samples

def main():
  # Training settings
  parser = argparse.ArgumentParser(description='Pycarbon MNIST Example')
  default_dataset_url = 'file://{}'.format(DEFAULT_MNIST_DATA_PATH)
  parser.add_argument('--dataset-url', type=str,
                      default=default_dataset_url, metavar='S',
                      help='hdfs:// or file:/// URL to the MNIST pycarbon dataset '
                           '(default: %s)' % default_dataset_url)
  parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                      help='input batch size for training (default: 64)')
  parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                      help='input batch size for testing (default: 1000)')
  parser.add_argument('--epochs', type=int, default=10, metavar='N',
                      help='number of epochs to train (default: 10)')
  parser.add_argument('--all-epochs', action='store_true', default=False,
                      help='train all epochs before testing accuracy/loss')
  parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                      help='learning rate (default: 0.01)')
  parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                      help='SGD momentum (default: 0.5)')
  parser.add_argument('--no-cuda', action='store_true', default=False,
                      help='disables CUDA training')
  parser.add_argument('--seed', type=int, default=1, metavar='S',
                      help='random seed (default: 1)')
  parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                      help='how many batches to wait before logging training status')
  parser.add_argument('--carbon-sdk-path', type=str, default=DEFAULT_CARBONSDK_PATH,
                      help='carbon sdk path')
  args = parser.parse_args()

  use_cuda = not args.no_cuda and torch.cuda.is_available()
  jnius_config.set_classpath(args.carbon_sdk_path)
  torch.manual_seed(args.seed)
  device = torch.device('cuda' if use_cuda else 'cpu')

  model = Net().to(device)
  optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

  # Configure the loop and Reader epochs for illustrative purposes.
  # Typical training usage would use the `all_epochs` approach.
  if args.all_epochs:
    # Run training across all the epochs before testing for accuracy
    loop_epochs = 1
    reader_epochs = args.epochs
  else:
    # Test training accuracy after each epoch
    loop_epochs = args.epochs
    reader_epochs = 1

  transform = TransformSpec(_transform_row, removed_fields=['idx'])

  # Instantiate each pycarbon Reader with a single thread, shuffle enabled, and the appropriate epoch setting
  for epoch in range(1, loop_epochs + 1):
    with make_data_loader(make_reader('{}/train'.format(args.dataset_url),
                                      is_batch=False,
                                      num_epochs=reader_epochs,
                                      transform_spec=transform),
                          batch_size=args.batch_size) as train_loader:
      train(model, device, train_loader, args.log_interval, optimizer, epoch)
    with make_data_loader(make_reader('{}/test'.format(args.dataset_url),
                                      is_batch=False,
                                      num_epochs=reader_epochs,
                                      transform_spec=transform),
                          batch_size=args.test_batch_size) as test_loader:
      evaluation(model, device, test_loader)

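# `_transform_row` is referenced via TransformSpec above but not shown here.
# One plausible shape for it, assuming torchvision is available and MNIST rows
# carry `image`, `digit`, and `idx` fields (an illustrative sketch, not
# necessarily the project's exact implementation):
from torchvision import transforms

def _transform_row(mnist_row):
  # Reshape the stored (28, 28) ndarray to HWC, convert to a normalized tensor;
  # `digit` passes through unchanged, and `idx` is dropped by the TransformSpec's
  # removed_fields argument rather than here.
  transform = transforms.Compose([
      transforms.Lambda(lambda nd: nd.reshape(28, 28, 1)),
      transforms.ToTensor(),
      transforms.Normalize((0.1307,), (0.3081,)),
  ])
  return {'image': transform(mnist_row['image']),
          'digit': mnist_row['digit']}
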
baseline_run = None
for _ in range(RERUN_THE_TEST_COUNT):
  # TODO(yevgeni): factor out. Reading all ids appears multiple times in this test.
  with reader_factory(carbon_synthetic_dataset.url, shuffle_blocklets=False,
                      workers_count=1) as reader:
    this_run = _readout_all_ids(reader)
    if baseline_run:
      assert this_run == baseline_run
    baseline_run = this_run


@pytest.mark.parametrize('reader_factory', [
    lambda url, **kwargs: make_reader(url, reader_pool_type='thread', is_batch=False, **kwargs)
])
def test_multiple_epochs(carbon_synthetic_dataset, reader_factory):
  """Tests that multiple epochs works as expected"""
  num_epochs = 5
  with reader_factory(carbon_synthetic_dataset.url, num_epochs=num_epochs) as reader:
    # Read all expected entries from the dataset and compare the data to reference
    single_epoch_id_set = [d['id'] for d in carbon_synthetic_dataset.data]
    actual_ids_in_all_epochs = _readout_all_ids(reader)
    np.testing.assert_equal(sorted(actual_ids_in_all_epochs),
                            sorted(num_epochs * single_epoch_id_set))

    # Resetting the reader should reset the ventilator and produce another `num_epochs` results
    reader.reset()
    actual_ids_in_all_epochs = _readout_all_ids(reader)
    np.testing.assert_equal(sorted(actual_ids_in_all_epochs),
                            sorted(num_epochs * single_epoch_id_set))

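# `_readout_all_ids` is used above but never shown. A minimal sketch of a helper
# with the behavior these tests rely on, assuming non-batch readers yield
# namedtuple-style rows with an `id` field (a hypothetical reconstruction, not
# the project's code):
def _readout_all_ids(reader):
  # Drain the reader to the end of its configured epochs and collect every row's id
  return [row.id for row in reader]
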
def train_and_test(dataset_url, training_iterations, batch_size, evaluation_interval, start):
  """
  Train a model for training_iterations with a batch size batch_size, printing accuracy every evaluation_interval.
  :param dataset_url: The MNIST dataset url.
  :param training_iterations: The training iterations to train for.
  :param batch_size: The batch size for training.
  :param evaluation_interval: The interval used to print the accuracy.
  :return:
  """
  with make_reader(os.path.join(dataset_url, 'train'), num_epochs=None, is_batch=False) as train_reader:
    with make_reader(os.path.join(dataset_url, 'test'), num_epochs=None, is_batch=False) as test_reader:
      train_readout = tf_tensors(train_reader)
      train_image = tf.cast(tf.reshape(train_readout.image, [784]), tf.float32)
      train_label = train_readout.digit
      batch_image, batch_label = tf.train.batch([train_image, train_label], batch_size=batch_size)

      W = tf.Variable(tf.zeros([784, 10]))
      b = tf.Variable(tf.zeros([10]))
      y = tf.matmul(batch_image, W) + b

      # The raw formulation of cross-entropy,
      #
      #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)), reduction_indices=[1]))
      #
      # can be numerically unstable.
      #
      # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
      # outputs of 'y', and then average across the batch.
      cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=batch_label, logits=y)
      train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

      correct_prediction = tf.equal(tf.argmax(y, 1), batch_label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

      test_readout = tf_tensors(test_reader)
      test_image = tf.cast(tf.reshape(test_readout.image, [784]), tf.float32)
      test_label = test_readout.digit
      test_batch_image, test_batch_label = tf.train.batch([test_image, test_label], batch_size=batch_size)

      end = time.time()
      print("before train time: " + str(end - start))

      # Train
      print('Training model for {0} training iterations with batch size {1} and evaluation interval {2}'.format(
          training_iterations, batch_size, evaluation_interval))
      with tf.Session() as sess:
        sess.run([
            tf.local_variables_initializer(),
            tf.global_variables_initializer(),
        ])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
          for i in range(training_iterations):
            if coord.should_stop():
              break
            sess.run(train_step)
            if (i % evaluation_interval) == 0 or i == (training_iterations - 1):
              feed_batch_image, feed_batch_label = sess.run([test_batch_image, test_batch_label])
              print('After {0} training iterations, the accuracy of the model is: {1:.2f}'.format(
                  i, sess.run(accuracy, feed_dict={batch_image: feed_batch_image,
                                                   batch_label: feed_batch_label})))
        finally:
          coord.request_stop()
          coord.join(threads)

def train_and_test(dataset_url, num_epochs, batch_size, evaluation_interval):
  """
  Train a model for num_epochs epochs with a batch size batch_size, printing accuracy every evaluation_interval.
  :param dataset_url: The MNIST dataset url.
  :param num_epochs: The number of epochs to train for.
  :param batch_size: The batch size for training.
  :param evaluation_interval: The interval used to print the accuracy.
  :return:
  """
  with make_reader(os.path.join(dataset_url, 'train'), num_epochs=num_epochs) as train_reader:
    with make_reader(os.path.join(dataset_url, 'test'), num_epochs=num_epochs) as test_reader:
      # Create the model
      x = tf.placeholder(tf.float32, [None, 784])
      w = tf.Variable(tf.zeros([784, 10]))
      b = tf.Variable(tf.zeros([10]))
      y = tf.matmul(x, w) + b

      # Define loss and optimizer
      y_ = tf.placeholder(tf.int64, [None])

      # Define the loss function
      cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
      train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

      correct_prediction = tf.equal(tf.argmax(y, 1), y_)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

      train_dataset = make_dataset(train_reader) \
          .apply(tf.data.experimental.unbatch()) \
          .batch(batch_size) \
          .map(decode)
      train_iterator = train_dataset.make_one_shot_iterator()
      label, image = train_iterator.get_next()

      test_dataset = make_dataset(test_reader) \
          .apply(tf.data.experimental.unbatch()) \
          .batch(batch_size) \
          .map(decode)
      test_iterator = test_dataset.make_one_shot_iterator()
      test_label, test_image = test_iterator.get_next()

      # Train
      print('Training model for {0} epochs with batch size {1} and evaluation interval {2}'.format(
          num_epochs, batch_size, evaluation_interval))

      i = 0
      with tf.Session() as sess:
        sess.run([
            tf.local_variables_initializer(),
            tf.global_variables_initializer(),
        ])
        try:
          while True:
            cur_label, cur_image = sess.run([label, image])
            sess.run([train_step], feed_dict={x: cur_image, y_: cur_label})
            if i % evaluation_interval == 0:
              test_cur_label, test_cur_image = sess.run([test_label, test_image])
              print('After {0} training iterations, the accuracy of the model is: {1:.2f}'.format(
                  i, sess.run(accuracy, feed_dict={x: test_cur_image, y_: test_cur_label})))
            i += 1
        except tf.errors.OutOfRangeError:
          print('Finished! The total number of training iterations is ' + str(i))

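# `decode` is used in the .map() calls above but not shown here. A minimal
# sketch of what such a function could look like, assuming each dataset element
# carries `image` and `digit` fields as in the MNIST schema used by the other
# examples (an illustrative assumption, not necessarily the project's exact
# implementation):
def decode(carbon_record):
  # Split a batched record into the (label, image) pair that get_next() unpacks
  label = getattr(carbon_record, 'digit')
  image = getattr(carbon_record, 'image')
  return label, image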