Example #1
def test_manifest_correct():
  # local
  dataset_url = LOCAL_FILE_PREFIX + EXAMPLES_MANIFEST_PATH + "binary1558365345315_record_exist.manifest"
  for num_epochs in [1, 2, 4]:
    with make_batch_carbon_reader(dataset_url, num_epochs=num_epochs) as train_reader:
      i = 0
      for schema_view in train_reader:
        i += len(schema_view.name)

      print(i)
      assert 20 * num_epochs == i

  # obs
  key = pytest.config.getoption("--access_key")
  secret = pytest.config.getoption("--secret_key")
  endpoint = pytest.config.getoption("--end_point")

  dataset_url = "s3a://manifest/carbon/manifestcarbon/obsbinary1557717977531.manifest"

  for num_epochs in [1, 2, 4]:
    with make_batch_carbon_reader(dataset_url, key=key, secret=secret, endpoint=endpoint,
                                  num_epochs=num_epochs) as train_reader:
      i = 0
      for schema_view in train_reader:
        i += len(schema_view.name)

      print(i)
      assert 20 * num_epochs == i
Example #2
def tensorflow_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
    # Example: tf_tensors will return tensors with dataset data
    with make_batch_carbon_reader(dataset_url) as reader:
        tensor = tf_tensors(reader)
        with tf.Session() as sess:
            # Because we are using make_batch_carbon_reader(), each read returns a batch of rows instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    with make_reader(dataset_url) as reader:
        tensor = make_tensor(reader)
        with tf.Session() as sess:
            # Because we are using make_reader() (is_batch defaults to True), each read returns a batch of rows instead of a single row
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    # Example: use tf.data.Dataset API
    with make_batch_carbon_reader(dataset_url) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))

    with make_reader(dataset_url) as reader:
        dataset = make_dataset(reader)
        iterator = dataset.make_one_shot_iterator()
        tensor = iterator.get_next()
        with tf.Session() as sess:
            batched_sample = sess.run(tensor)
            print("id batch: {0}".format(batched_sample.id))
Example #3
def test_manifest_exist_but_no_content():
  # local
  dataset_url = LOCAL_FILE_PREFIX + EXAMPLES_MANIFEST_PATH + "no_content.manifest"
  with pytest.raises(Exception):
    make_batch_carbon_reader(dataset_url)

  # obs
  key = pytest.config.getoption("--access_key")
  secret = pytest.config.getoption("--secret_key")
  endpoint = pytest.config.getoption("--end_point")

  dataset_url = "s3a://manifest/carbon/manifestcarbon/no_content.manifest"
  with pytest.raises(Exception):
    make_batch_carbon_reader(dataset_url, key=key, secret=secret, endpoint=endpoint)
Example #4
def test_manifest_not_exist():
  # local
  dataset_url = LOCAL_FILE_PREFIX + "/tmp/not_exist.manifest"
  with pytest.raises(FileNotFoundError):
    make_batch_carbon_reader(dataset_url)

  # obs
  key = pytest.config.getoption("--access_key")
  secret = pytest.config.getoption("--secret_key")
  endpoint = pytest.config.getoption("--end_point")

  dataset_url = "s3a://manifest/carbon/manifestcarbon/not_exist.manifest"
  with pytest.raises(Exception):
    make_batch_carbon_reader(dataset_url, key=key, secret=secret, endpoint=endpoint)
Example #5
def test_manifest_normal_but_record_not_exist():
  # local
  dataset_url = LOCAL_FILE_PREFIX + EXAMPLES_MANIFEST_PATH + "binary1558365345315.manifest"
  with pytest.raises(Exception):
    make_batch_carbon_reader(dataset_url)

  # obs
  key = pytest.config.getoption("--access_key")
  secret = pytest.config.getoption("--secret_key")
  endpoint = pytest.config.getoption("--end_point")

  dataset_url = "s3a://manifest/carbon/manifestcarbon/obsbinary1557717977531_record_not_exist.manifest"
  with pytest.raises(Exception):
    make_batch_carbon_reader(dataset_url, key=key, secret=secret, endpoint=endpoint)
Example #6
def test_dataset_batch_carbon_reader(carbon_scalar_dataset):
    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  num_epochs=1) as reader:
        dataset = make_pycarbon_dataset(reader) \
          .apply(tf.data.experimental.unbatch()) \
          .batch(batch_size=1)

        iterator = dataset.make_one_shot_iterator()

        tensor = iterator.get_next()

        with tf.Session() as sess:
            sess.run([
                tf.local_variables_initializer(),
                tf.global_variables_initializer(),
            ])
            i = 0
            try:
                while True:
                    sess.run(tensor)
                    i += 1
            except tf.errors.OutOfRangeError:
                print("Finish! the number is " + str(i))

            assert i == _ROWS_COUNT
Example #7
def test_invalid_batch_carbon_reader_predicate_parameters(carbon_scalar_dataset):
  with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                cache_type="memory-cache",
                                predicate=in_lambda(['id2'], lambda id2: True)) as reader:
    with pytest.raises(RuntimeError):
      next(reader)

  with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                predicate=in_lambda([], lambda x: False)) as reader:
    with pytest.raises(ValueError):
      next(reader)

  with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                predicate=in_lambda(['not_exist_col'], lambda x: False)) as reader:
    with pytest.raises(ValueError):
      next(reader)
Example #8
def test_batch_carbon_reader(carbon_synthetic_dataset):
  with make_batch_carbon_reader(carbon_synthetic_dataset.url, num_epochs=1) as reader:
    i = 0
    for sample in reader:
      for ele in sample.id:
        print(ele)
        i += 1
    assert i == _ROWS_COUNT
Example #9
def test_generate(external_dataset):
  # Read from it using a plain reader
  with make_batch_carbon_reader(external_dataset.url) as reader:
    all_samples = list(reader)
  assert all_samples

  with make_reader(external_dataset.url) as reader:
    all_samples = list(reader)
  assert all_samples
Example #10
def pytorch_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
    with DataLoader(make_batch_carbon_reader(dataset_url)) as train_loader:
        sample = next(iter(train_loader))
        # Because we are using make_batch_carbon_reader(), each read returns a batch of rows instead of a single row
        print("id batch: {0}".format(sample['id']))

    with make_data_loader(make_reader(dataset_url)) as train_loader:
        sample = next(iter(train_loader))
        # Because we are using make_reader() (is_batch defaults to True), each read returns a batch of rows instead of a single row
        print("id batch: {0}".format(sample['id']))
Example #11
def test_make_batch_carbon_reader_of_obs(carbon_obs_external_dataset):
  with make_batch_carbon_reader(carbon_obs_external_dataset.url,
                                key=pytest.config.getoption("--access_key"),
                                secret=pytest.config.getoption("--secret_key"),
                                endpoint=pytest.config.getoption("--end_point")) as reader:
    i = 0
    for sample in reader:
      i += len(sample.id)

    assert i == _ROWS_COUNT
Example #12
def just_read_batch_obs(dataset_url, key, secret, endpoint):
  for num_epochs in [1, 4, 8]:
    with make_batch_carbon_reader(dataset_url, key=key, secret=secret, endpoint=endpoint, num_epochs=num_epochs,
                                  workers_count=16) as train_reader:
      i = 0
      for schema_view in train_reader:
        for j in range(len(schema_view.name)):
          print(schema_view.name[j])
          i += 1

      print(i)
      assert 20 * num_epochs == i
Example #13
def test_invalid_cache_parameter(carbon_synthetic_dataset,
                                 carbon_scalar_dataset):
    with make_carbon_reader(carbon_synthetic_dataset.url,
                            cache_type='memory-cache',
                            shuffle_row_drop_partitions=5) as reader:
        with pytest.raises(RuntimeError):
            next(reader)

    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  cache_type='memory-cache',
                                  shuffle_row_drop_partitions=5) as reader:
        with pytest.raises(RuntimeError):
            next(reader)
Example #14
def just_read_batch(dataset_url=LOCAL_FILE_PREFIX + EXAMPLES_MANIFEST_PATH +
                    'binary1558365345315_record_exist.manifest'):
    for num_epochs in [1, 4, 8]:
        with make_batch_carbon_reader(dataset_url,
                                      num_epochs=num_epochs) as train_reader:
            i = 0
            for schema_view in train_reader:
                for j in range(len(schema_view.name)):
                    print(schema_view.name[j])
                    i += 1

            print(i)
            assert 20 * num_epochs == i
Example #15
def just_read_batch(dataset_url='file:///tmp/benchmark_external_dataset'):
    with make_batch_carbon_reader(
            dataset_url,
            num_epochs=1,
            workers_count=10,
            shuffle_row_drop_partitions=5) as train_reader:
        result = list()
        i = 0
        for schema_view in train_reader:
            i += len(schema_view.id)
            for ele in schema_view.id:
                result.append(ele)

        return result
Example #16
def just_read_batch(dataset_url='file:///tmp/benchmark_external_dataset'):
    with make_batch_carbon_reader(dataset_url,
                                  num_epochs=1,
                                  workers_count=16,
                                  schema_fields=["id",
                                                 "value1"]) as train_reader:
        i = 0
        for schema_view in train_reader:
            assert len(schema_view) == 2
            assert schema_view._fields == ('id', 'value1')
            i += len(schema_view.id)

        assert i == ROW_COUNT
        return i
Example #17
def just_read_batch(dataset_url='file:///tmp/benchmark_external_dataset', num_epochs=1):
  with make_batch_carbon_reader(dataset_url, num_epochs=num_epochs) as train_reader:
    i = 0
    start = time.time()
    for schema_view in train_reader:
      for j in range(len(schema_view.id)):
        print(schema_view.id[j])
        i += 1
        if i % ROW_COUNT == 0:
          end = time.time()
          print("time is " + str(end - start))
          start = end
    assert i == ROW_COUNT * num_epochs
    return i
Example #18
def just_read_batch_obs(key=None, secret=None, endpoint=None,
                        bucketname='modelarts-carbon',
                        prefix='test/benchmark_external_dataset',
                        download_path='/tmp/download/',
                        num_epochs=1):

  path = download_files_from_obs_concurrently(key, secret, endpoint,
                                              bucketname, prefix, download_path)

  with make_batch_carbon_reader(path, num_epochs=num_epochs) as train_reader:
    i = 0
    for schema_view in train_reader:
      i += len(schema_view.id)

    assert i == ROW_COUNT * num_epochs
    return i
Example #19
def test_simple_read_tensorflow_with_non_unischema_many_columns_dataset(
        carbon_many_columns_non_unischema_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
  data"""
    with make_batch_carbon_reader(
            dataset_url=carbon_many_columns_non_unischema_dataset.url
    ) as reader:
        row_tensors = tf_tensors(reader)
        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            batch = sess.run(row_tensors)._asdict()
            assert set(batch.keys()) == set(
                carbon_many_columns_non_unischema_dataset.data[0].keys())
Example #20
def python_hello_world(dataset_url='file:///tmp/carbon_external_dataset'):
    # Reading data from a non-Pycarbon CarbonData dataset via pure Python
    with make_batch_carbon_reader(dataset_url,
                                  schema_fields=["id", "value1",
                                                 "value2"]) as reader:
        for schema_view in reader:
            # make_batch_carbon_reader() returns batches of rows instead of individual rows
            print("Batched read:\nid: {0} value1: {1} value2: {2}".format(
                schema_view.id, schema_view.value1, schema_view.value2))

    with make_reader(dataset_url, schema_fields=["id", "value1",
                                                 "value2"]) as reader:
        for schema_view in reader:
            # make_reader() (is_batch defaults to True) also returns batches of rows instead of individual rows
            print("Batched read:\nid: {0} value1: {1} value2: {2}".format(
                schema_view.id, schema_view.value1, schema_view.value2))
Example #21
def just_read_batch_obs(
        dataset_url="s3a://modelarts-carbon/test/benchmark_external_dataset/",
        key=None,
        secret=None,
        endpoint=None,
        num_epochs=1):
    with make_batch_carbon_reader(dataset_url,
                                  key=key,
                                  secret=secret,
                                  endpoint=endpoint,
                                  num_epochs=num_epochs,
                                  workers_count=16) as train_reader:
        i = 0
        for schema_view in train_reader:
            i += len(schema_view.id)

        assert i == ROW_COUNT * num_epochs
        return i
Example #22
def test_simple_read_tensorflow_with_carbon_dataset(carbon_scalar_dataset):
    """Read couple of rows. Make sure all tensors have static shape sizes assigned and the data matches reference
  data"""
    with make_batch_carbon_reader(
            dataset_url=carbon_scalar_dataset.url) as reader:
        row_tensors = tf_tensors(reader)
        # Make sure we have static shape info for all fields
        for column in row_tensors:
            assert column.get_shape().as_list() == [None]

        with _tf_session() as sess:
            for _ in range(2):
                batch = sess.run(row_tensors)._asdict()
                for i, id_value in enumerate(batch['id']):
                    expected_row = next(d for d in carbon_scalar_dataset.data
                                        if d['id'] == id_value)
                    for field_name in expected_row.keys():
                        _assert_fields_eq(batch[field_name][i],
                                          expected_row[field_name])
Example #23
def test_non_unischema_with_many_colums_with_one_shot_iterator(
        carbon_many_columns_non_unischema_dataset):
    """Just a bunch of read and compares of all values to the expected values"""
    with make_batch_carbon_reader(
            carbon_many_columns_non_unischema_dataset.url,
            workers_count=1) as reader:
        dataset = make_pycarbon_dataset(reader)
        iterator = dataset.make_one_shot_iterator()

        # Make sure we have static shape info for all fields
        for shape in dataset.output_shapes:
            # TODO(yevgeni): check that the shapes are actually correct, not just not None
            assert shape.dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        with tf.Session() as sess:
            iterator = iterator.get_next()
            sample = sess.run(iterator)._asdict()
            assert set(sample.keys()) == set(
                carbon_many_columns_non_unischema_dataset.data[0].keys())
Example #24
def test_dynamic_batch_size_of_batch_carbon_reader(carbon_scalar_dataset):
    with make_batch_carbon_reader(carbon_scalar_dataset.url,
                                  num_epochs=None) as reader:
        batch_size = tf.data.Dataset.range(
            1, 10).make_one_shot_iterator().get_next()

        dataset = make_pycarbon_dataset(reader) \
          .apply(tf.data.experimental.unbatch()) \
          .batch(batch_size=batch_size)

        iterator = dataset.make_initializable_iterator()
        init_op = iterator.initializer

        tensor = iterator.get_next()

        with tf.train.MonitoredTrainingSession() as sess:
            sess.run(init_op)
            sample = sess.run(tensor)
            assert 1 == len(sample.id)

            sess.run(init_op)
            sample = sess.run(tensor)
            assert 2 == len(sample.id)
Example #25
  pass
else:
  raise ValueError("please set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON variables, "
                   "using cmd line "
                   "--pyspark-python=PYSPARK_PYTHON_PATH --pyspark-driver-python=PYSPARK_DRIVER_PYTHON_PATH "
                   "or set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON in system env")

# pylint: disable=unnecessary-lambda
ALL_READER_FLAVOR_FACTORIES = [
  lambda url, **kwargs: make_carbon_reader(url, reader_pool_type='thread', **kwargs),
]

SCALAR_FIELDS = [f for f in TestSchema.fields.values() if isinstance(f.codec, ScalarCodec)]

SCALAR_ONLY_READER_FACTORIES = [
  lambda url, **kwargs: make_batch_carbon_reader(url, reader_pool_type='thread', **kwargs),
]


def _check_simple_reader(reader, expected_data, expected_rows_count=None, check_types=True, limit_checked_rows=None):
  # Read a bunch of entries from the dataset and compare the data to reference
  def _type(v):
    if isinstance(v, np.ndarray):
      if v.dtype.str.startswith('|S'):
        return '|S'
      else:
        return v.dtype
    else:
      return type(v)

  expected_rows_count = expected_rows_count or len(expected_data)
Example #26
elif 'PYSPARK_PYTHON' in os.environ.keys(
) and 'PYSPARK_DRIVER_PYTHON' in os.environ.keys():
    pass
else:
    raise ValueError(
        "please set the PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON variables, either via the command line "
        "(--pyspark-python=PYSPARK_PYTHON_PATH --pyspark-driver-python=PYSPARK_DRIVER_PYTHON_PATH) "
        "or as environment variables")

# pylint: disable=unnecessary-lambda
READER_FACTORIES = [
    make_carbon_reader,
    lambda url, **kwargs: make_carbon_reader(url, **kwargs),
    make_batch_carbon_reader,
    lambda url, **kwargs: make_batch_carbon_reader(url, **kwargs),
]


@pytest.mark.parametrize('reader_factory', READER_FACTORIES)
def test_dataset_url_must_be_string(reader_factory):
    with pytest.raises(ValueError):
        reader_factory(None)

    with pytest.raises(ValueError):
        reader_factory(123)

    with pytest.raises(ValueError):
        reader_factory([])

Example #27
def make_reader(dataset_url=None,
                workers_count=10,
                results_queue_size=100,
                num_epochs=1,
                obs_client=None,
                shuffle=True,
                schema_fields=None,
                is_batch=True,
                reader_pool_type='thread',
                data_format='carbon',
                cache_properties={'cache_type': None, 'cache_location': None, 'cache_size_limit': None,
                                  'cache_row_size_estimate': None, 'cache_extra_settings': None},
                **properties
                ):
  """
  A unified API for reading datasets in different data formats.

  :param dataset_url: a filepath or a URL to a carbon directory,
      e.g. ``'hdfs://some_hdfs_cluster/user/yevgeni/carbon8'``, or ``'file:///tmp/mydataset'``
      or ``'s3a://bucket/mydataset'``.
  :param data_format: dataset data format (default: carbon)
  :param is_batch: return single record or batch records (default: True)
  :param obs_client: an ObsClient object providing the access key, secret key and endpoint URL
  :param schema_fields: Can be: a list of unischema fields and/or regex pattern strings; ``None`` to read all fields;
          or an NGram object, in which case an NGram of the specified fields is returned.
  :param reader_pool_type: A string denoting the reader pool type. Should be one of ['thread', 'process', 'dummy']
      denoting a thread pool, process pool, or running everything in the master thread. Defaults to 'thread'
    TODO: process support
  :param workers_count: An int for the number of workers to use in the reader pool. This is only used for the
      thread or process pool. Defaults to 10.
  :param results_queue_size: Size of the results queue to store prefetched rows. Currently only applicable to
      thread reader pool type.
  :param shuffle: Whether to shuffle partitions (the order in which full partitions are read)
  :param num_epochs: An epoch is a single pass over all rows in the dataset. Setting ``num_epochs`` to
      ``None`` will result in an infinite number of epochs.
  :param cache_properties: a dict of cache parameters
    cache_type: A string denoting the cache type, if desired. Options are [None, 'null', 'local-disk', 'memory-cache']
      to either have a null/noop cache or a cache implemented using diskcache. Caching is useful when communication
      to the main data store is either slow or expensive and the local machine has enough storage
      to store the entire dataset. A null cache is used by default.
    cache_location: A string denoting the location or path of the cache.
    cache_size_limit: An int specifying the size limit of the cache in bytes
    cache_row_size_estimate: An int specifying the estimated size of a row in the dataset
    cache_extra_settings: A dictionary of extra settings to pass to the cache implementation.
  :param properties: other parameters, passed through as keyword arguments
  :return: A :class:`Reader` object
  """

  if is_batch is True:
    if data_format == 'carbon':
      if isinstance(obs_client, ObsClient):
        if obs_client.is_secure is True:
          endpoint = "https://" + obs_client.server
        else:
          endpoint = "http://" + obs_client.server
        return make_batch_carbon_reader(dataset_url,
                                        key=obs_client.securityProvider.access_key_id,
                                        secret=obs_client.securityProvider.secret_access_key,
                                        endpoint=endpoint,
                                        proxy=obs_client.proxy_host,
                                        proxy_port=obs_client.proxy_port,
                                        schema_fields=schema_fields,
                                        reader_pool_type=reader_pool_type,
                                        workers_count=workers_count,
                                        results_queue_size=results_queue_size,
                                        shuffle_blocklets=shuffle,
                                        num_epochs=num_epochs,
                                        cache_type=cache_properties['cache_type'],
                                        cache_location=cache_properties['cache_location'],
                                        cache_size_limit=cache_properties['cache_size_limit'],
                                        cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                        cache_extra_settings=cache_properties['cache_extra_settings'],
                                        **properties)
      elif obs_client is None:
        return make_batch_carbon_reader(dataset_url,
                                        schema_fields=schema_fields,
                                        reader_pool_type=reader_pool_type,
                                        workers_count=workers_count,
                                        results_queue_size=results_queue_size,
                                        shuffle_blocklets=shuffle,
                                        num_epochs=num_epochs,
                                        cache_type=cache_properties['cache_type'],
                                        cache_location=cache_properties['cache_location'],
                                        cache_size_limit=cache_properties['cache_size_limit'],
                                        cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                        cache_extra_settings=cache_properties['cache_extra_settings'],
                                        **properties)
      else:
        raise ValueError("""obs_client should be a ObsClient object or None""")
    else:
      raise NotImplementedError("""only the carbon data format is currently supported""")

  elif is_batch is False:
    if data_format == 'carbon':
      if isinstance(obs_client, ObsClient):
        if obs_client.is_secure is True:
          endpoint = "https://" + obs_client.server
        else:
          endpoint = "http://" + obs_client.server
        return make_carbon_reader(dataset_url,
                                  key=obs_client.securityProvider.access_key_id,
                                  secret=obs_client.securityProvider.secret_access_key,
                                  endpoint=endpoint,
                                  proxy=obs_client.proxy_host,
                                  proxy_port=obs_client.proxy_port,
                                  schema_fields=schema_fields,
                                  reader_pool_type=reader_pool_type,
                                  workers_count=workers_count,
                                  results_queue_size=results_queue_size,
                                  shuffle_blocklets=shuffle,
                                  num_epochs=num_epochs,
                                  cache_type=cache_properties['cache_type'],
                                  cache_location=cache_properties['cache_location'],
                                  cache_size_limit=cache_properties['cache_size_limit'],
                                  cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                  cache_extra_settings=cache_properties['cache_extra_settings'],
                                  **properties)
      elif obs_client is None:
        return make_carbon_reader(dataset_url,
                                  schema_fields=schema_fields,
                                  reader_pool_type=reader_pool_type,
                                  workers_count=workers_count,
                                  results_queue_size=results_queue_size,
                                  shuffle_blocklets=shuffle,
                                  num_epochs=num_epochs,
                                  cache_type=cache_properties['cache_type'],
                                  cache_location=cache_properties['cache_location'],
                                  cache_size_limit=cache_properties['cache_size_limit'],
                                  cache_row_size_estimate=cache_properties['cache_row_size_estimate'],
                                  cache_extra_settings=cache_properties['cache_extra_settings'],
                                  **properties)
      else:
        raise ValueError("""obs_client should be a ObsClient object or None""")
    else:
      raise NotImplementedError("""only the carbon data format is currently supported""")

  else:
    raise ValueError("""the value of is_batch is invalid, it should be set True or False""")
Example #28
def train_and_test(train_dataset_url, test_dataset_url, num_epochs, batch_size,
                   evaluation_interval):
    """
  Train a model with batch size batch_size, printing accuracy every evaluation_interval iterations.
  :param train_dataset_url: The MNIST training dataset URL.
  :param test_dataset_url: The MNIST test dataset URL.
  :param num_epochs: The number of epochs to train for.
  :param batch_size: The batch size for training.
  :param evaluation_interval: The interval used to print the accuracy.
  :return:
  """

    with make_batch_carbon_reader(os.path.join(train_dataset_url),
                                  num_epochs=num_epochs) as train_reader:
        with make_batch_carbon_reader(os.path.join(test_dataset_url),
                                      num_epochs=num_epochs) as test_reader:
            # Create the model
            x = tf.placeholder(tf.float32, [None, 784])
            w = tf.Variable(tf.zeros([784, 10]))
            b = tf.Variable(tf.zeros([10]))
            y = tf.matmul(x, w) + b

            # Define loss and optimizer
            y_ = tf.placeholder(tf.int64, [None])

            # Define the loss function
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_,
                                                                   logits=y)

            train_step = tf.train.GradientDescentOptimizer(0.5).minimize(
                cross_entropy)

            correct_prediction = tf.equal(tf.argmax(y, 1), y_)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

            train_dataset = make_pycarbon_dataset(train_reader) \
              .apply(tf.data.experimental.unbatch()) \
              .batch(batch_size) \
              .map(decode)

            train_iterator = train_dataset.make_one_shot_iterator()
            label, image = train_iterator.get_next()

            test_dataset = make_pycarbon_dataset(test_reader) \
              .apply(tf.data.experimental.unbatch()) \
              .batch(batch_size) \
              .map(decode)

            test_iterator = test_dataset.make_one_shot_iterator()
            test_label, test_image = test_iterator.get_next()

            # Train
            print(
                'Training model for {0} epoch with batch size {1} and evaluation interval {2}'
                .format(num_epochs, batch_size, evaluation_interval))

            i = 0
            with tf.Session() as sess:
                sess.run([
                    tf.local_variables_initializer(),
                    tf.global_variables_initializer(),
                ])

                try:
                    while True:
                        cur_label, cur_image = sess.run([label, image])

                        sess.run([train_step],
                                 feed_dict={
                                     x: cur_image,
                                     y_: cur_label
                                 })

                        if i % evaluation_interval == 0:
                            test_cur_label, test_cur_image = sess.run(
                                [test_label, test_image])
                            print(
                                'After {0} training iterations, the accuracy of the model is: {1:.2f}'
                                .format(
                                    i,
                                    sess.run(accuracy,
                                             feed_dict={
                                                 x: test_cur_image,
                                                 y_: test_cur_label
                                             })))
                        i += 1

                except tf.errors.OutOfRangeError:
                    print("Finish! the number is " + str(i))
Example #29
def test_manifest_normal_but_record_part_exist_1():
  # local
  dataset_url = LOCAL_FILE_PREFIX + EXAMPLES_MANIFEST_PATH + "binary1558365345315_record_part_exist_1.manifest"
  with pytest.raises(Exception):
    make_batch_carbon_reader(dataset_url)
Example #30
def test_shuffling_queue_with_make_batch_carbon_reader(carbon_scalar_dataset):
    with make_batch_carbon_reader(
            dataset_url=carbon_scalar_dataset.url) as reader:
        with pytest.raises(ValueError):
            tf_tensors(reader, 100, 90)