def test_pandas_input_fn(self, fc_impl):
        """Runs self._test_complete_flow on inputs built by pandas_input_fn.

        Does nothing when pandas is unavailable (HAS_PANDAS is falsy).
        """
        if not HAS_PANDAS:
            return

        num_rows = 10
        dimension = 1
        # The same ten evenly spaced values in [0, 2] act as feature and label.
        values = np.linspace(0., 2., num_rows, dtype=np.float32)
        features = pd.DataFrame({'x': values})
        labels = pd.Series(values)

        # Arguments shared by the train, eval and predict input functions.
        shared = dict(x=features, batch_size=num_rows)
        train_fn = pandas_io.pandas_input_fn(
            y=labels, num_epochs=None, shuffle=True, **shared)
        eval_fn = pandas_io.pandas_input_fn(y=labels, shuffle=False, **shared)
        predict_fn = pandas_io.pandas_input_fn(shuffle=False, **shared)

        self._test_complete_flow(
            train_input_fn=train_fn,
            eval_input_fn=eval_fn,
            predict_input_fn=predict_fn,
            input_dimension=dimension,
            label_dimension=dimension,
            batch_size=num_rows,
            fc_impl=fc_impl)
Exemplo n.º 2
0
    def test_pandas_input_fn(self):
        """Runs self._test_complete_flow on inputs built by pandas_input_fn.

        Does nothing when pandas is unavailable (HAS_PANDAS is falsy).
        """
        if not HAS_PANDAS:
            return

        # Pandas DataFrame naturally supports 1-dimensional data only.
        label_dimension = 1
        input_dimension = label_dimension
        values = np.array([1., 2., 3., 4.], dtype=np.float32)
        features = pd.DataFrame({'x': values})
        labels = pd.Series(values)

        # Arguments shared by the train, eval and predict input functions.
        shared = dict(x=features, batch_size=10)
        train_fn = pandas_io.pandas_input_fn(
            y=labels, num_epochs=None, shuffle=True, **shared)
        eval_fn = pandas_io.pandas_input_fn(y=labels, shuffle=False, **shared)
        predict_fn = pandas_io.pandas_input_fn(shuffle=False, **shared)

        self._test_complete_flow(
            train_input_fn=train_fn,
            eval_input_fn=eval_fn,
            predict_input_fn=predict_fn,
            input_dimension=input_dimension,
            label_dimension=label_dimension,
            prediction_length=4)
Exemplo n.º 3
0
  def test_pandas_input_fn(self, fc_impl):
    """Feeds pandas_input_fn-built inputs through self._test_complete_flow.

    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return

    rows = 10
    dim = 1
    # Feature column and label series hold the same ten values from 0 to 2.
    samples = np.linspace(0., 2., rows, dtype=np.float32)
    frame = pd.DataFrame({'x': samples})
    series = pd.Series(samples)

    def build(**extra):
      # Every input function shares the feature frame and the batch size.
      return pandas_io.pandas_input_fn(x=frame, batch_size=rows, **extra)

    train_fn = build(y=series, num_epochs=None, shuffle=True)
    eval_fn = build(y=series, shuffle=False)
    predict_fn = build(shuffle=False)

    self._test_complete_flow(
        train_input_fn=train_fn,
        eval_input_fn=eval_fn,
        predict_input_fn=predict_fn,
        input_dimension=dim,
        label_dimension=dim,
        batch_size=rows,
        fc_impl=fc_impl)
Exemplo n.º 4
0
  def test_pandas_input_fn(self):
    """Feeds pandas_input_fn-built inputs through self._test_complete_flow.

    Labels are derived from the feature values via self._as_label. Returns
    early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return

    n_classes = 3
    rows = 10
    # Ten evenly spaced feature values spanning 0 .. n_classes - 1.
    samples = np.linspace(0., n_classes - 1., rows, dtype=np.float32)
    frame = pd.DataFrame({'x': samples})
    labels = pd.Series(self._as_label(samples))

    # Arguments shared by the train, eval and predict input functions.
    shared = dict(x=frame, batch_size=rows)
    train_fn = pandas_io.pandas_input_fn(
        y=labels, num_epochs=None, shuffle=True, **shared)
    eval_fn = pandas_io.pandas_input_fn(y=labels, shuffle=False, **shared)
    predict_fn = pandas_io.pandas_input_fn(shuffle=False, **shared)

    self._test_complete_flow(
        train_input_fn=train_fn,
        eval_input_fn=eval_fn,
        predict_input_fn=predict_fn,
        input_dimension=1,
        n_classes=n_classes,
        batch_size=rows)
Exemplo n.º 5
0
 def testPandasInputFn_IndexMismatch(self):
   """Expects ValueError when y is a Series built without an explicit index.

   Returns early, checking nothing, when HAS_PANDAS is falsy.
   """
   if not HAS_PANDAS:
     return
   features, _ = self.makeTestDataFrame()
   # No index is passed here, so the Series gets pandas' default index.
   labels_default_index = pd.Series(np.arange(-32, -28))
   options = dict(batch_size=2, shuffle=False, num_epochs=1)
   with self.assertRaises(ValueError):
     pandas_io.pandas_input_fn(features, labels_default_index, **options)
Exemplo n.º 6
0
 def testPandasInputFn_IndexMismatch(self):
   """Checks that a y Series lacking an explicit index triggers ValueError.

   Skipped silently (plain return) when HAS_PANDAS is falsy.
   """
   if not HAS_PANDAS:
     return
   frame, _ = self.makeTestDataFrame()
   # Built without index=..., so pandas assigns its default index.
   unindexed_target = pd.Series(np.arange(-32, -28))
   with self.assertRaises(ValueError):
     pandas_io.pandas_input_fn(
         frame,
         unindexed_target,
         batch_size=2,
         shuffle=False,
         num_epochs=1)
Exemplo n.º 7
0
 def testPandasInputFn_NonBoolShuffle(self):
   """Expects TypeError when pandas_input_fn is called without `shuffle`.

   Returns early, checking nothing, when HAS_PANDAS is falsy.
   """
   if not HAS_PANDAS:
     return
   features, _ = self.makeTestDataFrame()
   labels = pd.Series(np.arange(-32, -28))
   expected_message = 'shuffle must be explicitly set as boolean'
   with self.assertRaisesRegexp(TypeError, expected_message):
     # shuffle is left at its default (None), which is not a boolean.
     pandas_io.pandas_input_fn(features, labels)
Exemplo n.º 8
0
 def testPandasInputFn_NonBoolShuffle(self):
     """Checks that omitting `shuffle` makes pandas_input_fn raise TypeError.

     Skipped silently (plain return) when HAS_PANDAS is falsy.
     """
     if not HAS_PANDAS:
         return
     frame, _ = self.makeTestDataFrame()
     target = pd.Series(np.arange(-32, -28))
     message_pattern = 'shuffle must be explicitly set as boolean'
     with self.assertRaisesRegexp(TypeError, message_pattern):
         # No shuffle argument: the default value is None.
         pandas_io.pandas_input_fn(frame, target)
Exemplo n.º 9
0
 def testPandasInputFn_Idempotent(self):
   """Builds and calls the input function twice for each shuffle setting.

   The test passes as long as none of the four calls raises. Returns early,
   checking nothing, when HAS_PANDAS is falsy.
   """
   if not HAS_PANDAS:
     return
   features, labels = self.makeTestDataFrame()
   # Unshuffled first, then shuffled; two build-and-call rounds each.
   for shuffle in (False, True):
     for _ in range(2):
       build = pandas_io.pandas_input_fn(
           features, labels, batch_size=2, shuffle=shuffle, num_epochs=1)
       build()
Exemplo n.º 10
0
 def testPandasInputFn_Idempotent(self):
   """Creates and invokes the input function repeatedly without error.

   Two rounds run with shuffle=False, then two with shuffle=True. Skipped
   silently (plain return) when HAS_PANDAS is falsy.
   """
   if not HAS_PANDAS:
     return
   frame, target = self.makeTestDataFrame()
   options = dict(batch_size=2, num_epochs=1)
   # Same order as before: both unshuffled rounds, then both shuffled ones.
   for shuffle_flag in (False, False, True, True):
     pandas_io.pandas_input_fn(
         frame, target, shuffle=shuffle_flag, **options)()
Exemplo n.º 11
0
  def testPandasInputFn_RaisesWhenTargetColumnIsAList(self):
    """Expects TypeError when target_column is a list instead of a string.

    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return

    features, labels = self.makeTestDataFrame()
    expected_message = 'target_column must be a string type'

    with self.assertRaisesRegexp(TypeError, expected_message):
      pandas_io.pandas_input_fn(
          features,
          labels,
          batch_size=2,
          shuffle=False,
          num_epochs=1,
          target_column=['one', 'two'])
Exemplo n.º 12
0
  def testPandasInputFn_RaisesWhenTargetColumnIsAList(self):
    """Checks that a list-valued target_column is rejected with TypeError.

    Skipped silently (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return

    frame, target = self.makeTestDataFrame()
    # Every argument is valid except target_column, which is a list.
    options = dict(
        batch_size=2,
        shuffle=False,
        num_epochs=1,
        target_column=['one', 'two'])

    with self.assertRaisesRegexp(TypeError,
                                 'target_column must be a string type'):
      pandas_io.pandas_input_fn(frame, target, **options)
Exemplo n.º 13
0
  def testPandasInputFn_ProducesOutputsForLargeBatchAndMultipleEpochs(self):
    """Checks output when batch_size exceeds the rows available in 2 epochs.

    Two rows read for two epochs with batch_size=128 must come back as a
    single four-row batch, after which running again raises OutOfRangeError.
    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      row_index = np.arange(100, 102)
      frame = pd.DataFrame(
          {'a': np.arange(2), 'b': np.arange(32, 34)}, index=row_index)
      labels = pd.Series(np.arange(-32, -30), index=row_index)
      input_fn = pandas_io.pandas_input_fn(
          frame, labels, batch_size=128, shuffle=False, num_epochs=2)

      outputs = input_fn()

      coord = coordinator.Coordinator()
      threads = queue_runner_impl.start_queue_runners(session, coord=coord)

      features, target = session.run(outputs)
      # Each column holds its two values once per epoch.
      for column, expected in (('a', [0, 1, 0, 1]),
                               ('b', [32, 33, 32, 33])):
        self.assertAllEqual(features[column], expected)
      self.assertAllEqual(target, [-32, -31, -32, -31])

      # Both epochs were consumed by the single batch above.
      with self.assertRaises(errors.OutOfRangeError):
        session.run(outputs)

      coord.request_stop()
      coord.join(threads)
    def testPandasInputFn_ProducesOutputsForLargeBatchAndMultipleEpochs(self):
        """Verifies one oversized batch covers both epochs of a 2-row frame.

        With batch_size=128 and num_epochs=2 the first run must return all
        four rows; the following run must raise OutOfRangeError. Skipped
        silently (plain return) when HAS_PANDAS is falsy.
        """
        if not HAS_PANDAS:
            return
        with self.test_session() as session:
            row_index = np.arange(100, 102)
            col_a = np.arange(2)
            col_b = np.arange(32, 34)
            frame = pd.DataFrame({'a': col_a, 'b': col_b}, index=row_index)
            labels = pd.Series(np.arange(-32, -30), index=row_index)
            options = dict(batch_size=128, shuffle=False, num_epochs=2)
            input_fn = pandas_io.pandas_input_fn(frame, labels, **options)

            outputs = input_fn()

            coord = coordinator.Coordinator()
            threads = queue_runner_impl.start_queue_runners(
                session, coord=coord)

            features, target = session.run(outputs)
            # Two epochs of two rows each arrive in one batch.
            self.assertAllEqual(features['a'], [0, 1, 0, 1])
            self.assertAllEqual(features['b'], [32, 33, 32, 33])
            self.assertAllEqual(target, [-32, -31, -32, -31])

            # Nothing is left to read after that batch.
            with self.assertRaises(errors.OutOfRangeError):
                session.run(outputs)

            coord.request_stop()
            coord.join(threads)
Exemplo n.º 15
0
  def testPandasInputFn_RespectsEpoch_WithShuffleAutosize(self):
    """Checks the epoch limit with shuffling and queue_capacity=None.

    Delegates the check to self.assertInputsCallableNTimes with a count of 4.
    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      features, labels = self.makeTestDataFrame()
      options = dict(
          batch_size=2, shuffle=True, queue_capacity=None, num_epochs=2)
      input_fn = pandas_io.pandas_input_fn(features, labels, **options)

      self.assertInputsCallableNTimes(input_fn, session, 4)
Exemplo n.º 16
0
  def testPandasInputFn_RespectsEpoch_NoShuffle(self):
    """Checks the epoch limit for an unshuffled, single-epoch input function.

    Delegates the check to self.assertInputsCallableNTimes with a count of 1.
    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      features, labels = self.makeTestDataFrame()
      options = dict(batch_size=4, shuffle=False, num_epochs=1)
      input_fn = pandas_io.pandas_input_fn(features, labels, **options)

      self.assertInputsCallableNTimes(input_fn, session, 1)
Exemplo n.º 17
0
  def testPandasInputFn_RespectsEpoch_WithShuffleAutosize(self):
    """Verifies epoch handling when shuffling with queue_capacity=None.

    The input function (batch_size=2, num_epochs=2) is handed to
    self.assertInputsCallableNTimes with a count of 4. Skipped silently
    (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      frame, target = self.makeTestDataFrame()
      input_fn = pandas_io.pandas_input_fn(
          frame,
          target,
          batch_size=2,
          shuffle=True,
          queue_capacity=None,
          num_epochs=2)

      self.assertInputsCallableNTimes(input_fn, session, 4)
Exemplo n.º 18
0
  def testPandasInputFn_RespectsEpoch_NoShuffle(self):
    """Verifies epoch handling for one unshuffled epoch with batch_size=4.

    The input function is handed to self.assertInputsCallableNTimes with a
    count of 1. Skipped silently (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      frame, target = self.makeTestDataFrame()
      input_fn = pandas_io.pandas_input_fn(
          frame,
          target,
          batch_size=4,
          shuffle=False,
          num_epochs=1)

      self.assertInputsCallableNTimes(input_fn, session, 1)
Exemplo n.º 19
0
  def testPandasInputFn_RespectsEpochUnevenBatches(self):
    """Checks the epoch limit when batch_size=3 is used for a single epoch.

    Delegates the check to self.assertInputsCallableNTimes with a count of 2.
    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    features, labels = self.makeTestDataFrame()
    with self.test_session() as session:
      options = dict(batch_size=3, shuffle=False, num_epochs=1)
      input_fn = pandas_io.pandas_input_fn(features, labels, **options)

      # Only one element of the epoch should be left for the final batch.
      self.assertInputsCallableNTimes(input_fn, session, 2)
Exemplo n.º 20
0
  def testPandasInputFn_RespectsEpochUnevenBatches(self):
    """Verifies epoch handling when the batch size is 3 for one epoch.

    The input function is handed to self.assertInputsCallableNTimes with a
    count of 2. Skipped silently (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    frame, target = self.makeTestDataFrame()
    with self.test_session() as session:
      input_fn = pandas_io.pandas_input_fn(
          frame,
          target,
          batch_size=3,
          shuffle=False,
          num_epochs=1)

      # The last batch should start with a single element of the epoch left.
      self.assertInputsCallableNTimes(input_fn, session, 2)
Exemplo n.º 21
0
  def testPandasInputFn_ExcludesIndex(self):
    """Checks that the produced features contain no 'index' entry.

    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      x, y = self.makeTestDataFrame()
      input_fn = pandas_io.pandas_input_fn(
          x, y, batch_size=2, shuffle=False, num_epochs=1)

      features, _ = self.callInputFnOnce(input_fn, session)

      # assertNotIn reports the offending container on failure, whereas
      # assertFalse('index' in features) only says "True is not false".
      self.assertNotIn('index', features)
Exemplo n.º 22
0
  def testPandasInputFn_ExcludesIndex(self):
    """Verifies that 'index' is not among the features that are returned.

    Skipped silently (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      x, y = self.makeTestDataFrame()
      input_fn = pandas_io.pandas_input_fn(
          x, y, batch_size=2, shuffle=False, num_epochs=1)

      features, _ = self.callInputFnOnce(input_fn, session)

      # Membership assertion instead of assertFalse(... in ...): same
      # pass/fail outcome, but the failure message shows the features.
      self.assertNotIn('index', features)
Exemplo n.º 23
0
  def testPandasInputFn_OnlyX(self):
    """Checks the first batch of features when no labels (y=None) are given.

    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      frame, _ = self.makeTestDataFrame()
      options = dict(y=None, batch_size=2, shuffle=False, num_epochs=1)
      input_fn = pandas_io.pandas_input_fn(frame, **options)

      # With y=None the helper's result is used as the features directly.
      features = self.callInputFnOnce(input_fn, session)

      for column, expected in (('a', [0, 1]), ('b', [32, 33])):
        self.assertAllEqual(features[column], expected)
Exemplo n.º 24
0
  def testPandasInputFn_OnlyX(self):
    """Verifies feature values of the first batch for a features-only input.

    Skipped silently (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      features_frame, _ = self.makeTestDataFrame()
      input_fn = pandas_io.pandas_input_fn(
          features_frame,
          y=None,
          batch_size=2,
          shuffle=False,
          num_epochs=1)

      # No target is unpacked here because y=None was passed above.
      batch = self.callInputFnOnce(input_fn, session)

      self.assertAllEqual(batch['a'], [0, 1])
      self.assertAllEqual(batch['b'], [32, 33])
Exemplo n.º 25
0
  def testPandasInputFn_ProducesExpectedOutputs(self):
    """Checks features and target of the first unshuffled batch of size 2.

    Returns early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      frame, labels = self.makeTestDataFrame()
      options = dict(batch_size=2, shuffle=False, num_epochs=1)
      input_fn = pandas_io.pandas_input_fn(frame, labels, **options)

      features, target = self.callInputFnOnce(input_fn, session)

      for column, expected in (('a', [0, 1]), ('b', [32, 33])):
        self.assertAllEqual(features[column], expected)
      self.assertAllEqual(target, [-32, -31])
Exemplo n.º 26
0
  def testPandasInputFn_ProducesExpectedOutputs(self):
    """Verifies the first batch's 'a' and 'b' features and its target values.

    Skipped silently (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      features_frame, target_series = self.makeTestDataFrame()
      input_fn = pandas_io.pandas_input_fn(
          features_frame,
          target_series,
          batch_size=2,
          shuffle=False,
          num_epochs=1)

      batch_features, batch_target = self.callInputFnOnce(input_fn, session)

      self.assertAllEqual(batch_features['a'], [0, 1])
      self.assertAllEqual(batch_features['b'], [32, 33])
      self.assertAllEqual(batch_target, [-32, -31])
Exemplo n.º 27
0
  def test_pandas_input_fn(self):
    """Runs self._test_complete_flow on inputs built by pandas_input_fn.

    Does nothing when pandas is unavailable (HAS_PANDAS is falsy).
    """
    if not HAS_PANDAS:
      return

    # Pandas DataFrame naturally supports 1-dimensional data only.
    label_dimension = 1
    input_dimension = label_dimension
    batch_size = 10
    values = np.array([1., 2., 3., 4.], dtype=np.float32)
    features = pd.DataFrame({'x': values})
    labels = pd.Series(values)

    def build(**extra):
      # Every input function shares the feature frame and the batch size.
      return pandas_io.pandas_input_fn(
          x=features, batch_size=batch_size, **extra)

    train_fn = build(y=labels, num_epochs=None, shuffle=True)
    eval_fn = build(y=labels, shuffle=False)
    predict_fn = build(shuffle=False)

    self._test_complete_flow(
        train_input_fn=train_fn,
        eval_input_fn=eval_fn,
        predict_input_fn=predict_fn,
        input_dimension=input_dimension,
        label_dimension=label_dimension,
        prediction_length=4,
        batch_size=batch_size)
Exemplo n.º 28
0
  def testPandasInputFnYIsDataFrame_HandlesOverlappingColumnsInTargets(self):
    """Checks outputs when a target column shares its name with a feature.

    The target frame's columns are renamed to 'a' and 'a_n', so 'a' appears
    in both the features and the targets that are asserted below. Returns
    early, checking nothing, when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      frame, target_frame = self.makeTestDataFrameWithYAsDataFrame()
      renames = {'a_target': 'a', 'b_target': 'a_n'}
      target_frame = target_frame.rename(columns=renames)
      options = dict(batch_size=2, shuffle=False, num_epochs=1)
      input_fn = pandas_io.pandas_input_fn(frame, target_frame, **options)

      features, targets = self.callInputFnOnce(input_fn, session)

      for column, expected in (('a', [0, 1]), ('b', [32, 33])):
        self.assertAllEqual(features[column], expected)
      for column, expected in (('a', [10, 11]), ('a_n', [50, 51])):
        self.assertAllEqual(targets[column], expected)
Exemplo n.º 29
0
  def testPandasInputFnYIsDataFrame_HandlesOverlappingColumnsInTargets(self):
    """Verifies features and targets stay distinct under a shared column name.

    After renaming, the target frame has columns 'a' and 'a_n'; 'a' is also
    asserted as a feature column. Skipped silently (plain return) when
    HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      features_frame, targets_frame = self.makeTestDataFrameWithYAsDataFrame()
      targets_frame = targets_frame.rename(
          columns={'a_target': 'a', 'b_target': 'a_n'})
      input_fn = pandas_io.pandas_input_fn(
          features_frame,
          targets_frame,
          batch_size=2,
          shuffle=False,
          num_epochs=1)

      batch_features, batch_targets = self.callInputFnOnce(input_fn, session)

      self.assertAllEqual(batch_features['a'], [0, 1])
      self.assertAllEqual(batch_features['b'], [32, 33])
      self.assertAllEqual(batch_targets['a'], [10, 11])
      self.assertAllEqual(batch_targets['a_n'], [50, 51])
Exemplo n.º 30
0
    def testPandasInputFnWhenYIsDataFrame_ProducesExpectedOutput(self):
        """Checks the first batch when the targets are given as a DataFrame.

        Targets are read back by column name ('a_target', 'b_target').
        Returns early, checking nothing, when HAS_PANDAS is falsy.
        """
        if not HAS_PANDAS:
            return
        with self.cached_session() as session:
            frame, target_frame = self.makeTestDataFrameWithYAsDataFrame()
            options = dict(batch_size=2, shuffle=False, num_epochs=1)
            input_fn = pandas_io.pandas_input_fn(
                frame, target_frame, **options)

            features, targets = self.callInputFnOnce(input_fn, session)

            for column, expected in (('a', [0, 1]), ('b', [32, 33])):
                self.assertAllEqual(features[column], expected)
            for column, expected in (('a_target', [10, 11]),
                                     ('b_target', [50, 51])):
                self.assertAllEqual(targets[column], expected)
    def testPandasInputFn_ProducesOutputsWhenDataSizeNotDividedByBatchSize(
            self):
        """Checks batching of five rows with batch_size=2.

        The expected batches hold 2, 2 and 1 rows; a fourth run must raise
        OutOfRangeError. Returns early, checking nothing, when HAS_PANDAS is
        falsy.
        """
        if not HAS_PANDAS:
            return
        with self.test_session() as session:
            row_index = np.arange(100, 105)
            col_a = np.arange(5)
            col_b = np.arange(32, 37)
            frame = pd.DataFrame({'a': col_a, 'b': col_b}, index=row_index)
            labels = pd.Series(np.arange(-32, -27), index=row_index)

            options = dict(batch_size=2, shuffle=False, num_epochs=1)
            input_fn = pandas_io.pandas_input_fn(frame, labels, **options)

            outputs = input_fn()

            coord = coordinator.Coordinator()
            threads = queue_runner_impl.start_queue_runners(
                session, coord=coord)

            # Expected (a, b, target) values of each successive batch; the
            # last one is smaller because 5 is not a multiple of 2.
            expected_batches = (
                ([0, 1], [32, 33], [-32, -31]),
                ([2, 3], [34, 35], [-30, -29]),
                ([4], [36], [-28]),
            )
            for want_a, want_b, want_target in expected_batches:
                features, target = session.run(outputs)
                self.assertAllEqual(features['a'], want_a)
                self.assertAllEqual(features['b'], want_b)
                self.assertAllEqual(target, want_target)

            # The single epoch is exhausted after the three batches above.
            with self.assertRaises(errors.OutOfRangeError):
                session.run(outputs)

            coord.request_stop()
            coord.join(threads)
Exemplo n.º 32
0
def main(argv):
    """Train a three-class DNNClassifier on the data returned by load_data().

    Args:
        argv: Argument list; argv[0] is skipped and the remainder is parsed
            by the module-level `parser`. The parsed `batch_size` and
            `train_steps` values are used below.
    """
    args = parser.parse_args(argv[1:])

    train_x, train_y = load_data()

    input_fn = pandas_input_fn(
        train_x, y=train_y, batch_size=args.batch_size, shuffle=False)

    # One numeric feature column per column of train_x, in column order.
    xuzhou_feature_columns = [
        tf.feature_column.numeric_column(key=colname)
        for colname in train_x.columns
    ]

    # NOTE(review): model_dir is the shared '/tmp' directory itself, so
    # checkpoints from unrelated runs can collide there -- confirm intended.
    classifier = tf.estimator.DNNClassifier(
        feature_columns=xuzhou_feature_columns,
        hidden_units=[40, 20],
        model_dir='/tmp',
        n_classes=3)

    # input_fn is already a zero-argument callable; no lambda wrapper needed.
    classifier.train(input_fn=input_fn, steps=args.train_steps)
Exemplo n.º 33
0
  def testPandasInputFn_ProducesOutputsWhenDataSizeNotDividedByBatchSize(self):
    """Verifies that five rows with batch_size=2 yield batches of 2, 2 and 1.

    A further run after those three batches must raise OutOfRangeError.
    Skipped silently (plain return) when HAS_PANDAS is falsy.
    """
    if not HAS_PANDAS:
      return
    with self.test_session() as session:
      row_index = np.arange(100, 105)
      frame = pd.DataFrame(
          {'a': np.arange(5), 'b': np.arange(32, 37)}, index=row_index)
      labels = pd.Series(np.arange(-32, -27), index=row_index)

      input_fn = pandas_io.pandas_input_fn(
          frame, labels, batch_size=2, shuffle=False, num_epochs=1)

      outputs = input_fn()

      coord = coordinator.Coordinator()
      threads = queue_runner_impl.start_queue_runners(session, coord=coord)

      # Per-batch expectations; the final batch holds the one leftover row.
      expected_batches = [
          {'a': [0, 1], 'b': [32, 33], 'target': [-32, -31]},
          {'a': [2, 3], 'b': [34, 35], 'target': [-30, -29]},
          {'a': [4], 'b': [36], 'target': [-28]},
      ]
      for expected in expected_batches:
        features, target = session.run(outputs)
        self.assertAllEqual(features['a'], expected['a'])
        self.assertAllEqual(features['b'], expected['b'])
        self.assertAllEqual(target, expected['target'])

      # All five rows of the single epoch have been read by now.
      with self.assertRaises(errors.OutOfRangeError):
        session.run(outputs)

      coord.request_stop()
      coord.join(threads)
Exemplo n.º 34
0
def create_input_data_fn(mode,
                         pipeline_config,
                         scope=None,
                         input_type=None,
                         x=None,
                         y=None):
    """Creates an input data function that can be used with estimators.

    Note that you must pass "factory functions" for both the data provider and
    featurizer to ensure that everything will be created in the same graph.

    Args:
        mode: `str`. Specifies whether this is training, evaluation or
            prediction. See `Modes`.
        pipeline_config: the configuration to create a Pipeline instance.
        scope: `str`. Variable scope to use for this input data block;
            'input_fn' is used when it is falsy.
        input_type: `str`. The type of the input, values: `NUMPY`, `PANDAS`.
            If `None`, will create a function based on the pipeline config.
        x: `np.ndarray` or `pd.DataFrame` or `None`.
        y: `np.ndarray` or `None`.

    Returns:
        For `NUMPY` / `PANDAS` input types, the result of `numpy_input_fn` /
        `pandas_input_fn`. Otherwise an input function that returns
        `(features_batch, labels_batch)` tuples when called, where
        `labels_batch` is `None` if the batch contains no label keys.
    """
    if input_type == 'NUMPY':
        # setup_train_data_feeder
        return numpy_input_fn(x,
                              y,
                              batch_size=pipeline_config.batch_size,
                              num_epochs=pipeline_config.num_epochs,
                              shuffle=pipeline_config.shuffle,
                              num_threads=pipeline_config.num_threads)

    if input_type == 'PANDAS':
        # setup_train_data_feeder
        return pandas_input_fn(x,
                               y,
                               batch_size=pipeline_config.batch_size,
                               num_epochs=pipeline_config.num_epochs,
                               shuffle=pipeline_config.shuffle,
                               num_threads=pipeline_config.num_threads)

    def input_fn():
        """Creates features and labels."""
        pipeline_params = pipeline_config.to_dict()
        # Batching options are consumed here, not by the pipeline itself.
        batch_size = pipeline_params.pop('batch_size', None)
        dynamic_pad = pipeline_params.pop('dynamic_pad', None)
        capacity = pipeline_params.pop('capacity', None)
        allow_smaller_final_batch = pipeline_params.pop(
            'allow_smaller_final_batch', None)
        bucket_boundaries = pipeline_params.pop('bucket_boundaries', None)
        # These two are not forwarded to the pipeline either. Using pop with
        # a default (like the keys above) tolerates configs that omit them
        # instead of raising KeyError.
        pipeline_params.pop('num_threads', None)
        pipeline_params.pop('min_after_dequeue', None)

        pipeline = getters.get_pipeline(mode=mode,
                                        module=pipeline_config.IDENTIFIER,
                                        **pipeline_params)

        with tf.variable_scope(scope or 'input_fn'):
            data_provider = pipeline.make_data_provider()
            features_and_labels = pipeline.read_from_data_provider(
                data_provider)
            # call pipeline processors
            features_and_labels = pipeline(features_and_labels, None)

            if bucket_boundaries:
                # Bucketed batching keyed on the 'source_len' tensor; inputs
                # whose 'source_len' is below 1 are dropped via keep_input.
                _, batch = tf.contrib.training.bucket_by_sequence_length(
                    input_length=features_and_labels['source_len'],
                    bucket_boundaries=bucket_boundaries,
                    tensors=features_and_labels,
                    batch_size=batch_size,
                    keep_input=features_and_labels['source_len'] >= 1,
                    dynamic_pad=dynamic_pad,
                    capacity=capacity,
                    allow_smaller_final_batch=allow_smaller_final_batch,
                    name='bucket_queue')
            else:
                batch = tf.train.batch(
                    tensors=features_and_labels,
                    enqueue_many=False,
                    batch_size=batch_size,
                    dynamic_pad=dynamic_pad,
                    capacity=capacity,
                    allow_smaller_final_batch=allow_smaller_final_batch,
                    name='batch_queue')

            # Separate features and labels
            features_batch = {k: batch[k] for k in pipeline.feature_keys}
            if set(batch.keys()).intersection(pipeline.label_keys):
                labels_batch = {k: batch[k] for k in pipeline.label_keys}
            else:
                labels_batch = None

            return features_batch, labels_batch

    return input_fn
Exemplo n.º 35
0
def create_input_data_fn(mode, pipeline_config, scope=None, input_type=None, x=None, y=None):
    """Creates an input data function that can be used with estimators.

    Note that you must pass "factory functions" for both the data provider and
    featurizer to ensure that everything will be created in the same graph.

    Args:
        mode: `str`. Specifies whether this is training, evaluation or
            prediction. See `Modes`.
        pipeline_config: the configuration to create a Pipeline instance.
        scope: `str`. Variable scope to use for this input data block;
            'input_fn' is used when it is falsy.
        input_type: `str`. The type of the input, values: `NUMPY`, `PANDAS`.
            If `None`, will create a function based on the pipeline config.
        x: `np.ndarray` or `pd.DataFrame` or `None`.
        y: `np.ndarray` or `None`.

    Returns:
        For `NUMPY` / `PANDAS` input types, the result of `numpy_input_fn` /
        `pandas_input_fn`. Otherwise an input function that returns
        `(features_batch, labels_batch)` tuples when called, where
        `labels_batch` is `None` if the batch contains no label keys.
    """
    if input_type == InputDataConfig.NUMPY:
        # setup_train_data_feeder
        return numpy_input_fn(x, y,
                              batch_size=pipeline_config.batch_size,
                              num_epochs=pipeline_config.num_epochs,
                              shuffle=pipeline_config.shuffle,
                              num_threads=pipeline_config.num_threads)

    if input_type == InputDataConfig.PANDAS:
        # setup_train_data_feeder
        return pandas_input_fn(x, y,
                               batch_size=pipeline_config.batch_size,
                               num_epochs=pipeline_config.num_epochs,
                               shuffle=pipeline_config.shuffle,
                               num_threads=pipeline_config.num_threads)

    def input_fn():
        """Creates features and labels."""

        pipeline = getters.get_pipeline(
            mode=mode, module=pipeline_config.module, shuffle=pipeline_config.shuffle,
            num_epochs=pipeline_config.num_epochs,
            subgraph_configs_by_features=pipeline_config.subgraph_configs_by_features,
            **pipeline_config.params)

        with tf.variable_scope(scope or 'input_fn'):
            data_provider = pipeline.make_data_provider()
            features_and_labels = pipeline.read_from_data_provider(data_provider)
            # call pipeline processors
            features_and_labels = pipeline(features_and_labels)

            if pipeline_config.bucket_boundaries:
                # Bucketed batching keyed on the 'source_len' tensor; inputs
                # whose 'source_len' is below 1 are dropped via keep_input.
                _, batch = tf.contrib.training.bucket_by_sequence_length(
                    input_length=features_and_labels['source_len'],
                    bucket_boundaries=pipeline_config.bucket_boundaries,
                    tensors=features_and_labels,
                    batch_size=pipeline_config.batch_size,
                    keep_input=features_and_labels['source_len'] >= 1,
                    dynamic_pad=pipeline_config.dynamic_pad,
                    capacity=pipeline_config.capacity,
                    allow_smaller_final_batch=pipeline_config.allow_smaller_final_batch,
                    name='bucket_queue')
            else:
                batch = tf.train.batch(
                    tensors=features_and_labels,
                    enqueue_many=False,
                    batch_size=pipeline_config.batch_size,
                    dynamic_pad=pipeline_config.dynamic_pad,
                    capacity=pipeline_config.capacity,
                    allow_smaller_final_batch=pipeline_config.allow_smaller_final_batch,
                    name='batch_queue')

            # Separate features and labels
            features_batch = {k: batch[k] for k in pipeline.feature_keys}
            if set(batch.keys()).intersection(pipeline.label_keys):
                labels_batch = {k: batch[k] for k in pipeline.label_keys}
            else:
                labels_batch = None

            return features_batch, labels_batch

    return input_fn