def testCrossedColumnNotSupportRealValuedColumn(self):
   """Crossing a sparse column with a real-valued column raises TypeError."""
   sparse = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
   expected_message = (
       "columns must be a set of _SparseColumn, _CrossedColumn, "
       "or _BucketizedColumn instances")
   with self.assertRaisesRegexp(TypeError, expected_message):
     # The real-valued column is what makes this set invalid for crossing.
     invalid_columns = set([sparse, fc.real_valued_column("real")])
     fc.crossed_column(invalid_columns, hash_bucket_size=10000)
Пример #2
0
 def testCrossedColumnNotSupportRealValuedColumn(self):
   """A real-valued column is rejected as an input to crossed_column."""
   hashed = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
   error_pattern = (
       "columns must be a set of _SparseColumn, _CrossedColumn, "
       "or _BucketizedColumn instances")
   with self.assertRaisesRegexp(TypeError, error_pattern):
     # Mixing in a real-valued column is expected to trigger the TypeError.
     columns_to_cross = set([hashed, fc.real_valued_column("real")])
     fc.crossed_column(columns_to_cross, hash_bucket_size=10000)
Пример #3
0
  def testMixedFeatures(self):
    """Trains SDCALogisticClassifier on a mix of feature column types.

    Uses a real-valued, a bucketized, a hashed sparse and a crossed column
    together with per-example weights, fits for 50 steps and checks that the
    accuracy on the same three examples is above 0.9.
    """

    def input_fn():
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
      }
      labels = constant_op.constant([[1], [0], [1]])
      return features, labels

    price_col = feature_column_lib.real_valued_column('price')
    sq_footage_real = feature_column_lib.real_valued_column('sq_footage')
    sq_footage_bucket_col = feature_column_lib.bucketized_column(
        sq_footage_real, boundaries=[650.0, 800.0])
    country_col = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country_col = feature_column_lib.crossed_column(
        [sq_footage_bucket_col, country_col], hash_bucket_size=10)

    all_columns = [
        price_col, sq_footage_bucket_col, country_col, sq_footage_country_col
    ]
    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=all_columns,
        weight_column_name='weights')
    classifier.fit(input_fn=input_fn, steps=50)

    eval_results = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(eval_results['accuracy'], 0.9)
Пример #4
0
  def testCrossedFeatures(self):
    """Trains SDCALogisticClassifier on a single crossed feature column.

    The only feature column given to the classifier is the cross of the
    hashed 'language' and 'country' columns. After 10 training steps the
    accuracy on the same three examples must exceed 0.9.
    """

    def input_fn():
      one_value_per_row = [[0, 0], [1, 0], [2, 0]]
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'language': sparse_tensor.SparseTensor(
              values=['english', 'italian', 'spanish'],
              indices=one_value_per_row,
              dense_shape=[3, 1]),
          'country': sparse_tensor.SparseTensor(
              values=['US', 'IT', 'MX'],
              indices=one_value_per_row,
              dense_shape=[3, 1]),
      }
      labels = constant_op.constant([[0], [0], [1]])
      return features, labels

    language_col = feature_column_lib.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=5)
    country_col = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    crossed_col = feature_column_lib.crossed_column(
        [language_col, country_col], hash_bucket_size=10)

    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id', feature_columns=[crossed_col])
    classifier.fit(input_fn=input_fn, steps=10)

    eval_results = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(eval_results['accuracy'], 0.9)
    def testCrossedFeatures(self):
        """Trains SDCALogisticClassifier on a single crossed feature column.

        Columns, estimator, training and evaluation are all created inside
        the single-threaded test session. After 10 training steps the
        accuracy on the same three examples must exceed 0.9.
        """
        def input_fn():
            one_value_per_row = [[0, 0], [1, 0], [2, 0]]
            features = {
                'example_id': constant_op.constant(['1', '2', '3']),
                'language': sparse_tensor.SparseTensor(
                    values=['english', 'italian', 'spanish'],
                    indices=one_value_per_row,
                    dense_shape=[3, 1]),
                'country': sparse_tensor.SparseTensor(
                    values=['US', 'IT', 'MX'],
                    indices=one_value_per_row,
                    dense_shape=[3, 1]),
            }
            labels = constant_op.constant([[0], [0], [1]])
            return features, labels

        with self._single_threaded_test_session():
            language_col = feature_column_lib.sparse_column_with_hash_bucket(
                'language', hash_bucket_size=5)
            country_col = feature_column_lib.sparse_column_with_hash_bucket(
                'country', hash_bucket_size=5)
            crossed_col = feature_column_lib.crossed_column(
                [language_col, country_col], hash_bucket_size=10)

            classifier = sdca_estimator.SDCALogisticClassifier(
                example_id_column='example_id',
                feature_columns=[crossed_col])
            classifier.fit(input_fn=input_fn, steps=10)

            eval_results = classifier.evaluate(input_fn=input_fn, steps=1)
            self.assertGreater(eval_results['accuracy'], 0.9)
Пример #6
0
    def testMixedFeatures(self):
        """Trains SDCALogisticClassifier on a mix of feature column types.

        Combines a real-valued, a bucketized, a hashed sparse and a crossed
        column with per-example weights, fits for 50 steps and checks that
        the accuracy on the same three examples is above 0.9.
        """
        def input_fn():
            features = {
                'example_id': constant_op.constant(['1', '2', '3']),
                'price': constant_op.constant([[0.6], [0.8], [0.3]]),
                'sq_footage': constant_op.constant(
                    [[900.0], [700.0], [600.0]]),
                'country': sparse_tensor.SparseTensor(
                    values=['IT', 'US', 'GB'],
                    indices=[[0, 0], [1, 3], [2, 1]],
                    dense_shape=[3, 5]),
                'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
            }
            labels = constant_op.constant([[1], [0], [1]])
            return features, labels

        price_col = feature_column_lib.real_valued_column('price')
        sq_footage_real = feature_column_lib.real_valued_column('sq_footage')
        sq_footage_bucket_col = feature_column_lib.bucketized_column(
            sq_footage_real, boundaries=[650.0, 800.0])
        country_col = feature_column_lib.sparse_column_with_hash_bucket(
            'country', hash_bucket_size=5)
        sq_footage_country_col = feature_column_lib.crossed_column(
            [sq_footage_bucket_col, country_col], hash_bucket_size=10)

        all_columns = [
            price_col, sq_footage_bucket_col, country_col,
            sq_footage_country_col
        ]
        classifier = SDCALogisticClassifier(
            example_id_column='example_id',
            feature_columns=all_columns,
            weight_column_name='weights')
        classifier.fit(input_fn=input_fn, steps=50)

        eval_results = classifier.evaluate(input_fn=input_fn, steps=1)
        self.assertGreater(eval_results['accuracy'], 0.9)
  def testCrossedColumnNameCreatesSortedNames(self):
    """The crossed column's name and columns follow sorted column names.

    The inputs are handed over as a set in the order (bbb, cost bucket, aaa);
    the resulting name and the `columns` sequence are expected in name order.
    """
    col_a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
    col_b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
    cost_bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4])
    unordered_inputs = [col_b, cost_bucket, col_a]
    crossed = fc.crossed_column(set(unordered_inputs), hash_bucket_size=10000)

    self.assertEqual("aaa_X_bbb_X_cost_bucketized", crossed.name,
                     "name should be generated by sorted column names")
    expected_names = ["aaa", "bbb", "cost_bucketized"]
    for position, expected_name in enumerate(expected_names):
      self.assertEqual(expected_name, crossed.columns[position].name)
Пример #8
0
 def testCrossedColumnDeepCopy(self):
   """A deep copy of a crossed column keeps its name and sorted columns."""
   col_a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
   col_b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
   cost_bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4])
   unordered_inputs = [col_b, cost_bucket, col_a]
   original = fc.crossed_column(set(unordered_inputs), hash_bucket_size=10000)

   # All assertions below inspect the copy, not the original column.
   duplicate = copy.deepcopy(original)
   self.assertEqual("aaa_X_bbb_X_cost_bucketized", duplicate.name,
                    "name should be generated by sorted column names")
   expected_names = ["aaa", "bbb", "cost_bucketized"]
   for position, expected_name in enumerate(expected_names):
     self.assertEqual(expected_name, duplicate.columns[position].name)
Пример #9
0
  def testMixedFeaturesArbitraryWeightsPartitioned(self):
    """Trains SDCALinearRegressor on mixed features with a partitioner.

    The regressor gets a fixed-size partitioner with two shards along axis 0
    and per-example weights of 3, 5 and 7. After 20 training steps the
    evaluation loss on the same examples must be below 0.05.
    """

    def input_fn():
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [5.0], [7.0]]),
      }
      targets = constant_op.constant([[1.55], [-1.25], [-3.0]])
      return features, targets

    with self._single_threaded_test_session():
      price_col = feature_column_lib.real_valued_column('price')
      sq_footage_real = feature_column_lib.real_valued_column('sq_footage')
      sq_footage_bucket_col = feature_column_lib.bucketized_column(
          sq_footage_real, boundaries=[650.0, 800.0])
      country_col = feature_column_lib.sparse_column_with_hash_bucket(
          'country', hash_bucket_size=5)
      sq_footage_country_col = feature_column_lib.crossed_column(
          [sq_footage_bucket_col, country_col], hash_bucket_size=10)

      all_columns = [
          price_col, sq_footage_bucket_col, country_col,
          sq_footage_country_col
      ]
      two_shard_partitioner = partitioned_variables.fixed_size_partitioner(
          num_shards=2, axis=0)
      regressor = sdca_estimator.SDCALinearRegressor(
          example_id_column='example_id',
          feature_columns=all_columns,
          l2_regularization=1.0,
          weight_column_name='weights',
          partitioner=two_shard_partitioner)
      regressor.fit(input_fn=input_fn, steps=20)

      eval_results = regressor.evaluate(input_fn=input_fn, steps=1)
      self.assertLess(eval_results['loss'], 0.05)
Пример #10
0
  def testMixedFeaturesArbitraryWeightsPartitioned(self):
    """Checks SDCALinearRegressor with mixed features and sharded variables.

    Real-valued, bucketized, hashed sparse and crossed columns are combined
    with non-uniform example weights; the estimator is built with a
    two-shard fixed-size partitioner. Loss after 20 steps must be < 0.05.
    """

    def input_fn():
      example_features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          'price': constant_op.constant([[0.6], [0.8], [0.3]]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [5.0], [7.0]]),
      }
      example_targets = constant_op.constant([[1.55], [-1.25], [-3.0]])
      return example_features, example_targets

    with self._single_threaded_test_session():
      price_column = feature_column_lib.real_valued_column('price')
      footage_column = feature_column_lib.real_valued_column('sq_footage')
      footage_bucket_column = feature_column_lib.bucketized_column(
          footage_column, boundaries=[650.0, 800.0])
      country_column = feature_column_lib.sparse_column_with_hash_bucket(
          'country', hash_bucket_size=5)
      footage_country_column = feature_column_lib.crossed_column(
          [footage_bucket_column, country_column], hash_bucket_size=10)

      model_columns = [
          price_column, footage_bucket_column, country_column,
          footage_country_column
      ]
      sharding = partitioned_variables.fixed_size_partitioner(
          num_shards=2, axis=0)
      regressor = sdca_estimator.SDCALinearRegressor(
          example_id_column='example_id',
          feature_columns=model_columns,
          l2_regularization=1.0,
          weight_column_name='weights',
          partitioner=sharding)
      regressor.fit(input_fn=input_fn, steps=20)

      evaluation = regressor.evaluate(input_fn=input_fn, steps=1)
      self.assertLess(evaluation['loss'], 0.05)
Пример #11
0
  def testMixedFeatures(self):
    """Trains the SVM estimator on a mix of feature column types.

    Uses L1 (0.1) and L2 (1.0) regularization with per-example weights and
    expects accuracy of 1.0 (to three places) after 30 training steps.
    """

    def input_fn():
      features = {
          'example_id': constant_op.constant(['1', '2', '3']),
          # 'price' is fed as a flat list here, unlike 'sq_footage'.
          'price': constant_op.constant([0.6, 0.8, 0.3]),
          'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
          'country': sparse_tensor.SparseTensor(
              values=['IT', 'US', 'GB'],
              indices=[[0, 0], [1, 3], [2, 1]],
              dense_shape=[3, 5]),
          'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
      }
      labels = constant_op.constant([[1], [0], [1]])
      return features, labels

    price_col = feature_column.real_valued_column('price')
    sq_footage_real = feature_column.real_valued_column('sq_footage')
    sq_footage_bucket_col = feature_column.bucketized_column(
        sq_footage_real, boundaries=[650.0, 800.0])
    country_col = feature_column.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country_col = feature_column.crossed_column(
        [sq_footage_bucket_col, country_col], hash_bucket_size=10)

    all_columns = [
        price_col, sq_footage_bucket_col, country_col, sq_footage_country_col
    ]
    svm_classifier = svm.SVM(
        feature_columns=all_columns,
        example_id_column='example_id',
        weight_column_name='weights',
        l1_regularization=0.1,
        l2_regularization=1.0)
    svm_classifier.fit(input_fn=input_fn, steps=30)

    eval_results = svm_classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertAlmostEqual(eval_results['accuracy'], 1.0, places=3)
Пример #12
0
    def testMixedFeatures(self):
        """Trains the SVM estimator on a mix of feature column types.

        Uses L1 (0.1) and L2 (1.0) regularization with per-example weights
        and expects accuracy of 1.0 (to three places) after 30 steps.
        """
        def input_fn():
            features = {
                'example_id': constant_op.constant(['1', '2', '3']),
                'price': constant_op.constant([[0.6], [0.8], [0.3]]),
                'sq_footage': constant_op.constant(
                    [[900.0], [700.0], [600.0]]),
                'country': sparse_tensor.SparseTensor(
                    values=['IT', 'US', 'GB'],
                    indices=[[0, 0], [1, 3], [2, 1]],
                    dense_shape=[3, 5]),
                'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
            }
            labels = constant_op.constant([[1], [0], [1]])
            return features, labels

        price_col = feature_column.real_valued_column('price')
        sq_footage_real = feature_column.real_valued_column('sq_footage')
        sq_footage_bucket_col = feature_column.bucketized_column(
            sq_footage_real, boundaries=[650.0, 800.0])
        country_col = feature_column.sparse_column_with_hash_bucket(
            'country', hash_bucket_size=5)
        sq_footage_country_col = feature_column.crossed_column(
            [sq_footage_bucket_col, country_col], hash_bucket_size=10)

        all_columns = [
            price_col, sq_footage_bucket_col, country_col,
            sq_footage_country_col
        ]
        svm_classifier = svm.SVM(
            feature_columns=all_columns,
            example_id_column='example_id',
            weight_column_name='weights',
            l1_regularization=0.1,
            l2_regularization=1.0)
        svm_classifier.fit(input_fn=input_fn, steps=30)

        eval_results = svm_classifier.evaluate(input_fn=input_fn, steps=1)
        self.assertAlmostEqual(eval_results['accuracy'], 1.0, places=3)
    def testCreateFeatureSpec(self):
        """Checks the parsing spec built from a mixed set of feature columns.

        The spec is requested twice, once from a set of columns and once from
        a dict whose values are those same columns; both results must equal
        the expected spec.
        """
        hashed_col = fc.sparse_column_with_hash_bucket(
            "sparse_column", hash_bucket_size=100)
        embedded_col = fc.embedding_column(
            fc.sparse_column_with_hash_bucket(
                "sparse_column_for_embedding", hash_bucket_size=10),
            dimension=4)
        keyed_col = fc.sparse_column_with_keys(
            "id_column", ["marlo", "omar", "stringer"])
        weighted_col = fc.weighted_sparse_column(keyed_col,
                                                 "id_weights_column")
        real_col1 = fc.real_valued_column("real_valued_column1")
        real_col2 = fc.real_valued_column("real_valued_column2", 5)
        real_col3 = fc.real_valued_column("real_valued_column3",
                                          dimension=None)
        bucket_col1 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization1"),
            [0, 4])
        bucket_col2 = fc.bucketized_column(
            fc.real_valued_column("real_valued_column_for_bucketization2", 4),
            [0, 4])
        cross_a = fc.sparse_column_with_hash_bucket("cross_aaa",
                                                    hash_bucket_size=100)
        cross_b = fc.sparse_column_with_hash_bucket("cross_bbb",
                                                    hash_bucket_size=100)
        crossed_col = fc.crossed_column(set([cross_a, cross_b]),
                                        hash_bucket_size=10000)
        all_columns = [
            hashed_col, embedded_col, weighted_col, real_col1, real_col2,
            real_col3, bucket_col1, bucket_col2, crossed_col
        ]
        feature_columns = set(all_columns)

        # Features expected to be parsed as variable-length strings.
        string_feature_names = (
            "sparse_column", "sparse_column_for_embedding", "id_column",
            "cross_aaa", "cross_bbb")
        expected_config = {
            name: parsing_ops.VarLenFeature(dtypes.string)
            for name in string_feature_names
        }
        expected_config.update({
            "id_weights_column":
                parsing_ops.VarLenFeature(dtypes.float32),
            "real_valued_column1":
                parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column2":
                parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
            "real_valued_column3":
                parsing_ops.VarLenFeature(dtype=dtypes.float32),
            "real_valued_column_for_bucketization1":
                parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
            "real_valued_column_for_bucketization2":
                parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
        })

        spec_from_set = fc.create_feature_spec_for_parsing(feature_columns)
        self.assertDictEqual(expected_config, spec_from_set)

        # The same spec must come back when the columns are dict values.
        columns_by_key = dict(
            (str(index), column)
            for index, column in enumerate(feature_columns))
        spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
        self.assertDictEqual(expected_config, spec_from_dict)
Пример #14
0
  def testCreateFeatureSpec(self):
    """Checks the parsing spec built from a mixed set of feature columns.

    Covers hashed, embedded, keyed (string/int32/int64), weighted,
    real-valued (fixed and variable length), bucketized, crossed, one-hot and
    scattered-embedding columns. The spec is requested from a set of columns
    and from a dict of the same columns; both must equal the expected spec.
    """
    hashed_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedded_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_keyed_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_keyed_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_keyed_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_col = fc.weighted_sparse_column(str_keyed_col,
                                             "str_id_weights_column")
    real_col1 = fc.real_valued_column("real_valued_column1")
    real_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_col3 = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    real_col4 = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)
    bucket_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucket_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    cross_a = fc.sparse_column_with_hash_bucket(
        "cross_aaa", hash_bucket_size=100)
    cross_b = fc.sparse_column_with_hash_bucket(
        "cross_bbb", hash_bucket_size=100)
    crossed_col = fc.crossed_column(
        set([cross_a, cross_b]), hash_bucket_size=10000)
    one_hot = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    all_columns = [
        hashed_col, embedded_col, weighted_col, int32_keyed_col,
        int64_keyed_col, real_col1, real_col2, real_col3, real_col4,
        bucket_col1, bucket_col2, crossed_col, one_hot, scattered_col
    ]
    feature_columns = set(all_columns)

    # Features expected to be parsed as variable-length strings.
    string_feature_names = (
        "sparse_column", "sparse_column_for_embedding", "str_id_column",
        "cross_aaa", "cross_bbb", "sparse_column_for_one_hot",
        "scattered_embedding_column")
    expected_config = {
        name: parsing_ops.VarLenFeature(dtypes.string)
        for name in string_feature_names
    }
    expected_config.update({
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
    })

    spec_from_set = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, spec_from_set)

    # The same spec must come back when the columns are dict values.
    columns_by_key = dict(
        (str(index), column) for index, column in enumerate(feature_columns))
    spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
    self.assertDictEqual(expected_config, spec_from_dict)
  def testCreateFeatureSpec(self):
    """Checks the parsing spec built from a mixed set of feature columns.

    The expected spec is compared against three results: the contrib
    `create_feature_spec_for_parsing` on a set of columns, the core
    `make_parse_example_spec` on the same set, and the contrib function again
    on a dict whose values are those columns.
    """
    hashed_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedded_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    str_keyed_col = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_keyed_col = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_keyed_col = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted_col = fc.weighted_sparse_column(str_keyed_col,
                                             "str_id_weights_column")
    real_col1 = fc.real_valued_column("real_valued_column1")
    real_col2 = fc.real_valued_column("real_valued_column2", 5)
    bucket_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucket_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    cross_a = fc.sparse_column_with_hash_bucket(
        "cross_aaa", hash_bucket_size=100)
    cross_b = fc.sparse_column_with_hash_bucket(
        "cross_bbb", hash_bucket_size=100)
    crossed_col = fc.crossed_column(
        set([cross_a, cross_b]), hash_bucket_size=10000)
    one_hot = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100))
    scattered_col = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)
    all_columns = [
        hashed_col, embedded_col, weighted_col, int32_keyed_col,
        int64_keyed_col, real_col1, real_col2, bucket_col1, bucket_col2,
        crossed_col, one_hot, scattered_col
    ]
    feature_columns = set(all_columns)

    # Features expected to be parsed as variable-length strings.
    string_feature_names = (
        "sparse_column", "sparse_column_for_embedding", "str_id_column",
        "cross_aaa", "cross_bbb", "sparse_column_for_one_hot",
        "scattered_embedding_column")
    expected_config = {
        name: parsing_ops.VarLenFeature(dtypes.string)
        for name in string_feature_names
    }
    expected_config.update({
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
    })

    spec_from_set = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, spec_from_set)

    # Contrib feature columns must also be accepted by the core library.
    spec_from_core = fc_core.make_parse_example_spec(feature_columns)
    self.assertDictEqual(expected_config, spec_from_core)

    # The same spec must come back when the columns are dict values.
    columns_by_key = dict(
        (str(index), column) for index, column in enumerate(feature_columns))
    spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
    self.assertDictEqual(expected_config, spec_from_dict)
Пример #16
0
  def testCreateFeatureSpec(self):
    """Checks the parsing spec built from a mixed set of feature columns.

    The spec is requested twice, once from a set of columns and once from a
    dict whose values are those same columns; both results must equal the
    expected spec.
    """
    hashed_col = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedded_col = fc.embedding_column(
        fc.sparse_column_with_hash_bucket(
            "sparse_column_for_embedding", hash_bucket_size=10),
        dimension=4)
    keyed_col = fc.sparse_column_with_keys(
        "id_column", ["marlo", "omar", "stringer"])
    weighted_col = fc.weighted_sparse_column(keyed_col, "id_weights_column")
    real_col1 = fc.real_valued_column("real_valued_column1")
    real_col2 = fc.real_valued_column("real_valued_column2", 5)
    real_col3 = fc.real_valued_column("real_valued_column3", dimension=None)
    bucket_col1 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
    bucket_col2 = fc.bucketized_column(
        fc.real_valued_column("real_valued_column_for_bucketization2", 4),
        [0, 4])
    cross_a = fc.sparse_column_with_hash_bucket(
        "cross_aaa", hash_bucket_size=100)
    cross_b = fc.sparse_column_with_hash_bucket(
        "cross_bbb", hash_bucket_size=100)
    crossed_col = fc.crossed_column(
        set([cross_a, cross_b]), hash_bucket_size=10000)
    all_columns = [
        hashed_col, embedded_col, weighted_col, real_col1, real_col2,
        real_col3, bucket_col1, bucket_col2, crossed_col
    ]
    feature_columns = set(all_columns)

    # Features expected to be parsed as variable-length strings.
    string_feature_names = (
        "sparse_column", "sparse_column_for_embedding", "id_column",
        "cross_aaa", "cross_bbb")
    expected_config = {
        name: parsing_ops.VarLenFeature(dtypes.string)
        for name in string_feature_names
    }
    expected_config.update({
        "id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column2":
            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column_for_bucketization1":
            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
        "real_valued_column_for_bucketization2":
            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
    })

    spec_from_set = fc.create_feature_spec_for_parsing(feature_columns)
    self.assertDictEqual(expected_config, spec_from_set)

    # The same spec must come back when the columns are dict values.
    columns_by_key = dict(
        (str(index), column) for index, column in enumerate(feature_columns))
    spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
    self.assertDictEqual(expected_config, spec_from_dict)
Пример #17
0
  def testInitCrossedColumnWeightsFromCkpt(self):
    """Checks that crossed-column weights can be initialized from a checkpoint.

    Builds a crossed column under scope "run_1", bumps its weights by 0.5,
    saves them to a checkpoint, then builds a second crossed column under
    scope "run_2" that points at that checkpoint tensor and asserts that the
    loaded weights are close to the saved ones.
    """
    sparse_col_1 = fc.sparse_column_with_hash_bucket(
        column_name="col_1", hash_bucket_size=4)
    sparse_col_2 = fc.sparse_column_with_hash_bucket(
        column_name="col_2", hash_bucket_size=4)

    crossed_col = fc.crossed_column(
        columns=[sparse_col_1, sparse_col_2], hash_bucket_size=4)

    # The same sparse tensor is fed for both source columns below.
    input_tensor = sparse_tensor_lib.SparseTensor(
        indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
        values=[0, 1, 2, 3],
        dense_shape=[4, 4])

    # Invoking 'weighted_sum_from_feature_columns' will create the crossed
    # column weights variable.
    with variable_scope.variable_scope("run_1"):
      with variable_scope.variable_scope(crossed_col.name):
        # Returns looked up column weights which is same as crossed column
        # weights as well as actual references to weights variables.
        _, col_weights, _ = (
            feature_column_ops.weighted_sum_from_feature_columns({
                sparse_col_1.name: input_tensor,
                sparse_col_2.name: input_tensor
            }, [crossed_col], 1))
        # Update the weights since default initializer initializes all weights
        # to 0.0.
        # NOTE(review): assign_op is rebound on every iteration, so only the
        # op built for the last entry of col_weights is run further down.
        # Presumably col_weights has a single entry here (one column was
        # passed) -- TODO confirm.
        for weight in col_weights.values():
          assign_op = state_ops.assign(weight[0], weight[0] + 0.5)

    # The saver is created after the "run_1" variables exist and before the
    # "run_2" variables are built.
    save = saver.Saver()
    ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                   "init_crossed_col_w_from_ckpt")
    ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
    checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      sess.run(assign_op)
      # Capture the updated weights before writing the checkpoint.
      saved_col_weights = col_weights[crossed_col][0].eval()
      save.save(sess, checkpoint_path)

    # Same source columns as crossed_col, but configured to load its weights
    # from the tensor saved under the "run_1" scope.
    crossed_col_initialized = fc.crossed_column(
        columns=[sparse_col_1, sparse_col_2],
        hash_bucket_size=4,
        ckpt_to_load_from=checkpoint_path,
        tensor_name_in_ckpt=("run_1/col_1_X_col_2/"
                             "weighted_sum_from_feature_columns/"
                             "col_1_X_col_2/weights"))

    with variable_scope.variable_scope("run_2"):
      # This will initialize the crossed column weights from provided checkpoint
      # and return a [4, 1] tensor which is same as weights variable. Since we
      # won't modify weights, this should be same as 'saved_col_weights'.
      _, col_weights, _ = (feature_column_ops.weighted_sum_from_feature_columns(
          {
              sparse_col_1.name: input_tensor,
              sparse_col_2.name: input_tensor
          }, [crossed_col_initialized], 1))
      col_weights_from_ckpt = col_weights[crossed_col_initialized][0]

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      loaded_col_weights = col_weights_from_ckpt.eval()

    self.assertAllClose(saved_col_weights, loaded_col_weights)
Пример #18
0
  def testCreateFeatureSpec(self):
    """Verifies the parsing spec generated for a mixed set of feature columns.

    The same expected spec must come back from the contrib API given a set of
    columns, from the core `make_parse_example_spec`, and from the contrib API
    given a dict of columns.
    """
    hashed = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_source = fc.sparse_column_with_hash_bucket(
        "sparse_column_for_embedding", hash_bucket_size=10)
    embedded = fc.embedding_column(embedding_source, dimension=4)

    # Keyed sparse columns; the string-keyed one is only passed on through its
    # weighted wrapper below.
    str_keyed = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_keyed = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_keyed = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted = fc.weighted_sparse_column(str_keyed, "str_id_weights_column")

    real_1 = fc.real_valued_column("real_valued_column1")
    real_2 = fc.real_valued_column("real_valued_column2", 5)

    bucket_source_1 = fc.real_valued_column(
        "real_valued_column_for_bucketization1")
    bucketized_1 = fc.bucketized_column(bucket_source_1, [0, 4])
    bucket_source_2 = fc.real_valued_column(
        "real_valued_column_for_bucketization2", 4)
    bucketized_2 = fc.bucketized_column(bucket_source_2, [0, 4])

    cross_a = fc.sparse_column_with_hash_bucket(
        "cross_aaa", hash_bucket_size=100)
    cross_b = fc.sparse_column_with_hash_bucket(
        "cross_bbb", hash_bucket_size=100)
    crossed = fc.crossed_column({cross_a, cross_b}, hash_bucket_size=10000)

    one_hot_source = fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100)
    one_hot = fc.one_hot_column(one_hot_source)
    scattered = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)

    all_columns = {
        hashed, embedded, weighted, int32_keyed, int64_keyed, real_1, real_2,
        bucketized_1, bucketized_2, crossed, one_hot, scattered
    }

    # Expected spec, assembled by feature kind: string-valued var-len
    # features first, then the remaining var-len ones, then fixed-length
    # float32 features keyed with their expected length.
    expected_spec = {
        key: parsing_ops.VarLenFeature(dtypes.string)
        for key in ("sparse_column", "sparse_column_for_embedding",
                    "str_id_column", "cross_aaa", "cross_bbb",
                    "sparse_column_for_one_hot", "scattered_embedding_column")
    }
    expected_spec.update({
        "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32),
    })
    fixed_lengths = (
        ("real_valued_column1", 1),
        ("real_valued_column2", 5),
        ("real_valued_column_for_bucketization1", 1),
        ("real_valued_column_for_bucketization2", 4),
    )
    for key, length in fixed_lengths:
      expected_spec[key] = parsing_ops.FixedLenFeature(
          [length], dtype=dtypes.float32)

    self.assertDictEqual(
        expected_spec, fc.create_feature_spec_for_parsing(all_columns))

    # Tests that contrib feature columns work with core library:
    self.assertDictEqual(
        expected_spec, fc_core.make_parse_example_spec(all_columns))

    # Test that the same config is parsed out if we pass a dictionary.
    columns_by_index = dict(
        (str(index), column) for index, column in enumerate(all_columns))
    self.assertDictEqual(
        expected_spec, fc.create_feature_spec_for_parsing(columns_by_index))
    def testInitCrossedColumnWeightsFromCkpt(self):
        """Checks crossed-column weights can be initialized from a checkpoint.

        Weights built under scope "run_1" are incremented by 0.5 and saved.
        A second crossed column configured with `ckpt_to_load_from` is then
        built under "run_2", and its weights must match the saved values.
        """
        hashed_1 = fc.sparse_column_with_hash_bucket(
            column_name="col_1", hash_bucket_size=4)
        hashed_2 = fc.sparse_column_with_hash_bucket(
            column_name="col_2", hash_bucket_size=4)
        cross = fc.crossed_column(
            columns=[hashed_1, hashed_2], hash_bucket_size=4)

        # The same sparse ids feed both source columns.
        diagonal_ids = sparse_tensor_lib.SparseTensor(
            indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
            values=[0, 1, 2, 3],
            dense_shape=[4, 4])

        # Building the weighted sum creates the crossed column weights
        # variable; the enclosing scopes determine the name it is later
        # looked up under in the checkpoint.
        with variable_scope.variable_scope("run_1"), \
                variable_scope.variable_scope(cross.name):
            first_run_features = {
                hashed_1.name: diagonal_ids,
                hashed_2.name: diagonal_ids
            }
            # The second return value maps each column to its weights
            # variables.
            _, weights_by_col, _ = (
                feature_column_ops.weighted_sum_from_feature_columns(
                    first_run_features, [cross], 1))
            # The default initializer sets every weight to 0.0, so move them
            # to a non-default value before saving.
            for weight_vars in weights_by_col.values():
                bump_op = state_ops.assign(weight_vars[0],
                                           weight_vars[0] + 0.5)

        ckpt_saver = saver.Saver()
        ckpt_dir = tempfile.mkdtemp(
            prefix=os.path.join(self.get_temp_dir(),
                                "init_crossed_col_w_from_ckpt"))
        checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            sess.run(bump_op)
            saved_weights = weights_by_col[cross][0].eval()
            ckpt_saver.save(sess, checkpoint_path)

        restored_cross = fc.crossed_column(
            columns=[hashed_1, hashed_2],
            hash_bucket_size=4,
            ckpt_to_load_from=checkpoint_path,
            tensor_name_in_ckpt=("run_1/col_1_X_col_2/"
                                 "weighted_sum_from_feature_columns/"
                                 "col_1_X_col_2/weights"))

        with variable_scope.variable_scope("run_2"):
            # Here the weights are expected to come from the checkpoint given
            # above. They are never modified in this scope, so they should
            # equal 'saved_weights'.
            second_run_features = {
                hashed_1.name: diagonal_ids,
                hashed_2.name: diagonal_ids
            }
            _, weights_by_col, _ = (
                feature_column_ops.weighted_sum_from_feature_columns(
                    second_run_features, [restored_cross], 1))
            restored_weights_var = weights_by_col[restored_cross][0]

        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            loaded_weights = restored_weights_var.eval()

        self.assertAllClose(saved_weights, loaded_weights)
Пример #20
0
  def testCreateFeatureSpec(self):
    """Verifies the parsing spec generated for a mixed set of feature columns.

    Besides the fixed-length real-valued columns, this variant also includes
    two columns built with `_real_valued_var_len_column` (one sparse, one
    not). The expected spec must come back both for a set of columns and for
    a dict of columns.
    """
    hashed = fc.sparse_column_with_hash_bucket(
        "sparse_column", hash_bucket_size=100)
    embedding_source = fc.sparse_column_with_hash_bucket(
        "sparse_column_for_embedding", hash_bucket_size=10)
    embedded = fc.embedding_column(embedding_source, dimension=4)

    # Keyed sparse columns; the string-keyed one is only passed on through its
    # weighted wrapper below.
    str_keyed = fc.sparse_column_with_keys(
        "str_id_column", ["marlo", "omar", "stringer"])
    int32_keyed = fc.sparse_column_with_keys(
        "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
    int64_keyed = fc.sparse_column_with_keys(
        "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
    weighted = fc.weighted_sparse_column(str_keyed, "str_id_weights_column")

    real_1 = fc.real_valued_column("real_valued_column1")
    real_2 = fc.real_valued_column("real_valued_column2", 5)
    var_len_sparse = fc._real_valued_var_len_column(
        "real_valued_column3", is_sparse=True)
    var_len_dense = fc._real_valued_var_len_column(
        "real_valued_column4", dtype=dtypes.int64, default_value=0,
        is_sparse=False)

    bucket_source_1 = fc.real_valued_column(
        "real_valued_column_for_bucketization1")
    bucketized_1 = fc.bucketized_column(bucket_source_1, [0, 4])
    bucket_source_2 = fc.real_valued_column(
        "real_valued_column_for_bucketization2", 4)
    bucketized_2 = fc.bucketized_column(bucket_source_2, [0, 4])

    cross_a = fc.sparse_column_with_hash_bucket(
        "cross_aaa", hash_bucket_size=100)
    cross_b = fc.sparse_column_with_hash_bucket(
        "cross_bbb", hash_bucket_size=100)
    crossed = fc.crossed_column({cross_a, cross_b}, hash_bucket_size=10000)

    one_hot_source = fc.sparse_column_with_hash_bucket(
        "sparse_column_for_one_hot", hash_bucket_size=100)
    one_hot = fc.one_hot_column(one_hot_source)
    scattered = fc.scattered_embedding_column(
        "scattered_embedding_column", size=100, dimension=10, hash_key=1)

    all_columns = {
        hashed, embedded, weighted, int32_keyed, int64_keyed, real_1, real_2,
        var_len_sparse, var_len_dense, bucketized_1, bucketized_2, crossed,
        one_hot, scattered
    }

    # Expected spec, assembled by feature kind: string-valued var-len
    # features first, then the remaining var-len and sequence ones, then
    # fixed-length float32 features keyed with their expected length.
    expected_spec = {
        key: parsing_ops.VarLenFeature(dtypes.string)
        for key in ("sparse_column", "sparse_column_for_embedding",
                    "str_id_column", "cross_aaa", "cross_bbb",
                    "sparse_column_for_one_hot", "scattered_embedding_column")
    }
    expected_spec.update({
        "int32_id_column":
            parsing_ops.VarLenFeature(dtypes.int32),
        "int64_id_column":
            parsing_ops.VarLenFeature(dtypes.int64),
        "str_id_weights_column":
            parsing_ops.VarLenFeature(dtypes.float32),
        "real_valued_column3":
            parsing_ops.VarLenFeature(dtype=dtypes.float32),
        "real_valued_column4":
            parsing_ops.FixedLenSequenceFeature(
                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
    })
    fixed_lengths = (
        ("real_valued_column1", 1),
        ("real_valued_column2", 5),
        ("real_valued_column_for_bucketization1", 1),
        ("real_valued_column_for_bucketization2", 4),
    )
    for key, length in fixed_lengths:
      expected_spec[key] = parsing_ops.FixedLenFeature(
          [length], dtype=dtypes.float32)

    self.assertDictEqual(
        expected_spec, fc.create_feature_spec_for_parsing(all_columns))

    # Test that the same config is parsed out if we pass a dictionary.
    columns_by_index = dict(
        (str(index), column) for index, column in enumerate(all_columns))
    self.assertDictEqual(
        expected_spec, fc.create_feature_spec_for_parsing(columns_by_index))