def insert_transformed_feature(self, columns_to_tensors):
    """Add the bucketized version of the source column to `columns_to_tensors`.

    If the source column has no entry in `columns_to_tensors` yet, it is asked
    to insert its own transformed feature first. The result of bucketizing
    that entry is then stored under this column (`self`) as the key.

    Args:
        columns_to_tensors: Mutable mapping keyed by column objects; updated
            in place.
    """
    source = self.source_column
    # Make sure the source column's tensor is available before reading it.
    if source not in columns_to_tensors:
        source.insert_transformed_feature(columns_to_tensors)

    source_tensor = columns_to_tensors[source]
    bucket_boundaries = list(self.boundaries)
    columns_to_tensors[self] = bucketization_op.bucketize(
        source_tensor, boundaries=bucket_boundaries)
def test_normal_usecase(self):
    """Check the bucket ids produced for values around four boundaries."""
    values = constant_op.constant([-5, 0, 2, 3, 5, 8, 10, 11, 12])
    bucketized = bucketization_op.bucketize(values, boundaries=[0, 3, 8, 11])
    # Expected ids show that a value equal to a boundary lands in the bucket
    # above that boundary (e.g. 0 -> 1, 3 -> 2, 8 -> 3, 11 -> 4).
    expected_ids = [0, 1, 1, 2, 2, 3, 3, 4, 4]
    with self.test_session() as session:
        actual_ids = session.run(bucketized)
        self.assertAllEqual(expected_ids, actual_ids)
def insert_transformed_feature(self, columns_to_tensors):
    """Populate `columns_to_tensors[self]` with the bucketized source tensor.

    The source column is transformed on demand: when it is missing from
    `columns_to_tensors`, its own `insert_transformed_feature` is invoked
    before its entry is read and bucketized with this column's boundaries.

    Args:
        columns_to_tensors: Mutable mapping keyed by column objects; updated
            in place.
    """
    src_column = self.source_column
    already_present = src_column in columns_to_tensors
    if not already_present:
        src_column.insert_transformed_feature(columns_to_tensors)

    raw_tensor = columns_to_tensors[src_column]
    columns_to_tensors[self] = bucketization_op.bucketize(
        raw_tensor,
        boundaries=list(self.boundaries),
    )
def _bucketize(instances, feature, schema, metadata): """Applies the bucketize transform to a numeric field. """ field = schema[feature.field] if not field.numeric: raise ValueError( 'A scale transform cannot be applied to non-numerical field "%s".' % feature.field) transform = feature.transform boundaries = map(float, transform['boundaries'].split(',')) # TODO: Figure out how to use tf.case instead of this contrib op from tensorflow.contrib.layers.python.ops.bucketization_op import bucketize # Create a one-hot encoded tensor. The dimension of this tensor is the set of buckets defined # by N boundaries == N + 1. # A squeeze is needed to remove the extra dimension added to the shape. value = instances[feature.field] value = tf.squeeze(tf.one_hot(bucketize(value, boundaries, name='bucket'), depth=len(boundaries) + 1, on_value=1.0, off_value=0.0, name='one_hot'), axis=1, name='bucketize') value.set_shape((None, len(boundaries) + 1)) return value
def test_normal_usecase(self):
    """Bucketize a small integer tensor and compare against known bucket ids."""
    boundaries = [0, 3, 8, 11]
    inputs = constant_op.constant([-5, 0, 2, 3, 5, 8, 10, 11, 12])
    # One expected bucket id per input value, in the same order as `inputs`.
    expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]

    bucketize_op = bucketization_op.bucketize(inputs, boundaries=boundaries)
    with self.test_session() as sess:
        result = sess.run(bucketize_op)
        self.assertAllEqual(expected_out, result)
def _transform_feature(self, inputs):
    """Bucketize the source column, optionally adding random noise first.

    When `self.add_random` is set and there are more than 30 boundaries after
    dropping the first one, the source values are perturbed with
    `random_normal` noise before bucketizing. The noise scale is one tenth of
    the standard deviation of those boundaries with 10 trimmed from each end.
    In every other case the source values are bucketized unchanged.

    Args:
        inputs: Object whose `get(column)` returns the tensor for a column.

    Returns:
        The output of `bucketization_op.bucketize` for the (possibly
        perturbed) source values.
    """
    values = inputs.get(self.source_column)

    if self.add_random:
        # The first boundary is excluded from the noise-scale estimate.
        trimmed = np.asarray(self.boundaries[1:])
        if len(trimmed) > 30:
            noise = trimmed[10:-10].std() / 10.
            jitter = random_normal(array_ops.shape(values), 0, noise)
            values = values + jitter

    return bucketization_op.bucketize(
        values, boundaries=list(self.boundaries), name="bucketize")
def test_invalid_boundaries_order(self):
    """Running bucketize with unsorted boundaries must raise InvalidArgumentError."""
    unsorted_boundaries = [0, 8, 3, 11]
    values = constant_op.constant([-5, 0])
    bucketized = bucketization_op.bucketize(
        values, boundaries=unsorted_boundaries)
    # The error is expected when the op is run, not when it is built.
    with self.test_session() as session, \
            self.assertRaises(errors_impl.InvalidArgumentError):
        session.run(bucketized)
def test_invalid_boundaries_order(self):
    """Expect InvalidArgumentError when the boundaries are not in sorted order."""
    inputs = constant_op.constant([-5, 0])
    # 8 precedes 3, so the boundary list is out of order.
    bad_boundaries = [0, 8, 3, 11]
    bucketize_op = bucketization_op.bucketize(inputs, boundaries=bad_boundaries)

    expected_error = errors_impl.InvalidArgumentError
    with self.test_session() as sess:
        with self.assertRaises(expected_error):
            sess.run(bucketize_op)