def testStringToOneHashBucketLegacyHash(self):
    """Checks that hashing strings into a single bucket yields bucket 0."""
    with self.cached_session():
        strings = array_ops.placeholder(dtypes.string)
        # With only one bucket available, every input is expected at index 0.
        buckets = string_ops.string_to_hash_bucket(strings, 1)
        feed = {strings: ['a', 'b', 'c']}
        actual = buckets.eval(feed_dict=feed)
        expected = [0, 0, 0]
        self.assertAllEqual(expected, actual)
def replica_fn(elem):
    """Combines the hashed 'str' feature of `elem` with its 'int' feature.

    Args:
        elem: Mapping that provides the 'str', 'size' and 'int' entries.

    Returns:
        `elem['int'] * 10` plus the hashed string feature, after the latter
        has been sliced along its second axis.
    """
    # Typical preprocessing step: turn the string feature into a numeric one
    # by hashing it into 10 buckets.
    bucket_ids = string_to_hash_bucket(elem['str'], 10)
    # The string feature is dense, so trim its second axis to the width given
    # by the first entry of elem['size'] (per the original note, this matches
    # the size of the ragged int feature).
    width = elem['size'][0]
    trimmed_ids = bucket_ids[:, :width]
    # Mix the string-derived feature with the numeric dataset output.
    scaled_ints = elem['int'] * 10
    return scaled_ints + trimmed_ids
def insert_transformed_feature(self, columns_to_tensors):
    """Handles sparse column to id conversion.

    Hashes the values of the sparse tensor stored under `self.name` into
    `self.bucket_size` buckets and stores the result in `columns_to_tensors`
    keyed by this column object.

    Args:
        columns_to_tensors: Mapping that holds the input sparse tensor under
            `self.name`; it is updated in place with a new entry for `self`.
    """
    source = columns_to_tensors[self.name]
    lookup_name = self.name + "_lookup"
    hashed_values = string_ops.string_to_hash_bucket(
        source.values, self.bucket_size, name=lookup_name)
    # Only the values are replaced; indices and shape come from the input.
    columns_to_tensors[self] = ops.SparseTensor(
        source.indices, hashed_values, source.shape)
def testStringToHashBucketsLegacyHash(self):
    """Checks the bucket ids produced when hashing strings into 10 buckets."""
    # Expected buckets, derived from the 64-bit hash of each input:
    #   Hash64('a') -> 2996632905371535868  -> mod 10 -> 8
    #   Hash64('b') -> 5795986006276551370  -> mod 10 -> 0
    #   Hash64('c') -> 14899841994519054197 -> mod 10 -> 7
    expected = [8, 0, 7]
    with self.cached_session():
        strings = array_ops.placeholder(dtypes.string)
        buckets = string_ops.string_to_hash_bucket(strings, 10)
        feed = {strings: ['a', 'b', 'c']}
        actual = buckets.eval(feed_dict=feed)
        self.assertAllEqual(expected, actual)
def run_dataset_implementation(self, batch_size):
    """Times `string_to_hash_bucket` over batches drawn from a dataset.

    Args:
        batch_size: Number of elements per batch; the shuffle buffer is
            sized at 100 times this value.

    Returns:
        Mean wall-clock time per repeat divided by the number of batches
        consumed in each repeat.
    """
    num_repeats = 5
    num_batches = 5
    durations = []
    for _ in range(num_repeats):
        ds = (
            dataset_ops.Dataset.from_generator(
                word_gen, dtypes.string, tensor_shape.TensorShape([]))
            .shuffle(batch_size * 100)
            .batch(batch_size)
            .take(num_batches)
            .prefetch(num_batches)
        )
        begin = time.time()
        # Benchmarked code begins here.
        for batch in ds:
            _ = string_ops.string_to_hash_bucket(batch, num_buckets=2)
        # Benchmarked code ends here.
        finish = time.time()
        durations.append(finish - begin)
    mean_duration = np.mean(np.array(durations))
    return mean_duration / num_batches
def replica_fn(elem):
    """Hashes the 'str' feature of `elem` into 10 buckets and scales by 1000.

    Args:
        elem: Mapping that provides the 'str' entry to hash.

    Returns:
        The hashed feature multiplied by 1000.
    """
    # Typical preprocessing step: turn the string feature into a numeric one.
    bucket_ids = string_to_hash_bucket(elem['str'], 10)
    return 1000 * bucket_ids