def test_filter_input_subsample_vocab(self):
  """Tests input filtering based on vocab subsampling."""
  # The outputs are non-deterministic, so set random seed to help ensure that
  # the outputs remain constant for testing.
  random_seed.set_random_seed(42)

  input_tensor = constant_op.constant([
      # keep_prob = (sqrt(30/(0.05*100)) + 1) * (0.05*100/30) = 0.57.
      b"the",
      b"answer",  # Not in vocab. (Always discarded)
      b"to",  # keep_prob = 0.75.
      b"life",  # keep_prob > 1. (Always kept)
      b"and",  # keep_prob = 0.48.
      b"universe"  # Below vocab threshold of 3. (Always discarded)
  ])
  keys = constant_op.constant([b"and", b"life", b"the", b"to", b"universe"])
  values = constant_op.constant([40, 8, 30, 20, 2], dtypes.int64)
  vocab_freq_table = lookup.HashTable(
      lookup.KeyValueTensorInitializer(keys, values), -1)

  with self.test_session():
    vocab_freq_table.init.run()
    output = skip_gram_ops._filter_input(
        input_tensor=input_tensor,
        vocab_freq_table=vocab_freq_table,
        vocab_min_count=3,
        vocab_subsampling=0.05,
        corpus_size=math_ops.reduce_sum(values),
        seed=9)
    self.assertAllEqual([b"the", b"to", b"life", b"and"], output.eval())
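# A standalone sketch (plain Python, not part of the original suite) of the
# keep_prob formula cited in the comments above, where `freq` is the token's
# corpus frequency:
#   keep_prob = (sqrt(freq / (subsampling * corpus_size)) + 1)
#               * (subsampling * corpus_size / freq)
import math

def subsample_keep_prob(freq, corpus_size, subsampling=0.05):
  """Probability that a token survives vocab subsampling."""
  ratio = subsampling * corpus_size / freq
  return (math.sqrt(1.0 / ratio) + 1.0) * ratio

# subsample_keep_prob(30, 100) ~= 0.57  (b"the")
# subsample_keep_prob(20, 100) == 0.75  (b"to")
# subsample_keep_prob(40, 100) ~= 0.48  (b"and")
# subsample_keep_prob(8, 100)  >  1.0   (b"life", always kept)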
def testCaptureHashTableInSharedIterator(self):
  worker, _ = test_util.create_local_cluster(1, 1)

  # NOTE(mrry): We must use the V2 variants of `HashTable`
  # etc. because these produce a `tf.resource`-typed output that is
  # compatible with the in-graph function implementation.
  default_val = -1
  keys = constant_op.constant(["brain", "salad", "surgery"])
  values = constant_op.constant([0, 1, 2], dtypes.int64)
  table = lookup_ops.HashTable(
      lookup_ops.KeyValueTensorInitializer(keys, values),
      default_val,
      shared_name="shared_table")

  input_sentences = dataset_ops.Dataset.from_tensor_slices(
      ["brain brain tank salad surgery", "surgery brain"])

  iterator = (
      input_sentences.map(lambda x: string_ops.string_split([x]).values)
      .map(table.lookup)
      .make_initializable_iterator(shared_name="shared_iterator"))
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with session.Session(worker[0].target) as sess:
    sess.run(table.initializer)
    sess.run(init_op)
    self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next))

  with session.Session(worker[0].target) as sess:
    self.assertAllEqual([2, 0], sess.run(get_next))
    with self.assertRaises(errors.OutOfRangeError):
      sess.run(get_next)
def test_skip_gram_sample_errors(self):
  """Tests various errors raised by skip_gram_sample()."""
  input_tensor = constant_op.constant([b"the", b"quick", b"brown"])

  invalid_skips = (
      # min_skips and max_skips must be >= 0.
      (-1, 2),
      (1, -2),
      # min_skips must be <= max_skips.
      (2, 1))
  for min_skips, max_skips in invalid_skips:
    tokens, labels = text.skip_gram_sample(
        input_tensor, min_skips=min_skips, max_skips=max_skips)
    with self.test_session() as sess, self.assertRaises(
        errors.InvalidArgumentError):
      sess.run([tokens, labels])

  # input_tensor must be of rank 1.
  with self.assertRaises(ValueError):
    invalid_tensor = constant_op.constant([[b"the"], [b"quick"], [b"brown"]])
    text.skip_gram_sample(invalid_tensor)

  # vocab_freq_table must be provided if vocab_min_count, vocab_subsampling,
  # or corpus_size is specified.
  dummy_input = constant_op.constant([""])
  with self.assertRaises(ValueError):
    text.skip_gram_sample(
        dummy_input, vocab_freq_table=None, vocab_min_count=1)
  with self.assertRaises(ValueError):
    text.skip_gram_sample(
        dummy_input, vocab_freq_table=None, vocab_subsampling=1e-5)
  with self.assertRaises(ValueError):
    text.skip_gram_sample(dummy_input, vocab_freq_table=None, corpus_size=100)
  with self.assertRaises(ValueError):
    text.skip_gram_sample(
        dummy_input,
        vocab_freq_table=None,
        vocab_subsampling=1e-5,
        corpus_size=100)

  # vocab_subsampling and corpus_size must both be present or absent.
  dummy_table = lookup.HashTable(
      lookup.KeyValueTensorInitializer([b"foo"], [10]), -1)
  with self.assertRaises(ValueError):
    text.skip_gram_sample(
        dummy_input,
        vocab_freq_table=dummy_table,
        vocab_subsampling=None,
        corpus_size=100)
  with self.assertRaises(ValueError):
    text.skip_gram_sample(
        dummy_input,
        vocab_freq_table=dummy_table,
        vocab_subsampling=1e-5,
        corpus_size=None)
def graph_fn():
  keys = [1, 0, -1]
  dataset = tf.data.Dataset.from_tensor_slices([[1, 2, -1, 5]])
  table = contrib_lookup.HashTable(
      initializer=contrib_lookup.KeyValueTensorInitializer(
          keys=keys, values=list(reversed(keys))),
      default_value=100)
  dataset = dataset.map(table.lookup)
  return dataset_builder.make_initializable_iterator(dataset).get_next()
def testMapCaptureLookupTable(self):
  default_val = -1
  keys = constant_op.constant(['brain', 'salad', 'surgery'])
  values = constant_op.constant([0, 1, 2], dtypes.int64)
  table = lookup.HashTable(
      lookup.KeyValueTensorInitializer(keys, values), default_val)
  dataset = Dataset.from_tensor_slices(['brain', 'salad', 'surgery'])
  dataset = dataset.map(table.lookup)
  it = datasets.Iterator(dataset)
  got = [x.numpy() for x in it]
  self.assertAllEqual([0, 1, 2], got)
def test_make_initializable_iterator_with_hashTable(self):
  keys = [1, 0, -1]
  dataset = tf.data.Dataset.from_tensor_slices([[1, 2, -1, 5]])
  table = contrib_lookup.HashTable(
      initializer=contrib_lookup.KeyValueTensorInitializer(
          keys=keys, values=list(reversed(keys))),
      default_value=100)
  dataset = dataset.map(table.lookup)
  data = dataset_builder.make_initializable_iterator(dataset).get_next()
  init = tf.tables_initializer()
  with self.test_session() as sess:
    sess.run(init)
    self.assertAllEqual(sess.run(data), [-1, 100, 1, 100])
def _load_json_dict_into_hashtable(self, filename):
  """Loads a JSON dictionary into a HashTable."""
  with tf.gfile.Open(filename, "r") as f:
    # Convert digit-string keys back to ints when decoding the JSON dict.
    # pylint: disable=g-long-lambda
    temp_dict = json.load(
        f,
        object_hook=lambda d:
        {int(k) if k.isdigit() else k: v for k, v in d.items()})
    # pylint: enable=g-long-lambda
  keys = list(temp_dict.keys())
  values = [temp_dict[k] for k in keys]
  feature_names_to_values = contrib_lookup.HashTable(
      contrib_lookup.KeyValueTensorInitializer(
          keys, values, key_dtype=tf.int64, value_dtype=tf.float32), -1)
  return feature_names_to_values
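# A minimal usage sketch for the loader above (a hypothetical helper, not
# part of the original code; the file name and feature ids are made up).
# Ids absent from the JSON dict resolve to the default value of -1.
def _example_feature_lookup(self):
  table = self._load_json_dict_into_hashtable("feature_values.json")
  values = table.lookup(tf.constant([3, 7, 42], dtype=tf.int64))
  with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    return sess.run(values)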
def test_filter_input_filter_vocab(self):
  """Tests input filtering based on vocab frequency table and thresholds."""
  input_tensor = constant_op.constant(
      [b"the", b"answer", b"to", b"life", b"and", b"universe"])
  keys = constant_op.constant([b"and", b"life", b"the", b"to", b"universe"])
  values = constant_op.constant([0, 1, 2, 3, 4], dtypes.int64)
  vocab_freq_table = lookup.HashTable(
      lookup.KeyValueTensorInitializer(keys, values), -1)

  with self.test_session():
    vocab_freq_table.init.run()

    # No vocab_freq_table specified - output should be the same as input.
    no_table_output = skip_gram_ops._filter_input(
        input_tensor=input_tensor,
        vocab_freq_table=None,
        vocab_min_count=None,
        vocab_subsampling=None,
        corpus_size=None,
        seed=None)
    self.assertAllEqual(input_tensor.eval(), no_table_output.eval())

    # vocab_freq_table specified, but no vocab_min_count - output should have
    # filtered out tokens not in the table (b"answer").
    table_output = skip_gram_ops._filter_input(
        input_tensor=input_tensor,
        vocab_freq_table=vocab_freq_table,
        vocab_min_count=None,
        vocab_subsampling=None,
        corpus_size=None,
        seed=None)
    self.assertAllEqual([b"the", b"to", b"life", b"and", b"universe"],
                        table_output.eval())

    # vocab_freq_table and vocab_min_count specified - output should have
    # filtered out tokens whose frequencies are below the threshold
    # (b"and": 0, b"life": 1).
    threshold_output = skip_gram_ops._filter_input(
        input_tensor=input_tensor,
        vocab_freq_table=vocab_freq_table,
        vocab_min_count=2,
        vocab_subsampling=None,
        corpus_size=None,
        seed=None)
    self.assertAllEqual([b"the", b"to", b"universe"], threshold_output.eval())
def __init__(self, data_dir, batch_size, part, processor=None):
  self.batch_size = batch_size
  self.processor = processor
  self.part = part
  self.num_preprocess_threads = 10
  self.min_queue_examples = 10 * batch_size

  if self.part == AFLWReader.DatasetPart.test:
    name_pattern = 'boundingboxesAFLW*.mat'
  else:
    raise Exception("Unsupported dataset part {}".format(part))

  self.paths = []
  self.bboxes = []
  self.path2bbox_idx = []
  for file_idx, fpath in enumerate(
      glob.glob(os.path.join(data_dir, name_pattern))):
    if file_idx >= FLAGS.max_num_parts:
      break
    mat = scipy.io.loadmat(fpath)
    img_names = np.transpose(mat['imnames'], (1, 0))
    bboxes = mat['total_boxes']
    # Keep boxes that are wider and taller than 40 pixels, have detection
    # confidence above 0.8, and have non-negative top-left coordinates.
    good_boxes_idx = np.logical_and(
        np.logical_and(bboxes[:, 2] - bboxes[:, 0] > 40,
                       bboxes[:, 3] - bboxes[:, 1] > 40),
        bboxes[:, 4] > 0.8)
    good_boxes_idx = np.logical_and(good_boxes_idx, bboxes[:, 0] >= 0)
    good_boxes_idx = np.logical_and(good_boxes_idx, bboxes[:, 1] >= 0)
    img_names = img_names[good_boxes_idx]
    bboxes = bboxes[good_boxes_idx]
    print(len(good_boxes_idx), len(bboxes))

    paths = [path[0][0] for path in img_names]
    self.bboxes.append(bboxes)
    self.paths.append(paths)
    # Map each image path to its index in the per-file bbox array.
    self.path2bbox_idx.append(
        lookup.HashTable(
            lookup.KeyValueTensorInitializer(paths,
                                             list(range(len(bboxes)))),
            default_value=-1))
def _create_saved_model_v1_with_hashtable(self):
  """Create a TensorFlow SavedModel V1 with an unused hash table for testing."""
  graph = tf.Graph()
  with graph.as_default():
    x = tf.placeholder('float32', [2, 2])
    w = tf.compat.v1.get_variable('w', shape=[2, 2])
    output = tf.compat.v1.matmul(x, w)
    init_op = w.initializer

    # Add a hash table that is not used by the output.
    keys = tf.constant(['key'])
    values = tf.constant([1])
    initializer = contrib_lookup.KeyValueTensorInitializer(keys, values)
    table = contrib_lookup.HashTable(initializer, -1)

    # Create a builder.
    save_dir = os.path.join(self._tmp_dir, SAVED_MODEL_DIR)
    builder = tf.compat.v1.saved_model.builder.SavedModelBuilder(save_dir)

    with tf.compat.v1.Session() as sess:
      # Run the initializer on `w`.
      sess.run(init_op)
      table.init.run()

      builder.add_meta_graph_and_variables(
          sess, [tf.compat.v1.saved_model.tag_constants.SERVING],
          signature_def_map={
              "serving_default":
                  tf.compat.v1.saved_model.signature_def_utils
                  .predict_signature_def(
                      inputs={"x": x}, outputs={"output": output})
          },
          assets_collection=None)

    builder.save()
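# A sketch (hypothetical companion helper, not in the original test) of how
# the SavedModel written above could be loaded back and its
# "serving_default" signature evaluated:
def _load_saved_model_v1_with_hashtable(self):
  save_dir = os.path.join(self._tmp_dir, SAVED_MODEL_DIR)
  with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.compat.v1.saved_model.loader.load(
        sess, [tf.compat.v1.saved_model.tag_constants.SERVING], save_dir)
    signature = meta_graph.signature_def["serving_default"]
    x = sess.graph.get_tensor_by_name(signature.inputs["x"].name)
    output = sess.graph.get_tensor_by_name(signature.outputs["output"].name)
    return sess.run(output, feed_dict={x: [[1.0, 2.0], [3.0, 4.0]]})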