def setUp(self):
  super(ChunkSizesTest, self).setUp()
  # Set up a DatasetSpecification with lots of classes and samples.
  self.dataset_spec = DatasetSpecification(
      name=None,
      classes_per_split=dict(zip(Split, [1000, 0, 0])),
      images_per_class={i: 1000 for i in range(1000)},
      class_names=None,
      path=None,
      file_pattern='{}.tfrecords')
def get_dataset_spec(path):
  return DatasetSpecification(
      name=None,
      classes_per_split={
          Split.TRAIN: 15,
          Split.VALID: 5,
          Split.TEST: 10
      },
      images_per_class=dict(enumerate([10, 20, 30] * 10)),
      class_names=None,
      path=path,
      file_pattern='{}.h5')
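# A minimal usage sketch for the helper above. The path argument is
# hypothetical, and this assumes the gin defaults for EpisodeDescriptionConfig
# (bound elsewhere in these tests) are already in place, so only the fixed
# ways/shots need to be passed explicitly.
def sample_fixed_episode_description(path):
  spec = get_dataset_spec(path)
  sampler = sampling.EpisodeDescriptionSampler(
      spec,
      Split.TRAIN,
      episode_descr_config=config.EpisodeDescriptionConfig(
          num_ways=5, num_support=5, num_query=5))
  # Returns the per-class (class_id, num_support, num_query) descriptions.
  return sampler.sample_episode_description()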
def test_flush_logic(self):
  """Tests the "flush" logic avoiding example duplication in an episode."""
  # Generate two episodes from un-shuffled data sources. For classes where
  # there are enough examples for both, new examples should be used for the
  # second episode. Otherwise, the first examples should be re-used.
  # A dataset_spec with classes containing between 10 and 39 examples.
  num_classes = 30
  dataset_spec = DatasetSpecification(
      name=None,
      classes_per_split={
          Split.TRAIN: num_classes,
          Split.VALID: 0,
          Split.TEST: 0
      },
      images_per_class={i: 10 + i for i in range(num_classes)},
      class_names=None,
      path=None,
      file_pattern='{}.tfrecords')
  # Sample from all train classes, with 5 + 5 examples per class in each
  # episode.
  sampler = sampling.EpisodeDescriptionSampler(
      dataset_spec,
      Split.TRAIN,
      episode_descr_config=config.EpisodeDescriptionConfig(
          num_ways=num_classes, num_support=5, num_query=5))
  episodes = self.generate_episodes(sampler, num_episodes=2, shuffle=False)

  # The "flush" part of the second episode should contain 0 examples from
  # class_id 0, 1 from class_id 1, ..., 9 from class_id 9, and then 0 from
  # class_id 10 and higher.
  chunk_sizes = sampler.compute_chunk_sizes()
  _, episode2 = episodes
  examples2, targets2 = episode2
  flush_target2, _, _ = split_into_chunks(targets2, chunk_sizes)
  for class_id in range(10):
    self.assertEqual(
        sum(target == class_id for target in flush_target2), class_id)
  for class_id in range(10, num_classes):
    self.assertEqual(
        sum(target == class_id for target in flush_target2), 0)

  # The "support" part of the second episode should start at example 0 for
  # class_ids from 0 to 9 (included), and at example 10 for class_id 10 and
  # higher.
  _, support_examples2, query_examples2 = split_into_chunks(
      examples2, chunk_sizes)

  def _build_class_id_to_example_ids(examples):
    # Build a mapping: class_id -> list of example ids.
    mapping = collections.defaultdict(list)
    for example in examples:
      if not example:
        # Padding is at the end.
        break
      class_id, example_id = example.decode().split('.')
      mapping[int(class_id)].append(int(example_id))
    return mapping

  support2_example_ids = _build_class_id_to_example_ids(support_examples2)
  query2_example_ids = _build_class_id_to_example_ids(query_examples2)
  for class_id in range(10):
    self.assertCountEqual(support2_example_ids[class_id], list(range(5)))
    self.assertCountEqual(query2_example_ids[class_id], list(range(5, 10)))
  for class_id in range(10, num_classes):
    self.assertCountEqual(support2_example_ids[class_id],
                          list(range(10, 15)))
    self.assertCountEqual(query2_example_ids[class_id],
                          list(range(15, 20)))
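# The test above relies on a `split_into_chunks` helper defined elsewhere in
# the test module. A minimal sketch of the behavior it assumes: `chunk_sizes`
# is the (flush, support, query) triple of chunk lengths returned by
# `compute_chunk_sizes`, and the flat per-episode sequence is cut into those
# three consecutive chunks.
def split_into_chunks(values, chunk_sizes):
  flush_size, support_size, query_size = chunk_sizes
  flush_chunk = values[:flush_size]
  support_chunk = values[flush_size:flush_size + support_size]
  query_chunk = values[flush_size + support_size:
                       flush_size + support_size + query_size]
  return flush_chunk, support_chunk, query_chunk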
from meta_dataset.data import reader
from meta_dataset.data import sampling
from meta_dataset.data.dataset_spec import DatasetSpecification
from meta_dataset.data.learning_spec import Split
import numpy as np
from six.moves import range
from six.moves import zip
import tensorflow.compat.v1 as tf

# DatasetSpecification to use in tests.
DATASET_SPEC = DatasetSpecification(
    name=None,
    classes_per_split={
        Split.TRAIN: 15,
        Split.VALID: 5,
        Split.TEST: 10
    },
    images_per_class=dict(enumerate([10, 20, 30] * 10)),
    class_names=None,
    path=None,
    file_pattern='{}.tfrecords')

# Define defaults and set the Gin configuration for EpisodeDescriptionConfig.
MIN_WAYS = 5
MAX_WAYS_UPPER_BOUND = 50
MAX_NUM_QUERY = 10
MAX_SUPPORT_SET_SIZE = 500
MAX_SUPPORT_SIZE_CONTRIB_PER_CLASS = 100
MIN_LOG_WEIGHT = np.log(0.5)
MAX_LOG_WEIGHT = np.log(2)
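# One way these module-level defaults can be bound for the tests (a sketch;
# the pipeline tests below inline the same values through gin.parse_config
# instead):
import gin


def bind_episode_descr_config_defaults():
  gin.bind_parameter('EpisodeDescriptionConfig.min_ways', MIN_WAYS)
  gin.bind_parameter('EpisodeDescriptionConfig.max_ways_upper_bound',
                     MAX_WAYS_UPPER_BOUND)
  gin.bind_parameter('EpisodeDescriptionConfig.max_num_query', MAX_NUM_QUERY)
  gin.bind_parameter('EpisodeDescriptionConfig.max_support_set_size',
                     MAX_SUPPORT_SET_SIZE)
  gin.bind_parameter(
      'EpisodeDescriptionConfig.max_support_size_contrib_per_class',
      MAX_SUPPORT_SIZE_CONTRIB_PER_CLASS)
  gin.bind_parameter('EpisodeDescriptionConfig.min_log_weight',
                     MIN_LOG_WEIGHT)
  gin.bind_parameter('EpisodeDescriptionConfig.max_log_weight',
                     MAX_LOG_WEIGHT)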
def test_make_multisource_episode_pipeline_feature(self, decoder_type,
                                                   config_file_path):
  # Create some feature records and write them to a temp directory.
  feat_size = 64
  num_examples = 100
  num_classes = 10
  output_path = self.get_temp_dir()
  gin.parse_config_file(config_file_path)

  # 1- Write feature records to the temp directory.
  self.rng = np.random.RandomState(0)
  class_features = []
  class_examples = []
  for class_id in range(num_classes):
    features = self.rng.randn(num_examples, feat_size).astype(np.float32)
    label = np.array(class_id).astype(np.int64)
    output_file = os.path.join(output_path, str(class_id) + '.tfrecords')
    examples = test_utils.write_feature_records(features, label, output_file)
    class_examples.append(examples)
    class_features.append(features)
  class_examples = np.stack(class_examples)
  class_features = np.stack(class_features)

  # 2- Read the records back using the multi-source pipeline.
  # DatasetSpecification to use in tests.
  dataset_spec = DatasetSpecification(
      name=None,
      classes_per_split={
          learning_spec.Split.TRAIN: 5,
          learning_spec.Split.VALID: 2,
          learning_spec.Split.TEST: 3
      },
      images_per_class={i: num_examples for i in range(num_classes)},
      class_names=None,
      path=output_path,
      file_pattern='{}.tfrecords')

  # Duplicate the dataset to simulate reading from multiple datasets.
  use_bilevel_ontology_list = [False] * 2
  use_dag_ontology_list = [False] * 2
  all_dataset_specs = [dataset_spec] * 2

  fixed_ways_shots = config.EpisodeDescriptionConfig(
      num_query=5, num_support=5, num_ways=5)

  dataset_episodic = pipeline.make_multisource_episode_pipeline(
      dataset_spec_list=all_dataset_specs,
      use_dag_ontology_list=use_dag_ontology_list,
      use_bilevel_ontology_list=use_bilevel_ontology_list,
      episode_descr_config=fixed_ways_shots,
      split=learning_spec.Split.TRAIN,
      image_size=None)

  episode, _ = self.evaluate(
      dataset_episodic.make_one_shot_iterator().get_next())

  if decoder_type == 'feature':
    # 3- Check that support and query features are in class_features and
    # have the correct corresponding label.
    support_features, support_class_ids = episode[0], episode[2]
    query_features, query_class_ids = episode[3], episode[5]

    for feat, class_id in zip(list(support_features),
                              list(support_class_ids)):
      abs_err = np.abs(np.sum(class_features - feat[None][None], axis=-1))
      # Make sure the feature is present in the original data.
      self.assertEqual(abs_err.min(), 0.0)
      found_class_id = np.where(abs_err == 0.0)[0][0]
      self.assertEqual(found_class_id, class_id)

    for feat, class_id in zip(list(query_features), list(query_class_ids)):
      abs_err = np.abs(np.sum(class_features - feat[None][None], axis=-1))
      # Make sure the feature is present in the original data.
      self.assertEqual(abs_err.min(), 0.0)
      found_class_id = np.where(abs_err == 0.0)[0][0]
      self.assertEqual(found_class_id, class_id)
  elif decoder_type == 'none':
    # 3- Check that support and query examples are in class_examples and
    # have the correct corresponding label.
    support_examples, support_class_ids = episode[0], episode[2]
    query_examples, query_class_ids = episode[3], episode[5]

    for example, class_id in zip(list(support_examples),
                                 list(support_class_ids)):
      found_class_id = np.where(class_examples == example)[0][0]
      self.assertEqual(found_class_id, class_id)

    for example, class_id in zip(list(query_examples),
                                 list(query_class_ids)):
      found_class_id = np.where(class_examples == example)[0][0]
      self.assertEqual(found_class_id, class_id)
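# The test above receives (decoder_type, config_file_path) arguments, which
# suggests it is parameterized. A sketch of the wiring it assumes, using
# absl's parameterized test support; the gin config paths here are
# hypothetical placeholders, not the repository's actual files:
from absl.testing import parameterized


class MultisourcePipelineTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      ('feature_decoder', 'feature', 'path/to/feature_decoder_config.gin'),
      ('no_decoder', 'none', 'path/to/no_decoder_config.gin'))
  def test_make_multisource_episode_pipeline_feature(self, decoder_type,
                                                     config_file_path):
    ...  # Body as defined above.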
def test_make_multisource_episode_pipeline_feature(self):

  def iterate_dataset(dataset, n):
    """Iterates over `dataset`, yielding up to `n` elements."""
    if not tf.executing_eagerly():
      iterator = dataset.make_one_shot_iterator()
      next_element = iterator.get_next()
      with tf.Session() as sess:
        for idx in range(n):
          yield idx, sess.run(next_element)
    else:
      for idx, episode in enumerate(dataset):
        if idx == n:
          break
        yield idx, episode

  def write_feature_records(features, label, output_path):
    """Creates a record file from features and labels.

    Args:
      features: An [n, m] numpy array of features.
      label: A numpy array containing the label.
      output_path: A string specifying the location of the record.
    """
    writer = tf.python_io.TFRecordWriter(output_path)
    with self.session(use_gpu=False) as sess:
      for feat in list(features):
        feat_serial = sess.run(tf.io.serialize_tensor(feat))
        # Write the example.
        dataset_to_records.write_example(
            feat_serial,
            label,
            writer,
            input_key='image/embedding',
            label_key='image/class/label')
    writer.close()

  # Create some feature records and write them to a temp directory.
  feat_size = 64
  num_examples = 100
  num_classes = 10
  output_path = self.get_temp_dir()
  gin.parse_config("""
      import meta_dataset.data.decoder

      EpisodeDescriptionConfig.min_ways = 5
      EpisodeDescriptionConfig.max_ways_upper_bound = 50
      EpisodeDescriptionConfig.max_num_query = 10
      EpisodeDescriptionConfig.max_support_set_size = 500
      EpisodeDescriptionConfig.max_support_size_contrib_per_class = 100
      EpisodeDescriptionConfig.min_log_weight = -0.69314718055994529  # np.log(0.5)
      EpisodeDescriptionConfig.max_log_weight = 0.69314718055994529  # np.log(2)
      EpisodeDescriptionConfig.ignore_dag_ontology = False
      EpisodeDescriptionConfig.ignore_bilevel_ontology = False
      process_episode.support_decoder = @FeatureDecoder()
      process_episode.query_decoder = @FeatureDecoder()
      """)

  # 1- Write feature records to the temp directory.
  self.rng = np.random.RandomState(0)
  class_features = []
  for class_id in range(num_classes):
    features = self.rng.randn(num_examples, feat_size).astype(np.float32)
    label = np.array(class_id).astype(np.int64)
    output_file = os.path.join(output_path, str(class_id) + '.tfrecords')
    write_feature_records(features, label, output_file)
    class_features.append(features)
  class_features = np.stack(class_features)

  # 2- Read the records back using the multi-source pipeline.
  # DatasetSpecification to use in tests.
  dataset_spec = DatasetSpecification(
      name=None,
      classes_per_split={
          learning_spec.Split.TRAIN: 5,
          learning_spec.Split.VALID: 2,
          learning_spec.Split.TEST: 3
      },
      images_per_class={i: num_examples for i in range(num_classes)},
      class_names=None,
      path=output_path,
      file_pattern='{}.tfrecords')

  # Duplicate the dataset to simulate reading from multiple datasets.
  use_bilevel_ontology_list = [False] * 2
  use_dag_ontology_list = [False] * 2
  all_dataset_specs = [dataset_spec] * 2

  fixed_ways_shots = config.EpisodeDescriptionConfig(
      num_query=5, num_support=5, num_ways=5)

  dataset_episodic = pipeline.make_multisource_episode_pipeline(
      dataset_spec_list=all_dataset_specs,
      use_dag_ontology_list=use_dag_ontology_list,
      use_bilevel_ontology_list=use_bilevel_ontology_list,
      episode_descr_config=fixed_ways_shots,
      split=learning_spec.Split.TRAIN,
      image_size=None)

  _, episode = next(iterate_dataset(dataset_episodic, 1))

  # 3- Check that support and query features are in class_features and have
  # the correct corresponding label.
  support_features, support_class_ids = episode[0], episode[2]
  query_features, query_class_ids = episode[3], episode[5]

  for feat, class_id in zip(list(support_features), list(support_class_ids)):
    abs_err = np.abs(np.sum(class_features - feat[None][None], axis=-1))
    # Make sure the feature is present in the original data.
    self.assertEqual(abs_err.min(), 0.0)
    found_class_id = np.where(abs_err == 0.0)[0][0]
    self.assertEqual(found_class_id, class_id)

  for feat, class_id in zip(list(query_features), list(query_class_ids)):
    abs_err = np.abs(np.sum(class_features - feat[None][None], axis=-1))
    # Make sure the feature is present in the original data.
    self.assertEqual(abs_err.min(), 0.0)
    found_class_id = np.where(abs_err == 0.0)[0][0]
    self.assertEqual(found_class_id, class_id)
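# Why the zero-error lookup above works (a standalone sketch): for random
# Gaussian features, the per-example sum of differences is exactly zero only
# where `feat` matches a stored row (accidental float cancellation across the
# feature axis is vanishingly unlikely for random data), so the first axis
# returned by np.where recovers the class index of the matching row.
rng = np.random.RandomState(0)
bank = rng.randn(3, 4, 8).astype(np.float32)  # (classes, examples, feat_size)
feat = bank[1, 2]
abs_err = np.abs(np.sum(bank - feat[None][None], axis=-1))
assert abs_err.min() == 0.0
assert np.where(abs_err == 0.0)[0][0] == 1  # The class index is recovered.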