def load_data(cache_dir=None):
    """Loads the federated Shakespeare dataset.

    The archive is downloaded and cached locally. If it was downloaded
    previously, the loader tries to read the dataset from the cache instead.

    The dataset is derived from the Leaf repository
    (https://github.com/TalwalkarLab/leaf) pre-processing on the works of
    Shakespeare, which is published in "LEAF: A Benchmark for Federated
    Settings" https://arxiv.org/abs/1812.01097.

    The data set consists of 715 users (characters of Shakespeare plays),
    where each example corresponds to a contiguous set of lines spoken by the
    character in a given play.

    Data set sizes:

    -   train: 16,068 examples
    -   test: 2,356 examples

    Rather than holding out specific users, each user's examples are split
    across _train_ and _test_ so that all users have at least one example in
    _train_ and one example in _test_. Characters that had less than 2
    examples are excluded from the data set.

    The `tf.data.Datasets` returned by
    `tff.simulation.ClientData.create_tf_dataset_for_client` will yield
    `collections.OrderedDict` objects at each iteration, with the following
    keys and values:

    -   `'snippets'`: a `tf.Tensor` with `dtype=tf.string`, the snippet of
        contiguous text.

    Args:
      cache_dir: (Optional) directory to cache the downloaded file. If `None`,
        caches in Keras' default cache directory.

    Returns:
      Tuple of (train, test) where the tuple elements are
      `tff.simulation.ClientData` objects.
    """
    archive_path = tf.keras.utils.get_file(
        'shakespeare.tar.bz2',
        origin='https://storage.googleapis.com/tff-datasets-public/shakespeare.tar.bz2',
        file_hash='0285be9906cb5f268092eee4edeeacfc2af4574f2941f7cc2f08a321d7f5c707',
        hash_algorithm='sha256',
        extract=True,
        archive_format='tar',
        cache_dir=cache_dir)

    # The .h5 files are looked up next to the downloaded archive.
    extracted_dir = os.path.dirname(archive_path)
    train_client_data, test_client_data = (
        hdf5_client_data.HDF5ClientData(os.path.join(extracted_dir, h5_name))
        for h5_name in ('shakespeare_train.h5', 'shakespeare_test.h5'))
    return train_client_data, test_client_data
def get_emnist_dataset(dataset_filename):
    """Loads and preprocesses the EMNIST dataset from local HDF5 files.

    Training data is read from `./dataset/<dataset_filename>.h5` and test data
    from `./dataset/test.h5`.

    Args:
      dataset_filename: Name of the training file inside `./dataset/`, without
        the `.h5` extension.

    Returns:
      A `(emnist_train, emnist_test)` tuple where `emnist_train` is a
      `tff.simulation.ClientData` object representing the training data and
      `emnist_test` is a single `tf.data.Dataset` representing the test data
      of all clients.
    """
    train_source = hdf5_client_data.HDF5ClientData(
        f'./dataset/{dataset_filename}.h5')
    test_source = hdf5_client_data.HDF5ClientData('./dataset/test.h5')

    def element_fn(element):
        # Append a trailing axis to the pixels and rename the keys to x / y.
        pixels = tf.expand_dims(element['pixels'], -1)
        return collections.OrderedDict([('x', pixels), ('y', element['label'])])

    def preprocess_train_dataset(dataset):
        # Use buffer_size same as the maximum client dataset size,
        # 418 for Federated EMNIST
        shuffled = dataset.map(element_fn).shuffle(buffer_size=418)
        repeated = shuffled.repeat(count=FLAGS.client_epochs_per_round)
        return repeated.batch(FLAGS.batch_size, drop_remainder=False)

    def preprocess_test_dataset(dataset):
        mapped = dataset.map(element_fn)
        return mapped.batch(FLAGS.test_batch_size, drop_remainder=False)

    emnist_train = train_source.preprocess(preprocess_train_dataset)
    all_test_examples = test_source.create_tf_dataset_from_all_clients()
    emnist_test = preprocess_test_dataset(all_test_examples)
    return emnist_train, emnist_test
def test_create_tf_dataset_for_client(self):
    """Checks every transformed client's dataset against TEST_DATA.

    For each transformed client id the expected examples are the raw client's
    TEST_DATA entries with `10 * index` added to `'x'`, where the raw client
    name and the index are parsed out of the transformed id.
    """
    raw_client_data = hdf5_client_data.HDF5ClientData(
        TransformingClientDataTest.test_data_filepath)
    transformed = transforming_client_data.TransformingClientData(
        raw_client_data, _test_transform, 9)
    id_pattern = r'^(.*)_(\d*)$'

    for client_id in transformed.client_ids:
        tf_dataset = transformed.create_tf_dataset_for_client(client_id)
        self.assertIsInstance(tf_dataset, tf.data.Dataset)

        # Split the id into the raw client name and the numeric suffix.
        parsed_id = re.search(id_pattern, client_id)
        client, index = parsed_id.group(1), int(parsed_id.group(2))

        for i, actual in enumerate(tf_dataset):
            expected = {}
            for k, v in six.iteritems(TEST_DATA[client]):
                expected[k] = v[i]
            expected['x'] = expected['x'] + 10 * index

            self.assertCountEqual(actual, expected)
            for k, v in six.iteritems(actual):
                self.assertAllEqual(v.numpy(), expected[k])
def test_create_tf_dataset_from_all_clients(self):
    """Checks the dataset built from all transformed clients.

    Builds the list of expected examples from TEST_DATA (each raw client
    repeated `expansion_factor` times with `10 * index` added to `'x'`) and
    consumes one expected example per element yielded by the dataset. The
    test passes only if the dataset yields exactly as many elements as were
    expected.
    """
    client_data = hdf5_client_data.HDF5ClientData(
        TransformingClientDataTest.test_data_filepath)
    num_transformed_clients = 9
    transformed_client_data = transforming_client_data.TransformingClientData(
        client_data, _test_transform, num_transformed_clients)
    expansion_factor = num_transformed_clients // len(TEST_DATA)

    tf_dataset = transformed_client_data.create_tf_dataset_from_all_clients()
    self.assertIsInstance(tf_dataset, tf.data.Dataset)

    expected_examples = []
    for expected_data in TEST_DATA.values():
        for index in range(expansion_factor):
            for i in range(len(expected_data['x'])):
                example = {k: v[i] for k, v in expected_data.items()}
                # Build a new value rather than using `+=`: if `v[i]` is a
                # view into a shared array, in-place addition would modify
                # TEST_DATA itself and leak into later iterations and tests.
                example['x'] = example['x'] + 10 * index
                expected_examples.append(example)

    for actual in tf_dataset:
        expected = expected_examples.pop(0)
        # `tf.contrib` is not available in TF 2.x; `tf.nest` is the supported
        # location of `map_structure`.
        actual = tf.nest.map_structure(lambda t: t.numpy(), actual)
        # NOTE(review): when both arguments are mappings, assertCountEqual
        # compares their keys only, not the values.
        self.assertCountEqual(actual, expected)
    self.assertEmpty(expected_examples)
def test_create_tf_dataset_from_all_clients(self):
    """Checks the all-clients dataset produced with `_test_transform_cons`.

    Expected examples are copies of the TEST_DATA entries, repeated
    `expansion_factor` times per raw client with `10 * index` added to `'x'`.
    One expected example is consumed per dataset element, and none may be
    left over at the end.
    """
    raw_client_data = hdf5_client_data.HDF5ClientData(
        TransformingClientDataTest.test_data_filepath)
    num_transformed_clients = 9
    transformed = transforming_client_data.TransformingClientData(
        raw_client_data, _test_transform_cons, num_transformed_clients)
    expansion_factor = num_transformed_clients // len(TEST_DATA)

    tf_dataset = transformed.create_tf_dataset_from_all_clients()
    self.assertIsInstance(tf_dataset, tf.data.Dataset)

    expected_examples = []
    for expected_data in TEST_DATA.values():
        num_examples = len(expected_data['x'])
        for index in range(expansion_factor):
            for i in range(num_examples):
                # Copy each value so the in-place shift below cannot touch
                # the shared TEST_DATA fixture.
                example = {}
                for k, v in expected_data.items():
                    example[k] = v[i].copy()
                example['x'] += 10 * index
                expected_examples.append(example)

    for element in tf_dataset:
        actual = self.evaluate(element)
        expected = expected_examples.pop(0)
        self.assertCountEqual(actual, expected)
    self.assertEmpty(expected_examples)
def test_client_ids_property(self):
    """Checks the number of client ids for a fractional third argument (2.5).

    The expected count is `len(TEST_DATA) * 2.5` truncated to an integer.
    """
    raw_client_data = hdf5_client_data.HDF5ClientData(
        TransformingClientDataTest.test_data_filepath)
    expansion_factor = 2.5
    expected_num_ids = int(len(TEST_DATA) * expansion_factor)

    transformed = transforming_client_data.TransformingClientData(
        raw_client_data, lambda: 0, expansion_factor)

    self.assertLen(transformed.client_ids, expected_num_ids)
def test_output_shapes_property(self):
    """Checks `output_shapes` of the test HDF5 file against known shapes."""
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)

    scalar_shape = tf.TensorShape([])
    expected_shapes = {
        'x': tf.TensorShape([2]),
        'y': scalar_shape,
        'z': scalar_shape,
    }

    self.assertDictEqual(client_data.output_shapes, expected_shapes)
def test_output_types_property(self):
    """Checks `output_types` of the test HDF5 file against known dtypes."""
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)

    expected_types = dict(
        w=tf.int64,
        x=tf.int32,
        y=tf.float32,
        z=tf.string,
    )

    self.assertDictEqual(client_data.output_types, expected_types)
def test_element_type_structure(self):
    """Checks `element_type_structure` against the expected `tf.TensorSpec`s."""
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)

    # Every feature is a scalar except 'x', which has shape [2].
    expected_structure = dict(
        w=tf.TensorSpec(shape=[], dtype=tf.int64),
        x=tf.TensorSpec(shape=[2], dtype=tf.int32),
        y=tf.TensorSpec(shape=[], dtype=tf.float32),
        z=tf.TensorSpec(shape=[], dtype=tf.string),
    )

    self.assertDictEqual(client_data.element_type_structure,
                         expected_structure)
def test_create_tf_dataset_from_all_clients(self):
    """Checks the dataset built from all clients of the test HDF5 file.

    One expected example (taken from TEST_DATA, client by client) is consumed
    per dataset element, and none may be left over at the end.
    """
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)
    tf_dataset = client_data.create_tf_dataset_from_all_clients()
    self.assertIsInstance(tf_dataset, tf.data.Dataset)

    # Flatten TEST_DATA into one per-example dict for every row of every
    # client.
    expected_examples = [
        {k: v[i] for k, v in expected_data.items()}
        for expected_data in TEST_DATA.values()
        for i in range(len(expected_data['x']))
    ]

    for element in tf_dataset:
        expected = expected_examples.pop(0)
        actual = self.evaluate(element)
        self.assertCountEqual(actual, expected)
    self.assertEmpty(expected_examples)
def test_client_ids_property(self):
    """Checks count, element type and ordering of transformed client ids."""
    raw_client_data = hdf5_client_data.HDF5ClientData(
        TransformingClientDataTest.test_data_filepath)
    num_transformed_clients = 7
    transformed = transforming_client_data.TransformingClientData(
        raw_client_data, _test_transform, num_transformed_clients)
    client_ids = transformed.client_ids

    # One id per requested transformed client.
    self.assertLen(client_ids, num_transformed_clients)

    # Every id must be a string.
    for client_id in client_ids:
        self.assertIsInstance(client_id, str)

    # The ids must already be in sorted order.
    ids_in_sorted_order = sorted(client_ids)
    self.assertListEqual(client_ids, ids_in_sorted_order)
def test_create_tf_dataset_for_client(self):
    """Checks the per-client datasets of the test HDF5 file.

    For every client in TEST_DATA, one expected example is consumed per
    dataset element, and none may be left over once the dataset is exhausted.
    """
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)

    # Visit each client and make sure a tf.data.Dataset with the right data
    # comes back.
    for client_id, expected_data in TEST_DATA.items():
        tf_dataset = client_data.create_tf_dataset_for_client(client_id)
        self.assertIsInstance(tf_dataset, tf.data.Dataset)

        expected_examples = [
            {k: v[i] for k, v in expected_data.items()}
            for i in range(len(expected_data['x']))
        ]

        for element in tf_dataset:
            expected = expected_examples.pop(0)
            actual = self.evaluate(element)
            self.assertCountEqual(actual, expected)
        self.assertEmpty(expected_examples)
def test_create_tf_dataset_from_all_clients(self):
    """Checks the dataset built from all clients of the test HDF5 file.

    Dataset elements are converted to numpy values and matched, one by one,
    against per-example dicts derived from TEST_DATA. No expected example may
    be left over at the end.
    """
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)
    tf_dataset = client_data.create_tf_dataset_from_all_clients()
    self.assertIsInstance(tf_dataset, tf.data.Dataset)

    expected_examples = []
    for expected_data in six.itervalues(TEST_DATA):
        num_examples = len(expected_data['x'])
        for i in range(num_examples):
            example = {}
            for k, v in six.iteritems(expected_data):
                example[k] = v[i]
            expected_examples.append(example)

    for element in tf_dataset:
        expected = expected_examples.pop(0)
        actual = tf.nest.map_structure(lambda t: t.numpy(), element)
        self.assertCountEqual(actual, expected)
    self.assertEmpty(expected_examples)
def test_fail_on_bad_client_id(self):
    """Checks that unknown transformed client ids raise `ValueError`."""
    raw_client_data = hdf5_client_data.HDF5ClientData(
        TransformingClientDataTest.test_data_filepath)
    transformed = transforming_client_data.TransformingClientData(
        raw_client_data, _test_transform_cons, 7)

    # These three ids are expected to be accepted.
    for valid_id in ('CLIENT A_1', 'CLIENT B_1', 'CLIENT A_2'):
        transformed.create_tf_dataset_for_client(valid_id)

    error_pattern = 'client_id must be a valid string from client_ids.'

    # Rejected: there is no corresponding client.
    with self.assertRaisesRegex(ValueError, error_pattern):
        transformed.create_tf_dataset_for_client('CLIENT D_0')

    # Rejected: the index is out of range.
    with self.assertRaisesRegex(ValueError, error_pattern):
        transformed.create_tf_dataset_for_client('CLIENT B_2')
def test_create_tf_dataset_for_client(self):
    """Checks the per-client datasets of the test HDF5 file.

    For every client in TEST_DATA, dataset elements are converted to numpy
    values and matched one by one against per-example dicts derived from
    TEST_DATA. No expected example may be left over once the dataset is
    exhausted.
    """
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)
    # Iterate over each client, ensuring we received a tf.data.Dataset with
    # the correct data.
    for client_id, expected_data in TEST_DATA.items():
        tf_dataset = client_data.create_tf_dataset_for_client(client_id)
        self.assertIsInstance(tf_dataset, tf.data.Dataset)

        expected_examples = [
            {k: v[i] for k, v in expected_data.items()}
            for i in range(len(expected_data['x']))
        ]

        for actual in tf_dataset:
            expected = expected_examples.pop(0)
            # `tf.contrib` is not available in TF 2.x; `tf.nest` is the
            # supported location of `map_structure`.
            actual = tf.nest.map_structure(lambda t: t.numpy(), actual)
            # NOTE(review): when both arguments are mappings,
            # assertCountEqual compares their keys only, not the values.
            self.assertCountEqual(actual, expected)
        self.assertEmpty(expected_examples)
def load_data(only_digits=True, cache_dir=None):
    """Loads the Federated EMNIST dataset.

    The archive is downloaded and cached locally. If it was downloaded
    previously, the loader tries to read the dataset from the cache instead.

    The dataset is derived from the Leaf repository
    (https://github.com/TalwalkarLab/leaf) pre-processing of the Extended
    MNIST dataset, grouping examples by writer. Details about Leaf were
    published in "LEAF: A Benchmark for Federated Settings"
    https://arxiv.org/abs/1812.01097.

    Data set sizes:

    *only_digits=True*: 3,383 users, 10 label classes

    -   train: 341,873 examples
    -   test: 40,832 examples

    *only_digits=False*: 3,400 users, 62 label classes

    -   train: 671,585 examples
    -   test: 77,483 examples

    Rather than holding out specific users, each user's examples are split
    across _train_ and _test_ so that all users have at least one example in
    _train_ and one example in _test_. Writers that had less than 2 examples
    are excluded from the data set.

    The `tf.data.Datasets` returned by
    `tff.simulation.ClientData.create_tf_dataset_for_client` will yield
    `collections.OrderedDict` objects at each iteration, with the following
    keys and values:

    -   `'pixels'`: a `tf.Tensor` with `dtype=tf.float32` and shape [28, 28],
        containing the pixels of the handwritten digit.
    -   `'label'`: a `tf.Tensor` with `dtype=tf.int32` and shape [1], the
        class label of the corresponding pixels.

    Args:
      only_digits: (Optional) whether to only include examples that are from
        the digits [0-9] classes. If `False`, includes lower and upper case
        characters, for a total of 62 class labels.
      cache_dir: (Optional) directory to cache the downloaded file. If `None`,
        caches in Keras' default cache directory.

    Returns:
      Tuple of (train, test) where the tuple elements are
      `tff.simulation.ClientData` objects.
    """
    # Each variant has its own archive name and checksum.
    if only_digits:
        fileprefix, sha256 = (
            'fed_emnist_digitsonly',
            '55333deb8546765427c385710ca5e7301e16f4ed8b60c1dc5ae224b42bd5b14b',
        )
    else:
        fileprefix, sha256 = (
            'fed_emnist',
            'fe1ed5a502cea3a952eb105920bff8cffb32836b5173cb18a57a32c3606f3ea0',
        )

    archive_name = fileprefix + '.tar.bz2'
    archive_url = (
        'https://storage.googleapis.com/tff-datasets-public/' + archive_name)
    archive_path = tf.keras.utils.get_file(
        archive_name,
        origin=archive_url,
        file_hash=sha256,
        hash_algorithm='sha256',
        extract=True,
        archive_format='tar',
        cache_dir=cache_dir)

    # The .h5 files are looked up next to the downloaded archive.
    extracted_dir = os.path.dirname(archive_path)
    train_client_data, test_client_data = (
        hdf5_client_data.HDF5ClientData(
            os.path.join(extracted_dir, fileprefix + suffix))
        for suffix in ('_train.h5', '_test.h5'))
    return train_client_data, test_client_data
def load_data(cache_dir=None):
    """Loads a federated version of the CIFAR-100 dataset.

    The dataset is downloaded and cached locally. If previously downloaded, it
    tries to load the dataset from cache.

    The dataset is derived from the [CIFAR-100
    dataset](https://www.cs.toronto.edu/~kriz/cifar.html). The training and
    testing examples are partitioned across 500 and 100 clients
    (respectively). No clients share any data samples, so it is a true
    partition of CIFAR-100. The train clients have string client IDs in the
    range [0-499], while the test clients have string client IDs in the range
    [0-99]. The train clients form a true partition of the CIFAR-100 training
    split, while the test clients form a true partition of the CIFAR-100
    testing split.

    The data partitioning is done using a hierarchical Latent Dirichlet
    Allocation (LDA) process, referred to as the
    [Pachinko Allocation Method](https://people.cs.umass.edu/~mccallum/papers/pam-icml06.pdf)
    (PAM). This method uses a two-stage LDA process, where each client has an
    associated multinomial distribution over the coarse labels of CIFAR-100,
    and a coarse-to-fine label multinomial distribution for that coarse label
    over the labels under that coarse label. The coarse label multinomial is
    drawn from a symmetric Dirichlet with parameter 0.1, and each
    coarse-to-fine multinomial distribution is drawn from a symmetric
    Dirichlet with parameter 10. Each client has 100 samples. To generate a
    sample for the client, we first select a coarse label by drawing from the
    coarse label multinomial distribution, and then draw a fine label using
    the coarse-to-fine multinomial distribution. We then randomly draw a
    sample from CIFAR-100 with that label (without replacement). If this
    exhausts the set of samples with this label, we remove the label from the
    coarse-to-fine multinomial and renormalize the multinomial distribution.

    Data set sizes (100 samples for each of the 500 train and 100 test
    clients):

    -   train: 50,000 examples
    -   test: 10,000 examples

    The `tf.data.Datasets` returned by
    `tff.simulation.ClientData.create_tf_dataset_for_client` will yield
    `collections.OrderedDict` objects at each iteration, with the following
    keys and values:

    -   `'coarse_label'`: a `tf.Tensor` with `dtype=tf.int64` and shape [1]
        that corresponds to the coarse label of the associated image. Labels
        are in the range [0-19].
    -   `'image'`: a `tf.Tensor` with `dtype=tf.uint8` and shape [32, 32, 3],
        corresponding to the pixels of the image, with values in the range
        [0, 255].
    -   `'label'`: a `tf.Tensor` with `dtype=tf.int64` and shape [1], the
        class label of the corresponding image. Labels are in the range
        [0-99].

    Args:
      cache_dir: (Optional) directory to cache the downloaded file. If `None`,
        caches in Keras' default cache directory.

    Returns:
      Tuple of (train, test) where the tuple elements are
      `tff.simulation.ClientData` objects.
    """
    path = tf.keras.utils.get_file(
        'fed_cifar100.tar.bz2',
        origin='https://storage.googleapis.com/tff-datasets-public/fed_cifar100.tar.bz2',
        file_hash='e8575e22c038ecef1ce6c7d492d7abee7da13b1e1ba9b70a7fc18531ba7590de',
        hash_algorithm='sha256',
        extract=True,
        archive_format='tar',
        cache_dir=cache_dir)

    # The .h5 files are looked up next to the downloaded archive.
    dir_path = os.path.dirname(path)
    train_client_data = hdf5_client_data.HDF5ClientData(
        os.path.join(dir_path, 'fed_cifar100_train.h5'))
    test_client_data = hdf5_client_data.HDF5ClientData(
        os.path.join(dir_path, 'fed_cifar100_test.h5'))

    return train_client_data, test_client_data
def load_data(cache_dir=None):
    """Loads the federated Stack Overflow dataset.

    The archive is downloaded and cached locally. If it was downloaded
    previously, the loader tries to read the dataset from the cache instead.

    This dataset is derived from the Stack Overflow Data hosted by kaggle.com
    and available to query through Kernels using the BigQuery API:
    https://www.kaggle.com/stackoverflow/stackoverflow. The Stack Overflow
    Data is licensed under the Creative Commons Attribution-ShareAlike 3.0
    Unported License. To view a copy of this license, visit
    http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to
    Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

    The data consists of the body text of all questions and answers. The
    bodies were parsed into sentences, and any user with fewer than 100
    sentences was expunged from the data. Minimal preprocessing was performed
    as follows:

    1. Lowercase the text,
    2. Unescape HTML symbols,
    3. Remove non-ascii symbols,
    4. Separate punctuation as individual tokens (except apostrophes and
       hyphens),
    5. Removing extraneous whitespace,
    6. Replacing URLS with a special token.

    In addition the following metadata is available:

    1. Creation date
    2. Question title
    3. Question tags
    4. Question score
    5. Type ('question' or 'answer')

    The data is divided into three sets:

    -   Train: Data before 2018-01-01 UTC except the held-out users. 342,477
        unique users with 135,818,730 examples.
    -   Held-out: All examples from users with user_id % 10 == 0 (all dates).
        38,758 unique users with 16,491,230 examples.
    -   Test: All examples after 2018-01-01 UTC except from held-out users.
        204,088 unique users with 16,586,035 examples.

    The `tf.data.Datasets` returned by
    `tff.simulation.ClientData.create_tf_dataset_for_client` will yield
    `collections.OrderedDict` objects at each iteration, with the following
    keys and values:

    -   `'creation_date'`: a `tf.Tensor` with `dtype=tf.string` and shape []
        containing the date/time of the question or answer in UTC format.
    -   `'title'`: a `tf.Tensor` with `dtype=tf.string` and shape []
        containing the title of the question.
    -   `'score'`: a `tf.Tensor` with `dtype=tf.int64` and shape [] containing
        the score of the question.
    -   `'tags'`: a `tf.Tensor` with `dtype=tf.string` and shape [] containing
        the tags of the question, separated by '|' characters.
    -   `'tokens'`: a `tf.Tensor` with `dtype=tf.string` and shape []
        containing the tokens of the question/answer, separated by space
        (' ') characters.
    -   `'type'`: a `tf.Tensor` with `dtype=tf.string` and shape [] containing
        either the string 'question' or 'answer'.

    Args:
      cache_dir: (Optional) directory to cache the downloaded file. If `None`,
        caches in Keras' default cache directory.

    Returns:
      Tuple of (train, held_out, test) where the tuple elements are
      `tff.simulation.ClientData` objects.
    """
    archive_path = tf.keras.utils.get_file(
        'stackoverflow.tar.bz2',
        origin='https://storage.googleapis.com/tff-datasets-public/stackoverflow.tar.bz2',
        file_hash='99eca2f8b8327a09e5fc123979df2d237acbc5e52322f6d86bf523ee47b961a2',
        hash_algorithm='sha256',
        extract=True,
        archive_format='tar',
        cache_dir=cache_dir)

    # The .h5 files are looked up next to the downloaded archive.
    extracted_dir = os.path.dirname(archive_path)
    split_files = (
        'stackoverflow_train.h5',
        'stackoverflow_held_out.h5',
        'stackoverflow_test.h5',
    )
    train_client_data, held_out_client_data, test_client_data = (
        hdf5_client_data.HDF5ClientData(os.path.join(extracted_dir, h5_name))
        for h5_name in split_files)
    return train_client_data, held_out_client_data, test_client_data
def test_client_ids_property(self):
    """Checks that `client_ids` equals the sorted keys of TEST_DATA."""
    client_data = hdf5_client_data.HDF5ClientData(
        HDF5ClientDataTest.test_data_filepath)
    expected_ids = sorted(TEST_DATA)
    self.assertEqual(client_data.client_ids, expected_ids)
def load_dataset(filepath):
    """Opens the HDF5 file at `filepath` as an `HDF5ClientData` object.

    Args:
      filepath: Path that is passed unchanged to
        `hdf5_client_data.HDF5ClientData`.

    Returns:
      The `hdf5_client_data.HDF5ClientData` instance built from `filepath`.
    """
    return hdf5_client_data.HDF5ClientData(filepath)