Example #1
def load_data(
    cache_dir: Optional[str] = None
) -> Tuple[client_data.ClientData, client_data.ClientData]:
    """Loads the federated Shakespeare dataset.

  Downloads and caches the dataset locally. If previously downloaded, tries to
  load the dataset from cache.

  This dataset is derived from the Leaf repository's
  (https://github.com/TalwalkarLab/leaf) pre-processing of the works of
  Shakespeare, described in "LEAF: A Benchmark for Federated Settings"
  (https://arxiv.org/abs/1812.01097).

  The data set consists of 715 users (characters of Shakespeare plays), where
  each example corresponds to a contiguous set of lines spoken by the character
  in a given play.

  Data set sizes:

  -   train: 16,068 examples
  -   test: 2,356 examples

  Rather than holding out specific users, each user's examples are split across
  _train_ and _test_ so that all users have at least one example in _train_ and
  one example in _test_. Characters that had fewer than 2 examples are excluded
  from the data set.

  The `tf.data.Datasets` returned by
  `tff.simulation.datasets.ClientData.create_tf_dataset_for_client` will yield
  `collections.OrderedDict` objects at each iteration, with the following keys
  and values:

    -   `'snippets'`: a `tf.Tensor` with `dtype=tf.string`, the snippet of
      contiguous text.

  Args:
    cache_dir: (Optional) directory to cache the downloaded file. If `None`,
      caches in Keras' default cache directory.

  Returns:
    Tuple of (train, test) where the tuple elements are
    `tff.simulation.datasets.ClientData` objects.
  """
    database_path = download.get_compressed_file(
        origin='https://storage.googleapis.com/tff-datasets-public/shakespeare.sqlite.lzma',
        cache_dir=cache_dir)
    train_client_data = sql_client_data.SqlClientData(
        database_path, split_name='train').preprocess(_add_parsing)
    test_client_data = sql_client_data.SqlClientData(
        database_path, split_name='test').preprocess(_add_parsing)
    return train_client_data, test_client_data
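A minimal usage sketch for the function above, assuming it is exposed through the public API as `tff.simulation.datasets.shakespeare.load_data` (the module path is an assumption, not shown in this example):

import tensorflow_federated as tff

# Load both splits; the data is downloaded and cached on first use.
train_data, test_data = tff.simulation.datasets.shakespeare.load_data()

# Inspect the text snippets of an arbitrary client.
client_id = train_data.client_ids[0]
client_dataset = train_data.create_tf_dataset_for_client(client_id)
for example in client_dataset.take(3):
  print(example['snippets'].numpy())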
Example #2
def test_split(self, split_name):
  client_data = sql_client_data.SqlClientData(
      test_dataset_filepath(), split_name=split_name)
  client_data = client_data.preprocess(lambda x: x.take(1))
  # Each client was truncated to a single example, so the flattened dataset
  # should contain exactly one example per client id.
  expected_examples = len(client_data.client_ids)
  dataset = client_data.create_tf_dataset_from_all_clients()
  actual_examples = dataset.reduce(0, lambda s, x: s + 1)
  self.assertEqual(actual_examples, expected_examples)
Example #3
def test_split(self, split_name):
  client_data = sql_client_data.SqlClientData(
      test_dataset_filepath(), split_name=split_name)
  # Preprocessing with the identity function should wrap the data in a
  # PreprocessSqlClientData without changing the element type.
  preprocessed_client_data = client_data.preprocess(lambda x: x)
  self.assertIsInstance(preprocessed_client_data,
                        sql_client_data.PreprocessSqlClientData)
  self.assertEqual(preprocessed_client_data.element_type_structure,
                   client_data.element_type_structure)
Example #4
def test_split(self, split_name, expected_examples):
  client_data = sql_client_data.SqlClientData(
      test_dataset_filepath(), split_name=split_name)
  # dataset_computation maps a client id (a string) to that client's
  # sequence of serialized string examples.
  self.assertEqual(
      str(client_data.dataset_computation.type_signature),
      '(string -> string*)')
  dataset = client_data.dataset_computation('test_c')
  actual_examples = dataset.reduce(0, lambda s, x: s + 1)
  self.assertEqual(actual_examples, expected_examples)
Example #5
def test_split(self, split_name, example_counts):
  client_data = sql_client_data.SqlClientData(
      test_dataset_filepath(), split_name=split_name)
  self.assertEqual(client_data.client_ids, list(example_counts.keys()))
  self.assertEqual(client_data.element_type_structure,
                   tf.TensorSpec(shape=(), dtype=tf.string))
  for client_id, expected_examples in example_counts.items():
    dataset = client_data.create_tf_dataset_for_client(client_id)
    actual_examples = dataset.reduce(0, lambda s, x: s + 1)
    self.assertEqual(actual_examples, expected_examples, msg=client_id)
Example #6
def test_split(self, split_name, example_counts):
  client_data = sql_client_data.SqlClientData(
      test_dataset_filepath(), split_name=split_name)
  client_data = client_data.preprocess(lambda x: x)
  self.assertEqual(client_data.client_ids, list(example_counts.keys()))
  self.assertEqual(client_data.element_type_structure,
                   tf.TensorSpec(shape=(), dtype=tf.string))
  expected_examples = sum(example_counts.values())
  dataset = client_data.create_tf_dataset_from_all_clients()
  actual_examples = dataset.reduce(0, lambda s, x: s + 1)
  self.assertEqual(actual_examples, expected_examples)
Example #7
def load_and_parse_sql_client_data(
        database_filepath: str,
        element_spec: Mapping[str, tf.TensorSpec]) -> client_data.ClientData:
    """Load a `ClientData` arises by parsing a serialized `SqlClientData`.

  Args:
    database_filepath: A `str` filepath of the SQL database. This function will
      first fetch the SQL database to a local temporary directory if
      `database_filepath` is a remote directory.
    element_spec: The `element_spec` of the local dataset. This is used to parse
      the serialized `tff.simulation.datasets.SqlClientData`. The `element_spec`
      must be of type `Mapping[str, TensorSpec]`.

  Returns:
    A `tff.simulation.datasets.ClientData` instance arising from parsing a
    `tff.simulation.datasets.SqlClientData`.

  Raises:
    FileNotFoundError: if database_filepath does not exist.
    ElementSpecCompatibilityError: if the `element_spec` of datasets are not of
      type `Mapping[str, TensorSpec]`.
  """
    parser = _build_parser(element_spec)

    def dataset_parser(ds: tf.data.Dataset) -> tf.data.Dataset:
        return ds.map(parser, num_parallel_calls=tf.data.AUTOTUNE)

    if not tf.io.gfile.exists(database_filepath):
        raise FileNotFoundError(
            f'No such file or directory: {database_filepath}')
    elif not os.path.exists(database_filepath):
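        # The database exists (per `tf.io.gfile`) but not on the local
        # filesystem, e.g. it lives on a remote filesystem, so fetch a copy
        # into a local temporary directory before opening it.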
        logging.info('Starting fetching SQL database to local.')
        tmp_dir = tempfile.mkdtemp()
        tmp_database_filepath = tf.io.gfile.join(
            tmp_dir, os.path.basename(database_filepath))
        tf.io.gfile.copy(database_filepath,
                         tmp_database_filepath,
                         overwrite=True)
        database_filepath = tmp_database_filepath
        logging.info('Finished fetching SQL database to local.')

    return sql_client_data.SqlClientData(database_filepath).preprocess(
        dataset_parser)
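A hypothetical invocation of the function above; the database path and the single string feature `'snippets'` are illustrative assumptions, and the database is expected to have been written as a serialized `tff.simulation.datasets.SqlClientData`:

import tensorflow as tf

element_spec = {
    'snippets': tf.TensorSpec(shape=(), dtype=tf.string),
}
client_data = load_and_parse_sql_client_data(
    '/tmp/shakespeare_train.sqlite', element_spec)

# Iterate the parsed dataset of one client.
dataset = client_data.create_tf_dataset_for_client(client_data.client_ids[0])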
Example #8
def load_data(cache_dir=None):
  """Loads the federated Stack Overflow dataset.

  Downloads and caches the dataset locally. If previously downloaded, tries to
  load the dataset from cache.

  This dataset is derived from the Stack Overflow Data hosted by kaggle.com and
  available to query through Kernels using the BigQuery API:
  https://www.kaggle.com/stackoverflow/stackoverflow. The Stack Overflow Data
  is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported
  License. To view a copy of this license, visit
  http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to
  Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

  The data consists of the body text of all questions and answers. The bodies
  were parsed into sentences, and any user with fewer than 100 sentences was
  expunged from the data. Minimal preprocessing was performed as follows:

  1. Lowercase the text,
  2. Unescape HTML symbols,
  3. Remove non-ascii symbols,
  4. Separate punctuation as individual tokens (except apostrophes and hyphens),
  5. Remove extraneous whitespace,
  6. Replace URLs with a special token.

  In addition, the following metadata is available:

  1. Creation date
  2. Question title
  3. Question tags
  4. Question score
  5. Type ('question' or 'answer')

  The data is divided into three sets:

    -   Train: Data before 2018-01-01 UTC except the held-out users. 342,477
        unique users with 135,818,730 examples.
    -   Held-out: All examples from users with user_id % 10 == 0 (all dates).
        38,758 unique users with 16,491,230 examples.
    -   Test: All examples after 2018-01-01 UTC except from held-out users.
        204,088 unique users with 16,586,035 examples.

  The `tf.data.Datasets` returned by
  `tff.simulation.datasets.ClientData.create_tf_dataset_for_client` will yield
  `collections.OrderedDict` objects at each iteration, with the following keys
  and values, in lexicographic order by key:

    -   `'creation_date'`: a `tf.Tensor` with `dtype=tf.string` and shape []
        containing the date/time of the question or answer in UTC format.
    -   `'score'`: a `tf.Tensor` with `dtype=tf.int64` and shape [] containing
        the score of the question.
    -   `'tags'`: a `tf.Tensor` with `dtype=tf.string` and shape [] containing
        the tags of the question, separated by '|' characters.
    -   `'title'`: a `tf.Tensor` with `dtype=tf.string` and shape [] containing
        the title of the question.
    -   `'tokens'`: a `tf.Tensor` with `dtype=tf.string` and shape []
        containing the tokens of the question/answer, separated by space (' ')
        characters.
    -   `'type'`: a `tf.Tensor` with `dtype=tf.string` and shape []
        containing either the string 'question' or 'answer'.

  Args:
    cache_dir: (Optional) directory to cache the downloaded file. If `None`,
      caches in Keras' default cache directory.

  Returns:
    Tuple of (train, held_out, test) where the tuple elements are
    `tff.simulation.datasets.ClientData` objects.
  """
  database_path = download.get_compressed_file(
      origin='https://storage.googleapis.com/tff-datasets-public/stackoverflow.sqlite.lzma',
      cache_dir=cache_dir)
  train_client_data = sql_client_data.SqlClientData(
      database_path, 'train').preprocess(_add_proto_parsing)
  heldout_client_data = sql_client_data.SqlClientData(
      database_path, 'heldout').preprocess(_add_proto_parsing)
  test_client_data = sql_client_data.SqlClientData(
      database_path, 'test').preprocess(_add_proto_parsing)
  return train_client_data, heldout_client_data, test_client_data
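A usage sketch, assuming the function above is exposed as `tff.simulation.datasets.stackoverflow.load_data` (an assumed public alias):

import tensorflow_federated as tff

train, held_out, test = tff.simulation.datasets.stackoverflow.load_data()

# Look at the first example of one training client.
client_dataset = train.create_tf_dataset_for_client(train.client_ids[0])
for example in client_dataset.take(1):
  print(example['type'].numpy())    # b'question' or b'answer'
  print(example['tokens'].numpy())  # space-separated tokens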
Example #9
def test_client_missing(self):
  client_data = sql_client_data.SqlClientData(test_dataset_filepath())
  with self.assertRaisesRegex(ValueError,
                              'not a client in this ClientData'):
    client_data.create_tf_dataset_for_client('missing_client_id')
Example #10
def load_data(cache_dir=None):
    """Loads a federated version of the CIFAR-100 dataset.

  The dataset is downloaded and cached locally. If previously downloaded, it
  tries to load the dataset from cache.

  The dataset is derived from the [CIFAR-100
  dataset](https://www.cs.toronto.edu/~kriz/cifar.html). The training and
  testing examples are partitioned across 500 and 100 clients (respectively).
  No clients share any data samples, so it is a true partition of CIFAR-100. The
  train clients have string client IDs in the range [0-499], while the test
  clients have string client IDs in the range [0-99]. The train clients form a
  true partition of the CIFAR-100 training split, while the test clients form a
  true partition of the CIFAR-100 testing split.

  The data partitioning is done using a hierarchical Latent Dirichlet Allocation
  (LDA) process, referred to as the
  [Pachinko Allocation Method](https://people.cs.umass.edu/~mccallum/papers/pam-icml06.pdf)
  (PAM).
  This method uses a two-stage LDA process, where each client has an associated
  multinomial distribution over the coarse labels of CIFAR-100, and a
  coarse-to-fine label multinomial distribution for that coarse label over the
  labels under that coarse label. The coarse label multinomial is drawn from a
  symmetric Dirichlet with parameter 0.1, and each coarse-to-fine multinomial
  distribution is drawn from a symmetric Dirichlet with parameter 10. Each
  client has 100 samples. To generate a sample for the client, we first select
  a coarse label by drawing from the coarse label multinomial distribution, and
  then draw a fine label using the coarse-to-fine multinomial distribution. We
  then randomly draw a sample from CIFAR-100 with that label (without
  replacement). If this exhausts the set of samples with this label, we
  remove the label from the coarse-to-fine multinomial and renormalize the
  multinomial distribution.

  Data set sizes:

  -   train: 500,000 examples
  -   test: 100,000 examples

  The `tf.data.Datasets` returned by
  `tff.simulation.datasets.ClientData.create_tf_dataset_for_client` will yield
  `collections.OrderedDict` objects at each iteration, with the following keys
  and values, in lexicographic order by key:

    -   `'coarse_label'`: a `tf.Tensor` with `dtype=tf.int64` and shape [1] that
        corresponds to the coarse label of the associated image. Labels are
        in the range [0-19].
    -   `'image'`: a `tf.Tensor` with `dtype=tf.uint8` and shape [32, 32, 3],
        corresponding to the pixels of the image, with values in the
        range [0, 255].
    -   `'label'`: a `tf.Tensor` with `dtype=tf.int64` and shape [1], the class
        label of the corresponding image. Labels are in the range [0-99].

  Args:
    cache_dir: (Optional) directory to cache the downloaded file. If `None`,
      caches in Keras' default cache directory.

  Returns:
    Tuple of (train, test) where the tuple elements are
    `tff.simulation.datasets.ClientData` objects.
  """
    database_path = download.get_compressed_file(
        origin='https://storage.googleapis.com/tff-datasets-public/cifar100.sqlite.lzma',
        cache_dir=cache_dir)
    train_client_data = sql_client_data.SqlClientData(
        database_path, 'train').preprocess(_add_proto_parsing)
    test_client_data = sql_client_data.SqlClientData(
        database_path, 'test').preprocess(_add_proto_parsing)
    return train_client_data, test_client_data
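A usage sketch, assuming the function above is exposed as `tff.simulation.datasets.cifar100.load_data` (an assumed public alias):

import tensorflow_federated as tff

cifar_train, cifar_test = tff.simulation.datasets.cifar100.load_data()

client_dataset = cifar_train.create_tf_dataset_for_client(
    cifar_train.client_ids[0])
for example in client_dataset.take(1):
  print(example['image'].shape)           # (32, 32, 3)
  print(example['label'].numpy())         # fine label in [0, 99]
  print(example['coarse_label'].numpy())  # coarse label in [0, 19]

The two-stage label sampling described in the docstring can also be sketched directly. This is an illustrative approximation only: the constants, the flat-label arithmetic, and the omission of the without-replacement/renormalization step are simplifications, not the actual partitioning code.

import numpy as np

NUM_COARSE = 20          # CIFAR-100 coarse labels
FINES_PER_COARSE = 5     # fine labels under each coarse label
EXAMPLES_PER_CLIENT = 100
rng = np.random.default_rng(0)

# Per-client multinomial over coarse labels, drawn from a symmetric Dirichlet(0.1).
coarse_probs = rng.dirichlet([0.1] * NUM_COARSE)
# One coarse-to-fine multinomial per coarse label, drawn from a symmetric Dirichlet(10).
fine_probs = [rng.dirichlet([10.0] * FINES_PER_COARSE)
              for _ in range(NUM_COARSE)]

labels = []
for _ in range(EXAMPLES_PER_CLIENT):
  coarse = rng.choice(NUM_COARSE, p=coarse_probs)
  fine = rng.choice(FINES_PER_COARSE, p=fine_probs[coarse])
  # Flatten to a label in [0, 99] for illustration; the real CIFAR-100
  # fine-to-coarse mapping is a lookup table, not this arithmetic.
  labels.append((coarse, coarse * FINES_PER_COARSE + fine))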
Example #11
def load_data(only_digits=True, cache_dir=None):
  """Loads the Federated EMNIST dataset.

  Downloads and caches the dataset locally. If previously downloaded, tries to
  load the dataset from cache.

  This dataset is derived from the Leaf repository
  (https://github.com/TalwalkarLab/leaf) pre-processing of the Extended MNIST
  dataset, grouping examples by writer. Details about Leaf were published in
  "LEAF: A Benchmark for Federated Settings" https://arxiv.org/abs/1812.01097.

  *Note*: This dataset does not include some additional preprocessing that
  MNIST includes, such as size-normalization and centering.
  In the Federated EMNIST data, the value of 1.0
  corresponds to the background, and 0.0 corresponds to the color of the digits
  themselves; this is the *inverse* of some MNIST representations,
  e.g. in
  [tensorflow_datasets](https://github.com/tensorflow/datasets/blob/master/docs/datasets.md#mnist),
  where 0 corresponds to the background color, and 255 represents the color of
  the digit.

  Data set sizes:

  *only_digits=True*: 3,383 users, 10 label classes

  -   train: 341,873 examples
  -   test: 40,832 examples

  *only_digits=False*: 3,400 users, 62 label classes

  -   train: 671,585 examples
  -   test: 77,483 examples

  Rather than holding out specific users, each user's examples are split across
  _train_ and _test_ so that all users have at least one example in _train_ and
  one example in _test_. Writers that had fewer than 2 examples are excluded from
  the data set.

  The `tf.data.Datasets` returned by
  `tff.simulation.datasets.ClientData.create_tf_dataset_for_client` will yield
  `collections.OrderedDict` objects at each iteration, with the following keys
  and values, in lexicographic order by key:

    -   `'label'`: a `tf.Tensor` with `dtype=tf.int32` and shape [1], the class
        label of the corresponding pixels. Labels [0-9] correspond to the digits
        classes, labels [10-35] correspond to the uppercase classes (e.g., label
        11 is 'B'), and labels [36-61] correspond to the lowercase classes
        (e.g., label 37 is 'b').
    -   `'pixels'`: a `tf.Tensor` with `dtype=tf.float32` and shape [28, 28],
        containing the pixels of the handwritten digit, with values in
        the range [0.0, 1.0].

  Args:
    only_digits: (Optional) whether to only include examples that are from the
      digits [0-9] classes. If `False`, includes lower and upper case
      characters, for a total of 62 class labels.
    cache_dir: (Optional) directory to cache the downloaded file. If `None`,
      caches in Keras' default cache directory.

  Returns:
    Tuple of (train, test) where the tuple elements are
    `tff.simulation.datasets.ClientData` objects.
  """
  database_path = download.get_compressed_file(
      origin='https://storage.googleapis.com/tff-datasets-public/emnist_all.sqlite.lzma',
      cache_dir=cache_dir)
  if only_digits:
    train_client_data = sql_client_data.SqlClientData(
        database_path, 'digits_only_train').preprocess(_add_proto_parsing)
    test_client_data = sql_client_data.SqlClientData(
        database_path, 'digits_only_test').preprocess(_add_proto_parsing)
  else:
    train_client_data = sql_client_data.SqlClientData(
        database_path, 'all_train').preprocess(_add_proto_parsing)
    test_client_data = sql_client_data.SqlClientData(
        database_path, 'all_test').preprocess(_add_proto_parsing)
  return train_client_data, test_client_data
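A usage sketch, assuming the function above is exposed as `tff.simulation.datasets.emnist.load_data` (an assumed public alias):

import tensorflow_federated as tff

emnist_train, emnist_test = tff.simulation.datasets.emnist.load_data(
    only_digits=False)

client_dataset = emnist_train.create_tf_dataset_for_client(
    emnist_train.client_ids[0])
for example in client_dataset.take(1):
  print(example['pixels'].shape)   # (28, 28), float32 values in [0.0, 1.0]
  print(example['label'].numpy())  # class label in [0, 61]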
Example #12
def load_data(split_by_clients=True, cache_dir=None):
  """Loads the Federated CelebA dataset.

  Downloads and caches the dataset locally. If previously downloaded, tries to
  load the dataset from cache.

  This dataset is derived from the
  [LEAF repository](https://github.com/TalwalkarLab/leaf) preprocessing of the
  [CelebA dataset](https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html),
  grouping examples by celebrity id. Details about LEAF were published in
  ["LEAF: A Benchmark for Federated
  Settings"](https://arxiv.org/abs/1812.01097), and details about CelebA were
  published in ["Deep Learning Face Attributes in the
  Wild"](https://arxiv.org/abs/1411.7766).

  The raw CelebA dataset contains 10,177 unique identities. During LEAF
  preprocessing, all clients with fewer than 5 examples are removed; this leaves
  9,343 clients.

  The data is available with train and test splits by clients or by examples.
  That is, when split by clients, ~90% of clients are selected for the train
  set, ~10% of clients are selected for test, and all the examples for a given
  user are part of the same data split.  When split by examples, each client is
  located in both the train data and the test data, with ~90% of the examples
  on each client selected for train and ~10% of the examples selected for test.

  Data set sizes:

  *split_by_clients=True*:

  -   train: 8,408 clients, 180,429 total examples
  -   test: 935 clients, 19,859 total examples

  *split_by_clients=False*:

  -   train: 9,343 clients, 177,457 total examples
  -   test: 9,343 clients, 22,831 total examples

  The `tf.data.Datasets` returned by
  `tff.simulation.datasets.ClientData.create_tf_dataset_for_client` will yield
  `collections.OrderedDict` objects at each iteration. These objects have a
  key/value pair storing the image of the celebrity:

    -   `'image'`: a `tf.Tensor` with `dtype=tf.int64` and shape [84, 84, 3],
        containing the red/blue/green pixels of the image. Each pixel is a value
        in the range [0, 255].

  The OrderedDict objects also contain an additional 40 key/value pairs for the
  celebrity image attributes, each of the format:

    -   `{attribute name}`: a `tf.Tensor` with `dtype=tf.bool` and shape [1],
        set to True if the celebrity has this attribute in the image, or False
        if they don't.

  The attribute names are:
    'five_o_clock_shadow', 'arched_eyebrows', 'attractive', 'bags_under_eyes',
    'bald', 'bangs', 'big_lips', 'big_nose', 'black_hair', 'blond_hair',
    'blurry', 'brown_hair', 'bushy_eyebrows', 'chubby', 'double_chin',
    'eyeglasses', 'goatee', 'gray_hair', 'heavy_makeup', 'high_cheekbones',
    'male', 'mouth_slightly_open', 'mustache', 'narrow_eyes', 'no_beard',
    'oval_face', 'pale_skin', 'pointy_nose', 'receding_hairline', 'rosy_cheeks',
    'sideburns', 'smiling', 'straight_hair', 'wavy_hair', 'wearing_earrings',
    'wearing_hat', 'wearing_lipstick', 'wearing_necklace', 'wearing_necktie',
    'young'

  Note: The CelebA dataset may contain potential bias. The
  [fairness indicators TF tutorial](
  https://www.tensorflow.org/responsible_ai/fairness_indicators/tutorials/Fairness_Indicators_TFCO_CelebA_Case_Study)
  goes into detail about several considerations to keep in mind while using the
  CelebA dataset.

  Args:
    split_by_clients: There are 9,343 clients in the federated CelebA dataset
      with 5 or more examples. If this argument is True, clients are divided
      into train and test groups, with 8,408 and 935 clients respectively. If
      this argument is False, the data is divided by examples instead, i.e., all
      clients participate in both the train and test groups, with ~90% of the
      examples belonging to the train group and the rest belonging to the test
      group.
    cache_dir: (Optional) directory to cache the downloaded file. If `None`,
      caches in Keras' default cache directory.

  Returns:
    Tuple of `(train, test)` where the tuple elements are
    `tff.simulation.datasets.ClientData` objects.
  """
  database_path = download.get_compressed_file(
      origin='https://storage.googleapis.com/tff-datasets-public/celeba.sqlite.lzma',
      cache_dir=cache_dir)
  if split_by_clients:
    train_client_data = sql_client_data.SqlClientData(
        database_path, 'split_by_clients_train').preprocess(_add_proto_parsing)
    test_client_data = sql_client_data.SqlClientData(
        database_path, 'split_by_clients_test').preprocess(_add_proto_parsing)
  else:
    train_client_data = sql_client_data.SqlClientData(
        database_path, 'split_by_examples_train').preprocess(_add_proto_parsing)
    test_client_data = sql_client_data.SqlClientData(
        database_path, 'split_by_examples_test').preprocess(_add_proto_parsing)
  return train_client_data, test_client_data
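A usage sketch, assuming the function above is exposed as `tff.simulation.datasets.celeba.load_data` (an assumed public alias):

import tensorflow_federated as tff

celeba_train, celeba_test = tff.simulation.datasets.celeba.load_data(
    split_by_clients=True)

client_dataset = celeba_train.create_tf_dataset_for_client(
    celeba_train.client_ids[0])
for example in client_dataset.take(1):
  print(example['image'].shape)      # (84, 84, 3)
  print(example['smiling'].numpy())  # one of the 40 boolean attributes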