def test_preprocessed_img_inversion(self):
    """Checks that `invert_imagery` flips images between dark and light.

    The same raw client dataset is preprocessed twice, once with inversion
    off and once with it on, and the per-image mean pixel value is compared
    against -0.7 and 0.7 respectively.
    """
    raw_images_ds = _get_example_client_dataset()

    # Inversion disabled: every image should be predominantly dark.
    dark_images_ds = emnist_data_utils.preprocess_img_dataset(
        raw_images_ds, invert_imagery=False, batch_size=BATCH_SIZE)
    for image_batch in dark_images_ds:
        for img in image_batch:
            self.assertLessEqual(np.average(img), -0.7)

    # Inversion enabled: every image should be predominantly light.
    light_images_ds = emnist_data_utils.preprocess_img_dataset(
        raw_images_ds, invert_imagery=True, batch_size=BATCH_SIZE)
    for image_batch in light_images_ds:
        for img in image_batch:
            self.assertGreaterEqual(np.average(img), 0.7)
def _load_and_preprocess_datasets():
    """Builds the preprocessed EMNIST train and test datasets.

    Both datasets are created from all clients, keep their labels and are
    batched with `BATCH_SIZE`. Only the training dataset is shuffled.

    Returns:
      A `(train_dataset, test_dataset)` tuple of preprocessed datasets.
    """
    train_client_data, test_client_data = (
        emnist_data_utils.create_real_images_tff_client_data())

    # Flatten the per-client data into a single raw dataset per split.
    raw_train = train_client_data.create_tf_dataset_from_all_clients()
    raw_test = test_client_data.create_tf_dataset_from_all_clients()

    # Options shared by both splits; only `shuffle` differs.
    shared_kwargs = dict(include_label=True, batch_size=BATCH_SIZE)
    train_ds = emnist_data_utils.preprocess_img_dataset(
        raw_train, shuffle=True, **shared_kwargs)
    test_ds = emnist_data_utils.preprocess_img_dataset(
        raw_test, shuffle=False, **shared_kwargs)
    return train_ds, test_ds
def test_preprocessed_img_labels_are_case_agnostic(self):
    """Checks that preprocessing folds lowercase labels onto uppercase ones.

    The first three raw labels are pinned, then the first processed batch is
    checked to leave the two uppercase labels untouched and to shift the
    lowercase label down by 26. Finally every processed label is checked to
    lie in the case-agnostic range.
    """
    raw_images_ds = _get_example_client_dataset_containing_lowercase()
    raw_ds_iterator = iter(raw_images_ds)
    # The first element in the raw dataset is an uppercase 'I' (label is 18).
    self.assertEqual(next(raw_ds_iterator)['label'].numpy(), 18)
    # The second element in the raw dataset is an uppercase 'C' (label is 12).
    self.assertEqual(next(raw_ds_iterator)['label'].numpy(), 12)
    # The third element in the raw dataset is a lowercase letter with label 47
    # ('l' under the digits / uppercase / lowercase ordering implied by the
    # two labels above).
    self.assertEqual(next(raw_ds_iterator)['label'].numpy(), 47)

    processed_ds = emnist_data_utils.preprocess_img_dataset(
        raw_images_ds, include_label=True, batch_size=BATCH_SIZE,
        shuffle=False)
    _, label_batch = next(iter(processed_ds))
    processed_label_iterator = iter(label_batch)
    # The first element (in first batch) in the processed dataset has a case
    # agnostic label of 18 (i.e., assert that value remains unchanged).
    self.assertEqual(next(processed_label_iterator).numpy(), 18)
    # The second element (in first batch) in the processed dataset has a case
    # agnostic label of 12 (i.e., assert that value remains unchanged).
    self.assertEqual(next(processed_label_iterator).numpy(), 12)
    # The third element (in first batch) in the processed dataset should now
    # have a case agnostic label of 47 - 26 = 21.
    self.assertEqual(next(processed_label_iterator).numpy(), 47 - 26)

    # Folding lowercase onto uppercase leaves labels 0..35 (the largest raw
    # label, 61, maps to 61 - 26 = 35), so 36 would be out of range.
    for _, label_batch in processed_ds:
        for label in label_batch:
            self.assertGreaterEqual(label, 0)
            self.assertLessEqual(label, 35)
def _get_client_ids_meeting_condition(train_tff_data, bad_accuracy_cutoff,
                                      good_accuracy_cutoff,
                                      invert_imagery_likelihood,
                                      classifier_model):
    """Finds clients whose classification accuracy is below/above cutoffs.

    For every client a Bernoulli draw with probability
    `invert_imagery_likelihood` decides whether its imagery is inverted. The
    client's data is then preprocessed and scored with `classifier_model`.

    Args:
      train_tff_data: Client data exposing `client_ids` and
        `create_tf_dataset_for_client`.
      bad_accuracy_cutoff: Clients with accuracy strictly below this value are
        recorded as "bad".
      good_accuracy_cutoff: Clients with accuracy strictly above this value
        are recorded as "good".
      invert_imagery_likelihood: Probability of inverting a client's imagery.
      classifier_model: Model handed to `_analyze_classifier`.

    Returns:
      A `(bad_map, good_map)` tuple of dicts mapping client id to the boolean
      saying whether that client's imagery was inverted.
    """
    low_accuracy_clients = {}
    high_accuracy_clients = {}
    for cid in train_tff_data.client_ids:
        # Single Bernoulli trial deciding whether this client is inverted.
        inverted = np.random.binomial(n=1, p=invert_imagery_likelihood) == 1

        # Unbatched, unshuffled, labelled view of this client's examples.
        client_raw_ds = train_tff_data.create_tf_dataset_for_client(cid)
        client_ds = emnist_data_utils.preprocess_img_dataset(
            client_raw_ds,
            invert_imagery=inverted,
            include_label=True,
            batch_size=None,
            shuffle=False,
            repeat=False)

        # Fraction of this client's examples the classifier gets right.
        num_total, num_correct = _analyze_classifier(
            client_ds, classifier_model)
        accuracy = float(num_correct) / float(num_total)

        # The two checks are independent; a client may satisfy either one.
        if accuracy < bad_accuracy_cutoff:
            low_accuracy_clients[cid] = inverted
        if accuracy > good_accuracy_cutoff:
            high_accuracy_clients[cid] = inverted
    return low_accuracy_clients, high_accuracy_clients
def _create_real_images_dataset_for_eval():
    """Returns a `tf.data.Dataset` of real images for evaluation.

    The dataset is built from all clients of the 'test' split, carries no
    labels, is batched with `EVAL_BATCH_SIZE`, and is shuffled and repeated.
    """
    test_client_data = emnist_data_utils.create_real_images_tff_client_data(
        split='test')
    all_clients_raw_ds = test_client_data.create_tf_dataset_from_all_clients()
    eval_ds = emnist_data_utils.preprocess_img_dataset(
        all_clients_raw_ds,
        include_label=False,
        batch_size=EVAL_BATCH_SIZE,
        shuffle=True,
        repeat=True)
    return eval_ds
def setUp(self):
    """Prepares one batch of real images and one batch of random fake images.

    `self.real_images` is the first preprocessed (unshuffled) batch of the
    first client in the 'synthetic' split. `self.fake_images` is a float32
    tensor of shape (32, 28, 28, 1) drawn from a seeded NumPy generator.
    """
    super().setUp()

    synthetic_data = emnist_data_utils.create_real_images_tff_client_data(
        split='synthetic')
    first_client_id = synthetic_data.client_ids[0]
    first_client_ds = emnist_data_utils.preprocess_img_dataset(
        synthetic_data.create_tf_dataset_for_client(first_client_id),
        shuffle=False)
    self.real_images = next(iter(first_client_ds))

    # Fixed seed so the fake images are reproducible across runs.
    np.random.seed(seed=123456)
    random_pixels = np.random.random((32, 28, 28, 1))
    self.fake_images = tf.constant(random_pixels, dtype=tf.float32)
def _get_dataset(client_id):
    """Builds the preprocessed `tf.data.Dataset` for one client.

    Relies on names from the enclosing scope: `raw_client_data`,
    `selected_client_ids_inversion_map`, `client_ids_example_indices_map` and
    `batch_size`.

    Args:
      client_id: Id of the client whose data should be returned.

    Returns:
      The client's dataset, optionally filtered by example, preprocessed
      without labels, shuffled and not repeated.
    """
    client_ds = raw_client_data.create_tf_dataset_for_client(client_id)

    # Inversion stays off when no per-client inversion map was supplied.
    should_invert = (
        selected_client_ids_inversion_map[client_id]
        if selected_client_ids_inversion_map else False)

    # Optional per-example filtering happens before preprocessing.
    if client_ids_example_indices_map:
        client_ds = _filter_by_example(
            client_ds, client_ids_example_indices_map, client_id)

    return emnist_data_utils.preprocess_img_dataset(
        client_ds,
        invert_imagery=should_invert,
        include_label=False,
        batch_size=batch_size,
        shuffle=True,
        repeat=False)
def test_preprocessed_img_labels_are_case_agnostic(self):
    """Checks every processed label equals the case-folded raw label.

    The example dataset is expected to hold 62 examples, both before and
    after preprocessing. Raw labels above 35 are shifted down by 26 before
    being compared with the corresponding processed label.
    """
    total_num_labels = 62
    raw_dataset = _get_example_client_dataset_containing_lowercase()
    self.assertEqual(_compute_dataset_length(raw_dataset), total_num_labels)

    processed_dataset = emnist_data_utils.preprocess_img_dataset(
        raw_dataset, include_label=True, batch_size=None, shuffle=False)
    self.assertEqual(
        _compute_dataset_length(processed_dataset), total_num_labels)

    # Both datasets hold `total_num_labels` elements (asserted above), so
    # walking them in lockstep visits every example exactly once.
    for raw_example, processed_example in zip(raw_dataset, processed_dataset):
        raw_label = raw_example['label']
        # Convert from lowercase to capital.
        expected_label = raw_label - 26 if raw_label > 35 else raw_label
        self.assertEqual(expected_label, processed_example[1])
def _get_client_ids_and_examples_based_on_classification(
        train_tff_data, min_num_examples, invert_imagery_likelihood,
        classifier_model):
    """Groups clients by how many examples are classified (in)correctly.

    For every client a Bernoulli draw with probability
    `invert_imagery_likelihood` decides whether its imagery is inverted. The
    client's data is preprocessed and passed to `_analyze_classifier`, which
    is unpacked as `(correct_indices, incorrect_indices)`.

    Args:
      train_tff_data: Client data exposing `client_ids` and
        `create_tf_dataset_for_client`.
      min_num_examples: Minimum number of correct (or incorrect) examples a
        client needs to be recorded in the corresponding maps.
      invert_imagery_likelihood: Probability of inverting a client's imagery.
      classifier_model: Model handed to `_analyze_classifier`.

    Returns:
      A 4-tuple of dicts keyed by client id:
      (inversion flag for clients with enough correct examples,
       inversion flag for clients with enough incorrect examples,
       correct example indices, incorrect example indices).
    """
    correct_inversion_map = {}
    incorrect_inversion_map = {}
    correct_indices_map = {}
    incorrect_indices_map = {}

    for cid in train_tff_data.client_ids:
        # Single Bernoulli trial deciding whether this client is inverted.
        inverted = np.random.binomial(n=1, p=invert_imagery_likelihood) == 1

        # Unbatched, unshuffled, labelled view of this client's examples.
        client_raw_ds = train_tff_data.create_tf_dataset_for_client(cid)
        client_ds = emnist_data_utils.preprocess_img_dataset(
            client_raw_ds,
            invert_imagery=inverted,
            include_label=True,
            batch_size=None,
            shuffle=False,
            repeat=False)

        correct_indices, incorrect_indices = _analyze_classifier(
            client_ds, classifier_model)

        # A client may qualify for the correct maps, the incorrect maps, both
        # or neither; each pair of maps is filled independently.
        outcomes = (
            (correct_indices, correct_inversion_map, correct_indices_map),
            (incorrect_indices, incorrect_inversion_map,
             incorrect_indices_map),
        )
        for indices, inversion_map, indices_map in outcomes:
            if len(indices) >= min_num_examples:
                inversion_map[cid] = inverted
                indices_map[cid] = indices

    return (correct_inversion_map, incorrect_inversion_map,
            correct_indices_map, incorrect_indices_map)
def main(argv):
    """Reports per-client and overall classifier accuracy on EMNIST.

    Each client of the 'train' split has its imagery inverted with
    probability `FLAGS.invert_imagery_likelihood`, is preprocessed and is
    scored with the trained EMNIST classifier. A histogram of per-client
    accuracies, the 25th/75th percentiles and the overall accuracy are
    printed.

    Args:
      argv: Command-line arguments; anything beyond the program name is
        rejected.

    Raises:
      app.UsageError: If extra command-line arguments are given.
      ValueError: If the inversion likelihood is greater than 1.0.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    likelihood = FLAGS.invert_imagery_likelihood
    print('invert_imagery_likelihood is %s' % likelihood)
    if likelihood > 1.0:
        raise ValueError(
            'invert_imagery_likelihood cannot be greater than 1.0')

    # Federated (per-client) view of the training data.
    train_client_data = (
        emnist_data_utils.create_real_images_tff_client_data(split='train'))
    print('There are %d unique clients.' % len(train_client_data.client_ids))

    # Trained EMNIST classifier used to score every client.
    classifier_model = ecm.get_trained_emnist_classifier_model()

    per_client_accuracies = []
    overall_total_count = 0
    overall_correct_count = 0
    for cid in train_client_data.client_ids:
        # Single Bernoulli trial deciding whether this client is inverted.
        inverted = np.random.binomial(n=1, p=likelihood) == 1

        # Unbatched, unshuffled, labelled view of this client's examples.
        client_raw_ds = train_client_data.create_tf_dataset_for_client(cid)
        client_ds = emnist_data_utils.preprocess_img_dataset(
            client_raw_ds,
            invert_imagery=inverted,
            include_label=True,
            batch_size=None,
            shuffle=False,
            repeat=False)

        # Score the client and fold its counts into the running totals.
        total_count, correct_count = _analyze_classifier(
            client_ds, classifier_model)
        per_client_accuracies.append(
            float(correct_count) / float(total_count))
        overall_total_count += total_count
        overall_correct_count += correct_count

    # Histogram of the per-client accuracies.
    bin_width = 1
    histogram = _compute_histogram(per_client_accuracies, bin_width)
    print('\nHistogram:')
    print(histogram.numpy())
    # Reasonableness check: the bin counts should sum to 3400.
    print('(Histogram sum):')
    print(sum(histogram.numpy()))

    # Quartiles of the per-client accuracy distribution.
    percentile_25, percentile_75 = np.percentile(
        per_client_accuracies, q=(25, 75))
    print('\nPercentiles...')
    print('25th Percentile : %f' % percentile_25)
    print('75th Percentile : %f' % percentile_75)

    overall_accuracy = (
        float(overall_correct_count) / float(overall_total_count))
    print('\nOverall classification success percentage: %d / %d (%f)' %
          (overall_correct_count, overall_total_count, overall_accuracy))