def load_data(config):
    tsprint('Begin load_data: loading cifar10 embeddings and determining labels.', 1)
    with open(paths['cifar10_embeddings'], 'rb') as file:
        embeddings = pickle.load(file)
    with open(paths['cifar10_labels'], 'rb') as file:
        labels = pickle.load(file)
    X = []
    y = []
    p = []
    for partition in ['train', 'val', 'test']:
        X_current = embeddings[partition]
        y_current = to_categorical(labels[partition], num_classes=10)
        p_current = np.asarray(len(y_current) * [partition])
        X.append(X_current)
        y.append(y_current)
        p.append(p_current)
    X = np.vstack(X)
    y = np.vstack(y)
    p = np.hstack(p)
    sample_client_id = simulate_clients(X, y, config)
    data = {}
    for partition in np.unique(p):
        client_data_dict = {}
        # Cast to int so that the dict keys read '0', '1', ... and so that
        # tf.one_hot below receives integer indices.
        for current_client in np.unique(sample_client_id).astype(int):
            client_filter = (sample_client_id == current_client)
            partition_filter = (p == partition)
            idx = np.logical_and(client_filter, partition_filter)
            client_data_dict[str(current_client)] = {
                'features': X[idx, :],
                'label': y[idx]
            }
            if config.global_conditioning:
                batch_client_id = np.ones(shape=(X[idx, :].shape[0], ),
                                          dtype=int) * current_client
                batch_client_id = tf.one_hot(batch_client_id,
                                             depth=config.num_clients)
                client_data_dict[str(current_client)]['client_id'] = batch_client_id
        data[partition] = tff.simulation.FromTensorSlicesClientData(client_data_dict)
    example_dataset = data['train'].create_tf_dataset_for_client(
        data['train'].client_ids[0])
    preprocessed_example_dataset = preprocess(config, example_dataset)
    tsprint('Done load_data: returning data dict.', 1)
    return data, preprocessed_example_dataset
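# `preprocess` is defined elsewhere in this repo. Below is a minimal sketch of
# what it plausibly does, following the standard TFF recipe of
# repeat/shuffle/batch over the OrderedDicts produced by
# FromTensorSlicesClientData. The config attributes used here
# (num_local_epochs, shuffle_buffer, batch_size) are assumptions.
import collections


def preprocess_sketch(config, dataset):
    def batch_format_fn(element):
        # Rename fields to the 'x'/'y' convention consumed at evaluation time.
        return collections.OrderedDict(x=element['features'], y=element['label'])

    return dataset.repeat(config.num_local_epochs).shuffle(
        config.shuffle_buffer).batch(config.batch_size).map(batch_format_fn)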
def simulate_clients(X, y, p, config):
    num_samples = len(y)
    if config.simulated_client_dim_reduction == 'pca':
        dimensionality_reduction = PCA(n_components=2)
    elif config.simulated_client_dim_reduction == 'tsne':
        dimensionality_reduction = TSNE(n_components=2, random_state=config.seed)
    train_filter = (p == 'train')
    X_train = X[train_filter, :]
    # Fit the projection on training data only, then project all partitions.
    # Note: sklearn's TSNE does not implement transform(), so only the PCA
    # branch works with this fit-then-transform pattern.
    dimensionality_reduction.fit(X_train)
    X_reduced = dimensionality_reduction.transform(X)
    # An ID that specifies which client each sample should go to.
    sample_client_id = np.zeros(num_samples)
    client_ids = np.arange(config.num_clients, dtype=int)
    for current_class in range(config.num_classes):
        class_filter = (y == current_class)
        class_train_filter = np.logical_and(class_filter, train_filter)
        num_train_samples_class = np.sum(class_train_filter)
        if num_train_samples_class < config.num_clients:
            tsprint(
                f'Number of training samples {num_train_samples_class} is smaller than number of clients {config.num_clients} for class {current_class}.',
                3)
            tsprint('Randomly assigning samples to clients.', 3)
            sample_client_id[class_filter] = np.random.choice(
                client_ids, size=np.sum(class_filter))
        else:
            X_reduced_train_current_class = X_reduced[class_train_filter, :]
            tsprint(
                f'Making simulated client dataset with {config.num_clients} clients.',
                3)
            kmeans = KMeans(
                n_clusters=config.num_clients,
                random_state=config.seed).fit(X_reduced_train_current_class)
            sample_client_id[class_filter] = kmeans.predict(
                X_reduced[class_filter, :])
    tsprint(
        f'Randomly shuffling client assignments with percentage {config.label_shuffle_percentage}.',
        2)
    num_to_shuffle = int(num_samples * config.label_shuffle_percentage)
    choices = np.random.choice(np.arange(num_samples),
                               size=num_to_shuffle,
                               replace=False)
    choices_permute = np.random.permutation(choices)
    sample_client_id[choices] = sample_client_id[choices_permute]
    return sample_client_id
def main():
    try:
        smelt(sys.argv)
    except Exception as e:
        tsprint(traceback.format_exc())
        tsprint(
            "*** USAGE: See https://github.com/czbiohub/MIDAS-IGGdb/blob/master/README.md#smelter ***\n"
        )
        if hasattr(e, 'help_text'):
            tsprint(f"*** {e.help_text} ***")  # pylint: disable=no-member
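# Example invocation (script name and paths are illustrative), matching the
# argv layout that smelt() unpacks below:
#
#   python smelter.py collate_repgenomes /path/to/outdir /path/to/metadata/species_info.tsv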
def simulate_clients(X, y, config):
    num_samples = len(y)
    if config.simulated_client_dim_reduction == 'pca':
        dimensionality_reduction = PCA(n_components=2)
    elif config.simulated_client_dim_reduction == 'tsne':
        dimensionality_reduction = TSNE(n_components=2, random_state=config.seed)
    X_reduced = dimensionality_reduction.fit_transform(X)
    # An ID that specifies which client each sample should go to.
    sample_client_id = np.zeros(num_samples)
    client_ids = np.arange(config.num_clients, dtype=int)
    for current_class in range(config.num_classes):
        idx = (y[:, current_class] == 1)
        x = X_reduced[idx, :]
        num_samples_class = x.shape[0]
        if num_samples_class < config.num_clients:
            tsprint(
                f'Number of samples {num_samples_class} is smaller than number of clients {config.num_clients} for class {current_class}. Randomly assigning samples to clients.',
                3)
            sample_client_id[idx] = np.random.choice(client_ids,
                                                     size=num_samples_class)
        else:
            tsprint(
                f'Making simulated client dataset with {config.num_clients} clients.',
                3)
            kmeans = KMeans(n_clusters=config.num_clients,
                            random_state=config.seed).fit(x)
            sample_client_id[idx] = kmeans.labels_
    tsprint(
        f'Randomly shuffling client assignments with percentage {config.label_shuffle_percentage}.',
        2)
    num_to_shuffle = int(num_samples * config.label_shuffle_percentage)
    choices = np.random.choice(np.arange(num_samples),
                               size=num_to_shuffle,
                               replace=False)
    choices_permute = np.random.permutation(choices)
    sample_client_id[choices] = sample_client_id[choices_permute]
    return sample_client_id
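# A minimal usage sketch for simulate_clients on synthetic data. All values
# below are illustrative assumptions, not part of the pipeline, and dotdict is
# assumed to accept keyword arguments like a plain dict.
def _demo_simulate_clients():
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(600, 32))  # 600 samples of 32-dim embeddings
    y_demo = to_categorical(rng.integers(0, 10, size=600), num_classes=10)
    cfg = dotdict(simulated_client_dim_reduction='pca',
                  seed=0,
                  num_clients=5,
                  num_classes=10,
                  label_shuffle_percentage=0.1)
    ids = simulate_clients(X_demo, y_demo, cfg)
    # Every sample is assigned to one of the 5 simulated clients.
    assert ids.shape == (600, ) and set(np.unique(ids)) <= set(range(5))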
def __init__(self, iggdb_toc_species, quiet=False):
    try:
        assert basename(iggdb_toc_species) == "species_info.tsv"
    except Exception as e:
        e.help_text = f"Expected /path/to/species_info.tsv, was given '{iggdb_toc_species}' instead."
        raise
    try:
        self.iggdb_root = dirname(dirname(abspath(iggdb_toc_species)))
        iggdb_toc_genomes = f"{self.iggdb_root}/metadata/genome_info.tsv"
        assert isfile(iggdb_toc_genomes)
        assert isdir(f"{self.iggdb_root}/pangenomes")
        assert isdir(f"{self.iggdb_root}/repgenomes")
    except Exception as e:
        e.help_text = f"Unexpected MIDAS-IGGdb directory structure around {iggdb_toc_species}."
        raise
    self.species_info = list(parse_table(tsv_rows(iggdb_toc_species)))
    self.genome_info = list(parse_table(tsv_rows(iggdb_toc_genomes)))
    if not quiet:
        tsprint(f"Found {len(self.genome_info)} genomes in {iggdb_toc_genomes}.")
        tsprint(f"Found {len(self.species_info)} species in {iggdb_toc_species}, for example:")
        random.seed(time.time())
        random_index = random.randrange(0, len(self.species_info))
        tsprint(json.dumps(self.species_info[random_index], indent=4))
    self.species = {s['species_id']: s for s in self.species_info}
    self.genomes = {g['genome_id']: g for g in self.genome_info}
    for s in self.species_info:
        genome_id = s['representative_genome']
        g = self.genomes[genome_id]
        s['repgenome_with_origin'] = genome_id + "." + g['repository'].lower()
        s['repgenome_path'] = f"{self.iggdb_root}/repgenomes/{s['repgenome_with_origin']}.fna"
        s['pangenome_path'] = f"{self.iggdb_root}/pangenomes/{s['species_alt_id']}"
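# parse_table and tsv_rows come from this repo's utilities and are not shown in
# this section. Plausible minimal sketches of their behavior (header handling
# details are assumptions):
def tsv_rows_sketch(path):
    # Yield each line of a TSV file as a list of column values.
    with open(path) as stream:
        for line in stream:
            yield line.rstrip('\n').split('\t')


def parse_table_sketch(rows):
    # Convert rows into dicts keyed by the header row's column names.
    headers = next(rows)
    for values in rows:
        yield dict(zip(headers, values))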
def load_data(config):
    tsprint('Begin load_data: loading yamnet embeddings and determining labels.', 1)
    with open(paths['yamnet_embeddings'], 'rb') as file:
        embeddings = pickle.load(file)
    labels = {}
    df_train_labels = pd.read_csv(paths['fsd_train_labels_path'])
    df_test_labels = pd.read_csv(paths['fsd_test_labels_path'])
    for df in [df_train_labels, df_test_labels]:
        for fname, label in zip(list(df["fname"]), list(df["label"])):
            fsdid = fname.split('.')[0]
            labels[fsdid] = int(label == 'Cough')
    X = []
    y = []
    p = []
    for partition, data in embeddings['fsd'].items():
        for k, v in data.items():
            X.append(v)
            y.append(labels[k])
            p.append(partition)
    X = np.asarray(X)
    y = np.asarray(y)
    p = np.asarray(p)
    sample_client_id = simulate_clients(X, y, p, config)

    # Determine the Fréchet distances between each client and the rest of the data.
    client_frechet_distances = []
    frechet_dict = {}
    for current_client in np.unique(sample_client_id):
        client_filter = (sample_client_id == current_client)
        not_client_filter = (sample_client_id != current_client)
        distance = frechet_distance(X[not_client_filter, :], X[client_filter, :])
        client_frechet_distances.append(distance)
        frechet_dict[str(int(current_client))] = distance
    mu = np.mean(client_frechet_distances)
    std = np.std(client_frechet_distances)
    tsprint(f'Client average Fréchet distance: {mu:.02f} +/- {std:.02f}', 1)
    tsprint(f'Client Fréchet distances: {client_frechet_distances}', 1)
    client_average_frechet = mu

    data = {}
    for partition in np.unique(p):
        client_data_dict = {}
        for current_client in np.unique(sample_client_id):
            client_filter = (sample_client_id == current_client)
            partition_filter = (p == partition)
            idx = np.logical_and(client_filter, partition_filter)
            label_distribution = [np.sum(y[idx] == 0), np.sum(y[idx] == 1)]
            tsprint(
                f'Client {current_client}, {partition} label distribution: {label_distribution}.',
                1)
            client_id = str(int(current_client))
            client_data_dict[client_id] = {
                'features': X[idx, :],
                'label': y[idx]
            }
            if config.global_conditioning:
                # Cast to int so tf.one_hot receives integer indices.
                batch_client_id = np.ones(shape=(X[idx, :].shape[0], ),
                                          dtype=int) * int(current_client)
                batch_client_id = tf.one_hot(batch_client_id,
                                             depth=config.num_clients)
                client_data_dict[client_id]['client_id'] = batch_client_id
        data[partition] = tff.simulation.FromTensorSlicesClientData(client_data_dict)
    example_dataset = data['train'].create_tf_dataset_for_client(
        data['train'].client_ids[0])
    preprocessed_example_dataset = preprocess(config, example_dataset)
    tsprint('Done load_data: returning data dict.', 1)
    return data, preprocessed_example_dataset, client_average_frechet
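# frechet_distance is defined elsewhere. A plausible sketch following the usual
# FID-style Fréchet distance between Gaussians fitted to the two embedding
# sets; this exact formulation is an assumption about the helper's behavior.
from scipy import linalg  # assumed dependency, for the matrix square root


def frechet_distance_sketch(A, B):
    mu_a, mu_b = A.mean(axis=0), B.mean(axis=0)
    cov_a = np.cov(A, rowvar=False)
    cov_b = np.cov(B, rowvar=False)
    covmean = linalg.sqrtm(cov_a.dot(cov_b))
    if np.iscomplexobj(covmean):
        covmean = covmean.real  # discard negligible imaginary parts from sqrtm
    return float(np.sum((mu_a - mu_b)**2) +
                 np.trace(cov_a + cov_b - 2.0 * covmean))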
def main(config):
    tsprint(f'Using seed: {config.seed}.')
    tsprint('Loading data and simulating clients.')
    data, preprocessed_example_dataset = load_data(config)

    tsprint('Setting up model definition based on sample dataset.')

    # Construct a fresh optimizer each time TFF asks for one; returning a
    # shared instance from a lambda can break federated averaging.
    def client_optimizer_fn():
        return tf.keras.optimizers.SGD(
            learning_rate=config.client_learning_rate,
            momentum=config.client_learning_rate_momentum,
            decay=config.client_learning_rate_decay)

    m = Model(config, preprocessed_example_dataset)
    tsprint('Model definition done.')

    tsprint('Setting up federated learning process.')
    iterative_process = tff.learning.build_federated_averaging_process(
        m.get_tff_model, client_optimizer_fn=client_optimizer_fn)
    state = iterative_process.initialize()
    evaluation = tff.learning.build_federated_evaluation(m.get_tff_model)

    tsprint('Beginning federated training.', 1)
    # This is overwritten at each step if the number of sampled clients is
    # smaller than the total number of clients.
    federated_train_data = make_federated_data(config, data['train'],
                                               data['train'].client_ids)
    federated_val_data = make_federated_data(config, data['val'],
                                             data['val'].client_ids)
    global_step = 0
    best_loss = 1e6
    for _ in range(config.max_num_fl_rounds):
        global_step += 1
        if config.num_sampled_clients < config.num_clients:
            sample_clients = subsample_clients(config, data['train'].client_ids)
            federated_train_data = make_federated_data(config, data['train'],
                                                       sample_clients)
            federated_val_data = make_federated_data(config, data['val'],
                                                     sample_clients)
        state, train_metrics = iterative_process.next(state, federated_train_data)
        val_metrics = evaluation(state.model, federated_val_data)
        log_metrics_train_val(train_metrics, val_metrics, global_step)
        if val_metrics.loss < best_loss:
            tsprint(
                f'Updating best state (previous best: {best_loss}, new best: {val_metrics.loss}).',
                2)
            best_loss, best_state, best_step = val_metrics.loss, state, global_step
        for split_str, metrics in zip(('train', 'validation'),
                                      (train_metrics, val_metrics)):
            for metric_str, metric in zip(
                    ('loss', 'accuracy', 'auc'),
                    (metrics.loss, metrics.categorical_accuracy, metrics.auc)):
                wandb.log({f'{split_str}/{metric_str}': metric}, step=global_step)

    tsprint('Begin final evaluation.', 1)
    for (split_str, ds) in data.items():
        federated_data = make_federated_data(config, ds, ds.client_ids,
                                             final_eval=True)
        metrics = evaluation(best_state.model, federated_data)
        log_metrics(split_str, metrics, global_step)
        for metric_str, metric in zip(
                ('loss', 'accuracy', 'auc'),
                (metrics.loss, metrics.categorical_accuracy, metrics.auc)):
            wandb.log({f'final_{split_str}/{metric_str}': metric}, step=global_step)
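# make_federated_data and subsample_clients are defined elsewhere. Plausible
# minimal sketches following the standard TFF tutorial pattern; the final_eval
# flag and its effect on preprocessing are assumptions.
def make_federated_data_sketch(config, client_data, client_ids, final_eval=False):
    # One preprocessed tf.data.Dataset per participating client.
    return [
        preprocess(config, client_data.create_tf_dataset_for_client(c))
        for c in client_ids
    ]


def subsample_clients_sketch(config, client_ids):
    # Draw a round's client cohort uniformly at random without replacement.
    return np.random.choice(client_ids,
                            size=config.num_sampled_clients,
                            replace=False)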
parser.add_argument('--client_learning_rate', type=float)
parser.add_argument('--client_learning_rate_decay', type=float)
parser.add_argument('--client_learning_rate_momentum', type=float)
args = parser.parse_args()

# Use Weights&Biases to keep track of experiments.
# TODO: something is fishy with the logging, though.
# To turn off logging, run 'wandb off' from the terminal
# in the federated_cough_detector folder.
wandb.init(project="[FILL IN]",
           entity="[FILL IN]",
           group=args.__dict__['experiment_group'])
for k, v in args.__dict__.items():
    if v is not None:
        tsprint(f'Changing parameter {k} from default to: {v}')
        wandb.config.update({k: v}, allow_val_change=True)
config = wandb.config

# Get and use the random seed.
np.random.seed(config.seed)
tf.random.set_seed(config.seed)

# The config dotdict stems from the config-defaults.yaml, which
# is automatically loaded by W&B; do not change the name of the
# YAML file.
tsprint('Final config:')
tsprint(config)
main(config)
def main(config):
    tsprint(f'Using seed: {config.seed}.')
    tsprint('Loading data and simulating clients.')
    data, preprocessed_example_dataset, client_average_frechet = load_data(config)

    tsprint('Setting up model definition based on sample dataset.')
    m = Model(config, preprocessed_example_dataset)

    tsprint('Setting up federated learning process.')
    # The tff.learning.build_federated_averaging_process path used previously
    # has been replaced by the simple_fedavg implementation below.

    def server_optimizer_fn():
        return tf.keras.optimizers.SGD(learning_rate=1.0)

    def client_optimizer_fn():
        return tf.keras.optimizers.SGD(
            learning_rate=config.client_learning_rate,
            momentum=config.client_learning_rate_momentum,
            decay=config.client_learning_rate_decay)

    def keras_evaluate(model, test_data, metrics):
        metrics_collective, metrics_clients = metrics
        results_collective = {}
        results_clients = {client_id: {} for client_id in metrics_clients.keys()}
        for key, metric in metrics_collective.items():
            metric.reset_states()
            for client_id, client_ds in zip(metrics_clients.keys(), test_data):
                metrics_clients[client_id][key].reset_states()
                for batch in client_ds:
                    preds = model(batch['x'], training=False)
                    metric(batch['y'], preds)
                    metrics_clients[client_id][key](batch['y'], preds)
                results_clients[client_id][key] = metrics_clients[client_id][key].result()
            results_collective[key] = metric.result()
        return dotdict(results_collective), results_clients

    iterative_process = simple_fedavg_tff.build_federated_averaging_process(
        m.get_simplefedavg_tff_model, server_optimizer_fn, client_optimizer_fn)
    state = iterative_process.initialize()

    tsprint('Beginning federated training.', 1)
    # This is overwritten at each step if the number of sampled clients is
    # smaller than the total number of clients.
    federated_train_data = make_federated_data(config, data['train'],
                                               data['train'].client_ids)
    federated_val_data = make_federated_data(config, data['val'],
                                             data['val'].client_ids,
                                             final_eval=True)
    global_step = 0
    best_loss = 1e6
    monitoring_metrics_fns = {
        'loss': tf.keras.metrics.BinaryCrossentropy,
    }
    monitoring_metrics = {}
    monitoring_metrics_clients = {c: {} for c in data['train'].client_ids}
    for k, v in monitoring_metrics_fns.items():
        monitoring_metrics[k] = v()
        for c in data['train'].client_ids:
            monitoring_metrics_clients[c][k] = v()
    model = m.get_simplefedavg_tff_model()
    for _ in range(config.max_num_fl_rounds):
        global_step += 1
        if config.num_sampled_clients < config.num_clients:
            sample_clients = subsample_clients(config, data['train'].client_ids)
            federated_train_data = make_federated_data(config, data['train'],
                                                       sample_clients)
            federated_val_data = make_federated_data(config, data['val'],
                                                     sample_clients,
                                                     final_eval=True)
        state, train_metrics = iterative_process.next(state, federated_train_data)
        train_metrics_loss = train_metrics
        model.from_weights(state.model_weights)
        val_metrics, val_metrics_clients = keras_evaluate(
            model.keras_model, federated_val_data,
            [monitoring_metrics, monitoring_metrics_clients])
        tsprint(f'Round {global_step} loss: {train_metrics_loss} \t {val_metrics.loss}', 0)
        wandb.log({'train/loss': train_metrics_loss}, step=global_step)
        wandb.log({'validation/loss': val_metrics.loss}, step=global_step)
        if val_metrics.loss < best_loss:
            tsprint(
                f'Updating best state (previous best: {best_loss}, new best: {val_metrics.loss}).',
                2)
            best_loss, best_state, best_step = val_metrics.loss, state, global_step

    tsprint('Begin final evaluation.', 1)
    wandb.log({'client_average_frechet': client_average_frechet}, step=global_step)
    final_eval_metrics_fns = {
        'loss': tf.keras.metrics.BinaryCrossentropy,
        'accuracy': tf.keras.metrics.BinaryAccuracy,
        'auc': tf.keras.metrics.AUC
    }
    final_eval_metrics = {}
    final_eval_metrics_clients = {c: {} for c in data['train'].client_ids}
    for k, v in final_eval_metrics_fns.items():
        final_eval_metrics[k] = v()
        for c in data['train'].client_ids:
            final_eval_metrics_clients[c][k] = v()
    for (split_str, ds) in data.items():
        federated_data = make_federated_data(config, ds, ds.client_ids,
                                             final_eval=True)
        model.from_weights(best_state.model_weights)
        cur_metrics, cur_metrics_clients = keras_evaluate(
            model.keras_model, federated_data,
            [final_eval_metrics, final_eval_metrics_clients])
        for k, v in cur_metrics_clients.items():
            tsprint(f'{k}: {v}', 1)
        for metric_str, metric in zip(
                ('loss', 'auc', 'accuracy'),
                (cur_metrics.loss, cur_metrics.auc, cur_metrics.accuracy)):
            wandb.log({f'final_{split_str}/{metric_str}': metric}, step=global_step)
            tsprint(f'final_{split_str}/{metric_str}: {metric}')
    tsprint('Done.')
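# dotdict is defined elsewhere; a common minimal implementation consistent with
# its use above (attribute access such as val_metrics.loss on a plain dict):
class dotdict_sketch(dict):
    """dict subclass that also exposes its keys as attributes."""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__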
def smelt(argv):
    cwd = backtick('pwd')
    my_command = f"cd {cwd}; " + ' '.join(argv)
    tsprint(my_command)
    _, subcmd, outdir, iggdb_toc = argv
    subcmd = subcmd.replace("-", "_").lower()
    SUBCOMMANDS = {f"collate_{gdim}": gdim for gdim in ["pangenomes", "repgenomes"]}
    gdim = SUBCOMMANDS.get(subcmd)
    try:
        assert gdim in ["pangenomes", "repgenomes"]
    except Exception as e:
        e.help_text = f"Try a supported subcommand instead of {subcmd}."
        raise
    makedirs(outdir, exist_ok=False)
    iggdb = IGGdb(iggdb_toc)
    tsprint(f"Now collating fasta for gsnap {gdim} index construction.")
    MAX_FAILURES = 100
    count_successes = 0
    ticker = ProgressTracker(target=len(iggdb.species_info))
    failures = []
    for s in iggdb.species_info:
        try:
            s_species_alt_id = s['species_alt_id']
            s_tempfile = f"{outdir}/temp_{gdim}_{s_species_alt_id}.fa"
            # Note how the header tags we emit below for pangenomes and
            # repgenomes are consistent; this should enable easy reconciliation
            # of gsnap alignments against the two separate indexes.
            if gdim == "pangenomes":
                #
                # The header tag we wish to emit would be
                #
                #     >4547837|1657.8.patric|rest_of_original_header_from_pangenome_file
                #
                # where
                #
                #     species_alt_id = 4547837               # from species_alt_id column in table
                #     repgenome_with_origin = 1657.8.patric  # from original header in pangenome file
                #
                # As the original header in the pangenome file already begins
                # with s_rg_id_origin, we just need to prepend species_alt_id.
                #
                s_header_xform = f"sed 's=^>=>{s_species_alt_id}|=' {s['pangenome_path']} > {s_tempfile} && cat {s_tempfile} >> {outdir}/temp_{gdim}.fa && rm {s_tempfile} && echo SUCCEEDED || echo FAILED"
            else:
                assert gdim == "repgenomes"
                #
                # The header tag we wish to emit would be
                #
                #     >4547837|1657.8.patric|entire_original_header_from_repgenome_file
                #
                # where
                #
                #     species_alt_id = 4547837               # species_alt_id column in species_info
                #     repgenome_with_origin = 1657.8.patric  # from file listing in repgenomes dir
                #
                s_repgenome_with_origin = s['repgenome_with_origin']
                s_repgenome_path = s['repgenome_path']
                s_header_xform = f"sed 's=^>=>{s_species_alt_id}|{s_repgenome_with_origin}|=' {s_repgenome_path} > {s_tempfile} && cat {s_tempfile} >> {outdir}/temp_{gdim}.fa && rm {s_tempfile} && echo SUCCEEDED || echo FAILED"
            status = backtick(s_header_xform)
            assert status == "SUCCEEDED"
            count_successes += 1
        except Exception as e:
            failures.append(s)
            if len(failures) == MAX_FAILURES:
                count_examined = len(failures) + count_successes
                e.help_text = f"Giving up after {MAX_FAILURES} failures in first {count_examined} species. See temp files for more info."
                raise
        finally:
            ticker.advance(1)
    failed_species_alt_ids = [s['species_alt_id'] for s in failures]
    if not failures:
        tsprint(f"All {len(iggdb.species_info)} species were processed successfully.")
    else:
        tsprint(f"Collation of {len(failures)} species failed. Those are missing from the final {gdim}.fa.")
    # Create the output file only on success, and dump stats in json.
    collation_status = {
        "comment": f"Collation into {gdim}.fa succeeded on {time.asctime()} with command '{my_command}'.",
        "successfully_collated_species_count": count_successes,
        "failed_species_count": len(failures),
        "total_species_count": len(iggdb.species_info),
        "failed_species_alt_ids": failed_species_alt_ids,
        "elapsed_time": time.time() - ticker.t_start
    }
    collation_status_str = json.dumps(collation_status, indent=4)
    with open(f"{outdir}/{gdim}_collation_status.json", "w") as pcs:
        chars_written = pcs.write(collation_status_str)
        assert chars_written == len(collation_status_str)
    tsprint(collation_status_str)
    os.rename(f"{outdir}/temp_{gdim}.fa", f"{outdir}/{gdim}.fa")
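# backtick is a shell helper from this repo's utilities; a plausible minimal
# sketch (error handling and output-stripping behavior are assumptions):
import subprocess


def backtick_sketch(command):
    # Run the command through a shell and return its stdout, trimmed,
    # mimicking shell backticks. Raises on a nonzero exit status.
    return subprocess.check_output(command, shell=True, text=True).strip()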