Example #1
def load_data(config):
    tsprint('Begin load_data: loading CIFAR-10 embeddings and labels.', 1)

    with open(paths['cifar10_embeddings'], 'rb') as file:
        embeddings = pickle.load(file)
    with open(paths['cifar10_labels'], 'rb') as file:
        labels = pickle.load(file)

    X = []
    y = []
    p = []
    for partition in ['train', 'val', 'test']:

        X_current = embeddings[partition]

        y_current = to_categorical(labels[partition], num_classes=10)
        p_current = np.asarray(len(y_current) * [partition])
        X.append(X_current)
        y.append(y_current)
        p.append(p_current)
    X = np.vstack(X)
    y = np.vstack(y)
    p = np.hstack(p)

    sample_client_id = simulate_clients(X, y, config)

    data = {}
    for partition in np.unique(p):
        client_data_dict = {}
        for current_client in np.unique(sample_client_id):
            client_filter = (sample_client_id == current_client)
            partition_filter = (p == partition)
            idx = np.logical_and(client_filter, partition_filter)

            client_data_dict[str(current_client)] = {
                'features': X[idx, :],
                'label': y[idx]
            }
            if config.global_conditioning:
                batch_client_id = np.ones(shape=(X[idx, :].shape[0], ),
                                          dtype=int) * current_client
                batch_client_id = tf.one_hot(batch_client_id,
                                             depth=config.num_clients)
                client_data_dict[str(
                    current_client)]['client_id'] = batch_client_id
        data[partition] = tff.simulation.FromTensorSlicesClientData(
            client_data_dict)

    example_dataset = data['train'].create_tf_dataset_for_client(
        data['train'].client_ids[0])
    preprocessed_example_dataset = preprocess(config, example_dataset)

    tsprint('Done load_data: returning data dict.', 1)
    return data, preprocessed_example_dataset
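For orientation, here is a minimal, self-contained sketch of the per-client structure that load_data hands to TFF, using the same (older) tff.simulation.FromTensorSlicesClientData API as the code above; the client names, feature dimensionality, and sample counts are illustrative only:

import numpy as np
import tensorflow_federated as tff

# Two hypothetical clients, each with 'features' and one-hot 'label' arrays,
# mirroring the client_data_dict built in load_data above.
client_data_dict = {
    '0': {'features': np.random.randn(3, 8).astype('float32'),
          'label': np.eye(10, dtype='float32')[[1, 4, 7]]},
    '1': {'features': np.random.randn(2, 8).astype('float32'),
          'label': np.eye(10, dtype='float32')[[0, 9]]},
}
client_data = tff.simulation.FromTensorSlicesClientData(client_data_dict)
ds = client_data.create_tf_dataset_for_client(client_data.client_ids[0])
for example in ds.take(2):
    print(example['features'].shape, example['label'].shape)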
Example #2
def simulate_clients(X, y, p, config):
    num_samples = len(y)

    if config.simulated_client_dim_reduction == 'pca':
        dimensionality_reduction = PCA(n_components=2)
    elif config.simulated_client_dim_reduction == 'tsne':
        dimensionality_reduction = TSNE(n_components=2,
                                        random_state=config.seed)

    train_filter = (p == 'train')
    X_train = X[train_filter, :]
    dimensionality_reduction.fit(X_train)
    X_reduced = dimensionality_reduction.transform(X)

    sample_client_id = np.zeros(
        num_samples
    )  # an ID that specifies which client each sample should go to
    client_ids = np.arange(config.num_clients, dtype=int)

    for current_class in range(config.num_classes):
        class_filter = (y == current_class)
        class_train_filter = np.logical_and(class_filter, train_filter)
        num_train_samples_class = np.sum(class_train_filter)
        if num_train_samples_class < config.num_clients:
            tsprint(
                f'Number of training samples {num_train_samples_class} is smaller than number of clients {config.num_clients} for class {current_class}.',
                3)
            tsprint(f'Randomly assigning samples to clients.', 3)
            sample_client_id[class_filter] = np.random.choice(
                client_ids, size=np.sum(class_filter))
        else:
            X_reduced_train_current_class = X_reduced[class_train_filter, :]
            tsprint(
                f'Making simulated client dataset with {config.num_clients} clients.',
                3)
            kmeans = KMeans(
                n_clusters=config.num_clients,
                random_state=config.seed).fit(X_reduced_train_current_class)
            sample_client_id[class_filter] = kmeans.predict(
                X_reduced[class_filter, :])

    tsprint(
        f'Randomly shuffling client assignments with percentage {config.label_shuffle_percentage}.',
        2)
    num_to_shuffle = int(num_samples * config.label_shuffle_percentage)
    choices = np.random.choice(np.arange(num_samples),
                               size=num_to_shuffle,
                               replace=False)
    choices_permute = np.random.permutation(choices)
    sample_client_id[choices] = sample_client_id[choices_permute]
    return sample_client_id
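A self-contained toy version of the same idea follows: reduce the embeddings to 2-D, cluster each class with KMeans, and use the cluster indices as client assignments. All sizes, seeds, and parameter values below are illustrative assumptions, not project defaults:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
num_clients, num_classes = 4, 2
X = rng.normal(size=(200, 16))               # toy embeddings
y = rng.integers(0, num_classes, size=200)   # toy integer class labels

X_reduced = PCA(n_components=2).fit_transform(X)
sample_client_id = np.zeros(len(y))
for current_class in range(num_classes):
    class_filter = (y == current_class)
    kmeans = KMeans(n_clusters=num_clients,
                    random_state=0).fit(X_reduced[class_filter, :])
    sample_client_id[class_filter] = kmeans.labels_

print(np.bincount(sample_client_id.astype(int)))  # samples per simulated client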
Example #3
def main():
    try:
        smelt(sys.argv)
    except Exception as e:
        tsprint(traceback.format_exc())
        tsprint(
            "*** USAGE:  See https://github.com/czbiohub/MIDAS-IGGdb/blob/master/README.md#smelter ***\n"
        )
        if hasattr(e, 'help_text'):
            tsprint(f"*** {e.help_text} ***")  # pylint: disable=no-member
Example #4
def simulate_clients(X, y, config):
    num_samples = len(y)

    if config.simulated_client_dim_reduction == 'pca':
        dimensionality_reduction = PCA(n_components=2)
    elif config.simulated_client_dim_reduction == 'tsne':
        dimensionality_reduction = TSNE(n_components=2,
                                        random_state=config.seed)

    X_reduced = dimensionality_reduction.fit_transform(X)

    sample_client_id = np.zeros(
        num_samples
    )  # an ID that specifies which client each sample should go to
    client_ids = np.arange(config.num_clients, dtype=int)

    for current_class in range(config.num_classes):
        idx = (y[:, current_class] == 1)
        x = X_reduced[idx, :]
        num_samples_class = x.shape[0]
        if num_samples_class < config.num_clients:
            tsprint(
                f'Number of samples {num_samples_class} is smaller than number of clients {config.num_clients} for class {current_class}. Randomly assigning samples to clients.',
                3)
            sample_client_id[idx] = np.random.choice(client_ids,
                                                     size=num_samples_class)
        else:
            tsprint(
                f'Making simulated client dataset with {config.num_clients} clients.',
                3)
            kmeans = KMeans(n_clusters=config.num_clients,
                            random_state=config.seed).fit(x)
            sample_client_id[idx] = kmeans.labels_

    tsprint(
        f'Randomly shuffling client assignments with percentage {config.label_shuffle_percentage}.',
        2)
    num_to_shuffle = int(num_samples * config.label_shuffle_percentage)
    choices = np.random.choice(np.arange(num_samples),
                               size=num_to_shuffle,
                               replace=False)
    choices_permute = np.random.permutation(choices)
    sample_client_id[choices] = sample_client_id[choices_permute]
    return sample_client_id
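The shuffle step at the end moves a fraction of samples between clients by permuting their existing assignments among themselves; no new client IDs are drawn. A tiny numpy illustration of exactly that step, with made-up values:

import numpy as np

np.random.seed(0)
sample_client_id = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=float)
label_shuffle_percentage = 0.5
num_samples = len(sample_client_id)

num_to_shuffle = int(num_samples * label_shuffle_percentage)
choices = np.random.choice(np.arange(num_samples),
                           size=num_to_shuffle,
                           replace=False)
choices_permute = np.random.permutation(choices)
sample_client_id[choices] = sample_client_id[choices_permute]
print(sample_client_id)  # some of the original assignments have swapped places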
Example #5
def __init__(self, iggdb_toc_species, quiet=False):
    try:
        assert basename(iggdb_toc_species) == "species_info.tsv"
    except Exception as e:
        e.help_text = f"Expected /path/to/species_info.tsv, was given '{iggdb_toc_species}' instead."
        raise
    try:
        self.iggdb_root = dirname(dirname(abspath(iggdb_toc_species)))
        iggdb_toc_genomes = f"{self.iggdb_root}/metadata/genome_info.tsv"
        assert isfile(iggdb_toc_genomes)
        assert isdir(f"{self.iggdb_root}/pangenomes")
        assert isdir(f"{self.iggdb_root}/repgenomes")
    except Exception as e:
        e.help_text = f"Unexpected MIDAS-IGGdb directory structure around {iggdb_toc_species}."
        raise
    self.species_info = list(parse_table(tsv_rows(iggdb_toc_species)))
    self.genome_info = list(parse_table(tsv_rows(iggdb_toc_genomes)))
    if not quiet:
        tsprint(
            f"Found {len(self.genome_info)} genomes in {iggdb_toc_genomes}."
        )
        tsprint(
            f"Found {len(self.species_info)} species in {iggdb_toc_species}, for example:"
        )
        random.seed(time.time())
        random_index = random.randrange(0, len(self.species_info))
        tsprint(json.dumps(self.species_info[random_index], indent=4))
    self.species = {s['species_id']: s for s in self.species_info}
    self.genomes = {g['genome_id']: g for g in self.genome_info}
    for s in self.species_info:
        genome_id = s['representative_genome']
        g = self.genomes[genome_id]
        s['repgenome_with_origin'] = genome_id + "." + g[
            'repository'].lower()
        s['repgenome_path'] = f"{self.iggdb_root}/repgenomes/{s['repgenome_with_origin']}.fna"
        s['pangenome_path'] = f"{self.iggdb_root}/pangenomes/{s['species_alt_id']}"
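For orientation, a hypothetical use of this constructor (the class is instantiated as IGGdb(iggdb_toc) in the smelt example below); the path is illustrative only:

# Hypothetical path; the constructor infers iggdb_root two levels up from the TOC.
iggdb = IGGdb("/path/to/MIDAS-IGGdb/metadata/species_info.tsv", quiet=True)
first_species = iggdb.species_info[0]
print(first_species['species_id'], first_species['repgenome_path'])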
Example #6
def load_data(config):
    tsprint(
        'Begin load_data: loading YAMNet embeddings and determining labels.', 1)

    with open(paths['yamnet_embeddings'], 'rb') as file:
        embeddings = pickle.load(file)

    labels = {}
    df_train_labels = pd.read_csv(paths['fsd_train_labels_path'])
    df_test_labels = pd.read_csv(paths['fsd_test_labels_path'])
    for df in [df_train_labels, df_test_labels]:
        for fname, label in zip(list(df["fname"]), list(df["label"])):
            fsdid = fname.split('.')[0]
            labels[fsdid] = int(label == 'Cough')

    X = []
    y = []
    p = []
    for partition, data in embeddings['fsd'].items():
        for k, v in data.items():
            X.append(v)
            y.append(labels[k])
            p.append(partition)
    X = np.asarray(X)
    y = np.asarray(y)
    p = np.asarray(p)
    sample_client_id = simulate_clients(X, y, p, config)

    # Determine the Fréchet distance between each client and the rest of the data
    client_frechet_distances = []
    frechet_dict = {}
    for current_client in np.unique(sample_client_id):
        client_filter = (sample_client_id == current_client)
        not_client_filter = (sample_client_id != current_client)

        distance = frechet_distance(X[not_client_filter, :],
                                    X[client_filter, :])
        client_frechet_distances.append(distance)

        frechet_dict[str(int(current_client))] = distance

    mu = np.mean(client_frechet_distances)
    std = np.std(client_frechet_distances)
    tsprint(f'Client average Fréchet distance: {mu:.02f} +/- {std:.02f}', 1)
    tsprint(f'Client Fréchet distances: {client_frechet_distances}', 1)

    client_average_frechet = mu

    data = {}
    for partition in np.unique(p):
        client_data_dict = {}
        for current_client in np.unique(sample_client_id):
            client_filter = (sample_client_id == current_client)
            partition_filter = (p == partition)
            idx = np.logical_and(client_filter, partition_filter)

            label_distribution = [np.sum(y[idx] == 0), np.sum(y[idx] == 1)]
            tsprint(
                f'Client {current_client}, {partition} label distribution: {label_distribution}.',
                1)
            client_id = str(int(current_client))
            client_data_dict[client_id] = {
                'features': X[idx, :],
                'label': y[idx]
            }
            if config.global_conditioning:
                batch_client_id = np.ones(shape=(X[idx, :].shape[0], ),
                                          dtype=int) * current_client
                batch_client_id = tf.one_hot(batch_client_id,
                                             depth=config.num_clients)
                client_data_dict[client_id]['client_id'] = batch_client_id

        data[partition] = tff.simulation.FromTensorSlicesClientData(
            client_data_dict)

    example_dataset = data['train'].create_tf_dataset_for_client(
        data['train'].client_ids[0])
    preprocessed_example_dataset = preprocess(config, example_dataset)

    tsprint('Done load_data: returning data dict.', 1)
    return data, preprocessed_example_dataset, client_average_frechet
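The frechet_distance helper called above is not shown in these examples. Below is a minimal sketch of one plausible implementation, assuming it computes the usual Gaussian (FID-style) Fréchet distance between two sets of embeddings:

import numpy as np
from scipy import linalg

def frechet_distance(X_a, X_b):
    # Fit a Gaussian to each embedding set and compare the two fits
    # (assumption: this mirrors the helper used in load_data above).
    mu_a, mu_b = X_a.mean(axis=0), X_b.mean(axis=0)
    sigma_a = np.cov(X_a, rowvar=False)
    sigma_b = np.cov(X_b, rowvar=False)
    covmean, _ = linalg.sqrtm(sigma_a @ sigma_b, disp=False)
    if np.iscomplexobj(covmean):
        covmean = covmean.real
    diff = mu_a - mu_b
    return float(diff @ diff + np.trace(sigma_a + sigma_b - 2.0 * covmean))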
def main(config):
    tsprint(f'Using seed: {config.seed}.')
    tsprint('Loading data and simulating clients.')
    data, preprocessed_example_dataset = load_data(config)

    tsprint('Setting up model definition based on sample dataset.')
    def client_optimizer_fn():
        return tf.keras.optimizers.SGD(
            learning_rate=config.client_learning_rate,
            momentum=config.client_learning_rate_momentum,
            decay=config.client_learning_rate_decay)
    m = Model(config, preprocessed_example_dataset)
    print("***************************\n MODEL DONE")
    tsprint('Setup federated learning process.')
    iterative_process = tff.learning.build_federated_averaging_process(
        m.get_tff_model, client_optimizer_fn=client_optimizer_fn)
    state = iterative_process.initialize()
    evaluation = tff.learning.build_federated_evaluation(m.get_tff_model)

    tsprint('Beginning federated training.', 1)
    # This is overwritten at each step, if the number of sampled clients is smaller than
    # the total number of clients.
    federated_train_data = make_federated_data(config, data['train'],
                                               data['train'].client_ids)
    federated_val_data = make_federated_data(config, data['val'],
                                             data['val'].client_ids)

    global_step = 0
    best_loss = 1e6
    for _ in range(config.max_num_fl_rounds):
        global_step += 1

        if config.num_sampled_clients < config.num_clients:
            sample_clients = subsample_clients(config,
                                               data['train'].client_ids)
            federated_train_data = make_federated_data(config, data['train'],
                                                       sample_clients)
            federated_val_data = make_federated_data(config, data['val'],
                                                     sample_clients)

        state, train_metrics = iterative_process.next(state,
                                                      federated_train_data)
        val_metrics = evaluation(state.model, federated_val_data)
        log_metrics_train_val(train_metrics, val_metrics, global_step)

        if val_metrics.loss < best_loss:
            tsprint(
                f'Updating best state (previous best: {best_loss}, new best: {val_metrics.loss}).',
                2)
            best_loss, best_state, best_step = val_metrics.loss, state, global_step

        for split_str, metrics in zip(('train', 'validation'),
                                      (train_metrics, val_metrics)):
            for metric_str, metric in zip(
                ('loss', 'accuracy', 'auc'),
                (metrics.loss, metrics.categorical_accuracy, metrics.auc)):
                wandb.log({f'{split_str}/{metric_str}': metric},
                          step=global_step)

    tsprint('Begin final evaluation.', 1)
    for (split_str, ds) in data.items():
        federated_data = make_federated_data(config,
                                             ds,
                                             ds.client_ids,
                                             final_eval=True)
        metrics = evaluation(best_state.model, federated_data)
        log_metrics(split_str, metrics, global_step)
        for metric_str, metric in zip(
            ('loss', 'accuracy', 'auc'),
            (metrics.loss, metrics.categorical_accuracy, metrics.auc)):
            wandb.log({f'final_{split_str}/{metric_str}': metric},
                      step=global_step)
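The make_federated_data and subsample_clients helpers used by this training loop are not shown in these examples. A minimal sketch of what the loop appears to expect, assuming one preprocessed tf.data.Dataset per selected client and uniform client subsampling:

import numpy as np

def make_federated_data(config, client_data, client_ids, final_eval=False):
    # One preprocessed dataset per selected client; final_eval presumably
    # switches off shuffling/repeating inside preprocess (assumption).
    return [
        preprocess(config, client_data.create_tf_dataset_for_client(c))
        for c in client_ids
    ]

def subsample_clients(config, client_ids):
    # Uniformly sample a subset of clients for this round (assumption).
    return list(
        np.random.choice(client_ids,
                         size=config.num_sampled_clients,
                         replace=False))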
    parser.add_argument('--client_learning_rate', type=float)
    parser.add_argument('--client_learning_rate_decay', type=float)
    parser.add_argument('--client_learning_rate_momentum', type=float)
    args = parser.parse_args()

    # Use Weights & Biases to keep track of experiments.
    # TODO: something is fishy with the logging, though.
    # To turn off logging, run 'wandb off' from the terminal
    # in the federated_cough_detector folder.
    wandb.init(project="[FILL IN]",
               entity="[FILL IN]",
               group=args.__dict__['experiment_group'])

    for k, v in args.__dict__.items():
        if v is not None:
            tsprint(f'Changing parameter {k} from default to: {v}')
            wandb.config.update({k: v}, allow_val_change=True)

    config = wandb.config

    # Get and use random seed
    np.random.seed(config.seed)
    tf.random.set_seed(config.seed)

    # The config dotdict stems from the config-defaults.yaml, which
    # is automatically loaded by W&B; do not change the name of the
    # YAML file.
    tsprint('Final config:')
    tsprint(config)

    main(config)
Example #9
def main(config):
    tsprint(f'Using seed: {config.seed}.')
    tsprint('Loading data and simulating clients.')
    data, preprocessed_example_dataset, client_average_frechet = load_data(
        config)

    tsprint('Setting up model definition based on sample dataset.')
    #client_optimizer_fn = tf.keras.optimizers.SGD(learning_rate=config.client_learning_rate,
    #                                              momentum=config.client_learning_rate_momentum,
    #                                              decay=config.client_learning_rate_decay)
    m = Model(config, preprocessed_example_dataset)

    tsprint('Setup federated learning process.')

    #iterative_process = tff.learning.build_federated_averaging_process(
    #  m.get_tff_model,
    #  client_optimizer_fn=lambda: client_optimizer_fn)
    #state = iterative_process.initialize()
    #evaluation = tff.learning.build_federated_evaluation(m.get_tff_model)
    def server_optimizer_fn():
        return tf.keras.optimizers.SGD(learning_rate=1.0)

    def client_optimizer_fn():
        return tf.keras.optimizers.SGD(
            learning_rate=config.client_learning_rate,
            momentum=config.client_learning_rate_momentum,
            decay=config.client_learning_rate_decay)

    def keras_evaluate(model, test_data, metrics):
        metrics_collective, metrics_clients = metrics
        results_collective = {}
        results_clients = {}
        for client_id in metrics_clients.keys():
            results_clients[client_id] = {}

        for key, metric in metrics_collective.items():
            metric.reset_states()
            for client_id, client_ds in zip(metrics_clients.keys(), test_data):
                metrics_clients[client_id][key].reset_states()
                for batch in client_ds:
                    preds = model(batch['x'], training=False)
                    metric(batch['y'], preds)
                    metrics_clients[client_id][key](batch['y'], preds)
                results_clients[client_id][key] = metrics_clients[client_id][
                    key].result()
            results_collective[key] = metric.result()

        return dotdict(results_collective), results_clients

    iterative_process = simple_fedavg_tff.build_federated_averaging_process(
        m.get_simplefedavg_tff_model, server_optimizer_fn, client_optimizer_fn)
    state = iterative_process.initialize()

    tsprint('Beginning federated training.', 1)
    # This is overwritten at each step, if the number of sampled clients is smaller than
    # the total number of clients.
    federated_train_data = make_federated_data(config, data['train'],
                                               data['train'].client_ids)
    federated_val_data = make_federated_data(config,
                                             data['val'],
                                             data['val'].client_ids,
                                             final_eval=True)

    global_step = 0
    best_loss = 1e6
    monitoring_metrics_fns = {
        'loss': tf.keras.metrics.BinaryCrossentropy,
    }
    monitoring_metrics = {}
    monitoring_metrics_clients = {}
    for c in data['train'].client_ids:
        monitoring_metrics_clients[c] = {}

    for k, v in monitoring_metrics_fns.items():
        monitoring_metrics[k] = v()
        for c in data['train'].client_ids:
            monitoring_metrics_clients[c][k] = v()

    model = m.get_simplefedavg_tff_model()
    for _ in range(config.max_num_fl_rounds):
        global_step += 1

        if config.num_sampled_clients < config.num_clients:
            sample_clients = subsample_clients(config,
                                               data['train'].client_ids)
            federated_train_data = make_federated_data(config, data['train'],
                                                       sample_clients)
            federated_val_data = make_federated_data(config,
                                                     data['val'],
                                                     sample_clients,
                                                     final_eval=True)

        state, train_metrics = iterative_process.next(state,
                                                      federated_train_data)

        train_metrics_loss = train_metrics
        model.from_weights(state.model_weights)
        val_metrics, val_metrics_clients = keras_evaluate(
            model.keras_model, federated_val_data,
            [monitoring_metrics, monitoring_metrics_clients])

        tsprint(
            f'Round {global_step} loss: {train_metrics_loss} \t {val_metrics.loss}',
            0)

        wandb.log({f'train/loss': train_metrics_loss}, step=global_step)
        wandb.log({f'validation/loss': val_metrics.loss}, step=global_step)

        if val_metrics.loss < best_loss:
            tsprint(
                f'Updating best state (previous best: {best_loss}, new best: {val_metrics.loss}).',
                2)
            best_loss, best_state, best_step = val_metrics.loss, state, global_step

    tsprint('Begin final evaluation.', 1)
    wandb.log({f'client_average_frechet': client_average_frechet},
              step=global_step)

    final_eval_metrics_fns = {
        'loss': tf.keras.metrics.BinaryCrossentropy,
        'accuracy': tf.keras.metrics.BinaryAccuracy,
        'auc': tf.keras.metrics.AUC
    }
    final_eval_metrics = {}
    final_eval_metrics_clients = {}
    for c in data['train'].client_ids:
        final_eval_metrics_clients[c] = {}

    for k, v in final_eval_metrics_fns.items():
        final_eval_metrics[k] = v()
        for c in data['train'].client_ids:
            final_eval_metrics_clients[c][k] = v()

    for (split_str, ds) in data.items():
        federated_data = make_federated_data(config,
                                             ds,
                                             ds.client_ids,
                                             final_eval=True)

        model.from_weights(best_state.model_weights)
        cur_metrics, cur_metrics_clients = keras_evaluate(
            model.keras_model, federated_data,
            [final_eval_metrics, final_eval_metrics_clients])

        for k, v in cur_metrics_clients.items():
            tsprint(f'{k}: {v}', 1)

        for metric_str, metric in zip(
            ('loss', 'auc', 'accuracy'),
            (cur_metrics.loss, cur_metrics.auc, cur_metrics.accuracy)):
            wandb.log({f'final_{split_str}/{metric_str}': metric},
                      step=global_step)
            tsprint(f'final_{split_str}/{metric_str}: {metric}')

    tsprint('Done.')
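keras_evaluate wraps its collective results in a dotdict so the loop can read val_metrics.loss; that helper is not defined in these examples. A common minimal implementation (an assumption about the project's own version):

class dotdict(dict):
    """dict subclass that also allows attribute-style access: d.loss == d['loss']."""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__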
Example #10
def smelt(argv):
    cwd = backtick('pwd')
    my_command = f"cd {cwd}; " + ' '.join(argv)
    tsprint(my_command)
    _, subcmd, outdir, iggdb_toc = argv
    subcmd = subcmd.replace("-", "_").lower()
    SUBCOMMANDS = {
        f"collate_{gdim}": gdim
        for gdim in ["pangenomes", "repgenomes"]
    }
    gdim = SUBCOMMANDS.get(subcmd)
    try:
        assert gdim in ["pangenomes", "repgenomes"]
    except Exception as e:
        e.help_text = f"Try a supported subcommand instead of {subcmd}."
        raise
    makedirs(outdir, exist_ok=False)
    iggdb = IGGdb(iggdb_toc)
    tsprint(f"Now collating fasta for gsnap {gdim} index construction.")
    MAX_FAILURES = 100
    count_successes = 0
    ticker = ProgressTracker(target=len(iggdb.species_info))
    failures = []
    for s in iggdb.species_info:
        try:
            s_species_alt_id = s['species_alt_id']
            s_tempfile = f"{outdir}/temp_{gdim}_{s_species_alt_id}.fa"
            # Note how the header tags we emit below for pangenomes and repgenomes are consistent;
            # this should enable easy reconciliation of gsnap alignments against the
            # two separate indexes.
            if gdim == "pangenomes":
                #
                # The header tag we wish to emit would be
                #
                #    >4547837|1657.8.patric|rest_of_original_header_from_pangenome_file
                #
                # where
                #
                #    species_alt_id = 4547837                  # from species_alt_id column in table
                #    repgenome_with_origin = 1657.8.patric     # from original header in pangenome file
                #
                # As the original header in the pangenome file already begins
                # with s_rg_id_origin, we just need to prepend species_alt_id.
                #
                s_header_xform = f"sed 's=^>=>{s_species_alt_id}|=' {s['pangenome_path']} > {s_tempfile} && cat {s_tempfile} >> {outdir}/temp_{gdim}.fa && rm {s_tempfile} && echo SUCCEEDED || echo FAILED"
            else:
                assert gdim == "repgenomes"
                #
                # The header tag we wish to emit would be
                #
                #    >4547837|1657.8.patric|entire_original_header_from_repgenome_file
                #
                # where
                #
                #    species_alt_id = 4547837                # species_alt_id column in species_info
                #    repgenome_with_origin = 1657.8.patric   # from file listing in repgenomes dir
                #
                s_repgenome_with_origin = s['repgenome_with_origin']
                s_repgenome_path = s['repgenome_path']
                s_header_xform = f"sed 's=^>=>{s_species_alt_id}|{s_repgenome_with_origin}|=' {s_repgenome_path} > {s_tempfile} && cat {s_tempfile} >> {outdir}/temp_{gdim}.fa && rm {s_tempfile} && echo SUCCEEDED || echo FAILED"
            status = backtick(s_header_xform)
            assert status == "SUCCEEDED"
            count_successes += 1
        except Exception as e:
            failures.append(s)
            if len(failures) == MAX_FAILURES:
                count_examined = len(failures) + count_successes
                e.help_text = f"Giving up after {MAX_FAILURES} failures in first {count_examined} species.  See temp files for more info."
                raise
        finally:
            ticker.advance(1)
    failed_species_alt_ids = [s['species_alt_id'] for s in failures]
    if not failures:
        tsprint(
            f"All {len(iggdb.species_info)} species were processed successfully."
        )
    else:
        tsprint(
            f"Collation of {len(failures)} species failed.  Those are missing from the final {gdim}.fa"
        )
    # Create output file only on success.
    # Dump stats in json.
    collation_status = {
        "comment":
        f"Collation into {gdim}.fa succeeded on {time.asctime()} with command '{my_command}'.",
        "successfully_collated_species_count": count_successes,
        "failed_species_count": len(failures),
        "total_species_count": len(iggdb.species_info),
        "failed_species_alt_ids": failed_species_alt_ids,
        "elapsed_time": time.time() - ticker.t_start
    }
    collation_status_str = json.dumps(collation_status, indent=4)
    with open(f"{outdir}/{gdim}_collation_status.json", "w") as pcs:
        chars_written = pcs.write(collation_status_str)
        assert chars_written == len(collation_status_str)
        tsprint(collation_status_str)
    os.rename(f"{outdir}/temp_{gdim}.fa", f"{outdir}/{gdim}.fa")