Example #1
def train_vae(config_filename, input_data_filename):
    """
    Trains a VAE model using parameters set in the config file

    Arguments
    ----------
    config_filename: str
        File containing user defined parameters

    input_data_filename: str
        File path corresponding to input dataset to use

    """

    # Read in config variables
    params = utils.read_config(config_filename)

    # Load parameters
    base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
    dataset_name = params["dataset_name"]
    learning_rate = params["learning_rate"]
    batch_size = params["batch_size"]
    epochs = params["epochs"]
    kappa = params["kappa"]
    intermediate_dim = params["intermediate_dim"]
    latent_dim = params["latent_dim"]
    epsilon_std = params["epsilon_std"]
    train_architecture = params["NN_architecture"]
    validation_frac = params["validation_frac"]

    # Read data
    normalized_data = pd.read_csv(input_data_filename,
                                  header=0,
                                  sep="\t",
                                  index_col=0)

    print("input dataset contains {} samples and {} genes".format(
        normalized_data.shape[0], normalized_data.shape[1]))

    # Train (VAE)
    vae.tybalt_2layer_model(
        learning_rate,
        batch_size,
        epochs,
        kappa,
        intermediate_dim,
        latent_dim,
        epsilon_std,
        normalized_data,
        base_dir,
        dataset_name,
        train_architecture,
        validation_frac,
    )
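
A minimal usage sketch for train_vae; it assumes the module-level imports the function relies on (os, pandas as pd, utils, and vae) are already in place, and the file names and config values below are placeholders for illustration.

# A hypothetical config file with the keys that train_vae() reads
# (sketched here as tab-separated key/value pairs, matching the params[...] lookups above):
#
#   dataset_name        my_dataset
#   learning_rate       0.001
#   batch_size          100
#   epochs              40
#   kappa               0.01
#   intermediate_dim    2500
#   latent_dim          30
#   epsilon_std         1.0
#   NN_architecture     NN_2500_30
#   validation_frac     0.2

train_vae("configs/config_my_dataset.tsv", "data/normalized_compendium.tsv")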
Example #2
def normalize_expression_data(base_dir, config_filename,
                              raw_input_data_filename,
                              normalized_data_filename):
    """
    0-1 normalize the expression data.

    Arguments
    ----------
    base_dir: str
        Root directory containing analysis subdirectories

    config_filename: str
        File containing user defined parameters

    raw_input_data_filename: str
        File containing raw expression data

    normalized_data_filename: str
        Output file containing normalized expression data
    """
    # Read in config variables
    params = utils.read_config(config_filename)

    # Read data
    data = pd.read_csv(raw_input_data_filename,
                       header=0,
                       sep="\t",
                       index_col=0)
    print("input: dataset contains {} samples and {} genes".format(
        data.shape[0], data.shape[1]))

    # 0-1 normalize per gene
    scaler = preprocessing.MinMaxScaler()
    data_scaled_df = scaler.fit_transform(data)
    data_scaled_df = pd.DataFrame(data_scaled_df,
                                  columns=data.columns,
                                  index=data.index)

    print("Output: normalized dataset contains {} samples and {} genes".format(
        data_scaled_df.shape[0], data_scaled_df.shape[1]))

    # Save scaler transform
    scaler_filename = params["scaler_transform_filename"]

    with open(scaler_filename, "wb") as outfile:
        pickle.dump(scaler, outfile)

    # Save scaled data
    data_scaled_df.to_csv(normalized_data_filename, sep="\t", compression="xz")
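
A usage sketch for normalize_expression_data, again assuming the module-level imports it relies on (os, pandas as pd, pickle, sklearn's preprocessing, and utils) are in place; all paths below, including the scaler path taken from the config's scaler_transform_filename value, are hypothetical.

# Normalize a raw compendium, then reload the saved MinMaxScaler to invert the transform
normalize_expression_data(
    base_dir="..",
    config_filename="configs/config_my_dataset.tsv",
    raw_input_data_filename="data/raw_compendium.tsv",
    normalized_data_filename="data/normalized_compendium.tsv.xz",
)

# The scaler was pickled to the path given by params["scaler_transform_filename"]
with open("data/scaler_transform.pkl", "rb") as f:
    scaler = pickle.load(f)

normalized = pd.read_csv("data/normalized_compendium.tsv.xz", sep="\t", index_col=0)
recovered = scaler.inverse_transform(normalized)  # back to the original expression scale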
Example #3
# %load_ext autoreload
# %autoreload 2

import os
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from ponyo import utils

# +
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
config_filename = os.path.abspath(
    os.path.join(base_dir, "configs", "config_human_general.tsv"))

params = utils.read_config(config_filename)

local_dir = params["local_dir"]

project_id1 = "SRP012656"
project_id2 = "SRP061689"

# +
# Get data directory containing gene summary data
data_dir = os.path.join(base_dir, "human_general_analysis")

# Get gene ranking files
gene_ranking_filename1 = os.path.join(
    data_dir, f"generic_gene_summary_{project_id1}.tsv")
gene_ranking_filename1_run2 = os.path.join(
    data_dir, f"generic_gene_summary_{project_id1}_run2.tsv")
def run_simulation(config_file,
                   input_data_file,
                   corrected,
                   experiment_ids_file=None):
    """
    Runs a simulation experiment, either sample-level or experiment-level, with or
    without correction. Which simulation is run depends on the parameters passed in
    and those specified in the config file.

    Arguments
    ----------
    config_file: str
        File containing user defined parameters

    input_data_file: str
        File path corresponding to input dataset to use

    corrected: bool
        True if the simulation applies noise correction

    experiment_ids_file: str
        File containing experiment IDs with associated expression data, generated by `create_experiment_id_file`

    """

    # Read in config variables
    params = utils.read_config(config_file)

    # Load parameters
    dataset_name = params["dataset_name"]
    simulation_type = params["simulation_type"]
    NN_architecture = params["NN_architecture"]
    use_pca = params["use_pca"]
    num_PCs = params["num_PCs"]
    local_dir = params["local_dir"]
    correction_method = params["correction_method"]
    sample_id_colname = params["metadata_colname"]
    iterations = params["iterations"]
    num_cores = params["num_cores"]

    if "sample" in simulation_type:
        num_simulated_samples = params["num_simulated_samples"]
        lst_num_experiments = params["lst_num_experiments"]
    else:
        num_simulated_experiments = params["num_simulated_experiments"]
        lst_num_partitions = params["lst_num_partitions"]

    # Output files
    base_dir = os.path.abspath(os.pardir)
    if corrected:
        similarity_file = os.path.join(
            base_dir,
            dataset_name,
            "results",
            "saved_variables",
            f"{dataset_name}_{simulation_type}_svcca_corrected_{correction_method}.pickle",
        )

        ci_file = os.path.join(
            base_dir,
            dataset_name,
            "results",
            "saved_variables",
            f"{dataset_name}_{simulation_type}_ci_corrected_{correction_method}.pickle",
        )

    else:
        similarity_file = os.path.join(
            base_dir,
            dataset_name,
            "results",
            "saved_variables",
            f"{dataset_name}_{simulation_type}_svcca_uncorrected_{correction_method}.pickle",
        )

        ci_file = os.path.join(
            base_dir,
            dataset_name,
            "results",
            "saved_variables",
            f"{dataset_name}_{simulation_type}_ci_uncorrected_{correction_method}.pickle",
        )

    similarity_permuted_file = os.path.join(
        base_dir,
        dataset_name,
        "results",
        "saved_variables",
        dataset_name + "_" + simulation_type + "_permuted",
    )

    # Run multiple simulations
    if "sample" in simulation_type:
        if corrected:
            file_prefix = "Experiment_corrected"
        else:
            file_prefix = "Experiment"
        results = Parallel(n_jobs=num_cores, verbose=100)(
            delayed(simulations.sample_level_simulation)(
                i,
                NN_architecture,
                dataset_name,
                simulation_type,
                num_simulated_samples,
                lst_num_experiments,
                corrected,
                correction_method,
                use_pca,
                num_PCs,
                file_prefix,
                input_data_file,
                local_dir,
                base_dir,
            ) for i in iterations)

    else:
        if corrected:
            file_prefix = "Partition_corrected"
        else:
            file_prefix = "Partition"
        results = Parallel(n_jobs=num_cores, verbose=100)(
            delayed(simulations.experiment_level_simulation)(
                i,
                NN_architecture,
                dataset_name,
                simulation_type,
                num_simulated_experiments,
                lst_num_partitions,
                corrected,
                correction_method,
                use_pca,
                num_PCs,
                file_prefix,
                input_data_file,
                experiment_ids_file,
                sample_id_colname,
                local_dir,
                base_dir,
            ) for i in iterations)

    # Permuted score (taken from the first simulation run)
    permuted_score = results[0][0]

    # Concatenate output dataframes
    all_svcca_scores = pd.DataFrame()

    for i in iterations:
        all_svcca_scores = pd.concat([all_svcca_scores, results[i][1]], axis=1)

    # Get mean svcca score for each row (number of experiments)
    mean_scores = all_svcca_scores.mean(axis=1).to_frame()
    mean_scores.columns = ["score"]
    print(mean_scores)

    # Get standard error of the mean for each row (number of experiments);
    # the sqrt(10) assumes 10 iterations
    std_scores = (all_svcca_scores.std(axis=1) / math.sqrt(10)).to_frame()
    std_scores.columns = ["score"]
    print(std_scores)

    # Get confidence interval for each row (number of experiments)
    # z-score for 95% confidence interval
    err = std_scores * 1.96

    # Get boundaries of confidence interval
    ymax = mean_scores + err
    ymin = mean_scores - err

    ci = pd.concat([ymin, ymax], axis=1)
    ci.columns = ["ymin", "ymax"]
    print(ci)

    # Pickle dataframe of mean scores and confidence interval; save permuted score
    mean_scores.to_pickle(similarity_file)
    ci.to_pickle(ci_file)
    np.save(similarity_permuted_file, permuted_score)
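
A usage sketch for run_simulation; the paths are hypothetical, and the module-level imports the function relies on (os, math, numpy as np, pandas as pd, joblib's Parallel/delayed, utils, and the simulations module) are assumed to be in place.

# Run an experiment-level simulation with noise correction applied;
# which simulation type runs is controlled by the config file.
run_simulation(
    config_file="configs/config_my_dataset.tsv",
    input_data_file="data/normalized_compendium.tsv",
    corrected=True,
    experiment_ids_file="data/experiment_ids.txt",
)
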
def run_experiment_effect_simulation(
    config_file,
    input_data_file,
    num_simulated_experiments,
    lst_num_partitions,
    experiment_ids_file=None,
):
    """
    Runs an experiment-level simulation keeping the size of partitions
    fixed (i.e. one experiment per partition). This simulation examines
    the contribution of individual experiments to signal detection.

    Arguments
    ----------
    config_file: str
        File containing user defined parameters

    input_data_file: str
        File path corresponding to input dataset to use

    num_simulated_experiments: int
        Number of experiments to simulate

    lst_num_partitions: list
        List of the numbers of partitions to evaluate

    experiment_ids_file: str
        File containing experiment IDs with associated expression data, generated by `create_experiment_id_file`

    """

    # Read in config variables
    params = utils.read_config(config_file)

    # Load parameters
    dataset_name = params["dataset_name"]
    simulation_type = params["simulation_type"]
    NN_architecture = params["NN_architecture"]
    use_pca = params["use_pca"]
    num_PCs = params["num_PCs"]
    local_dir = params["local_dir"]
    correction_method = params["correction_method"]
    sample_id_colname = params["metadata_colname"]
    iterations = params["iterations"]
    num_cores = params["num_cores"]

    # Output files
    base_dir = os.path.abspath(os.pardir)

    # Run multiple simulations
    results = Parallel(n_jobs=num_cores, verbose=100)(
        delayed(simulations.experiment_effect_simulation)(
            i,
            NN_architecture,
            dataset_name,
            simulation_type,
            num_simulated_experiments,
            lst_num_partitions,
            correction_method,
            use_pca,
            num_PCs,
            input_data_file,
            experiment_ids_file,
            sample_id_colname,
            local_dir,
            base_dir,
        ) for i in iterations)

    # permuted score
    permuted_score = results[0][0]

    # Concatenate output dataframes
    uncorrected_svcca_scores = pd.DataFrame()
    corrected_svcca_scores = pd.DataFrame()

    for i in iterations:
        uncorrected_svcca_scores = pd.concat(
            [uncorrected_svcca_scores, results[i][1]], axis=1)
        corrected_svcca_scores = pd.concat(
            [corrected_svcca_scores, results[i][2]], axis=1)

    # Get mean svcca score for each row (number of experiments)
    uncorrected_mean_scores = uncorrected_svcca_scores.mean(axis=1).to_frame()
    uncorrected_mean_scores.columns = ["score"]
    corrected_mean_scores = corrected_svcca_scores.mean(axis=1).to_frame()
    corrected_mean_scores.columns = ["score"]
    print("mean uncorrected svcca scores")
    print(uncorrected_mean_scores)
    print("mean corrected svcca scores")
    print(corrected_mean_scores)

    # Get CI for each row (number of experiments)
    ci_threshold = 0.95
    alpha = 1 - ci_threshold
    offset = int(len(iterations) * (alpha / 2))

    # Get CI for uncorrected data
    ymax = []
    ymin = []
    for size_compendia in [1, num_simulated_experiments]:
        sort_scores = sorted(uncorrected_svcca_scores.loc[size_compendia])
        ymin.append(sort_scores[offset])
        if offset == 0:
            ymax.append(sort_scores[-1])
        else:
            ymax.append(sort_scores[len(iterations) - offset])

    ci_uncorrected = pd.DataFrame(
        data={"ymin": ymin, "ymax": ymax},
        index=[1, num_simulated_experiments],
    )

    print("uncorrected confidence interval")
    print(ci_uncorrected)

    # Get CI for corrected data
    ymax = []
    ymin = []
    for size_compendia in [1, num_simulated_experiments]:
        sort_scores = sorted(corrected_svcca_scores.loc[size_compendia])
        ymin.append(sort_scores[offset])
        if offset == 0:
            ymax.append(sort_scores[-1])
        else:
            ymax.append(sort_scores[len(iterations) - offset])

    ci_corrected = pd.DataFrame(
        data={"ymin": ymin, "ymax": ymax},
        index=[1, num_simulated_experiments],
    )

    print("corrected_confidence interval")
    print(ci_corrected)

    return (
        uncorrected_mean_scores,
        ci_uncorrected,
        permuted_score,
        corrected_mean_scores,
        ci_corrected,
    )
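
A usage sketch for run_experiment_effect_simulation with hypothetical paths and sizes; it returns the mean SVCCA scores and confidence intervals for the uncorrected and corrected data, plus the permuted score.

(
    uncorrected_mean,
    ci_uncorrected,
    permuted_score,
    corrected_mean,
    ci_corrected,
) = run_experiment_effect_simulation(
    config_file="configs/config_my_dataset.tsv",
    input_data_file="data/normalized_compendium.tsv",
    num_simulated_experiments=600,
    lst_num_partitions=[1, 600],
    experiment_ids_file="data/experiment_ids.txt",
)

# Compare the uncorrected and corrected mean scores row by row
print(uncorrected_mean["score"] - corrected_mean["score"])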