Пример #1
0
    def __init__(self, name, settings_file, alpha, extensions, interest):
        """
        Set up the instance from the settings file and the given arguments.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param alpha: float, the significance cut-off.
        :param extensions: the output figure file type extension(s).
        :param interest: list, the HGNC names to print.
        """
        # The base directory is the grandparent directory of this file.
        module_dir = Path(__file__).parent
        base_dir = str(module_dir.parent)

        # Load the LocalSettings singleton class and read the settings.
        self.settings = LocalSettings(base_dir, settings_file)
        read_setting = self.settings.get_setting
        self.covs = read_setting("covariates_to_include")
        # The covariates excluded from the overview are stored in lower case.
        excluded_covs = read_setting("covariates_excl_from_overview")
        self.covs_excl_from_overview = [cov.lower() for cov in excluded_covs]
        self.max_url_len = read_setting("max_url_length")
        self.maf_cutoff = read_setting("maf_cutoff")
        self.include_top_n = read_setting("include_top_n")

        # Store the arguments.
        self.name = name
        self.alpha = alpha
        self.extensions = extensions
        self.interest = interest

        # The output directory is <base directory>/<name>.
        self.outdir = os.path.join(base_dir, name)
        prepare_output_dir(self.outdir)
Пример #2
0
    def __init__(self, settings, profile_file, profile_df, ct_expr_file,
                 ct_expr_df, force, outdir):
        """
        Store the inputs and prepare the 'perform_deconvolution' output.

        :param settings: the settings, indexed by setting name.
        :param profile_file: string, the datafile containing the celltype
                             profile.
        :param profile_df: DataFrame, the celltype profile.
        :param ct_expr_file: string, the datafile containing expression
                             of the celltype profiles.
        :param ct_expr_df: the celltype expression.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        # Settings: the sample-cohort datafile and its identifier columns.
        self.sample_cohort_file = settings["sample_cohort_datafile"]
        self.sample_id = settings["sample_cohort_identifiers"]["sample"]
        self.cohort_id = settings["sample_cohort_identifiers"]["cohort"]

        # Arguments.
        self.force = force
        self.profile_file = profile_file
        self.profile_df = profile_df
        self.ct_expr_file = ct_expr_file
        self.ct_expr_df = ct_expr_df

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'perform_deconvolution')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)

        # Output path of the deconvolution table.
        self.outpath = os.path.join(step_outdir, "deconvolution_table.txt.gz")

        # Placeholder for the result; starts out empty.
        self.deconvolution = None
Пример #3
0
    def __init__(self, settings, eqtl_df, geno_df, alleles_df, expr_df, cov_df,
                 groups_file, force, outdir):
        """
        Store the input data, load and filter the groups, and prepare the
        'create_groups' output directory.

        :param settings: the settings, indexed by setting name.
        :param eqtl_df: DataFrame, the eQTL probes data.
        :param geno_df: DataFrame, the genotype data.
        :param alleles_df: DataFrame, the alleles data.
        :param expr_df: DataFrame, the expression data.
        :param cov_df: DataFrame, the covariate data.
        :param groups_file: string, path to the (pickled) groups file.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        self.force = force
        self.eqtl_df = eqtl_df
        self.geno_df = geno_df
        self.alleles_df = alleles_df
        self.expr_df = expr_df
        self.cov_df = cov_df

        # Load the pickled groups.
        # NOTE(review): pickle.load executes arbitrary code on malicious
        # input; the groups file must come from a trusted source.
        with open(groups_file, "rb") as handle:
            unfiltered_groups = pickle.load(handle)

        # Keep only the groups that pass the minimum eQTL / sample settings.
        min_eqtls = settings["min_eqtl_in_group"]
        min_samples = settings["min_samples_in_group"]
        self.groups = self.filter_groups(unfiltered_groups,
                                         min_eqtls,
                                         min_samples)
        del unfiltered_groups

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'create_groups')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)
Пример #4
0
    def __init__(self, dataset, outdir, extension):
        """
        Prepare the 'inter_eqtl_effect' output directory and pull the
        required data from the dataset.

        :param dataset: Dataset, the input data.
        :param outdir: string, the output directory.
        :param extension: str, the output figure file type extension.
        """
        plot_outdir = os.path.join(outdir, 'inter_eqtl_effect')
        self.outdir = plot_outdir
        prepare_output_dir(plot_outdir)
        self.extension = extension

        # Set the pdf font type used when exporting figures.
        matplotlib.rcParams['pdf.fonttype'] = 42

        # Pull everything this plot needs from the dataset.
        print("Loading data")
        self.eqtl_df = dataset.get_eqtl_df()
        self.geno_df = dataset.get_geno_df()
        self.expr_df = dataset.get_expr_df()
        self.alleles_df = dataset.get_alleles_df()
        self.cov_df = dataset.get_cov_df()
        self.inter_df = dataset.get_inter_cov_zscore_df()
        self.z_score_cutoff = dataset.get_significance_cutoff()
        colors = dataset.get_colormap()

        # Derive the group / value color maps and the per-sex colors.
        color_maps = self.create_color_map(colors)
        self.group_color_map, self.value_color_map = color_maps
        self.sex_color_map = dict(Male=colors["male"],
                                  Female=colors["female"])
Пример #5
0
    def __init__(self, name, settings_file, alpha, plots, top, interest,
                 extension, validate):
        """
        Load the settings, store the arguments and prepare the output
        directory.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param alpha: float, the significance cut-off.
        :param plots: list, the names of the plots to create.
        :param top: int, the number of top eQTLs to plot.
        :param interest: list, the indices of interest to plot.
        :param extension: str, the output figure file type extension.
        :param validate: boolean, whether or not to validate the input.
        """
        # The base directory is the grandparent directory of this file.
        module_dir = Path(__file__).parent
        base_dir = str(module_dir.parent)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(base_dir, settings_file)

        # Store the arguments.
        self.name, self.alpha = name, alpha
        self.plots, self.top = plots, top
        self.interest = interest
        self.extension = extension
        self.validate = validate

        # The output directory is <base directory>/<name>.
        self.outdir = os.path.join(base_dir, name)
        prepare_output_dir(self.outdir)
    def __init__(self, dataset, outdir, extension):
        """
        Prepare the 'inter_eqtl_effect_deconvolution' output directory and
        pull the required data from the dataset.

        :param dataset: Dataset, the input data.
        :param outdir: string, the output directory.
        :param extension: str, the output figure file type extension.
        """
        plot_outdir = os.path.join(outdir, 'inter_eqtl_effect_deconvolution')
        self.outdir = plot_outdir
        prepare_output_dir(plot_outdir)
        self.extension = extension

        # Set the pdf font type used when exporting figures.
        matplotlib.rcParams['pdf.fonttype'] = 42

        # Pull everything this plot needs from the dataset.
        print("Loading data")
        self.eqtl_df = dataset.get_eqtl_df()
        self.geno_df = dataset.get_geno_df()
        self.expr_df = dataset.get_expr_df()
        self.alleles_df = dataset.get_alleles_df()
        self.cov_df = dataset.get_cov_df()
        self.inter_df = dataset.get_inter_cov_zscore_df()
        self.celltypes = dataset.get_celltypes()
        self.cellmap_methods = dataset.get_cellmap_methods()
        self.marker_genes = dataset.get_marker_genes()

        # Build the group and value color maps.
        color_maps = self.create_color_map()
        self.group_color_map, self.value_color_map = color_maps
Пример #7
0
    def __init__(self, dataset, outdir, extension):
        """
        Prepare the 'inter_eqtl_celltype_details' output directory and pull
        the required data from the dataset.

        :param dataset: Dataset, the input data.
        :param outdir: string, the output directory.
        :param extension: str, the output figure file type extension.
        """
        plot_outdir = os.path.join(outdir, 'inter_eqtl_celltype_details')
        self.outdir = plot_outdir
        prepare_output_dir(plot_outdir)
        self.extension = extension

        # Set the pdf font type used when exporting figures.
        matplotlib.rcParams['pdf.fonttype'] = 42

        # Pull everything this plot needs from the dataset.
        print("Loading data")
        self.eqtl_df = dataset.get_eqtl_df()
        self.geno_df = dataset.get_geno_df()
        self.zscore_df = dataset.get_inter_cov_zscore_df()
        self.tvalue_df = dataset.get_inter_cov_inter_tvalue_df()
        self.cellmap_methods = dataset.get_cellmap_methods()
        self.marker_genes = dataset.get_marker_genes()
        self.z_score_cutoff = dataset.get_significance_cutoff()
        self.colormap = dataset.get_colormap()
    def __init__(self, settings, profile_file, profile_df, ct_expr_file, force,
                 outdir):
        """
        Store the inputs and prepare the 'perform_celltype_factorization'
        output directory and paths.

        :param settings: the settings (not used by this initializer).
        :param profile_file: string, the datafile containing the celltype
                             profile.
        :param profile_df: DataFrame, the celltype profile.
        :param ct_expr_file: string, the datafile containing expression
                             of the celltype profiles.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        self.force = force
        self.profile_file = profile_file
        self.profile_df = profile_df
        self.ct_expr_file = ct_expr_file

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'perform_celltype_factorization')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)

        # Output paths of the PCA and NMF tables.
        self.pca_outpath = os.path.join(step_outdir, "celltype_pca.txt.gz")
        self.nmf_outpath = os.path.join(step_outdir, "celltype_nmf.txt.gz")

        # Placeholders for the results; all start out empty.
        self.celltype_expression = None
        self.celltype_pcs = None
        self.celltype_cs = None
Пример #9
0
    def __init__(self, settings, eqtl_df, geno_df, alleles_df, expr_df, cov_df,
                 force, outdir):
        """
        Store the input data and prepare the 'mask_matrices' output
        directory.

        :param settings: the settings (not used by this initializer).
        :param eqtl_df: DataFrame, the eQTL data.
        :param geno_df: DataFrame, the genotype data.
        :param alleles_df: DataFrame, the alleles data.
        :param expr_df: DataFrame, the expression data.
        :param cov_df: DataFrame, the covariate data.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        self.force = force
        self.eqtl_df = eqtl_df
        self.geno_df = geno_df
        self.alleles_df = alleles_df
        self.expr_df = expr_df
        self.cov_df = cov_df

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'mask_matrices')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)
Пример #10
0
    def __init__(self, settings, disease, force, outdir):
        """
        Read the input settings and prepare the 'combine_eqtlprobes' output
        directory and path.

        :param settings: the settings, indexed by setting name.
        :param disease: string, the name of the disease to analyse.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        # Settings describing where and how the input is stored.
        self.indir = settings["input_directory"]
        self.iter_dirname = settings["iteration_dirname"]
        self.in_filename = settings["in_filename"]
        self.n_iterations = settings["iterations"]
        self.snp_to_gwasid_filename = settings["snp_to_gwasid_filename"]
        self.gwasid_to_trait_filename = settings["gwasid_to_trait_filename"]

        # Arguments.
        self.force = force
        self.disease = disease

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'combine_eqtlprobes')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)
        self.outpath = os.path.join(step_outdir, "eQTLprobes_combined.txt.gz")

        # Placeholder for the result; starts out empty.
        self.eqtl_probes = None
Пример #11
0
    def __init__(self, name, settings_file, skip_rows, n_eqtls, n_samples,
                 verbose):
        """
        Load the settings, prepare the output directory and derive the
        input paths, output names and time limits.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param skip_rows: int, the number of rows to skip.
        :param n_eqtls: int, the number of eqtls in the input files.
        :param n_samples: int, the number of samples in the input files.
        :param verbose: boolean, whether or not to print all update info.
        """
        # The base directory is the grandparent directory of this file.
        module_dir = Path(__file__).parent
        base_dir = str(module_dir.parent)

        # Load the LocalSettings singleton class.
        settings = LocalSettings(base_dir, settings_file)
        read_setting = settings.get_setting

        # The output directory is <base directory>/<name>.
        self.outdir = os.path.join(base_dir, name)
        prepare_output_dir(self.outdir)

        # Input paths: <input_dir>/<name>/<filename>.
        input_dir = read_setting("input_dir")
        filenames = read_setting("filenames")
        data_dir = os.path.join(input_dir, name)
        self.geno_inpath = os.path.join(data_dir, filenames["genotype"])
        self.expr_inpath = os.path.join(data_dir, filenames["expression"])
        self.cov_inpath = os.path.join(data_dir, filenames["covariates"])

        # Covariate settings and output folder names.
        self.drop_covs = read_setting("drop_covariates")
        self.tech_covs = read_setting("technical_covariates")
        self.cov_outdir = read_setting("covariates_folder")
        self.tech_cov_outdir = read_setting("technical_covariates_folder")

        # Base names of the pickle files.
        self.perm_orders_filename = read_setting(
            "permutations_order_pickle_filename")
        self.pvalues_filename = read_setting("actual_pvalues_pickle_filename")
        self.snp_tvalues_filename = read_setting("snp_tvalues_pickle_filename")
        self.inter_tvalues_filename = read_setting(
            "inter_tvalues_pickle_filename")
        self.perm_pvalues_filename = read_setting(
            "permuted_pvalues_pickle_filename")
        self.n_permutations = read_setting("n_permutations")

        # Time limits as epoch seconds: the maximum end time, and the panic
        # time that lies a configured number of minutes before it.
        now = int(time.time())
        max_runtime_sec = read_setting("max_runtime_in_hours") * 60 * 60
        self.max_end_time = now + max_runtime_sec
        panic_margin_sec = read_setting("panic_time_in_min") * 60
        self.panic_time = self.max_end_time - panic_margin_sec

        # Store the remaining arguments.
        self.skip_rows = skip_rows
        self.n_eqtls = n_eqtls
        self.n_samples = n_samples
        self.verbose = verbose
Пример #12
0
    def __init__(self, df, outdir, extension="png"):
        """
        Store the data and prepare the 'plots' output directory.

        :param df: the input data; stored as returned by self.set_df().
        :param outdir: string, the output directory.
        :param extension: str, the output figure file type extension
                          (default: "png").
        """
        self.df = self.set_df(df)

        plot_outdir = os.path.join(outdir, 'plots')
        self.outdir = plot_outdir
        self.extension = extension

        prepare_output_dir(plot_outdir)

        # The pdf font type only needs to be set when exporting pdf files.
        exports_pdf = self.extension == "pdf"
        if exports_pdf:
            matplotlib.rcParams['pdf.fonttype'] = 42
Пример #13
0
    def __init__(self, dataset, outdir, extension):
        """
        Prepare the 'inter_zscore_dist' output directory and pull the
        required data from the dataset.

        :param dataset: Dataset, the input data.
        :param outdir: string, the output directory.
        :param extension: str, the output figure file type extension.
        """
        plot_outdir = os.path.join(outdir, 'inter_zscore_dist')
        self.outdir = plot_outdir
        prepare_output_dir(plot_outdir)
        self.extension = extension

        # Set the pdf font type used when exporting figures.
        matplotlib.rcParams['pdf.fonttype'] = 42

        # Pull everything this plot needs from the dataset.
        print("Loading data")
        self.inter_df = dataset.get_inter_cov_zscore_df()
        self.z_score_cutoff = dataset.get_significance_cutoff()
Пример #14
0
    def __init__(self, dataset, outdir, extension):
        """
        Prepare the 'covariate_clustermap' output directory and pull the
        required data from the dataset.

        :param dataset: Dataset, the input data.
        :param outdir: string, the output directory.
        :param extension: str, the output figure file type extension.
        """
        plot_outdir = os.path.join(outdir, 'covariate_clustermap')
        self.outdir = plot_outdir
        prepare_output_dir(plot_outdir)
        self.extension = extension

        # Set the pdf font type used when exporting figures.
        matplotlib.rcParams['pdf.fonttype'] = 42

        # Pull everything this plot needs from the dataset.
        print("Loading data")
        self.cov_df = dataset.get_cov_df()
        self.cmap = dataset.get_diverging_cmap()
Пример #15
0
    def __init__(self, settings_file, groups, force, verbose):
        """
        Load the settings, prepare the output directory and determine the
        input directories to analyse.

        :param settings_file: string, the name of the settings file.
        :param groups: list, the names of groups to analyse.
        :param force: boolean, whether or not to force to redo each step.
        :param verbose: boolean, whether or not to print each step.
        """
        # The base directory is the grandparent directory of this file.
        module_dir = Path(__file__).parent
        base_dir = str(module_dir.parent)

        # Load the LocalSettings singleton class.
        settings = LocalSettings(base_dir, settings_file)
        read_setting = settings.get_setting

        # Save the settings and arguments.
        self.indir = read_setting("input_dir")
        self.tech_covs = read_setting("technical_covariates")
        self.eqtl_ia = read_setting("eQTLInteractionAnalyser")
        self.inter_regex = read_setting("interaction_regex")
        self.groups = groups
        self.force = force
        self.verbose = verbose

        # The output directory is <base directory>/<output_dir setting>.
        self.outdir = os.path.join(base_dir, read_setting("output_dir"))
        prepare_output_dir(self.outdir)

        # Without groups the input directory itself is analysed; otherwise
        # the 'group_*' entries in it are passed through filter_groups().
        if self.groups is None:
            self.group_indirs = [self.indir]
        else:
            group_pattern = os.path.join(self.indir, 'group_*')
            candidate_dirs = glob.glob(group_pattern)
            self.group_indirs = self.filter_groups(candidate_dirs)

        # Save the input filenames.
        filenames = read_setting("filenames")
        self.eqtl_filename = filenames["eqtl"]
        self.geno_filename = filenames["genotype"]
        self.expr_filename = filenames["expression"]
        self.cov_filename = filenames["covariate"]
Пример #16
0
    def __init__(self, settings, marker_file, celltype_pcs, celltype_cs,
                 deconvolution, sample_order, force, outdir):
        """
        Read the covariate settings, store the inputs and prepare the
        'create_cov_matrix' output directory and path.

        :param settings: the settings, indexed by setting name.
        :param marker_file: string, path to the marker file.
        :param celltype_pcs: DataFrame, the first component from PCA of each
                            celltype expression.
        :param celltype_cs: DataFrame, the first component from NMF of each
                            celltype expression.
        :param deconvolution: DataFrame, the estimated cell count proportions
                              of each celltype per sample.
        :param sample_order: list, order of samples.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        # Settings.
        self.cov_file = settings["covariate_datafile"]
        self.tech_covs = settings["technical_covariates"]
        self.cohorts = settings["cohorts"]
        self.ref_cohort = settings["reference_cohort"]
        self.pheno_file = settings["phenotype_datafile"]
        self.eig_file = settings["eigenvectors_datafile"]
        self.n_eigen = settings["num_eigenvectors"]
        eig_before_key = "eigenvectors_before_cov_corr_datafile"
        self.eig_bef_cov_corr_file = settings[eig_before_key]

        # Arguments.
        self.force = force
        self.marker_file = marker_file
        self.sample_order = sample_order
        self.celltype_pcs = celltype_pcs
        self.celltype_cs = celltype_cs
        self.deconvolution = deconvolution

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'create_cov_matrix')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)
        self.outpath = os.path.join(step_outdir, "covariates_table.txt.gz")

        # Placeholder for the result, and the numeric encoding of the sex
        # labels (a missing value maps to -1).
        self.covariates = None
        self.sex_dict = {"M": 0, "F": 1, np.nan: -1}
Пример #17
0
    def __init__(self, settings, force, outdir):
        """
        Build the input path and prepare the 'combine_gte_files' output
        directory and path.

        :param settings: the settings, indexed by setting name.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        # The input path is <input_directory>/<filename_regex>.
        input_directory = settings["input_directory"]
        filename_regex = settings["filename_regex"]
        self.inpath = os.path.join(input_directory, filename_regex)
        self.force = force

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'combine_gte_files')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)
        self.outpath = os.path.join(step_outdir, "GTE_combined.txt.gz")

        # Placeholders for the results; all start out empty.
        self.gte = None
        self.sample_dict = None
        self.sample_order = None
Пример #18
0
    def __init__(self, settings_file, groups, force):
        """
        Load the settings, filter the groups and prepare the output
        directory.

        :param settings_file: string, the name of the settings file.
        :param groups: list, the names of groups to analyse.
        :param force: boolean, whether or not to force to redo each step.
        """
        # The base directory is the grandparent directory of this file.
        module_dir = Path(__file__).parent
        base_dir = str(module_dir.parent)

        # Load the LocalSettings singleton class.
        settings = LocalSettings(base_dir, settings_file)
        read_setting = settings.get_setting

        # Save the settings and arguments. The statement order is kept
        # because filter_groups() may read attributes set before it.
        self.eqtl_inpath = read_setting("eqtl_datafile")
        self.cov_inpath = read_setting("cov_datafile")
        self.data_indir = read_setting("data_dir")
        self.g_data_indir = read_setting("groups_data_dir")
        self.g_inter_indir = read_setting("inter_groups_dir")
        self.inter_regex = read_setting("interaction_regex")
        self.group_ids = self.filter_groups(groups)
        self.celltypes = read_setting("celltypes")
        self.force = force

        # The output directory is <base directory>/<output_dir setting>.
        self.outdir = os.path.join(base_dir, read_setting("output_dir"))
        prepare_output_dir(self.outdir)

        # Save the filenames.
        filenames = read_setting("filenames")
        self.obj_filename = filenames["object"]
        self.eqtl_filename = filenames["eqtl"]
        self.geno_filename = filenames["genotype"]
        self.alleles_filename = filenames["alleles"]
        self.expr_filename = filenames["expression"]
        self.cov_filename = filenames["covariates"]
        self.inter_filename = filenames["interaction"]
        self.markers_filename = filenames["markers"]
Пример #19
0
    def __init__(self, name, settings_file, disease, force_steps):
        """
        Load the settings, store the arguments and prepare the output
        directory.

        :param name: string, the name of the base input/output directory.
        :param settings_file: string, the name of the settings file.
        :param disease: string, the name of the disease to analyse.
        :param force_steps: list, the names of the steps to force to redo.
        """
        # The base directory is the grandparent directory of this file.
        module_dir = Path(__file__).parent
        base_dir = str(module_dir.parent)

        # Load the LocalSettings singleton class.
        self.settings = LocalSettings(base_dir, settings_file)

        # Save the arguments; the steps to force are converted by
        # create_force_dict().
        self.disease = disease
        forced = self.create_force_dict(force_steps)
        self.force_dict = forced

        # The output directory is <base directory>/<name>.
        self.outdir = os.path.join(base_dir, name)
        prepare_output_dir(self.outdir)
    def __init__(self, settings, expr_file, expr_df, sample_dict, sample_order,
                 force, outdir):
        """
        Read the deconvolution settings, store the inputs and prepare the
        'create_deconvolution_matrices' output directory and paths.

        :param settings: the settings, indexed by setting name.
        :param expr_file: string, the expression data file.
        :param expr_df: DataFrame, the complete expression dataframe.
        :param sample_dict: dictionary, a dictionary for translating unmasked
                            samples to the same format.
        :param sample_order: list, order of samples.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        # Settings.
        self.decon_expr_file = settings["decon_expression_datafile"]
        self.celltype_profile_file = settings["celltype_profile_datafile"]
        self.translate_file = settings["translate_datafile"]
        self.marker_genes_suffix = settings["marker_genes_suffix"]
        self.marker_dict = settings["marker_dict"]

        # Arguments.
        self.force = force
        self.expr_file = expr_file
        self.expr_df = expr_df
        self.sample_dict = sample_dict
        self.sample_order = sample_order

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'create_deconvolution_matrices')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)

        # Output paths of the three tables written by this step.
        self.decon_expr_outpath = os.path.join(
            step_outdir, "decon_expr_table.txt.gz")
        self.ct_profile_expr_outpath = os.path.join(
            step_outdir, "ct_profile_expr_table.txt.gz")
        self.markers_outpath = os.path.join(step_outdir, "marker_genes.txt.gz")

        # Placeholder for the result; starts out empty.
        self.celltype_profile = None
Пример #21
0
    def save_per_group(self):
        """
        Save self.df split per interaction, per covariate and per direction.

        For every unique value in the "Interaction" column an output
        directory '<interaction>_interaction' is prepared inside
        self.outdir. Within it, one file is saved per covariate and one per
        covariate / direction ("up", "down") combination, using self.save()
        with self.max_url_len and self.signif_cutoff.

        :return: list, the sorted unique "Index" values of the first
                 self.top rows of every saved direction subset.
        """
        indices_of_interest = set()

        for interaction in self.df["Interaction"].unique():
            inter_df = self.df.loc[self.df["Interaction"] == interaction, :].copy()
            inter_df.drop(["Interaction"], axis=1, inplace=True)
            if len(inter_df.index) == 0:
                # An empty selection (e.g. a NaN interaction, which never
                # compares equal to itself) must only skip this interaction
                # and not abort the remaining ones.
                continue

            out_dir = os.path.join(self.outdir, '{}_interaction'.format(interaction))
            prepare_output_dir(out_dir)

            for covariate in inter_df["Covariate"].unique():
                cov_df = inter_df.loc[inter_df["Covariate"] == covariate, :].copy()
                cov_df.drop(["Covariate"], axis=1, inplace=True)
                if len(cov_df.index) == 0:
                    continue

                fpath = os.path.join(out_dir,
                                     "{}_{}.txt".format(interaction, covariate))
                self.save(cov_df, fpath, self.max_url_len, self.signif_cutoff)

                for direction in ["up", "down"]:
                    dir_df = cov_df.loc[cov_df["Direction"] == direction, :].copy()
                    dir_df.drop(["Direction"], axis=1, inplace=True)
                    if len(dir_df.index) == 0:
                        continue

                    fpath = os.path.join(out_dir, "{}_{}_{}.txt".format(interaction,
                                                                        covariate,
                                                                        direction))
                    self.save(dir_df, fpath, self.max_url_len, self.signif_cutoff)

                    # Remember the top rows of this subset.
                    indices_of_interest.update(dir_df["Index"][:self.top])

        return sorted(indices_of_interest)
Пример #22
0
    def __init__(self, settings, eqtl_df, geno_df, alleles_df, expr_df, force,
                 outdir):
        """
        Store the input data and prepare the 'create_regression_matrix'
        output directory and path.

        Only the output path is stored on the instance; the output
        directory itself is not.

        :param settings: the settings (not used by this initializer).
        :param eqtl_df: DataFrame, the eQTL probes data.
        :param geno_df: DataFrame, the genotype data.
        :param alleles_df: DataFrame, the alleles data.
        :param expr_df: DataFrame, the expression data.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        self.force = force
        self.eqtl_df = eqtl_df
        self.geno_df = geno_df
        self.alleles_df = alleles_df
        self.expr_df = expr_df

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'create_regression_matrix')
        prepare_output_dir(step_outdir)
        self.outpath = os.path.join(step_outdir, "regression_table.txt.gz")
Пример #23
0
    def __init__(self, settings, gte_df, sample_dict, sample_order, eqtl_df,
                 force, outdir):
        """
        Read the datafile settings, store the inputs and prepare the
        'create_matrices' output directory and paths.

        :param settings: the settings, indexed by setting name.
        :param gte_df: DataFrame, the combined GTE files in a dataframe.
        :param sample_dict: dictionary, a dictionary for translating unmasked
                            samples to the same format.
        :param sample_order: list, order of samples.
        :param eqtl_df: DataFrame, the combined eQTL probe files in a dataframe.
        :param force: boolean, whether or not to force the step to redo.
        :param outdir: string, the output directory.
        """
        # Settings.
        self.geno_file = settings["genotype_datafile"]
        self.expr_file = settings["expression_datafile"]

        # Arguments.
        self.force = force
        self.gte_df = gte_df
        self.sample_dict = sample_dict
        self.sample_order = sample_order
        self.eqtl_df = eqtl_df

        # This step writes into its own sub directory.
        step_outdir = os.path.join(outdir, 'create_matrices')
        self.outdir = step_outdir
        prepare_output_dir(step_outdir)

        # Output paths of the genotype, alleles and expression tables.
        self.geno_outpath = os.path.join(step_outdir, "genotype_table.txt.gz")
        self.alleles_outpath = os.path.join(step_outdir,
                                            "genotype_alleles.txt.gz")
        self.expr_outpath = os.path.join(step_outdir,
                                         "expression_table.txt.gz")

        # Placeholder for the result; starts out empty.
        self.complete_expr_matrix = None
Пример #24
0
    def start(self):
        """
        Method to start the manager.

        The permutation orders are loaded from the pickle file in the
        output directory when that file exists and matches the current
        input; otherwise they are created and pickled. The analyses are
        then executed through self.work() and, for both the technical
        covariate container and the covariate container, the p-values,
        SNP t-values, interaction t-values and permutation p-values are
        pickled. Finally the run time and throughput are printed.
        """
        self.print_arguments()
        print("Starting Custom Interaction Analyser "
              "[{}]".format(datetime.now().strftime("%d-%m-%Y, %H:%M:%S")))

        # Start the timer.
        start_time = int(time.time())

        # Get the permutation orders.
        permutation_orders = None
        perm_orders_outfile = os.path.join(self.outdir,
                                           self.perm_orders_filename + ".pkl")
        if check_file_exists(perm_orders_outfile):
            print("Loading permutation order")
            permutation_orders = self.load_pickle(perm_orders_outfile)

            # The stored orders are only usable when there are exactly
            # n_permutations + 1 of them and each one covers every sample.
            is_valid = (
                len(permutation_orders) == (self.n_permutations + 1)
                and all(len(order) == self.n_samples
                        for order in permutation_orders)
            )
            if is_valid:
                print("\tvalid")
            else:
                # Discard the stored orders so they are recreated below.
                print("\tinvalid")
                permutation_orders = None

        if permutation_orders is None:
            print("Creating permutation order")
            permutation_orders = self.create_perm_orders()
            self.dump_pickle(permutation_orders, self.outdir,
                             self.perm_orders_filename)

        # Start the work.
        print("Start the analyses", flush=True)
        storage = self.work(permutation_orders)
        tc_container = storage.get_tech_cov_container()
        c_container = storage.get_cov_container()

        print("Saving output files", flush=True)
        filename_suffix = "{}_{}".format(self.skip_rows, self.n_eqtls)
        for container, outdir in zip([tc_container, c_container],
                                     [self.tech_cov_outdir, self.cov_outdir]):
            full_outdir = os.path.join(self.outdir, outdir)
            prepare_output_dir(full_outdir)

            # Every result type is stored the same way; the getters are
            # called one at a time, right before their result is dumped.
            results = (
                (container.get_pvalues, self.pvalues_filename),
                (container.get_snp_tvalues, self.snp_tvalues_filename),
                (container.get_inter_tvalues, self.inter_tvalues_filename),
                (container.get_perm_pvalues, self.perm_pvalues_filename),
            )
            for get_values, filename in results:
                self.dump_pickle(get_values(),
                                 full_outdir,
                                 filename,
                                 filename_suffix=filename_suffix,
                                 subdir=True,
                                 unique=True)

        # Print the process time.
        run_time = int(time.time()) - start_time
        run_time_min, run_time_sec = divmod(run_time, 60)
        run_time_hour, run_time_min = divmod(run_time_min, 60)
        print("Finished in  {} hour(s), {} minute(s) and "
              "{} second(s)".format(int(run_time_hour), int(run_time_min),
                                    int(run_time_sec)))

        # run_time is measured in whole seconds and is 0 for runs shorter
        # than a second; use at least one second so the rate calculation
        # cannot divide by zero.
        n_analyses = self.n_eqtls * (self.n_permutations + 1)
        print("Received {:.2f} analyses per minute".format(
            n_analyses / (max(run_time, 1) / 60)))

        # Shutdown the manager.
        print("Shutting down manager [{}]".format(
            datetime.now().strftime("%d-%m-%Y, %H:%M:%S")),
              flush=True)
Пример #25
0
    def start(self):
        """
        Plot the interaction results of every eQTL.

        For each row of self.eqtl_df the t-values and z-scores of the
        covariates whose name starts with "CellMapNNLS_" are combined into
        one data frame and handed to self.plot_forest(), together with an
        output directory that is specific for that eQTL. The t-values are
        multiplied by -1 when the genotype needs to be flipped.

        self.cellmap_methods is not modified by this method.
        """
        print("Plotting interaction eQTL radar plots.")
        self.print_arguments()

        # Work on a copy: appending to self.cellmap_methods itself would
        # add the marker gene entry once more on every call of start().
        methods = list(self.cellmap_methods)
        methods.append((self.marker_genes, ""))

        print("Iterating over eQTLs.")
        n_eqtls = self.eqtl_df.shape[0]
        for i, (index, row) in enumerate(self.eqtl_df.iterrows()):
            # Extract the useful information from the row.
            snp_name = row["SNPName"]
            probe_name = row["ProbeName"]
            hgnc_name = row["HGNCName"]

            print("\tWorking on: {}\t{}\t{} [{}/{} "
                  "{:.2f}%]".format(snp_name, probe_name, hgnc_name,
                                    i + 1,
                                    n_eqtls,
                                    (100 / n_eqtls) * (i + 1)))

            # Check if we need to flip the genotypes. Genotypes 0.0 and 2.0
            # are counted twice and genotype 1.0 once for both sides; the
            # sign is flipped when the 2.0 side has the higher count.
            genotype = self.geno_df.iloc[i, :]
            counts = genotype.value_counts()
            for x in [0.0, 1.0, 2.0]:
                if x not in counts:
                    counts.loc[x] = 0
            zero_geno_count = (counts[0.0] * 2) + counts[1.0]
            two_geno_count = (counts[2.0] * 2) + counts[1.0]
            flip = -1 if two_geno_count > zero_geno_count else 1

            # Prepare output directory.
            eqtl_outdir = os.path.join(self.outdir,
                                       "{}_{}_{}_{}".format(index, snp_name,
                                                            probe_name,
                                                            hgnc_name))
            prepare_output_dir(eqtl_outdir)

            # Iterate over the methods.
            for (prefix, suffix) in methods:
                # Only the CellMapNNLS results are plotted.
                if prefix != "CellMapNNLS_":
                    continue
                name = prefix.replace("_", "") + suffix

                # Select the rows of this method and the column of this
                # eQTL (by position), applying the flip to the t-values.
                tvalues = self.tvalue_df.loc[
                          self.tvalue_df.index.str.startswith(prefix), :].copy()
                tvalues = tvalues.iloc[:, i]
                tvalues = tvalues * flip
                tvalues = tvalues.to_frame()

                zscores = self.zscore_df.loc[
                          self.zscore_df.index.str.startswith(prefix), :].copy()
                zscores = zscores.iloc[:, i].to_frame()

                df = tvalues.merge(zscores, left_index=True, right_index=True)
                df.columns = ["tvalue", "zscore"]
                # Strip the method prefix / suffix from the row labels.
                df.index = [x.replace(prefix, "").replace(suffix, "")
                            for x in df.index]

                self.plot_forest(hgnc_name, name, df, self.z_score_cutoff,
                                 eqtl_outdir, self.extension)
Пример #26
0
    def start(self):
        """
        Run the interaction analysis pipeline for every group.

        For each group input directory the output directories are
        prepared first. The group is skipped when its output directory
        already holds a file matching self.inter_regex and self.force is
        not set. Otherwise the genotype, expression and covariate files
        are copied, decompressed and converted to binary format (STEP1),
        the eQTL file is copied and decompressed (STEP2) and the
        eQTLInteractionAnalyser is executed (STEP3).
        """
        print("Starting interaction analyser.")
        self.print_arguments()

        print("Performing interaction analyses.")
        n_groups = len(self.group_indirs)
        for group_nr, group_indir in enumerate(self.group_indirs, start=1):
            # Without groups the results go straight into the main
            # output directory.
            if self.groups is None:
                group_id = ""
                group_outdir = self.outdir
            else:
                group_id = get_leaf_dir(group_indir)
                group_outdir = os.path.join(self.outdir, group_id)
            ia_indir = os.path.join(group_outdir, 'input')
            ia_outdir = os.path.join(group_outdir, 'output')
            for directory in (group_outdir, ia_indir, ia_outdir):
                prepare_output_dir(directory)

            # Nothing to do when the interaction matrix already exists
            # and a redo is not forced.
            if not self.force and any(
                    re.match(self.inter_regex, get_basename(path))
                    for path in glob.glob(os.path.join(ia_outdir, "*"))):
                continue

            print("\tWorking on: {:15s} [{}/{} "
                  "{:.2f}%]".format(group_id, group_nr, n_groups,
                                    (100 / n_groups) * group_nr))

            # STEP1: create the binary input files.
            self.print_string("\n### STEP1 ###\n")
            ia_inputs = (
                ("Genotypes", self.geno_filename),
                ("Expression", self.expr_filename),
                ("Covariates", self.cov_filename),
            )
            for exp_ia_infile, filename in ia_inputs:
                bin_file = os.path.join(ia_indir, exp_ia_infile + ".binary")

                # The binary format consists of three files; all of them
                # have to be present for the preparation to be skipped.
                binaries_present = all(
                    check_file_exists(bin_file + ending)
                    for ending in (".dat", ".rows.txt", ".columns.txt"))
                if binaries_present and not self.force:
                    self.print_string(
                        "Skipping {} preparation.".format(filename))
                    continue

                self.print_string("\nPreparing {}.".format(filename))
                source_gz = os.path.join(self.indir, group_id,
                                         filename + '.txt.gz')
                local_gz = os.path.join(ia_indir, filename + '.txt.gz')
                local_txt = os.path.join(ia_indir, filename + '.txt')

                # Copy and decompress the file.
                self.print_string("\nCopying the input files.")
                self.copy_file(source_gz, local_gz)
                self.print_string("\nDecompressing the input files.")
                self.decompress(local_gz)

                # Convert to binary.
                self.print_string("\nConverting files to binary format.")
                self.convert_to_binary(local_txt, bin_file)

                # The uncompressed text file is no longer needed.
                self.print_string("\nRemoving uncompressed files.")
                if check_file_exists(local_txt):
                    self.print_string("\tos.remove({})".format(local_txt))
                    os.remove(local_txt)

            # STEP2: make the uncompressed eQTL file available.
            self.print_string("\n### STEP2 ###\n")
            eqtl_file = os.path.join(ia_indir, self.eqtl_filename + '.txt')
            if check_file_exists(eqtl_file) and not self.force:
                self.print_string("Skipping eqtl preparation.")
            else:
                self.print_string("\nPreparing eQTL file.")
                eqtl_source_gz = os.path.join(self.indir, group_id,
                                              self.eqtl_filename + '.txt.gz')
                eqtl_local_gz = os.path.join(ia_indir,
                                             self.eqtl_filename + '.txt.gz')

                self.print_string("\nCopying the input files.")
                self.copy_file(eqtl_source_gz, eqtl_local_gz)
                self.print_string("\nDecompressing the input files.")
                self.decompress(eqtl_local_gz)

            # STEP3: run the actual analysis.
            self.print_string("\n### STEP3 ###\n")
            self.print_string("Executing the eQTLInteractionAnalyser.")
            self.execute(ia_indir, ia_outdir, eqtl_file)
Пример #27
0
    def start(self):
        """
        Write the data of every group to its own sub directory.

        For each entry in self.groups a directory named after the group
        is prepared inside self.outdir. The pickled group object and the
        group specific subsets of the eQTL, genotype, alleles, expression
        and covariate tables are stored in it. A file that already exists
        is only written again when self.force is set.
        """
        print("Creating groups.")
        n_groups = len(self.groups)
        for group_nr, (group_id, group_obj) in enumerate(self.groups.items(),
                                                         start=1):
            print("  Working on: {:10s} [{}/{} "
                  "{:.2f}%]".format(group_id, group_nr, n_groups,
                                    (100 / n_groups) * group_nr))

            # Create the group dir.
            group_dir = os.path.join(self.outdir, group_id)
            prepare_output_dir(group_dir)

            # Store the group object itself.
            group_object = os.path.join(group_dir, "group.pkl")
            if not check_file_exists(group_object) or self.force:
                with open(group_object, "wb") as handle:
                    pickle.dump(group_obj, handle)
                print("\tSaved group object: "
                      "{}".format(get_basename(group_object)))

            # Get the group indices.
            snp_mask = group_obj.get_snp_indices()
            sample_mask = group_obj.get_sample_indices()
            everything = slice(None)

            # Per table: output file name, attribute holding the complete
            # data frame, row selection, column selection and whether the
            # index is written to the file.
            tables = (
                ("eqtl_table.txt.gz", "eqtl_df",
                 snp_mask, everything, False),
                ("genotype_table.txt.gz", "geno_df",
                 snp_mask, sample_mask, True),
                ("genotype_alleles.txt.gz", "alleles_df",
                 snp_mask, everything, True),
                ("expression_table.txt.gz", "expr_df",
                 snp_mask, sample_mask, True),
                ("covariates_table.txt.gz", "cov_df",
                 everything, sample_mask, True),
            )
            for filename, attribute, rows, columns, write_index in tables:
                outpath = os.path.join(group_dir, filename)

                # Keep an existing file unless a redo is forced.
                if check_file_exists(outpath) and not self.force:
                    continue

                subset = getattr(self, attribute).iloc[rows, columns].copy()
                save_dataframe(outpath=outpath, df=subset,
                               index=write_index, header=True)
                del subset
    def __init__(self, dataset, outdir, extension):
        """
        The initializer for the class.

        Prepares the 'covariates_explained_by_others' sub directory of
        outdir, changes the global matplotlib pdf font type and loads the
        groups and the covariate table from the dataset.

        :param dataset: Dataset, the input data; has to provide the
                        get_groups() and get_cov_df() methods.
        :param outdir: string, the output directory.
        :param extension: str, the output figure file type format.
        """
        # All output of this class goes into its own sub directory.
        self.outdir = os.path.join(outdir, 'covariates_explained_by_others')
        prepare_output_dir(self.outdir)
        self.extension = extension

        # Set the right pdf font for exporting (42 = TrueType). Note that
        # rcParams is global: this affects every figure created afterwards.
        matplotlib.rcParams['pdf.fonttype'] = 42

        # Extract the required data.
        print("Loading data")
        self.groups = dataset.get_groups()
        self.cov_df = dataset.get_cov_df()
        # NOTE(review): create_color_map() is called after the attributes
        # above are set; it may rely on them, so keep this order unless
        # confirmed otherwise.
        self.colormap = self.create_color_map()
        # The covariate names stored as self.tech_covs. Entries that are
        # commented out are not part of the list.
        self.tech_covs = ["PCT_CODING_BASES",
                          "PCT_MRNA_BASES",
                          "PCT_INTRONIC_BASES",
                          "MEDIAN_3PRIME_BIAS",
                          "PCT_USABLE_BASES",
                          "PCT_INTERGENIC_BASES",
                          "PCT_UTR_BASES",
                          #"PF_HQ_ALIGNED_READS",
                          "PCT_READS_ALIGNED_IN_PAIRS",
                          "PCT_CHIMERAS",
                          "PF_READS_IMPROPER_PAIRS",
                          "PF_HQ_ALIGNED_Q20_BASES",
                          "PF_HQ_ALIGNED_BASES",
                          "PCT_PF_READS_IMPROPER_PAIRS",
                          "PF_READS_ALIGNED",
                          "avg_mapped_read_length",
                          "avg_input_read_length",
                          "uniquely_mapped",
                          "total_reads",
                          "Total.Sequences_R1",
                          "MDS1",
                          "MDS2",
                          "MDS3",
                          "MDS4",
                          "AMPAD-MAYO-V2-EUR",
                          "AMPAD-MSBB-V2-EUR",
                          "BrainGVEX-V2-EUR",
                          "CMC_HBCC_set2-EUR",
                          "CMC_HBCC_set3-EUR",
                          "CMC-EUR",
                          "ENA-EUR",
                          "GTEx-EUR",
                          "GVEX-EUR",
                          "LIBD_1M-EUR",
                          "LIBD_h650-EUR",
                          "NABEC-H550-EUR",
                          "NABEC-H610-EUR",
                          "TargetALS-EUR",
                          "UCLA_ASD-EUR",
#                          "AMPAD-ROSMAP-V2-EUR"
                          ]