Exemplo n.º 1
0
 def get_marker_df(self):
     """Return the marker dataframe, loading it on first access.

     On the first call the file ``markers_filename`` inside ``input_dir``
     is read through ``load_dataframe`` (header row 0, ``index_col=False``),
     stored on ``self.marker_df`` and ``self.validate()`` is run. Later
     calls return the stored object without touching the disk.
     """
     # Fast path: the dataframe was already loaded.
     if self.marker_df is not None:
         return self.marker_df

     marker_path = os.path.join(self.input_dir, self.markers_filename)
     self.marker_df = load_dataframe(inpath=marker_path,
                                     header=0,
                                     index_col=False)
     # Validation only runs right after a fresh load.
     self.validate()
     return self.marker_df
Exemplo n.º 2
0
 def get_cov_df(self):
     """Return the covariate dataframe, loading it on first access.

     On the first call the file ``cov_filename`` inside ``input_dir`` is
     read through ``load_dataframe`` (header row 0, first column as index),
     stored on ``self.cov_df`` and ``self.validate()`` is run. Later calls
     return the stored object.
     """
     # Fast path: the dataframe was already loaded.
     if self.cov_df is not None:
         return self.cov_df

     cov_path = os.path.join(self.input_dir, self.cov_filename)
     self.cov_df = load_dataframe(inpath=cov_path, header=0, index_col=0)
     # Validation only runs right after a fresh load.
     self.validate()
     return self.cov_df
    def start(self):
        """Run the cell type profile factorization step.

        When both the PCA and the NMF output file exist and ``self.force``
        is not set, the stored results are loaded into ``self.celltype_pcs``
        and ``self.celltype_cs``. Otherwise the factorization is performed
        and the results are written through ``self.save()``.
        """
        print("Starting factorization of celltype profile expression.")
        self.print_arguments()

        # Stored results are only reused when both files are present and no
        # recomputation was requested.
        reuse_results = (check_file_exists(self.pca_outpath)
                         and check_file_exists(self.nmf_outpath)
                         and not self.force)

        if not reuse_results:
            factorization = self.perform_matrix_factorization()
            (self.celltype_expression, self.celltype_pcs,
             self.celltype_cs) = factorization
            self.save()
            return

        print("Skipping step, loading result.")
        self.celltype_pcs = load_dataframe(inpath=self.pca_outpath,
                                           header=0,
                                           index_col=0)
        self.celltype_cs = load_dataframe(inpath=self.nmf_outpath,
                                          header=0,
                                          index_col=0)
Exemplo n.º 4
0
    def filter_on_trait(self, df):
        """Annotate eQTLs with GWAS IDs and traits and keep matching traits.

        Two lookup files are read: ``self.gwasid_to_trait_filename`` (columns
        "ID" and "Trait") and ``self.snp_to_gwasid_filename`` (columns "RsID"
        and "ID"). For every RsID the GWAS IDs and the traits known for those
        IDs are collected as comma separated strings and mapped onto
        ``df["SNPName"]`` as the new columns "GWASIDS" and "Trait".

        :param df: dataframe with a "SNPName" column. The two columns are
                   added to it and rows without a trait are dropped in place.
        :return: the rows of ``df`` whose "Trait" contains ``self.disease``
                 (case insensitive; ``str.contains`` treats the value as a
                 regular expression by default).
        """
        id_to_trait_df = load_dataframe(inpath=self.gwasid_to_trait_filename,
                                        header=0,
                                        index_col=False)
        gwas_to_trait = pd.Series(id_to_trait_df["Trait"].values,
                                  index=id_to_trait_df["ID"]).to_dict()
        del id_to_trait_df

        snp_to_gwas_df = load_dataframe(inpath=self.snp_to_gwasid_filename,
                                        header=0,
                                        index_col=False,
                                        low_memory=False)

        gwas_map = {}
        disease_map = {}
        # Only two columns are needed, so iterate over them directly instead
        # of materialising a Series per row with iterrows().
        for rs, gwas_id in zip(snp_to_gwas_df["RsID"], snp_to_gwas_df["ID"]):
            known_ids = gwas_map.get(rs)
            if known_ids is None:
                gwas_map[rs] = gwas_id
            else:
                gwas_map[rs] = "{}, {}".format(known_ids, gwas_id)

            diseases = disease_map.get(rs)
            if gwas_id in gwas_to_trait:
                trait = gwas_to_trait[gwas_id]
                if diseases is None:
                    diseases = trait
                else:
                    diseases = "{}, {}".format(diseases, trait)
            # Stored even when still None, so SNPs without a known trait map
            # to a missing value and are removed by the dropna below.
            disease_map[rs] = diseases
        del snp_to_gwas_df

        # na_action only accepts None or 'ignore'; the default (None) is what
        # the mapping needs here.
        df["GWASIDS"] = df["SNPName"].map(gwas_map)
        df["Trait"] = df["SNPName"].map(disease_map)

        # Subset.
        df.dropna(subset=['Trait'], inplace=True)
        # na=False keeps the mask boolean if a trait entry is not a string.
        df = df[df['Trait'].str.contains(self.disease, case=False, na=False)]

        return df
Exemplo n.º 5
0
    def get_alleles_df(self):
        """Return the alleles dataframe, loading it on first access.

        On the first call the file ``alleles_filename`` inside ``input_dir``
        is read through ``load_dataframe`` (header row 0, first column as
        index, at most ``self.nrows`` rows). When ``self.interest`` is set,
        only those row positions are kept. The result is stored on
        ``self.alleles_df`` and ``self.validate()`` is run. Later calls
        return the stored object.
        """
        # Fast path: the dataframe was already loaded.
        if self.alleles_df is not None:
            return self.alleles_df

        alleles_path = os.path.join(self.input_dir, self.alleles_filename)
        loaded = load_dataframe(inpath=alleles_path,
                                header=0,
                                index_col=0,
                                nrows=self.nrows)
        if self.interest is None:
            self.alleles_df = loaded
        else:
            # Positional row selection of the rows of interest.
            self.alleles_df = loaded.iloc[self.interest, :]

        self.validate()
        return self.alleles_df
Exemplo n.º 6
0
    def start(self):
        """Run the deconvolution step, reusing a stored result if possible.

        When the output file exists and ``self.force`` is not set, the result
        is read back from ``self.outpath``. Otherwise
        ``perform_deconvolution`` is run and the result is written through
        ``self.save()``. Either way the result ends up in
        ``self.deconvolution``.
        """
        print("Starting deconvolution.")
        self.print_arguments()

        reuse_result = check_file_exists(self.outpath) and not self.force
        if not reuse_result:
            self.deconvolution = self.perform_deconvolution()
            self.save()
            return

        print("Skipping step, loading result.")
        self.deconvolution = load_dataframe(inpath=self.outpath,
                                            header=0,
                                            index_col=0)
Exemplo n.º 7
0
    def start(self):
        """Create the covariate file, reusing a stored result if possible.

        When the output file exists and ``self.force`` is not set, the
        covariates are read back from ``self.outpath``. Otherwise they are
        built by ``combine_files`` and written through ``self.save()``.
        Either way the result ends up in ``self.covariates``.
        """
        print("Starting creating covariate file.")
        self.print_arguments()

        reuse_result = check_file_exists(self.outpath) and not self.force
        if not reuse_result:
            self.covariates = self.combine_files()
            self.save()
            return

        print("Skipping step, loading result.")
        self.covariates = load_dataframe(inpath=self.outpath,
                                         header=0,
                                         index_col=0)
Exemplo n.º 8
0
    def combine_files(self):
        """Load every file matching ``self.inpath`` and stack them row-wise.

        Each matching file is read through ``load_dataframe`` without a
        header row and without an index column. Duplicate rows are removed
        from the combined result.

        :return: the combined dataframe without duplicate rows.
        :raises FileNotFoundError: when the glob pattern matches no file
                                   (previously this surfaced as an
                                   AttributeError on ``None``).
        """
        infiles = glob.glob(self.inpath)
        if not infiles:
            raise FileNotFoundError(
                "No input files match pattern: {}".format(self.inpath))

        frames = [
            load_dataframe(inpath=infile, header=None, index_col=None)
            for infile in infiles
        ]

        # A single concat avoids re-copying the accumulated rows for every
        # additional file; a lone frame is used as is.
        if len(frames) == 1:
            combined = frames[0]
        else:
            combined = pd.concat(frames, axis=0, ignore_index=True)

        # Remove duplicate entries.
        combined.drop_duplicates(inplace=True)

        return combined
Exemplo n.º 9
0
    def get_eqtl_and_interactions_df(self):
        """Build the combined eQTL / significant-interaction dataframe.

        The eQTL file and the (transposed) z-score file are loaded, checked
        to have the same number of rows and matching SNP names, and
        concatenated column-wise. The z-scores are replaced by 1 when they
        are above ``self.signif_cutoff`` and 0 otherwise (missing values
        count as 0). The index of the result is
        ``<row number>_<SNPName>_<ProbeName>``.

        The result is stored on ``self.eqtl_and_interactions_df``.

        :return: the combined dataframe.
        :raises SystemExit: with status 1 when the two input files do not
                            match (the message is printed first).
        """
        # Get the complete input dataframes.
        eqtl_df = load_dataframe(inpath=os.path.join(self.input_dir,
                                                     self.eqtl_filename),
                                 header=0,
                                 index_col=False)
        zscore_df = load_dataframe(inpath=os.path.join(self.inter_input_dir,
                                                       self.inter_cov_subdir,
                                                       self.zscore_filename),
                                   header=0,
                                   index_col=0).T

        # Check if the files match up. A mismatch terminates with a non-zero
        # status so calling scripts can detect the failure.
        if eqtl_df.shape[0] != zscore_df.shape[0]:
            print("Input files do not match (1).")
            raise SystemExit(1)
        # Row counts are equal at this point, so a positional pairing covers
        # every row.
        for row_name, snp_name in zip(zscore_df.index, eqtl_df["SNPName"]):
            if not row_name.startswith(snp_name):
                print("Input files do not match (2).")
                raise SystemExit(1)

        # Reset the indices.
        eqtl_df.reset_index(drop=True, inplace=True)
        zscore_df.reset_index(drop=True, inplace=True)

        # Replace the z-scores with 1's and 0's (significant vs
        # not-significant). A single comparison is used because zeroing
        # first and comparing afterwards marks everything as significant
        # when the cutoff is negative. NaN compares False and becomes 0.
        signif_df = (zscore_df > self.signif_cutoff).astype('int8')

        # Combine.
        combined = pd.concat([eqtl_df, signif_df], axis=1)
        combined.index = combined.index.astype(
            str) + "_" + combined["SNPName"] + "_" + combined["ProbeName"]
        self.eqtl_and_interactions_df = combined

        return self.eqtl_and_interactions_df
Exemplo n.º 10
0
    def get_inter_cov_snp_tvalue_df(self):
        """Return the SNP t-value dataframe, loading it on first access.

        On the first call the file ``snp_tvalue_filename`` inside
        ``inter_input_dir/inter_cov_subdir`` is read through
        ``load_dataframe`` (header row 0, first column as index). When
        ``self.interest`` is set, only those column positions are kept. The
        result is stored on ``self.inter_cov_snp_tvalue_df`` and
        ``self.validate()`` is run. Later calls return the stored object.
        """
        # Fast path: the dataframe was already loaded.
        if self.inter_cov_snp_tvalue_df is not None:
            return self.inter_cov_snp_tvalue_df

        tvalue_path = os.path.join(self.inter_input_dir,
                                   self.inter_cov_subdir,
                                   self.snp_tvalue_filename)
        loaded = load_dataframe(inpath=tvalue_path, header=0, index_col=0)
        if self.interest is None:
            self.inter_cov_snp_tvalue_df = loaded
        else:
            # Positional column selection of the columns of interest.
            self.inter_cov_snp_tvalue_df = loaded.iloc[:, self.interest]

        self.validate()
        return self.inter_cov_snp_tvalue_df
Exemplo n.º 11
0
    def combine_files(self):
        """Load the input file of every iteration and stack them row-wise.

        For iteration ``i`` in ``1..self.n_iterations`` the file
        ``<indir>/<iter_dirname><i>/<in_filename>`` is read through
        ``load_dataframe`` and tagged with an "Iteration" column holding
        ``i``. Duplicate rows are removed from the combined result.

        :return: the combined dataframe without duplicate rows.
        :raises ValueError: when ``self.n_iterations`` is smaller than 1, so
                            there is nothing to combine (previously this
                            surfaced as an AttributeError on ``None``).
        """
        frames = []
        for i in range(1, self.n_iterations + 1):
            infile = os.path.join(self.indir, self.iter_dirname + str(i),
                                  self.in_filename)
            df = load_dataframe(inpath=infile, header=0, index_col=False)
            df["Iteration"] = i
            frames.append(df)

        if not frames:
            raise ValueError("No iteration files to combine "
                             "(n_iterations={}).".format(self.n_iterations))

        # A single concat avoids re-copying the accumulated rows for every
        # additional iteration; a lone frame is used as is.
        if len(frames) == 1:
            combined = frames[0]
        else:
            combined = pd.concat(frames, axis=0, ignore_index=True)

        # Remove duplicate entries.
        combined.drop_duplicates(inplace=True)

        return combined
Exemplo n.º 12
0
    def start(self):
        """Combine the GTE files and derive the sample bookkeeping.

        When the output file exists and ``self.force`` is not set, the
        combined GTE table is read back from ``self.outpath`` (no header, no
        index column). Otherwise it is built by ``combine_files`` and written
        through ``self.save()``. Afterwards ``self.sample_dict`` and
        ``self.sample_order`` are filled from ``create_sample_dict`` and
        ``set_sample_order``.
        """
        print("Starting combining GTE files.")
        self.print_arguments()

        if not check_file_exists(self.outpath) or self.force:
            # Load each GTE file and store the combined table.
            self.gte = self.combine_files()
            self.save()
        else:
            print("Skipping step, loading result.")
            self.gte = load_dataframe(inpath=self.outpath,
                                      header=None,
                                      index_col=None)

        # Construct the sample translate dict and the sample order.
        self.sample_dict = self.create_sample_dict()
        self.sample_order = self.set_sample_order()
Exemplo n.º 13
0
    def start(self):
        """Combine the eQTL probe files, optionally filtered on a trait.

        When the output file exists and ``self.force`` is not set, the
        result is read back from ``self.outpath``. Otherwise the files are
        combined by ``combine_files``, filtered by ``filter_on_trait`` when
        ``self.disease`` is neither ``None`` nor an empty string, and written
        through ``self.save()``. Either way the result ends up in
        ``self.eqtl_probes``.
        """
        print("Starting combining eQTL probe files.")
        self.print_arguments()

        if not check_file_exists(self.outpath) or self.force:
            # Load each eQTL probes file.
            print("Loading eQTLprobes files.")
            probes = self.combine_files()

            trait_requested = self.disease != "" and self.disease is not None
            if trait_requested:
                print("Filtering on trait: {}".format(self.disease))
                probes = self.filter_on_trait(probes)

            self.eqtl_probes = probes
            self.save()
        else:
            print("Skipping step, loading result.")
            self.eqtl_probes = load_dataframe(inpath=self.outpath,
                                              header=0,
                                              index_col=False)
Exemplo n.º 14
0
    def combine_groups(self, inter_outpath):
        """Collect the indices of every group and combine their interactions.

        For every id in ``self.group_ids`` the pickled group object is loaded
        and its SNP and sample indices are appended to the masks. When
        ``inter_outpath`` is missing or ``self.force`` is set, the interaction
        matrix of every group is also loaded and merged on the index; the
        merged matrix is then ordered by the collected SNP indices and saved
        to ``inter_outpath``. Otherwise the stored matrix is loaded from
        ``inter_outpath``.

        :param inter_outpath: path of the combined interaction matrix.
        :return: tuple ``(snp_mask, sample_mask, inter_df)`` where both masks
                 are sorted lists of unique indices.
        :raises SystemExit: with status 1 when a group has no file matching
                            ``self.inter_regex`` (the message is printed
                            first).
        """
        print("Combining groups.")

        # Nothing inside the loop writes inter_outpath, so whether the matrix
        # has to be rebuilt is decided once.
        rebuild = not check_file_exists(inter_outpath) or self.force

        snp_mask = np.array([], dtype=np.int16)
        sample_mask = np.array([], dtype=np.int16)
        inter_df = None
        n_groups = len(self.group_ids)
        for i, group_id in enumerate(self.group_ids):
            print("  Working on: {:10s} [{}/{} "
                  "{:.2f}%]".format(group_id, i + 1, n_groups,
                                    (100 / n_groups) * (i + 1)))

            # Define the directory names.
            data_indir = os.path.join(self.g_data_indir, group_id)
            inter_indir = os.path.join(self.g_inter_indir, group_id, 'output')

            # Load the group object.
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file; only load group objects produced by a trusted run.
            with open(os.path.join(data_indir, self.obj_filename), "rb") as f:
                group_object = pickle.load(f)

            # Save the indices.
            snp_mask = np.append(snp_mask, group_object.get_snp_indices())
            sample_mask = np.append(sample_mask,
                                    group_object.get_sample_indices())

            if not rebuild:
                continue

            # Search for the interaction filename.
            inter_inpath = None
            for path in glob.glob(os.path.join(inter_indir, "*")):
                if re.match(self.inter_regex, get_basename(path)):
                    inter_inpath = path
                    break
            if inter_inpath is None:
                print("Interaction matrix not found.")
                # Non-zero status so calling scripts can detect the failure.
                raise SystemExit(1)

            # Load the interaction file.
            group_inter_df = load_dataframe(inpath=inter_inpath,
                                            header=0,
                                            index_col=0)

            # Merge them.
            if inter_df is None:
                inter_df = group_inter_df
            else:
                inter_df = inter_df.merge(group_inter_df,
                                          left_index=True,
                                          right_index=True)

        print("Preparing interaction matrix.")
        if rebuild:
            # Sort the matrix according to the indices: the collected SNP
            # indices are attached as a temporary column of the transposed
            # matrix, used as sort key and removed again.
            inter_df = inter_df.T
            inter_df["index"] = snp_mask
            inter_df.sort_values(by=['index'], inplace=True)
            inter_df.drop(["index"], axis=1, inplace=True)
            inter_df = inter_df.T

            save_dataframe(df=inter_df,
                           outpath=inter_outpath,
                           index=True,
                           header=True)
        else:
            inter_df = load_dataframe(inpath=inter_outpath,
                                      header=0,
                                      index_col=0)

        # Prepare the masks.
        snp_mask = sorted(set(snp_mask))
        sample_mask = sorted(set(sample_mask))

        return snp_mask, sample_mask, inter_df
Exemplo n.º 15
0
    def start(self):
        """
        The method that serves as the pipeline of the whole program.

        Combines the indices of every group (and the interaction matrix if
        need be) and then writes the marker, eQTL, genotype, alleles,
        expression and covariate matrices subset with the resulting SNP and
        sample masks. Every output is only (re)created when its file is
        missing or ``self.force`` is set.
        """
        print("Starting combining groups.")
        self.print_arguments()

        # Combine the indices of each group and combine the interaction
        # matrix if need be.
        inter_outpath = os.path.join(self.outdir, self.inter_filename)
        snp_mask, sample_mask, inter_df = self.combine_groups(inter_outpath)

        snp_summary = ("\tSNP mask:\tlength: {}\tlowest index: {}"
                       "\thighest index: {}")
        sample_summary = ("\tSample mask:\tlength: {}\tlowest index: {}"
                          "\thighest index: {}")
        print("\nSubsetting data with masks:")
        print(snp_summary.format(len(snp_mask), min(snp_mask),
                                 max(snp_mask)))
        print(sample_summary.format(len(sample_mask), min(sample_mask),
                                    max(sample_mask)))
        print("")

        # The eQTL file is only needed when the marker output or the eQTL
        # output still has to be created.
        markers_outpath = os.path.join(self.outdir, self.markers_filename)
        eqtl_outpath = os.path.join(self.outdir, self.eqtl_filename)
        eqtl_outputs_ready = (check_file_exists(eqtl_outpath)
                              and check_file_exists(markers_outpath)
                              and not self.force)
        if not eqtl_outputs_ready:
            print("Loading eQTL file.")
            eqtl_df = load_dataframe(inpath=self.eqtl_inpath,
                                     header=0,
                                     index_col=None)
            eqtl_df = eqtl_df.iloc[snp_mask, :]

            print("Preparing marker matrix.")
            if check_file_exists(markers_outpath) and not self.force:
                print("\tSkipping step.")
            else:
                self.create_marker_df(inter_df, eqtl_df, markers_outpath)

            print("Preparing eQTL matrix.")
            if check_file_exists(eqtl_outpath) and not self.force:
                print("\tSkipping step.")
            else:
                save_dataframe(outpath=eqtl_outpath,
                               df=eqtl_df,
                               index=False,
                               header=True)
            del eqtl_df

        del inter_df

        # Genotype: rows by SNP mask, columns by sample mask.
        print("\nPreparing genotype matrix.")
        geno_outpath = os.path.join(self.outdir, self.geno_filename)
        if check_file_exists(geno_outpath) and not self.force:
            print("\tSkipping step.")
        else:
            geno_inpath = os.path.join(self.data_indir, self.geno_filename)
            geno_df = load_dataframe(inpath=geno_inpath,
                                     header=0,
                                     index_col=0)
            geno_df = geno_df.iloc[snp_mask, sample_mask]
            save_dataframe(outpath=geno_outpath,
                           df=geno_df,
                           index=True,
                           header=True)
            del geno_df

        # Alleles: rows by SNP mask, all columns.
        print("\nPreparing alleles matrix.")
        alleles_outpath = os.path.join(self.outdir, self.alleles_filename)
        if check_file_exists(alleles_outpath) and not self.force:
            print("\tSkipping step.")
        else:
            alleles_inpath = os.path.join(self.data_indir,
                                          self.alleles_filename)
            alleles_df = load_dataframe(inpath=alleles_inpath,
                                        header=0,
                                        index_col=0)
            alleles_df = alleles_df.iloc[snp_mask, :]
            save_dataframe(outpath=alleles_outpath,
                           df=alleles_df,
                           index=True,
                           header=True)
            del alleles_df

        # Expression: rows by SNP mask, columns by sample mask.
        print("\nPreparing expression matrix.")
        expr_outpath = os.path.join(self.outdir, self.expr_filename)
        if check_file_exists(expr_outpath) and not self.force:
            print("\tSkipping step.")
        else:
            expr_inpath = os.path.join(self.data_indir, self.expr_filename)
            expr_df = load_dataframe(inpath=expr_inpath,
                                     header=0,
                                     index_col=0)
            expr_df = expr_df.iloc[snp_mask, sample_mask]
            save_dataframe(outpath=expr_outpath,
                           df=expr_df,
                           index=True,
                           header=True)
            del expr_df

        # Covariates: all rows, columns by sample mask.
        print("\nPreparing covariate matrix.")
        cov_outpath = os.path.join(self.outdir, self.cov_filename)
        if check_file_exists(cov_outpath) and not self.force:
            print("\tSkipping step.")
        else:
            cov_df = load_dataframe(inpath=self.cov_inpath,
                                    header=0,
                                    index_col=0)
            cov_df = cov_df.iloc[:, sample_mask].copy()
            save_dataframe(outpath=cov_outpath,
                           df=cov_df,
                           index=True,
                           header=True)
            del cov_df
Exemplo n.º 16
0
    def perform_deconvolution(self):
        """Estimate cell type proportions per sample with ``self.nnls``.

        The cell type profile and cell type expression matrices are loaded
        when they are not yet set on the instance. The profile is filtered,
        both matrices are subset, the expression is corrected for cohorts,
        the profile is log2 transformed and both are shifted when they hold
        negative values. ``self.nnls`` is then called once per column of the
        expression matrix and the resulting weights are rescaled by
        ``self.sum_to_one``. Intermediate results are printed.

        :return: dataframe with one row per expression column and one
                 "<prefix>NNLS_<suffix>" column per profile column, built
                 from the parts of the profile column name around "_".
        """
        if self.profile_df is None:
            # Load the celltype profile file.
            print("Loading cell type profile matrix.")
            self.profile_df = load_dataframe(self.profile_file,
                                             header=0,
                                             index_col=0)

        if self.ct_expr_df is None:
            # Load the celltype expression file.
            print("Loading cell type expression matrix.")
            self.ct_expr_df = load_dataframe(self.ct_expr_file,
                                             header=0,
                                             index_col=0)

        print("Loading sample cohort matrix.")
        sample_cohort_df = load_dataframe(self.sample_cohort_file,
                                          header=0,
                                          index_col=None)

        # Cohort design used for the cohort correction below.
        sample_names = list(self.ct_expr_df.columns)
        cohort_df = self.create_cohort_df(sample_names, sample_cohort_df,
                                          self.sample_id, self.cohort_id)

        # Filter uninformative genes from the signature matrix.
        signature_df = self.filter(self.profile_df)

        # Subset and reorder.
        signature_df, expression_df, cohort_df = self.subset(
            signature_df, self.ct_expr_df, cohort_df)

        # Correct for cohorts.
        expression_df = self.cohort_correction(expression_df, cohort_df)

        # Transform.
        signature_df = self.perform_log2_transform(signature_df)

        # Shift the data to be positive.
        print("Shifting data to be positive")
        if signature_df.values.min() < 0:
            signature_df = self.perform_shift(signature_df)

        if expression_df.values.min() < 0:
            expression_df = self.perform_shift(expression_df)

        print("Profile shape: {}".format(signature_df.shape))
        print("Expression shape: {}".format(expression_df.shape))

        # Perform deconvolution per sample.
        print("Performing partial deconvolution.")
        weights = []
        residuals = []
        for _, sample_values in expression_df.T.iterrows():
            sample_weights, sample_rnorm = self.nnls(signature_df,
                                                     sample_values)
            weights.append(sample_weights)
            residuals.append(sample_rnorm)

        weight_names = []
        for profile_name in signature_df.columns:
            weight_names.append(
                "{}NNLS_{}".format(*profile_name.split("_")))

        decon_df = pd.DataFrame(weights,
                                index=expression_df.columns,
                                columns=weight_names)
        residuals_df = pd.Series(residuals, index=expression_df.columns)

        print("Estimated weights:")
        print(decon_df)
        print(decon_df.mean(axis=0))

        # Make the weights sum up to 1.
        decon_df = self.sum_to_one(decon_df)
        print("Estimated proportions:")
        print(decon_df)
        print(decon_df.mean(axis=0))

        # Calculate the average residuals.
        print(residuals_df)
        print("Average residual: {:.2f}".format(residuals_df.mean()))

        return decon_df
Exemplo n.º 17
0
    def combine_files(self):
        """Merge all covariate sources into one covariates-by-samples matrix.

        The technical covariates and cohort columns, the sex, the first
        ``self.n_eigen`` eigenvector components, the two components before
        covariate correction, the marker gene expression, the cell type
        PCs / NMF components and the deconvolution result are merged on
        their index. The merged table is transposed and its columns are
        ordered by ``self.sample_order``.

        :return: the combined covariate dataframe (index name "-").
        :raises SystemExit: with status 1 when a sample has no cohort after
                            the reference cohort is filled in (the message
                            is printed first).
        """
        # read the covariates file.
        print("Loading covariate matrix.")
        cov_df = load_dataframe(inpath=self.cov_file, header=0, index_col=0)
        tech_cov_df = cov_df[self.tech_covs].copy()
        cohorts_df = cov_df[self.cohorts].copy()
        del cov_df

        # validate the cohorts. Rows without any cohort flag are assigned
        # to the reference cohort.
        print("Validating cohorts.")
        cohort_counts = cohorts_df.sum(axis=1)
        cohorts_df[self.ref_cohort] = 0
        cohorts_df.loc[cohort_counts == 0, self.ref_cohort] = 1
        if not cohorts_df.sum(axis=1).all():
            print("\tSome samples do not have a cohort.")
            # Non-zero status so calling scripts can detect the failure.
            raise SystemExit(1)
        print("\tValid.")

        # read the phenotype file.
        print("Loading phenotype matrix.")
        pheno_df = load_dataframe(inpath=self.pheno_file,
                                  header=0,
                                  index_col=4,
                                  low_memory=False)

        # Combine the two gender columns, keep 'sex.by.expression' as main
        # gender and use 'Gender' when no information is available.
        pheno_df = pheno_df.loc[:, ["Gender", "sex.by.expression"]]
        # Not in place: pheno_df is a selection of the loaded frame.
        pheno_df = pheno_df.replace("no expression available", np.nan)
        pheno_df["SEX"] = pheno_df['sex.by.expression'].combine_first(
            pheno_df['Gender'])
        gender_df = pheno_df["SEX"].to_frame()
        del pheno_df
        gender_df = gender_df.replace({"SEX": self.sex_dict})

        # read the eigenvectors file.
        print("Loading eigenvectors matrix.")
        eigen_df = load_dataframe(self.eig_file, header=0, index_col=0)
        eigen_df = eigen_df.loc[:, [
            "Comp{}".format(x) for x in range(1, self.n_eigen + 1)
        ]]

        # read the eigenvectors before covariate correction file.
        print("Loading eigenvectors before cov. correction matrix.")
        cov_cor_df = load_dataframe(self.eig_bef_cov_corr_file,
                                    header=0,
                                    index_col=0)
        # Assigning two names requires the file to hold exactly two columns.
        cov_cor_df.columns = [
            "PC1-before-cov-correction", "PC2-before-cov-correction"
        ]

        # read the marker genes expression file.
        print("Loading marker genes matrix.")
        marker_df = load_dataframe(self.marker_file, header=0, index_col=0)
        marker_df.sort_index(inplace=True)
        marker_df.drop_duplicates(inplace=True)
        marker_df = marker_df.T

        # merge. pd.merge defaults to an inner join, so only index values
        # present in every table are kept.
        print("Merging matrices.")
        comb_cov = reduce(
            lambda left, right: pd.merge(
                left, right, left_index=True, right_index=True), [
                    tech_cov_df, cohorts_df, gender_df, eigen_df, cov_cor_df,
                    marker_df, self.celltype_pcs.T, self.celltype_cs.T,
                    self.deconvolution
                ])
        comb_cov = comb_cov.T
        comb_cov = comb_cov[self.sample_order]
        comb_cov.index.name = "-"
        print("\tShape: {}".format(comb_cov.shape))

        return comb_cov
Exemplo n.º 18
0
    def start(self):
        """
        The method that serves as the pipeline of the whole program.

        Runs steps 1-7 in order (combine GTE files, combine eQTL probes,
        create matrices, create deconvolution matrices, cell type
        factorization, deconvolution, covariate matrix), handing the results
        of earlier steps to later ones, and then terminates the program.

        NOTE(review): the program is terminated right after step 7, so the
        loading / validation code and steps 8 and 10 below it never run.
        This looks like a development shortcut - confirm before removing
        either the termination or the unreachable code.

        :raises SystemExit: always, with status 0, after step 7.
        """
        print("Starting program.")
        print("\n### STEP1 ###\n")
        # Step 1. Combine GTE files.
        cgtef = CombineGTEFiles(
            settings=self.settings.get_setting('combine_gte_files'),
            force=self.force_dict['combine_gte_files'],
            outdir=self.outdir)
        cgtef.start()
        cgtef.clear_variables()

        # Step2. Combine eQTL probes files.
        print("\n### STEP2 ###\n")
        cepf = CombineEQTLProbes(
            settings=self.settings.get_setting('combine_eqtlprobes'),
            disease=self.disease,
            force=self.force_dict['combine_eqtlprobes'],
            outdir=self.outdir)
        cepf.start()
        cepf.clear_variables()

        # Step3. Create the ordered unmasked matrices.
        print("\n### STEP3 ###\n")
        cm = CreateMatrices(
            settings=self.settings.get_setting('create_matrices'),
            gte_df=cgtef.get_gte(),
            sample_dict=cgtef.get_sample_dict(),
            sample_order=cgtef.get_sample_order(),
            eqtl_df=cepf.get_eqtlprobes(),
            force=self.force_dict['create_matrices'],
            outdir=self.outdir)
        cm.start()
        cm.clear_variables()

        # Step4. Create the deconvolution matrices.
        print("\n### STEP4 ###\n")
        cdm = CreateDeconvolutionMatrices(
            settings=self.settings.get_setting('create_deconvolution_matrices'),
            expr_file=cm.get_expr_file(),
            expr_df=cm.get_complete_expr_matrix(),
            sample_dict=cgtef.get_sample_dict(),
            sample_order=cgtef.get_sample_order(),
            force=self.force_dict['create_deconvolution_matrices'],
            outdir=self.outdir)
        cdm.start()
        cdm.clear_variables()

        # Step5. Create the celltype PCA file.
        print("\n### STEP5 ###\n")
        pcf = PerformCelltypeFactorization(
            settings=self.settings.get_setting('perform_celltype_factorization'),
            profile_file=cdm.get_celltype_profile_file(),
            profile_df=cdm.get_celltype_profile(),
            ct_expr_file=cdm.get_ct_profile_expr_outpath(),
            force=self.force_dict['perform_celltype_factorization'],
            outdir=self.outdir)
        pcf.start()
        pcf.clear_variables()

        # Step6. Perform the deconvolution.
        # The local is not called "pd" so it cannot hide the pandas alias
        # of that name.
        print("\n### STEP6 ###\n")
        pdecon = PerformDeconvolution(
            settings=self.settings.get_setting('perform_deconvolution'),
            profile_file=cdm.get_celltype_profile_file(),
            profile_df=cdm.get_celltype_profile(),
            ct_expr_file=cdm.get_ct_profile_expr_outpath(),
            ct_expr_df=pcf.get_celltype_expression(),
            force=self.force_dict['perform_deconvolution'],
            outdir=self.outdir)
        pdecon.start()
        pdecon.clear_variables()

        # Step7. Create the covariance matrix.
        print("\n### STEP7 ###\n")
        ccm = CreateCovMatrix(
            settings=self.settings.get_setting('create_cov_matrix'),
            marker_file=cdm.get_markers_outpath(),
            celltype_pcs=pcf.get_celltype_pcs(),
            celltype_cs=pcf.get_celltype_cs(),
            deconvolution=pdecon.get_deconvolution(),
            sample_order=cgtef.get_sample_order(),
            force=self.force_dict['create_cov_matrix'],
            outdir=self.outdir)
        ccm.start()
        ccm.clear_variables()

        # Explicit equivalent of the interactive exit() helper: terminate
        # successfully. NOTE(review): everything below is unreachable.
        raise SystemExit(0)

        # Load the complete dataframes.
        print("\n### LOADING SORTED DATAFRAMES ###\n")
        print("Extracting eQTL dataframe.")
        eqtl_df = cepf.get_eqtlprobes()

        print("Loading genotype dataframe.")
        geno_df = load_dataframe(cm.get_geno_outpath(),
                                 header=0,
                                 index_col=0)

        print("Loading alleles dataframe.")
        alleles_df = load_dataframe(cm.get_alleles_outpath(),
                                    header=0,
                                    index_col=0)

        print("Loading expression dataframe.")
        expr_df = load_dataframe(cm.get_expr_outpath(),
                                 header=0,
                                 index_col=0)

        print("Extracting covariates dataframe.")
        cov_df = ccm.get_covariates()

        # Validate the matrices.
        print("Validating matrices.")
        self.validate(eqtl_df.copy(), geno_df, alleles_df, expr_df, cov_df)

        # Step 8. Create the masked matrices.
        print("\n### STEP8 ###\n")
        cmm = MaskMatrices(
            settings=self.settings.get_setting('mask_matrices'),
            eqtl_df=eqtl_df.copy(),
            geno_df=geno_df.copy(),
            alleles_df=alleles_df.copy(),
            expr_df=expr_df.copy(),
            cov_df=cov_df.copy(),
            force=self.force_dict['mask_matrices'],
            outdir=self.outdir)
        cmm.start()
        del cmm

        # # Step 9. Create the group matrices.
        # print("\n### STEP9 ###\n")
        # cg = CreateGroups(
        #     settings=self.settings.get_setting('create_groups'),
        #     eqtl_df=eqtl_df.copy(),
        #     geno_df=geno_df.copy(),
        #     alleles_df=alleles_df.copy(),
        #     expr_df=expr_df.copy(),
        #     cov_df=cov_df.copy(),
        #     groups_file=cm.get_group_outpath(),
        #     force=self.force_dict['create_groups'],
        #     outdir=self.outdir)
        # cg.start()
        # del cg

        # Step 10. Create the regression matrices.
        print("\n### STEP10 ###\n")
        crm = CreateRegressionMatrix(
            settings=self.settings.get_setting('create_regression_matrix'),
            eqtl_df=eqtl_df.copy(),
            geno_df=geno_df.copy(),
            alleles_df=alleles_df.copy(),
            expr_df=expr_df.copy(),
            force=self.force_dict['create_regression_matrix'],
            outdir=self.outdir)
        crm.start()
        del crm
Exemplo n.º 19
0
    def start(self):
        """
        Create the genotype, alleles and expression matrices of the eQTLs.

        The step is skipped when all three output files exist and force is
        not set; otherwise existing output files are removed first. For every
        row of self.eqtl_df the genotype and alleles of the SNP (column
        'SNPName') and the expression of the probe (column 'ProbeName') are
        looked up and appended, in chunks, to the output files through
        self.write_buffer().

        An eQTL is written to all three output files or to none of them, so
        the rows of the output files stay aligned with each other.
        """
        print("Starting creating matrices.")
        self.print_arguments()

        # Check if output file exist.
        outpaths = [self.geno_outpath, self.alleles_outpath, self.expr_outpath]
        if all(check_file_exists(outpath) for outpath in outpaths) and \
                not self.force:
            print("Skipping step.")
            return

        # Remove the output files.
        for outfile in outpaths:
            if os.path.isfile(outfile):
                print("Removing file: {}.".format(outfile))
                os.remove(outfile)

        # Load the genotype matrix file.
        print("Loading genotype matrix.")
        geno_df = load_dataframe(self.geno_file, header=0, index_col=0)
        allele_df = geno_df.loc[:, ["Alleles", "MinorAllele"]].copy()
        geno_df = geno_df.rename(columns=self.sample_dict)
        geno_df = geno_df[self.sample_order]

        # Load the expression matrix file.
        print("Loading expression matrix.")
        expr_df = load_dataframe(self.expr_file, header=0, index_col=0)
        expr_df = expr_df.rename(columns=self.sample_dict)
        self.complete_expr_matrix = expr_df[self.sample_order]
        # The unordered frame is not needed anymore; release it before the
        # loop below.
        del expr_df

        def get_single_row(df, label):
            # Return the one-row data frame matching label, or None if the
            # label is absent or matches more than one row. Recent pandas
            # versions raise a KeyError for an absent label instead of
            # returning an empty selection.
            try:
                subset = df.loc[[label], :]
            except KeyError:
                return None
            if len(subset.index) != 1:
                return None
            return subset

        # Construct the genotype / expression matrices.
        print("Constructing matrices.")
        geno_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
        expr_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
        allele_str_buffer = [
            "-" + "\t" + "\t".join(list(allele_df.columns)) + "\n"
        ]

        n_snps = self.eqtl_df.shape[0]
        last_index = n_snps - 1
        # NOTE(review): i is the index label of self.eqtl_df; the progress
        # logic assumes a default 0..n-1 integer index - TODO confirm.
        for i, row in self.eqtl_df.iterrows():
            if (i % 250 == 0) or (i == last_index):
                # Guard against a division by zero when there is one eQTL.
                if last_index > 0:
                    progress = (100 / last_index) * i
                else:
                    progress = 100.0
                print("\tProcessing {}/{} "
                      "[{:.2f}%]".format(i, last_index, progress))

                # Write output files.
                self.write_buffer(self.geno_outpath, geno_str_buffer)
                geno_str_buffer = []

                self.write_buffer(self.expr_outpath, expr_str_buffer)
                expr_str_buffer = []

                self.write_buffer(self.alleles_outpath, allele_str_buffer)
                allele_str_buffer = []

            # Get the row info.
            snp_name = row["SNPName"]
            probe_name = row["ProbeName"]

            # Look up all three rows before buffering anything: skipping an
            # eQTL after one of its rows has been buffered would shift the
            # rows of the output files relative to each other.
            genotype = get_single_row(geno_df, snp_name)
            if genotype is None:
                print("SNP: {} gives 0 or >1 " "genotypes.".format(snp_name))
                continue

            alleles = get_single_row(allele_df, snp_name)
            if alleles is None:
                print("SNP: {} gives 0 or >1 " "alleles.".format(snp_name))
                continue

            expression = get_single_row(self.complete_expr_matrix, probe_name)
            if expression is None:
                print("Probe: {} gives 0 or >1 expression "
                      "profiles.".format(probe_name))
                continue

            # Buffer the genotype.
            geno_str_buffer.append(snp_name + "\t" + "\t".join(
                genotype.iloc[0, :].astype(str).values) + "\n")

            # Buffer the alleles.
            allele_str_buffer.append("{}\t{}\t{}\n".format(
                snp_name, alleles.iloc[0]["Alleles"],
                alleles.iloc[0]["MinorAllele"]))

            # Buffer the expression.
            expr_str_buffer.append(probe_name + "\t" + "\t".join(
                expression.iloc[0, :].astype(str).values) + "\n")

        # Write whatever is left in the buffers.
        for outpath, str_buffer in ((self.geno_outpath, geno_str_buffer),
                                    (self.expr_outpath, expr_str_buffer),
                                    (self.alleles_outpath,
                                     allele_str_buffer)):
            if str_buffer:
                self.write_buffer(outpath, str_buffer)
    def start(self):
        """
        Build the marker gene and cell type profile expression tables.

        Nothing is done when both output files exist and force is not set.
        Otherwise the expression matrix (self.expr_df, or the matrix loaded
        from self.decon_expr_file when that is set) gets its index translated
        from 'ArrayAddress' to 'Symbol' using the translate file. The rows
        matching the marker genes and the cell type profile genes are then
        written through self.write_buffer(); each table is only (re)created
        if its file is missing or force is set.

        NOTE(review): when self.expr_df is reused, its index is replaced in
        place by the translated index.
        """
        print("Starting creating deconvolution matrices.")
        self.print_arguments()

        # Skip the step if both outputs are already there.
        outputs_present = check_file_exists(self.markers_outpath) and \
            check_file_exists(self.ct_profile_expr_outpath)
        if outputs_present and not self.force:
            print("Skipping step.")
            return

        # Decide which expression matrix to use.
        matrix_path = self.expr_file
        expr_df = self.expr_df
        if self.decon_expr_file:
            print("Warning: using a different expression file for "
                  "deconvolution than for gene expression. This might take "
                  "longer to load.")
            matrix_path = self.decon_expr_file
            expr_df = None

        # Load the expression matrix if no usable one is in memory.
        if expr_df is None:
            print("Loading expression matrix.")
            loaded_df = load_dataframe(matrix_path, header=0, index_col=0)
            loaded_df = loaded_df.rename(columns=self.sample_dict)
            expr_df = loaded_df[self.sample_order]

        # Build the ArrayAddress -> Symbol lookup from the translate file.
        print("Loading translate matrix.")
        trans_df = load_dataframe(self.translate_file,
                                  header=0,
                                  index_col=None)
        symbol_lookup = dict(
            zip(trans_df["ArrayAddress"], trans_df["Symbol"]))

        # Translate the ENSEMBL ID's to HGNC symbols.
        expr_df.index = expr_df.index.map(symbol_lookup)
        expr_df.index.name = "-"

        # The translate data is not needed anymore.
        del trans_df, symbol_lookup

        def format_row(label, single_row_df):
            # Tab separated output line: label followed by the row values.
            values = single_row_df.iloc[0, :].astype(str).values
            return label + "\t" + "\t".join(values) + "\n"

        # Create the marker gene file.
        if not check_file_exists(self.markers_outpath) or self.force:
            if os.path.isfile(self.markers_outpath):
                print("Removing: {}".format(self.markers_outpath))
                os.remove(self.markers_outpath)

            print("Creating marker gene expression table.")
            marker_lines = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
            for celltype, marker_genes in self.marker_dict.items():
                for marker_gene in marker_genes:
                    if marker_gene not in expr_df.index:
                        continue

                    matches = expr_df.loc[[marker_gene], :]
                    if len(matches.index) != 1:
                        print("\tMarker gene: {} gives 0 or >1 expression "
                              "profiles.".format(marker_gene))
                        continue

                    row_label = "_".join(
                        [self.marker_genes_suffix, celltype, marker_gene])
                    marker_lines.append(format_row(row_label, matches))
            self.write_buffer(self.markers_outpath, marker_lines)

        # Create the cell type profile expression file.
        if not check_file_exists(self.ct_profile_expr_outpath) or self.force:
            if os.path.isfile(self.ct_profile_expr_outpath):
                print("Removing: {}".format(self.ct_profile_expr_outpath))
                os.remove(self.ct_profile_expr_outpath)

            # Load the celltype profile file.
            print("Loading cell type profile matrix.")
            self.celltype_profile = load_dataframe(self.celltype_profile_file,
                                                   header=0,
                                                   index_col=0)

            # Collect the expression of every profile gene that is present.
            print("Creating cell type profile expression table.")
            profile_lines = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
            for profile_gene in self.celltype_profile.index:
                if profile_gene not in expr_df.index:
                    continue

                matches = expr_df.loc[[profile_gene], :]
                if len(matches.index) != 1:
                    print("\tMarker gene: {} gives 0 or >1 expression "
                          "profiles.".format(profile_gene))
                    continue

                profile_lines.append(format_row(profile_gene, matches))
            self.write_buffer(self.ct_profile_expr_outpath, profile_lines)
    def perform_matrix_factorization(self):
        """
        Summarise the expression of each cell type's genes in one component.

        Every gene of the cell type profile is assigned to the cell type
        (column) with the highest value in the normalized profile. Per cell
        type, the expression of its genes is reduced with
        self.get_first_pca_component() and, on the expression that is shifted
        when it contains negative values, with
        self.get_first_nmf_component().

        :return: tuple of three data frames: the loaded cell type expression,
                 the PCA components (one row per cell type) and the NMF
                 components (one row per cell type).
        """
        # Load the expression data.
        print("Loading celltype expression data.")
        ct_expr_df = load_dataframe(inpath=self.ct_expr_file,
                                    header=0,
                                    index_col=0)

        # Load the celltype profile file if it is not in memory yet.
        if self.profile_df is None:
            print("Loading cell type profile matrix.")
            self.profile_df = load_dataframe(self.profile_file,
                                             header=0,
                                             index_col=0)

        # Per gene, the cell type with the highest normalized profile value.
        gene_celltypes = self.normalize(self.profile_df).idxmax(axis=1)

        def select_celltype_genes(frame, celltype):
            # Rows of frame whose gene is assigned to the given cell type.
            assigned_genes = gene_celltypes[gene_celltypes == celltype].index
            return frame.loc[frame.index.isin(assigned_genes), :]

        # First PCA component of the expression subset of each cell type.
        print("Performing PCA")
        pca_rows = []
        for celltype in self.profile_df.columns:
            print("\tWorking on: {}".format(celltype))
            subset = select_celltype_genes(ct_expr_df, celltype)
            print("\t  N = {}".format(len(subset.index)))

            print("\t  PCA")
            component = self.get_first_pca_component(subset)
            pca_rows.append([entry[0] for entry in component])

        pca_labels = [
            "{}PCA_{}_PC1".format(*name.split("_"))
            for name in self.profile_df.columns
        ]
        celltype_pcs = pd.DataFrame(pca_rows,
                                    index=pca_labels,
                                    columns=ct_expr_df.columns)

        # NMF needs non-negative input: shift the expression if required.
        if ct_expr_df.values.min() < 0:
            shifted_ct_expr = self.perform_shift(ct_expr_df)
        else:
            shifted_ct_expr = ct_expr_df.copy()

        # First NMF component of the expression subset of each cell type.
        print("Performing NMF")
        nmf_rows = []
        for celltype in self.profile_df.columns:
            print("\tWorking on: {}".format(celltype))
            subset = select_celltype_genes(shifted_ct_expr, celltype)
            print("\t  N = {}".format(len(subset.index)))

            print("\t  NMF")
            component = self.get_first_nmf_component(subset)
            nmf_rows.append([entry[0] for entry in component])

        nmf_labels = [
            "{}NMF_{}_C1".format(*name.split("_"))
            for name in self.profile_df.columns
        ]
        celltype_cs = pd.DataFrame(nmf_rows,
                                   index=nmf_labels,
                                   columns=shifted_ct_expr.columns)

        return ct_expr_df, celltype_pcs, celltype_cs
Exemplo n.º 22
0
    def work(self, permutation_orders):
        """
        Method that does the interaction analysis.

        For each eQTL in this worker's slice (self.n_eqtls rows starting at
        self.skip_rows) and for each covariate, a null model (intercept,
        genotype, technical covariates, technical covariate x genotype terms
        and the covariate itself) is compared with an alternative model that
        additionally contains the covariate x genotype interaction term.
        The comparison is repeated for every sample order in
        permutation_orders.

        :param permutation_orders: iterable of sample orders, each used to
                                   positionally reorder the covariate values
                                   before the interaction term is calculated.
                                   According to the comment in the loop below
                                   the first order is the normal order and the
                                   remainder are random shuffles.
        :return storage: object, a storage object containing the results.
                         It is returned early, before storage.store_row() is
                         called for the eQTL being processed, as soon as
                         time.time() exceeds self.panic_time.
        """
        # Load the data
        print("Loading data", flush=True)
        cov_df = load_dataframe(self.cov_inpath, header=0, index_col=0)

        # Only read the rows of this slice: skip the first self.skip_rows
        # data rows (row 0 is the header) and read self.n_eqtls rows.
        geno_df = load_dataframe(
            self.geno_inpath,
            header=0,
            index_col=0,
            skiprows=[i for i in range(1, self.skip_rows + 1)],
            nrows=self.n_eqtls)
        expr_df = load_dataframe(
            self.expr_inpath,
            header=0,
            index_col=0,
            skiprows=[i for i in range(1, self.skip_rows + 1)],
            nrows=self.n_eqtls)

        # Drop the covariates we don't want.
        if len(self.drop_covs) > 0:
            cov_df.drop(self.drop_covs, axis=0, inplace=True)

        # Split the covariate table into covariates of interest and technical
        # covariates.
        print("Extracting technical covariates data frame")
        tech_cov_df = cov_df.loc[self.tech_covs, :].copy()
        print("\tShape: {}".format(tech_cov_df.shape))

        # Replace -1 with NaN in the genotype dataframe. This way we can
        # drop missing values.
        geno_df.replace(-1, np.nan, inplace=True)

        # Initialize the storage object.
        print("Creating storage object")
        tech_cov_names = []
        cov_names = []
        for rowname in cov_df.index:
            if rowname in self.tech_covs:
                tech_cov_names.append(rowname)
            else:
                cov_names.append(rowname)
        storage = Storage(tech_covs=tech_cov_names, covs=cov_names)
        storage.print_info()

        # Start working.
        # row_index is the position in the loaded slice, eqtl_index the
        # position in the complete input file.
        print("Starting interaction analyser", flush=True)
        for row_index, eqtl_index in enumerate(
            [i for i in range(self.skip_rows, self.skip_rows + self.n_eqtls)]):
            print("\tProcessing eQTL {}/{} "
                  "[{:.0f}%]".format(row_index + 1, self.n_eqtls,
                                     (100 / self.n_eqtls) * (row_index + 1)),
                  flush=True)

            # Get the complete genotype row for the permutation later.
            genotype_all = geno_df.iloc[row_index, :].copy()

            # Get the column positions of the samples with a non-missing
            # genotype.
            indices = np.arange(geno_df.shape[1])
            eqtl_indices = indices[~geno_df.iloc[row_index, :].isnull().values]

            # Subset the row and present samples for this eQTL.
            genotype = geno_df.iloc[row_index, eqtl_indices].copy()
            expression = expr_df.iloc[row_index, eqtl_indices].copy()
            technical_covs = tech_cov_df.iloc[:, eqtl_indices].copy()
            covariates = cov_df.iloc[:, eqtl_indices].copy()

            # Create the null model. Null model are all the technical
            # covariates multiplied with the genotype + the SNP.
            tech_inter_matrix = technical_covs.mul(genotype, axis=1)
            tech_inter_matrix.index = [
                "{}_X_SNP".format(x) for x in technical_covs.index
            ]
            intercept = pd.DataFrame(1,
                                     index=genotype.index,
                                     columns=["intercept"])
            # Join the parts on their sample index into one samples x terms
            # matrix.
            base_matrix = reduce(
                lambda left, right: pd.
                merge(left, right, left_index=True, right_index=True), [
                    intercept,
                    genotype.to_frame(), technical_covs.T, tech_inter_matrix.T
                ])

            # Initialize variables.
            storage.add_row(eqtl_index, genotype.name)

            # Loop over the covariates.
            for cov_index in range(len(cov_df.index)):
                # Stop processing this eQTL once an error has been flagged.
                if storage.has_error():
                    break

                # Get the covariate we are processing.
                covariate = covariates.iloc[cov_index, :]
                cov_name = covariate.name

                if self.verbose:
                    print("\t\tWorking on '{}'".format(cov_name), flush=True)

                # Add the covariate to the null matrix if it isn't already.
                null_matrix = base_matrix.copy()
                if cov_name not in null_matrix.columns:
                    covariate_df = covariate.copy()
                    null_matrix = null_matrix.merge(covariate_df.to_frame(),
                                                    left_index=True,
                                                    right_index=True)

                # Create the null model.
                n_null = null_matrix.shape[0]
                df_null, rss_null, _ = self.create_model(
                    null_matrix, expression)

                # if self.verbose:
                #     print("\t\tn_null: {}\tdf_null: {}\trss_null: {}\t".format(n_null, df_null, rss_null))

                # Loop over each permutation sample order. The first order
                # is the normal order and the remainder are random shuffles.
                for order_id, sample_order in enumerate(permutation_orders):
                    if storage.has_error():
                        break

                    if self.verbose:
                        print("\t\t\tWorking on 'order_{}'".format(order_id),
                              flush=True)

                    # Reorder the covariate based on the sample order.
                    # Make sure the labels are in the same order, just
                    # shuffle the values.
                    covariate_all = cov_df.iloc[cov_index, :].copy()
                    covariate_all_index = covariate_all.index
                    covariate_all = covariate_all.reindex(
                        covariate_all.index[sample_order])
                    covariate_all.index = covariate_all_index

                    # Calculate the interaction effect of the covariate of
                    # interest. Then drop the NA's from the interaction
                    # term.
                    inter_of_interest = covariate_all * genotype_all
                    inter_name = "{}_X_SNP".format(cov_name)
                    # A technical covariate already has its interaction term
                    # in the null matrix; use a distinct column name.
                    if inter_name in null_matrix.columns:
                        inter_name = inter_name + "_2"
                    inter_of_interest.name = inter_name
                    inter_of_interest = inter_of_interest.iloc[eqtl_indices]

                    # Make sure the interaction term covers exactly the same
                    # samples, in the same order, as the null matrix.
                    if not inter_of_interest.index.equals(null_matrix.index):
                        print("\t\t\tError in permutation reordering "
                              "(ID: {})".format(order_id),
                              flush=True)
                        storage.set_error()
                        continue

                    # Create the alternative matrix and add the interaction
                    # term.
                    alt_matrix = null_matrix.copy()
                    alt_matrix = alt_matrix.merge(inter_of_interest.to_frame(),
                                                  left_index=True,
                                                  right_index=True)

                    # Create the alternative model.
                    n_alt = alt_matrix.shape[0]
                    df_alt, rss_alt, alt_tvalues = self.create_model(
                        alt_matrix,
                        expression,
                        tvalue_cols=[genotype.name, inter_name])

                    # if self.verbose:
                    #     print("\t\t\tn_alt: {}\tdf_alt: {}\trss_alt: {}\talt_tvalues: {}".format(n_alt, df_alt, rss_alt, alt_tvalues))

                    # Save the t-values.
                    storage.add_value(cov_name, order_id, "snp_tvalue",
                                      alt_tvalues[genotype.name])
                    storage.add_value(cov_name, order_id, "inter_tvalue",
                                      alt_tvalues[inter_name])

                    # Make sure the n's are identical.
                    if n_null != n_alt:
                        print("\t\t\tError due to unequal n_null and n_alt",
                              flush=True)
                        storage.set_error()
                        continue

                    # Compare the null and alternative model.
                    fvalue = self.calc_f_value(rss_null, rss_alt, df_null,
                                               df_alt, n_null)
                    pvalue = self.get_p_value(fvalue, df_null, df_alt, n_null)

                    # if self.verbose:
                    #     print("\t\t\tfvalue: {}\tpvalue: {}".format(fvalue, pvalue))

                    # Save the p-values.
                    storage.add_value(cov_name, order_id, "pvalue", pvalue)

                    # Check whether we are almost running out of time. If so,
                    # return what has been gathered so far; store_row() is not
                    # called for the eQTL that is being processed.
                    if time.time() > self.panic_time:
                        print("\tPanic!!!", flush=True)
                        return storage

            # Save the results of the eQTL.
            storage.store_row()

        return storage