Example #1
    def combine_files(self):
        combined = None
        for gte_inpath in glob.glob(self.inpath):
            gte_file = os.path.basename(gte_inpath).replace(".txt", "")
            if gte_file in self.exclude_files:
                continue
            df = load_dataframe(inpath=gte_inpath,
                                header=None,
                                index_col=None,
                                logger=self.log)
            df["dataset"] = gte_file
            if combined is None:
                combined = df
            else:
                combined = pd.concat([combined, df], axis=0, ignore_index=True)

        # Remove duplicate entries.
        combined.drop_duplicates(inplace=True)

        # Remove samples.
        if self.exclude_samples_path is not None:
            sample_exclude_df = load_dataframe(
                inpath=self.exclude_samples_path,
                header=None,
                index_col=None,
                logger=self.log)
            pre_shape = combined.shape[0]
            combined = combined.loc[
                ~combined.iloc[:, 1].isin(sample_exclude_df.iloc[:, 0]), :]
            self.log.warning("\tRemoving {} samples".format(
                pre_shape - combined.shape[0]))

        return combined
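
Note: the `load_dataframe` helper used throughout these examples is not shown on this page. A minimal sketch of what it presumably wraps, assuming tab-separated input and an optional logger (signature inferred from the call sites above, not the actual implementation):

import os
import pandas as pd

def load_dataframe(inpath, header=0, index_col=0, logger=None, **kwargs):
    # Assumed wrapper around pandas.read_csv; the real helper may differ.
    df = pd.read_csv(inpath, sep="\t", header=header, index_col=index_col,
                     **kwargs)
    if logger is not None:
        logger.info("\tLoaded dataframe: {} with shape: {}".format(
            os.path.basename(inpath), df.shape))
    return df

The matching `save_dataframe` calls suggest a symmetric wrapper around `df.to_csv(outpath, sep="\t", ...)`.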
Example #2
    def start(self):
        self.log.info("Correcting expression data for dataset effects.")
        self.print_arguments()

        self.log.info("Correcting signature expression data.")
        if not check_file_exists(self.sign_expr_dc_outpath) or self.force:
            if self.dataset_df is None:
                self.dataset_df = load_dataframe(self.dataset_file,
                                                 header=0,
                                                 index_col=0,
                                                 logger=self.log)

            if self.sign_expr_df is None:
                self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                                   header=0,
                                                   index_col=0,
                                                   logger=self.log)

            self.sign_expr_dc_df = self.dataset_correction(
                self.sign_expr_df, self.dataset_df)
            save_dataframe(df=self.sign_expr_dc_df,
                           outpath=self.sign_expr_dc_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
        else:
            self.log.info("\tSkipping step.")
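
The `dataset_correction` method itself is not part of this example. One common way to remove dataset effects, shown here only as an illustrative sketch (the actual method may differ), is to regress dataset indicator columns out of each gene and keep the residuals:

import numpy as np
import pandas as pd

def dataset_correction_sketch(expr_df, dataset_df):
    # expr_df: genes x samples; dataset_df: samples x dataset dummies.
    # Hypothetical illustration, not the implementation used above.
    X = np.hstack([np.ones((dataset_df.shape[0], 1)), dataset_df.to_numpy()])
    Y = expr_df.to_numpy().T  # samples x genes
    betas, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)
    residuals = Y - X @ betas
    return pd.DataFrame(residuals.T, index=expr_df.index,
                        columns=expr_df.columns)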
Example #3
    def create_covs_file(self):
        # read the eigenvectors file.
        self.log.info("Loading eigenvectors matrix.")
        eigen_df = load_dataframe(self.eig_file,
                                  header=0,
                                  index_col=0,
                                  nrows=max(self.n_eigen),
                                  logger=self.log)
        if len(set(self.sample_order).intersection(eigen_df.columns)) == 0:
            eigen_df = eigen_df.T
        eigen_df.columns = [
            self.sample_dict[x] if x in self.sample_dict else x
            for x in eigen_df.columns
        ]
        eigen_df = eigen_df.loc[:, self.sample_order]

        for n_eigen in self.n_eigen:
            save_dataframe(df=eigen_df.iloc[:n_eigen, :],
                           outpath=os.path.join(
                               self.outdir,
                               "first{}PCComponents.txt.gz".format(n_eigen)),
                           index=True,
                           header=True,
                           logger=self.log)

        # loading deconvolution matrix.
        self.log.info("Loading deconvolution matrix.")
        if self.decon_df is None:
            self.decon_df = load_dataframe(self.decon_file,
                                           header=0,
                                           index_col=0,
                                           logger=self.log)

        # merge.
        self.log.info("Merging matrices.")
        covs_df = pd.merge(eigen_df.T,
                           self.decon_df,
                           left_index=True,
                           right_index=True)
        covs_df = covs_df.T
        covs_df.index.name = "-"

        # Validate sample order.
        if list(covs_df.columns) != self.sample_order:
            covs_df = covs_df[self.sample_order]

        # Remove old dataframes.
        del eigen_df

        return covs_df
Example #4
    def normal_transform(self):
        # loading deconvolution matrix.
        if self.df is None:
            self.log.info("Loading matrix.")
            self.df = load_dataframe(self.inpath,
                                     header=0,
                                     index_col=0,
                                     logger=self.log)

        new_data = []
        match = 0
        count = 0
        print("Processing data.")
        for i, (index, row) in enumerate(self.df.iterrows()):
            if (i == 0) or (i % self.print_interval == 0):
                print("\tprocessed {}/{} [{:.2f}%] lines".format(
                    i, self.df.shape[0], (100 / self.df.shape[0]) * i))
                print("\t\t{}/{} are identical".format(match, count))

            zscores1 = self.rank_method1(index, row)
            zscores2 = self.rank_method2(index, row)

            if np.array_equal(zscores1, zscores2):
                match += 1
            count += 1

            new_data.append(zscores1)

        print("\tFinished: {}/{} rows are identical".format(match, count))
        return pd.DataFrame(new_data,
                            index=self.df.index,
                            columns=self.df.columns)
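
The `rank_method1` and `rank_method2` helpers are not shown; the loop above only checks that they agree. A standard rank-based inverse normal transform, which is presumably what both compute (a sketch using the Blom offset; the helpers may use a different constant):

import numpy as np
from scipy import stats

def rank_inverse_normal(row, c=3.0 / 8):
    # Rank-based inverse normal transform: rank, rescale into (0, 1), then
    # map through the standard normal quantile function.
    ranks = stats.rankdata(row)
    return stats.norm.ppf((ranks - c) / (len(row) - 2 * c + 1))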
Example #5
    def start(self):
        self.log.info("Starting creating cohort matrix.")
        self.print_arguments()

        # Check if the output file exists.
        if not check_file_exists(self.outpath) or self.force:
            # Load the sample info.
            self.log.info("Loading sample information matrix.")
            self.sample_info_df = load_dataframe(inpath=self.inpath,
                                                 header=0,
                                                 index_col=None,
                                                 low_memory=False,
                                                 logger=self.log)

            # Construct sample-cohort dict.
            self.log.info("Creating sample to cohort dict.")
            sample_cohort_dict = construct_dict_from_df(
                self.sample_info_df, self.sample_id, self.cohort_id)

            # Create cohort dataframe.
            self.log.info("Constructing cohort matrix.")
            self.cohort_df = self.create_cohort_df(self.sample_dict,
                                                   self.sample_order,
                                                   sample_cohort_dict)
            self.save()
        else:
            self.log.info("Skipping step.")
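
`construct_dict_from_df` maps the values of one column onto another. A minimal sketch of such a helper, inferred from how it is called here (two column names, returning a key-to-value dict):

def construct_dict_from_df(df, key_col, value_col):
    # Map each entry of key_col to the matching entry of value_col.
    return dict(zip(df[key_col], df[value_col]))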
Example #6
    def start(self):
        self.log.info("Filtering technical covariates datafile.")
        self.print_arguments()

        # Check if the output file exists.
        if not check_file_exists(self.outpath) or self.force:
            # Load the sample info.
            self.log.info("Loading covariates matrix.")
            cov_df = load_dataframe(inpath=self.cov_file,
                                    header=0,
                                    index_col=0,
                                    logger=self.log)

            # Filter on samples and technical covariates.
            self.log.info("Filtering on samples and technical covariates.")
            cov_df.index = [
                self.sample_dict[x] if x in self.sample_dict else x
                for x in cov_df.index
            ]
            tech_cov_df = cov_df.loc[self.sample_order, self.tech_covs].copy()
            del cov_df
            self.log.info("\tNew shape: {}".format(tech_cov_df.shape))

            # Remove technical covariates that are linearly dependent.
            self.log.info("Removing linearly dependent column(s).")
            self.tech_covs_df = self.filter_linear_dependent_covs(tech_cov_df)
            self.log.info("\tNew shape: {}".format(self.tech_covs_df.shape))

            self.save()
        else:
            self.log.info("Skipping step.")
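
`filter_linear_dependent_covs` is an assumed helper. One way to implement it is a greedy rank check: keep a column only if adding it increases the matrix rank (a sketch, not necessarily the criterion used here):

import numpy as np

def filter_linear_dependent_covs_sketch(df):
    # Greedily keep columns that raise the rank of the kept submatrix.
    keep = []
    for col in df.columns:
        candidate = df[keep + [col]].to_numpy()
        if np.linalg.matrix_rank(candidate) == len(keep) + 1:
            keep.append(col)
    return df[keep]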
Example #7
    def create_tech_covs_file(self):
        # Load the sample info.
        self.log.info("Loading covariates matrix.")
        cov_df = load_dataframe(inpath=self.cov_file,
                                header=0,
                                index_col=0,
                                logger=self.log)

        # Filter on samples and technical covariates.
        self.log.info("Filtering on samples and technical covariates.")
        cov_df.index = [
            self.sample_dict[x] if x in self.sample_dict else x
            for x in cov_df.index
        ]
        subset_tech_covs_df = cov_df.loc[self.sample_order, self.tech_covs].copy()
        del cov_df
        self.log.info("\tNew shape: {}".format(subset_tech_covs_df.shape))

        # Remove technical covariates that are linearly dependent.
        self.log.info("Removing linearly dependent column(s).")
        subset_tech_covs_df = self.filter_linear_dependent_covs(subset_tech_covs_df)
        self.log.info("\tNew shape: {}".format(subset_tech_covs_df.shape))

        # loading cohort matrix.
        self.log.info("Loading cohort matrix.")
        if self.cohort_df is None:
            self.cohort_df = load_dataframe(self.cohort_file,
                                            header=0,
                                            index_col=0,
                                            logger=self.log)

        # merge.
        self.log.info("Merging matrices.")
        tech_covs_df = pd.merge(subset_tech_covs_df, self.cohort_df.T,
                                left_index=True, right_index=True)
        tech_covs_df = tech_covs_df.T
        tech_covs_df.index.name = "-"

        # Validate sample order.
        if list(tech_covs_df.columns) != self.sample_order:
            tech_covs_df = tech_covs_df[self.sample_order]

        return tech_covs_df
Example #8
    def create_covs_file(self):
        # read the eigenvectors file.
        self.log.info("Loading eigenvectors matrix.")
        eigen_df = load_dataframe(self.eig_file,
                                  header=0,
                                  index_col=0,
                                  nrows=self.n_eigen,
                                  logger=self.log)
        eigen_df.columns = [
            self.sample_dict[x] if x in self.sample_dict else x
            for x in eigen_df.columns
        ]
        eigen_df = eigen_df.loc[:, self.sample_order]

        # loading deconvolution matrix.
        self.log.info("Loading deconvolution matrix.")
        if self.decon_df is None:
            self.decon_df = load_dataframe(self.decon_file,
                                           header=0,
                                           index_col=0,
                                           logger=self.log)

        # merge.
        self.log.info("Merging matrices.")
        covs_df = pd.merge(eigen_df.T,
                           self.decon_df,
                           left_index=True,
                           right_index=True)
        covs_df = covs_df.T
        covs_df.index.name = "-"

        # Validate sample order.
        if list(covs_df.columns) != self.sample_order:
            covs_df = covs_df[self.sample_order]

        # Remove old dataframes.
        del eigen_df

        return covs_df
Example #9
    def load_data(self):
        print("Loading covariates.")
        covariates_df = load_dataframe(self.covariates_inpath,
                                       index_col=0,
                                       header=0)

        print("Loading matrix header.")
        matrix_df = load_dataframe(self.matrix_inpath,
                                   index_col=0,
                                   header=0,
                                   nrows=0)

        print("Loading sample_dict.")
        sample_dict = None
        if self.sample_dict_inpath is not None:
            sample_dict_df = load_dataframe(self.sample_dict_inpath,
                                            index_col=None,
                                            header=0)

            sample_dict = construct_dict_from_df(sample_dict_df,
                                                 sample_dict_df.columns[0],
                                                 sample_dict_df.columns[1])

        return covariates_df, matrix_df, sample_dict
Example #10
    def combine_files(self):
        combined = None
        for infile in glob.glob(self.inpath):
            df = load_dataframe(inpath=infile,
                                header=None,
                                index_col=None,
                                logger=self.log)
            if combined is None:
                combined = df
            else:
                combined = pd.concat([combined, df], axis=0, ignore_index=True)

        # Remove duplicate entries.
        combined.drop_duplicates(inplace=True)

        return combined
Example #11
    def combine_files(self):
        combined = None
        for i in range(1, self.n_iterations + 1):
            infile = os.path.join(self.indir, self.iter_dirname + str(i),
                                  self.in_filename)
            df = load_dataframe(inpath=infile,
                                header=0,
                                index_col=False,
                                logger=self.log)
            df["Iteration"] = i
            if combined is None:
                combined = df
            else:
                combined = pd.concat([combined, df], axis=0, ignore_index=True)

        # Remove duplicate entries.
        combined.drop_duplicates(inplace=True)

        return combined
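
Growing `combined` with repeated pd.concat copies the accumulated data on every iteration. Collecting the pieces in a list and concatenating once is the idiomatic alternative (sketch; `load_df` and `input_files` are hypothetical stand-ins for whichever loader and file list apply):

import pandas as pd

frames = [load_df(f) for f in input_files]  # hypothetical loader and list
combined = pd.concat(frames, axis=0, ignore_index=True).drop_duplicates()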
Example #12
    def start(self):
        self.log.info("Starting combining GTE files.")
        self.print_arguments()

        # Check if the GTE output file exists.
        if check_file_exists(self.outpath) and not self.force:
            self.log.info("Skipping step, loading result.")
            self.gte_df = load_dataframe(inpath=self.outpath,
                                         header=None,
                                         index_col=None,
                                         logger=self.log)
        else:
            # Load each GTE file.
            self.log.info("Loading GTE files.")
            self.gte_df = self.combine_files()
            self.save()

        # Construct sample translate dict.
        self.sample_dict = self.create_sample_dict()
        self.sample_order = list(self.gte_df.iloc[:, 1])
        self.dataset_to_samples_dict = self.set_dataset_to_samples_dict()
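
`create_sample_dict` is not shown. Since `sample_order` is taken from the second column, the translate dict presumably maps first-column (genotype) IDs onto second-column (expression) IDs; a sketch under that assumption:

def create_sample_dict_sketch(gte_df):
    # Hypothetical: first GTE column -> second GTE column.
    return dict(zip(gte_df.iloc[:, 0], gte_df.iloc[:, 1]))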
Example #13
    def prepare_matrix(self):
        self.log.info("\tLoading matrix.")
        df = load_dataframe(self.inpath,
                            header=0,
                            index_col=0,
                            logger=self.log)

        self.log.info("\tPreprocessing.")
        df.columns = [
            self.sample_dict[x] if x in self.sample_dict else x
            for x in df.columns
        ]

        self.log.info("\tChecking overlap.")
        overlap = [x for x in self.sample_order if x in df.columns]
        self.log.info("\t\t{}/{} [{:.2f}%] samples found".format(
            len(overlap), len(self.sample_order),
            (100 / len(self.sample_order)) * len(overlap)))

        self.log.info("\tSubsetting.")
        subset = df.loc[:, overlap]

        missing = set(self.sample_order) - set(subset.columns)
        if len(missing) > 0:
            self.log.info(
                "\tCompleting data frame, adding {} missing sample columns.".
                format(len(missing)))
            missing_df = pd.DataFrame(np.nan, index=df.index,
                                      columns=sorted(missing))
            subset = subset.merge(missing_df,
                                  left_index=True,
                                  right_index=True)

        if subset.columns.to_list() != self.sample_order:
            self.log.info("\tReordering columns.")
            subset = subset.loc[:, self.sample_order]

        return subset
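
The subset, merge-in-missing-columns, and reorder steps above can be collapsed into a single reindex call, which NaN-fills absent samples and fixes the column order in one pass (equivalent sketch, applied after the sample_dict renaming):

subset = df.reindex(columns=self.sample_order)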
Example #14
    def start(self):
        self.log.info("Starting creating matrices.")
        self.print_arguments()

        if self.eqtl_df is None:
            self.eqtl_df = load_dataframe(self.eqtl_file,
                                          header=0,
                                          index_col=None,
                                          logger=self.log)

        self.log.info("Parsing genotype input data.")
        if not check_file_exists(self.geno_outpath) or not check_file_exists(
                self.alleles_outpath) or self.force:
            alleles_df, geno_df = self.parse_genotype_file()

            self.log.info("Reorder, filter, and save.")
            self.alleles_df = alleles_df.loc[self.eqtl_df.loc[:, "SNPName"], :]
            save_dataframe(df=self.alleles_df,
                           outpath=self.alleles_outpath,
                           index=True,
                           header=True,
                           logger=self.log)

            self.geno_df = geno_df.loc[self.eqtl_df.loc[:, "SNPName"],
                                       self.sample_order]
            save_dataframe(df=self.geno_df,
                           outpath=self.geno_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
        else:
            self.log.info("\tSkipping step.")

        self.log.info("Parsing expression input data.")
        if not check_file_exists(self.expr_outpath) or not check_file_exists(
                self.sign_expr_outpath) or self.force:
            self.log.info("Loading signature matrix.")
            self.sign_df = load_dataframe(inpath=self.sign_file,
                                          header=0,
                                          index_col=0,
                                          logger=self.log)
            signature_genes = set(self.sign_df.index.to_list())

            self.log.info("Loading gene translation dict.")
            self.gene_info_df = load_dataframe(inpath=self.gene_info_file,
                                               header=0,
                                               index_col=None,
                                               logger=self.log)
            gene_trans_dict = construct_dict_from_df(self.gene_info_df,
                                                     self.ensg_id,
                                                     self.hgnc_id)

            if not check_file_exists(self.expr_outpath) or self.force:
                self.log.info("Parsing expression data.")
                self.expr_df, self.sign_expr_df = self.parse_expression_file(
                    self.expr_file,
                    signature_genes,
                    gene_trans_dict,
                    include_decon=self.decon_expr_file is None)

            if (not check_file_exists(self.sign_expr_outpath) or
                    self.force) and (check_file_exists(self.decon_expr_file)):
                self.log.info("Parsing deconvolution expression data.")
                self.log.warning(
                    "Using different expression file for deconvolution.")
                _, self.sign_expr_df = self.parse_expression_file(
                    self.decon_expr_file,
                    signature_genes,
                    gene_trans_dict,
                    include_expr=False,
                    remove_ens_version=True)

            self.log.info("Reorder, filter, and save.")
            if self.expr_df is not None:
                self.expr_df = self.expr_df.loc[self.eqtl_df.loc[:,
                                                                 "ProbeName"],
                                                self.sample_order]
                save_dataframe(df=self.expr_df,
                               outpath=self.expr_outpath,
                               index=True,
                               header=True,
                               logger=self.log)
            if self.sign_expr_df is not None:
                self.sign_expr_df = self.sign_expr_df.loc[:, self.sample_order]
                save_dataframe(df=self.sign_expr_df,
                               outpath=self.sign_expr_outpath,
                               index=True,
                               header=True,
                               logger=self.log)
        else:
            self.log.info("\tSkipping step.")
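
`parse_expression_file` is an assumed helper. For large expression matrices, a chunked reader that keeps only signature genes is one plausible shape for it (a hypothetical sketch, not the actual parser):

import pandas as pd

def parse_expression_sketch(inpath, signature_genes, chunksize=1000):
    # Stream the file, keeping only rows whose gene ID is in the signature.
    kept = []
    for chunk in pd.read_csv(inpath, sep="\t", header=0, index_col=0,
                             chunksize=chunksize):
        kept.append(chunk.loc[chunk.index.isin(signature_genes)])
    return pd.concat(kept, axis=0)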
Example #15
    def work(self, permutation_orders):
        # Load the data
        print("Loading data", flush=True)
        tech_covs_df = load_dataframe(self.tech_covs_inpath,
                                      header=0,
                                      index_col=0)
        covs_df = load_dataframe(self.covs_inpath, header=0, index_col=0)

        geno_df = load_dataframe(
            self.geno_inpath,
            header=0,
            index_col=0,
            skiprows=range(1, self.skip_rows + 1),
            nrows=self.n_eqtls)
        expr_df = load_dataframe(
            self.expr_inpath,
            header=0,
            index_col=0,
            skiprows=range(1, self.skip_rows + 1),
            nrows=self.n_eqtls)

        # Validate the dataframes match up.
        dfs = [tech_covs_df, covs_df, geno_df, expr_df]
        for (a, b) in list(itertools.combinations(dfs, 2)):
            if a is not None and b is not None and \
                    not a.columns.identical(b.columns):
                print("Sample order is not identical across data frames.")
                exit()

        # Replace -1 with NaN in the genotype dataframe. This way we can
        # drop missing values.
        geno_df.replace(-1, np.nan, inplace=True)

        # Initialize the storage object.
        print("Creating storage object")
        storage = StorageContainer(colnames=covs_df.index.to_list())

        # Start working.
        print("Starting interaction analyser", flush=True)
        for row_index, eqtl_index in enumerate(
                range(self.skip_rows, self.skip_rows + self.n_eqtls)):
            print("\tProcessing eQTL {}/{} "
                  "[{:.0f}%]".format(row_index + 1, self.n_eqtls,
                                     (100 / self.n_eqtls) * (row_index + 1)),
                  flush=True)
            start_time = time.time()

            # Get the complete genotype row for the permutation later.
            genotype_all = geno_df.iloc[row_index, :].copy()

            # Get the missing genotype indices.
            indices = np.arange(geno_df.shape[1])
            eqtl_indices = indices[~geno_df.iloc[row_index, :].isnull().values]

            # Subset the row and present samples for this eQTL.
            genotype = geno_df.iloc[row_index, eqtl_indices].copy()
            expression = expr_df.iloc[row_index, eqtl_indices].copy()
            technical_covs = tech_covs_df.iloc[:, eqtl_indices].copy()

            # Initialize variables.
            storage.add_row(eqtl_index, "{}_{}".format(genotype.name,
                                                       expression.name))

            # Create the base model: intercept + SNP + all technical
            # covariates (plus their genotype interactions if requested).
            intercept = pd.DataFrame(1,
                                     index=genotype.index,
                                     columns=["intercept"])
            base_matrix = reduce(
                lambda left, right: pd.merge(
                    left, right, left_index=True, right_index=True),
                [intercept, genotype.to_frame(), technical_covs.T])
            if self.correct_snp_tc_inter:
                tech_inter_matrix = technical_covs.mul(genotype, axis=1)
                tech_inter_matrix.index = [
                    "{}_X_SNP".format(x) for x in technical_covs.index
                ]
                base_matrix = base_matrix.merge(tech_inter_matrix.T,
                                                left_index=True,
                                                right_index=True)

            # Regress out the base model from the expression values.
            expression_hat = self.remove_covariates(expression, base_matrix)

            # Loop over the covariates.
            for cov_index in range(len(covs_df.index)):
                if storage.has_error():
                    break

                # Get the covariate we are processing.
                covariate = covs_df.iloc[cov_index, eqtl_indices].copy()
                cov_name = covariate.name

                if self.verbose:
                    print("\t\tWorking on '{}'".format(cov_name), flush=True)

                # Create the null model.
                null_matrix = covariate.to_frame()
                n_null = null_matrix.shape[0]
                df_null, rss_null, _, _ = self.create_model(
                    null_matrix, expression_hat)

                if self.verbose:
                    print("\t\tn_null: {}\tdf_null: {}\trss_null: {}\t".format(
                        n_null, df_null, rss_null))

                # Loop over each permutation sample order. The first order
                # is the normal order and the remainder are random shuffles.
                for order_id, sample_order in enumerate(permutation_orders):
                    if storage.has_error():
                        break

                    if self.verbose:
                        print("\t\t\tWorking on 'order_{}'".format(order_id),
                              flush=True)

                    # Reorder the covariate based on the sample order.
                    # Make sure the labels are in the same order, just
                    # shuffle the values.
                    covariate_all = covs_df.iloc[cov_index, :].copy()
                    if sample_order is not None:
                        covariate_all_index = covariate_all.index
                        covariate_all = covariate_all.reindex(
                            covariate_all.index[sample_order])
                        covariate_all.index = covariate_all_index

                    # Calculate the interaction effect of the covariate of
                    # interest. Then drop the NA's from the interaction
                    # term.
                    inter_of_interest = covariate_all * genotype_all
                    inter_name = "{}_X_SNP".format(cov_name)
                    inter_of_interest.name = inter_name
                    inter_of_interest = inter_of_interest.iloc[eqtl_indices]

                    del covariate_all

                    # Check if the drop is identical (see above).
                    if not inter_of_interest.index.equals(null_matrix.index):
                        print("\t\t\tError in permutation reordering "
                              "(ID: {})".format(order_id),
                              flush=True)
                        storage.set_error()
                        continue

                    # Create the alternative matrix and add the interaction
                    # term.
                    alt_matrix = null_matrix.copy()
                    alt_matrix = alt_matrix.merge(inter_of_interest.to_frame(),
                                                  left_index=True,
                                                  right_index=True)

                    del inter_of_interest

                    # Create the alternative model.
                    n_alt = alt_matrix.shape[0]
                    df_alt, rss_alt, coefficients_alt, std_errors_alt = self.create_model(
                        alt_matrix, expression_hat, cols=[inter_name])

                    del alt_matrix

                    if self.verbose:
                        print(
                            "\t\t\tn_alt: {}\tdf_alt: {}\trss_alt: {}\talt_coefficients: {}\talt_std_errors: {}"
                            .format(n_alt, df_alt, rss_alt, coefficients_alt,
                                    std_errors_alt))

                    # Make sure the n's are identical.
                    if n_null != n_alt:
                        print("\t\t\tError due to unequal n_null and n_alt",
                              flush=True)
                        storage.set_error()
                        continue

                    # Save the coefficient and std error.
                    storage.add_coefficient(order_id,
                                            coefficients_alt[inter_name])
                    storage.add_std_error(order_id, std_errors_alt[inter_name])

                    # Compare the null and alternative model.
                    fvalue = self.calc_f_value(rss_null, rss_alt, df_null,
                                               df_alt, n_null)
                    pvalue = self.get_p_value(fvalue, df_null, df_alt, n_null)

                    if self.verbose:
                        print("\t\t\tfvalue: {}\tpvalue: {}".format(
                            fvalue, pvalue))

                    # Save the p-values.
                    storage.add_pvalue(order_id, pvalue)

                    del fvalue, pvalue

                    # Check whether we are almost running out of time.
                    if time.time() > self.panic_time:
                        print("\tPanic!!!", flush=True)
                        return storage

            # Save the results of the eQTL.
            storage.store_row()

            # Print the time.
            print("\t\tfinished in {:.4f} second(s).".format(
                time.time() - start_time),
                  flush=True)

        return storage
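
`calc_f_value` and `get_p_value` are assumed helpers. Assuming `df_null`/`df_alt` count model parameters, the standard nested-model F-test they presumably implement looks like this (sketch):

from scipy import stats

def calc_f_value(rss_null, rss_alt, df_null, df_alt, n):
    # F-statistic comparing nested OLS models.
    return ((rss_null - rss_alt) / (df_alt - df_null)) / \
        (rss_alt / (n - df_alt))

def get_p_value(fvalue, df_null, df_alt, n):
    # Upper-tail probability of the F distribution.
    return stats.f.sf(fvalue, df_alt - df_null, n - df_alt)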
Example #16
    def perform_deconvolution(self):
        if self.sign_df is None:
            # Load the celltype profile file.
            self.log.info("Loading cell type profile matrix.")
            self.sign_df = load_dataframe(self.sign_file,
                                          header=0,
                                          index_col=0,
                                          logger=self.log)

        if self.sign_expr_df is None:
            # Load the celltype expression file.
            self.log.info("Loading cell type expression matrix.")
            self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                               header=0,
                                               index_col=0,
                                               logger=self.log)

        # Filter uninformative genes from the signature matrix.
        sign_df = self.filter(self.sign_df, cutoff=self.min_expr_cutoff)

        # Subset and reorder.
        sign_df, expr_df = self.subset(sign_df, self.sign_expr_df)

        # Transform.
        sign_df = self.perform_log2_transform(sign_df)

        # Shift the data to be positive.
        self.log.info("Shifting data to be positive if required")
        if sign_df.values.min() < 0:
            self.log.warning("\tSignature matrix is shifted.")
            sign_df = self.perform_shift(sign_df)

        if expr_df.values.min() < 0:
            self.log.warning("\tExpression matrix is shifted.")
            expr_df = self.perform_shift(expr_df)

        self.log.info("Signature shape: {}".format(sign_df.shape))
        self.log.info("Expression shape: {}".format(expr_df.shape))

        # Perform deconvolution per sample.
        self.log.info("Performing partial deconvolution.")
        decon_data = []
        residuals_data = []
        recon_accuracy_data = []
        for _, sample in expr_df.T.iterrows():
            # Model.
            proportions, rnorm = self.nnls(sign_df, sample)

            # Calculate reconstruction accuracy.
            recon_accuracy = self.calc_reconstruction_accuracy(
                y=sample, X=sign_df, betas=proportions)

            # Save.
            decon_data.append(proportions)
            residuals_data.append(rnorm)
            recon_accuracy_data.append(recon_accuracy)

        decon_df = pd.DataFrame(decon_data,
                                index=expr_df.columns,
                                columns=sign_df.columns)
        residuals_df = pd.Series(residuals_data, index=expr_df.columns)
        recon_accuracy = pd.Series(recon_accuracy_data, index=expr_df.columns)

        self.log.info("Estimated weights:")
        self.log.info(decon_df.mean(axis=0))
        self.log.info(
            "Average reconstruction accuracy: {:.2f} [SD: {:.2f}]".format(
                recon_accuracy.mean(), recon_accuracy.std()))

        save_dataframe(df=decon_df,
                       outpath=os.path.join(self.outdir, "NNLS_betas.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)

        # Make the weights sum up to 1.
        decon_df = self.sum_to_one(decon_df)
        self.log.info("Estimated proportions:")
        self.log.info(decon_df.mean(axis=0))

        # Calculate the average residuals.
        self.log.info("Average residual: {:.2f}".format(residuals_df.mean()))

        save_dataframe(df=decon_df,
                       outpath=os.path.join(
                           self.outdir, "deconvolution_table_complete.txt.gz"),
                       index=True,
                       header=True,
                       logger=self.log)

        if self.cell_type_groups is not None:
            self.log.info("Summing cell types.")
            cell_type_group = np.array(
                [self.cell_type_groups.get(ct, ct) for ct in decon_df.columns],
                dtype=object)
            cell_types = sorted(set(cell_type_group))
            summed_decon_df = pd.DataFrame(np.nan,
                                           index=decon_df.index,
                                           columns=cell_types)
            for ct_group in cell_types:
                summed_decon_df[ct_group] = decon_df.loc[
                    :, cell_type_group == ct_group].sum(axis=1)

            decon_df = summed_decon_df

        return decon_df
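
`calc_reconstruction_accuracy` is an assumed helper; a common definition is one minus the normalized residual sum of squares of the NNLS reconstruction (sketch under that assumption):

import numpy as np

def calc_reconstruction_accuracy_sketch(y, X, betas):
    # 1 - RSS/TSS of the reconstruction X @ betas versus y.
    y = np.asarray(y, dtype=float)
    residuals = y - np.asarray(X, dtype=float) @ np.asarray(betas, dtype=float)
    return 1 - np.sum(residuals ** 2) / np.sum((y - y.mean()) ** 2)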
Example #17
    def perform_deconvolution(self):
        if self.sign_df is None:
            # Load the celltype profile file.
            self.log.info("Loading cell type profile matrix.")
            self.sign_df = load_dataframe(self.sign_file,
                                          header=0,
                                          index_col=0,
                                          logger=self.log)

        if self.sign_expr_df is None:
            # Load the celltype expression file.
            self.log.info("Loading cell type expression matrix.")
            self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                               header=0,
                                               index_col=0,
                                               logger=self.log)

        # Filter uninformative genes from the signature matrix.
        sign_df = self.filter(self.sign_df, cutoff=self.min_expr_cutoff)

        # Subset and reorder.
        sign_df, expr_df = self.subset(sign_df, self.sign_expr_df)

        # Transform.
        sign_df = self.perform_log2_transform(sign_df)

        # Shift the data to be positive.
        self.log.info("Shifting data to be positive")
        if sign_df.values.min() < 0:
            self.log.warning("\tSignature matrix is shifted.")
            sign_df = self.perform_shift(sign_df)

        if expr_df.values.min() < 0:
            self.log.warning("\tExpression matrix is shifted.")
            expr_df = self.perform_shift(expr_df)

        self.log.info("Signature shape: {}".format(sign_df.shape))
        self.log.info("Expression shape: {}".format(expr_df.shape))

        # Perform deconvolution per sample.
        self.log.info("Performing partial deconvolution.")
        decon_data = []
        residuals_data = []
        for _, sample in expr_df.T.iterrows():
            proportions, rnorm = self.nnls(sign_df, sample)
            decon_data.append(proportions)
            residuals_data.append(rnorm)

        decon_df = pd.DataFrame(decon_data,
                                index=expr_df.columns,
                                columns=[
                                    "{}NNLS_{}".format(*x.split("_"))
                                    for x in sign_df.columns
                                ])
        residuals_df = pd.Series(residuals_data, index=expr_df.columns)

        self.log.info("Estimated weights:")
        self.log.info(decon_df.mean(axis=0))

        # Make the weights sum up to 1.
        decon_df = self.sum_to_one(decon_df)
        self.log.info("Estimated proportions:")
        self.log.info(decon_df.mean(axis=0))

        # Calculate the average residuals.
        self.log.info("Average residual: {:.2f}".format(residuals_df.mean()))

        return decon_df
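
`self.nnls` and `self.sum_to_one` are assumed helpers. A minimal sketch using scipy.optimize.nnls, with the weights normalized per sample afterwards:

from scipy.optimize import nnls

def nnls_fit(sign_df, sample):
    # Non-negative least squares: sample ~ sign_df @ proportions.
    proportions, rnorm = nnls(sign_df.to_numpy(), sample.to_numpy())
    return proportions, rnorm

def sum_to_one(decon_df):
    # Normalize each sample's weights so they sum to 1.
    return decon_df.divide(decon_df.sum(axis=1), axis=0)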
Example #18
'''
	@author: XT
	fletching.py contains functions for calculating fletching profits
'''

import utilities

df = utilities.load_dataframe()


'''
Onyx bolts:
	- 1 Runite bolts
	- 1 Onyx bolt tips
'''
def OnyxBolts():

	# Buy
	rb = utilities.getPrice('Runite bolts', df)
	obt = utilities.getPrice('Onyx bolt tips', df)

	# Sell
	ob = utilities.getPrice('Onyx bolts', df)

	# Profit
	profit, BUY, SELL = utilities.calculateProfit([rb, obt], [ob])
	print("Profit: {}\n".format(profit))

def DiamondBolts():

	# Buy
Example #19
    def start(self):
        print("Starting interaction analyser - combine and plot.")
        self.print_arguments()

        # Start the timer.
        start_time = time.time()

        print("")
        print("### Step 1 ###")
        print("Combine pickle files into dataframe.", flush=True)
        dataframes = {}
        for filename in [
                self.pvalues_filename, self.coef_filename,
                self.std_err_filename
        ]:
            outpath = os.path.join(self.work_dir,
                                   "{}_table.txt.gz".format(filename))
            if not check_file_exists(outpath) or self.force:
                print("Loading {} data.".format(filename), flush=True)
                columns, data = self.combine_pickles(self.work_dir,
                                                     filename,
                                                     columns=True)

                if len(data) == 0:
                    print("\tNo {} data found.".format(filename))
                    continue

                print("Creating {} dataframe.".format(filename), flush=True)
                df = self.create_df(data, columns)

                print("Saving {} dataframe.".format(filename), flush=True)
                save_dataframe(df=df, outpath=outpath, header=True, index=True)

                dataframes[filename] = df

                del columns, data, df
            else:
                print("Skipping step for {}".format(outpath))
                dataframes[filename] = load_dataframe(outpath,
                                                      header=0,
                                                      index_col=0)

        print("")
        print("### Step 2 ###")
        print("Calculate t-values", flush=True)
        outpath = os.path.join(self.work_dir,
                               "{}_table.txt.gz".format(self.tvalue_filename))
        if not check_file_exists(outpath) or self.force:
            if self.coef_filename in dataframes and self.std_err_filename in dataframes:
                # Calculate t-values
                coef_df = dataframes[self.coef_filename]
                std_err_df = dataframes[self.std_err_filename]

                if not coef_df.columns.identical(std_err_df.columns):
                    overlap = sorted(set(coef_df.columns).intersection(
                        set(std_err_df.columns)))
                    if len(overlap) == 0:
                        print("No overlapping eQTLs between coef and std_err "
                              "data frame columns.")
                    else:
                        coef_df = coef_df.loc[:, overlap]
                        std_err_df = std_err_df.loc[:, overlap]
                if not coef_df.index.identical(std_err_df.index):
                    overlap = sorted(set(coef_df.index).intersection(
                        set(std_err_df.index)))
                    if len(overlap) == 0:
                        print("No overlapping eQTLs between coef and std_err "
                              "data frames indices.")
                    else:
                        coef_df = coef_df.loc[overlap, :]
                        std_err_df = std_err_df.loc[overlap, :]

                if coef_df.columns.identical(
                        std_err_df.columns) and coef_df.index.identical(
                            std_err_df.index):
                    tvalue_df = coef_df / std_err_df

                    print("Saving {} dataframe.".format(self.tvalue_filename),
                          flush=True)
                    save_dataframe(df=tvalue_df,
                                   outpath=outpath,
                                   header=True,
                                   index=True)
            else:
                print("\tNo data found.")
        else:
            print("Skipping step.")

        print("")
        print("### Step 3 ###")
        print("Starting other calculations", flush=True)

        if self.pvalues_filename not in dataframes:
            print("\tNo pvalues data found.")
            return

        pvalue_df = dataframes[self.pvalues_filename]
        pvalue_df_columns = [
            "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.columns)
        ]
        pvalue_df.columns = pvalue_df_columns
        pvalue_df_indices = [
            "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.index)
        ]
        pvalue_df.index = pvalue_df_indices
        pvalue_df.reset_index(drop=False, inplace=True)

        print("Melting dataframe.", flush=True)
        dfm = pvalue_df.melt(id_vars=["index"])
        dfm.columns = ["covariate", "SNP", "pvalue"]
        dfm["rank"] = dfm.loc[:, "pvalue"].rank(ascending=True)
        n_signif = dfm[dfm["pvalue"] <= self.alpha].shape[0]
        n_total = dfm.shape[0]
        print("\t{}/{} [{:.2f}%] of pvalues < {}".format(
            n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
              flush=True)

        print("Adding z-scores.", flush=True)
        dfm["zscore"] = stats.norm.isf(dfm["pvalue"])
        dfm.loc[dfm["pvalue"] > (1.0 - 1e-16), "zscore"] = -8.209536151601387
        dfm.loc[dfm["pvalue"] < 1e-323, "zscore"] = 38.44939448087599
        self.pivot_and_save(dfm, "zscore", pvalue_df_indices,
                            pvalue_df_columns)

        print("Adding BH-FDR.", flush=True)
        dfm["BH-FDR"] = dfm["pvalue"] * (n_total / (dfm["rank"] + 1))
        dfm.loc[dfm["BH-FDR"] > 1, "BH-FDR"] = 1
        prev_bh_fdr = -np.inf
        for i in range(n_total):
            bh_fdr = dfm.loc[i, "BH-FDR"]
            if bh_fdr > prev_bh_fdr:
                prev_bh_fdr = bh_fdr
            else:
                dfm.loc[i, "BH-FDR"] = prev_bh_fdr
        n_signif = dfm[dfm["BH-FDR"] <= self.alpha].shape[0]
        print("\t{}/{} [{:.2f}%] of BH-FDR values < {}".format(
            n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
              flush=True)
        self.pivot_and_save(dfm, "BH-FDR", pvalue_df_indices,
                            pvalue_df_columns)

        print("Adding permutation FDR.", flush=True)
        print("\tLoading permutation pvalue data.", flush=True)
        _, perm_pvalues = self.combine_pickles(self.work_dir,
                                               self.perm_pvalues_filename)
        # perm_pvalues = [random.random() for _ in range(n_total * 10)]
        print("Sorting p-values.", flush=True)
        perm_pvalues = sorted(perm_pvalues)

        if len(perm_pvalues) > 0:
            n_perm = len(perm_pvalues) / n_total
            if n_perm != self.n_perm:
                print("\tWARNING: not all permutation p-values are present")
            perm_ranks = []
            for pvalue in dfm["pvalue"]:
                perm_ranks.append(bisect_left(perm_pvalues, pvalue))
            dfm["perm-rank"] = perm_ranks
            dfm["perm-FDR"] = (dfm["perm-rank"] / n_perm) / dfm["rank"]
            dfm.loc[(dfm.index == 0) | (dfm["perm-rank"] == 0), "perm-FDR"] = 0
            dfm.loc[dfm["perm-FDR"] > 1, "perm-FDR"] = 1

            self.pivot_and_save(dfm, "perm-FDR", pvalue_df_indices,
                                pvalue_df_columns)

        print("Saving full dataframe.", flush=True)
        save_dataframe(df=dfm,
                       outpath=os.path.join(self.work_dir,
                                            "molten_table.txt.gz"),
                       header=True,
                       index=True)
        print("")

        # Print the time.
        run_time_min, run_time_sec = divmod(time.time() - start_time, 60)
        run_time_hour, run_time_min = divmod(run_time_min, 60)
        print("finished in {} hour(s), {} minute(s) and "
              "{} second(s).".format(int(run_time_hour), int(run_time_min),
                                     int(run_time_sec)),
              flush=True)
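
The manual BH-FDR loop above can be cross-checked against statsmodels, which implements the Benjamini-Hochberg procedure directly (sketch):

from statsmodels.stats.multitest import multipletests

reject, bh_fdr, _, _ = multipletests(dfm["pvalue"].to_numpy(),
                                     alpha=0.05, method="fdr_bh")
dfm["BH-FDR-check"] = bh_fdr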
Example #20
    def create_tech_covs_file(self):
        # Load the technical covariates.
        self.log.info("Loading technical covariates matrix.")
        tcov_df = load_dataframe(inpath=self.cov_file,
                                 header=0,
                                 index_col=0,
                                 logger=self.log)

        # Filter on samples and technical covariates.
        self.log.info("Filtering on samples and technical covariates.")
        tcov_df.index = [
            self.sample_dict[x] if x in self.sample_dict else x
            for x in tcov_df.index
        ]
        tcov_df = tcov_df.loc[self.sample_order, :].copy()
        save_dataframe(df=tcov_df.T, outpath=os.path.join(self.outdir, "technical_covariates_table.txt.gz"),
                       index=True, header=True, logger=self.log)
        if self.technical_covariates:
            save_dataframe(df=tcov_df.loc[:, self.technical_covariates].T,
                           outpath=os.path.join(self.outdir, "technical_covariates_table_subset.txt.gz"),
                           index=True, header=True, logger=self.log)

        # Load the MDS components.
        self.log.info("Loading MDS matrix.")
        mds_df = load_dataframe(inpath=self.mds_file,
                                header=0,
                                index_col=0,
                                logger=self.log)

        # Filter on samples.
        self.log.info("Filtering on samples.")
        mds_df.index = [
            self.sample_dict[x] if x in self.sample_dict else x
            for x in mds_df.index
        ]
        mds_df = mds_df.loc[self.sample_order, :].copy()

        save_dataframe(df=mds_df.T, outpath=os.path.join(self.outdir, "mds_covariates_table.txt.gz"),
                       index=True, header=True, logger=self.log)

        tmp_combined_df = tcov_df.merge(mds_df, left_index=True, right_index=True)
        save_dataframe(df=tmp_combined_df.T, outpath=os.path.join(self.outdir, "technical_and_mds_covariates_table.txt.gz"),
                       index=True, header=True, logger=self.log)

        # Loading cohort matrix.
        self.log.info("Loading dataset matrix.")
        if self.dataset_df is None:
            self.dataset_df = load_dataframe(self.dataset_file,
                                             header=0,
                                             index_col=0,
                                             logger=self.log)

        # merge.
        self.log.info("Merging matrices.")
        correction_df = reduce(
            lambda left, right: pd.merge(left, right,
                                         left_index=True, right_index=True),
            [tcov_df, mds_df, self.dataset_df])
        correction_df = correction_df.T
        correction_df.index.name = "-"
        self.log.info("\tCorrection matrix shape: {}".format(
            correction_df.shape))

        # Validate sample order.
        if list(correction_df.columns) != self.sample_order:
            correction_df = correction_df[self.sample_order]

        return correction_df