def start(self):
        """
        Run the factorization of the celltype profile expression.

        The stored results are loaded when the files at self.pca_outpath and
        self.nmf_outpath both exist and self.force is not set; otherwise the
        factorization is performed and saved.
        """
        print("Starting factorization of celltype profile expression.")
        self.print_arguments()

        # Earlier results are only reusable when both files are present and
        # no rerun is forced.
        results_cached = check_file_exists(self.pca_outpath) and \
            check_file_exists(self.nmf_outpath) and \
            not self.force

        if not results_cached:
            factorization = self.perform_matrix_factorization()
            (self.celltype_expression,
             self.celltype_pcs,
             self.celltype_cs) = factorization
            self.save()
            return

        print("Skipping step, loading result.")
        self.celltype_pcs = load_dataframe(inpath=self.pca_outpath,
                                           header=0,
                                           index_col=0)
        self.celltype_cs = load_dataframe(inpath=self.nmf_outpath,
                                          header=0,
                                          index_col=0)
示例#2
0
    def start(self):
        """
        Run the deconvolution step.

        The result stored at self.outpath is loaded when it exists and
        self.force is not set; otherwise the deconvolution is performed and
        saved.
        """
        print("Starting deconvolution.")
        self.print_arguments()

        reuse_output = check_file_exists(self.outpath) and not self.force
        if not reuse_output:
            self.deconvolution = self.perform_deconvolution()
            self.save()
            return

        # The output is already on disk, so read it back.
        print("Skipping step, loading result.")
        self.deconvolution = load_dataframe(inpath=self.outpath,
                                            header=0,
                                            index_col=0)
示例#3
0
    def start(self):
        """
        Create the covariate file.

        The table stored at self.outpath is loaded when it exists and
        self.force is not set; otherwise the input files are combined and the
        result is saved.
        """
        print("Starting creating covariate file.")
        self.print_arguments()

        output_present = check_file_exists(self.outpath)
        if output_present and not self.force:
            # Nothing to compute, read the earlier result.
            print("Skipping step, loading result.")
            self.covariates = load_dataframe(inpath=self.outpath,
                                             header=0,
                                             index_col=0)
            return

        self.covariates = self.combine_files()
        self.save()
示例#4
0
    def start(self):
        """
        Combine the GTE files and derive the sample bookkeeping from them.

        The combined table is loaded from self.outpath when it exists and
        self.force is not set; otherwise the files are combined and saved.
        In both cases the sample dictionary and the sample order are set
        afterwards.
        """
        print("Starting combining GTE files.")
        self.print_arguments()

        reuse_output = check_file_exists(self.outpath) and not self.force
        if not reuse_output:
            # Build the combined table from the separate GTE files.
            self.gte = self.combine_files()
            self.save()
        else:
            print("Skipping step, loading result.")
            self.gte = load_dataframe(inpath=self.outpath,
                                      header=None,
                                      index_col=None)

        # Both lookups are needed regardless of how the table was obtained.
        self.sample_dict = self.create_sample_dict()
        self.sample_order = self.set_sample_order()
示例#5
0
    def start(self):
        """
        Combine the eQTL probe files.

        The combined table is loaded from self.outpath when it exists and
        self.force is not set. Otherwise the files are combined, filtered on
        self.disease when that is a non-empty value, and saved.
        """
        print("Starting combining eQTL probe files.")
        self.print_arguments()

        if check_file_exists(self.outpath) and not self.force:
            print("Skipping step, loading result.")
            self.eqtl_probes = load_dataframe(inpath=self.outpath,
                                              header=0,
                                              index_col=False)
            return

        # Combine the separate eQTL probe files.
        print("Loading eQTLprobes files.")
        probes = self.combine_files()

        # Only filter when a trait has actually been given.
        trait_given = self.disease != "" and self.disease is not None
        if trait_given:
            print("Filtering on trait: {}".format(self.disease))
            probes = self.filter_on_trait(probes)

        self.eqtl_probes = probes
        self.save()
示例#6
0
文件: main.py 项目: npklein/metabrain
    def combine_groups(self, inter_outpath):
        """
        Collect the SNP and sample indices of every group and build (or load)
        the combined interaction matrix.

        :param inter_outpath: string, path of the combined interaction
                              matrix. It is (re)created when the file is
                              missing or self.force is set, and loaded
                              otherwise.
        :return: tuple of (snp_mask, sample_mask, inter_df); both masks are
                 sorted lists of the unique indices over all groups and
                 inter_df is the combined interaction matrix.
        :raises SystemExit: with exit status 1 when a group directory holds
                            no file matching self.inter_regex.
        """
        print("Combining groups.")

        # Decide once whether the combined matrix has to be (re)built, so
        # that the per-group loading and the final preparation always agree
        # and the file system is not queried for every group.
        rebuild = not check_file_exists(inter_outpath) or self.force

        snp_mask = np.array([], dtype=np.int16)
        sample_mask = np.array([], dtype=np.int16)
        inter_df = None
        for i, group_id in enumerate(self.group_ids):
            print("  Working on: {:10s} [{}/{} "
                  "{:.2f}%]".format(group_id, i + 1, len(self.group_ids),
                                    (100 / len(self.group_ids)) * (i + 1)))

            # Define the directory names.
            data_indir = os.path.join(self.g_data_indir, group_id)
            inter_indir = os.path.join(self.g_inter_indir, group_id, 'output')

            # Load the group object.
            # NOTE(review): pickle.load can execute arbitrary code; this
            # must only ever read group objects written by this pipeline.
            with open(os.path.join(data_indir, self.obj_filename), "rb") as f:
                group_object = pickle.load(f)

            # Save the indices.
            snp_mask = np.append(snp_mask, group_object.get_snp_indices())
            sample_mask = np.append(sample_mask,
                                    group_object.get_sample_indices())

            if not rebuild:
                continue

            # Search for the interaction filename.
            inter_inpath = None
            for path in glob.glob(os.path.join(inter_indir, "*")):
                if re.match(self.inter_regex, get_basename(path)):
                    inter_inpath = path
                    break
            if inter_inpath is None:
                print("Interaction matrix not found.")
                # The bare exit() helper comes from the site module (it may
                # be unavailable) and reports success; signal the failure
                # with a non-zero exit status instead.
                raise SystemExit(1)

            # Load the interaction file.
            group_inter_df = load_dataframe(inpath=inter_inpath,
                                            header=0,
                                            index_col=0)

            # Merge them.
            if inter_df is None:
                inter_df = group_inter_df
            else:
                inter_df = inter_df.merge(group_inter_df,
                                          left_index=True,
                                          right_index=True)

        print("Preparing interaction matrix.")
        if rebuild:
            # Sort the columns according to the collected SNP indices: the
            # indices are attached as a temporary column of the transposed
            # matrix, used as sort key and dropped again.
            inter_df = inter_df.T
            inter_df["index"] = snp_mask
            inter_df.sort_values(by=['index'], inplace=True)
            inter_df.drop(["index"], axis=1, inplace=True)
            inter_df = inter_df.T

            save_dataframe(df=inter_df,
                           outpath=inter_outpath,
                           index=True,
                           header=True)
        else:
            inter_df = load_dataframe(inpath=inter_outpath,
                                      header=0,
                                      index_col=0)

        # Prepare the masks: unique and ascending.
        snp_mask = sorted(set(snp_mask))
        sample_mask = sorted(set(sample_mask))

        return snp_mask, sample_mask, inter_df
示例#7
0
文件: main.py 项目: npklein/metabrain
    def start(self):
        """
        The method that serves as the pipeline of the whole program.

        Combines the group indices (and, when needed, the interaction
        matrix) and writes the subsetted marker, eQTL, genotype, alleles,
        expression and covariate tables to self.outdir. An output that
        already exists is skipped unless self.force is set.
        """
        def is_outdated(path):
            # An output must be written when it is missing or a rerun is
            # forced.
            return not check_file_exists(path) or self.force

        print("Starting combining groups.")
        self.print_arguments()

        # Gather the indices of all groups; this also yields the combined
        # interaction matrix.
        inter_outpath = os.path.join(self.outdir, self.inter_filename)
        snp_mask, sample_mask, inter_df = self.combine_groups(inter_outpath)

        print("\nSubsetting data with masks:")
        mask_report = "\t{}:\tlength: {}\tlowest index: {}\thighest index: {}"
        for label, mask in (("SNP mask", snp_mask),
                            ("Sample mask", sample_mask)):
            print(mask_report.format(label, len(mask), min(mask), max(mask)))
        print("")

        # The eQTL file is only needed when the marker table or the eQTL
        # table has to be written.
        markers_outpath = os.path.join(self.outdir, self.markers_filename)
        eqtl_outpath = os.path.join(self.outdir, self.eqtl_filename)
        if not check_file_exists(eqtl_outpath) or \
                not check_file_exists(markers_outpath) or \
                self.force:
            print("Loading eQTL file.")
            eqtl_df = load_dataframe(inpath=self.eqtl_inpath,
                                     header=0,
                                     index_col=None)
            eqtl_df = eqtl_df.iloc[snp_mask, :]

            print("Preparing marker matrix.")
            if is_outdated(markers_outpath):
                self.create_marker_df(inter_df, eqtl_df, markers_outpath)
            else:
                print("\tSkipping step.")

            print("Preparing eQTL matrix.")
            if is_outdated(eqtl_outpath):
                save_dataframe(outpath=eqtl_outpath,
                               df=eqtl_df,
                               index=False,
                               header=True)
            else:
                print("\tSkipping step.")
            del eqtl_df

        # The interaction matrix is not used beyond this point.
        del inter_df

        print("\nPreparing genotype matrix.")
        geno_outpath = os.path.join(self.outdir, self.geno_filename)
        if is_outdated(geno_outpath):
            geno_inpath = os.path.join(self.data_indir, self.geno_filename)
            geno_df = load_dataframe(inpath=geno_inpath,
                                     header=0,
                                     index_col=0)
            geno_df = geno_df.iloc[snp_mask, sample_mask]
            save_dataframe(outpath=geno_outpath,
                           df=geno_df,
                           index=True,
                           header=True)
            del geno_df
        else:
            print("\tSkipping step.")

        print("\nPreparing alleles matrix.")
        alleles_outpath = os.path.join(self.outdir, self.alleles_filename)
        if is_outdated(alleles_outpath):
            alleles_inpath = os.path.join(self.data_indir,
                                          self.alleles_filename)
            alleles_df = load_dataframe(inpath=alleles_inpath,
                                        header=0,
                                        index_col=0)
            alleles_df = alleles_df.iloc[snp_mask, :]
            save_dataframe(outpath=alleles_outpath,
                           df=alleles_df,
                           index=True,
                           header=True)
            del alleles_df
        else:
            print("\tSkipping step.")

        print("\nPreparing expression matrix.")
        expr_outpath = os.path.join(self.outdir, self.expr_filename)
        if is_outdated(expr_outpath):
            expr_inpath = os.path.join(self.data_indir, self.expr_filename)
            expr_df = load_dataframe(inpath=expr_inpath,
                                     header=0,
                                     index_col=0)
            expr_df = expr_df.iloc[snp_mask, sample_mask]
            save_dataframe(outpath=expr_outpath,
                           df=expr_df,
                           index=True,
                           header=True)
            del expr_df
        else:
            print("\tSkipping step.")

        print("\nPreparing covariate matrix.")
        cov_outpath = os.path.join(self.outdir, self.cov_filename)
        if is_outdated(cov_outpath):
            cov_df = load_dataframe(inpath=self.cov_inpath,
                                    header=0,
                                    index_col=0)
            cov_df = cov_df.iloc[:, sample_mask].copy()
            save_dataframe(outpath=cov_outpath,
                           df=cov_df,
                           index=True,
                           header=True)
            del cov_df
        else:
            print("\tSkipping step.")
示例#8
0
    def start(self):
        """
        Write the pickled group object and the per-group subsets of the
        eQTL, genotype, alleles, expression and covariate tables.

        Each group gets its own directory below self.outdir. A file that
        already exists is left untouched unless self.force is set.
        """
        print("Creating groups.")
        n_groups = len(self.groups)
        everything = slice(None)
        for position, (group_id, group_obj) in enumerate(self.groups.items(),
                                                         start=1):
            print("  Working on: {:10s} [{}/{} "
                  "{:.2f}%]".format(group_id, position, n_groups,
                                    (100 / n_groups) * position))

            # Make sure the directory of this group is available.
            group_dir = os.path.join(self.outdir, group_id)
            prepare_output_dir(group_dir)

            # Store the group object itself.
            pickle_path = os.path.join(group_dir, "group.pkl")
            if not check_file_exists(pickle_path) or self.force:
                with open(pickle_path, "wb") as handle:
                    pickle.dump(group_obj, handle)
                print("\tSaved group object: "
                      "{}".format(get_basename(pickle_path)))

            snp_mask = group_obj.get_snp_indices()
            sample_mask = group_obj.get_sample_indices()

            # Per table: output filename, name of the attribute holding the
            # complete table, row selection, column selection and whether
            # the index is written.
            tables = (
                ("eqtl_table.txt.gz", "eqtl_df",
                 snp_mask, everything, False),
                ("genotype_table.txt.gz", "geno_df",
                 snp_mask, sample_mask, True),
                ("genotype_alleles.txt.gz", "alleles_df",
                 snp_mask, everything, True),
                ("expression_table.txt.gz", "expr_df",
                 snp_mask, sample_mask, True),
                ("covariates_table.txt.gz", "cov_df",
                 everything, sample_mask, True),
            )
            for filename, attribute, rows, columns, write_index in tables:
                outpath = os.path.join(group_dir, filename)
                if check_file_exists(outpath) and not self.force:
                    continue

                # The complete table is only looked up when it is needed.
                subset = getattr(self, attribute).iloc[rows, columns].copy()
                save_dataframe(outpath=outpath, df=subset,
                               index=write_index, header=True)
                del subset
示例#9
0
    def start(self):
        """
        Create the genotype, alleles and expression matrices in the order of
        the eQTL file.

        For every row of self.eqtl_df the genotype and alleles of "SNPName"
        and the expression of "ProbeName" are looked up and written as one
        line to each of the three output files. The step is skipped when all
        three outputs exist and self.force is not set. An eQTL for which any
        of the three lookups does not give exactly one row is skipped in all
        three files, so that equal line numbers keep describing the same
        eQTL.
        """
        print("Starting creating matrices.")
        self.print_arguments()

        # Check if output file exist.
        if check_file_exists(self.geno_outpath) and \
                check_file_exists(self.alleles_outpath) and \
                check_file_exists(self.expr_outpath) and \
                not self.force:
            print("Skipping step.")
            return

        # Remove the output files; they are written in several chunks below.
        for outfile in [
                self.geno_outpath, self.alleles_outpath, self.expr_outpath
        ]:
            if os.path.isfile(outfile):
                print("Removing file: {}.".format(outfile))
                os.remove(outfile)

        # Load the genotype matrix file. The allele columns are split off
        # before the sample columns are renamed and ordered.
        print("Loading genotype matrix.")
        geno_df = load_dataframe(self.geno_file, header=0, index_col=0)
        allele_df = geno_df.loc[:, ["Alleles", "MinorAllele"]].copy()
        geno_df = geno_df.rename(columns=self.sample_dict)
        geno_df = geno_df[self.sample_order]

        # Load the expression matrix file.
        print("Loading expression matrix.")
        expr_df = load_dataframe(self.expr_file, header=0, index_col=0)
        expr_df = expr_df.rename(columns=self.sample_dict)
        self.complete_expr_matrix = expr_df[self.sample_order]

        # Construct the genotype / expression matrices, starting with the
        # header lines.
        print("Constructing matrices.")
        geno_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
        expr_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
        allele_str_buffer = [
            "-" + "\t" + "\t".join(list(allele_df.columns)) + "\n"
        ]

        n_snps = self.eqtl_df.shape[0]
        last_index = n_snps - 1
        for i, row in self.eqtl_df.iterrows():
            if (i % 250 == 0) or (i == last_index):
                # With a single eQTL the last index is 0, which must not be
                # used as divisor.
                if last_index > 0:
                    progress = (100 / last_index) * i
                else:
                    progress = 100.0
                print("\tProcessing {}/{} "
                      "[{:.2f}%]".format(i, last_index, progress))

                # Write output files.
                self.write_buffer(self.geno_outpath, geno_str_buffer)
                geno_str_buffer = []

                self.write_buffer(self.expr_outpath, expr_str_buffer)
                expr_str_buffer = []

                self.write_buffer(self.alleles_outpath, allele_str_buffer)
                allele_str_buffer = []

            # Get the row info.
            snp_name = row["SNPName"]
            probe_name = row["ProbeName"]

            # Get the genotype.
            genotype = geno_df.loc[[snp_name], :]
            if (len(genotype.index)) != 1:
                print("SNP: {} gives 0 or >1 " "genotypes.".format(snp_name))
                continue

            # Get the alleles.
            alleles = allele_df.loc[[snp_name], :]
            if (len(alleles.index)) != 1:
                print("SNP: {} gives 0 or >1 " "alleles.".format(snp_name))
                continue

            # Get the expression.
            expression = self.complete_expr_matrix.loc[[probe_name], :]
            if (len(expression.index)) != 1:
                print("Probe: {} gives 0 or >1 expression "
                      "profiles.".format(probe_name))
                continue

            # Only append once all three lookups succeeded; appending in
            # between would leave the files with a different number of rows
            # whenever a later lookup fails.
            geno_str_buffer.append(snp_name + "\t" + "\t".join(
                genotype.iloc[0, :].astype(str).values) + "\n")
            allele_str_buffer.append("{}\t{}\t{}\n".format(
                snp_name,
                alleles.iloc[0]["Alleles"],
                alleles.iloc[0]["MinorAllele"]))
            expr_str_buffer.append(probe_name + "\t" + "\t".join(
                expression.iloc[0, :].astype(str).values) + "\n")

        # Write whatever is left in the buffers.
        if geno_str_buffer:
            self.write_buffer(self.geno_outpath, geno_str_buffer)

        if expr_str_buffer:
            self.write_buffer(self.expr_outpath, expr_str_buffer)

        if allele_str_buffer:
            self.write_buffer(self.alleles_outpath, allele_str_buffer)

        # Remove old dataframes.
        del geno_df, expr_df
    def start(self):
        """
        Create the marker gene expression table and the cell type profile
        expression table used for deconvolution.

        The step is skipped when both outputs exist and self.force is not
        set. The expression matrix is taken from self.expr_df, or loaded
        from file when self.decon_expr_file is set or self.expr_df is None.
        Its index is translated using the "ArrayAddress" -> "Symbol" columns
        of the translate file. The translation is applied to a shallow copy,
        so self.expr_df itself keeps its original index.
        """
        print("Starting creating deconvolution matrices.")
        self.print_arguments()

        # Check if output file exist.
        if check_file_exists(self.markers_outpath) and \
                check_file_exists(self.ct_profile_expr_outpath) and \
                not self.force:
            print("Skipping step.")
            return

        # Check which expression file we will use.
        expr_file = self.expr_file
        expr_df = self.expr_df
        if self.decon_expr_file:
            print("Warning: using a different expression file for "
                  "deconvolution than for gene expression. This might take "
                  "longer to load.")
            expr_file = self.decon_expr_file
            expr_df = None

        if expr_df is None:
            # Load the expression matrix file.
            print("Loading expression matrix.")
            expr_df = load_dataframe(expr_file, header=0, index_col=0)
            expr_df = expr_df.rename(columns=self.sample_dict)
            expr_df = expr_df[self.sample_order]
        else:
            # The index is replaced below. Work on a shallow copy (no data
            # is duplicated) so the frame shared through self.expr_df is
            # not modified behind the back of its other users.
            expr_df = expr_df.copy(deep=False)

        # Load the translate file.
        print("Loading translate matrix.")
        trans_df = load_dataframe(self.translate_file,
                                  header=0,
                                  index_col=None)
        trans_dict = dict(
            zip(trans_df.loc[:, "ArrayAddress"], trans_df.loc[:, "Symbol"]))

        # Translate the ENSEMBL ID's to HGNC symbols.
        expr_df.index = expr_df.index.map(trans_dict)
        expr_df.index.name = "-"

        # Remove unneeded variables.
        del trans_df, trans_dict

        # Create the marker gene file.
        if not check_file_exists(self.markers_outpath) or self.force:
            if os.path.isfile(self.markers_outpath):
                print("Removing: {}".format(self.markers_outpath))
                os.remove(self.markers_outpath)

            print("Creating marker gene expression table.")
            marker_str_buffer = [
                "-" + "\t" + "\t".join(self.sample_order) + "\n"
            ]
            for celltype, marker_genes in self.marker_dict.items():
                for marker_gene in marker_genes:
                    if marker_gene in expr_df.index:
                        expression = expr_df.loc[[marker_gene], :]
                        # Several IDs can translate to the same symbol;
                        # such ambiguous genes are left out.
                        if (len(expression.index)) != 1:
                            print("\tMarker gene: {} gives 0 or >1 expression "
                                  "profiles.".format(marker_gene))
                            continue

                        marker_str = self.marker_genes_suffix + "_" + \
                                     celltype + "_" + marker_gene + "\t" + \
                                     "\t".join(expression.iloc[0, :].astype(str).values) \
                                     + "\n"
                        marker_str_buffer.append(marker_str)
            self.write_buffer(self.markers_outpath, marker_str_buffer)

        # Create the cell type profile expression file.
        if not check_file_exists(self.ct_profile_expr_outpath) or self.force:
            if os.path.isfile(self.ct_profile_expr_outpath):
                print("Removing: {}".format(self.ct_profile_expr_outpath))
                os.remove(self.ct_profile_expr_outpath)

            # Load the celltype profile file.
            print("Loading cell type profile matrix.")
            self.celltype_profile = load_dataframe(self.celltype_profile_file,
                                                   header=0,
                                                   index_col=0)

            # Write the expression of every profile gene that is present
            # exactly once in the expression matrix.
            print("Creating cell type profile expression table.")
            profile_str_buffer = [
                "-" + "\t" + "\t".join(self.sample_order) + "\n"
            ]
            for marker_gene in self.celltype_profile.index:
                if marker_gene in expr_df.index:
                    expression = expr_df.loc[[marker_gene], :]
                    if (len(expression.index)) != 1:
                        print("\tMarker gene: {} gives 0 or >1 expression "
                              "profiles.".format(marker_gene))
                        continue

                    profile_str = marker_gene + "\t" + "\t".join(
                        expression.iloc[0, :].astype(str).values) + "\n"
                    profile_str_buffer.append(profile_str)
            self.write_buffer(self.ct_profile_expr_outpath, profile_str_buffer)
    def start(self):
        """
        Create the regression file: one line per eQTL with the result of a
        linear regression of expression on genotype.

        The step is skipped when self.outpath exists and self.force is not
        set. Rows of self.geno_df, self.expr_df and self.alleles_df are taken
        by the position of the eQTL; processing stops at the first eQTL whose
        SNP name does not match the name of the selected row.
        """
        print("Starting creating regression file.")
        self.print_arguments()

        if check_file_exists(self.outpath) and not self.force:
            print("Skipping step.")
            return

        # Start from a clean file; the output is written in chunks.
        if os.path.isfile(self.outpath):
            print("Removing file: {}.".format(self.outpath))
            os.remove(self.outpath)

        # The first pending line is the header.
        column_names = ("snp", "probe", "alleles", "minor_allele",
                        "allele_assessed", "flipped", "slope", "intercept",
                        "corr_coeff", "p_value", "std_err", "overal_z_score",
                        "z_score_estimate")
        line_template = "\t".join(["{}"] * len(column_names)) + "\n"
        pending_lines = ["\t".join(column_names) + "\n"]

        print("Correlating:")
        n_eqtls = self.eqtl_df.shape[0]
        for idx, eqtl in self.eqtl_df.iterrows():
            at_checkpoint = (idx % 250 == 0) or (idx == (n_eqtls - 1))
            if at_checkpoint:
                print("\t Processing {}/{} [{:.2f}%]".format(
                    idx, n_eqtls, (100 / n_eqtls) * idx))

                # Flush what has been collected so far.
                self.write_buffer(self.outpath, pending_lines)
                pending_lines = []

            # Properties of this eQTL.
            snp_name = eqtl["SNPName"]
            probe_name = eqtl["ProbeName"]
            overal_z_score = eqtl["OverallZScore"]
            allele_assessed = eqtl["AlleleAssessed"]

            # Select the genotype and expression rows as single-column
            # frames; the column name is the name of the selected row.
            genotype = self.geno_df.iloc[idx, :].T.to_frame()
            if snp_name != genotype.columns[0]:
                print("SNPName does not match in genotype subset.")
                break
            expression = self.expr_df.iloc[idx, :].T.to_frame()
            if snp_name != expression.columns[0]:
                print("SNPName does not match in expression subset.")
                break
            data = genotype.merge(expression,
                                  left_index=True,
                                  right_index=True)
            data.columns = ["genotype", "expression"]

            # Values of -1 are treated as missing and dropped.
            data.replace(-1, np.nan, inplace=True)
            data.dropna(inplace=True)

            (alleles, minor_allele) = self.alleles_df.iloc[idx, :]

            # Flip the genotype when the assessed allele is not the second
            # allele of the pair.
            flipped = False
            second_allele = alleles.split("/")[1]
            if allele_assessed != second_allele:
                data['genotype'] = 2.0 - data['genotype']
                flipped = True

            # Regress expression on genotype.
            regression = stats.linregress(data["genotype"],
                                          data["expression"])
            slope, intercept, r_value, p_value, std_err = regression

            z_score_estimate = slope / std_err

            pending_lines.append(
                line_template.format(snp_name, probe_name, alleles,
                                     minor_allele, allele_assessed, flipped,
                                     slope, intercept, r_value, p_value,
                                     std_err, overal_z_score,
                                     z_score_estimate))

        # Flush the remainder.
        if pending_lines:
            self.write_buffer(self.outpath, pending_lines)
示例#12
0
    def start(self):
        """
        The method that serves as the pipeline of the whole program.

        For every group input directory the input files are copied,
        decompressed and converted to binary format where needed, the eQTL
        file is prepared and the interaction analyser is executed. A group
        whose output directory already holds a file matching
        self.inter_regex is skipped unless self.force is set.
        """
        print("Starting interaction analyser.")
        self.print_arguments()

        print("Performing interaction analyses.")
        n_groups = len(self.group_indirs)
        binary_suffixes = (".binary.dat",
                           ".binary.rows.txt",
                           ".binary.columns.txt")
        for position, group_indir in enumerate(self.group_indirs, start=1):
            # Without groups everything is placed directly in the output
            # directory.
            if self.groups is None:
                group_id = ""
                group_outdir = self.outdir
            else:
                group_id = get_leaf_dir(group_indir)
                group_outdir = os.path.join(self.outdir, group_id)
            ia_indir = os.path.join(group_outdir, 'input')
            ia_outdir = os.path.join(group_outdir, 'output')
            for directory in (group_outdir, ia_indir, ia_outdir):
                prepare_output_dir(directory)

            # Nothing to do when the interaction matrix is already there.
            if not self.force:
                present = glob.glob(os.path.join(ia_outdir, "*"))
                if any(re.match(self.inter_regex, get_basename(path))
                       for path in present):
                    continue

            print("\tWorking on: {:15s} [{}/{} "
                  "{:.2f}%]".format(group_id, position, n_groups,
                                    (100 / n_groups) * position))

            # Prepare the input expected by the interaction analyser.
            self.print_string("\n### STEP1 ###\n")
            ia_inputs = (("Genotypes", self.geno_filename),
                         ("Expression", self.expr_filename),
                         ("Covariates", self.cov_filename))
            for exp_ia_infile, filename in ia_inputs:
                # The binary format consists of three files; all of them
                # have to be present for the preparation to be skipped.
                binary_parts = [os.path.join(ia_indir, exp_ia_infile + suffix)
                                for suffix in binary_suffixes]
                up_to_date = all(check_file_exists(part)
                                 for part in binary_parts) and not self.force
                if up_to_date:
                    self.print_string(
                        "Skipping {} preparation.".format(filename))
                    continue

                self.print_string("\nPreparing {}.".format(filename))

                source_gz = os.path.join(self.indir, group_id,
                                         filename + '.txt.gz')
                local_gz = os.path.join(ia_indir, filename + '.txt.gz')
                local_txt = os.path.join(ia_indir, filename + '.txt')
                binary_prefix = os.path.join(ia_indir,
                                             exp_ia_infile + ".binary")

                self.print_string("\nCopying the input files.")
                self.copy_file(source_gz, local_gz)
                self.print_string("\nDecompressing the input files.")
                self.decompress(local_gz)

                self.print_string("\nConverting files to binary format.")
                self.convert_to_binary(local_txt, binary_prefix)

                # The plain text file is not needed after the conversion.
                self.print_string("\nRemoving uncompressed files.")
                if check_file_exists(local_txt):
                    self.print_string(
                        "\tos.remove({})".format(local_txt))
                    os.remove(local_txt)

            # Prepare the eQTL file.
            self.print_string("\n### STEP2 ###\n")
            eqtl_file = os.path.join(ia_indir, self.eqtl_filename + '.txt')
            if check_file_exists(eqtl_file) and not self.force:
                self.print_string("Skipping eqtl preparation.")
            else:
                self.print_string("\nPreparing eQTL file.")
                source_gz = os.path.join(self.indir, group_id,
                                         self.eqtl_filename + '.txt.gz')
                local_gz = os.path.join(ia_indir,
                                        self.eqtl_filename + '.txt.gz')

                self.print_string("\nCopying the input files.")
                self.copy_file(source_gz, local_gz)
                self.print_string("\nDecompressing the input files.")
                self.decompress(local_gz)

            # Execute the program.
            self.print_string("\n### STEP3 ###\n")
            self.print_string("Executing the eQTLInteractionAnalyser.")
            self.execute(ia_indir, ia_outdir, eqtl_file)
示例#13
0
    def start(self):
        """
        Method to start the manager.

        Prints the arguments, obtains the permutation orders (loaded from
        the pickle file in the output directory when that file exists and
        matches self.n_permutations / self.n_samples, otherwise newly
        created and dumped), runs self.work() and dumps the p-values,
        SNP t-values, interaction t-values and permutation p-values of the
        technical covariate container and the covariate container.
        Finally the run time and the throughput are printed.
        """
        self.print_arguments()
        print("Starting Custom Interaction Analyser "
              "[{}]".format(datetime.now().strftime("%d-%m-%Y, %H:%M:%S")))

        # Start the timer.
        start_time = int(time.time())

        # Get the permutation orders.
        permutation_orders = None
        perm_orders_outfile = os.path.join(self.outdir,
                                           self.perm_orders_filename + ".pkl")
        if check_file_exists(perm_orders_outfile):
            print("Loading permutation order")
            permutation_orders = self.load_pickle(perm_orders_outfile)

            # Validate the permutation orders for the given input: there
            # must be n_permutations + 1 orders and each order must cover
            # every sample.
            is_valid = (
                len(permutation_orders) == (self.n_permutations + 1)
                and all(len(order) == self.n_samples
                        for order in permutation_orders)
            )

            # Report exactly one verdict; a rejected file is discarded so
            # the orders are re-created below.
            if is_valid:
                print("\tvalid")
            else:
                print("\tinvalid")
                permutation_orders = None

        if permutation_orders is None:
            print("Creating permutation order")
            permutation_orders = self.create_perm_orders()
            self.dump_pickle(permutation_orders, self.outdir,
                             self.perm_orders_filename)

        # Start the work.
        print("Start the analyses", flush=True)
        storage = self.work(permutation_orders)
        tc_container = storage.get_tech_cov_container()
        c_container = storage.get_cov_container()

        print("Saving output files", flush=True)
        filename_suffix = "{}_{}".format(self.skip_rows, self.n_eqtls)
        for container, outdir in zip([tc_container, c_container],
                                     [self.tech_cov_outdir, self.cov_outdir]):
            full_outdir = os.path.join(self.outdir, outdir)
            prepare_output_dir(full_outdir)

            # Pair each getter with its output filename; the getter is only
            # called right before its own dump.
            outputs = (
                (container.get_pvalues, self.pvalues_filename),
                (container.get_snp_tvalues, self.snp_tvalues_filename),
                (container.get_inter_tvalues, self.inter_tvalues_filename),
                (container.get_perm_pvalues, self.perm_pvalues_filename),
            )
            for get_values, filename in outputs:
                self.dump_pickle(get_values(),
                                 full_outdir,
                                 filename,
                                 filename_suffix=filename_suffix,
                                 subdir=True,
                                 unique=True)

        # Print the process time.
        run_time = int(time.time()) - start_time
        run_time_min, run_time_sec = divmod(run_time, 60)
        run_time_hour, run_time_min = divmod(run_time_min, 60)
        print("Finished in  {} hour(s), {} minute(s) and "
              "{} second(s)".format(int(run_time_hour), int(run_time_min),
                                    int(run_time_sec)))

        # The timer has whole-second resolution, so a run that finishes
        # within a second yields run_time == 0. Clamp to one second to
        # prevent a ZeroDivisionError.
        run_time_in_minutes = max(run_time, 1) / 60
        print("Received {:.2f} analyses per minute".format(
            (self.n_eqtls * (self.n_permutations + 1)) / run_time_in_minutes))

        # Shutdown the manager.
        print("Shutting down manager [{}]".format(
            datetime.now().strftime("%d-%m-%Y, %H:%M:%S")),
              flush=True)
示例#14
0
    def start(self):
        """
        Write the translation tables and the masked versions of the eQTL,
        genotype, genotype alleles, expression and covariates dataframes.

        Genotype rows are relabelled eqtl_<i>, genotype columns sample_<i>
        and covariate rows cov_<i>. An output file that already exists is
        skipped unless self.force is set.
        """
        print("Starting creating masked files.")
        self.print_arguments()

        # Get the sizes.
        n_eqtls, n_samples = self.geno_df.shape
        n_covs = self.cov_df.shape[0]

        # Create masks.
        eqtl_mask = ["eqtl_{}".format(i) for i in range(n_eqtls)]
        sample_mask = ["sample_{}".format(i) for i in range(n_samples)]
        cov_mask = ["cov_{}".format(i) for i in range(n_covs)]

        # Create translate tables. The unmasked labels are read lazily so
        # that a dataframe is only touched when its table is written.
        print("Creating translation files.")
        translate_specs = (
            ("eqtl_translate_table.txt.gz",
             lambda: list(self.geno_df.index),
             eqtl_mask,
             "\tSkipping eQTLs translate table."),
            ("sample_translate_table.txt.gz",
             lambda: list(self.geno_df.columns),
             sample_mask,
             "\tSkipping sample translate table."),
            ("cov_translate_table.txt.gz",
             lambda: list(self.cov_df.index),
             cov_mask,
             "\tSkipping covariates translate table."),
        )
        for filename, get_unmasked, masked, skip_message in translate_specs:
            outpath = os.path.join(self.outdir, filename)
            if check_file_exists(outpath) and not self.force:
                print(skip_message)
                continue

            translate_table = pd.DataFrame({'unmasked': get_unmasked(),
                                            'masked': masked})
            save_dataframe(outpath=outpath, df=translate_table,
                           index=False, header=True)
            del translate_table

        # Start masking the dataframes. Each entry holds: output filename,
        # attribute holding the dataframe, row labels, column labels
        # (None leaves the columns untouched) and the skip message.
        print("Start masking files.")
        table_specs = (
            ("eqtl_table.txt.gz", "eqtl_df", eqtl_mask, None,
             "\tSkipping eQTL table."),
            ("genotype_table.txt.gz", "geno_df", eqtl_mask, sample_mask,
             "\tSkipping genotype table."),
            ("genotype_alleles.txt.gz", "alleles_df", eqtl_mask, None,
             "\tSkipping genotype alleles tables."),
            ("expression_table.txt.gz", "expr_df", eqtl_mask, sample_mask,
             "\tSkipping expression table."),
            ("covariates_table.txt.gz", "cov_df", cov_mask, sample_mask,
             "\tSkipping covariates table."),
        )
        for filename, attribute, row_mask, column_mask, skip_message \
                in table_specs:
            outpath = os.path.join(self.outdir, filename)
            if check_file_exists(outpath) and not self.force:
                print(skip_message)
                continue

            # The dataframe is relabelled in place before it is saved.
            frame = getattr(self, attribute)
            frame.index = row_mask
            if column_mask is not None:
                frame.columns = column_mask
            save_dataframe(outpath=outpath, df=frame,
                           index=True, header=True)