def start(self):
    """
    Factorize the cell type profile expression.

    When both output files (pca_outpath and nmf_outpath) are present and
    force is not set, the stored results are loaded; otherwise the
    factorization is performed and saved.
    """
    print("Starting factorization of celltype profile expression.")
    self.print_arguments()

    # Both outputs have to be on disk before the stored result is reusable.
    outputs_present = check_file_exists(self.pca_outpath) and \
        check_file_exists(self.nmf_outpath)

    if not outputs_present or self.force:
        factorization = self.perform_matrix_factorization()
        (self.celltype_expression,
         self.celltype_pcs,
         self.celltype_cs) = factorization
        self.save()
        return

    print("Skipping step, loading result.")
    self.celltype_pcs = load_dataframe(inpath=self.pca_outpath,
                                       header=0,
                                       index_col=0)
    self.celltype_cs = load_dataframe(inpath=self.nmf_outpath,
                                      header=0,
                                      index_col=0)
def start(self):
    """
    Run the deconvolution step, or load the stored result when the output
    file is already present and force is not set.
    """
    print("Starting deconvolution.")
    self.print_arguments()

    # Recompute when there is no output on disk or a rerun is forced.
    if not check_file_exists(self.outpath) or self.force:
        self.deconvolution = self.perform_deconvolution()
        self.save()
        return

    print("Skipping step, loading result.")
    self.deconvolution = load_dataframe(inpath=self.outpath,
                                        header=0,
                                        index_col=0)
def start(self):
    """
    Create the covariate file, or load it from the output path when it
    already exists and force is not set.
    """
    print("Starting creating covariate file.")
    self.print_arguments()

    reuse_output = check_file_exists(self.outpath) and not self.force
    if reuse_output:
        # The output of an earlier run is still on disk.
        print("Skipping step, loading result.")
        covariates = load_dataframe(inpath=self.outpath,
                                    header=0,
                                    index_col=0)
        self.covariates = covariates
    else:
        # Build the covariates from the input files and store them.
        self.covariates = self.combine_files()
        self.save()
def start(self):
    """
    Combine the GTE files (or load the stored combination) and derive the
    sample translate dict and the sample order from it.
    """
    print("Starting combining GTE files.")
    self.print_arguments()

    if not check_file_exists(self.outpath) or self.force:
        # No reusable output: combine the separate GTE files and save.
        self.gte = self.combine_files()
        self.save()
    else:
        print("Skipping step, loading result.")
        self.gte = load_dataframe(inpath=self.outpath,
                                  header=None,
                                  index_col=None)

    # Both are created regardless of whether the GTE data was loaded or
    # freshly combined.
    self.sample_dict = self.create_sample_dict()
    self.sample_order = self.set_sample_order()
def start(self):
    """
    Combine the eQTL probe files, optionally filtered on a trait, or load
    the stored result when the output file exists and force is not set.
    """
    print("Starting combining eQTL probe files.")
    self.print_arguments()

    if not check_file_exists(self.outpath) or self.force:
        # Combine the separate eQTL probe files.
        print("Loading eQTLprobes files.")
        probes = self.combine_files()

        # Only filter when a non-empty disease / trait has been set.
        if self.disease not in ("", None):
            print("Filtering on trait: {}".format(self.disease))
            probes = self.filter_on_trait(probes)

        self.eqtl_probes = probes
        self.save()
        return

    print("Skipping step, loading result.")
    self.eqtl_probes = load_dataframe(inpath=self.outpath,
                                      header=0,
                                      index_col=False)
def combine_groups(self, inter_outpath):
    """
    Collect the SNP and sample indices of every group and build (or load)
    the combined interaction matrix.

    :param inter_outpath: string, path of the combined interaction matrix;
                          it is loaded from here when it exists and force
                          is not set, and written here otherwise.
    :return snp_mask: list, sorted unique SNP indices over all groups.
    :return sample_mask: list, sorted unique sample indices over all groups.
    :return inter_df: DataFrame, the combined interaction matrix.
    :raises SystemExit: with exit status 1 when a group has no file
                        matching inter_regex in its output directory.
    """
    print("Combining groups.")

    # Decide once whether the matrix has to be (re)built, so the loop below
    # and the save / load step afterwards can never disagree.
    build_matrix = not check_file_exists(inter_outpath) or self.force

    snp_mask = np.array([], dtype=np.int16)
    sample_mask = np.array([], dtype=np.int16)
    inter_df = None
    n_groups = len(self.group_ids)
    for i, group_id in enumerate(self.group_ids):
        print(" Working on: {:10s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, n_groups,
                                (100 / n_groups) * (i + 1)))

        # Define the directory names.
        data_indir = os.path.join(self.g_data_indir, group_id)
        inter_indir = os.path.join(self.g_inter_indir, group_id, 'output')

        # Load the group object.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this on group objects written by this pipeline.
        with open(os.path.join(data_indir, self.obj_filename), "rb") as f:
            group_object = pickle.load(f)

        # Save the indices.
        snp_mask = np.append(snp_mask, group_object.get_snp_indices())
        sample_mask = np.append(sample_mask,
                                group_object.get_sample_indices())

        if not build_matrix:
            continue

        # Search for the interaction filename.
        inter_inpath = None
        for path in glob.glob(os.path.join(inter_indir, "*")):
            if re.match(self.inter_regex, get_basename(path)):
                inter_inpath = path
                break

        if inter_inpath is None:
            print("Interaction matrix not found.")
            # Terminate with a non-zero status: a bare exit() would report
            # success to the calling shell.
            raise SystemExit(1)

        # Load the interaction file.
        group_inter_df = load_dataframe(inpath=inter_inpath,
                                        header=0,
                                        index_col=0)

        # Merge them.
        if inter_df is None:
            inter_df = group_inter_df
        else:
            inter_df = inter_df.merge(group_inter_df,
                                      left_index=True,
                                      right_index=True)

    print("Preparing interaction matrix.")
    if build_matrix:
        # Sort the matrix according to the indices: the transposed matrix
        # gets the collected SNP indices as a temporary column, is ordered
        # on it, and is transposed back.
        inter_df = inter_df.T
        inter_df["index"] = snp_mask
        inter_df.sort_values(by=['index'], inplace=True)
        inter_df.drop(["index"], axis=1, inplace=True)
        inter_df = inter_df.T

        save_dataframe(df=inter_df, outpath=inter_outpath,
                       index=True, header=True)
    else:
        inter_df = load_dataframe(inpath=inter_outpath,
                                  header=0,
                                  index_col=0)

    # Prepare the masks.
    snp_mask = sorted(set(snp_mask))
    sample_mask = sorted(set(sample_mask))

    return snp_mask, sample_mask, inter_df
def start(self):
    """
    The method that serves as the pipeline of the whole program.

    Combines the groups into one SNP mask, one sample mask and one
    interaction matrix, and then writes the marker, eQTL, genotype,
    alleles, expression and covariate matrices subset with those masks.
    Each output that already exists is skipped unless force is set.
    """
    print("Starting combining groups.")
    self.print_arguments()

    # Combine the indices of each group and combine the interaction
    # matrix if need be.
    inter_outpath = os.path.join(self.outdir, self.inter_filename)
    snp_mask, sample_mask, inter_df = self.combine_groups(inter_outpath)

    print("\nSubsetting data with masks:")
    print("\tSNP mask:\tlength: {}\tlowest index: {}"
          "\thighest index: {}".format(len(snp_mask),
                                       min(snp_mask),
                                       max(snp_mask)))
    print("\tSample mask:\tlength: {}\tlowest index: {}"
          "\thighest index: {}".format(len(sample_mask),
                                       min(sample_mask),
                                       max(sample_mask)))
    print("")

    # Load the eQTL file if either the marker df or the eqtl df needs to be
    # created.
    markers_outpath = os.path.join(self.outdir, self.markers_filename)
    eqtl_outpath = os.path.join(self.outdir, self.eqtl_filename)
    need_eqtl = not check_file_exists(eqtl_outpath) or self.force
    need_markers = not check_file_exists(markers_outpath) or self.force

    # Bind the name up front: when both outputs already exist nothing is
    # loaded and the clean-up below would otherwise hit an unbound local.
    eqtl_df = None
    if need_eqtl or need_markers:
        print("Loading eQTL file.")
        eqtl_df = load_dataframe(inpath=self.eqtl_inpath,
                                 header=0,
                                 index_col=None)
        eqtl_df = eqtl_df.iloc[snp_mask, :]

    print("Preparing marker matrix.")
    if need_markers:
        self.create_marker_df(inter_df, eqtl_df, markers_outpath)
    else:
        print("\tSkipping step.")

    print("Preparing eQTL matrix.")
    if need_eqtl:
        save_dataframe(outpath=eqtl_outpath, df=eqtl_df,
                       index=False, header=True)
    else:
        print("\tSkipping step.")

    # Release the frames before the next large matrices are loaded.
    del eqtl_df
    del inter_df

    print("\nPreparing genotype matrix.")
    geno_outpath = os.path.join(self.outdir, self.geno_filename)
    if not check_file_exists(geno_outpath) or self.force:
        geno_df = load_dataframe(inpath=os.path.join(self.data_indir,
                                                     self.geno_filename),
                                 header=0,
                                 index_col=0)
        geno_df = geno_df.iloc[snp_mask, sample_mask]
        save_dataframe(outpath=geno_outpath, df=geno_df,
                       index=True, header=True)
        del geno_df
    else:
        print("\tSkipping step.")

    print("\nPreparing alleles matrix.")
    alleles_outpath = os.path.join(self.outdir, self.alleles_filename)
    if not check_file_exists(alleles_outpath) or self.force:
        alleles_df = load_dataframe(inpath=os.path.join(
            self.data_indir, self.alleles_filename),
            header=0,
            index_col=0)
        alleles_df = alleles_df.iloc[snp_mask, :]
        save_dataframe(outpath=alleles_outpath, df=alleles_df,
                       index=True, header=True)
        del alleles_df
    else:
        print("\tSkipping step.")

    print("\nPreparing expression matrix.")
    expr_outpath = os.path.join(self.outdir, self.expr_filename)
    if not check_file_exists(expr_outpath) or self.force:
        expr_df = load_dataframe(inpath=os.path.join(self.data_indir,
                                                     self.expr_filename),
                                 header=0,
                                 index_col=0)
        expr_df = expr_df.iloc[snp_mask, sample_mask]
        save_dataframe(outpath=expr_outpath, df=expr_df,
                       index=True, header=True)
        del expr_df
    else:
        print("\tSkipping step.")

    print("\nPreparing covariate matrix.")
    cov_outpath = os.path.join(self.outdir, self.cov_filename)
    if not check_file_exists(cov_outpath) or self.force:
        cov_df = load_dataframe(inpath=self.cov_inpath,
                                header=0,
                                index_col=0)
        # Covariates are only subset on their columns with the sample mask.
        cov_df = cov_df.iloc[:, sample_mask].copy()
        save_dataframe(outpath=cov_outpath, df=cov_df,
                       index=True, header=True)
        del cov_df
    else:
        print("\tSkipping step.")
def start(self):
    """
    Write one directory per group, holding the pickled group object and
    the eQTL, genotype, alleles, expression and covariate tables subset
    with the SNP / sample indices of that group. Files that already exist
    are left alone unless force is set.
    """
    print("Creating groups.")

    n_groups = len(self.groups)
    every_column = slice(None)
    every_row = slice(None)

    for i, (group_id, group_obj) in enumerate(self.groups.items()):
        progress = (100 / n_groups) * (i + 1)
        print(" Working on: {:10s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, n_groups, progress))

        # Create the group dir.
        group_dir = os.path.join(self.outdir, group_id)
        prepare_output_dir(group_dir)

        # Store the group object itself.
        group_object = os.path.join(group_dir, "group.pkl")
        if not check_file_exists(group_object) or self.force:
            with open(group_object, "wb") as f:
                pickle.dump(group_obj, f)
            print("\tSaved group object: "
                  "{}".format(get_basename(group_object)))

        # Get the group indices.
        snp_mask = group_obj.get_snp_indices()
        sample_mask = group_obj.get_sample_indices()

        # Per table: output filename, source attribute, row selection,
        # column selection and whether the index is written.
        tables = (
            ("eqtl_table.txt.gz", "eqtl_df",
             snp_mask, every_column, False),
            ("genotype_table.txt.gz", "geno_df",
             snp_mask, sample_mask, True),
            ("genotype_alleles.txt.gz", "alleles_df",
             snp_mask, every_column, True),
            ("expression_table.txt.gz", "expr_df",
             snp_mask, sample_mask, True),
            ("covariates_table.txt.gz", "cov_df",
             every_row, sample_mask, True),
        )
        for filename, attribute, rows, columns, keep_index in tables:
            outpath = os.path.join(group_dir, filename)

            # Leave an existing file untouched unless a rerun is forced.
            if check_file_exists(outpath) and not self.force:
                continue

            subset = getattr(self, attribute).iloc[rows, columns].copy()
            save_dataframe(outpath=outpath, df=subset,
                           index=keep_index, header=True)
            del subset
def start(self):
    """
    Create the genotype, alleles and expression matrices with one row per
    eQTL in eqtl_df.

    The step is skipped when all three output files exist and force is not
    set; otherwise existing outputs are removed and rebuilt. A row is only
    written when the genotype, the alleles and the expression lookup each
    return exactly one row, so the three matrices always contain the same
    eQTLs in the same order.
    """
    print("Starting creating matrices.")
    self.print_arguments()

    # Check if output file exist.
    if check_file_exists(self.geno_outpath) and \
            check_file_exists(self.alleles_outpath) and \
            check_file_exists(self.expr_outpath) and \
            not self.force:
        print("Skipping step.")
        return

    # Remove the output files.
    for outfile in [self.geno_outpath, self.alleles_outpath,
                    self.expr_outpath]:
        if os.path.isfile(outfile):
            print("Removing file: {}.".format(outfile))
            os.remove(outfile)

    # Load the genotype matrix file. The allele columns are split off
    # before the sample columns are renamed and ordered.
    print("Loading genotype matrix.")
    geno_df = load_dataframe(self.geno_file, header=0, index_col=0)
    allele_df = geno_df.loc[:, ["Alleles", "MinorAllele"]].copy()
    geno_df = geno_df.rename(columns=self.sample_dict)
    geno_df = geno_df[self.sample_order]

    # Load the expression matrix file.
    print("Loading expression matrix.")
    expr_df = load_dataframe(self.expr_file, header=0, index_col=0)
    expr_df = expr_df.rename(columns=self.sample_dict)
    self.complete_expr_matrix = expr_df[self.sample_order]

    # Construct the genotype / expression matrices; every buffer starts
    # with its header line.
    print("Constructing matrices.")
    geno_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
    expr_str_buffer = ["-" + "\t" + "\t".join(self.sample_order) + "\n"]
    allele_str_buffer = [
        "-" + "\t" + "\t".join(list(allele_df.columns)) + "\n"
    ]

    n_snps = self.eqtl_df.shape[0]
    last_index = n_snps - 1
    for i, row in self.eqtl_df.iterrows():
        if (i % 250 == 0) or (i == last_index):
            # With a single eQTL the last index is 0; report 100% instead
            # of dividing by zero.
            if last_index > 0:
                progress = (100 / last_index) * i
            else:
                progress = 100.0
            print("\tProcessing {}/{} "
                  "[{:.2f}%]".format(i, last_index, progress))

            # Write output files.
            self.write_buffer(self.geno_outpath, geno_str_buffer)
            geno_str_buffer = []

            self.write_buffer(self.expr_outpath, expr_str_buffer)
            expr_str_buffer = []

            self.write_buffer(self.alleles_outpath, allele_str_buffer)
            allele_str_buffer = []

        # Get the row info.
        snp_name = row["SNPName"]
        probe_name = row["ProbeName"]

        # Get the genotype.
        genotype = geno_df.loc[[snp_name], :]
        if (len(genotype.index)) != 1:
            print("SNP: {} gives 0 or >1 "
                  "genotypes.".format(snp_name))
            continue

        # Get the alleles.
        alleles = allele_df.loc[[snp_name], :]
        if (len(alleles.index)) != 1:
            print("SNP: {} gives 0 or >1 "
                  "alleles.".format(snp_name))
            continue

        # Get the expression.
        expression = self.complete_expr_matrix.loc[[probe_name], :]
        if (len(expression.index)) != 1:
            print("Probe: {} gives 0 or >1 expression "
                  "profiles.".format(probe_name))
            continue

        # Only now, with all three lookups valid, add the rows. Appending
        # earlier would leave a genotype row without its expression row
        # whenever a later lookup is rejected.
        geno_str_buffer.append(
            snp_name + "\t" + "\t".join(
                genotype.iloc[0, :].astype(str).values) + "\n")
        allele_str_buffer.append(
            "{}\t{}\t{}\n".format(snp_name,
                                  alleles.iloc[0]["Alleles"],
                                  alleles.iloc[0]["MinorAllele"]))
        expr_str_buffer.append(
            probe_name + "\t" + "\t".join(
                expression.iloc[0, :].astype(str).values) + "\n")

    # Write what is left in the buffers.
    if geno_str_buffer:
        self.write_buffer(self.geno_outpath, geno_str_buffer)
    if expr_str_buffer:
        self.write_buffer(self.expr_outpath, expr_str_buffer)
    if allele_str_buffer:
        self.write_buffer(self.alleles_outpath, allele_str_buffer)

    # Remove old dataframes.
    del geno_df, expr_df
def start(self):
    """
    Create the marker gene expression table and the cell type profile
    expression table used for deconvolution.

    The step is skipped when both output files exist and force is not
    set. The expression matrix is taken from self.expr_df, unless
    decon_expr_file is set, in which case that file is loaded instead.
    The index of the expression matrix is translated with the
    ArrayAddress -> Symbol columns of the translate file before genes are
    looked up.
    """
    print("Starting creating deconvolution matrices.")
    self.print_arguments()

    # Check if output file exist.
    if check_file_exists(self.markers_outpath) and \
            check_file_exists(self.ct_profile_expr_outpath) and \
            not self.force:
        print("Skipping step.")
        return

    # Check which expression file we will use.
    expr_file = self.expr_file
    expr_df = self.expr_df
    if self.decon_expr_file:
        print("Warning: using a different expression file for "
              "deconvolution than for gene expression. This might take "
              "longer to load.")
        expr_file = self.decon_expr_file
        expr_df = None

    # Load the complete expression file.
    if expr_df is None:
        # Load the expression matrix file.
        print("Loading expression matrix.")
        expr_df = load_dataframe(expr_file, header=0, index_col=0)
        expr_df = expr_df.rename(columns=self.sample_dict)
        expr_df = expr_df[self.sample_order]

    # Load the translate file.
    print("Loading translate matrix.")
    trans_df = load_dataframe(self.translate_file, header=0, index_col=None)
    trans_dict = dict(
        zip(trans_df.loc[:, "ArrayAddress"], trans_df.loc[:, "Symbol"]))

    # Translate the index on a shallow copy: expr_df can be the very same
    # object as self.expr_df, and assigning a new index to it directly
    # would silently relabel that shared matrix as well.
    expr_df = expr_df.copy(deep=False)
    expr_df.index = expr_df.index.map(trans_dict)
    expr_df.index.name = "-"

    # Remove unneeded variables.
    del trans_df, trans_dict

    # Create the marker gene file.
    if not check_file_exists(self.markers_outpath) or self.force:
        if os.path.isfile(self.markers_outpath):
            print("Removing: {}".format(self.markers_outpath))
            os.remove(self.markers_outpath)

        print("Creating marker gene expression table.")
        marker_str_buffer = [
            "-" + "\t" + "\t".join(self.sample_order) + "\n"
        ]
        for celltype, marker_genes in self.marker_dict.items():
            for marker_gene in marker_genes:
                # Genes absent from the expression matrix are skipped
                # without a message.
                if marker_gene not in expr_df.index:
                    continue

                expression = expr_df.loc[[marker_gene], :]
                if (len(expression.index)) != 1:
                    print("\tMarker gene: {} gives 0 or >1 expression "
                          "profiles.".format(marker_gene))
                    continue

                marker_str = self.marker_genes_suffix + "_" + \
                    celltype + "_" + marker_gene + "\t" + \
                    "\t".join(expression.iloc[0, :].astype(str).values) \
                    + "\n"
                marker_str_buffer.append(marker_str)

        self.write_buffer(self.markers_outpath, marker_str_buffer)

    # Create the cell type profile expression file.
    if not check_file_exists(self.ct_profile_expr_outpath) or self.force:
        if os.path.isfile(self.ct_profile_expr_outpath):
            print("Removing: {}".format(self.ct_profile_expr_outpath))
            os.remove(self.ct_profile_expr_outpath)

        # Load the celltype profile file.
        print("Loading cell type profile matrix.")
        self.celltype_profile = load_dataframe(self.celltype_profile_file,
                                               header=0, index_col=0)

        # Create the celltype profile file.
        print("Creating cell type profile expression table.")
        profile_str_buffer = [
            "-" + "\t" + "\t".join(self.sample_order) + "\n"
        ]
        for marker_gene in self.celltype_profile.index:
            if marker_gene not in expr_df.index:
                continue

            expression = expr_df.loc[[marker_gene], :]
            if (len(expression.index)) != 1:
                print("\tMarker gene: {} gives 0 or >1 expression "
                      "profiles.".format(marker_gene))
                continue

            profile_str = marker_gene + "\t" + "\t".join(
                expression.iloc[0, :].astype(str).values) + "\n"
            profile_str_buffer.append(profile_str)

        self.write_buffer(self.ct_profile_expr_outpath, profile_str_buffer)
def start(self):
    """
    Create the regression file: one line per eQTL with the result of a
    linear regression of expression on genotype.

    The step is skipped when the output file exists and force is not set.
    Processing stops at the first row whose genotype or expression label
    does not equal the SNP name of the eQTL.
    """
    print("Starting creating regression file.")
    self.print_arguments()

    # Check if output file exist.
    if check_file_exists(self.outpath) and not self.force:
        print("Skipping step.")
        return

    # Remove the output files.
    if os.path.isfile(self.outpath):
        print("Removing file: {}.".format(self.outpath))
        os.remove(self.outpath)

    # The buffer starts with the header line.
    column_names = ["snp", "probe", "alleles", "minor_allele",
                    "allele_assessed", "flipped", "slope", "intercept",
                    "corr_coeff", "p_value", "std_err", "overal_z_score",
                    "z_score_estimate"]
    regr_str_buffer = ["\t".join(column_names) + "\n"]

    # Correlating.
    print("Correlating:")
    nrows = self.eqtl_df.shape[0]
    final_row = nrows - 1
    for i, row in self.eqtl_df.iterrows():
        # Report progress and flush the buffer every 250 rows and on the
        # last row.
        if i % 250 == 0 or i == final_row:
            print("\t Processing {}/{} [{:.2f}%]".format(
                i, nrows, (100 / nrows) * i))
            self.write_buffer(self.outpath, regr_str_buffer)
            regr_str_buffer = []

        # Extract the useful information.
        snp_name = row["SNPName"]
        probe_name = row["ProbeName"]
        overal_z_score = row["OverallZScore"]
        allele_assessed = row["AlleleAssessed"]

        # Get the data of this eQTL as single column frames.
        genotype = self.geno_df.iloc[i, :].T.to_frame()
        if genotype.columns[0] != snp_name:
            print("SNPName does not match in genotype subset.")
            break

        # NOTE(review): the expression row label is compared with the SNP
        # name as well; confirm the expression table is labelled per SNP
        # and not per probe.
        expression = self.expr_df.iloc[i, :].T.to_frame()
        if expression.columns[0] != snp_name:
            print("SNPName does not match in expression subset.")
            break

        data = genotype.merge(expression,
                              left_index=True,
                              right_index=True)
        data.columns = ["genotype", "expression"]

        # Remove missing values.
        # NOTE(review): -1 is replaced in both columns, so an expression
        # value of exactly -1 is dropped too; confirm this is intended.
        data.replace(-1, np.nan, inplace=True)
        data.dropna(inplace=True)

        # Determine the alleles.
        (alleles, minor_allele) = self.alleles_df.iloc[i, :]

        # Flip the genotype when the assessed allele is not the second
        # allele of the pair.
        second_allele = alleles.split("/")[1]
        flipped = allele_assessed != second_allele
        if flipped:
            data['genotype'] = 2.0 - data['genotype']

        # Calculate the correlation.
        regression = stats.linregress(data["genotype"], data["expression"])
        slope, intercept, r_value, p_value, std_err = regression

        # Calculate the z-score estimate.
        z_score_estimate = slope / std_err

        # Add to the buffer.
        fields = (snp_name, probe_name, alleles, minor_allele,
                  allele_assessed, flipped, slope, intercept, r_value,
                  p_value, std_err, overal_z_score, z_score_estimate)
        regr_str_buffer.append(
            "\t".join("{}".format(field) for field in fields) + "\n")

    # Write what is left in the buffer.
    if regr_str_buffer:
        self.write_buffer(self.outpath, regr_str_buffer)
def start(self):
    """
    The method that serves as the pipeline of the whole program.

    For every group input directory: prepare the input / output
    directories, convert the genotype, expression and covariate files to
    binary format, prepare the eQTL file, and execute the analyser. A
    group whose output directory already holds a file matching
    inter_regex is skipped unless force is set.
    """
    print("Starting interaction analyser.")
    self.print_arguments()

    # Loop over the groups.
    print("Performing interaction analyses.")
    n_groups = len(self.group_indirs)
    for i, group_indir in enumerate(self.group_indirs):
        # Prepare the input and output directories.
        if self.groups is None:
            group_id = ""
            group_outdir = self.outdir
        else:
            group_id = get_leaf_dir(group_indir)
            group_outdir = os.path.join(self.outdir, group_id)

        ia_indir = os.path.join(group_outdir, 'input')
        ia_outdir = os.path.join(group_outdir, 'output')
        for outdir in (group_outdir, ia_indir, ia_outdir):
            prepare_output_dir(outdir)

        # Stop if we already have the interaction matrix; with force set
        # the output directory is not even searched.
        if not self.force and any(
                re.match(self.inter_regex, get_basename(path))
                for path in glob.glob(os.path.join(ia_outdir, "*"))):
            continue

        print("\tWorking on: {:15s} [{}/{} "
              "{:.2f}%]".format(group_id, i + 1, n_groups,
                                (100 / n_groups) * (i + 1)))

        # Prepare the input the analyser expects.
        self.print_string("\n### STEP1 ###\n")
        inputs = (("Genotypes", self.geno_filename),
                  ("Expression", self.expr_filename),
                  ("Covariates", self.cov_filename))
        for exp_ia_infile, filename in inputs:
            # Check if the binary files already exist.
            binary_files = [
                os.path.join(ia_indir, exp_ia_infile + suffix)
                for suffix in (".binary.dat",
                               ".binary.rows.txt",
                               ".binary.columns.txt")
            ]
            is_complete = all(check_file_exists(binary_file)
                              for binary_file in binary_files)
            if is_complete and not self.force:
                self.print_string(
                    "Skipping {} preparation.".format(filename))
                continue

            self.print_string("\nPreparing {}.".format(filename))

            # Define the filenames.
            compr_file = os.path.join(self.indir, group_id,
                                      filename + '.txt.gz')
            copy_file = os.path.join(ia_indir, filename + '.txt.gz')
            uncompr_file = os.path.join(ia_indir, filename + '.txt')
            bin_file = os.path.join(ia_indir, exp_ia_infile + ".binary")

            # Copy and decompress the file.
            self.print_string("\nCopying the input files.")
            self.copy_file(compr_file, copy_file)

            self.print_string("\nDecompressing the input files.")
            self.decompress(copy_file)

            # Convert to binary.
            self.print_string("\nConverting files to binary format.")
            self.convert_to_binary(uncompr_file, bin_file)

            # Remove the uncompressed file.
            self.print_string("\nRemoving uncompressed files.")
            if check_file_exists(uncompr_file):
                self.print_string("\tos.remove({})".format(uncompr_file))
                os.remove(uncompr_file)

        # Prepare the eQTL file.
        self.print_string("\n### STEP2 ###\n")
        eqtl_file = os.path.join(ia_indir, self.eqtl_filename + '.txt')
        if check_file_exists(eqtl_file) and not self.force:
            self.print_string("Skipping eqtl preparation.")
        else:
            self.print_string("\nPreparing eQTL file.")

            # Define the filenames.
            compr_file = os.path.join(self.indir, group_id,
                                      self.eqtl_filename + '.txt.gz')
            copy_file = os.path.join(ia_indir,
                                     self.eqtl_filename + '.txt.gz')

            # Copy and decompress the file.
            self.print_string("\nCopying the input files.")
            self.copy_file(compr_file, copy_file)

            self.print_string("\nDecompressing the input files.")
            self.decompress(copy_file)

        # Execute the program.
        self.print_string("\n### STEP3 ###\n")
        self.print_string("Executing the eQTLInteractionAnalyser.")
        self.execute(ia_indir, ia_outdir, eqtl_file)
def start(self):
    """
    Method to start the manager.

    Loads the stored permutation orders, or creates and stores new ones
    when they are missing or do not fit n_permutations / n_samples, runs
    the analyses through self.work, pickles the p-values, SNP t-values,
    interaction t-values and permutation p-values of both result
    containers, and prints the run time.
    """
    self.print_arguments()
    print("Starting Custom Interaction Analyser "
          "[{}]".format(datetime.now().strftime("%d-%m-%Y, %H:%M:%S")))

    # Start the timer.
    start_time = int(time.time())

    # Get the permutation orders.
    permutation_orders = None
    perm_orders_outfile = os.path.join(self.outdir,
                                       self.perm_orders_filename + ".pkl")
    if check_file_exists(perm_orders_outfile):
        print("Loading permutation order")
        permutation_orders = self.load_pickle(perm_orders_outfile)

        # Validate the permutation orders for the given input: one order
        # per permutation plus one extra, each covering every sample.
        is_valid = (
            len(permutation_orders) == (self.n_permutations + 1)
            and all(len(order) == self.n_samples
                    for order in permutation_orders)
        )

        # Report exactly one verdict; "valid" must not follow "invalid".
        if is_valid:
            print("\tvalid")
        else:
            print("\tinvalid")
            permutation_orders = None

    if permutation_orders is None:
        print("Creating permutation order")
        permutation_orders = self.create_perm_orders()
        self.dump_pickle(permutation_orders, self.outdir,
                         self.perm_orders_filename)

    # Start the work.
    print("Start the analyses", flush=True)
    storage = self.work(permutation_orders)
    tc_container = storage.get_tech_cov_container()
    c_container = storage.get_cov_container()

    print("Saving output files", flush=True)
    filename_suffix = "{}_{}".format(self.skip_rows, self.n_eqtls)
    for container, outdir in zip([tc_container, c_container],
                                 [self.tech_cov_outdir, self.cov_outdir]):
        full_outdir = os.path.join(self.outdir, outdir)
        prepare_output_dir(full_outdir)

        # Each getter is only called right before its result is dumped.
        outputs = (
            (container.get_pvalues, self.pvalues_filename),
            (container.get_snp_tvalues, self.snp_tvalues_filename),
            (container.get_inter_tvalues, self.inter_tvalues_filename),
            (container.get_perm_pvalues, self.perm_pvalues_filename),
        )
        for get_values, filename in outputs:
            self.dump_pickle(get_values(),
                             full_outdir,
                             filename,
                             filename_suffix=filename_suffix,
                             subdir=True,
                             unique=True)

    # Print the process time.
    run_time = int(time.time()) - start_time
    run_time_min, run_time_sec = divmod(run_time, 60)
    run_time_hour, run_time_min = divmod(run_time_min, 60)
    print("Finished in {} hour(s), {} minute(s) and "
          "{} second(s)".format(int(run_time_hour),
                                int(run_time_min),
                                int(run_time_sec)))

    # The timer counts whole seconds, so a run that finishes within one
    # second measures 0; count at least one second to keep the rate
    # defined instead of dividing by zero.
    elapsed_minutes = max(run_time, 1) / 60
    print("Received {:.2f} analyses per minute".format(
        (self.n_eqtls * (self.n_permutations + 1)) / elapsed_minutes))

    # Shutdown the manager.
    print("Shutting down manager [{}]".format(
        datetime.now().strftime("%d-%m-%Y, %H:%M:%S")), flush=True)
def start(self):
    """
    Replace the eQTL, sample and covariate labels of the data frames with
    numbered masks and save the masked tables, together with translation
    tables from the unmasked to the masked labels. Every output that
    already exists is skipped unless force is set.
    """
    print("Starting creating masked files.")
    self.print_arguments()

    # Get the sizes.
    n_eqtls, n_samples = self.geno_df.shape
    n_covs = self.cov_df.shape[0]

    # Create masks.
    eqtl_mask = ["eqtl_{}".format(x) for x in range(n_eqtls)]
    sample_mask = ["sample_{}".format(x) for x in range(n_samples)]
    cov_mask = ["cov_{}".format(x) for x in range(n_covs)]

    # Create translate tables. The unmasked labels are read here, before
    # the data frames are relabelled further down.
    print("Creating translation files.")
    eqtl_translate_outpath = os.path.join(self.outdir,
                                          "eqtl_translate_table.txt.gz")
    if check_file_exists(eqtl_translate_outpath) and not self.force:
        print("\tSkipping eQTLs translate table.")
    else:
        eqtl_translate = pd.DataFrame({'unmasked': list(self.geno_df.index),
                                       'masked': eqtl_mask})
        save_dataframe(outpath=eqtl_translate_outpath, df=eqtl_translate,
                       index=False, header=True)
        del eqtl_translate

    sample_translate_outpath = os.path.join(self.outdir,
                                            "sample_translate_table.txt.gz")
    if check_file_exists(sample_translate_outpath) and not self.force:
        print("\tSkipping sample translate table.")
    else:
        sample_translate = pd.DataFrame(
            {'unmasked': list(self.geno_df.columns),
             'masked': sample_mask})
        save_dataframe(outpath=sample_translate_outpath,
                       df=sample_translate,
                       index=False, header=True)
        del sample_translate

    cov_translate_outpath = os.path.join(self.outdir,
                                         "cov_translate_table.txt.gz")
    if check_file_exists(cov_translate_outpath) and not self.force:
        print("\tSkipping covariates translate table.")
    else:
        cov_translate = pd.DataFrame({'unmasked': list(self.cov_df.index),
                                      'masked': cov_mask})
        save_dataframe(outpath=cov_translate_outpath, df=cov_translate,
                       index=False, header=True)
        del cov_translate

    # Start masking the dataframes. The frames held by this object are
    # relabelled in place and then written out.
    print("Start masking files.")
    eqtl_outpath = os.path.join(self.outdir, "eqtl_table.txt.gz")
    if check_file_exists(eqtl_outpath) and not self.force:
        print("\tSkipping eQTL table.")
    else:
        self.eqtl_df.index = eqtl_mask
        save_dataframe(outpath=eqtl_outpath, df=self.eqtl_df,
                       index=True, header=True)

    geno_outpath = os.path.join(self.outdir, "genotype_table.txt.gz")
    if check_file_exists(geno_outpath) and not self.force:
        print("\tSkipping genotype table.")
    else:
        self.geno_df.index = eqtl_mask
        self.geno_df.columns = sample_mask
        save_dataframe(outpath=geno_outpath, df=self.geno_df,
                       index=True, header=True)

    alleles_outpath = os.path.join(self.outdir, "genotype_alleles.txt.gz")
    if check_file_exists(alleles_outpath) and not self.force:
        print("\tSkipping genotype alleles tables.")
    else:
        self.alleles_df.index = eqtl_mask
        save_dataframe(outpath=alleles_outpath, df=self.alleles_df,
                       index=True, header=True)

    expr_outpath = os.path.join(self.outdir, "expression_table.txt.gz")
    if check_file_exists(expr_outpath) and not self.force:
        print("\tSkipping expression table.")
    else:
        self.expr_df.index = eqtl_mask
        self.expr_df.columns = sample_mask
        save_dataframe(outpath=expr_outpath, df=self.expr_df,
                       index=True, header=True)

    cov_outpath = os.path.join(self.outdir, "covariates_table.txt.gz")
    if check_file_exists(cov_outpath) and not self.force:
        print("\tSkipping covariates table.")
    else:
        self.cov_df.index = cov_mask
        self.cov_df.columns = sample_mask
        save_dataframe(outpath=cov_outpath, df=self.cov_df,
                       index=True, header=True)