def combine_files(self):
    """Combine every GTE file matching ``self.inpath`` into one data frame.

    Each matching file is loaded without header or index column and gets a
    ``dataset`` column holding its base name with ".txt" removed. Files whose
    base name is in ``self.exclude_files`` are skipped. The frames are stacked
    row-wise and exact duplicate rows are dropped. When
    ``self.exclude_samples_path`` is set, rows whose second column occurs in
    the first column of that file are removed as well.

    Returns:
        The combined (and optionally sample-filtered) data frame.

    Raises:
        FileNotFoundError: if no file is left to combine, i.e. the pattern
            matched nothing or every match was excluded.
    """
    frames = []
    for gte_inpath in glob.glob(self.inpath):
        gte_file = os.path.basename(gte_inpath).replace(".txt", "")
        if gte_file in self.exclude_files:
            continue

        df = load_dataframe(inpath=gte_inpath,
                            header=None,
                            index_col=None,
                            logger=self.log)
        df["dataset"] = gte_file
        frames.append(df)

    if not frames:
        # Fail with a clear message instead of an AttributeError on None.
        raise FileNotFoundError(
            "No GTE files left to combine for pattern '{}'.".format(
                self.inpath))

    # Concatenate once; growing the frame inside the loop re-copies all
    # previously loaded rows on every iteration.
    if len(frames) == 1:
        combined = frames[0]
    else:
        combined = pd.concat(frames, axis=0, ignore_index=True)

    # Remove duplicate entries.
    combined = combined.drop_duplicates()

    # Remove samples.
    if self.exclude_samples_path is not None:
        sample_exclude_df = load_dataframe(inpath=self.exclude_samples_path,
                                           header=None,
                                           index_col=None,
                                           logger=self.log)

        pre_shape = combined.shape[0]
        excluded = sample_exclude_df.iloc[:, 0].tolist()
        combined = combined.loc[~combined.iloc[:, 1].isin(excluded), :]
        # Logger.warn is a deprecated alias; use warning like the other steps.
        self.log.warning("\tRemoving '{}' samples".format(
            pre_shape - combined.shape[0]))

    return combined
def start(self):
    """Correct the signature expression data for dataset effects.

    The step is skipped when the corrected output file already exists and
    ``self.force`` is not set. Otherwise the dataset and signature expression
    matrices are loaded (unless already present on the instance), passed to
    ``self.dataset_correction`` and the result is written to
    ``self.sign_expr_dc_outpath``.
    """
    self.log.info("Correcting expression data for dataset effects.")
    self.print_arguments()

    self.log.info("Correcting signature expression data.")
    if check_file_exists(self.sign_expr_dc_outpath) and not self.force:
        self.log.info("\tSkipping step.")
        return

    # Only read the inputs from disk when they were not supplied already.
    if self.dataset_df is None:
        self.dataset_df = load_dataframe(self.dataset_file,
                                         header=0,
                                         index_col=0,
                                         logger=self.log)
    if self.sign_expr_df is None:
        self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                           header=0,
                                           index_col=0,
                                           logger=self.log)

    corrected = self.dataset_correction(self.sign_expr_df, self.dataset_df)
    self.sign_expr_dc_df = corrected

    save_dataframe(df=corrected,
                   outpath=self.sign_expr_dc_outpath,
                   index=True,
                   header=True,
                   logger=self.log)
def create_covs_file(self):
    """Build the covariate matrix from eigenvectors and deconvolution data.

    The first ``max(self.n_eigen)`` rows of the eigenvector file are loaded,
    transposed when none of the column labels occur in ``self.sample_order``,
    relabelled through ``self.sample_dict`` and restricted to
    ``self.sample_order``. For every ``n`` in ``self.n_eigen`` the first ``n``
    rows are saved as ``first{n}PCComponents.txt.gz`` in ``self.outdir``. The
    eigenvectors are then merged with the deconvolution matrix on sample.

    Returns:
        Data frame with covariates on the rows and samples on the columns,
        in ``self.sample_order`` order, with index name "-".
    """
    # read the eigenvectors file.
    self.log.info("Loading eigenvectors matrix.")
    eigen_df = load_dataframe(self.eig_file,
                              header=0,
                              index_col=0,
                              nrows=max(self.n_eigen),
                              logger=self.log)

    # NOTE(review): the orientation check runs after nrows already limited
    # the rows and before the labels are translated - confirm this is the
    # intended order for transposed input files.
    if not set(self.sample_order).intersection(eigen_df.columns):
        eigen_df = eigen_df.T
    eigen_df.columns = [self.sample_dict.get(x, x) for x in eigen_df.columns]
    eigen_df = eigen_df.loc[:, self.sample_order]

    for n_eigen in self.n_eigen:
        save_dataframe(df=eigen_df.iloc[:n_eigen, :],
                       outpath=os.path.join(
                           self.outdir,
                           "first{}PCComponents.txt.gz".format(n_eigen)),
                       index=True,
                       header=True,
                       logger=self.log)

    # loading deconvolution matrix.
    self.log.info("Loading deconvolution matrix.")
    if self.decon_df is None:
        self.decon_df = load_dataframe(self.decon_file,
                                       header=0,
                                       index_col=0,
                                       logger=self.log)

    # merge.
    self.log.info("Merging matrices.")
    covs_df = pd.merge(eigen_df.T,
                       self.decon_df,
                       left_index=True,
                       right_index=True).T
    covs_df.index.name = "-"

    # Validate sample order. Compare as lists: Index.equals() returns False
    # for any non-Index argument, so it can never confirm a plain list.
    if list(covs_df.columns) != list(self.sample_order):
        covs_df = covs_df[self.sample_order]

    return covs_df
def normal_transform(self):
    """Compare ``rank_method1`` and ``rank_method2`` on every row, then exit.

    The matrix is loaded from ``self.inpath`` when ``self.df`` is not set.
    For each row both rank methods are evaluated and the number of rows with
    identical results is counted; progress is printed on the first row and
    then every ``self.print_interval`` rows.

    NOTE(review): the function calls ``exit()`` once the loop finishes, so
    the final ``return`` is never reached and ``new_data`` is never filled.
    This looks like leftover debugging code - confirm the intended behaviour
    before relying on the return value.
    """
    if self.df is None:
        self.log.info("Loading matrix.")
        self.df = load_dataframe(self.inpath,
                                 header=0,
                                 index_col=0,
                                 logger=self.log)

    new_data = []
    n_identical = 0
    n_compared = 0

    print("Processing data.")
    for i, (index, row) in enumerate(self.df.iterrows()):
        if i == 0 or not i % self.print_interval:
            n_rows = self.df.shape[0]
            print("\tprocessed {}/{} [{:.2f}%] lines".format(
                i, n_rows, (100 / n_rows) * i))
            print("\t\t{}/{} are identical".format(n_identical, n_compared))

        first = self.rank_method1(index, row)
        second = self.rank_method2(index, row)

        n_identical += int(np.array_equal(first, second))
        n_compared += 1

    exit()

    return pd.DataFrame(new_data,
                        index=self.df.index,
                        columns=self.df.columns)
def start(self):
    """Create and save the cohort matrix unless it already exists.

    When the output file exists and ``self.force`` is not set the step is
    skipped. Otherwise the sample information file is loaded, a
    sample-to-cohort dictionary is built from the ``self.sample_id`` and
    ``self.cohort_id`` columns, the cohort matrix is created through
    ``self.create_cohort_df`` and ``self.save()`` is called.
    """
    self.log.info("Starting creating cohort matrix.")
    self.print_arguments()

    # Nothing to do when a previous result can be reused.
    if check_file_exists(self.outpath) and not self.force:
        self.log.info("Skipping step.")
        return

    self.log.info("Loading sample information matrix.")
    self.sample_info_df = load_dataframe(inpath=self.inpath,
                                         header=0,
                                         index_col=None,
                                         low_memory=False,
                                         logger=self.log)

    self.log.info("Creating sample to cohort dict.")
    sample_to_cohort = construct_dict_from_df(self.sample_info_df,
                                              self.sample_id,
                                              self.cohort_id)

    self.log.info("Constructing cohort matrix.")
    self.cohort_df = self.create_cohort_df(self.sample_dict,
                                           self.sample_order,
                                           sample_to_cohort)
    self.save()
def start(self):
    """Filter the technical covariates file and save the result.

    Skipped when the output file exists and ``self.force`` is not set.
    Otherwise the covariate file is loaded, its row labels are translated
    through ``self.sample_dict``, the rows in ``self.sample_order`` and the
    columns in ``self.tech_covs`` are selected, linearly dependent columns
    are removed via ``self.filter_linear_dependent_covs`` and ``self.save()``
    is called.
    """
    self.log.info("Filtering technical covariates datafile.")
    self.print_arguments()

    if check_file_exists(self.outpath) and not self.force:
        self.log.info("Skipping step.")
        return

    self.log.info("Loading covariates matrix.")
    cov_df = load_dataframe(inpath=self.cov_file,
                            header=0,
                            index_col=0,
                            logger=self.log)

    self.log.info("Filtering on samples and technical covariates.")
    # Labels without a translation are kept unchanged.
    cov_df.index = [self.sample_dict.get(label, label)
                    for label in cov_df.index]
    selected = cov_df.loc[self.sample_order, self.tech_covs].copy()
    del cov_df
    self.log.info("\tNew shape: {}".format(selected.shape))

    self.log.info("Removing linearly dependent column(s).")
    self.tech_covs_df = self.filter_linear_dependent_covs(selected)
    self.log.info("\tNew shape: {}".format(self.tech_covs_df.shape))

    self.save()
def create_tech_covs_file(self):
    """Build the technical covariates matrix including cohort columns.

    The covariate file is loaded, relabelled through ``self.sample_dict`` and
    restricted to ``self.sample_order`` (rows) and ``self.tech_covs``
    (columns). Linearly dependent columns are removed with
    ``self.filter_linear_dependent_covs`` and the result is merged on sample
    with the transposed cohort matrix.

    Returns:
        Data frame with covariates on the rows and samples on the columns,
        in ``self.sample_order`` order, with index name "-".
    """
    # Load the sample info.
    self.log.info("Loading covariates matrix.")
    cov_df = load_dataframe(inpath=self.cov_file,
                            header=0,
                            index_col=0,
                            logger=self.log)

    # Filter on samples and technical covariates.
    self.log.info("Filtering on samples and technical covariates.")
    cov_df.index = [self.sample_dict.get(x, x) for x in cov_df.index]
    subset_tech_covs_df = cov_df.loc[self.sample_order, self.tech_covs].copy()
    del cov_df
    self.log.info("\tNew shape: {}".format(subset_tech_covs_df.shape))

    # Remove technical covariates that are linearly dependent.
    self.log.info("Removing linearly dependent column(s).")
    subset_tech_covs_df = self.filter_linear_dependent_covs(
        subset_tech_covs_df)
    self.log.info("\tNew shape: {}".format(subset_tech_covs_df.shape))

    # loading cohort matrix.
    self.log.info("Loading cohort matrix.")
    if self.cohort_df is None:
        self.cohort_df = load_dataframe(self.cohort_file,
                                        header=0,
                                        index_col=0,
                                        logger=self.log)

    # merge.
    self.log.info("Merging matrices.")
    tech_covs_df = pd.merge(subset_tech_covs_df,
                            self.cohort_df.T,
                            left_index=True,
                            right_index=True).T
    tech_covs_df.index.name = "-"

    # Validate sample order. Compare as lists: Index.equals() returns False
    # for any non-Index argument, so it can never confirm a plain list.
    if list(tech_covs_df.columns) != list(self.sample_order):
        tech_covs_df = tech_covs_df[self.sample_order]

    return tech_covs_df
def create_covs_file(self):
    """Build the covariate matrix from eigenvectors and deconvolution data.

    The first ``self.n_eigen`` rows of the eigenvector file are loaded,
    relabelled through ``self.sample_dict``, restricted to
    ``self.sample_order`` and merged on sample with the deconvolution matrix.

    Returns:
        Data frame with covariates on the rows and samples on the columns,
        in ``self.sample_order`` order, with index name "-".
    """
    # read the eigenvectors file.
    self.log.info("Loading eigenvectors matrix.")
    eigen_df = load_dataframe(self.eig_file,
                              header=0,
                              index_col=0,
                              nrows=self.n_eigen,
                              logger=self.log)
    eigen_df.columns = [self.sample_dict.get(x, x) for x in eigen_df.columns]
    eigen_df = eigen_df.loc[:, self.sample_order]

    # loading deconvolution matrix.
    self.log.info("Loading deconvolution matrix.")
    if self.decon_df is None:
        self.decon_df = load_dataframe(self.decon_file,
                                       header=0,
                                       index_col=0,
                                       logger=self.log)

    # merge.
    self.log.info("Merging matrices.")
    covs_df = pd.merge(eigen_df.T,
                       self.decon_df,
                       left_index=True,
                       right_index=True).T
    covs_df.index.name = "-"

    # Validate sample order. Compare as lists: Index.equals() returns False
    # for any non-Index argument, so it can never confirm a plain list.
    if list(covs_df.columns) != list(self.sample_order):
        covs_df = covs_df[self.sample_order]

    return covs_df
def load_data(self):
    """Load the covariates, the matrix header and the optional sample dict.

    Returns:
        A tuple ``(covariates_df, matrix_df, sample_dict)``. ``matrix_df`` is
        loaded with ``nrows=0``. ``sample_dict`` is ``None`` when
        ``self.sample_dict_inpath`` is ``None``; otherwise it is built with
        ``construct_dict_from_df`` from the first two columns of that file.
    """
    print("Loading covariates.")
    covariates_df = load_dataframe(self.covariates_inpath,
                                   index_col=0,
                                   header=0)

    print("Loading matrix header.")
    matrix_df = load_dataframe(self.matrix_inpath,
                               index_col=0,
                               header=0,
                               nrows=0)

    print("Loading sample_dict.")
    if self.sample_dict_inpath is None:
        return covariates_df, matrix_df, None

    translate_df = load_dataframe(self.sample_dict_inpath,
                                  index_col=None,
                                  header=0)
    key_column = translate_df.columns[0]
    value_column = translate_df.columns[1]
    sample_dict = construct_dict_from_df(translate_df,
                                         key_column,
                                         value_column)

    return covariates_df, matrix_df, sample_dict
def combine_files(self):
    """Stack every file matching ``self.inpath`` into one data frame.

    Each file is loaded without header or index column; the frames are
    concatenated row-wise and exact duplicate rows are dropped.

    Returns:
        The combined data frame.

    Raises:
        FileNotFoundError: if the glob pattern matches no file.
    """
    frames = [
        load_dataframe(inpath=infile,
                       header=None,
                       index_col=None,
                       logger=self.log)
        for infile in glob.glob(self.inpath)
    ]

    if not frames:
        # Fail with a clear message instead of an AttributeError on None.
        raise FileNotFoundError(
            "No input files found for pattern '{}'.".format(self.inpath))

    # Concatenate once; growing the frame inside the loop re-copies all
    # previously loaded rows on every iteration.
    if len(frames) == 1:
        combined = frames[0]
    else:
        combined = pd.concat(frames, axis=0, ignore_index=True)

    # Remove duplicate entries.
    return combined.drop_duplicates()
def combine_files(self):
    """Stack the per-iteration result files into one data frame.

    For every iteration ``i`` in ``1..self.n_iterations`` the file
    ``<indir>/<iter_dirname><i>/<in_filename>`` is loaded and tagged with an
    ``Iteration`` column. The frames are concatenated row-wise and exact
    duplicate rows are dropped.

    Returns:
        The combined data frame.

    Raises:
        ValueError: if ``self.n_iterations`` is smaller than 1, so there is
            nothing to combine.
    """
    frames = []
    for i in range(1, self.n_iterations + 1):
        infile = os.path.join(self.indir,
                              self.iter_dirname + str(i),
                              self.in_filename)
        df = load_dataframe(inpath=infile,
                            header=0,
                            index_col=False,
                            logger=self.log)
        df["Iteration"] = i
        frames.append(df)

    if not frames:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "n_iterations must be at least 1, got {}.".format(
                self.n_iterations))

    # Concatenate once; growing the frame inside the loop re-copies all
    # previously loaded rows on every iteration.
    if len(frames) == 1:
        combined = frames[0]
    else:
        combined = pd.concat(frames, axis=0, ignore_index=True)

    # Remove duplicate entries.
    return combined.drop_duplicates()
def start(self):
    """Load or create the combined GTE table and derive sample lookups.

    When the output file exists and ``self.force`` is not set, the stored
    result is loaded. Otherwise the GTE files are combined through
    ``self.combine_files`` and saved with ``self.save()``. Afterwards the
    sample dictionary, the sample order (second column of the GTE table) and
    the dataset-to-samples dictionary are set on the instance.
    """
    self.log.info("Starting combining GTE files.")
    self.print_arguments()

    reuse_output = check_file_exists(self.outpath) and not self.force
    if not reuse_output:
        # Build the table from the individual GTE files.
        self.log.info("Loading GTE files.")
        self.gte_df = self.combine_files()
        self.save()
    else:
        self.log.info("Skipping step, loading result.")
        self.gte_df = load_dataframe(inpath=self.outpath,
                                     header=None,
                                     index_col=None,
                                     logger=self.log)

    # Derived lookups; these all depend on self.gte_df being set above.
    self.sample_dict = self.create_sample_dict()
    self.sample_order = list(self.gte_df.iloc[:, 1])
    self.dataset_to_samples_dict = self.set_dataset_to_samples_dict()
def prepare_matrix(self):
    """Load the matrix and align its columns with ``self.sample_order``.

    Column labels are translated through ``self.sample_dict``. Samples from
    ``self.sample_order`` that are absent from the file are added as all-NaN
    columns, and the columns are put in ``self.sample_order`` order.

    Returns:
        Data frame with one column per entry of ``self.sample_order``.
    """
    self.log.info("\tLoading matrix.")
    df = load_dataframe(self.inpath, header=0, index_col=0, logger=self.log)

    self.log.info("\tPreprocessing.")
    df.columns = [self.sample_dict.get(x, x) for x in df.columns]

    self.log.info("\tChecking overlap.")
    available = set(df.columns)
    overlap = [x for x in self.sample_order if x in available]
    n_expected = len(self.sample_order)
    # Guard the percentage against an empty sample order.
    pct_found = (100 / n_expected) * len(overlap) if n_expected else 0.0
    self.log.info("\t\t{}/{} [{:.2f}%] samples found".format(
        len(overlap), n_expected, pct_found))

    self.log.info("\tSubsetting.")
    subset = df.loc[:, overlap]

    # Build the missing labels as an ordered, de-duplicated list: the
    # DataFrame constructor of recent pandas versions rejects a set for
    # ``columns``.
    missing = list(
        dict.fromkeys(x for x in self.sample_order if x not in available))
    if missing:
        self.log.info(
            "\tCompleting data frame, adding {} missing sample columns.".
            format(len(missing)))
        missing_df = pd.DataFrame(np.nan, index=df.index, columns=missing)
        subset = subset.merge(missing_df, left_index=True, right_index=True)

    if subset.columns.to_list() != list(self.sample_order):
        self.log.info("\tReordering columns.")
        subset = subset.loc[:, self.sample_order]

    return subset
def start(self):
    """Create and save the genotype, alleles and expression matrices.

    Step 1 (genotype): when the genotype or alleles output file is missing,
    or ``self.force`` is set, the genotype input is parsed, both frames are
    ordered by the "SNPName" column of the eQTL table and saved.

    Step 2 (expression): when the expression or signature expression output
    file is missing, or ``self.force`` is set, the signature matrix and gene
    info are loaded, the expression file(s) are parsed and the resulting
    frames are ordered and saved.
    """
    self.log.info("Starting creating matrices.")
    self.print_arguments()

    # The eQTL table defines the row order of the output matrices.
    if self.eqtl_df is None:
        self.eqtl_df = load_dataframe(self.eqtl_file,
                                      header=0,
                                      index_col=None,
                                      logger=self.log)

    self.log.info("Parsing genotype input data.")
    if not check_file_exists(self.geno_outpath) or not check_file_exists(
            self.alleles_outpath) or self.force:
        alleles_df, geno_df = self.parse_genotype_file()

        self.log.info("Reorder, Filter, and save.")
        # One row per eQTL, in eQTL-table order.
        self.alleles_df = alleles_df.loc[self.eqtl_df.loc[:, "SNPName"], :]
        save_dataframe(df=self.alleles_df,
                       outpath=self.alleles_outpath,
                       index=True,
                       header=True,
                       logger=self.log)

        # Same row order, columns restricted to self.sample_order.
        self.geno_df = geno_df.loc[self.eqtl_df.loc[:, "SNPName"],
                                   self.sample_order]
        save_dataframe(df=self.geno_df,
                       outpath=self.geno_outpath,
                       index=True,
                       header=True,
                       logger=self.log)
    else:
        self.log.info("\tSkipping step.")

    self.log.info("Parsing expression input data.")
    if not check_file_exists(self.expr_outpath) or not check_file_exists(
            self.sign_expr_outpath) or self.force:
        self.log.info("Loading signature matrix.")
        self.sign_df = load_dataframe(inpath=self.sign_file,
                                      header=0,
                                      index_col=0,
                                      logger=self.log)
        # Row labels of the signature matrix, passed on to the parser.
        signature_genes = set(self.sign_df.index.to_list())

        self.log.info("Loading gene traslate dict.")
        self.gene_info_df = load_dataframe(inpath=self.gene_info_file,
                                           header=0,
                                           index_col=None,
                                           logger=self.log)
        # Maps the self.ensg_id column onto the self.hgnc_id column.
        gene_trans_dict = construct_dict_from_df(self.gene_info_df,
                                                 self.ensg_id,
                                                 self.hgnc_id)

        if not check_file_exists(self.expr_outpath) or self.force:
            self.log.info("Parsing expression data.")
            # include_decon is only True when no separate deconvolution
            # expression file was configured.
            self.expr_df, self.sign_expr_df = self.parse_expression_file(
                self.expr_file,
                signature_genes,
                gene_trans_dict,
                include_decon=self.decon_expr_file is None)

        if (not check_file_exists(self.sign_expr_outpath) or
                self.force) and (check_file_exists(self.decon_expr_file)):
            self.log.info("Parsing deconvolution expression data.")
            self.log.warning(
                "Using different expresion file for deconvolution.")
            # Only the signature expression part of the result is kept; it
            # replaces any value set by the previous parse.
            _, self.sign_expr_df = self.parse_expression_file(
                self.decon_expr_file,
                signature_genes,
                gene_trans_dict,
                include_expr=False,
                remove_ens_version=True)

        self.log.info("Reorder, Filter, and save.")
        if self.expr_df is not None:
            # One row per eQTL ("ProbeName"), columns in sample order.
            self.expr_df = self.expr_df.loc[self.eqtl_df.loc[:, "ProbeName"],
                                            self.sample_order]
            save_dataframe(df=self.expr_df,
                           outpath=self.expr_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
        if self.sign_expr_df is not None:
            self.sign_expr_df = self.sign_expr_df.loc[:, self.sample_order]
            save_dataframe(df=self.sign_expr_df,
                           outpath=self.sign_expr_outpath,
                           index=True,
                           header=True,
                           logger=self.log)
    else:
        self.log.info("\tSkipping step.")
def work(self, permutation_orders):
    """Run the interaction analysis for this worker's slice of eQTLs.

    For each eQTL the base model (intercept, genotype, technical covariates
    and, optionally, their genotype interactions) is regressed out of the
    expression. For every covariate and every sample order in
    ``permutation_orders`` a null model (covariate) is compared with an
    alternative model (covariate + covariate x genotype).

    Args:
        permutation_orders: iterable of sample orders. An entry of ``None``
            leaves the covariate values in their original order; any other
            entry is used to positionally reorder the covariate values.

    Returns:
        The ``StorageContainer`` holding the results. It is returned early,
        without storing the eQTL in progress, once ``self.panic_time`` has
        passed.
    """
    # Load the data
    print("Loading data", flush=True)
    tech_covs_df = load_dataframe(self.tech_covs_inpath,
                                  header=0,
                                  index_col=0)
    covs_df = load_dataframe(self.covs_inpath, header=0, index_col=0)

    # Skip the data rows that belong to other workers (row 0 is the header).
    skipped_rows = list(range(1, self.skip_rows + 1))
    geno_df = load_dataframe(self.geno_inpath,
                             header=0,
                             index_col=0,
                             skiprows=skipped_rows,
                             nrows=self.n_eqtls)
    expr_df = load_dataframe(self.expr_inpath,
                             header=0,
                             index_col=0,
                             skiprows=skipped_rows,
                             nrows=self.n_eqtls)

    # Validate the dataframes match up.
    dfs = [tech_covs_df, covs_df, geno_df, expr_df]
    for a, b in itertools.combinations(dfs, 2):
        if a is not None and b is not None and \
                not a.columns.identical(b.columns):
            print("Order of samples are not identical.")
            exit()

    # Replace -1 with NaN in the genotype dataframe. This way we can
    # drop missing values.
    geno_df.replace(-1, np.nan, inplace=True)

    # Initialize the storage object.
    print("Creating storage object")
    storage = StorageContainer(colnames=covs_df.index.to_list())

    # Start working.
    print("Starting interaction analyser", flush=True)
    eqtl_ids = range(self.skip_rows, self.skip_rows + self.n_eqtls)
    for row_index, eqtl_index in enumerate(eqtl_ids):
        print("\tProcessing eQTL {}/{} "
              "[{:.0f}%]".format(row_index + 1, self.n_eqtls,
                                 (100 / self.n_eqtls) * (row_index + 1)),
              flush=True)

        start_time = time.time()

        # Get the complete genotype row for the permutation later.
        genotype_all = geno_df.iloc[row_index, :].copy()

        # Get the positions of the samples with a genotype call.
        indices = np.arange(geno_df.shape[1])
        eqtl_indices = indices[~geno_df.iloc[row_index, :].isnull().values]

        # Subset the row and present samples for this eQTL.
        genotype = geno_df.iloc[row_index, eqtl_indices].copy()
        expression = expr_df.iloc[row_index, eqtl_indices].copy()
        technical_covs = tech_covs_df.iloc[:, eqtl_indices].copy()

        # Initialize variables.
        storage.add_row(eqtl_index,
                        "{}_{}".format(genotype.name, expression.name))

        # Create the base model: intercept + SNP + technical covariates,
        # optionally extended with technical covariate x genotype terms.
        intercept = pd.DataFrame(1,
                                 index=genotype.index,
                                 columns=["intercept"])
        base_matrix = reduce(
            lambda left, right: pd.merge(
                left, right, left_index=True, right_index=True),
            [intercept, genotype.to_frame(), technical_covs.T])

        if self.correct_snp_tc_inter:
            tech_inter_matrix = technical_covs.mul(genotype, axis=1)
            tech_inter_matrix.index = [
                "{}_X_SNP".format(x) for x in technical_covs.index
            ]
            base_matrix = base_matrix.merge(tech_inter_matrix.T,
                                            left_index=True,
                                            right_index=True)

        # Regress out the base model from the expression values.
        expression_hat = self.remove_covariates(expression, base_matrix)

        # Loop over the covariates.
        for cov_index in range(len(covs_df.index)):
            if storage.has_error():
                break

            # Get the covariate we are processing.
            covariate = covs_df.iloc[cov_index, eqtl_indices].copy()
            cov_name = covariate.name

            if self.verbose:
                print("\t\tWorking on '{}'".format(cov_name), flush=True)

            # Create the null model.
            null_matrix = covariate.to_frame()
            n_null = null_matrix.shape[0]
            df_null, rss_null, _, _ = self.create_model(
                null_matrix, expression_hat)

            if self.verbose:
                print("\t\tn_null: {}\tdf_null: {}\trss_null: {}\t".format(
                    n_null, df_null, rss_null))

            # Loop over each permutation sample order. The first order
            # is the normal order and the remainder are random shuffles.
            for order_id, sample_order in enumerate(permutation_orders):
                if storage.has_error():
                    break

                if self.verbose:
                    print("\t\t\tWorking on 'order_{}'".format(order_id),
                          flush=True)

                # Reorder the covariate based on the sample order.
                # Make sure the labels are in the same order, just
                # shuffle the values.
                covariate_all = covs_df.iloc[cov_index, :].copy()
                if sample_order is not None:
                    covariate_all_index = covariate_all.index
                    covariate_all = covariate_all.reindex(
                        covariate_all.index[sample_order])
                    covariate_all.index = covariate_all_index

                # Calculate the interaction effect of the covariate of
                # interest. Then drop the NA's from the interaction
                # term.
                inter_of_interest = covariate_all * genotype_all
                inter_name = "{}_X_SNP".format(cov_name)
                inter_of_interest.name = inter_name
                inter_of_interest = inter_of_interest.iloc[eqtl_indices]
                del covariate_all

                # Check if the drop is identical (see above).
                if not inter_of_interest.index.equals(null_matrix.index):
                    print("\t\t\tError in permutation reordering "
                          "(ID: {})".format(order_id),
                          flush=True)
                    storage.set_error()
                    continue

                # Create the alternative matrix and add the interaction
                # term.
                alt_matrix = null_matrix.copy()
                alt_matrix = alt_matrix.merge(inter_of_interest.to_frame(),
                                              left_index=True,
                                              right_index=True)
                del inter_of_interest

                # Create the alternative model.
                n_alt = alt_matrix.shape[0]
                df_alt, rss_alt, coefficients_alt, std_errors_alt = \
                    self.create_model(alt_matrix,
                                      expression_hat,
                                      cols=[inter_name])
                del alt_matrix

                if self.verbose:
                    print(
                        "\t\t\tn_alt: {}\tdf_alt: {}\trss_alt: {}\talt_coefficients: {}\talt_std_erros: {}"
                        .format(n_alt, df_alt, rss_alt, coefficients_alt,
                                std_errors_alt))

                # Make sure the n's are identical.
                if n_null != n_alt:
                    print("\t\t\tError due to unequal n_null and n_alt",
                          flush=True)
                    storage.set_error()
                    continue

                # Safe the coefficient and std error.
                storage.add_coefficient(order_id,
                                        coefficients_alt[inter_name])
                storage.add_std_error(order_id, std_errors_alt[inter_name])

                # Compare the null and alternative model.
                fvalue = self.calc_f_value(rss_null, rss_alt, df_null,
                                           df_alt, n_null)
                pvalue = self.get_p_value(fvalue, df_null, df_alt, n_null)

                if self.verbose:
                    print("\t\t\tfvalue: {}\tpvalue: {}".format(
                        fvalue, pvalue))

                # Safe the p-values.
                storage.add_pvalue(order_id, pvalue)

                del fvalue, pvalue

            # Check whether we are almost running out of time. The eQTL in
            # progress is not stored, so only completed rows are returned.
            if time.time() > self.panic_time:
                print("\tPanic!!!", flush=True)
                return storage

        # Safe the results of the eQTL.
        storage.store_row()

        # Print the time. flush belongs to print(); passing it to format()
        # silently ignored it.
        print("\t\tfinished in {:.4f} second(s).".format(time.time() -
                                                         start_time),
              flush=True)

    return storage
def perform_deconvolution(self):
    """Estimate cell type proportions per sample with NNLS.

    The signature matrix is filtered, matched with the expression matrix via
    ``self.subset``, log2 transformed and, like the expression matrix,
    shifted when it contains negative values. ``self.nnls`` is then run per
    sample. The raw weights are saved as ``NNLS_betas.txt.gz`` and the
    weights after ``self.sum_to_one`` as
    ``deconvolution_table_complete.txt.gz``, both in ``self.outdir``.

    Returns:
        The proportions per sample. When ``self.cell_type_groups`` is set,
        columns mapping to the same group are summed into one column.
    """
    # Lazily load the inputs that were not supplied on the instance.
    if self.sign_df is None:
        self.log.info("Loading cell type profile matrix.")
        self.sign_df = load_dataframe(self.sign_file,
                                      header=0,
                                      index_col=0,
                                      logger=self.log)
    if self.sign_expr_df is None:
        self.log.info("Loading cell type expression matrix.")
        self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                           header=0,
                                           index_col=0,
                                           logger=self.log)

    # Filter, align and transform the signature matrix.
    signature = self.filter(self.sign_df, cutoff=self.min_expr_cutoff)
    signature, expression = self.subset(signature, self.sign_expr_df)
    signature = self.perform_log2_transform(signature)

    self.log.info("Shifting data to be positive if required")
    if signature.values.min() < 0:
        self.log.warning("\tSignature matrix is shifted.")
        signature = self.perform_shift(signature)
    if expression.values.min() < 0:
        self.log.warning("\tExpression matrix is shifted.")
        expression = self.perform_shift(expression)

    self.log.info("Signature shape: {}".format(signature.shape))
    self.log.info("Expression shape: {}".format(expression.shape))

    # One NNLS fit per sample (a row of the transposed expression matrix).
    self.log.info("Performing partial deconvolution.")
    weights = []
    rnorms = []
    accuracies = []
    for _, sample in expression.T.iterrows():
        proportions, rnorm = self.nnls(signature, sample)
        accuracy = self.calc_reconstruction_accuracy(y=sample,
                                                     X=signature,
                                                     betas=proportions)
        weights.append(proportions)
        rnorms.append(rnorm)
        accuracies.append(accuracy)

    sample_ids = expression.columns
    decon_df = pd.DataFrame(weights,
                            index=sample_ids,
                            columns=signature.columns)
    residuals = pd.Series(rnorms, index=sample_ids)
    recon_accuracy = pd.Series(accuracies, index=sample_ids)

    self.log.info("Estimated weights:")
    self.log.info(decon_df.mean(axis=0))
    self.log.info(
        "Average reconstruction accuracy: {:.2f} [SD: {:.2f}]".format(
            recon_accuracy.mean(), recon_accuracy.std()))

    betas_outpath = os.path.join(self.outdir, "NNLS_betas.txt.gz")
    save_dataframe(df=decon_df,
                   outpath=betas_outpath,
                   index=True,
                   header=True,
                   logger=self.log)

    # Rescale the weights so that they sum to one per sample.
    decon_df = self.sum_to_one(decon_df)
    self.log.info("Estimated proportions:")
    self.log.info(decon_df.mean(axis=0))

    self.log.info("Average residual: {:.2f}".format(residuals.mean()))

    complete_outpath = os.path.join(self.outdir,
                                    "deconvolution_table_complete.txt.gz")
    save_dataframe(df=decon_df,
                   outpath=complete_outpath,
                   index=True,
                   header=True,
                   logger=self.log)

    if self.cell_type_groups is None:
        return decon_df

    self.log.info("Summing cell types.")
    # Group label per column; columns without a group keep their own name.
    group_per_column = np.array([
        self.cell_type_groups.get(ct, ct) for ct in decon_df.columns
    ], dtype=object)
    group_names = sorted(set(group_per_column))

    summed_decon_df = pd.DataFrame(np.nan,
                                   index=decon_df.index,
                                   columns=group_names)
    for group_name in group_names:
        members = group_per_column == group_name
        summed_decon_df.loc[:, group_name] = decon_df.loc[:, members].sum(
            axis=1)

    return summed_decon_df
def perform_deconvolution(self):
    """Estimate cell type proportions per sample with NNLS.

    The signature matrix is filtered, matched with the expression matrix via
    ``self.subset``, log2 transformed and, like the expression matrix,
    shifted when it contains negative values. ``self.nnls`` is then run per
    sample and the weights are rescaled with ``self.sum_to_one``.

    Returns:
        The proportions per sample. Each column is named by splitting the
        signature column name on "_" and formatting the first two parts as
        ``"{}NNLS_{}"``.
    """
    # Lazily load the inputs that were not supplied on the instance.
    if self.sign_df is None:
        self.log.info("Loading cell type profile matrix.")
        self.sign_df = load_dataframe(self.sign_file,
                                      header=0,
                                      index_col=0,
                                      logger=self.log)
    if self.sign_expr_df is None:
        self.log.info("Loading cell type expression matrix.")
        self.sign_expr_df = load_dataframe(self.sign_expr_file,
                                           header=0,
                                           index_col=0,
                                           logger=self.log)

    # Filter, align and transform the signature matrix.
    signature = self.filter(self.sign_df, cutoff=self.min_expr_cutoff)
    signature, expression = self.subset(signature, self.sign_expr_df)
    signature = self.perform_log2_transform(signature)

    self.log.info("Shifting data to be positive")
    if signature.values.min() < 0:
        self.log.warning("\tSignature matrix is shifted.")
        signature = self.perform_shift(signature)
    if expression.values.min() < 0:
        self.log.warning("\tExpression matrix is shifted.")
        expression = self.perform_shift(expression)

    self.log.info("Signature shape: {}".format(signature.shape))
    self.log.info("Expression shape: {}".format(expression.shape))

    # One NNLS fit per sample (a row of the transposed expression matrix).
    self.log.info("Performing partial deconvolution.")
    weights = []
    rnorms = []
    for _, sample in expression.T.iterrows():
        proportions, rnorm = self.nnls(signature, sample)
        weights.append(proportions)
        rnorms.append(rnorm)

    output_columns = []
    for column in signature.columns:
        parts = column.split("_")
        output_columns.append("{}NNLS_{}".format(*parts))

    decon_df = pd.DataFrame(weights,
                            index=expression.columns,
                            columns=output_columns)
    residuals = pd.Series(rnorms, index=expression.columns)

    self.log.info("Estimated weights:")
    self.log.info(decon_df.mean(axis=0))

    # Rescale the weights so that they sum to one per sample.
    decon_df = self.sum_to_one(decon_df)
    self.log.info("Estimated proportions:")
    self.log.info(decon_df.mean(axis=0))

    self.log.info("Average residual: {:.2f}".format(residuals.mean()))

    return decon_df
'''
@author: XT

fletching.py contains functions for calculating fletching profits
'''
import utilities

# Data frame handed to every utilities.getPrice call below; it is loaded
# once, when this module is imported.
df = utilities.load_dataframe()

''' Onyx bolts: - 1 Runite bolts - 1 Onyx bolt tips '''
def OnyxBolts():
    """Print the profit of turning Runite bolts and Onyx bolt tips into Onyx bolts."""
    #Buy
    rb = utilities.getPrice('Runite bolts', df)
    obt = utilities.getPrice('Onyx bolt tips', df)
    #Sell
    ob = utilities.getPrice('Onyx bolts', df)
    #Profit
    # calculateProfit receives the list of bought prices and the list of sold
    # prices; only the first of its three return values is used here.
    profit, BUY, SELL = utilities.calculateProfit([rb,obt],[ob])
    print("Profit: {}\n".format(profit))

def DiamondBolts():
    #Buy
def start(self):
    """Combine the worker results and derive t-values, z-scores and FDRs.

    Step 1 collects the p-value, coefficient and standard error pickles into
    tables (or reloads tables saved earlier). Step 2 divides the coefficients
    by the standard errors and saves the t-values. Step 3 melts the p-value
    table and adds z-scores, a BH-FDR column and, when permutation p-values
    are present, a permutation FDR column; each is pivoted and saved via
    ``self.pivot_and_save`` and the molten table is saved as
    ``molten_table.txt.gz`` in ``self.work_dir``.

    Returns early, after step 2, when no p-value data is available.
    """
    print("Starting interaction analyser - combine and plot.")
    self.print_arguments()

    # Start the timer.
    start_time = time.time()

    print("")
    print("### Step 1 ###")
    print("Combine pickle files into dataframe.", flush=True)
    dataframes = {}
    for filename in [
            self.pvalues_filename, self.coef_filename, self.std_err_filename
    ]:
        outpath = os.path.join(self.work_dir,
                               "{}_table.txt.gz".format(filename))
        if not check_file_exists(outpath) or self.force:
            print("Loading {} data.".format(filename), flush=True)
            columns, data = self.combine_pickles(self.work_dir,
                                                 filename,
                                                 columns=True)

            if len(data) == 0:
                print("\tNo {} data found.".format(filename))
                continue

            print("Creating {} dataframe.".format(filename), flush=True)
            df = self.create_df(data, columns)

            print("Saving {} dataframe.".format(filename), flush=True)
            save_dataframe(df=df, outpath=outpath, header=True, index=True)

            dataframes[filename] = df

            del columns, data, df
        else:
            print("Skipping step for {}".format(outpath))
            dataframes[filename] = load_dataframe(outpath,
                                                  header=0,
                                                  index_col=0)

    print("")
    print("### Step 2 ###")
    print("Calculate t-values", flush=True)
    outpath = os.path.join(self.work_dir,
                           "{}_table.txt.gz".format(self.tvalue_filename))
    if not check_file_exists(outpath) or self.force:
        if self.coef_filename in dataframes and \
                self.std_err_filename in dataframes:
            # Calculate t-values
            coef_df = dataframes[self.coef_filename]
            std_err_df = dataframes[self.std_err_filename]

            # Restrict both frames to their shared labels. The labels are
            # kept as a list in coef_df order: pandas does not accept a set
            # as an indexer, and a list keeps the result deterministic.
            if not coef_df.columns.identical(std_err_df.columns):
                std_err_columns = set(std_err_df.columns)
                overlap = [
                    x for x in coef_df.columns if x in std_err_columns
                ]
                if len(overlap) == 0:
                    print("No overlapping eQTLs between coef and std_err "
                          "data frame columns.")
                else:
                    coef_df = coef_df.loc[:, overlap]
                    std_err_df = std_err_df.loc[:, overlap]
            if not coef_df.index.identical(std_err_df.index):
                std_err_indices = set(std_err_df.index)
                overlap = [x for x in coef_df.index if x in std_err_indices]
                if len(overlap) == 0:
                    print("No overlapping eQTLs between coef and std_err "
                          "data frames indices.")
                else:
                    coef_df = coef_df.loc[overlap, :]
                    std_err_df = std_err_df.loc[overlap, :]

            # Only divide when both axes line up exactly.
            if coef_df.columns.identical(
                    std_err_df.columns) and coef_df.index.identical(
                        std_err_df.index):
                tvalue_df = coef_df / std_err_df

                print("Saving {} dataframe.".format(self.tvalue_filename),
                      flush=True)
                save_dataframe(df=tvalue_df,
                               outpath=outpath,
                               header=True,
                               index=True)
        else:
            print("\tNo data found.")
    else:
        print("Skipping step.")

    print("")
    print("### Step 3 ###")
    print("Starting other calculations", flush=True)

    if self.pvalues_filename not in dataframes:
        print("\tNo pvalues data found.")
        return

    pvalue_df = dataframes[self.pvalues_filename]

    # Suffix every label with its position so that the labels are unique.
    pvalue_df_columns = [
        "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.columns)
    ]
    pvalue_df.columns = pvalue_df_columns
    pvalue_df_indices = [
        "{}_{}".format(x, i) for i, x in enumerate(pvalue_df.index)
    ]
    pvalue_df.index = pvalue_df_indices
    pvalue_df.reset_index(drop=False, inplace=True)

    print("Melting dataframe.", flush=True)
    dfm = pvalue_df.melt(id_vars=["index"])
    dfm.columns = ["covariate", "SNP", "pvalue"]
    dfm["rank"] = dfm.loc[:, "pvalue"].rank(ascending=True)
    n_signif = dfm[dfm["pvalue"] <= self.alpha].shape[0]
    n_total = dfm.shape[0]
    print("\t{}/{} [{:.2f}%] of pvalues < {}".format(
        n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
          flush=True)

    print("Adding z-scores.", flush=True)
    dfm["zscore"] = stats.norm.isf(dfm["pvalue"])
    # P-values at the floating point limits get fixed z-scores.
    dfm.loc[dfm["pvalue"] > (1.0 - 1e-16), "zscore"] = -8.209536151601387
    dfm.loc[dfm["pvalue"] < 1e-323, "zscore"] = 38.44939448087599
    self.pivot_and_save(dfm, "zscore", pvalue_df_indices, pvalue_df_columns)

    print("Adding BH-FDR.", flush=True)
    # NOTE(review): "rank" is 1-based, so "rank + 1" deviates from the usual
    # p * n / rank, and the running maximum below follows the row order of
    # the molten frame, which is not sorted by p-value here. Both are kept
    # unchanged - confirm the intended procedure.
    dfm["BH-FDR"] = dfm["pvalue"] * (n_total / (dfm["rank"] + 1))
    dfm.loc[dfm["BH-FDR"] > 1, "BH-FDR"] = 1
    # float("-inf") instead of np.Inf: that alias is gone in NumPy 2.0.
    prev_bh_fdr = float("-inf")
    for i in range(n_total):
        bh_fdr = dfm.loc[i, "BH-FDR"]
        if bh_fdr > prev_bh_fdr:
            prev_bh_fdr = bh_fdr
        else:
            dfm.loc[i, "BH-FDR"] = prev_bh_fdr
    n_signif = dfm[dfm["BH-FDR"] <= self.alpha].shape[0]
    print("\t{}/{} [{:.2f}%] of BH-FDR values < {}".format(
        n_signif, n_total, (100 / n_total) * n_signif, self.alpha),
          flush=True)
    self.pivot_and_save(dfm, "BH-FDR", pvalue_df_indices, pvalue_df_columns)

    print("Adding permutation FDR.", flush=True)
    print("\tLoading permutation pvalue data.", flush=True)
    _, perm_pvalues = self.combine_pickles(self.work_dir,
                                           self.perm_pvalues_filename)
    print("Sorting p-values.", flush=True)
    perm_pvalues = sorted(perm_pvalues)

    if len(perm_pvalues) > 0:
        n_perm = len(perm_pvalues) / n_total
        if n_perm != self.n_perm:
            print("\tWARNING: not all permutation pvalus are present")

        # Number of permutation p-values strictly below each real p-value.
        dfm["perm-rank"] = [
            bisect_left(perm_pvalues, pvalue) for pvalue in dfm["pvalue"]
        ]
        dfm["perm-FDR"] = (dfm["perm-rank"] / n_perm) / dfm["rank"]
        # NOTE(review): the first row is always forced to 0, which looks
        # like it assumes a frame sorted by p-value - confirm.
        dfm.loc[(dfm.index == 0) | (dfm["perm-rank"] == 0), "perm-FDR"] = 0
        dfm.loc[dfm["perm-FDR"] > 1, "perm-FDR"] = 1

        self.pivot_and_save(dfm, "perm-FDR", pvalue_df_indices,
                            pvalue_df_columns)

    print("Saving full dataframe.", flush=True)
    save_dataframe(df=dfm,
                   outpath=os.path.join(self.work_dir,
                                        "molten_table.txt.gz"),
                   header=True,
                   index=True)
    print("")

    # Print the time.
    run_time_min, run_time_sec = divmod(time.time() - start_time, 60)
    run_time_hour, run_time_min = divmod(run_time_min, 60)
    print("finished in {} hour(s), {} minute(s) and "
          "{} second(s).".format(int(run_time_hour), int(run_time_min),
                                 int(run_time_sec)),
          flush=True)
def create_tech_covs_file(self):
    """Build the correction matrix from technical, MDS and dataset data.

    The technical covariate and MDS files are loaded, relabelled through
    ``self.sample_dict`` and restricted to ``self.sample_order``. The
    transposed technical covariates, the optional
    ``self.technical_covariates`` subset, the MDS components and their
    combination are each saved to ``self.outdir``. Finally the combination
    is merged on sample with the dataset matrix.

    Returns:
        Data frame with covariates on the rows and samples on the columns,
        in ``self.sample_order`` order, with index name "-".
    """
    # Load the technical covariates.
    self.log.info("Loading technical covariates matrix.")
    tcov_df = load_dataframe(inpath=self.cov_file,
                             header=0,
                             index_col=0,
                             logger=self.log)

    # Filter on samples and technical covariates.
    self.log.info("Filtering on samples and technical covariates.")
    tcov_df.index = [self.sample_dict.get(x, x) for x in tcov_df.index]
    tcov_df = tcov_df.loc[self.sample_order, :].copy()

    save_dataframe(df=tcov_df.T,
                   outpath=os.path.join(self.outdir,
                                        "technical_covariates_table.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)

    if self.technical_covariates:
        save_dataframe(
            df=tcov_df.loc[:, self.technical_covariates].T,
            outpath=os.path.join(
                self.outdir, "technical_covariates_table_subset.txt.gz"),
            index=True,
            header=True,
            logger=self.log)

    # Load the MDS components.
    self.log.info("Loading MDS matrix.")
    mds_df = load_dataframe(inpath=self.mds_file,
                            header=0,
                            index_col=0,
                            logger=self.log)

    # Filter on samples and technical covariates.
    self.log.info("Filtering on samples and technical covariates.")
    mds_df.index = [self.sample_dict.get(x, x) for x in mds_df.index]
    mds_df = mds_df.loc[self.sample_order, :].copy()

    save_dataframe(df=mds_df.T,
                   outpath=os.path.join(self.outdir,
                                        "mds_covariates_table.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)

    tmp_combined_df = tcov_df.merge(mds_df,
                                    left_index=True,
                                    right_index=True)

    save_dataframe(df=tmp_combined_df.T,
                   outpath=os.path.join(
                       self.outdir,
                       "technical_and_mds_covariates_table.txt.gz"),
                   index=True,
                   header=True,
                   logger=self.log)

    # Loading cohort matrix.
    self.log.info("Loading dataset matrix.")
    if self.dataset_df is None:
        self.dataset_df = load_dataframe(self.dataset_file,
                                         header=0,
                                         index_col=0,
                                         logger=self.log)

    # merge. The technical + MDS merge was already computed above, so it is
    # reused instead of being merged a second time.
    self.log.info("Merging matrices.")
    correction_df = tmp_combined_df.merge(self.dataset_df,
                                          left_index=True,
                                          right_index=True).T
    correction_df.index.name = "-"
    self.log.info("\t Correction matrix shape: {}".format(
        correction_df.shape))

    # Validate sample order. Compare as lists: Index.equals() returns False
    # for any non-Index argument, so it can never confirm a plain list.
    if list(correction_df.columns) != list(self.sample_order):
        correction_df = correction_df[self.sample_order]

    return correction_df