# run_causal_rand_row.py -- run elastic-net Granger causality on one batch of
# rows, against both the original and the randomized (shuffled) data.
# gtm (gene time-series helpers) and ct (causality tests) are project modules.
import ast
import pickle


def run(args):
    # load the data
    df = gtm.load_file_and_avg(args.data_file)
    genes = df['gene'].values
    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    dfr = gtm.load_file_and_avg(args.rand_data_file)
    genesr = dfr['gene'].values
    found_genesr, geneTSr = gtm.get_gene_TS(dfr, genesr)
    n = geneTSr.shape[0]

    args_dict = ct.load_kwargs_file(argsfile=args.args_file)
    print args_dict

    if args.rowlist_file is not None:
        # the row file holds the repr of a list of row indices; parse it
        # safely with ast.literal_eval rather than eval
        with open(args.rowlist_file, 'rU') as f:
            rowlist = ast.literal_eval(f.readline())
    else:
        rowlist = range(n)

    if args.test == "e":
        # cross-validate the elastic net on the original data...
        beta_tuple, all_res_df, use_df = ct.enet_granger_causality_row_cv(
            geneTS, geneTS, rowlist, **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(beta_tuple, outfile)
        all_res_df.to_csv(args.output_all_name, sep="\t", index=False)
        use_df.to_csv(args.output_use_name, sep="\t", index=False)

        # ...then refit on the randomized data with the same per-row
        # (alpha, lambda.min) so the null betas are comparable
        param_df = use_df[["alpha", "lambda.min", "Row"]]
        rand_beta_tuple, rand_all_res_df, rand_use_df = ct.enet_granger_causality_row_load(
            geneTSr, geneTS, rowlist, param_df, **args_dict)
        with open(args.output_rand_name, 'wb') as outfile:
            pickle.dump(rand_beta_tuple, outfile)
        rand_all_res_df.to_csv(args.output_rand_all_name, sep="\t", index=False)
        rand_use_df.to_csv(args.output_rand_use_name, sep="\t", index=False)

        print "Output written to ", args.output_name
        print "All results written to ", args.output_all_name
        print "Used params written to ", args.output_use_name
        print "Rand output written to ", args.output_rand_name
        print "All rand results written to ", args.output_rand_all_name
        print "Used rand params written to ", args.output_rand_use_name
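# A minimal sketch of the argparse wiring run() above expects. The short flags
# are taken from the command strings built by the job-generation script below;
# the dest names match the attributes used in run(), but the parser itself is
# an assumption, not the project's actual entry point.
import argparse


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", dest="data_file", required=True)
    parser.add_argument("-rd", dest="rand_data_file", required=True)
    parser.add_argument("-a", dest="args_file", required=True)
    parser.add_argument("-t", dest="test", required=True)
    parser.add_argument("-rl", dest="rowlist_file", default=None)
    parser.add_argument("-o", dest="output_name", required=True)
    parser.add_argument("-or", dest="output_rand_name", required=True)
    parser.add_argument("-oa", dest="output_all_name", default=None)
    parser.add_argument("-ou", dest="output_use_name", default=None)
    parser.add_argument("-ora", dest="output_rand_all_name", default=None)
    parser.add_argument("-oru", dest="output_rand_use_name", default=None)
    return parser


if __name__ == "__main__":
    run(get_parser().parse_args())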
# run_causal.py -- run a pairwise causality test over a (sub)set of gene pairs.
import pickle

import numpy as np


def run(args):
    # load the data
    df = gtm.load_file_and_avg(args.data_file)
    genes = df['gene'].values
    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    # was a bare load_kwargs_file(...); qualified with ct. as elsewhere
    args_dict = ct.load_kwargs_file(argsfile=args.args_file)

    if args.pairlist_file is None:
        pairlist = None
    else:
        pairlist = np.load(open(args.pairlist_file))

    print args_dict

    if args.test == 'g':
        output = ct.pairwise_granger_causality_all(geneTS, pairlist, **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(output, outfile)
        print "Output written to ", args.output_name
# Sweep Granger-causality model orders and keep the pairs that stay
# significant at every order.
import pickle
import time


def main():
    # note: reads the module-level `args`, expected to be parsed before main()
    tstart = time.time()

    input_file = args.input_file
    out_file_prefix = args.out_file_prefix
    start_index = args.start_index
    end_index = args.end_index

    df = gtm.load_file_and_avg(input_file)
    genes = df['gene'][start_index:end_index].values
    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    cause_type = args.cause_type
    if cause_type == 'g':
        model_orders = range(args.model_order_min, args.model_order_max + 1)
        threshold = args.p_threshold

        p_matr_list = []
        sig_matr_list = []
        for model_order in model_orders:
            t_gc = time.time()
            p_matr = pairwise_granger_causality_all(
                geneTS, model_order=model_order,
                use_processes=args.use_processes, procnum=args.procnum)
            print "Time for granger causality", time.time() - t_gc

            sig_matr = p_matr < threshold
            p_matr_list.append(p_matr)
            sig_matr_list.append(sig_matr)

        # compare the significant matrices
        all_sig_matr, all_sig_num, not_sig_num = gtm.compare_sig_matr(
            sig_matr_list=sig_matr_list)
        print "Total number of significant pairs ", all_sig_num + not_sig_num
        print "Pairs significant across all matrices ", all_sig_num, \
            all_sig_num * 1.0 / (all_sig_num + not_sig_num)

        # save the output p matrices
        out_file_name = out_file_prefix + "_GC.p"
        pickle.dump([model_orders, p_matr_list, sig_matr_list,
                     (all_sig_matr, all_sig_num, not_sig_num)],
                    open(out_file_name, "wb"))
        print "Results written to", out_file_name

    print "Total time used ", time.time() - tstart
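# A guess at what gtm.compare_sig_matr computes, based on how its outputs are
# used above: the elementwise AND of the significance matrices, the number of
# pairs significant at every model order, and the number significant at some
# order but not all. This is an assumption, not the project's implementation.
import numpy as np


def compare_sig_matr_sketch(sig_matr_list):
    stacked = np.array(sig_matr_list)       # (orders, n, n) boolean stack
    all_sig_matr = stacked.all(axis=0)      # significant at every order
    any_sig_matr = stacked.any(axis=0)      # significant at some order
    all_sig_num = int(all_sig_matr.sum())
    not_sig_num = int(any_sig_matr.sum()) - all_sig_num
    return all_sig_matr, all_sig_num, not_sig_num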
# Generate per-batch cluster job scripts for run_causal_rand_row.py, plus the
# bookkeeping lists read by the integration and FDR-control steps.
# pj (job partitioning) is a project module.
import math
import os

import pandas as pd


def run(args):
    data_file = args.data_file.split('/')[-1]
    rand_data_file = args.rand_data_file.split('/')[-1]

    df = gtm.load_file_and_avg(data_file)
    genes = df['gene'].values
    n = len(genes)

    script_filenames = []
    output_filenames = []
    output_rand_filenames = []
    if args.test == "e":
        all_res_filenames = []
        use_filenames = []
        all_res_rand_filenames = []
        use_rand_filenames = []
    else:
        all_res_filenames = None
        use_filenames = None
        all_res_rand_filenames = None
        use_rand_filenames = None

    partition_rows = pj.partition_inputs(range(n), args.job_num)

    for i, partition_row in enumerate(partition_rows):
        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)

        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)

        output_rand_filename = args.output_name + "-randomized-" + str(i) + ".p"
        output_rand_filenames.append(output_rand_filename)

        # prepare the job associated with this batch of rows
        row_filename = args.output_name + "-row-" + str(i) + ".txt"
        command_string = "python run_causal_rand_row.py -d " + data_file + " -rd " + rand_data_file + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -rl " + \
                         str(row_filename) + " -o " + output_filename + " -or " + output_rand_filename

        if args.test == "e":
            all_res_filename = args.output_name + "-all-params-" + str(i) + ".txt"
            all_res_filenames.append(all_res_filename)
            use_filename = args.output_name + "-used-params-" + str(i) + ".txt"
            use_filenames.append(use_filename)
            all_res_rand_filename = args.output_name + "-all-params-randomized-" + str(i) + ".txt"
            all_res_rand_filenames.append(all_res_rand_filename)
            use_rand_filename = args.output_name + "-used-params-randomized-" + str(i) + ".txt"
            use_rand_filenames.append(use_rand_filename)
            command_string += " -oa " + all_res_filename + " -ou " + use_filename + \
                              " -ora " + all_res_rand_filename + " -oru " + use_rand_filename

        with open(row_filename, 'w') as rowfile:
            rowfile.write(str(partition_row) + "\n")
        print "Partition row written to ", row_filename

        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)
        print "Script written to ", script_filename

    # canonical names for the integrated (concatenated) outputs
    integrated_name_dict = {}
    integrated_name_dict["Output"] = args.output_name + ".p"
    integrated_name_dict["Rand-Output"] = args.output_name + "-randomized.p"
    integrated_name_dict["All-Params"] = args.output_name + "-all-params.txt"
    integrated_name_dict["Use-Params"] = args.output_name + "-use-params.txt"
    integrated_name_dict["All-Rand-Params"] = args.output_name + "-all-params-randomized.txt"
    integrated_name_dict["Use-Rand-Params"] = args.output_name + "-use-params-randomized.txt"

    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
    print "Script list written to script_list.txt"

    # list of matrices to integrate
    output_matr_dict = {"Output": output_filenames,
                        "Rand-Output": output_rand_filenames}
    output_matr_df = pd.DataFrame(output_matr_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print "Output matrices written to output_matr_list.txt"

    int_matr_dict = dict([(x, integrated_name_dict[x])
                          for x in ["Output", "Rand-Output"]])
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print "Integrated matrices written to int_matr_list.txt"

    if args.test == "e":
        # lists of dataframes (param files) to integrate; these exist only
        # for the elastic-net test
        output_df_dict = {}
        output_df_lists = [all_res_filenames, use_filenames,
                           all_res_rand_filenames, use_rand_filenames]
        output_df_names = ["All-Params", "Use-Params",
                           "All-Rand-Params", "Use-Rand-Params"]
        for out_list, out_name in zip(output_df_lists, output_df_names):
            if out_list is not None:
                output_df_dict[out_name] = out_list
        output_df_df = pd.DataFrame(output_df_dict)
        output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
        print "Output dfs written to output_df_list.txt"

        int_df_dict = dict([(x, integrated_name_dict[x]) for x in
                            set(output_df_names).intersection(output_df_dict.keys())])
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
        print "Integrated dfs written to int_df_list.txt"

    with open("integrate_outputs.sh", 'w') as ifile:
        if args.test == "e":
            # here "-a" is the axis to integrate along
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1 && " +
                        "python integrate_outputs_rand_row.py -i output_df_list.txt -t d -o int_df_list.txt\n")
        else:
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1\n")
    print "Integration script written to integrate_outputs.sh"
    os.chmod("integrate_outputs.sh", 0777)

    with open("fdr_control.sh", 'w') as ffile:
        fdr_string = "python fdr_control.py -m " + integrated_name_dict["Output"] + " -rm " + integrated_name_dict["Rand-Output"] + \
                     " -d " + data_file + " -rd " + rand_data_file + " -n " + args.output_name + " -f \"" + str(args.fdr) + "\" " + \
                     " -c " + str(args.coef_num) + " -mn " + str(1) + " -pp " + args.output_name + "-all-beta-histogram "
        # run FDR control both stratified by effect ("e") and unstratified ("n")
        ffile.write(fdr_string + " -sb e && " + fdr_string + " -sb n\n")
    print "FDR control script written to fdr_control.sh"
    os.chmod("fdr_control.sh", 0777)

    if args.parallel_num > 0:
        print "Parallel Number (# processes per job): " + str(args.parallel_num)
        script_groups = pj.partition_inputs(
            script_filenames,
            number=int(math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))
        print "Number of script groups ", len(script_groups)

        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            appended_script_filenames = ["./" + script_filename
                                         for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print "Parallel script list written to parallel_script_list.txt"
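# pj.partition_inputs is called above both as pj.partition_inputs(inputs, k)
# and with number= as a keyword. A minimal sketch of plausible semantics
# (round-robin assignment into `number` groups); the real project helper may
# partition differently.
def partition_inputs_sketch(inputs, number):
    groups = [[] for _ in range(number)]
    for i, x in enumerate(inputs):
        groups[i % number].append(x)
    return groups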
# Generate per-batch cluster job scripts for run_causal.py over gene pairs.
import math
import os


def run(args):
    df = gtm.load_file_and_avg(args.data_file)
    genes = df['gene'].values
    n = len(genes)

    script_filenames = []
    output_filenames = []

    partition_pairs = lazy_partition_pairs(n, n, args.job_num)

    for i, partition_pair in enumerate(partition_pairs):
        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)

        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)

        # prepare the job associated with this batch of pairs
        pair_filename = args.output_name + "-pair-" + str(i) + ".txt"
        command_string = "python run_causal.py -d " + args.data_file.split('/')[-1] + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + \
                         " -pp " + str(pair_filename) + " -o " + output_filename
        if args.test == "gp":
            command_string += " -d2 " + args.data_file2.split('/')[-1]

        with open(pair_filename, 'w') as pairfile:
            pairfile.write(str(partition_pair) + "\n")
        print "Partition pair written to ", pair_filename

        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)
        print "Script written to ", script_filename

    # submit the jobs soon
    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
    print "Script list written to script_list.txt"

    with open("output_list.txt", 'w') as outputfile:
        for output_filename in output_filenames:
            outputfile.write(output_filename + "\n")
    print "Output list written to output_list.txt"

    with open("integrate_outputs.sh", 'w') as ifile:
        integrated_filename = args.output_name + ".p"
        ifile.write("python integrate_outputs.py -i output_list.txt -o " +
                    integrated_filename + " -n " + str(n) + "\n")
    print "Integration script written to integrate_outputs.sh"
    os.chmod("integrate_outputs.sh", 0777)

    if args.parallel_num > 0:
        print "Parallel Number (# processes per job): " + str(args.parallel_num)
        # round up so the group count is never zero; the original used plain
        # integer division, which yields 0 groups when parallel_num exceeds
        # the number of scripts
        script_groups = partition_inputs(
            script_filenames,
            number=int(math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))
        print "Number of script groups ", len(script_groups)

        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            appended_script_filenames = ["./" + script_filename
                                         for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print "Parallel script list written to parallel_script_list.txt"
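# A guess at lazy_partition_pairs(n, m, job_num): split the n x m grid of
# (cause, effect) index pairs into job_num contiguous chunks. An assumption
# for illustration only; the project's helper may be lazier or partition
# differently.
import math


def lazy_partition_pairs_sketch(n, m, job_num):
    pairs = [(i, j) for i in range(n) for j in range(m)]
    size = int(math.ceil(len(pairs) * 1.0 / job_num))
    return [pairs[k:k + size] for k in range(0, len(pairs), size)]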
# fdr_control.py -- threshold the causal coefficient matrix against its
# randomized (shuffled-data) counterpart at a target FDR.
# nh (network helpers) is a project module.
import pickle

import numpy as np


def run(args):
    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

    # pick out the coefficient slice of interest (mode 'rb', not 'rB')
    matr = pickle.load(open(args.original_matrix, 'rb'))[:, :, args.coef_num - 1]
    rand_matr = pickle.load(open(args.randomized_matrix, 'rb'))[:, :, args.coef_num - 1]

    if args.stratify_by not in {"e", "n"}:
        raise ValueError("stratify_by must be either 'e' for effect or 'n' for none")
    stratify_by = "effect" if args.stratify_by == "e" else "none"

    print
    print "Beginning FDR control, stratifying the matrix by ", stratify_by

    genes = data["gene"]
    rand_genes = rand_data["gene"]
    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")

    print "Original matrix for ", args.name, "saved to", args.name + "-unshuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=matr, filename=args.name + "-unshuffled-matrix.txt", genes=genes)
    print "Randomized matrix for ", args.name, "saved to", args.name + "-shuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=rand_matr, filename=args.name + "-shuffled-matrix.txt", genes=rand_genes)

    if args.plot_prefix is not None:
        plot_betas(matr.flatten(), rand_matr.flatten(), filename=args.plot_prefix)
        plot_betas(matr.flatten(), rand_matr.flatten(),
                   filename=args.plot_prefix + "_zoom-in-95", zoom_in_percentile=95)

    if args.cap_by is not None:
        print "First capping original and randomized matrix"
        matr = cap_matr(matr, args.cap_by, name="Original")
        rand_matr = cap_matr(rand_matr, args.cap_by, name="Randomized")

    print "Using original"
    print "Trying to have an FDR of ", args.fdr
    print args.name

    functions = [get_abs_thresh, get_pos_neg_thresh]
    types = ["abs-thresh", "pos-neg-thresh"]
    # whether to also write an absolute-valued copy of the thresholded matrix
    absoluted = [True, True]

    for function, t, a in zip(functions, types, absoluted):
        print
        print "*******************"
        print t
        print "*******************"

        print "making matrix"
        out_prefix = args.name + "-unshuffled-" + t + "-FDR-" + str(args.fdr) + \
                     "-stratby-" + stratify_by
        thresh_matr, threshes = function(matr, rand_matr, args.fdr, stratify_by=stratify_by)
        matr_df = gtm.save_gene_matrix(out_prefix + "-matrix.txt", thresh_matr, genes)
        pickle.dump(threshes, open(out_prefix + "-threshes.p", 'wb'))
        print "Matrix written to ", out_prefix + "-matrix.txt"
        print "Threshes written to ", out_prefix + "-threshes.p"

        if args.make_network:
            print "making network"
            net_df = nh.matr_to_net(matr_df, args.name + "-sb-" + args.stratify_by, make_pair=False)
            net_df.to_csv(out_prefix + "-network.txt", sep="\t", index=False)
            print "Network written to ", out_prefix + "-network.txt"

        if a:  # was `if absoluted:`, which is always true for a non-empty list
            print "Making absoluted matrix "
            abs_matr = np.absolute(thresh_matr)
            abs_prefix = args.name + "-unshuffled-" + t + "-absoluted-FDR-" + \
                         str(args.fdr) + "-stratby-" + stratify_by
            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr, genes)

            if args.make_network:
                print "Making absoluted network"
                abs_net_df = nh.matr_to_net(abs_df, args.name + "-sb-" + args.stratify_by, make_pair=False)
                abs_net_df.to_csv(abs_prefix + "-network.txt", sep="\t", index=False)
                print "Network written to ", abs_prefix + "-network.txt"

    print "FINISHED"
    print "#################################################"
    print
# A variant of the FDR-control driver above: it also writes one-sided
# positive and negative thresholds and README files for each output.
def run(args):
    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

    matr = pickle.load(open(args.original_matrix, 'rb'))[:, :, args.coef_num - 1]
    rand_matr = pickle.load(open(args.randomized_matrix, 'rb'))[:, :, args.coef_num - 1]

    if args.stratify_by not in {"e", "n"}:
        raise ValueError("stratify_by must be either 'e' for effect or 'n' for none")
    stratify_by = "effect" if args.stratify_by == "e" else "none"

    genes = data["gene"]
    rand_genes = rand_data["gene"]
    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")

    print "Original matrix for ", args.name, "saved to", args.name + "-unshuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=matr, filename=args.name + "-unshuffled-matrix.txt", genes=genes)
    print "Randomized matrix for ", args.name, "saved to", args.name + "-shuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=rand_matr, filename=args.name + "-shuffled-matrix.txt", genes=rand_genes)

    if args.plot_prefix is not None:
        plot_betas(matr.flatten(), rand_matr.flatten(), filename=args.plot_prefix)
        plot_betas(matr.flatten(), rand_matr.flatten(),
                   filename=args.plot_prefix + "_zoom-in-95", zoom_in_percentile=95)

    print "Using original"
    print "Trying to have an FDR of ", args.fdr
    print args.name

    functions = [get_abs_thresh, get_pos_thresh, get_neg_thresh, get_pos_neg_thresh]
    types = ["abs-thresh", "pos-thresh", "neg-thresh", "pos-neg-thresh"]
    # whether to also write an absolute-valued copy of the thresholded matrix
    absoluted = [True, False, False, True]

    for function, t, a in zip(functions, types, absoluted):
        out_prefix = args.name + "-unshuffled-" + t + "-FDR-" + str(args.fdr) + \
                     "-stratby-" + stratify_by
        thresh_matr, threshes = function(matr, rand_matr, args.fdr, stratify_by=stratify_by)
        matr_df = gtm.save_gene_matrix(out_prefix + "-matrix.txt", thresh_matr, genes)
        pickle.dump(threshes, open(out_prefix + "-threshes.p", 'wb'))
        print "Matrix written to ", out_prefix + "-matrix.txt"
        print "Threshes written to ", out_prefix + "-threshes.p"
        write_readme(thresh_matr, out_prefix, args.fdr,
                     out_prefix + '-README.txt', out_prefix + "-matrix")

        if args.make_network:
            net_df = nh.matr_to_net(matr_df, args.name, make_pair=False)
            net_df.to_csv(out_prefix + "-network.txt", sep="\t", index=False)
            print "Network written to ", out_prefix + "-network.txt"

        if a:  # was `if absoluted:`, which is always true for a non-empty list
            abs_matr = np.absolute(thresh_matr)
            abs_prefix = args.name + "-unshuffled-" + t + "-absoluted-FDR-" + \
                         str(args.fdr) + "-stratby-" + stratify_by
            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr, genes)
            write_readme(abs_matr, abs_prefix, args.fdr,
                         abs_prefix + '-README.txt', abs_prefix + "-matrix")

            if args.make_network:
                abs_net_df = nh.matr_to_net(abs_df, args.name, make_pair=False)
                abs_net_df.to_csv(abs_prefix + "-network.txt", sep="\t", index=False)
                print "Network written to ", abs_prefix + "-network.txt"
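# A minimal sketch of the empirical FDR thresholding that get_abs_thresh
# appears to perform: choose the smallest absolute-value cutoff at which the
# ratio of surviving randomized (null) coefficients to surviving original
# coefficients falls to the target FDR. This is assumed for illustration
# (unstratified case only); the project's function also supports stratifying
# by effect gene and returns per-stratum thresholds.
def get_abs_thresh_sketch(matr, rand_matr, fdr):
    candidates = np.sort(np.unique(np.absolute(matr)))  # ascending cutoffs
    thresh = candidates[-1]
    for c in candidates:
        real_hits = (np.absolute(matr) >= c).sum()
        null_hits = (np.absolute(rand_matr) >= c).sum()
        # estimated FDR at cutoff c: null survivors / real survivors
        if real_hits > 0 and null_hits * 1.0 / real_hits <= fdr:
            thresh = c
            break
    # zero out entries below the chosen cutoff
    thresh_matr = np.where(np.absolute(matr) >= thresh, matr, 0)
    return thresh_matr, thresh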