Example #1
def run(args):
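    # Load the real and randomized expression matrices, then (for the
    # elastic-net test) fit Granger models on both so the randomized fits
    # can serve as a null distribution for later FDR control.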

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)


    dfr = gtm.load_file_and_avg(args.rand_data_file)

    genesr = dfr['gene'].values

    found_genesr, geneTSr = gtm.get_gene_TS(dfr, genesr)

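    # n = number of genes (rows of the time-series matrix)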
    n = geneTSr.shape[0]

    args_dict = ct.load_kwargs_file(argsfile=args.args_file)

    print args_dict

    if args.rowlist_file is not None:
        with open(args.rowlist_file, 'rU') as f:
            # the row file holds a Python list literal like "[0, 1, 2]";
            # ast.literal_eval (needs "import ast") parses it without eval's risks
            rowlist = ast.literal_eval(f.readline())
    else:
        rowlist = range(n)


    if args.test == "e":
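        # cross-validate the elastic-net Granger fit over the requested rows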
        beta_tuple, all_res_df, use_df = ct.enet_granger_causality_row_cv(geneTS, geneTS, rowlist, **args_dict)
        with open(args.output_name, 'w') as outfile:
            pickle.dump(beta_tuple, outfile)
        all_res_df.to_csv(args.output_all_name, sep="\t", index=False)
        use_df.to_csv(args.output_use_name, sep="\t", index=False)


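        # reuse the CV-selected hyperparameters when re-fitting on the randomized data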
        param_df = use_df[["alpha", "lambda.min", "Row"]]

        rand_beta_tuple, rand_all_res_df, rand_use_df = ct.enet_granger_causality_row_load(geneTSr, geneTS, rowlist, param_df, **args_dict)

        with open(args.output_rand_name, 'w') as outfile:
            pickle.dump(rand_beta_tuple, outfile)

        rand_all_res_df.to_csv(args.output_rand_all_name, sep="\t", index=False)
        rand_use_df.to_csv(args.output_rand_use_name, sep="\t", index=False)

        print "HIIIIIII"
        print "Output written to ", args.output_name
        print "All results written to ", args.output_all_name
        print "Used params written to ", args.output_use_name

        print "Rand output written to ", args.output_rand_name
        print "All rand results written to ", args.output_rand_all_name
        print "Used rand params written to ", args.output_rand_use_name
Example #2
def run(args):

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    dfr = gtm.load_file_and_avg(args.rand_data_file)

    genesr = dfr['gene'].values

    found_genesr, geneTSr = gtm.get_gene_TS(dfr, genesr)

    n = geneTSr.shape[0]

    args_dict = ct.load_kwargs_file(argsfile=args.args_file)

    print args_dict

    if args.rowlist_file is not None:
        with open(args.rowlist_file, 'rU') as f:
            # the row file holds a Python list literal like "[0, 1, 2]";
            # ast.literal_eval (needs "import ast") parses it without eval's risks
            rowlist = ast.literal_eval(f.readline())
    else:
        rowlist = range(n)

    if args.test == "e":
        beta_tuple, all_res_df, use_df = ct.enet_granger_causality_row_cv(
            geneTS, geneTS, rowlist, **args_dict)
        with open(args.output_name, 'w') as outfile:
            pickle.dump(beta_tuple, outfile)
        all_res_df.to_csv(args.output_all_name, sep="\t", index=False)
        use_df.to_csv(args.output_use_name, sep="\t", index=False)

        param_df = use_df[["alpha", "lambda.min", "Row"]]

        rand_beta_tuple, rand_all_res_df, rand_use_df = ct.enet_granger_causality_row_load(
            geneTSr, geneTS, rowlist, param_df, **args_dict)

        with open(args.output_rand_name, 'w') as outfile:
            pickle.dump(rand_beta_tuple, outfile)

        rand_all_res_df.to_csv(args.output_rand_all_name,
                               sep="\t",
                               index=False)
        rand_use_df.to_csv(args.output_rand_use_name, sep="\t", index=False)

        print "HIIIIIII"
        print "Output written to ", args.output_name
        print "All results written to ", args.output_all_name
        print "Used params written to ", args.output_use_name

        print "Rand output written to ", args.output_rand_name
        print "All rand results written to ", args.output_rand_all_name
        print "Used rand params written to ", args.output_rand_use_name
Example #3
def run(args):

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    args_dict = load_kwargs_file(argsfile=args.args_file)

    if args.pairlist_file is None:
        pairlist = None
    else:
        pairlist = np.load(args.pairlist_file)

    print args_dict

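    # 'g' runs pairwise Granger causality over all gene pairs (or just pairlist)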
    if args.test == 'g':
        output = ct.pairwise_granger_causality_all(geneTS, pairlist, **args_dict)
        with open(args.output_name, 'w') as outfile:
            pickle.dump(output, outfile)

    print "HELLOOOOOOOO"
    print "Output written to ", args.output_name
Example #4
def run(args):

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    args_dict = load_kwargs_file(argsfile=args.args_file)

    if args.pairlist_file is None:
        pairlist = None
    else:
        pairlist = np.load(args.pairlist_file)

    print args_dict

    if args.test == 'g':
        output = ct.pairwise_granger_causality_all(geneTS, pairlist,
                                                   **args_dict)
        with open(args.output_name, 'w') as outfile:
            pickle.dump(output, outfile)

    print "HELLOOOOOOOO"
    print "Output written to ", args.output_name
Example #5
def main():
    tstart = time.time()

    input_file = args.input_file
    out_file_prefix = args.out_file_prefix

    start_index = args.start_index
    end_index = args.end_index

    df = gtm.load_file_and_avg(input_file)

    genes = df['gene'][start_index:end_index].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    cause_type = args.cause_type

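    # sweep the model orders and track which pairs stay significant across all of them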
    if cause_type == 'g':
        model_orders = range(args.model_order_min, args.model_order_max + 1)

        threshold = args.p_threshold

        p_matr_list = []
        sig_matr_list = []

        for model_order in model_orders:
            t_gc = time.time()
            p_matr = pairwise_granger_causality_all(
                geneTS,
                model_order=model_order,
                use_processes=args.use_processes,
                procnum=args.procnum)
            print "Time for granger causality", time.time() - t_gc

            sig_matr = p_matr < threshold

            p_matr_list.append(p_matr)
            sig_matr_list.append(sig_matr)

        all_sig_matr, all_sig_num, not_sig_num = gtm.compare_sig_matr(
            sig_matr_list=sig_matr_list)

        print "Total number of significant pairs ", all_sig_num + not_sig_num
        print "Pairs significant across all matrices ", all_sig_num, all_sig_num * 1.0 / (
            all_sig_num + not_sig_num)

        out_file_name = out_file_prefix + "_GC.p"
        with open(out_file_name, "w") as out_file:
            pickle.dump([
                model_orders, p_matr_list, sig_matr_list,
                (all_sig_matr, all_sig_num, not_sig_num)
            ], out_file)

        print "Results written  to", out_file_name

    # compare the significant matrices

    # save the output p matrices

    print "Total time used ", time.time() - tstart
Example #6
def main():
    tstart = time.time()


    input_file = args.input_file
    out_file_prefix = args.out_file_prefix


    start_index = args.start_index
    end_index = args.end_index


    df = gtm.load_file_and_avg(input_file)

    genes = df['gene'][start_index:end_index].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    cause_type = args.cause_type

    if cause_type == 'g':
        model_orders = range(args.model_order_min, args.model_order_max + 1)

        threshold = args.p_threshold

        p_matr_list = []
        sig_matr_list = []

        for model_order in model_orders:
            t_gc = time.time()
            p_matr = pairwise_granger_causality_all(geneTS, model_order=model_order, use_processes=args.use_processes, procnum=args.procnum)
            print "Time for granger causality", time.time() - t_gc


            sig_matr = p_matr < threshold

            p_matr_list.append(p_matr)
            sig_matr_list.append(sig_matr)

        all_sig_matr, all_sig_num, not_sig_num = gtm.compare_sig_matr(sig_matr_list=sig_matr_list)

        print "Total number of significant pairs ", all_sig_num + not_sig_num
        print "Pairs significant across all matrices ", all_sig_num, all_sig_num * 1.0 / (all_sig_num + not_sig_num)


        out_file_name = out_file_prefix + "_GC.p"
        with open(out_file_name, "w") as out_file:
            pickle.dump([model_orders, p_matr_list, sig_matr_list, (all_sig_matr, all_sig_num, not_sig_num)], out_file)

        print "Results written  to", out_file_name

    # compare the significant matrices

    # save the output p matrices

    print "Total time used ", time.time() - tstart
Example #7
def run(args):


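    # keep only the basenames; the generated job scripts are assumed to run
    # from the directory that holds the data files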
    data_file = args.data_file.split('/')[-1]
    rand_data_file = args.rand_data_file.split('/')[-1]


    df = gtm.load_file_and_avg(data_file)

    genes = df['gene'].values

    n = len(genes)

    script_filenames = []
    output_filenames = []
    output_rand_filenames = []

    if args.test == "e":
        all_res_filenames = []
        use_filenames = []
        all_res_rand_filenames = []
        use_rand_filenames = []
    else:
        all_res_filenames = None
        use_filenames = None
        all_res_rand_filenames = None
        use_rand_filenames = None

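    # split the gene rows into args.job_num chunks, one cluster job per chunk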
    partition_rows = pj.partition_inputs(range(n), args.job_num)


    for i, partition_row in enumerate(partition_rows):

        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)


        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)

        output_rand_filename = args.output_name + "-randomized-" + str(i) + ".p"
        output_rand_filenames.append(output_rand_filename)

        # prepare the job associated with this

        row_filename = args.output_name + "-row-" + str(i) + ".txt"

        command_string = "python run_causal_rand_row.py -d " + data_file +  " -rd " + rand_data_file + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -rl " + \
                         str(row_filename) + " -o " + output_filename + " -or " + output_rand_filename

        if args.test == "e":
            all_res_filename = args.output_name + "-all-params-" + str(i) + ".txt"
            all_res_filenames.append(all_res_filename)

            use_filename = args.output_name + "-used-params-" + str(i) + ".txt"
            use_filenames.append(use_filename)

            all_res_rand_filename = args.output_name + "-all-params-randomized-" + str(i) + ".txt"
            all_res_rand_filenames.append(all_res_rand_filename)

            use_rand_filename = args.output_name + "-used-params-randomized-" + str(i) + ".txt"
            use_rand_filenames.append(use_rand_filename)

            command_string += " -oa " + all_res_filename + " -ou " + use_filename + " -ora " + all_res_rand_filename + " -oru " + use_rand_filename


        with open(row_filename, 'w') as rowfile:
            rowfile.write(str(partition_row) + "\n")

        print "Partition row written to ", row_filename


        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)

        print "Script written to ", script_filename

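    # filenames for the merged outputs that integrate_outputs.sh will produce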
    integrated_name_dict = {}
    integrated_name_dict["Output"] = args.output_name + ".p"
    integrated_name_dict["Rand-Output"] = args.output_name + "-randomized.p"
    integrated_name_dict["All-Params"] = args.output_name + "-all-params.txt"
    integrated_name_dict["Use-Params"] = args.output_name + "-use-params.txt"
    integrated_name_dict["All-Rand-Params"] = args.output_name + "-all-params-randomized.txt"
    integrated_name_dict["Use-Rand-Params"] = args.output_name + "-use-params-randomized.txt"


    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
        print "Script list written to script_list.txt"


    # list of matrices to integrate
    output_matr_dict = {"Output": output_filenames, "Rand-Output": output_rand_filenames}
    output_matr_df = pd.DataFrame(output_matr_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print "Output matrices written to output_matr_list.txt"

    int_matr_dict = dict([(x, integrated_name_dict[x]) for x in ["Output", "Rand-Output"]])
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print "integrated matrices written to int_matr_list.txt"


    if args.test == "e":
        # lists of dataframes (param files) to integrate;
        # they exist only when the elastic-net test ("e") was run
        output_df_dict = {}
        output_df_lists = [all_res_filenames, use_filenames, all_res_rand_filenames, use_rand_filenames]
        output_df_names = ["All-Params", "Use-Params", "All-Rand-Params", "Use-Rand-Params"]
        for out_list, out_name in zip(output_df_lists, output_df_names):
            if out_list is not None:
                output_df_dict[out_name] = out_list

        output_df_df = pd.DataFrame(output_df_dict)
        output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
        print "output dfs written to output_df_list.txt"


        int_df_dict = dict([(x, integrated_name_dict[x]) for x in set(output_df_names).intersection(output_df_dict.keys())])
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
        print "Integrated dfs written to int_df_list.txt"


    with open("integrate_outputs.sh", 'w') as ifile:

        if args.test == "e":
            # here , "a" means the axis to integrate by
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1 && " + \
                        "python integrate_outputs_rand_row.py -i output_df_list.txt -t d -o int_df_list.txt\n")

        else:
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1\n")

        print "Integration script written to integrate_outputs.sh"
        os.chmod("integrate_outputs.sh", 0777)

    with open("fdr_control.sh", 'w') as ffile:
        fdr_string = "python fdr_control.py -m " + integrated_name_dict["Output"] + " -rm " + integrated_name_dict["Rand-Output"] + \
                    " -d " + data_file + " -rd " + rand_data_file + " -n " + args.output_name + " -f \"" + str(args.fdr) + "\" " + \
                    " -c " + str(args.coef_num) + " -mn " + str(1) + " -pp " + args.output_name + "-all-beta-histogram "
        ffile.write(fdr_string + " -sb e && " + fdr_string + " -sb n\n")
        print "FDR CONTROL script written to fdr_control.sh"
        os.chmod("fdr_control.sh", 0777)


    if args.parallel_num > 0:
        print "Parallel Number (# processes per job): " + str(args.parallel_num)

        script_groups = pj.partition_inputs(script_filenames, number=int(math.ceil(len(script_filenames) * 1.0/args.parallel_num)))

        print "Number of script groups ", len(script_groups)


        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print "Parallel script list written to parallel_script_list.txt"
Example #8
def run(args):

    data_file = args.data_file.split('/')[-1]
    rand_data_file = args.rand_data_file.split('/')[-1]

    df = gtm.load_file_and_avg(data_file)

    genes = df['gene'].values

    n = len(genes)

    script_filenames = []
    output_filenames = []
    output_rand_filenames = []

    if args.test == "e":
        all_res_filenames = []
        use_filenames = []
        all_res_rand_filenames = []
        use_rand_filenames = []
    else:
        all_res_filenames = None
        use_filenames = None
        all_res_rand_filenames = None
        use_rand_filenames = None

    partition_rows = pj.partition_inputs(range(n), args.job_num)

    for i, partition_row in enumerate(partition_rows):

        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)

        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)

        output_rand_filename = args.output_name + "-randomized-" + str(
            i) + ".p"
        output_rand_filenames.append(output_rand_filename)

        # prepare the job associated with this

        row_filename = args.output_name + "-row-" + str(i) + ".txt"

        command_string = "python run_causal_rand_row.py -d " + data_file +  " -rd " + rand_data_file + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -rl " + \
                         str(row_filename) + " -o " + output_filename + " -or " + output_rand_filename

        if args.test == "e":
            all_res_filename = args.output_name + "-all-params-" + str(
                i) + ".txt"
            all_res_filenames.append(all_res_filename)

            use_filename = args.output_name + "-used-params-" + str(i) + ".txt"
            use_filenames.append(use_filename)

            all_res_rand_filename = args.output_name + "-all-params-randomized-" + str(
                i) + ".txt"
            all_res_rand_filenames.append(all_res_rand_filename)

            use_rand_filename = args.output_name + "-used-params-randomized-" + str(
                i) + ".txt"
            use_rand_filenames.append(use_rand_filename)

            command_string += " -oa " + all_res_filename + " -ou " + use_filename + " -ora " + all_res_rand_filename + " -oru " + use_rand_filename

        with open(row_filename, 'w') as rowfile:
            rowfile.write(str(partition_row) + "\n")

        print "Partition row written to ", row_filename

        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)

        print "Script written to ", script_filename

    integrated_name_dict = {}
    integrated_name_dict["Output"] = args.output_name + ".p"
    integrated_name_dict["Rand-Output"] = args.output_name + "-randomized.p"
    integrated_name_dict["All-Params"] = args.output_name + "-all-params.txt"
    integrated_name_dict["Use-Params"] = args.output_name + "-use-params.txt"
    integrated_name_dict[
        "All-Rand-Params"] = args.output_name + "-all-params-randomized.txt"
    integrated_name_dict[
        "Use-Rand-Params"] = args.output_name + "-use-params-randomized.txt"

    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
        print "Script list written to script_list.txt"

    # list of matrices to integrate
    output_matr_dict = {
        "Output": output_filenames,
        "Rand-Output": output_rand_filenames
    }
    output_matr_df = pd.DataFrame(output_matr_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print "Output matrices written to output_matr_list.txt"

    int_matr_dict = dict([(x, integrated_name_dict[x])
                          for x in ["Output", "Rand-Output"]])
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print "integrated matrices written to int_matr_list.txt"

    if args.test == "e":
        # lists of dataframes (param files) to integrate;
        # they exist only when the elastic-net test ("e") was run
        output_df_dict = {}
        output_df_lists = [
            all_res_filenames, use_filenames, all_res_rand_filenames,
            use_rand_filenames
        ]
        output_df_names = [
            "All-Params", "Use-Params", "All-Rand-Params", "Use-Rand-Params"
        ]
        for out_list, out_name in zip(output_df_lists, output_df_names):
            if out_list is not None:
                output_df_dict[out_name] = out_list

        output_df_df = pd.DataFrame(output_df_dict)
        output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
        print "output dfs written to output_df_list.txt"

        int_df_dict = dict([
            (x, integrated_name_dict[x])
            for x in set(output_df_names).intersection(output_df_dict.keys())
        ])
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
        print "Integrated dfs written to int_df_list.txt"

    with open("integrate_outputs.sh", 'w') as ifile:

        if args.test == "e":
            # here , "a" means the axis to integrate by
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1 && " + \
                        "python integrate_outputs_rand_row.py -i output_df_list.txt -t d -o int_df_list.txt\n")

        else:
            ifile.write(
                "python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1\n"
            )

        print "Integration script written to integrate_outputs.sh"
        os.chmod("integrate_outputs.sh", 0777)

    with open("fdr_control.sh", 'w') as ffile:
        fdr_string = "python fdr_control.py -m " + integrated_name_dict["Output"] + " -rm " + integrated_name_dict["Rand-Output"] + \
                    " -d " + data_file + " -rd " + rand_data_file + " -n " + args.output_name + " -f \"" + str(args.fdr) + "\" " + \
                    " -c " + str(args.coef_num) + " -mn " + str(1) + " -pp " + args.output_name + "-all-beta-histogram "
        ffile.write(fdr_string + " -sb e && " + fdr_string + " -sb n\n")
        print "FDR CONTROL script written to fdr_control.sh"
        os.chmod("fdr_control.sh", 0777)

    if args.parallel_num > 0:
        print "Parallel Number (# processes per job): " + str(
            args.parallel_num)

        script_groups = pj.partition_inputs(
            script_filenames,
            number=int(
                math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))

        print "Number of script groups ", len(script_groups)

        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            appended_script_filenames = [
                "./" + script_filename for script_filename in script_group
            ]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print "Parallel script list written to parallel_script_list.txt"
Example #9
def run(args):

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    n = len(genes)


    script_filenames = []
    output_filenames = []


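    # split the n x n grid of gene pairs into args.job_num chunks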
    partition_pairs = lazy_partition_pairs(n, n, args.job_num)

    for i, partition_pair in enumerate(partition_pairs):

        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)


        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)
        # prepare the job associated with this

        pair_filename = args.output_name + "-pair-" + str(i) + ".txt"

        command_string = "python run_causal.py -d " + args.data_file.split('/')[-1] + " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -pp " + \
                         str(pair_filename) + " -o " + output_filename


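        # the "gp" test also needs the second data file (-d2)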
        if args.test == "gp":
            command_string += " -d2 " + args.data_file2.split('/')[-1]

        with open(pair_filename, 'w') as pairfile:
            pairfile.write(str(partition_pair) + "\n")

        print "Partition pair written to ", pair_filename


        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)

        print "Script written to ", script_filename

    # submit the jobs soon


    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
        print "Script list written to script_list.txt"

    with open("output_list.txt", 'w') as outputfile:
        for output_filename in output_filenames:
            outputfile.write(output_filename + "\n")
        print "Output list written to output_list.txt"

    with open("integrate_outputs.sh", 'w') as ifile:
        integrated_filename = args.output_name + ".p"
        ifile.write("python integrate_outputs.py -i output_list.txt -o " + integrated_filename + " -n " + str(n) + "\n")
        print "Integration script written to integrate_outputs.sh"
        os.chmod("integrate_outputs.sh", 0777)


    if args.parallel_num > 0:
        print "Parallel Number (# processes per job): " + str(args.parallel_num)

        # use math.ceil (needs "import math") so the group count is never zero
        # when parallel_num exceeds the script count
        script_groups = partition_inputs(script_filenames, number=int(math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))

        print "Number of script groups ", len(script_groups)


        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print "Parallel script list written to parallel_script_list.txt"
Example #10
def run(args):

    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

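    # the pickled arrays are gene x gene x lag; keep the coefficient matrix
    # for the requested lag (coef_num is 1-indexed)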
    matr = pickle.load(open(args.original_matrix, 'rb'))[:, :, args.coef_num - 1]
    rand_matr = pickle.load(open(args.randomized_matrix, 'rb'))[:, :, args.coef_num - 1]

    if args.stratify_by not in {"e", "n"}:
        raise ValueError(
            "Stratify_by must be either 'e' for effect or 'n' for none")
    else:
        if args.stratify_by == "e":
            stratify_by = "effect"
        elif args.stratify_by == "n":
            stratify_by = "none"

    print
    print "Beginning FDR control, stratifying the matrix by ", stratify_by

    genes = data["gene"]
    rand_genes = rand_data["gene"]

    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")

    print "Original matrix for ", args.name, "saved to", args.name + "-unshuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=matr,
                         filename=args.name + "-unshuffled-matrix.txt",
                         genes=genes)

    print "Randomized matrix for ", args.name, "saved to", args.name + "-shuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=rand_matr,
                         filename=args.name + "-shuffled-matrix.txt",
                         genes=rand_genes)

    if args.plot_prefix is not None:
        plot_betas(matr.flatten(),
                   rand_matr.flatten(),
                   filename=args.plot_prefix)
        plot_betas(matr.flatten(),
                   rand_matr.flatten(),
                   filename=args.plot_prefix + "_zoom-in-95",
                   zoom_in_percentile=95)

    if args.cap_by is not None:
        print "First capping original and randomized matrix"
        matr = cap_matr(matr, args.cap_by, name="Original")
        rand_matr = cap_matr(rand_matr, args.cap_by, name="Randomized")

    print "Using original"
    print "Trying to have an FDR of ", args.fdr
    print args.name

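    # each thresholding scheme carries a flag saying whether to also emit
    # an absolute-valued copy of the thresholded matrix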
    functions = [get_abs_thresh, get_pos_neg_thresh]
    types = ["abs-thresh", "pos-neg-thresh"]
    # whether to take absolute value of given matrices
    absoluted = [True, True]

    for function, t, a in zip(functions, types, absoluted):

        print
        print "*******************"
        print t
        print "*******************"

        print "making matrix"

        out_prefix = args.name + "-unshuffled-" + t + "-FDR-" + str(
            args.fdr) + "-stratby-" + stratify_by

        thresh_matr, threshes = function(matr,
                                         rand_matr,
                                         args.fdr,
                                         stratify_by=stratify_by)

        matr_df = gtm.save_gene_matrix(out_prefix + "-matrix.txt", thresh_matr,
                                       genes)
        pickle.dump(threshes, open(out_prefix + "-threshes.p", 'w'))

        print "Matrix written to ", out_prefix + "-matrix.txt"
        print "Threshes written to ", out_prefix + "-threshes.p"

        #write_readme(thresh_matr, out_prefix, args.fdr, out_prefix + '-README.txt', out_prefix + "-matrix")

        if args.make_network:
            print "making network"
            net_df = nh.matr_to_net(matr_df,
                                    args.name + "-sb-" + args.stratify_by,
                                    make_pair=False)

            net_df.to_csv(out_prefix + "-network.txt", sep="\t", index=False)

            print "Network written to ", out_prefix + "-network.txt"

        if a:  # per-scheme flag; the list `absoluted` itself is always truthy
            print "Making absoluted matrix "
            abs_matr = np.absolute(thresh_matr)

            abs_prefix = args.name + "-unshuffled-" + t + "-absoluted-FDR-" + str(
                args.fdr) + "-stratby-" + stratify_by

            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr,
                                          genes)

            #write_readme(abs_matr, abs_prefix, args.fdr, abs_prefix + '-README.txt', abs_prefix + "-matrix")

            if args.make_network:
                print "Making absoluted network"
                abs_net_df = nh.matr_to_net(abs_df,
                                            args.name + "-sb-" +
                                            args.stratify_by,
                                            make_pair=False)

                abs_net_df.to_csv(abs_prefix + "-network.txt",
                                  sep="\t",
                                  index=False)

                print "Network written to ", abs_prefix + "-network.txt"

    print "FINISHED"
    print "#################################################"
    print
Example #11
def run(args):
    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

    matr = pickle.load(open(args.original_matrix, 'rb'))[:, :, args.coef_num - 1]
    rand_matr = pickle.load(open(args.randomized_matrix, 'rb'))[:, :, args.coef_num - 1]

    if args.stratify_by not in {"e", "n"}:
        raise ValueError("Stratify_by must be either 'e' for effect or 'n' for none")
    else:
        if args.stratify_by == "e":
            stratify_by = "effect"
        elif args.stratify_by == "n":
            stratify_by = "none"

    genes = data["gene"]
    rand_genes = rand_data["gene"]

    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")


    print "Original matrix for ", args.name, "saved to", args.name + "-unshuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=matr, filename=args.name + "-unshuffled-matrix.txt", genes=genes)

    print "Randomized matrix for ", args.name, "saved to", args.name + "-shuffled-matrix.txt"
    gtm.save_gene_matrix(matrix=rand_matr, filename=args.name + "-shuffled-matrix.txt", genes=rand_genes)


    if args.plot_prefix is not None:
        plot_betas(matr.flatten(), rand_matr.flatten(), filename=args.plot_prefix)
        plot_betas(matr.flatten(), rand_matr.flatten(), filename=args.plot_prefix + "_zoom-in-95", zoom_in_percentile=95)

    print "Using original"
    print "Trying to have an FDR of ", args.fdr
    print args.name


    functions = [get_abs_thresh, get_pos_thresh, get_neg_thresh, get_pos_neg_thresh]
    types = ["abs-thresh", "pos-thresh", "neg-thresh", "pos-neg-thresh"]
    # whether to take absolute value of given matrices
    absoluted = [True, False, False, True]

    for function, t, a in zip(functions, types, absoluted):
        out_prefix = args.name + "-unshuffled-" + t + "-FDR-" + str(args.fdr) + "-stratby-" + stratify_by


        thresh_matr, threshes = function(matr, rand_matr, args.fdr, stratify_by=stratify_by)


        matr_df = gtm.save_gene_matrix(out_prefix + "-matrix.txt", thresh_matr, genes)
        pickle.dump(threshes, open(out_prefix + "-threshes.p", 'w'))

        print "Matrix written to ", out_prefix + "-matrix.txt"
        print "Threshes written to ", out_prefix + "-threshes.p"

        write_readme(thresh_matr, out_prefix, args.fdr, out_prefix + '-README.txt', out_prefix + "-matrix")

        if args.make_network:
            net_df = nh.matr_to_net(matr_df, args.name, make_pair=False)

            net_df.to_csv(out_prefix + "-network.txt", sep="\t", index=False)

            print "Network written to ", out_prefix + "-network.txt"

        if a:  # per-scheme flag; the list `absoluted` itself is always truthy
            abs_matr = np.absolute(thresh_matr)

            abs_prefix = args.name + "-unshuffled-" + t + "-absoluted-FDR-" + str(args.fdr) + "-stratby-" + stratify_by

            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr, genes)

            write_readme(abs_matr, abs_prefix, args.fdr, abs_prefix + '-README.txt', abs_prefix + "-matrix")

            if args.make_network:
                abs_net_df = nh.matr_to_net(abs_df, args.name, make_pair=False)

                abs_net_df.to_csv(abs_prefix + "-network.txt", sep="\t", index=False)

                print "Network written to ", abs_prefix + "-network.txt"
Example #12
def run(args):

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    n = len(genes)

    script_filenames = []
    output_filenames = []

    partition_pairs = lazy_partition_pairs(n, n, args.job_num)

    for i, partition_pair in enumerate(partition_pairs):

        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)

        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)
        # prepare the job associated with this

        pair_filename = args.output_name + "-pair-" + str(i) + ".txt"

        command_string = "python run_causal.py -d " + args.data_file.split('/')[-1] + " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -pp " + \
                         str(pair_filename) + " -o " + output_filename

        if args.test == "gp":
            command_string += " -d2 " + args.data_file2.split('/')[-1]

        with open(pair_filename, 'w') as pairfile:
            pairfile.write(str(partition_pair) + "\n")

        print "Partition pair written to ", pair_filename

        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)

        print "Script written to ", script_filename

    # submit the jobs soon

    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
        print "Script list written to script_list.txt"

    with open("output_list.txt", 'w') as outputfile:
        for output_filename in output_filenames:
            outputfile.write(output_filename + "\n")
        print "Output list written to output_list.txt"

    with open("integrate_outputs.sh", 'w') as ifile:
        integrated_filename = args.output_name + ".p"
        ifile.write("python integrate_outputs.py -i output_list.txt -o " +
                    integrated_filename + " -n " + str(n) + "\n")
        print "Integration script written to integrate_outputs.sh"
        os.chmod("integrate_outputs.sh", 0777)

    if args.parallel_num > 0:
        print "Parallel Number (# processes per job): " + str(
            args.parallel_num)

        # use math.ceil (needs "import math") so the group count is never zero
        # when parallel_num exceeds the script count
        script_groups = partition_inputs(
            script_filenames,
            number=int(
                math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))

        print "Number of script groups ", len(script_groups)

        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            appended_script_filenames = [
                "./" + script_filename for script_filename in script_group
            ]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print "Parallel script list written to parallel_script_list.txt"