示例#1
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated count table (first column used as the index)
    from the file named by the last command line argument or, if that
    cannot be opened, from ``options.stdin``.  Column names are split on
    "." and are expected to look like ``condition.time.replicate``.

    For every condition the columns at time 0 are merged with the columns
    of each later time point.  With ``--method=timepoint`` each merged
    table is passed to ``TS.timepointDESeq2`` and the result written to
    ``<res_dir>/<condition>_0_<time>-time.tsv``.  With
    ``--method=condition`` the merged tables of every pair of conditions
    are merged again per time point, passed to ``TS.conditionDESeq2`` and
    written to ``<res_dir>/<c1>_<c2>.0_<time>-diff-cond.tsv``.

    Raises
    ------
    ValueError
        if ``--method`` is neither "timepoint" nor "condition".
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--results-directory", dest="res_dir",
                      type="string", help="directory to write results "
                      "tables to")

    parser.add_option("--alpha", dest="alpha", type="string",
                      help="statistical significance p-value threshold")

    parser.add_option("--method", dest="method", type="string",
                      help="analysis design. "
                      "either timepoint or condition")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # fail early instead of silently producing no output
    if options.method not in ("timepoint", "condition"):
        raise ValueError("--method must be 'timepoint' or 'condition', "
                         "got %r" % (options.method,))

    try:
        infile = argv[-1]
        # only probing that the path can be opened; close it straight away
        with open(infile, "r"):
            pass
    except IOError:
        # fall back to the stream supplied by E.Start
        infile = options.stdin
        source_name = str(getattr(infile, "name", ""))
    else:
        source_name = infile

    # check for compression state
    comp = "gzip" if source_name.split(".")[-1] == "gz" else None

    alpha = float(options.alpha)
    res_dir = options.res_dir

    count_table = pd.read_table(infile,
                                sep="\t",
                                index_col=0,
                                header=0,
                                compression=comp)
    columns = count_table.columns

    # sorted so that pairings and output file names are reproducible
    # between runs (set order depends on string hashing)
    conditions = sorted({x.split(".")[0] for x in columns})
    times = sorted({x.split(".")[1] for x in columns})

    # assumes all column names are in the form
    # `condition`.`time`.`replicate`
    # use (`condition`, `time`) as dictionary keys
    base_col = {}
    time_dict = {}
    for cond, tp in itertools.product(conditions, times):
        # exact comparison of the first two fields; a regex/substring
        # search would also pick up e.g. "KOWT.000" for "WT.000"
        cols = [k for k in columns if k.split(".")[:2] == [cond, tp]]
        if int(tp) == 0:
            base_col[(cond, tp)] = count_table[cols]
        else:
            time_dict[(cond, tp)] = count_table[cols]

    # make a dataframe for each 0:time point combination within a
    # condition, keyed on (condition, time as int)
    data_dict = {}
    for base_key, time_key in itertools.product(base_col, time_dict):
        if base_key[0] != time_key[0]:
            # never pair the baseline of one condition with the time
            # point of another
            continue
        df = pd.merge(left=base_col[base_key],
                      right=time_dict[time_key],
                      how='outer',
                      left_index=True,
                      right_index=True)
        data_dict[(base_key[0], int(time_key[1]))] = df

    if options.method == "timepoint":

        for (cond, time), df_ in data_dict.items():
            each = "%s_0_%i" % (cond, time)
            outfile = "%s/%s-time.tsv" % (res_dir,
                                          each)
            res_frame = TS.timepointDESeq2(df_,
                                           each,
                                           alpha,
                                           res_dir)
            res_frame.to_csv(outfile,
                             sep="\t",
                             index_label="gene_id")

    else:

        time_span = sorted({key[1] for key in data_dict})

        for c1, c2 in itertools.combinations(conditions, 2):
            for time in time_span:
                df = pd.merge(left=data_dict[(c1, time)],
                              right=data_dict[(c2, time)],
                              how='outer',
                              left_index=True,
                              right_index=True)
                each = "%s_%s.0_%i-diff" % (c1, c2, time)
                outfile = "%s/%s-cond.tsv" % (res_dir,
                                              each)
                res_frame = TS.conditionDESeq2(df,
                                               each,
                                               alpha,
                                               res_dir)
                res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    # write footer and output benchmark information.
    E.Stop()
示例#2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated count table (first column used as the index)
    from the file named by the last command line argument or, if that
    cannot be opened, from ``options.stdin``.  Column names are split on
    "." and are expected to look like ``condition.time.replicate``.

    For every condition the columns at time 0 are merged with the columns
    of each later time point.  With ``--method=timepoint`` each merged
    table is passed to ``TS.timepointDESeq2`` and the result written to
    ``<res_dir>/<condition>_0_<time>-time.tsv``.  With
    ``--method=condition`` the merged tables of every pair of conditions
    are merged again per time point, passed to ``TS.conditionDESeq2`` and
    written to ``<res_dir>/<c1>_<c2>.0_<time>-diff-cond.tsv``.

    Raises
    ------
    ValueError
        if ``--method`` is neither "timepoint" nor "condition".
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--results-directory",
                      dest="res_dir",
                      type="string",
                      help="directory to write results "
                      "tables to")

    parser.add_option("--alpha",
                      dest="alpha",
                      type="string",
                      help="statistical significance p-value threshold")

    parser.add_option("--method",
                      dest="method",
                      type="string",
                      help="analysis design. "
                      "either timepoint or condition")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # fail early instead of silently producing no output
    if options.method not in ("timepoint", "condition"):
        raise ValueError("--method must be 'timepoint' or 'condition', "
                         "got %r" % (options.method,))

    try:
        infile = argv[-1]
        # only probing that the path can be opened; close it straight away
        with open(infile, "r"):
            pass
    except IOError:
        # fall back to the stream supplied by E.Start
        infile = options.stdin
        source_name = str(getattr(infile, "name", ""))
    else:
        source_name = infile

    # check for compression state
    comp = "gzip" if source_name.split(".")[-1] == "gz" else None

    alpha = float(options.alpha)
    res_dir = options.res_dir

    count_table = pd.read_table(infile,
                                sep="\t",
                                index_col=0,
                                header=0,
                                compression=comp)
    columns = count_table.columns

    # sorted so that pairings and output file names are reproducible
    # between runs (set order depends on string hashing)
    conditions = sorted({x.split(".")[0] for x in columns})
    times = sorted({x.split(".")[1] for x in columns})

    # assumes all column names are in the form
    # `condition`.`time`.`replicate`
    # use (`condition`, `time`) as dictionary keys
    base_col = {}
    time_dict = {}
    for cond, tp in itertools.product(conditions, times):
        # exact comparison of the first two fields; a regex/substring
        # search would also pick up e.g. "KOWT.000" for "WT.000"
        cols = [k for k in columns if k.split(".")[:2] == [cond, tp]]
        if int(tp) == 0:
            base_col[(cond, tp)] = count_table[cols]
        else:
            time_dict[(cond, tp)] = count_table[cols]

    # make a dataframe for each 0:time point combination within a
    # condition, keyed on (condition, time as int)
    data_dict = {}
    for base_key, time_key in itertools.product(base_col, time_dict):
        if base_key[0] != time_key[0]:
            # never pair the baseline of one condition with the time
            # point of another
            continue
        df = pd.merge(left=base_col[base_key],
                      right=time_dict[time_key],
                      how='outer',
                      left_index=True,
                      right_index=True)
        data_dict[(base_key[0], int(time_key[1]))] = df

    if options.method == "timepoint":

        for (cond, time), df_ in data_dict.items():
            each = "%s_0_%i" % (cond, time)
            outfile = "%s/%s-time.tsv" % (res_dir, each)
            res_frame = TS.timepointDESeq2(df_, each, alpha, res_dir)
            res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    else:

        time_span = sorted({key[1] for key in data_dict})

        for c1, c2 in itertools.combinations(conditions, 2):
            for time in time_span:
                df = pd.merge(left=data_dict[(c1, time)],
                              right=data_dict[(c2, time)],
                              how='outer',
                              left_index=True,
                              right_index=True)
                each = "%s_%s.0_%i-diff" % (c1, c2, time)
                outfile = "%s/%s-cond.tsv" % (res_dir, each)
                res_frame = TS.conditionDESeq2(df, each, alpha, res_dir)
                res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    # write footer and output benchmark information.
    E.Stop()