Python Timeseries示例，CGAT.Timeseries Python示例

示例#1

0

显示文件

文件： pipeline_timeseries.py 项目： hainm/CGATPipelines

        def splitFiles(infile, outfile):
            """
            Break *infile* into chunks so later steps can be parallelised.

            The number of chunks is read from ``PARAMS["resampling_chunks"]``
            and ``parallel_files.dir`` is passed as the output directory.
            *outfile* is touched once the split call has returned.
            """

            chunk_count = PARAMS["resampling_chunks"]
            Timeseries.splitFiles(infile=infile,
                                  nchunks=chunk_count,
                                  out_dir="parallel_files.dir")
            P.touch(outfile)

示例#2

0

显示文件

        def splitFiles(infile, outfile):
            """
            Chunk *infile* for parallel processing, then touch *outfile*.

            The chunk count comes from ``PARAMS['resampling_chunks']``;
            ``parallel_files.dir`` is handed to the splitter as ``out_dir``.
            """

            split_options = dict(infile=infile,
                                 nchunks=PARAMS['resampling_chunks'],
                                 out_dir="parallel_files.dir")
            Timeseries.splitFiles(**split_options)
            P.touch(outfile)

示例#3

0

显示文件

文件： pipeline_timeseries.py 项目： hainm/CGATPipelines

    def genReplicateData(infile, outfile):
        """
        Write one file per replicate so each can be clustered on its own.

        The output directory is the part of *outfile* before its first
        "/".  Replicates are split column-wise on the ``replicates``
        grouping variable, which relies on every replicate being present
        across the whole time series.  *outfile* is touched afterwards.
        """

        # text before the first "/" (the whole string if there is none)
        target_dir = outfile.partition("/")[0]
        Timeseries.splitReplicates(infile=infile,
                                   axis="column",
                                   group_var="replicates",
                                   outdir=target_dir)

        P.touch(outfile)

示例#4

0

显示文件

    def genReplicateData(infile, outfile):
        """
        Separate the replicates in *infile* into individual files.

        Each replicate is clustered independently downstream, so the
        replicates must be the same across the whole time series.  The
        leading path component of *outfile* is used as the output
        directory, and *outfile* itself is touched when done.
        """

        leading_component = outfile.split("/", 1)[0]
        split_options = {"infile": infile,
                         "axis": "column",
                         "group_var": "replicates",
                         "outdir": leading_component}
        Timeseries.splitReplicates(**split_options)

        P.touch(outfile)

示例#5

0

显示文件

def randIndexes(clustering_results):
    '''
    Calculate Rand index and adjusted Rand index over pairwise
    clustering comparisons.

    The cluster labels in *clustering_results* (read through its
    ``.values`` attribute) are first remapped via ``get_label_map`` and
    ``make_mapped_matrix``; the mapped matrix is then passed to the
    cythonised ``Timeseries.consensus_metrics``, whose result is
    returned unchanged.
    '''

    # reassign module and gene labels with integer ids, integer comparison is
    # much faster than string comparison
    cluster_labels = clustering_results.values
    map_dict = get_label_map(cluster_labels)

    # NOTE: an unused gene -> row-number dictionary used to be built here;
    # nothing read it, so it has been dropped.  The log line is kept so
    # the progress output is unchanged.
    E.info("mapping gene ids")

    integer_matrix = make_mapped_matrix(map_dict, clustering_results)

    E.info("counting clustering consensus")
    # use cythonized function to return rand index matrix
    cy_rand = Timeseries.consensus_metrics(integer_matrix)
    E.info("Rand Index calculated for all clusterings")

    return cy_rand

示例#6

0

显示文件

文件： diffgene2venn.py 项目： wangdi2014/cgat

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given,
    then passes the comma-separated input files, the alpha threshold
    and the output directory to ``TS.genSigGenes``.
    """

    argv = sys.argv if argv is None else argv

    # build the command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--alpha", dest="alpha", type="string",
                      help="false positive rate for differentially"
                      " expressed genes")
    parser.add_option("--file-list", dest="infiles", type="string",
                      help="comma separated list of input files")
    parser.add_option("--output-directory", dest="out_dir", type="string",
                      help="output directory for png images")

    # E.Start adds the common options (-h/--help, ...) and parses argv
    (options, args) = E.Start(parser, argv=argv)

    file_list = options.infiles.split(",")
    alpha = float(options.alpha)
    TS.genSigGenes(file_list=file_list,
                   alpha=alpha,
                   out_dir=options.out_dir)

    # footer and benchmark information
    E.Stop()

示例#7

0

显示文件

文件： distance2merge.py 项目： gsc0107/cgat

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is read as a comma-separated list of
    files, which is passed to ``TS.mergeFiles`` together with the
    ``--outfile`` value (or ``options.stdout`` when that is not set).
    """

    # an empty or missing argv falls back to the process arguments
    argv = argv or sys.argv

    # build the command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--outfile", dest="outfile", type="string",
                      help="output filename")

    # E.Start adds the common options (-h/--help, ...) and parses argv
    (options, args) = E.Start(parser, argv=argv)

    files_list = argv[-1].split(",")

    # write to the named file when one was given, otherwise to stdout
    outfile = options.outfile if options.outfile else options.stdout

    TS.mergeFiles(file_list=files_list, outfile=outfile)

    # footer and benchmark information
    E.Stop()

示例#8

0

显示文件

文件： diffgene2venn.py 项目： Q-KIM/cgat

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given,
    and calls ``TS.genSigGenes`` with the list of input files, the
    alpha value converted to float, and the output directory.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # (flags, settings) pairs, registered in this order
    option_specs = (
        (("-t", "--test"),
         dict(dest="test", type="string",
              help="supply help")),
        (("--alpha",),
         dict(dest="alpha", type="string",
              help="false positive rate for differentially"
              " expressed genes")),
        (("--file-list",),
         dict(dest="infiles", type="string",
              help="comma separated list of input files")),
        (("--output-directory",),
         dict(dest="out_dir", type="string",
              help="output directory for png images")),
    )
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    input_paths = options.infiles.split(",")
    TS.genSigGenes(file_list=input_paths,
                   alpha=float(options.alpha),
                   out_dir=options.out_dir)

    # Write footer and output benchmark information.
    E.Stop()

示例#9

0

显示文件

文件： distance2merge.py 项目： Q-KIM/cgat

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given.
    The final command line element is split on commas into a list of
    files and merged via ``TS.mergeFiles`` into ``--outfile``, or into
    ``options.stdout`` if no output filename was supplied.
    """

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # (flags, settings) pairs, registered in this order
    option_specs = [
        (("-t", "--test"),
         {"dest": "test", "type": "string", "help": "supply help"}),
        (("--outfile",),
         {"dest": "outfile", "type": "string", "help": "output filename"}),
    ]
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    last_argument = argv[-1]
    files_list = last_argument.split(",")

    if options.outfile:
        destination = options.outfile
    else:
        destination = options.stdout

    TS.mergeFiles(file_list=files_list,
                  outfile=destination)

    # write footer and output benchmark information
    E.Stop()

示例#10

0

显示文件

文件： expression2expression.py 项目： mmaarriiee/cgat

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is used as the input file.  Depending on
    ``--task`` one of ``TS.deseqNormalize`` ("deseq"), ``TS.maSigPro``
    ("masigpro"), ``TS.covarFilter`` ("sumcovar") or
    ``TS.avTimeExpression`` ("average_expression") is called and the
    returned data frame is written tab-separated to ``options.stdout``.

    Raises:
        ValueError: if ``--task`` is missing or not one of the tasks above.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--time", dest="timepoints", type="string",
                      help="a comma-separated list of time points measured")

    parser.add_option("--replicates", dest="reps", type="string",
                      help="a comma-separated list of replicate IDs")

    parser.add_option("--conditions", dest="conditions", type="string",
                      help="a comma-separated list of experimental conditions")

    # help strings below had no space between the concatenated pieces
    parser.add_option("--orders", dest="orders", type="int",
                      help="order of polynomial terms to include in "
                      "maSigPro linear model")

    parser.add_option("--fdr", dest="fdr", type="string",
                      help="FDR for calling DEGs")

    parser.add_option("--padjust", dest="padjust", type="string",
                      help="multiple testing correction to apply to "
                      "control FDR")

    parser.add_option("--stepwise", dest="stepwise", type="string",
                      help="stepwise regression to use")

    parser.add_option("--pinclude", dest="pinclude", type="string",
                      help="p-value for inclusion in stepwise regression")

    parser.add_option("--rsquared", dest="rsquared", type="string",
                      help="rsquared cut-off for DEG reporting")

    parser.add_option("--var-group", dest="vargroup", type="string",
                      help="variable group reporting. each, all or "
                      "group")

    parser.add_option("--task", dest="task", type="string",
                      help="analysis task to be executed")

    parser.add_option("--infile", dest="infile", type="string",
                      help="input file path")

    parser.add_option("--quantile", dest="quantile", type="int",
                      help="see pipeline.ini for explanation")

    # Defaults only take effect when registered before parsing; this call
    # used to sit after E.Start, where it could not influence `options`.
    parser.set_defaults(cutHeight=0,
                        conditions=None,
                        split=False,
                        cluster_size=30)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # NOTE(review): --infile is parsed but the last argv element is what
    # is actually used as the input path - confirm this is intended.
    infile = argv[-1]

    if options.task == "deseq":
        timepoints = sorted(int(x) for x in options.timepoints.split(","))
        reps = options.reps.split(",")
        if options.conditions:
            conditions = options.conditions.split(",")
        else:
            conditions = None

        data_frame = TS.deseqNormalize(infile=infile,
                                       time_points=timepoints,
                                       reps=reps,
                                       conditions=conditions)

    elif options.task == "masigpro":
        data_frame = TS.maSigPro(infile=infile,
                                 order_terms=int(options.orders),
                                 fdr=float(options.fdr),
                                 adjust=options.padjust,
                                 stepwise=options.stepwise,
                                 include_p=float(options.pinclude),
                                 rsq=float(options.rsquared),
                                 var_group=options.vargroup)

    elif options.task == "sumcovar":
        timepoints = [int(x) for x in options.timepoints.split(",")]
        reps = options.reps.split(",")
        data_frame = TS.covarFilter(infile=infile,
                                    time_points=timepoints,
                                    replicates=reps,
                                    quantile=int(options.quantile))

    elif options.task == "average_expression":
        data_frame = TS.avTimeExpression(infile)

    else:
        # previously fell through and failed later with an unbound
        # `data_frame`; report the actual problem instead
        raise ValueError("unknown or missing --task: %r" % (options.task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()

示例#11

0

显示文件

文件： timeseries2diffgenes.py 项目： Q-KIM/cgat

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is taken as the input count table; when
    it cannot be opened, ``options.stdin`` is read instead.  A ``gz``
    file extension selects gzip decompression.

    Column names are split on "." and the first two fields are used as
    condition and time point.

    * ``--method=timepoint``: every baseline column group (time field
      equal to ``'000'``) is merged with every other condition/time
      group and passed to ``TS.timepointDESeq2``.
    * ``--method=condition``: each condition's baseline (time equal to
      0) is merged with that condition's later time points; for every
      pair of conditions the merged tables are joined per time span and
      passed to ``TS.conditionDESeq2``.

    Each result frame is written as a tab-separated file in the
    ``--results-directory``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    # the help text used to read "resultstables" (missing space)
    parser.add_option("--results-directory", dest="res_dir",
                      type="string", help="directory to write results "
                      "tables to")

    parser.add_option("--alpha", dest="alpha", type="string",
                      help="statistical significance p-value threshold")

    parser.add_option("--method", dest="method", type="string",
                      help="analysis design. "
                      "either timepoint or condition")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]
    try:
        # Only probe that the path is readable; the handle is closed
        # straight away instead of being leaked.
        with open(infile, "r"):
            pass
        source_name = infile
    except IOError:
        infile = options.stdin
        source_name = infile.name

    # check for compression state
    comp = "gzip" if source_name.split(".")[-1] == "gz" else None

    alpha = float(options.alpha)
    res_dir = options.res_dir

    count_table = pd.read_table(infile,
                                sep="\t",
                                index_col=0,
                                header=0,
                                compression=comp)
    columns = count_table.columns
    conditions = set(x.split(".")[0] for x in columns)
    times = set(x.split(".")[1] for x in columns)

    data_dict = {}
    cond_times = list(itertools.product(conditions, times))
    base_col = {}
    time_dict = {}

    if options.method == "timepoint":

        # column names are split on "."; the first two fields are
        # `condition` and `time`
        # use `condition`.`time` as dictionary keys

        for x in cond_times:
            c_t = "%s.%s" % (x[0], x[1])
            # Literal substring test: the key used to be applied as a
            # regular expression, so "." matched any character and regex
            # metacharacters in a condition name broke the match.
            cols = [k for k in count_table.columns if c_t in k]
            # NOTE(review): the baseline is recognised by the literal
            # string '000' here but by int(...) == 0 in the condition
            # branch - confirm which is intended.
            if x[1] == '000':
                base_col[c_t] = count_table[cols]
            else:
                time_dict[c_t] = count_table[cols]

        # NOTE(review): the key below only carries the baseline
        # condition and the time, so pairs that differ only in the
        # non-baseline condition overwrite each other - confirm.
        for bt in itertools.product(base_col.keys(),
                                    time_dict.keys()):
            df = pd.merge(left=base_col[bt[0]],
                          right=time_dict[bt[1]],
                          how='outer',
                          left_index=True,
                          right_index=True)
            time = int(bt[1].split(".")[1])
            data_dict["%s_0_%i" % (bt[0].split(".")[0],
                                   time)] = df

        for each, df_ in data_dict.items():
            outfile = "%s/%s-time.tsv" % (res_dir,
                                          each)
            res_frame = TS.timepointDESeq2(df_,
                                           each,
                                           alpha,
                                           res_dir)
            res_frame.to_csv(outfile,
                             sep="\t",
                             index_label="gene_id")

    elif options.method == "condition":

        # column names are split on "."; the first two fields are
        # `condition` and `time`
        # use `condition`.`time` as dictionary keys

        for x in cond_times:
            c_t = "%s.%s" % (x[0], x[1])
            # literal substring test, see the timepoint branch
            cols = [k for k in count_table.columns if c_t in k]
            if int(x[1]) == 0:
                base_col[c_t] = count_table[cols]
            else:
                time_dict[c_t] = count_table[cols]

        # make a dataframe for each 0:time point combination
        # for all conditions, keyed on `condition.0_time`

        base_keys = base_col.keys()
        time_keys = time_dict.keys()
        for k in conditions:
            for x in itertools.product(base_keys, time_keys):
                # keep only pairs where both keys mention condition k
                # (literal substring, not a regular expression)
                if k in x[0] and k in x[1]:
                    df = pd.merge(left=base_col[x[0]],
                                  right=time_dict[x[1]],
                                  how='outer',
                                  left_index=True,
                                  right_index=True)
                    time = int(x[1].split(".")[1])
                    data_dict["%s.0_%i" % (x[0].split(".")[0],
                                           time)] = df

        # the part after "." in each key, i.e. "0_<time>"
        time_span = set(x.split(".")[1] for x in data_dict.keys())

        all_dict = {}
        for c1, c2 in itertools.combinations(conditions, 2):
            for x in time_span:
                key1 = "%s.%s" % (c1, x)
                key2 = "%s.%s" % (c2, x)
                df = pd.merge(left=data_dict[key1],
                              right=data_dict[key2],
                              how='outer',
                              left_index=True,
                              right_index=True)
                all_dict["%s_%s.%s-diff" % (c1, c2, x)] = df

        for each, df in all_dict.items():
            outfile = "%s/%s-cond.tsv" % (res_dir,
                                          each)
            res_frame = TS.conditionDESeq2(df,
                                           each,
                                           alpha,
                                           res_dir)
            res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    # write footer and output benchmark information.
    E.Stop()

示例#12

0

显示文件

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is taken as the input count table; when
    it cannot be opened, ``options.stdin`` is read instead.  A ``gz``
    file extension selects gzip decompression.

    Column names are split on "." and the first two fields are used as
    condition and time point.

    * ``--method=timepoint``: every baseline column group (time field
      equal to ``'000'``) is merged with every other condition/time
      group and passed to ``TS.timepointDESeq2``.
    * ``--method=condition``: each condition's baseline (time equal to
      0) is merged with that condition's later time points; for every
      pair of conditions the merged tables are joined per time span and
      passed to ``TS.conditionDESeq2``.

    Each result frame is written as a tab-separated file in the
    ``--results-directory``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    # the help text used to read "resultstables" (missing space)
    parser.add_option("--results-directory",
                      dest="res_dir",
                      type="string",
                      help="directory to write results "
                      "tables to")

    parser.add_option("--alpha",
                      dest="alpha",
                      type="string",
                      help="statistical significance p-value threshold")

    parser.add_option("--method",
                      dest="method",
                      type="string",
                      help="analysis design. "
                      "either timepoint or condition")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]
    try:
        # Only probe that the path is readable; the handle is closed
        # straight away instead of being leaked.
        with open(infile, "r"):
            pass
        source_name = infile
    except IOError:
        infile = options.stdin
        source_name = infile.name

    # check for compression state
    comp = "gzip" if source_name.split(".")[-1] == "gz" else None

    alpha = float(options.alpha)
    res_dir = options.res_dir

    count_table = pd.read_table(infile,
                                sep="\t",
                                index_col=0,
                                header=0,
                                compression=comp)
    columns = count_table.columns
    conditions = set(x.split(".")[0] for x in columns)
    times = set(x.split(".")[1] for x in columns)

    data_dict = {}
    cond_times = list(itertools.product(conditions, times))
    base_col = {}
    time_dict = {}

    if options.method == "timepoint":

        # column names are split on "."; the first two fields are
        # `condition` and `time`
        # use `condition`.`time` as dictionary keys

        for x in cond_times:
            c_t = "%s.%s" % (x[0], x[1])
            # Literal substring test: the key used to be applied as a
            # regular expression, so "." matched any character and regex
            # metacharacters in a condition name broke the match.
            cols = [k for k in count_table.columns if c_t in k]
            # NOTE(review): the baseline is recognised by the literal
            # string '000' here but by int(...) == 0 in the condition
            # branch - confirm which is intended.
            if x[1] == '000':
                base_col[c_t] = count_table[cols]
            else:
                time_dict[c_t] = count_table[cols]

        # NOTE(review): the key below only carries the baseline
        # condition and the time, so pairs that differ only in the
        # non-baseline condition overwrite each other - confirm.
        for bt in itertools.product(base_col.keys(), time_dict.keys()):
            df = pd.merge(left=base_col[bt[0]],
                          right=time_dict[bt[1]],
                          how='outer',
                          left_index=True,
                          right_index=True)
            time = int(bt[1].split(".")[1])
            data_dict["%s_0_%i" % (bt[0].split(".")[0], time)] = df

        for each, df_ in data_dict.items():
            outfile = "%s/%s-time.tsv" % (res_dir, each)
            res_frame = TS.timepointDESeq2(df_, each, alpha, res_dir)
            res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    elif options.method == "condition":

        # column names are split on "."; the first two fields are
        # `condition` and `time`
        # use `condition`.`time` as dictionary keys

        for x in cond_times:
            c_t = "%s.%s" % (x[0], x[1])
            # literal substring test, see the timepoint branch
            cols = [k for k in count_table.columns if c_t in k]
            if int(x[1]) == 0:
                base_col[c_t] = count_table[cols]
            else:
                time_dict[c_t] = count_table[cols]

        # make a dataframe for each 0:time point combination
        # for all conditions, keyed on `condition.0_time`

        base_keys = base_col.keys()
        time_keys = time_dict.keys()
        for k in conditions:
            for x in itertools.product(base_keys, time_keys):
                # keep only pairs where both keys mention condition k
                # (literal substring, not a regular expression)
                if k in x[0] and k in x[1]:
                    df = pd.merge(left=base_col[x[0]],
                                  right=time_dict[x[1]],
                                  how='outer',
                                  left_index=True,
                                  right_index=True)
                    time = int(x[1].split(".")[1])
                    data_dict["%s.0_%i" % (x[0].split(".")[0], time)] = df

        # the part after "." in each key, i.e. "0_<time>"
        time_span = set(x.split(".")[1] for x in data_dict.keys())

        all_dict = {}
        for c1, c2 in itertools.combinations(conditions, 2):
            for x in time_span:
                key1 = "%s.%s" % (c1, x)
                key2 = "%s.%s" % (c2, x)
                df = pd.merge(left=data_dict[key1],
                              right=data_dict[key2],
                              how='outer',
                              left_index=True,
                              right_index=True)
                all_dict["%s_%s.%s-diff" % (c1, c2, x)] = df

        for each, df in all_dict.items():
            outfile = "%s/%s-cond.tsv" % (res_dir, each)
            res_frame = TS.conditionDESeq2(df, each, alpha, res_dir)
            res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    # write footer and output benchmark information.
    E.Stop()

示例#13

0

显示文件

文件： distance2clusters.py 项目： wangdi2014/cgat

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is used as the input (a comma-separated
    list for some tasks).  ``--task`` selects the analysis:

    * ``cluster``: ``TS.treeCutting``
    * ``clustagree``: ``TS.clusterAgreement`` (``--method=resample``) or
      ``TS.clusterAverage`` (``--method=replicate``)
    * ``consensus-cluster``: ``TS.consensusClustering``
    * ``pca``: ``TS.clusterPCA``; the input is ``cluster_file,infile``

    The resulting data frame is written tab-separated to
    ``options.stdout``.

    Raises:
        ValueError: if ``--task`` is missing or unknown, or if
            ``clustagree`` is requested without ``--method``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--task",
                      dest="task",
                      type="string",
                      help="analysis task to be executed")

    parser.add_option("--infile",
                      dest="infile",
                      type="string",
                      help="input file path")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("replicate", "resample"),
                      help="whether to use replicate or resample "
                      "for consensus clustering.")

    parser.add_option("--cluster-algorithm",
                      dest="cluster",
                      type="string",
                      help="hierarchical clustering algorithm")

    parser.add_option("--expression-file",
                      dest="express",
                      type="string",
                      help="matching expression data from input"
                      " distance matrix")

    parser.add_option("--cluster-file",
                      dest="clustfile",
                      type="string",
                      help="file to output cluster labels to")

    parser.add_option("--output-file",
                      dest="outfile",
                      type="string",
                      help="output file to write to")

    # the help text used to read "clustersas valid" (missing space)
    parser.add_option("--cut-height",
                      dest="cutHeight",
                      type="string",
                      help="threshold at which to define consensus clusters "
                      "as valid")

    parser.add_option("--split-clusters",
                      dest="split",
                      action="store_true",
                      help="switch for using deepSplit in tree cutting")

    parser.add_option("--cluster-size",
                      dest="cluster_size",
                      type="int",
                      help="minimum cluster size for tree cutting. Clusters "
                      "with fewer than this many objects will be merged with "
                      "nearest cluster. Default=30")

    parser.add_option("--image-dir",
                      dest="images_dir",
                      type="string",
                      help="directory to write plots/figures to")

    # Defaults must be registered before the command line is parsed.
    # This call used to come after E.Start, so an omitted --cut-height or
    # --cluster-size reached float()/int() as None.
    parser.set_defaults(cutHeight=0,
                        conditions=None,
                        split=False,
                        cluster_size=30)

    (options, args) = E.Start(parser, argv=argv)

    # NOTE(review): --infile is parsed but the last argv element is what
    # is actually used as the input - confirm this is intended.
    infile = argv[-1]

    if options.task == "cluster":

        data_frame = TS.treeCutting(infile=infile,
                                    expression_file=options.express,
                                    cluster_file=options.clustfile,
                                    cluster_algorithm=options.cluster,
                                    deepsplit=options.split)

    elif options.task == "clustagree":
        if options.method == "resample":
            data_frame = TS.clusterAgreement(infile)
        elif options.method == "replicate":
            file_list = infile.split(",")
            data_frame = TS.clusterAverage(file_list)
        else:
            # --method is a choice option, so this is the "not given" case
            raise ValueError("task 'clustagree' requires --method to be "
                             "either 'replicate' or 'resample'")

    elif options.task == "consensus-cluster":
        min_size = int(options.cluster_size)
        data_frame = TS.consensusClustering(infile=infile,
                                            cutHeight=float(options.cutHeight),
                                            cluster_algorithm=options.cluster,
                                            min_size=min_size,
                                            deepsplit=options.split)

    elif options.task == "pca":
        # input is given as "<cluster file>,<expression/distance file>"
        files = infile.split(",")
        infile = files[1]
        cluster_file = files[0]
        data_frame = TS.clusterPCA(infile=infile,
                                   cluster_file=cluster_file,
                                   image_dir=options.images_dir)

    else:
        # previously fell through and failed later with an unbound
        # `data_frame`; report the actual problem instead
        raise ValueError("unknown or missing --task: %r" % (options.task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()

示例#14

0

显示文件

def main(argv=None):
    """Script entry point.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is opened as the input table, falling
    back to ``options.stdin`` when that raises ``IOError``.  The table,
    a (times x replicates) ``MultiIndex`` built from ``--time`` and
    ``--replicates``, and the remaining options are passed to
    ``TS.genResampleData``.
    """

    argv = sys.argv if argv is None else argv

    # build the command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--time", dest="timepoints", type="string",
                      help="a comma-separated list of time points measured")
    parser.add_option("--replicates", dest="reps", type="string",
                      help="a comma-separated list of replicate IDs")
    parser.add_option("--condition", dest="condition", type="string",
                      help="experimental condition")
    parser.add_option("--resamples", dest="resamples", type="string",
                      help="number of times to resample replicates to"
                      " generate pseudo datasets")
    parser.add_option("--input-gtf", dest="gtf_file", type="string",
                      help="reference gtf file")
    parser.add_option("--output-file-directory", dest="output_dir",
                      type="string",
                      help="directory to output"
                      " resampled files to")

    # E.start adds the common options (-h/--help, ...) and parses argv
    (options, args) = E.start(parser, argv=argv)

    try:
        source = IOTools.open_file(argv[-1], "r")
    except IOError:
        source = options.stdin

    count_frame = pd.read_table(source, sep="\t", index_col=0, header=0)

    time_labels = options.timepoints.split(",")
    time_values = list(map(int, time_labels))
    replicate_ids = options.reps.split(",")
    n_resamples = int(options.resamples)

    # index over every (time label, replicate id) combination
    sample_index = pd.MultiIndex.from_product([time_labels, replicate_ids],
                                              names=['times', 'replicates'])

    TS.genResampleData(data_frame=count_frame,
                       multiple_index=sample_index,
                       replicates=n_resamples,
                       sample_reps=replicate_ids,
                       times=time_values,
                       condition=options.condition,
                       ref_gtf=options.gtf_file,
                       out_dir=options.output_dir,
                       seed=int(options.random_seed))

    # footer and benchmark information
    E.stop()

示例#15

0

显示文件

文件： expression2expression.py 项目： wangdi2014/cgat

def main(argv=None):
    """Run the script: parse options, execute one task, write a table.

    Command line options are parsed from ``sys.argv`` unless *argv* is
    given.  The input file is taken from the last element of *argv*.
    The task named by ``--task`` (``deseq``, ``masigpro``, ``sumcovar``
    or ``average_expression``) is dispatched to the matching ``TS``
    function and the returned data frame is written tab-separated to
    ``options.stdout`` with ``gene_id`` as the index label.

    Raises:
        ValueError: if ``--task`` is missing or names an unsupported task.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--time",
                      dest="timepoints",
                      type="string",
                      help="a comma-separated list of time points measured")

    parser.add_option("--replicates",
                      dest="reps",
                      type="string",
                      help="a comma-separated list of replicate IDs")

    parser.add_option("--conditions",
                      dest="conditions",
                      type="string",
                      help="a comma-separated list of experimental conditions")

    parser.add_option("--orders",
                      dest="orders",
                      type="int",
                      help="order of polynomial terms to include in"
                      " maSigPro linear model")

    parser.add_option("--fdr",
                      dest="fdr",
                      type="string",
                      help="FDR for calling DEGs")

    parser.add_option("--padjust",
                      dest="padjust",
                      type="string",
                      help="multiple testing correction to apply to"
                      " control FDR")

    parser.add_option("--stepwise",
                      dest="stepwise",
                      type="string",
                      help="stepwise regression to use")

    parser.add_option("--pinclude",
                      dest="pinclude",
                      type="string",
                      help="p-value for inclusion in stepwise regression")

    parser.add_option("--rsquared",
                      dest="rsquared",
                      type="string",
                      help="rsquared cut-off for DEG reporting")

    parser.add_option("--var-group",
                      dest="vargroup",
                      type="string",
                      help="variable group reporting. each, all or"
                      " group")

    parser.add_option("--task",
                      dest="task",
                      type="string",
                      help="analysis task to be executed")

    parser.add_option("--infile",
                      dest="infile",
                      type="string",
                      help="input file path")

    parser.add_option("--quantile",
                      dest="quantile",
                      type="int",
                      help="see pipeline.ini for explanation")

    # Defaults only take effect when registered BEFORE the command line
    # is parsed; the original call came after parsing and was a no-op.
    parser.set_defaults(cutHeight=0,
                        conditions=None,
                        split=False,
                        cluster_size=30)

    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    if options.task == "deseq":
        timepoints = sorted(int(x) for x in options.timepoints.split(","))
        reps = options.reps.split(",")
        if options.conditions:
            conditions = options.conditions.split(",")
        else:
            conditions = None

        data_frame = TS.deseqNormalize(infile=infile,
                                       time_points=timepoints,
                                       reps=reps,
                                       conditions=conditions)

    elif options.task == "masigpro":
        data_frame = TS.maSigPro(infile=infile,
                                 order_terms=int(options.orders),
                                 fdr=float(options.fdr),
                                 adjust=options.padjust,
                                 stepwise=options.stepwise,
                                 include_p=float(options.pinclude),
                                 rsq=float(options.rsquared),
                                 var_group=options.vargroup)

    elif options.task == "sumcovar":
        # note: time points are passed in the order given (not sorted)
        timepoints = [int(x) for x in options.timepoints.split(",")]
        reps = options.reps.split(",")
        data_frame = TS.covarFilter(infile=infile,
                                    time_points=timepoints,
                                    replicates=reps,
                                    quantile=int(options.quantile))

    elif options.task == "average_expression":
        data_frame = TS.avTimeExpression(infile)

    else:
        # previously fell through and failed later with an unbound
        # ``data_frame``; fail early with an explicit message instead
        raise ValueError(
            "unknown task %r, expected one of: deseq, masigpro, "
            "sumcovar, average_expression" % (options.task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()

示例#16

0

显示文件

文件： clusters2metrics.py 项目： gsc0107/cgat

def main(argv=None):
    """Summarise clustering results according to ``--method``.

    Command line options are parsed from ``sys.argv`` unless *argv* is
    given.  The input (a file, or a comma-separated list of files) is
    taken from the last element of *argv*.  Results are written
    tab-separated to ``options.stdout``.

    Methods:
        metrics: pair-wise comparison of clustering columns (labels up
            to 50) via ``TS.adjustedMutualInformation`` and
            ``TS.randIndexes``.
        summary: median cluster size per input file.
        module_summary: size and mean summed-transcript length of each
            cluster per input file, using the gtf files given by
            ``--ref-gtf-files``.

    File names for ``summary``/``module_summary`` are split on ``-``:
    the first field is used as condition, the second as reference.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("metrics", "summary", "module_summary"),
                      help="method to summarise clustering")

    parser.add_option("--ref-gtf-files",
                      dest="ref_gtf",
                      type="string",
                      help="comma separated list of reference gtf files")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.method == "metrics":
        infile = argv[-1]
        E.info("loading input file: %s" % infile)
        assert infile

        df = pd.read_table(infile, sep="\t", header=None, index_col=0)

        # With header=None the columns carry integer labels, so the old
        # ``.ix[:, :50]`` was a label-based slice; ``.loc`` is the
        # explicit equivalent (``.ix`` no longer exists in current pandas).
        df = df.loc[:, :50]
        cluster_combs = itertools.combinations(df.columns, 2)
        genes = df.index
        results_dict = {}
        all_clusts = {}

        E.info("setting up cluster containers")
        for i in df.columns:
            column = df[i]
            clusters = set(column.values.tolist())
            cluster_dict = dict((clust, []) for clust in clusters)
            for gene in genes:
                cluster_dict[column[gene]].append(gene)

            # replace each member list by the set of all gene pairs
            # that co-occur in that cluster
            for clust in clusters:
                cluster_dict[clust] = set(
                    itertools.combinations(cluster_dict[clust], 2))
            # register once per column (also covers an empty column)
            all_clusts[i] = cluster_dict

        E.info("generating all pair-wise cluster comparisons")
        E.info("calculating adjusted mutual information")
        for k in cluster_combs:
            ami = TS.adjustedMutualInformation(all_clusts[k[0]],
                                               all_clusts[k[1]])
            results_dict[k] = {'AMI': ami}

        res_frame = pd.DataFrame(results_dict).T
        res_frame = res_frame.reset_index()
        res_frame.drop(['level_0', 'level_1'], inplace=True, axis=1)

        # flatten rand indices and add to output dataframe
        rand_arrays = TS.randIndexes(df)
        flat_adj_rand = TS.unravel_arrays(rand_arrays[0])
        flat_rand = TS.unravel_arrays(rand_arrays[1])
        res_frame['Rand_Index'] = flat_rand
        res_frame['Adjusted_Rand_Index'] = flat_adj_rand
        E.info("aggregating results")

        res_frame.to_csv(options.stdout, sep="\t", index_label='idx')

    elif options.method == "summary":
        infiles = argv[-1]
        list_of_files = infiles.split(",")

        file_dict = {}
        for fle in list_of_files:
            fname = fle.split("/")[-1]
            name_fields = fname.split("-")
            condition = name_fields[0]
            ref = name_fields[1]
            df_ = pd.read_table(fle, sep="\t", header=0, index_col=0)
            df_.columns = ['gene_id', 'cluster']

            # count genes per cluster
            clust_dict = {}
            for cluster in df_['cluster']:
                clust_dict[cluster] = clust_dict.get(cluster, 0) + 1

            # list() is required: numpy cannot take a dict view directly
            med_size = np.median(list(clust_dict.values()))
            file_dict[fname] = {
                'condition': condition,
                'reference': ref,
                'median_cluster_size': med_size
            }

        outframe = pd.DataFrame(file_dict).T
        outframe.to_csv(options.stdout, sep="\t", index_label='idx')

    elif options.method == "module_summary":
        # get lncRNA/gene lengths from reference gtfs
        ref_gtfs = options.ref_gtf.split(",")
        length_dict = {}
        for ref in ref_gtfs:
            oref = IOTools.openFile(ref, "rb")
            try:
                git = GTF.transcript_iterator(GTF.iterator(oref))
                for gene in git:
                    for trans in gene:
                        length = trans.end - trans.start
                        length_dict[trans.gene_id] = (
                            length_dict.get(trans.gene_id, 0) + length)
            finally:
                # release the handle even if parsing fails
                oref.close()

        infiles = argv[-1]
        list_of_files = infiles.split(",")

        fdfs = []
        for fle in list_of_files:
            name_fields = fle.split("/")[-1].split("-")
            cond = name_fields[0]
            refer = name_fields[1]
            _df = pd.read_table(fle, sep="\t", header=0, index_col=0)
            _df.columns = ['gene_id', 'cluster']
            clusters = set(_df['cluster'])
            c_dict = {}
            # summarize over each cluster
            for clust in clusters:
                c_df = _df[_df['cluster'] == clust]
                lengths = [length_dict[lid] for lid in c_df['gene_id']]
                # built once per cluster rather than once per gene
                c_dict[clust] = {
                    'cluster_size': len(c_df['gene_id']),
                    'mean_length': np.mean(lengths),
                    'index': (cond, refer),
                    'module': clust
                }
            cdf = pd.DataFrame(c_dict).T
            # use a multindex for hierarchical indexing
            midx = pd.MultiIndex.from_tuples(cdf['index'])
            cdf.index = midx
            cdf.drop(['index'], inplace=True, axis=1)
            fdfs.append(cdf)

        # generate a single output df; pd.concat replaces the repeated
        # DataFrame.append calls, which current pandas no longer provides
        s_df = pd.concat(fdfs)

        s_df.to_csv(options.stdout,
                    index_label=("condition", "reference"),
                    sep="\t")

    # write footer and output benchmark information.
    E.Stop()

示例#17

0

显示文件

文件： data2resamples.py 项目： CGATOxford/cgat

def main(argv=None):
    """Generate resampled pseudo-datasets from a time-series table.

    Command line options are parsed from ``sys.argv`` unless *argv* is
    given.  The table is read from the file named by the last element
    of *argv*; if that file cannot be opened (``IOError``) the table is
    read from ``options.stdin`` instead.  A ``times`` x ``replicates``
    multi-index is built from ``--time`` and ``--replicates`` and the
    work is delegated to ``TS.genResampleData``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--time", dest="timepoints", type="string",
                      help="a comma-separated list of time points measured")

    parser.add_option("--replicates", dest="reps", type="string",
                      help="a comma-separated list of replicate IDs")

    parser.add_option("--condition", dest="condition", type="string",
                      help="experimental condition")

    parser.add_option("--resamples", dest="resamples", type="string",
                      help="number of times to resample replicates to"
                      " generate pseudo datasets")

    parser.add_option("--input-gtf", dest="gtf_file", type="string",
                      help="reference gtf file")

    parser.add_option("--output-file-directory", dest="output_dir",
                      type="string", help="directory to output"
                      " resampled files to")

    # add common options (-h/--help, ...) and parse command line

    (options, args) = E.Start(parser, argv=argv)

    # Track whether we opened the handle ourselves so that only our own
    # file (never the caller's stdin) is closed afterwards.
    try:
        infile = IOTools.openFile(argv[-1], "r")
        opened_here = True
    except IOError:
        infile = options.stdin
        opened_here = False

    try:
        data_frame = pd.read_table(infile,
                                   sep="\t",
                                   index_col=0,
                                   header=0)
    finally:
        if opened_here:
            infile.close()

    time_str = options.timepoints.split(",")
    time_points = [int(x) for x in time_str]
    replicates = options.reps.split(",")
    reps = int(options.resamples)

    # the index uses the time points as strings, the ``times`` argument
    # below receives them as integers
    midx = pd.MultiIndex.from_product([time_str, replicates],
                                      names=['times', 'replicates'])

    TS.genResampleData(data_frame=data_frame,
                       multiple_index=midx,
                       replicates=reps,
                       sample_reps=replicates,
                       times=time_points,
                       condition=options.condition,
                       ref_gtf=options.gtf_file,
                       out_dir=options.output_dir,
                       seed=int(options.random_seed))

    # Write footer and output benchmark information.
    E.Stop()

示例#18

0

显示文件

文件： distance2clusters.py 项目： SCV/cgat

def main(argv=None):
    """Cluster a distance matrix or derive consensus clusters.

    Command line options are parsed from ``sys.argv`` unless *argv* is
    given.  The input (a file, or a comma-separated list of files) is
    taken from the last element of *argv*.  The task named by
    ``--task`` (``cluster``, ``clustagree``, ``consensus-cluster`` or
    ``pca``) is dispatched to the matching ``TS`` function and the
    returned data frame is written tab-separated to ``options.stdout``
    with ``gene_id`` as the index label.

    Raises:
        ValueError: if ``--task`` is missing/unsupported, or if the
            ``clustagree`` task is requested without ``--method``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--task", dest="task", type="string",
                      help="analysis task to be executed")

    parser.add_option("--infile", dest="infile", type="string",
                      help="input file path")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("replicate", "resample"),
                      help="whether to use replicate or resample "
                      "for consensus clustering.")

    parser.add_option("--cluster-algorithm", dest="cluster", type="string",
                      help="hierarchical clustering algorithm")

    parser.add_option("--expression-file", dest="express", type="string",
                      help="matching expression data from input"
                      " distance matrix")

    parser.add_option("--cluster-file", dest="clustfile", type="string",
                      help="file to output cluster labels to")

    parser.add_option("--output-file", dest="outfile", type="string",
                      help="output file to write to")

    parser.add_option("--cut-height", dest="cutHeight", type="string",
                      help="threshold at which to define consensus clusters"
                      " as valid")

    parser.add_option("--split-clusters", dest="split", action="store_true",
                      help="switch for using deepSplit in tree cutting")

    parser.add_option("--cluster-size", dest="cluster_size", type="int",
                      help="minimum cluster size for tree cutting. Clusters "
                      "with fewer than this many objects will be merged with "
                      "nearest cluster. Default=30")

    parser.add_option("--image-dir", dest="images_dir", type="string",
                      help="directory to write plots/figures to")

    # Defaults must be registered BEFORE parsing to have any effect.
    # The original call came after parsing, so the documented
    # ``Default=30`` was never applied and ``int(options.cluster_size)``
    # / ``float(options.cutHeight)`` failed on ``None``.
    parser.set_defaults(cutHeight=0,
                        conditions=None,
                        split=False,
                        cluster_size=30)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    if options.task == "cluster":
        data_frame = TS.treeCutting(infile=infile,
                                    expression_file=options.express,
                                    cluster_file=options.clustfile,
                                    cluster_algorithm=options.cluster,
                                    deepsplit=options.split)

    elif options.task == "clustagree":
        if options.method == "resample":
            data_frame = TS.clusterAgreement(infile)
        elif options.method == "replicate":
            file_list = infile.split(",")
            data_frame = TS.clusterAverage(file_list)
        else:
            raise ValueError(
                "task 'clustagree' requires --method to be either "
                "'replicate' or 'resample'")

    elif options.task == "consensus-cluster":
        min_size = int(options.cluster_size)
        data_frame = TS.consensusClustering(infile=infile,
                                            cutHeight=float(options.cutHeight),
                                            cluster_algorithm=options.cluster,
                                            min_size=min_size,
                                            deepsplit=options.split)

    elif options.task == "pca":
        # input is "<cluster file>,<expression/distance file>"
        files = infile.split(",")
        cluster_file = files[0]
        infile = files[1]
        data_frame = TS.clusterPCA(infile=infile,
                                   cluster_file=cluster_file,
                                   image_dir=options.images_dir)

    else:
        # previously fell through and failed later with an unbound
        # ``data_frame``; fail early with an explicit message instead
        raise ValueError(
            "unknown task %r, expected one of: cluster, clustagree, "
            "consensus-cluster, pca" % (options.task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()

示例#19

0

显示文件

def main(argv=None):
    """Compute a pair-wise distance table between time-series rows.

    Command line options are parsed from ``sys.argv`` unless *argv* is
    given.  The last positional argument is the input file.  Without
    ``--parallel`` it is the expression table itself.  With
    ``--parallel`` the expression table is read from
    ``--expression-file`` and the input file *name* supplies the slice
    of rows to use as columns: the fourth ``-``-separated field of the
    base name must look like ``<start>_<end>``.

    The metric named by ``--distance-metric`` (``dtw``,
    ``cross-correlate`` or ``temporal-correlate``; default ``dtw``) is
    dispatched to ``TS.dtwWrapper`` or ``TS.correlateDistanceMetric``.
    The result is written tab-separated to ``--out`` if given,
    otherwise to ``options.stdout``.

    Raises:
        ValueError: if ``--distance-metric`` is not a supported metric.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--k",
                      dest="k",
                      type="int",
                      default=0,
                      help="value of k to adjust adaptive tuning function")

    parser.add_option("--out",
                      dest="outfile",
                      type="string",
                      help="output file name")

    parser.add_option("--expression-file",
                      dest="expr",
                      type="string",
                      help="file containing expression data")

    parser.add_option("--parallel",
                      dest="parallel",
                      action="store_true",
                      default=False,
                      help="switches on parallel, will"
                      " split distance matrix into relevant number of"
                      " slices. Start-end positions are defined by"
                      " the file name.")

    parser.add_option("--distance-metric",
                      dest="dist_metric",
                      type="string",
                      help="distance metric to use for dissimilarity of time "
                      "series objects.  Choices: dtw, cross-correlate, "
                      "temporal-correlate. Default=dtw")

    parser.add_option("--lag",
                      dest="lag",
                      type="string",
                      help="cross correlation lag to report")

    # Defaults must be registered BEFORE parsing to have any effect.
    # ``dist_metric`` is defaulted to "dtw" as promised by the help
    # text; previously omitting the option left the result unbound.
    parser.set_defaults(lag=0, k=0, dist_metric="dtw")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infile = args[-1]

    known_metrics = ("dtw", "cross-correlate", "temporal-correlate")
    if options.dist_metric not in known_metrics:
        raise ValueError(
            "unknown distance metric %r, expected one of: %s" %
            (options.dist_metric, ", ".join(known_metrics)))

    if options.parallel:
        datfile = options.expr
    else:
        datfile = infile

    data = pd.read_table(datfile, sep="\t", index_col=0, header=0)

    # data should already be sorted in time-series order
    # the time and replicate columns needs to be dropped to ensure only the
    # gene data is passed into the DTW function

    # drop header line(s) and non-numerical rows.  Each label is handled
    # on its own so a missing 'times' row does not prevent 'replicates'
    # from being dropped; a missing label is reported as ValueError or
    # KeyError depending on the pandas version.
    for label in ('times', 'replicates'):
        try:
            data.drop([label], inplace=True, axis=0)
        except (KeyError, ValueError):
            pass
    genes = data.index
    # NOTE(review): convert_objects is deprecated in newer pandas -
    # confirm the pandas version this script is deployed with.
    data = data.convert_objects(convert_numeric=True)

    # select the genes used as columns: either every gene, or the slice
    # encoded in the input file name when running in parallel
    if options.parallel:
        # use the base name so '-' in directory names cannot shift the
        # field position (the dtw branch used to parse the full path)
        slice_fields = infile.split("/")[-1].split("-")[3].split("_")
        start_idx = int(slice_fields[0])
        end_idx = int(slice_fields[1])
        columns = genes[start_idx:end_idx]
    else:
        columns = genes

    if options.dist_metric == "dtw":
        df_ = TS.dtwWrapper(data=data,
                            rows=genes,
                            columns=columns,
                            k=options.k)

    elif options.dist_metric == "cross-correlate":
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=options.dist_metric,
                                         lag=int(options.lag))

    else:
        # temporal-correlate
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=options.dist_metric)

    if not options.outfile:
        df_.to_csv(options.stdout, sep="\t")
    else:
        df_.to_csv(options.outfile, sep="\t")

    # write footer and output benchmark information
    E.stop()

示例#20

0

显示文件

文件： clusters2metrics.py 项目： Q-KIM/cgat

def main(argv=None):
    """Summarise clustering results according to ``--method``.

    Command line options are parsed from ``sys.argv`` unless *argv* is
    given.  The input (a file, or a comma-separated list of files) is
    taken from the last element of *argv*.  Results are written
    tab-separated to ``options.stdout``.

    Methods:
        metrics: pair-wise comparison of clustering columns (labels up
            to 50) via ``TS.adjustedMutualInformation`` and
            ``TS.randIndexes``.
        summary: median cluster size per input file.
        module_summary: size and mean summed-transcript length of each
            cluster per input file, using the gtf files given by
            ``--ref-gtf-files``.

    File names for ``summary``/``module_summary`` are split on ``-``:
    the first field is used as condition, the second as reference.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("metrics", "summary", "module_summary"),
                      help="method to summarise clustering")

    parser.add_option("--ref-gtf-files", dest="ref_gtf", type="string",
                      help="comma separated list of reference gtf files")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.method == "metrics":
        infile = argv[-1]
        E.info("loading input file: %s" % infile)
        assert infile

        df = pd.read_table(infile,
                           sep="\t",
                           header=None,
                           index_col=0)

        # With header=None the columns carry integer labels, so the old
        # ``.ix[:, :50]`` was a label-based slice; ``.loc`` is the
        # explicit equivalent (``.ix`` no longer exists in current pandas).
        df = df.loc[:, :50]
        cluster_combs = itertools.combinations(df.columns, 2)
        genes = df.index
        results_dict = {}
        all_clusts = {}

        E.info("setting up cluster containers")
        for i in df.columns:
            column = df[i]
            clusters = set(column.values.tolist())
            cluster_dict = dict((clust, []) for clust in clusters)
            for gene in genes:
                cluster_dict[column[gene]].append(gene)

            # replace each member list by the set of all gene pairs
            # that co-occur in that cluster
            for clust in clusters:
                cluster_dict[clust] = set(
                    itertools.combinations(cluster_dict[clust], 2))
            # register once per column (also covers an empty column)
            all_clusts[i] = cluster_dict

        E.info("generating all pair-wise cluster comparisons")
        E.info("calculating adjusted mutual information")
        for k in cluster_combs:
            ami = TS.adjustedMutualInformation(all_clusts[k[0]],
                                               all_clusts[k[1]])
            results_dict[k] = {'AMI': ami}

        res_frame = pd.DataFrame(results_dict).T
        res_frame = res_frame.reset_index()
        res_frame.drop(['level_0', 'level_1'], inplace=True, axis=1)

        # flatten rand indices and add to output dataframe
        rand_arrays = TS.randIndexes(df)
        flat_adj_rand = TS.unravel_arrays(rand_arrays[0])
        flat_rand = TS.unravel_arrays(rand_arrays[1])
        res_frame['Rand_Index'] = flat_rand
        res_frame['Adjusted_Rand_Index'] = flat_adj_rand
        E.info("aggregating results")

        res_frame.to_csv(options.stdout,
                         sep="\t",
                         index_label='idx')

    elif options.method == "summary":
        infiles = argv[-1]
        list_of_files = infiles.split(",")

        file_dict = {}
        for fle in list_of_files:
            fname = fle.split("/")[-1]
            name_fields = fname.split("-")
            condition = name_fields[0]
            ref = name_fields[1]
            df_ = pd.read_table(fle,
                                sep="\t",
                                header=0,
                                index_col=0)
            df_.columns = ['gene_id', 'cluster']

            # count genes per cluster
            clust_dict = {}
            for cluster in df_['cluster']:
                clust_dict[cluster] = clust_dict.get(cluster, 0) + 1

            # list() is required: on Python 3 ``dict.values()`` is a
            # view that numpy cannot reduce directly
            med_size = np.median(list(clust_dict.values()))
            file_dict[fname] = {'condition': condition,
                                'reference': ref,
                                'median_cluster_size': med_size}

        outframe = pd.DataFrame(file_dict).T
        outframe.to_csv(options.stdout,
                        sep="\t",
                        index_label='idx')

    elif options.method == "module_summary":
        # get lncRNA/gene lengths from reference gtfs
        ref_gtfs = options.ref_gtf.split(",")
        length_dict = {}
        for ref in ref_gtfs:
            oref = IOTools.openFile(ref, "rb")
            try:
                git = GTF.transcript_iterator(GTF.iterator(oref))
                for gene in git:
                    for trans in gene:
                        length = trans.end - trans.start
                        length_dict[trans.gene_id] = (
                            length_dict.get(trans.gene_id, 0) + length)
            finally:
                # release the handle even if parsing fails
                oref.close()

        infiles = argv[-1]
        list_of_files = infiles.split(",")

        fdfs = []
        for fle in list_of_files:
            name_fields = fle.split("/")[-1].split("-")
            cond = name_fields[0]
            refer = name_fields[1]
            _df = pd.read_table(fle, sep="\t",
                                header=0, index_col=0)
            _df.columns = ['gene_id', 'cluster']
            clusters = set(_df['cluster'])
            c_dict = {}
            # summarize over each cluster
            for clust in clusters:
                c_df = _df[_df['cluster'] == clust]
                lengths = [length_dict[lid] for lid in c_df['gene_id']]
                # built once per cluster rather than once per gene
                c_dict[clust] = {'cluster_size': len(c_df['gene_id']),
                                 'mean_length': np.mean(lengths),
                                 'index': (cond, refer),
                                 'module': clust}
            cdf = pd.DataFrame(c_dict).T
            # use a multindex for hierarchical indexing
            midx = pd.MultiIndex.from_tuples(cdf['index'])
            cdf.index = midx
            cdf.drop(['index'], inplace=True, axis=1)
            fdfs.append(cdf)

        # generate a single output df; pd.concat replaces the repeated
        # DataFrame.append calls, which current pandas no longer provides
        s_df = pd.concat(fdfs)

        s_df.to_csv(options.stdout,
                    index_label=("condition", "reference"),
                    sep="\t")

    # write footer and output benchmark information.
    E.Stop()

示例#21

0

显示文件

文件： expression2distance.py 项目： SCV/cgat

def _slice_bounds_from_filename(filename):
    """Return the ``(start, end)`` slice indices encoded in *filename*.

    Any leading directory is stripped first, so a ``-`` in a directory
    name cannot shift the fields.  The fourth ``-``-separated field of
    the base name is then split on ``_``; its first two pieces are
    converted to integers and returned as start and end.

    Raises ValueError if the file name does not follow that layout.
    """
    basename = filename.split("/")[-1]
    try:
        pieces = basename.split("-")[3].split("_")
        return int(pieces[0]), int(pieces[1])
    except (IndexError, ValueError):
        raise ValueError(
            "cannot parse start/end slice indices from file name "
            "'%s'" % filename)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated expression table, computes a pairwise distance
    table between its rows with the metric selected by
    ``--distance-metric`` (dtw, cross-correlate or temporal-correlate;
    dtw when not given) and writes the result tab-separated to ``--out``
    or, if that is not set, to the stdout handle provided by ``E.Start``.

    With ``--parallel`` the table is read from ``--expression-file`` and
    the last positional argument is only used for its name, which encodes
    the start/end indices of the slice of rows to use as columns.
    Without ``--parallel`` the last positional argument is the table
    itself and all rows are compared against all rows.

    Raises ValueError if no positional argument is given, if
    ``--parallel`` is used without ``--expression-file``, if the metric
    is not recognised, or if the slice indices cannot be parsed from the
    file name.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--k", dest="k", type="int", default=0,
                      help="value of k to adjust adaptive tuning function")

    parser.add_option("--out", dest="outfile", type="string",
                      help="output file name")

    parser.add_option("--expression-file", dest="expr", type="string",
                      help="file containing expression data")

    parser.add_option("--parallel", dest="parallel", action="store_true",
                      default=False, help="switches on parallel, will"
                      " split distance matrix into relevant number of"
                      " slices. Start-end positions are defined by"
                      " the file name.")

    parser.add_option("--distance-metric", dest="dist_metric", type="string",
                      help="distance metric to use for dissimilarity of time "
                      "series objects.  Choices: dtw, cross-correlate, "
                      "temporal-correlate. Default=dtw")

    parser.add_option("--lag", dest="lag", type="string",
                      help="cross correlation lag to report")

    # defaults only take effect when registered before the command line
    # is parsed; dist_metric gets the default its help text documents
    parser.set_defaults(lag=0,
                        k=0,
                        dist_metric="dtw")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if not args:
        raise ValueError("no input file given on the command line")
    infile = args[-1]

    metric = options.dist_metric
    known_metrics = ("dtw", "cross-correlate", "temporal-correlate")
    if metric not in known_metrics:
        raise ValueError(
            "unknown distance metric '%s', choose one of: %s" %
            (metric, ", ".join(known_metrics)))

    if options.parallel:
        if not options.expr:
            raise ValueError("--expression-file is required with --parallel")
        datfile = options.expr
    else:
        datfile = infile

    data = pd.read_table(datfile,
                         sep="\t",
                         index_col=0,
                         header=0)

    # data should already be sorted in time-series order
    # the time and replicate columns needs to be dropped to ensure only the
    # gene data is passed into the DTW function

    # drop header line(s) and non-numerical rows.  Each label is handled
    # on its own so that a missing 'times' row does not prevent the
    # 'replicates' row from being removed.  A missing label is not an
    # error; depending on the pandas version it is reported as
    # ValueError or KeyError.
    for label in ("times", "replicates"):
        try:
            data.drop([label], inplace=True, axis=0)
        except (ValueError, KeyError):
            pass
    genes = data.index
    # NOTE(review): convert_objects is deprecated and was removed in
    # pandas 1.0 - kept here to preserve the existing numeric coercion.
    data = data.convert_objects(convert_numeric=True)

    # rows are always the full gene list; in parallel mode the columns
    # are restricted to the slice named by the input file
    if options.parallel:
        start_idx, end_idx = _slice_bounds_from_filename(infile)
        columns = genes[start_idx:end_idx]
    else:
        columns = genes

    if metric == "dtw":
        df_ = TS.dtwWrapper(data=data,
                            rows=genes,
                            columns=columns,
                            k=options.k)

    elif metric == "cross-correlate":
        # guard against an explicit None even though a default is set
        lag = 0 if options.lag is None else int(options.lag)
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=metric,
                                         lag=lag)

    else:
        # temporal-correlate takes no lag argument
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=metric)

    if not options.outfile:
        df_.to_csv(options.stdout, sep="\t")
    else:
        df_.to_csv(options.outfile, sep="\t")

    # write footer and output benchmark information
    E.Stop()