def splitFiles(infile, outfile):
    """
    Break the input file into chunks so work can be parallelised.

    The chunk count is read from ``PARAMS["resampling_chunks"]`` and
    ``"parallel_files.dir"`` is passed as the output directory to
    ``Timeseries.splitFiles``. Once the split call returns, `outfile`
    is touched via ``P.touch``.
    """
    chunk_count = PARAMS["resampling_chunks"]
    chunk_dir = "parallel_files.dir"

    Timeseries.splitFiles(infile=infile,
                          nchunks=chunk_count,
                          out_dir=chunk_dir)

    P.touch(outfile)
def splitFiles(infile, outfile):
    '''
    Cut `infile` into a configurable number of chunks for parallel
    processing.

    ``Timeseries.splitFiles`` is called with the chunk count taken from
    ``PARAMS['resampling_chunks']`` and the output directory
    "parallel_files.dir"; `outfile` is then touched with ``P.touch``.
    '''
    split_kwargs = {
        "infile": infile,
        "nchunks": PARAMS['resampling_chunks'],
        "out_dir": "parallel_files.dir",
    }
    Timeseries.splitFiles(**split_kwargs)

    P.touch(outfile)
def genReplicateData(infile, outfile):
    """
    Write each replicate to its own file so that clustering can be run
    within each replicate.

    Relies on each replicate being the same across the whole time series.

    The output directory is the first ``/``-separated component of
    `outfile`. The split itself is delegated to
    ``Timeseries.splitReplicates`` (column axis, grouped on
    "replicates"), after which `outfile` is touched with ``P.touch``.
    """
    # everything before the first "/" names the target directory
    target_dir = outfile.split("/", 1)[0]

    Timeseries.splitReplicates(infile=infile,
                               axis="column",
                               group_var="replicates",
                               outdir=target_dir)

    P.touch(outfile)
def genReplicateData(infile, outfile):
    '''
    Separate the replicates in `infile` into individual files for
    per-replicate clustering.

    Relies on each replicate being the same across the whole time series.

    The leading path component of `outfile` (the text before the first
    "/") is handed to ``Timeseries.splitReplicates`` as the output
    directory; `outfile` is touched with ``P.touch`` when the call returns.
    '''
    leading_dir, _, _ = outfile.partition("/")

    split_kwargs = dict(infile=infile,
                        axis="column",
                        group_var="replicates",
                        outdir=leading_dir)
    Timeseries.splitReplicates(**split_kwargs)

    P.touch(outfile)
def randIndexes(clustering_results):
    '''
    Calculate Rand index and adjusted Rand index over pairwise
    clustering comparisons.

    Parameters
    ----------
    clustering_results
        object exposing ``.values`` (the cluster labels); it is also
        passed unchanged to ``make_mapped_matrix``.
        # presumably a pandas DataFrame of genes x clusterings - TODO confirm

    Returns
    -------
    whatever ``Timeseries.consensus_metrics`` returns for the
    integer-mapped label matrix, unmodified.
    '''
    # reassign module and gene labels with integer ids, integer comparison is
    # much faster than string comparison
    cluster_labels = clustering_results.values
    map_dict = get_label_map(cluster_labels)

    E.info("mapping gene ids")
    integer_matrix = make_mapped_matrix(map_dict, clustering_results)

    E.info("counting clustering consensus")
    # the index computation itself is delegated to Timeseries.consensus_metrics
    cy_rand = Timeseries.consensus_metrics(integer_matrix)
    E.info("Rand Index calculated for all clusterings")

    return cy_rand
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The comma separated ``--file-list`` value is split into a list and
    passed, together with ``--alpha`` (converted to float) and
    ``--output-directory``, to ``TS.genSigGenes``.
    """
    argv = sys.argv if argv is None else argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # (flags, keyword arguments) for every script-specific option
    option_table = (
        (("-t", "--test"),
         dict(dest="test", type="string",
              help="supply help")),
        (("--alpha",),
         dict(dest="alpha", type="string",
              help="false positive rate for differentially expressed genes")),
        (("--file-list",),
         dict(dest="infiles", type="string",
              help="comma separated list of input files")),
        (("--output-directory",),
         dict(dest="out_dir", type="string",
              help="output directory for png images")),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    # add common options (-h/--help, ...) and parse command line
    options, _unused = E.Start(parser, argv=argv)

    TS.genSigGenes(file_list=options.infiles.split(","),
                   alpha=float(options.alpha),
                   out_dir=options.out_dir)

    # Write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is read as a comma separated list of
    files, which is handed to ``TS.mergeFiles``. The merge target is
    ``--outfile`` when supplied, otherwise ``options.stdout``.
    """
    # an empty or missing argv falls back to the process arguments
    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("-t", "--test",
                      dest="test",
                      type="string",
                      help="supply help")
    parser.add_option("--outfile",
                      dest="outfile",
                      type="string",
                      help="output filename")

    # add common options (-h/--help, ...) and parse command line
    options, _unused = E.Start(parser, argv=argv)

    merge_inputs = argv[-1].split(",")
    destination = options.outfile or options.stdout

    TS.mergeFiles(file_list=merge_inputs, outfile=destination)

    # write footer and output benchmark information
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is used as the input file. ``--task``
    selects which ``TS`` function is run:

    * ``deseq`` -> ``TS.deseqNormalize``
    * ``masigpro`` -> ``TS.maSigPro``
    * ``sumcovar`` -> ``TS.covarFilter``
    * ``average_expression`` -> ``TS.avTimeExpression``

    The returned data frame is written to ``options.stdout`` as a
    tab-separated table with the index labelled ``gene_id``.

    Raises
    ------
    ValueError
        if ``--task`` is missing or is not one of the tasks above.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--time", dest="timepoints", type="string",
                      help="a comma-separated list of time points measured")
    parser.add_option("--replicates", dest="reps", type="string",
                      help="a comma-separated list of replicate IDs")
    parser.add_option("--conditions", dest="conditions", type="string",
                      help="a comma-separated list of experimental conditions")
    parser.add_option("--orders", dest="orders", type="int",
                      help="order of polynomial terms to include in "
                      "maSigPro linear model")
    parser.add_option("--fdr", dest="fdr", type="string",
                      help="FDR for calling DEGs")
    parser.add_option("--padjust", dest="padjust", type="string",
                      help="multiple testing correction to apply to "
                      "control FDR")
    parser.add_option("--stepwise", dest="stepwise", type="string",
                      help="stepwise regression to use")
    parser.add_option("--pinclude", dest="pinclude", type="string",
                      help="p-value for inclusion in stepwise regression")
    parser.add_option("--rsquared", dest="rsquared", type="string",
                      help="rsquared cut-off for DEG reporting")
    parser.add_option("--var-group", dest="vargroup", type="string",
                      help="variable group reporting. each, all or "
                      "group")
    parser.add_option("--task", dest="task", type="string",
                      help="analysis task to be executed")
    parser.add_option("--infile", dest="infile", type="string",
                      help="input file path")
    parser.add_option("--quantile", dest="quantile", type="int",
                      help="see pipeline.ini for explanation")

    # Defaults only take effect when registered before the command line
    # is parsed. Only `conditions` belongs to this script's options.
    parser.set_defaults(conditions=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    if options.task == "deseq":
        timepoints = sorted(int(x) for x in options.timepoints.split(","))
        reps = options.reps.split(",")
        if options.conditions:
            conditions = options.conditions.split(",")
        else:
            conditions = None

        data_frame = TS.deseqNormalize(infile=infile,
                                       time_points=timepoints,
                                       reps=reps,
                                       conditions=conditions)

    elif options.task == "masigpro":
        data_frame = TS.maSigPro(infile=infile,
                                 order_terms=int(options.orders),
                                 fdr=float(options.fdr),
                                 adjust=options.padjust,
                                 stepwise=options.stepwise,
                                 include_p=float(options.pinclude),
                                 rsq=float(options.rsquared),
                                 var_group=options.vargroup)

    elif options.task == "sumcovar":
        # unlike the deseq task, time points are passed on unsorted
        timepoints = [int(x) for x in options.timepoints.split(",")]
        reps = options.reps.split(",")
        data_frame = TS.covarFilter(infile=infile,
                                    time_points=timepoints,
                                    replicates=reps,
                                    quantile=int(options.quantile))

    elif options.task == "average_expression":
        data_frame = TS.avTimeExpression(infile)

    else:
        # without this, the to_csv call below fails on an unbound name
        raise ValueError(
            "unknown --task %r: expected one of deseq, masigpro, "
            "sumcovar, average_expression" % (options.task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()
def _columns_for(columns, condition, time_point):
    """Return the column names whose first two ``.``-separated fields are
    exactly *condition* and *time_point*."""
    wanted = [condition, time_point]
    return [col for col in columns if col.split(".")[:2] == wanted]


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is tried as the input path; if it cannot
    be opened, ``options.stdin`` is read instead. Gzip compression is
    assumed when the name's last ``.``-separated field is ``gz``.

    Column names are split on ``.``: the first field is used as the
    condition and the second as the time point.

    * ``--method timepoint``: baseline columns (time field ``'000'``) are
      merged with each other time point and passed to
      ``TS.timepointDESeq2``; results go to
      ``<res_dir>/<key>-time.tsv``.
    * ``--method condition``: for each pair of conditions, the
      baseline+time tables of both are merged and passed to
      ``TS.conditionDESeq2``; results go to
      ``<res_dir>/<key>-cond.tsv``.

    Any other ``--method`` value produces no output.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--results-directory", dest="res_dir", type="string",
                      help="directory to write results "
                      "tables to")
    parser.add_option("--alpha", dest="alpha", type="string",
                      help="statistical significance p-value threshold")
    parser.add_option("--method", dest="method", type="string",
                      help="analysis design. "
                      "either timepoint or condition")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    try:
        infile = argv[-1]
        # the open is only a readability probe; close the handle at once
        with open(infile, "r"):
            pass
        source_name = infile
    except IOError:
        infile = options.stdin
        source_name = infile.name

    # check for compression state
    if source_name.split(".")[-1] == "gz":
        comp = "gzip"
    else:
        comp = None

    alpha = float(options.alpha)
    res_dir = options.res_dir

    count_table = pd.read_table(infile,
                                sep="\t",
                                index_col=0,
                                header=0,
                                compression=comp)
    columns = count_table.columns
    conditions = set(x.split(".")[0] for x in columns)
    times = set(x.split(".")[1] for x in columns)

    data_dict = {}
    cond_times = list(itertools.product(conditions, times))
    base_col = {}
    time_dict = {}

    if options.method == "timepoint":
        # assumes all column names are in the form
        # `condition`.`time`.`replicate`
        # use `condition`.`time` as dictionary keys
        for cond, time_pt in cond_times:
            c_t = "%s.%s" % (cond, time_pt)
            # exact field comparison: a regex search on c_t would treat
            # "." as a wildcard and also match longer time labels
            cols = _columns_for(columns, cond, time_pt)
            # NOTE(review): baseline is the literal label '000' here but
            # int(time) == 0 in the condition branch - confirm intent
            if time_pt == '000':
                base_col[c_t] = count_table[cols]
            else:
                time_dict[c_t] = count_table[cols]

        # NOTE(review): every baseline is paired with every time table,
        # including those of other conditions, and the key only carries
        # the baseline's condition - confirm this is intended
        for base_key, time_key in itertools.product(base_col.keys(),
                                                    time_dict.keys()):
            df = pd.merge(left=base_col[base_key],
                          right=time_dict[time_key],
                          how='outer',
                          left_index=True,
                          right_index=True)
            time = int(time_key.split(".")[1])
            data_dict["%s_0_%i" % (base_key.split(".")[0], time)] = df

        for each, df_ in data_dict.items():
            outfile = "%s/%s-time.tsv" % (res_dir, each)
            res_frame = TS.timepointDESeq2(df_, each, alpha, res_dir)
            res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    elif options.method == "condition":
        # assumes all column names are in the form
        # `condition`.`time`.`replicate`
        # use `condition`.`time` as dictionary keys
        for cond, time_pt in cond_times:
            c_t = "%s.%s" % (cond, time_pt)
            cols = _columns_for(columns, cond, time_pt)
            if int(time_pt) == 0:
                base_col[c_t] = count_table[cols]
            else:
                time_dict[c_t] = count_table[cols]

        # make a dataframe for each 0:time point combination
        # for all conditions, index on `condition.0_time`
        base_keys = list(base_col.keys())
        time_keys = list(time_dict.keys())
        for cond in conditions:
            for base_key, time_key in itertools.product(base_keys,
                                                        time_keys):
                # pair tables only when both belong to this condition
                # (exact match, so "WT" does not also select "WT2")
                same_condition = (base_key.split(".")[0] == cond and
                                  time_key.split(".")[0] == cond)
                if not same_condition:
                    continue
                df = pd.merge(left=base_col[base_key],
                              right=time_dict[time_key],
                              how='outer',
                              left_index=True,
                              right_index=True)
                time = int(time_key.split(".")[1])
                data_dict["%s.0_%i" % (base_key.split(".")[0], time)] = df

        # keys are `condition.0_time`; collect the distinct `0_time` parts
        time_span = set(x.split(".")[1] for x in data_dict.keys())

        all_dict = {}
        for c1, c2 in itertools.combinations(conditions, 2):
            for span in time_span:
                key1 = "%s.%s" % (c1, span)
                key2 = "%s.%s" % (c2, span)
                df = pd.merge(left=data_dict[key1],
                              right=data_dict[key2],
                              how='outer',
                              left_index=True,
                              right_index=True)
                all_dict["%s_%s.%s-diff" % (c1, c2, span)] = df

        for each, df in all_dict.items():
            outfile = "%s/%s-cond.tsv" % (res_dir, each)
            res_frame = TS.conditionDESeq2(df, each, alpha, res_dir)
            res_frame.to_csv(outfile, sep="\t", index_label="gene_id")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is used as the input (for some tasks a
    comma separated list of files). ``--task`` selects the ``TS``
    function that is run:

    * ``cluster`` -> ``TS.treeCutting``
    * ``clustagree`` -> ``TS.clusterAgreement`` (``--method resample``)
      or ``TS.clusterAverage`` (``--method replicate``)
    * ``consensus-cluster`` -> ``TS.consensusClustering``
    * ``pca`` -> ``TS.clusterPCA``; the input is ``cluster_file,infile``

    The returned data frame is written to ``options.stdout`` as a
    tab-separated table with the index labelled ``gene_id``.

    Raises
    ------
    ValueError
        if ``--task`` is not one of the tasks above, or if the
        ``clustagree`` task is requested without ``--method``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--task", dest="task", type="string",
                      help="analysis task to be executed")
    parser.add_option("--infile", dest="infile", type="string",
                      help="input file path")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("replicate", "resample"),
                      help="whether to use replicate or resample "
                      "for consensus clustering.")
    parser.add_option("--cluster-algorithm", dest="cluster", type="string",
                      help="hierarchical clustering algorithm")
    parser.add_option("--expression-file", dest="express", type="string",
                      help="matching expression data from input"
                      " distance matrix")
    parser.add_option("--cluster-file", dest="clustfile", type="string",
                      help="file to output cluster labels to")
    parser.add_option("--output-file", dest="outfile", type="string",
                      help="output file to write to")
    parser.add_option("--cut-height", dest="cutHeight", type="string",
                      help="threshold at which to define consensus clusters "
                      "as valid")
    parser.add_option("--split-clusters", dest="split", action="store_true",
                      help="switch for using deepSplit in tree cutting")
    parser.add_option("--cluster-size", dest="cluster_size", type="int",
                      help="minimum cluster size for tree cutting. Clusters "
                      "with fewer than this many objects will be merged with "
                      "nearest cluster. Default=30")
    parser.add_option("--image-dir", dest="images_dir", type="string",
                      help="directory to write plots/figures to")

    # Defaults must be registered before parsing; set afterwards they
    # never reach `options`, and the int()/float() conversions below
    # would receive None whenever an option is omitted.
    parser.set_defaults(cutHeight=0,
                        conditions=None,
                        split=False,
                        cluster_size=30)

    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    if options.task == "cluster":
        data_frame = TS.treeCutting(infile=infile,
                                    expression_file=options.express,
                                    cluster_file=options.clustfile,
                                    cluster_algorithm=options.cluster,
                                    deepsplit=options.split)

    elif options.task == "clustagree":
        if options.method == "resample":
            data_frame = TS.clusterAgreement(infile)
        elif options.method == "replicate":
            file_list = infile.split(",")
            data_frame = TS.clusterAverage(file_list)
        else:
            raise ValueError("task clustagree requires --method "
                             "(replicate or resample)")

    elif options.task == "consensus-cluster":
        min_size = int(options.cluster_size)
        data_frame = TS.consensusClustering(infile=infile,
                                            cutHeight=float(options.cutHeight),
                                            cluster_algorithm=options.cluster,
                                            min_size=min_size,
                                            deepsplit=options.split)

    elif options.task == "pca":
        # input is "<cluster file>,<expression file>"
        files = infile.split(",")
        infile = files[1]
        cluster_file = files[0]
        data_frame = TS.clusterPCA(infile=infile,
                                   cluster_file=cluster_file,
                                   image_dir=options.images_dir)

    else:
        # without this, the to_csv call below fails on an unbound name
        raise ValueError(
            "unknown --task %r: expected one of cluster, clustagree, "
            "consensus-cluster, pca" % (options.task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is opened as the input table; if that
    raises IOError, ``options.stdin`` is read instead. The table, a
    (times x replicates) MultiIndex built from ``--time`` and
    ``--replicates``, and the remaining options are passed to
    ``TS.genResampleData``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--time", dest="timepoints", type="string",
                      help="a comma-separated list of time points measured")
    parser.add_option("--replicates", dest="reps", type="string",
                      help="a comma-separated list of replicate IDs")
    parser.add_option("--condition", dest="condition", type="string",
                      help="experimental condition")
    parser.add_option("--resamples", dest="resamples", type="string",
                      help="number of times to resample replicates to"
                      " generate pseudo datasets")
    parser.add_option("--input-gtf", dest="gtf_file", type="string",
                      help="reference gtf file")
    parser.add_option("--output-file-directory", dest="output_dir",
                      type="string", help="directory to output"
                      " resampled files to")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    try:
        infile = IOTools.open_file(argv[-1], "r")
    except IOError:
        infile = options.stdin

    try:
        data_frame = pd.read_table(infile,
                                   sep="\t",
                                   index_col=0,
                                   header=0)
    finally:
        # release the handle we opened; stdin is not ours to close
        if infile is not options.stdin:
            infile.close()

    time_str = options.timepoints.split(",")
    time_points = [int(x) for x in time_str]
    replicates = options.reps.split(",")
    reps = int(options.resamples)

    # the index is built from the time labels as given (strings), while
    # the integer versions are passed separately as `times`
    its = [time_str, replicates]
    midx = pd.MultiIndex.from_product(its, names=['times', 'replicates'])

    # `random_seed` is not defined by this parser; presumably it is one
    # of the common options added by E.start - TODO confirm
    TS.genResampleData(data_frame=data_frame,
                       multiple_index=midx,
                       replicates=reps,
                       sample_reps=replicates,
                       times=time_points,
                       condition=options.condition,
                       ref_gtf=options.gtf_file,
                       out_dir=options.output_dir,
                       seed=int(options.random_seed))

    # Write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is used as the input file. ``--task``
    picks the ``TS`` function to run: ``deseq`` (``TS.deseqNormalize``),
    ``masigpro`` (``TS.maSigPro``), ``sumcovar`` (``TS.covarFilter``) or
    ``average_expression`` (``TS.avTimeExpression``). The resulting data
    frame is written tab-separated to ``options.stdout`` with the index
    labelled ``gene_id``.

    Raises
    ------
    ValueError
        if ``--task`` is missing or names an unsupported task.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--time", dest="timepoints", type="string",
                      help="a comma-separated list of time points measured")
    parser.add_option("--replicates", dest="reps", type="string",
                      help="a comma-separated list of replicate IDs")
    parser.add_option("--conditions", dest="conditions", type="string",
                      help="a comma-separated list of experimental conditions")
    parser.add_option("--orders", dest="orders", type="int",
                      help="order of polynomial terms to include in "
                      "maSigPro linear model")
    parser.add_option("--fdr", dest="fdr", type="string",
                      help="FDR for calling DEGs")
    parser.add_option("--padjust", dest="padjust", type="string",
                      help="multiple testing correction to apply to "
                      "control FDR")
    parser.add_option("--stepwise", dest="stepwise", type="string",
                      help="stepwise regression to use")
    parser.add_option("--pinclude", dest="pinclude", type="string",
                      help="p-value for inclusion in stepwise regression")
    parser.add_option("--rsquared", dest="rsquared", type="string",
                      help="rsquared cut-off for DEG reporting")
    parser.add_option("--var-group", dest="vargroup", type="string",
                      help="variable group reporting. each, all or "
                      "group")
    parser.add_option("--task", dest="task", type="string",
                      help="analysis task to be executed")
    parser.add_option("--infile", dest="infile", type="string",
                      help="input file path")
    parser.add_option("--quantile", dest="quantile", type="int",
                      help="see pipeline.ini for explanation")

    # Register defaults before parsing: once the options object has been
    # built, set_defaults no longer influences it. `conditions` is the
    # only default that corresponds to an option of this script.
    parser.set_defaults(conditions=None)

    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]
    task = options.task

    if task == "deseq":
        timepoints = sorted(int(x) for x in options.timepoints.split(","))
        reps = options.reps.split(",")
        conditions = None
        if options.conditions:
            conditions = options.conditions.split(",")

        data_frame = TS.deseqNormalize(infile=infile,
                                       time_points=timepoints,
                                       reps=reps,
                                       conditions=conditions)

    elif task == "masigpro":
        data_frame = TS.maSigPro(infile=infile,
                                 order_terms=int(options.orders),
                                 fdr=float(options.fdr),
                                 adjust=options.padjust,
                                 stepwise=options.stepwise,
                                 include_p=float(options.pinclude),
                                 rsq=float(options.rsquared),
                                 var_group=options.vargroup)

    elif task == "sumcovar":
        # time points keep their command-line order for this task
        timepoints = [int(x) for x in options.timepoints.split(",")]
        reps = options.reps.split(",")
        data_frame = TS.covarFilter(infile=infile,
                                    time_points=timepoints,
                                    replicates=reps,
                                    quantile=int(options.quantile))

    elif task == "average_expression":
        data_frame = TS.avTimeExpression(infile)

    else:
        # fail with a clear message instead of an unbound `data_frame`
        raise ValueError(
            "unknown --task %r: expected one of deseq, masigpro, "
            "sumcovar, average_expression" % (task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    ``--method`` selects the summary that is written, tab-separated, to
    ``options.stdout``:

    * ``metrics``: the last element of *argv* is a header-less table of
      cluster labels (genes in rows, clusterings in columns, columns up
      to label 50 are used). For every pair of clusterings the output
      holds the value of ``TS.adjustedMutualInformation`` plus the Rand
      and adjusted Rand values derived from ``TS.randIndexes``.
    * ``summary``: the last element of *argv* is a comma separated list
      of cluster files named ``<condition>-<reference>-...``; the
      median cluster size is reported per file.
    * ``module_summary``: as ``summary``, but reports size and mean
      member length per cluster, with lengths accumulated from the GTF
      files given by ``--ref-gtf-files``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("metrics", "summary", "module_summary"),
                      help="method to summarise clustering")
    parser.add_option("--ref-gtf-files", dest="ref_gtf", type="string",
                      help="comma separated list of reference gtf files")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.method == "metrics":
        infile = argv[-1]
        E.info("loading input file: %s" % infile)
        assert infile

        df = pd.read_table(infile, sep="\t", header=None, index_col=0)
        # header=None gives integer column labels, so this is a
        # label-based slice (inclusive); .loc replaces the removed .ix
        df = df.loc[:, :50]
        cluster_combs = itertools.combinations(df.columns, 2)
        genes = df.index
        results_dict = {}
        all_clusts = {}

        E.info("setting up cluster containers")
        for i in df.columns:
            labels = df[i]
            members = {clust: [] for clust in set(labels.values.tolist())}
            for gene in genes:
                members[labels[gene]].append(gene)

            # represent each cluster by the set of gene pairs it contains
            all_clusts[i] = {
                clust: set(itertools.combinations(clust_genes, 2))
                for clust, clust_genes in members.items()
            }

        E.info("generating all pair-wise cluster comparisons")
        E.info("calculating adjusted mutual information")
        for k in cluster_combs:
            clusters1 = all_clusts[k[0]]
            clusters2 = all_clusts[k[1]]
            results_dict[k] = {
                'AMI': TS.adjustedMutualInformation(clusters1, clusters2)
            }

        res_frame = pd.DataFrame(results_dict).T
        res_frame = res_frame.reset_index()
        res_frame.drop(['level_0', 'level_1'], inplace=True, axis=1)

        # flatten rand indices and add to output dataframe
        # (element 0 is used as the adjusted Rand index, element 1 as
        # the Rand index)
        rand_arrays = TS.randIndexes(df)
        flat_adj_rand = TS.unravel_arrays(rand_arrays[0])
        flat_rand = TS.unravel_arrays(rand_arrays[1])
        res_frame['Rand_Index'] = flat_rand
        res_frame['Adjusted_Rand_Index'] = flat_adj_rand
        E.info("aggregating results")

        res_frame.to_csv(options.stdout,
                         sep="\t",
                         index_label='idx')

    elif options.method == "summary":
        infiles = argv[-1]
        list_of_files = infiles.split(",")

        file_dict = {}
        for fle in list_of_files:
            fname = fle.split("/")[-1]
            condition = fname.split("-")[0]
            ref = fname.split("-")[1]
            df_ = pd.read_table(fle, sep="\t", header=0, index_col=0)
            df_.columns = ['gene_id', 'cluster']

            # number of rows per cluster label
            clust_dict = {}
            for cluster in df_['cluster']:
                clust_dict[cluster] = clust_dict.get(cluster, 0) + 1

            med_size = np.median(list(clust_dict.values()))
            file_dict[fname] = {'condition': condition,
                                'reference': ref,
                                'median_cluster_size': med_size}

        outframe = pd.DataFrame(file_dict).T
        outframe.to_csv(options.stdout,
                        sep="\t",
                        index_label='idx')

    elif options.method == "module_summary":
        # get lncRNA/gene lengths from reference gtfs
        ref_gtfs = options.ref_gtf.split(",")
        length_dict = {}
        for ref in ref_gtfs:
            oref = IOTools.openFile(ref, "rb")
            try:
                git = GTF.transcript_iterator(GTF.iterator(oref))
                for entries in git:
                    for entry in entries:
                        # sum (end - start) of every entry per gene_id
                        length = entry.end - entry.start
                        length_dict[entry.gene_id] = (
                            length_dict.get(entry.gene_id, 0) + length)
            finally:
                # close the reference even if iteration fails
                oref.close()

        infiles = argv[-1]
        list_of_files = infiles.split(",")

        fdfs = []
        for fle in list_of_files:
            fname = fle.split("/")[-1]
            cond = fname.split("-")[0]
            refer = fname.split("-")[1]
            _df = pd.read_table(fle, sep="\t", header=0, index_col=0)
            _df.columns = ['gene_id', 'cluster']

            c_dict = {}
            # summarize over each cluster
            for clust in set(_df['cluster']):
                c_df = _df[_df['cluster'] == clust]
                lengths = [length_dict[lid] for lid in c_df['gene_id']]
                c_dict[clust] = {'cluster_size': len(c_df['gene_id']),
                                 'mean_length': np.mean(lengths),
                                 'index': (cond, refer),
                                 'module': clust}

            cdf = pd.DataFrame(c_dict).T
            # use a multindex for hierarchical indexing
            midx = pd.MultiIndex.from_tuples(cdf['index'])
            cdf.index = midx
            cdf.drop(['index'], inplace=True, axis=1)
            fdfs.append(cdf)

        # generate a single output df; concat stacks the per-file frames
        # in order (DataFrame.append is no longer available in pandas)
        s_df = pd.concat(fdfs)

        s_df.to_csv(options.stdout,
                    index_label=("condition", "reference"),
                    sep="\t")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is opened as the input table, falling
    back to ``options.stdin`` when opening raises IOError. A
    (times x replicates) MultiIndex is built from ``--time`` and
    ``--replicates`` and everything is handed to ``TS.genResampleData``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--time", dest="timepoints", type="string",
                      help="a comma-separated list of time points measured")
    parser.add_option("--replicates", dest="reps", type="string",
                      help="a comma-separated list of replicate IDs")
    parser.add_option("--condition", dest="condition", type="string",
                      help="experimental condition")
    parser.add_option("--resamples", dest="resamples", type="string",
                      help="number of times to resample replicates to"
                      " generate pseudo datasets")
    parser.add_option("--input-gtf", dest="gtf_file", type="string",
                      help="reference gtf file")
    parser.add_option("--output-file-directory", dest="output_dir",
                      type="string", help="directory to output"
                      " resampled files to")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    try:
        infile = IOTools.openFile(argv[-1], "r")
    except IOError:
        infile = options.stdin

    opened_here = infile is not options.stdin
    try:
        data_frame = pd.read_table(infile,
                                   sep="\t",
                                   index_col=0,
                                   header=0)
    finally:
        # only close what this function opened; leave stdin alone
        if opened_here:
            infile.close()

    time_str = options.timepoints.split(",")
    time_points = [int(x) for x in time_str]
    replicates = options.reps.split(",")
    reps = int(options.resamples)

    # index levels use the time labels exactly as typed; the integer
    # time points travel separately in `times`
    midx = pd.MultiIndex.from_product([time_str, replicates],
                                      names=['times', 'replicates'])

    # `random_seed` is not declared above; presumably it comes from the
    # common options added by E.Start - TODO confirm
    TS.genResampleData(data_frame=data_frame,
                       multiple_index=midx,
                       replicates=reps,
                       sample_reps=replicates,
                       times=time_points,
                       condition=options.condition,
                       ref_gtf=options.gtf_file,
                       out_dir=options.output_dir,
                       seed=int(options.random_seed))

    # Write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is the input (a comma separated list for
    some tasks) and ``--task`` chooses the ``TS`` function to run:
    ``cluster`` (``TS.treeCutting``), ``clustagree``
    (``TS.clusterAgreement`` or ``TS.clusterAverage`` depending on
    ``--method``), ``consensus-cluster`` (``TS.consensusClustering``)
    or ``pca`` (``TS.clusterPCA``, input given as
    ``cluster_file,infile``). The resulting data frame is written
    tab-separated to ``options.stdout`` with index label ``gene_id``.

    Raises
    ------
    ValueError
        if ``--task`` is unsupported, or ``clustagree`` is requested
        without ``--method``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--task", dest="task", type="string",
                      help="analysis task to be executed")
    parser.add_option("--infile", dest="infile", type="string",
                      help="input file path")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("replicate", "resample"),
                      help="whether to use replicate or resample "
                      "for consensus clustering.")
    parser.add_option("--cluster-algorithm", dest="cluster", type="string",
                      help="hierarchical clustering algorithm")
    parser.add_option("--expression-file", dest="express", type="string",
                      help="matching expression data from input"
                      " distance matrix")
    parser.add_option("--cluster-file", dest="clustfile", type="string",
                      help="file to output cluster labels to")
    parser.add_option("--output-file", dest="outfile", type="string",
                      help="output file to write to")
    parser.add_option("--cut-height", dest="cutHeight", type="string",
                      help="threshold at which to define consensus clusters "
                      "as valid")
    parser.add_option("--split-clusters", dest="split", action="store_true",
                      help="switch for using deepSplit in tree cutting")
    parser.add_option("--cluster-size", dest="cluster_size", type="int",
                      help="minimum cluster size for tree cutting. Clusters "
                      "with fewer than this many objects will be merged with "
                      "nearest cluster. Default=30")
    parser.add_option("--image-dir", dest="images_dir", type="string",
                      help="directory to write plots/figures to")

    # Defaults have to be in place before the command line is parsed.
    # Registered afterwards they are ignored, leaving cut height and
    # cluster size as None and breaking the conversions further down.
    parser.set_defaults(cutHeight=0,
                        conditions=None,
                        split=False,
                        cluster_size=30)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]
    task = options.task

    if task == "cluster":
        data_frame = TS.treeCutting(infile=infile,
                                    expression_file=options.express,
                                    cluster_file=options.clustfile,
                                    cluster_algorithm=options.cluster,
                                    deepsplit=options.split)

    elif task == "clustagree":
        if options.method == "resample":
            data_frame = TS.clusterAgreement(infile)
        elif options.method == "replicate":
            data_frame = TS.clusterAverage(infile.split(","))
        else:
            raise ValueError("task clustagree requires --method "
                             "(replicate or resample)")

    elif task == "consensus-cluster":
        data_frame = TS.consensusClustering(
            infile=infile,
            cutHeight=float(options.cutHeight),
            cluster_algorithm=options.cluster,
            min_size=int(options.cluster_size),
            deepsplit=options.split)

    elif task == "pca":
        # the cluster file comes first, the expression file second
        files = infile.split(",")
        infile = files[1]
        cluster_file = files[0]
        data_frame = TS.clusterPCA(infile=infile,
                                   cluster_file=cluster_file,
                                   image_dir=options.images_dir)

    else:
        # fail with a clear message instead of an unbound `data_frame`
        raise ValueError(
            "unknown --task %r: expected one of cluster, clustagree, "
            "consensus-cluster, pca" % (task,))

    data_frame.to_csv(options.stdout,
                      sep="\t",
                      header=True,
                      index_label="gene_id")

    # Write footer and output benchmark information.
    E.Stop()
def _parse_slice_bounds(path):
    """Return the ``(start, end)`` integers encoded in the file name of
    *path*: the fourth ``-``-separated field of the base name, which has
    the form ``<start>_<end>``."""
    bounds = path.split("/")[-1].split("-")[3].split("_")
    return int(bounds[0]), int(bounds[1])


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last positional argument is the input file. Without
    ``--parallel`` it holds the expression table. With ``--parallel``
    the table is read from ``--expression-file`` and the input file
    name only encodes which slice of genes to use as columns.

    Rows labelled ``times`` or ``replicates`` are removed, the rest is
    coerced to numeric, and the distance table is computed with
    ``TS.dtwWrapper`` (``dtw``) or ``TS.correlateDistanceMetric``
    (``cross-correlate``, ``temporal-correlate``). The result is written
    tab-separated to ``--out`` if given, otherwise to
    ``options.stdout``.

    Raises
    ------
    ValueError
        if ``--distance-metric`` is not one of the supported metrics.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")
    parser.add_option("--k", dest="k", type="int", default=0,
                      help="value of k to adjust adaptive tuning function")
    parser.add_option("--out", dest="outfile", type="string",
                      help="output file name")
    parser.add_option("--expression-file", dest="expr", type="string",
                      help="file containing expression data")
    parser.add_option("--parallel", dest="parallel", action="store_true",
                      default=False, help="switches on parallel, will"
                      " split distance matrix into relevant number of"
                      " slices.  Start-end positions are defined by"
                      " the file name.")
    parser.add_option("--distance-metric", dest="dist_metric", type="string",
                      help="distance metric to use for dissimilarity of time "
                      "series objects.  Choices: dtw, cross-correlate, "
                      "temporal-correlate. Default=dtw")
    parser.add_option("--lag", dest="lag", type="string",
                      help="cross correlation lag to report")

    # Defaults must be registered before parsing to have any effect.
    # `dist_metric` gets the default that its help text documents.
    parser.set_defaults(lag=0, k=0, dist_metric="dtw")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infile = args[-1]

    metric = options.dist_metric
    if metric not in ("dtw", "cross-correlate", "temporal-correlate"):
        raise ValueError(
            "unknown --distance-metric %r: expected one of dtw, "
            "cross-correlate, temporal-correlate" % (metric,))

    if options.parallel:
        datfile = options.expr
    else:
        datfile = infile

    data = pd.read_table(datfile,
                         sep="\t",
                         index_col=0,
                         header=0)

    # data should already be sorted in time-series order
    # the time and replicate rows need to be dropped to ensure only the
    # gene data is passed into the distance functions.
    # Filtering by label removes whichever of the two rows is present;
    # a failed drop of 'times' no longer leaves 'replicates' in place.
    data = data.loc[~data.index.isin(['times', 'replicates'])]
    genes = data.index

    # coerce every column to numeric, unparseable values become NaN
    # (replaces DataFrame.convert_objects, which pandas has removed)
    data = data.apply(pd.to_numeric, errors="coerce")

    # all pair-wise combinations are computed by the TS wrappers; in
    # parallel mode only the slice named in the file name is used as
    # the column set
    if options.parallel:
        start_idx, end_idx = _parse_slice_bounds(infile)
        columns = genes[start_idx:end_idx]
    else:
        columns = genes

    if metric == "dtw":
        df_ = TS.dtwWrapper(data=data,
                            rows=genes,
                            columns=columns,
                            k=options.k)
    elif metric == "cross-correlate":
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=metric,
                                         lag=int(options.lag))
    else:
        # temporal-correlate takes no lag
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=metric)

    if not options.outfile:
        df_.to_csv(options.stdout, sep="\t")
    else:
        df_.to_csv(options.outfile, sep="\t")

    # write footer and output benchmark information
    E.stop()
def main(argv=None):
    """script main.

    Summarise clustering results according to ``--method``:

    * ``metrics`` - read a table of clusterings (one column per
      clustering, genes as index) and report the adjusted mutual
      information, Rand index and adjusted Rand index for every pair of
      clusterings.
    * ``summary`` - for a comma separated list of gene/cluster tables,
      report the median cluster size per file.
    * ``module_summary`` - for a comma separated list of gene/cluster
      tables, report size and mean gene length of every cluster, using
      lengths accumulated from the ``--ref-gtf-files`` GTF files.

    The input file (or comma separated file list) is taken from the last
    element of *argv*.  File names are expected to look like
    ``<condition>-<reference>-...``.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        If the input file name is empty, or if ``module_summary`` is
        requested without ``--ref-gtf-files``.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("metrics", "summary", "module_summary"),
                      help="method to summarise clustering")

    parser.add_option("--ref-gtf-files", dest="ref_gtf", type="string",
                      help="comma separated list of reference gtf files")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.method == "metrics":
        infile = argv[-1]
        E.info("loading input file: %s" % infile)
        # explicit check rather than assert, which is stripped under -O
        if not infile:
            raise ValueError("no input file supplied")

        df = pd.read_table(infile,
                           sep="\t",
                           header=None,
                           index_col=0)
        # only the columns up to label 50 are compared
        df = df.ix[:, :50]
        cluster_combs = itertools.combinations(df.columns, 2)
        genes = df.index
        results_dict = {}
        all_clusts = {}

        E.info("setting up cluster containers")
        for i in df.columns:
            # group the genes by their cluster label in a single pass
            members = {}
            for gene, label in zip(genes, df[i].values.tolist()):
                members.setdefault(label, []).append(gene)

            # represent every cluster as the set of its co-clustered
            # gene pairs
            cluster_dict = {}
            for label, clust_genes in members.items():
                cluster_dict[label] = set(
                    itertools.combinations(clust_genes, 2))
            all_clusts[i] = cluster_dict

        E.info("generating all pair-wise cluster comparisons")
        E.info("calculating adjusted mutual information")
        for k in cluster_combs:
            clusters1 = all_clusts[k[0]]
            clusters2 = all_clusts[k[1]]
            results_dict[k] = {
                'AMI': TS.adjustedMutualInformation(clusters1, clusters2)}

        res_frame = pd.DataFrame(results_dict).T
        res_frame = res_frame.reset_index()
        res_frame.drop(['level_0', 'level_1'], inplace=True, axis=1)

        # flatten rand indices and add to output dataframe
        # NOTE(review): assumes the flattened arrays are ordered like the
        # rows of res_frame - confirm against TS.unravel_arrays
        rand_arrays = TS.randIndexes(df)
        flat_adj_rand = TS.unravel_arrays(rand_arrays[0])
        flat_rand = TS.unravel_arrays(rand_arrays[1])
        res_frame['Rand_Index'] = flat_rand
        res_frame['Adjusted_Rand_Index'] = flat_adj_rand
        E.info("aggregating results")

        res_frame.to_csv(options.stdout,
                         sep="\t",
                         index_label='idx')

    elif options.method == "summary":
        infiles = argv[-1]
        list_of_files = infiles.split(",")

        file_dict = {}
        for fle in list_of_files:
            fname = fle.split("/")[-1]
            name_fields = fname.split("-")
            condition = name_fields[0]
            ref = name_fields[1]
            df_ = pd.read_table(fle,
                                sep="\t",
                                header=0,
                                index_col=0)
            df_.columns = ['gene_id', 'cluster']

            # number of genes per cluster; passing a plain array keeps
            # np.median working on both Python 2 and Python 3
            cluster_sizes = df_['cluster'].value_counts()
            med_size = np.median(cluster_sizes.values)

            file_dict[fname] = {'condition': condition,
                                'reference': ref,
                                'median_cluster_size': med_size}

        outframe = pd.DataFrame(file_dict).T
        outframe.to_csv(options.stdout,
                        sep="\t",
                        index_label='idx')

    elif options.method == "module_summary":
        if not options.ref_gtf:
            raise ValueError("module_summary requires --ref-gtf-files")

        # get lncRNA/gene lengths from reference gtfs
        ref_gtfs = options.ref_gtf.split(",")
        length_dict = {}
        for ref in ref_gtfs:
            oref = IOTools.openFile(ref, "rb")
            # make sure the handle is released even if parsing fails
            try:
                git = GTF.transcript_iterator(GTF.iterator(oref))
                for gene in git:
                    for trans in gene:
                        # accumulate (end - start) per gene_id
                        length = trans.end - trans.start
                        length_dict[trans.gene_id] = (
                            length_dict.get(trans.gene_id, 0) + length)
            finally:
                oref.close()

        infiles = argv[-1]
        list_of_files = infiles.split(",")

        fdfs = []
        for fle in list_of_files:
            name_fields = fle.split("/")[-1].split("-")
            cond = name_fields[0]
            refer = name_fields[1]
            _df = pd.read_table(fle,
                                sep="\t",
                                header=0,
                                index_col=0)
            _df.columns = ['gene_id', 'cluster']
            clusters = set(_df['cluster'])
            c_dict = {}
            # summarize over each cluster
            for clust in clusters:
                c_df = _df[_df['cluster'] == clust]
                lengths = [length_dict[lid] for lid in c_df['gene_id']]
                c_dict[clust] = {'cluster_size': len(c_df['gene_id']),
                                 'mean_length': np.mean(lengths),
                                 'index': (cond, refer),
                                 'module': clust}
            cdf = pd.DataFrame(c_dict).T
            # use a multindex for hierarchical indexing
            midx = pd.MultiIndex.from_tuples(cdf['index'])
            cdf.index = midx
            cdf.drop(['index'], inplace=True, axis=1)
            fdfs.append(cdf)

        # generate a single output df; a single concat avoids re-copying
        # the accumulated frame once per input file
        s_df = pd.concat(fdfs)

        s_df.to_csv(options.stdout,
                    index_label=("condition", "reference"),
                    sep="\t")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Compute a pair-wise distance matrix between the gene rows of a
    time-series expression table and write it as a tab-separated table.

    parses command line options in sys.argv, unless *argv* is given.

    The last positional argument is the input file.  Without
    ``--parallel`` it is the expression table itself.  With ``--parallel``
    the expression table is read from ``--expression-file`` and the
    positional file name only encodes the slice of genes to use as
    columns: the fourth ``-`` separated field of its base name must look
    like ``<start>_<end>``.

    Raises
    ------
    ValueError
        If no input file is given, or if ``--parallel`` is used without
        ``--expression-file``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--k", dest="k", type="int", default=0,
                      help="value of k to adjust adaptive tuning function")

    parser.add_option("--out", dest="outfile", type="string",
                      help="output file name")

    parser.add_option("--expression-file", dest="expr", type="string",
                      help="file containing expression data")

    parser.add_option("--parallel", dest="parallel", action="store_true",
                      default=False,
                      help="switches on parallel, will"
                      " split distance matrix into relevant number of"
                      " slices.  Start-end positions are defined by"
                      " the file name.")

    # the help text promises dtw as the default, so actually make it one and
    # reject unknown metrics at parse time instead of failing later
    parser.add_option("--distance-metric", dest="dist_metric", type="choice",
                      choices=("dtw", "cross-correlate",
                               "temporal-correlate"),
                      default="dtw",
                      help="distance metric to use for dissimilarity of time "
                      "series objects.  Choices: dtw, cross-correlate, "
                      "temporal-correlate. Default=dtw")

    parser.add_option("--lag", dest="lag", type="string",
                      help="cross correlation lag to report")

    # defaults only take effect when registered before the options are parsed
    parser.set_defaults(lag=0, k=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if not args:
        raise ValueError("no input file supplied")
    infile = args[-1]

    if options.parallel:
        if not options.expr:
            raise ValueError("--parallel requires --expression-file")
        datfile = options.expr
    else:
        datfile = infile

    data = pd.read_table(datfile, sep="\t", index_col=0, header=0)

    # data should already be sorted in time-series order
    # the time and replicate rows need to be dropped to ensure only the
    # gene data is passed into the distance functions.
    # Each row is dropped independently so that a missing 'times' row does
    # not prevent 'replicates' from being removed; depending on the pandas
    # version a missing label raises ValueError or KeyError.
    for label in ("times", "replicates"):
        try:
            data.drop([label], inplace=True, axis=0)
        except (ValueError, KeyError):
            # row not present - nothing to drop
            pass

    genes = data.index
    data = data.convert_objects(convert_numeric=True)

    # select the genes used as columns of the distance matrix: either the
    # slice encoded in the file name (parallel mode) or all genes
    if options.parallel:
        # strip any directory part first so that dashes in the path cannot
        # shift the field that holds "<start>_<end>"
        bounds = infile.split("/")[-1].split("-")[3].split("_")
        start_idx = int(bounds[0])
        end_idx = int(bounds[1])
        columns = genes[start_idx:end_idx]
    else:
        columns = genes

    if options.dist_metric == "dtw":
        df_ = TS.dtwWrapper(data=data,
                            rows=genes,
                            columns=columns,
                            k=options.k)
    elif options.dist_metric == "cross-correlate":
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=options.dist_metric,
                                         lag=int(options.lag))
    elif options.dist_metric == "temporal-correlate":
        # the lag is deliberately not passed for this metric
        df_ = TS.correlateDistanceMetric(data=data,
                                         rows=genes,
                                         columns=columns,
                                         method=options.dist_metric)
    else:
        raise ValueError("unknown distance metric: %s" % options.dist_metric)

    # fall back to stdout when no output file name was given
    if not options.outfile:
        df_.to_csv(options.stdout, sep="\t")
    else:
        df_.to_csv(options.outfile, sep="\t")

    # write footer and output benchmark information
    E.Stop()