def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--test", dest="test", type="string", help="supply help") parser.add_option("--method", dest="method", type="choice", choices=("metrics", "summary", "module_summary"), help="method to summarise clustering") parser.add_option("--ref-gtf-files", dest="ref_gtf", type="string", help="comma separated list of reference gtf files") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) if options.method == "metrics": infile = argv[-1] E.info("loading input file: %s" % infile) assert infile df = pd.read_table(infile, sep="\t", header=None, index_col=0) df = df.ix[:, :50] cluster_combs = (x for x in itertools.combinations(df.columns, 2)) genes = df.index results_dict = {} all_clusts = {} E.info("setting up cluster containers") for i in df.columns: clusters = set(df[i].values.tolist()) cluster_dict = {} for clust in clusters: cluster_dict[clust] = [] for gene in genes: cluster_dict[df[i][gene]].append(gene) for col in clusters: col_set = set() clust_col = cluster_dict[col] gene_members = itertools.combinations(clust_col, 2) col_set.update(gene_members) cluster_dict[col] = col_set all_clusts[i] = cluster_dict E.info("generating all pair-wise cluster comparisons") E.info("calculating adjusted mutual information") for k in cluster_combs: clusters1 = all_clusts[k[0]] clusters2 = all_clusts[k[1]] metric_dict = {} metric_dict['AMI'] = TS.adjustedMutualInformation( clusters1, clusters2) results_dict[k] = metric_dict res_frame = pd.DataFrame(results_dict).T res_frame = res_frame.reset_index() res_frame.drop(['level_0'], inplace=True, axis=1) res_frame.drop(['level_1'], inplace=True, axis=1) # flatten rand indices and add to output dataframe rand_arrays = TS.randIndexes(df) flat_adj_rand = TS.unravel_arrays(rand_arrays[0]) flat_rand = TS.unravel_arrays(rand_arrays[1]) res_frame['Rand_Index'] = flat_rand res_frame['Adjusted_Rand_Index'] = flat_adj_rand E.info("aggregating results") res_frame.to_csv(options.stdout, sep="\t", index_label='idx') elif options.method == "summary": infiles = argv[-1] list_of_files = infiles.split(",") file_dict = {} for fle in list_of_files: fname = fle.split("/")[-1] condition = fname.split("-")[0] ref = fname.split("-")[1] df_ = pd.read_table(fle, sep="\t", header=0, index_col=0) df_.columns = ['gene_id', 'cluster'] clust_dict = {} for idx in df_.index: cluster = df_.loc[idx]['cluster'] gene = df_.loc[idx]['gene_id'] try: clust_dict[cluster] += 1 except KeyError: clust_dict[cluster] = 1 med_size = np.median(list(clust_dict.values())) file_dict[fname] = { 'condition': condition, 'reference': ref, 'median_cluster_size': med_size } outframe = pd.DataFrame(file_dict).T outframe.to_csv(options.stdout, sep="\t", index_label='idx') elif options.method == "module_summary": # get lncRNA/gene lengths from reference gtfs ref_gtfs = options.ref_gtf.split(",") length_dict = {} for ref in ref_gtfs: oref = IOTools.openFile(ref, "rb") git = GTF.transcript_iterator(GTF.iterator(oref)) for gene in git: for trans in gene: length = trans.end - trans.start try: length_dict[trans.gene_id] += length except KeyError: length_dict[trans.gene_id] = length oref.close() infiles = argv[-1] list_of_files = infiles.split(",") fdfs = [] for fle in list_of_files: cond = fle.split("/")[-1].split("-")[0] refer = fle.split("/")[-1].split("-")[1] _df = pd.read_table(fle, sep="\t", header=0, index_col=0) _df.columns = ['gene_id', 'cluster'] clusters = set(_df['cluster']) c_dict = {} # summarize over each cluster for clust in clusters: lengths = [] c_df = _df[_df['cluster'] == clust] for lid in c_df['gene_id']: lengths.append(length_dict[lid]) c_dict[clust] = { 'cluster_size': len(c_df['gene_id']), 'mean_length': np.mean(lengths), 'index': (cond, refer), 'module': clust } cdf = pd.DataFrame(c_dict).T # use a multindex for hierarchical indexing midx = pd.MultiIndex.from_tuples(cdf['index']) cdf.index = midx cdf.drop(['index'], inplace=True, axis=1) fdfs.append(cdf) # generate a single output df s_df = fdfs[0] fdfs.pop(0) for df in fdfs: s_df = s_df.append(df) s_df.to_csv(options.stdout, index_label=("condition", "reference"), sep="\t") # write footer and output benchmark information. E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--test", dest="test", type="string", help="supply help") parser.add_option("--method", dest="method", type="choice", choices=("metrics", "summary", "module_summary"), help="method to summarise clustering") parser.add_option("--ref-gtf-files", dest="ref_gtf", type="string", help="comma separated list of reference gtf files") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) if options.method == "metrics": infile = argv[-1] E.info("loading input file: %s" % infile) assert infile df = pd.read_table(infile, sep="\t", header=None, index_col=0) df = df.ix[:, :50] cluster_combs = (x for x in itertools.combinations(df.columns, 2)) genes = df.index results_dict = {} all_clusts = {} E.info("setting up cluster containers") for i in df.columns: clusters = set(df[i].values.tolist()) cluster_dict = {} for clust in clusters: cluster_dict[clust] = [] for gene in genes: cluster_dict[df[i][gene]].append(gene) for col in clusters: col_set = set() clust_col = cluster_dict[col] gene_members = itertools.combinations(clust_col, 2) col_set.update(gene_members) cluster_dict[col] = col_set all_clusts[i] = cluster_dict E.info("generating all pair-wise cluster comparisons") E.info("calculating adjusted mutual information") for k in cluster_combs: clusters1 = all_clusts[k[0]] clusters2 = all_clusts[k[1]] metric_dict = {} metric_dict['AMI'] = TS.adjustedMutualInformation(clusters1, clusters2) results_dict[k] = metric_dict res_frame = pd.DataFrame(results_dict).T res_frame = res_frame.reset_index() res_frame.drop(['level_0'], inplace=True, axis=1) res_frame.drop(['level_1'], inplace=True, axis=1) # flatten rand indices and add to output dataframe rand_arrays = TS.randIndexes(df) flat_adj_rand = TS.unravel_arrays(rand_arrays[0]) flat_rand = TS.unravel_arrays(rand_arrays[1]) res_frame['Rand_Index'] = flat_rand res_frame['Adjusted_Rand_Index'] = flat_adj_rand E.info("aggregating results") res_frame.to_csv(options.stdout, sep="\t", index_label='idx') elif options.method == "summary": infiles = argv[-1] list_of_files = infiles.split(",") file_dict = {} for fle in list_of_files: fname = fle.split("/")[-1] condition = fname.split("-")[0] ref = fname.split("-")[1] df_ = pd.read_table(fle, sep="\t", header=0, index_col=0) df_.columns = ['gene_id', 'cluster'] clust_dict = {} for idx in df_.index: cluster = df_.loc[idx]['cluster'] gene = df_.loc[idx]['gene_id'] try: clust_dict[cluster] += 1 except KeyError: clust_dict[cluster] = 1 med_size = np.median(clust_dict.values()) file_dict[fname] = {'condition': condition, 'reference': ref, 'median_cluster_size': med_size} outframe = pd.DataFrame(file_dict).T outframe.to_csv(options.stdout, sep="\t", index_label='idx') elif options.method == "module_summary": # get lncRNA/gene lengths from reference gtfs ref_gtfs = options.ref_gtf.split(",") length_dict = {} for ref in ref_gtfs: oref = IOTools.openFile(ref, "rb") git = GTF.transcript_iterator(GTF.iterator(oref)) for gene in git: for trans in gene: length = trans.end - trans.start try: length_dict[trans.gene_id] += length except KeyError: length_dict[trans.gene_id] = length oref.close() infiles = argv[-1] list_of_files = infiles.split(",") fdfs = [] for fle in list_of_files: cond = fle.split("/")[-1].split("-")[0] refer = fle.split("/")[-1].split("-")[1] _df = pd.read_table(fle, sep="\t", header=0, index_col=0) _df.columns = ['gene_id', 'cluster'] clusters = set(_df['cluster']) c_dict = {} # summarize over each cluster for clust in clusters: lengths = [] c_df = _df[_df['cluster'] == clust] for lid in c_df['gene_id']: lengths.append(length_dict[lid]) c_dict[clust] = {'cluster_size': len(c_df['gene_id']), 'mean_length': np.mean(lengths), 'index': (cond, refer), 'module': clust} cdf = pd.DataFrame(c_dict).T # use a multindex for hierarchical indexing midx = pd.MultiIndex.from_tuples(cdf['index']) cdf.index = midx cdf.drop(['index'], inplace=True, axis=1) fdfs.append(cdf) # generate a single output df s_df = fdfs[0] fdfs.pop(0) for df in fdfs: s_df = s_df.append(df) s_df.to_csv(options.stdout, index_label=("condition", "reference"), sep="\t") # write footer and output benchmark information. E.Stop()