def gen_clusters_from_data(fmeth, min_clust_size=3, max_clust_size=200,
                           n=20, w=0.5):
    """Yield what ``generate`` produces for size-filtered clusters of `fmeth`.

    The column names are taken from the first line of `fmeth` (tab-split,
    first field dropped).  The features of `fmeth` are clustered with
    ``aclust`` using fixed settings (max_dist=500, max_skip=1, linkage=0.5)
    and only clusters whose length lies in
    [`min_clust_size`, `max_clust_size`] (both inclusive) are kept.

    :param fmeth: methylation input handed to ``nopen`` and
                  ``feature.feature_gen``.
    :param min_clust_size: smallest cluster length that is kept.
    :param max_clust_size: largest cluster length that is kept.
    :param n: forwarded unchanged to ``generate``.
    :param w: forwarded unchanged to ``generate``.
    """
    first_line = nopen(fmeth).readline()
    header = first_line.rstrip().split("\t")[1:]

    probes = feature.feature_gen(fmeth)
    candidates = aclust(probes, max_dist=500, max_skip=1, linkage=0.5)

    def size_ok(clust):
        # inclusive on both ends
        return min_clust_size <= len(clust) <= max_clust_size

    kept = (clust for clust in candidates if size_ok(clust))

    for simulated in generate(kept, header, n=n, w=w):
        yield simulated
def gen_clusters_from_data(fmeth, min_clust_size=3, max_clust_size=200,
                           n=20, w=0.5):
    """Yield what ``generate`` produces for size-filtered clusters of `fmeth`.

    The column names are taken from the first line of `fmeth` (tab-split,
    first field dropped).  The features of `fmeth` are clustered with
    ``aclust`` using fixed settings (max_dist=500, max_skip=1, linkage=0.5)
    and only clusters whose length lies in
    [`min_clust_size`, `max_clust_size`] (both inclusive) are kept.

    :param fmeth: methylation input handed to ``nopen`` and
                  ``feature.feature_gen``.
    :param min_clust_size: smallest cluster length that is kept.
    :param max_clust_size: largest cluster length that is kept.
    :param n: forwarded unchanged to ``generate``.
    :param w: forwarded unchanged to ``generate``.
    """
    first_line = nopen(fmeth).readline()
    header = first_line.rstrip().split("\t")[1:]

    probes = feature.feature_gen(fmeth)
    candidates = aclust(probes, max_dist=500, max_skip=1, linkage=0.5)

    def size_ok(clust):
        # inclusive on both ends
        return min_clust_size <= len(clust) <= max_clust_size

    kept = (clust for clust in candidates if size_ok(clust))

    for simulated in generate(kept, header, n=n, w=w):
        yield simulated
def main(argv=sys.argv[1:]):
    """Command-line entry point: simulate case/control methylation data.

    The probes of the input methylation file are clustered with ``aclust``
    and every cluster is passed to ``simulate_cluster``.  Clusters with at
    least ``--min-cluster-size`` probes are simulated with the weight given
    by ``-w``; smaller ones are simulated with weight 0.

    Files written:

    * ``<prefix>.covs.txt``     -- ``id``/``case`` table: n ``case_*`` rows
      (case=1) followed by n ``ctrl_*`` rows (case=0).
    * ``<prefix>.meth.txt``     -- a ``probe`` + sample-id header line, then
      the ``to_csv`` output of every simulated cluster.
    * ``<prefix>.clusters.txt`` -- chrom, start, end, w and probe list of
      each cluster that met the size threshold.

    :param argv: list of command-line arguments to parse.  NOTE: the default
                 ``sys.argv[1:]`` is evaluated once, when this function is
                 defined.
    """
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--n-samples", type=int, default=20,
                    help="number of samples from each group to simulate")
    ap.add_argument("-w", type=float, default=0,
                    help="weight parameter 0 is random, 1 is strong separation"
                         " between simulated groups")
    ap.add_argument("methylation",
                    help="input methylation data from which to simulate data")
    ap.add_argument("prefix",
                    help="output prefix. prefix.meth.txt, prefix.covs.txt "
                         "and prefix.clusters.txt"
                         " will be created")

    cp = ap.add_argument_group('clustering parameters')
    cp.add_argument('--rho-min', type=float, default=0.3,
                    help="minimum correlation to merge 2 probes")
    cp.add_argument('--min-cluster-size', type=int, default=2,
                    help="minimum cluster size on which to run model: "
                         "must be at least 2")
    cp.add_argument('--linkage', choices=['single', 'complete'],
                    default='complete', help="linkage method")
    cp.add_argument('--max-dist', default=500, type=int,
                    help="maximum distance beyond which a probe can not be"
                         " added to a cluster")

    args = ap.parse_args(argv)

    # Sample names: first line of the input, minus the leading probe column.
    samples = np.array(
        nopen(args.methylation).readline().rstrip().split("\t")[1:])

    cluster_gen = aclust(
        feature.feature_gen(args.methylation, rho_min=args.rho_min),
        max_dist=args.max_dist, max_skip=1, linkage=args.linkage)

    n, w = args.n_samples, args.w
    case_ids = ["case_%i" % i for i in range(n)]
    ctrl_ids = ["ctrl_%i" % i for i in range(n)]

    with nopen("%s.covs.txt" % args.prefix, "w") as fh_covs:
        fh_covs.write("id\tcase\n")
        fh_covs.write("\n".join(["%s\t1" % s for s in case_ids] +
                                ["%s\t0" % s for s in ctrl_ids]) + "\n")

    # Context-manage both outputs so they are flushed and closed even if
    # the simulation raises part-way through.
    with nopen("%s.meth.txt" % args.prefix, "w") as fh_meth, \
            nopen("%s.clusters.txt" % args.prefix, "w") as fh_clst:
        fh_meth.write("probe\t" + "\t".join(case_ids + ctrl_ids) + "\n")
        fh_clst.write("chrom\tstart\tend\tw\tprobes\n")

        for ocluster in cluster_gen:
            is_cluster = len(ocluster) >= args.min_cluster_size
            # Undersized clusters are still simulated and written to the
            # methylation output, but with weight 0.
            cluster = simulate_cluster(ocluster, samples, n,
                                       w if is_cluster else 0)
            cluster.to_csv(fh_meth, sep="\t", index=True, header=False,
                           float_format="%.3f")
            if not is_cluster:
                continue

            chrom = ocluster[0].group
            start, end = ocluster[0].start, ocluster[-1].end
            probes = ",".join("%s:%i" % (o.group, o.end) for o in ocluster)
            fh_clst.write("{chrom}\t{start}\t{end}\t{w}\t{probes}".format(
                chrom=chrom, start=start, end=end, w=w, probes=probes) + "\n")

    sys.stderr.write("wrote: %s\n" % (fh_meth.name,))
def main(argv=sys.argv[1:]):
    """Command-line entry point: simulate case/control methylation data.

    The probes of the input methylation file are clustered with ``aclust``
    and every cluster is passed to ``simulate_cluster``.  Clusters with at
    least ``--min-cluster-size`` probes are simulated with the weight given
    by ``-w``; smaller ones are simulated with weight 0.

    Files written:

    * ``<prefix>.covs.txt``     -- ``id``/``case`` table: n ``case_*`` rows
      (case=1) followed by n ``ctrl_*`` rows (case=0).
    * ``<prefix>.meth.txt``     -- a ``probe`` + sample-id header line, then
      the ``to_csv`` output of every simulated cluster.
    * ``<prefix>.clusters.txt`` -- chrom, start, end, w and probe list of
      each cluster that met the size threshold.

    :param argv: list of command-line arguments to parse.  NOTE: the default
                 ``sys.argv[1:]`` is evaluated once, when this function is
                 defined.
    """
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--n-samples", type=int, default=20,
                    help="number of samples from each group to simulate")
    ap.add_argument("-w", type=float, default=0,
                    help="weight parameter 0 is random, 1 is strong separation"
                         " between simulated groups")
    ap.add_argument("methylation",
                    help="input methylation data from which to simulate data")
    ap.add_argument("prefix",
                    help="output prefix. prefix.meth.txt, prefix.covs.txt "
                         "and prefix.clusters.txt"
                         " will be created")

    cp = ap.add_argument_group('clustering parameters')
    cp.add_argument('--rho-min', type=float, default=0.3,
                    help="minimum correlation to merge 2 probes")
    cp.add_argument('--min-cluster-size', type=int, default=2,
                    help="minimum cluster size on which to run model: "
                         "must be at least 2")
    cp.add_argument('--linkage', choices=['single', 'complete'],
                    default='complete', help="linkage method")
    cp.add_argument('--max-dist', default=500, type=int,
                    help="maximum distance beyond which a probe can not be"
                         " added to a cluster")

    args = ap.parse_args(argv)

    # Sample names: first line of the input, minus the leading probe column.
    samples = np.array(
        nopen(args.methylation).readline().rstrip().split("\t")[1:])

    cluster_gen = aclust(
        feature.feature_gen(args.methylation, rho_min=args.rho_min),
        max_dist=args.max_dist, max_skip=1, linkage=args.linkage)

    n, w = args.n_samples, args.w
    case_ids = ["case_%i" % i for i in range(n)]
    ctrl_ids = ["ctrl_%i" % i for i in range(n)]

    with nopen("%s.covs.txt" % args.prefix, "w") as fh_covs:
        fh_covs.write("id\tcase\n")
        fh_covs.write("\n".join(["%s\t1" % s for s in case_ids] +
                                ["%s\t0" % s for s in ctrl_ids]) + "\n")

    # Context-manage both outputs so they are flushed and closed even if
    # the simulation raises part-way through.
    with nopen("%s.meth.txt" % args.prefix, "w") as fh_meth, \
            nopen("%s.clusters.txt" % args.prefix, "w") as fh_clst:
        fh_meth.write("probe\t" + "\t".join(case_ids + ctrl_ids) + "\n")
        fh_clst.write("chrom\tstart\tend\tw\tprobes\n")

        for ocluster in cluster_gen:
            is_cluster = len(ocluster) >= args.min_cluster_size
            # Undersized clusters are still simulated and written to the
            # methylation output, but with weight 0.
            cluster = simulate_cluster(ocluster, samples, n,
                                       w if is_cluster else 0)
            cluster.to_csv(fh_meth, sep="\t", index=True, header=False,
                           float_format="%.3f")
            if not is_cluster:
                continue

            chrom = ocluster[0].group
            start, end = ocluster[0].start, ocluster[-1].end
            probes = ",".join("%s:%i" % (o.group, o.end) for o in ocluster)
            fh_clst.write("{chrom}\t{start}\t{end}\t{w}\t{probes}".format(
                chrom=chrom, start=start, end=end, w=w, probes=probes) + "\n")

    sys.stderr.write("wrote: %s\n" % (fh_meth.name,))