示例#1
0
def gen_clusters_from_data(fmeth, min_clust_size=3, max_clust_size=200, n=20, w=0.5):
    """Yield each item produced by ``generate`` for the size-filtered clusters of ``fmeth``.

    The first line of ``fmeth`` is split on tabs and its leading column is
    dropped to form the header handed to ``generate``.  Clusters come from
    ``aclust`` run over ``feature.feature_gen(fmeth)`` with fixed settings
    (``max_dist=500``, ``max_skip=1``, ``linkage=0.5``).

    :param fmeth: input passed to both ``nopen`` and ``feature.feature_gen``.
    :param min_clust_size: smallest cluster length (inclusive) that is kept.
    :param max_clust_size: largest cluster length (inclusive) that is kept.
    :param n: forwarded unchanged to ``generate``.
    :param w: forwarded unchanged to ``generate``.
    """
    # Header: first line of the file, minus the first tab-separated field.
    first_line = nopen(fmeth).readline()
    header = first_line.rstrip().split("\t")[1:]

    features = feature.feature_gen(fmeth)
    all_clusters = aclust(features, max_dist=500, max_skip=1, linkage=0.5)

    # Keep only clusters whose length lies within the inclusive bounds.
    sized_clusters = (clust for clust in all_clusters
                      if min_clust_size <= len(clust) <= max_clust_size)

    for item in generate(sized_clusters, header, n=n, w=w):
        yield item
示例#2
0
def gen_clusters_from_data(fmeth,
                           min_clust_size=3,
                           max_clust_size=200,
                           n=20,
                           w=0.5):
    """Stream the output of ``generate`` over clusters of an acceptable size.

    Reads the first line of ``fmeth`` (tab-split, first field discarded) as
    the header, clusters ``feature.feature_gen(fmeth)`` with ``aclust``
    (``max_dist=500``, ``max_skip=1``, ``linkage=0.5``), drops clusters
    whose length is outside ``[min_clust_size, max_clust_size]`` and yields
    everything ``generate`` returns for the remainder.

    ``n`` and ``w`` are passed straight through to ``generate``.
    """
    header_fields = nopen(fmeth).readline().rstrip().split("\t")
    header = header_fields[1:]  # leading column is not part of the header

    clustered = aclust(feature.feature_gen(fmeth),
                       max_dist=500,
                       max_skip=1,
                       linkage=0.5)
    # Reject clusters that are too small or too large.
    in_range = (grp for grp in clustered
                if not (len(grp) < min_clust_size
                        or len(grp) > max_clust_size))

    for produced in generate(in_range, header, n=n, w=w):
        yield produced
示例#3
0
def main(argv=sys.argv[1:]):
    """Command-line entry point: cluster a methylation file and write simulated data.

    Parses ``argv``, clusters the input with ``aclust`` and writes three
    tab-delimited files named after the ``prefix`` argument:

    * ``<prefix>.covs.txt``: an ``id``/``case`` table with ``n`` ``case_*``
      rows (case=1) followed by ``n`` ``ctrl_*`` rows (case=0).
    * ``<prefix>.meth.txt``: a header row, then the ``simulate_cluster``
      output for every cluster.
    * ``<prefix>.clusters.txt``: one row for each cluster that has at least
      ``--min-cluster-size`` members.

    Clusters below the minimum size are still simulated and written to the
    methylation file, but with a weight of 0, and are left out of the
    clusters file.

    :param argv: argument list to parse; defaults to ``sys.argv[1:]`` as
        captured when the function was defined.
    """
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--n-samples", type=int, default=20,
            help="number of samples from each group to simulate")
    ap.add_argument("-w", type=float, default=0,
            help="weight parameter 0 is random, 1 is strong separation"
            " between simulated groups")
    ap.add_argument("methylation",
            help="input methylation data from which to simulate data")
    ap.add_argument("prefix",
            help="output prefix. prefix.meth.txt, prefix.covs.txt "
            "and prefix.clusters.txt will be created")

    cp = ap.add_argument_group('clustering parameters')
    cp.add_argument('--rho-min', type=float, default=0.3,
                    help="minimum correlation to merge 2 probes")
    cp.add_argument('--min-cluster-size', type=int, default=2,
                    help="minimum cluster size on which to run model: "
                    "must be at least 2")
    cp.add_argument('--linkage', choices=['single', 'complete'],
                    default='complete', help="linkage method")
    cp.add_argument('--max-dist', default=500, type=int,
                    help="maximum distance beyond which a probe can not be"
                    " added to a cluster")

    args = ap.parse_args(argv)

    # First line of the input, tab-split, with the leading column dropped.
    # NOTE(review): this handle is not closed explicitly; what nopen returns
    # for non-file inputs is not visible here, so it is left as it was.
    samples = np.array(nopen(args.methylation).readline().rstrip().split("\t")[1:])
    cluster_gen = aclust(feature.feature_gen(args.methylation,
                                             rho_min=args.rho_min),
                         max_dist=args.max_dist,
                         max_skip=1,
                         linkage=args.linkage)

    def check_cluster(clust):
        return len(clust) >= args.min_cluster_size

    n, w = args.n_samples, args.w
    case_ids = ["case_%i" % i for i in range(n)]
    ctrl_ids = ["ctrl_%i" % i for i in range(n)]

    with nopen("%s.covs.txt" % args.prefix, "w") as fh_covs:
        fh_covs.write("id\tcase\n")
        fh_covs.write("\n".join(["%s\t1" % s for s in case_ids] +
                                ["%s\t0" % s for s in ctrl_ids]) + "\n")

    # Both output handles are closed on exit from the with-blocks, even when
    # simulation raises, so buffered rows are flushed deterministically.
    with nopen("%s.meth.txt" % args.prefix, "w") as fh_meth:
        with nopen("%s.clusters.txt" % args.prefix, "w") as fh_clst:
            fh_meth.write("probe\t" + "\t".join(case_ids + ctrl_ids) + "\n")
            fh_clst.write("chrom\tstart\tend\tw\tprobes\n")

            for ocluster in cluster_gen:
                is_cluster = check_cluster(ocluster)
                # Undersized clusters are simulated with weight 0.
                cluster = simulate_cluster(ocluster, samples, n,
                                           w if is_cluster else 0)
                cluster.to_csv(fh_meth, sep="\t", index=True, header=False,
                               float_format="%.3f")
                if not is_cluster:
                    continue
                probes = ",".join("%s:%i" % (o.group, o.end) for o in ocluster)
                fh_clst.write("{chrom}\t{start}\t{end}\t{w}\t{probes}\n".format(
                    chrom=ocluster[0].group, start=ocluster[0].start,
                    end=ocluster[-1].end, w=w, probes=probes))

    sys.stderr.write("wrote: %s\n" % fh_meth.name)
示例#4
0
def main(argv=sys.argv[1:]):
    """Parse command-line options, cluster the input and write simulated output.

    Three tab-delimited files are written, all named from ``prefix``:

    * ``<prefix>.covs.txt``: header ``id``/``case``, then ``n`` rows
      ``case_<i>``/1 followed by ``n`` rows ``ctrl_<i>``/0.
    * ``<prefix>.meth.txt``: header ``probe`` plus the case/ctrl ids, then
      whatever ``simulate_cluster(...).to_csv`` emits for each cluster.
    * ``<prefix>.clusters.txt``: ``chrom``, ``start``, ``end``, ``w`` and
      ``probes`` for each cluster with at least ``--min-cluster-size``
      members.

    Every cluster returned by ``aclust`` is simulated; those below the
    minimum size get a weight of 0 and no row in the clusters file.

    :param argv: list of arguments to parse; defaults to ``sys.argv[1:]``
        as evaluated when the function was defined.
    """
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--n-samples",
                    type=int,
                    default=20,
                    help="number of samples from each group to simulate")
    ap.add_argument("-w",
                    type=float,
                    default=0,
                    help="weight parameter 0 is random, 1 is strong separation"
                    " between simulated groups")
    ap.add_argument("methylation",
                    help="input methylation data from which to simulate data")
    ap.add_argument("prefix",
                    help="output prefix. prefix.meth.txt, prefix.covs.txt "
                    "and prefix.clusters.txt"
                    " will be created")

    cp = ap.add_argument_group('clustering parameters')
    cp.add_argument('--rho-min',
                    type=float,
                    default=0.3,
                    help="minimum correlation to merge 2 probes")
    cp.add_argument('--min-cluster-size',
                    type=int,
                    default=2,
                    help="minimum cluster size on which to run model: "
                    "must be at least 2")
    cp.add_argument('--linkage',
                    choices=['single', 'complete'],
                    default='complete',
                    help="linkage method")

    cp.add_argument('--max-dist',
                    default=500,
                    type=int,
                    help="maximum distance beyond which a probe can not be"
                    " added to a cluster")

    args = ap.parse_args(argv)

    # Sample names: first input line, tab-split, without the leading column.
    # NOTE(review): the handle used for this read is not closed explicitly;
    # nopen's return type for non-file inputs cannot be confirmed from here.
    samples = np.array(
        nopen(args.methylation).readline().rstrip().split("\t")[1:])
    cluster_gen = aclust(feature.feature_gen(args.methylation,
                                             rho_min=args.rho_min),
                         max_dist=args.max_dist,
                         max_skip=1,
                         linkage=args.linkage)

    def check_cluster(clust):
        return len(clust) >= args.min_cluster_size

    n, w = args.n_samples, args.w
    case_ids = ["case_%i" % i for i in range(n)]
    ctrl_ids = ["ctrl_%i" % i for i in range(n)]

    with nopen("%s.covs.txt" % args.prefix, "w") as fh_covs:
        cov_rows = (["%s\t1" % name for name in case_ids] +
                    ["%s\t0" % name for name in ctrl_ids])
        fh_covs.write("id\tcase\n")
        fh_covs.write("\n".join(cov_rows) + "\n")

    # The with-blocks guarantee both output files are flushed and closed,
    # including when an exception escapes the simulation loop.
    with nopen("%s.meth.txt" % args.prefix, "w") as fh_meth:
        with nopen("%s.clusters.txt" % args.prefix, "w") as fh_clst:
            fh_meth.write("probe\t" + "\t".join(case_ids + ctrl_ids) + "\n")
            fh_clst.write("chrom\tstart\tend\tw\tprobes\n")

            for ocluster in cluster_gen:
                is_cluster = check_cluster(ocluster)
                # A cluster below the minimum size is simulated with w=0.
                cluster = simulate_cluster(ocluster, samples, n,
                                           w if is_cluster else 0)
                cluster.to_csv(fh_meth,
                               sep="\t",
                               index=True,
                               header=False,
                               float_format="%.3f")
                if is_cluster:
                    first, last = ocluster[0], ocluster[-1]
                    probes = ",".join("%s:%i" % (o.group, o.end)
                                      for o in ocluster)
                    fh_clst.write(
                        "{chrom}\t{start}\t{end}\t{w}\t{probes}\n".format(
                            chrom=first.group,
                            start=first.start,
                            end=last.end,
                            w=w,
                            probes=probes))

    sys.stderr.write("wrote: %s\n" % fh_meth.name)