Пример #1
0
 def __init__(
     self,
     workdir,
     betadir="beta",
     mu=0.003,
     sigma=10,
     step=0.1,
     threshold=0.2,
 ):
     """Build the model and make sure the beta directory is present.

     Args:
         workdir: Stored unchanged on the instance as ``self.workdir``.
         betadir: Local path expected to hold the beta files. When it does
             not exist it is populated from S3 via ``sync_from_s3``.
         mu: Forwarded to ``self.initialize`` and stored as ``self.mu``.
         sigma: Forwarded to ``self.initialize`` and stored as
             ``self.sigma``.
         step: Forwarded to ``self.initialize`` and stored as ``self.step``.
         threshold: Stored as ``self.threshold``; not used by the
             constructor itself.
     """
     # The model is created first, before any attribute exists on self.
     self.model = self.initialize(mu=mu, sigma=sigma, step=step)
     self.workdir, self.betadir = workdir, betadir

     # Only pull the beta files when there is no local copy yet.
     beta_missing = not op.exists(betadir)
     if beta_missing:
         sync_from_s3(
             "s3://hli-mv-data-science/htang/ccn/beta",
             target_dir=betadir,
         )

     self.mu, self.sigma = mu, sigma
     self.step, self.threshold = step, threshold
Пример #2
0
 def __init__(
     self,
     workdir,
     betadir="beta",
     mu=0.003,
     sigma=10,
     step=0.1,
     threshold=0.2,
 ):
     """Build the model and make sure the beta directory is present.

     Args:
         workdir: Stored unchanged on the instance as ``self.workdir``.
         betadir: Local path expected to hold the beta files. When it does
             not exist it is populated from S3 via ``sync_from_s3``.
         mu: Forwarded to ``self.initialize`` and stored as ``self.mu``.
         sigma: Forwarded to ``self.initialize`` and stored as
             ``self.sigma``.
         step: Forwarded to ``self.initialize`` and stored as ``self.step``.
         threshold: Stored as ``self.threshold``; not used by the
             constructor itself.
     """
     # The model is created first, before any attribute exists on self.
     self.model = self.initialize(mu=mu, sigma=sigma, step=step)
     self.workdir, self.betadir = workdir, betadir

     # Only pull the beta files when there is no local copy yet.
     beta_missing = not op.exists(betadir)
     if beta_missing:
         sync_from_s3(
             "s3://hli-mv-data-science/htang/ccn/beta",
             target_dir=betadir,
         )

     self.mu, self.sigma = mu, sigma
     self.step, self.threshold = step, threshold
Пример #3
0
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is runtime output and is kept verbatim; implementation notes live in
    # comments instead.
    parser = OptionParser(cn.__doc__)
    parser.add_option(
        "--binsize",
        default=1000,
        type="int",
        help="Window size along chromosome",
    )
    parser.add_option(
        "--cleanup",
        default=False,
        action="store_true",
        help="Clean up downloaded s3 folder",
    )
    parser.add_option(
        "--hmm",
        default=False,
        action="store_true",
        help="Run HMM caller after computing CN",
    )
    parser.add_option(
        "--upload",
        default="s3://hli-mv-data-science/htang/ccn",
        help="Upload cn and seg results to s3",
    )
    parser.add_option(
        "--rebuildgc",
        help="Rebuild GC directory rather than pulling from S3",
    )
    opts, args = parser.parse_args(args)

    # Two positionals: use the sample folder already under workdir.
    # Three positionals: the last one is synced into that folder first.
    # Any other count prints the help text and exits.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    workdir, sample_key = args[:2]
    s3dir = args[2] if nargs == 3 else None

    binsize = opts.binsize
    fastafile = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), (
        "Directory {} doesn't exist!".format(sampledir)
    )

    # An existing output folder means there is nothing left to do.
    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    # GC arrays are rebuilt from the given FASTA when requested; otherwise
    # they are fetched from S3 if no local "gc" folder is present.
    gcdir = "gc"
    if fastafile:
        build_gc_array(fastafile=fastafile, n=binsize, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc", target_dir=gcdir)

    # Pass 1: load the per-bin GC and cib arrays for every sequence, and
    # group the autosomal cib values by GC value.
    depth_by_gc = defaultdict(list)
    per_seq = []
    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, binsize))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc_values = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        depth = load_cib(cibfile)
        print(seqid, gc_values.shape[0], depth.shape[0], file=sys.stderr)
        if seqid in autosomes:
            for gc_value, d in zip(gc_values, depth):
                depth_by_gc[gc_value].append(d)
        per_seq.append((seqid, gc_values, depth))

    # Correction factor per GC value: half the median of the nonzero values.
    gc_med = {}
    for gc_value, depths in depth_by_gc.items():
        nonzero = [d for d in depths if d]
        half_median = np.median(nonzero) / 2
        gc_med[gc_value] = half_median
        print(gc_value, len(nonzero), half_median, file=sys.stderr)

    # Pass 2: divide each cib array by its GC-matched factor and write it out.
    mkdir(cndir)
    lookup_beta = np.vectorize(gc_med.get)
    for seqid, gc_values, depth in per_seq:
        # The GC array is truncated to the length of the cib array.
        beta = lookup_beta(gc_values[: depth.shape[0]])
        corrected = depth / beta
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        corrected.tofile(cnfile)

    # Optional HMM segmentation on the freshly written CN files.
    segfile = None
    if opts.hmm:
        segfile = hmm([workdir, sample_key])

    destination = opts.upload
    if destination:
        push_to_s3(destination, cndir)
        if segfile:
            push_to_s3(destination, segfile)

    if opts.cleanup:
        from shutil import rmtree

        for folder in (sampledir, cndir):
            rmtree(folder)
Пример #4
0
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and must be edited with care.
    #
    # Overview:
    #   1. Optionally sync the sample folder from S3 into workdir.
    #   2. Load per-bin GC and cib arrays for every sequence in `allsomes`.
    #   3. From autosomes only, compute a per-GC-value factor
    #      (median of nonzero cib values, halved).
    #   4. Write cib / factor for every sequence into "<sample_key>-cn".
    #   5. Optionally run the HMM caller, upload results and clean up.
    p = OptionParser(cn.__doc__)
    p.add_option("--binsize", default=1000, type="int",
                 help="Window size along chromosome")
    p.add_option("--cleanup", default=False, action="store_true",
                 help="Clean up downloaded s3 folder")
    p.add_option("--hmm", default=False, action="store_true",
                 help="Run HMM caller after computing CN")
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    p.add_option("--rebuildgc",
                 help="Rebuild GC directory rather than pulling from S3")
    opts, args = p.parse_args(args)

    # Two positionals: sample folder must already exist under workdir.
    # Three positionals: the last one is synced into that folder first.
    if len(args) == 2:
        workdir, sample_key = args
        s3dir = None
    elif len(args) == 3:
        workdir, sample_key, s3dir = args
    else:
        sys.exit(not p.print_help())

    n = opts.binsize
    rebuildgc = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), \
        "Directory {} doesn't exist!".format(sampledir)

    # An existing output folder means the sample was already processed.
    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    # Rebuild GC arrays from the given FASTA if requested; otherwise fetch
    # them from S3 when no local "gc" folder is present.
    gcdir = "gc"
    if rebuildgc:
        build_gc_array(fastafile=rebuildgc, n=n, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc",
                     target_dir=gcdir)

    # Build GC correction table
    gc_bin = defaultdict(list)
    gc_med = {}
    coverage = []

    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        cib = load_cib(cibfile)
        # Written via sys.stderr.write rather than the Python 2 only
        # `print >>` statement, which raises TypeError on Python 3. The
        # emitted text (str() of each item, space separated) is unchanged.
        sys.stderr.write(
            " ".join(map(str, (seqid, gc.shape[0], cib.shape[0]))) + "\n")
        # Only autosomes contribute to the correction table.
        if seqid in autosomes:
            for gci, k in zip(gc, cib):
                gc_bin[gci].append(k)
        coverage.append((seqid, gc, cib))

    for gci, k in gc_bin.items():
        nonzero_k = [x for x in k if x]
        # Half the median of the nonzero values for this GC value.
        gc_med[gci] = med = np.median(nonzero_k) / 2
        sys.stderr.write(
            " ".join(map(str, (gci, len(nonzero_k), med))) + "\n")

    mkdir(cndir)
    apply_fun = np.vectorize(gc_med.get)
    # Apply the GC correction over coverage
    for seqid, gc, cib in coverage:
        nitems = cib.shape[0]
        # The GC array is truncated to the length of the cib array.
        beta = apply_fun(gc[:nitems])
        beta_cn = cib / beta
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        beta_cn.tofile(cnfile)

    # Run HMM caller if asked
    segfile = hmm([workdir, sample_key]) if opts.hmm else None

    upload = opts.upload
    if upload:
        push_to_s3(upload, cndir)
        if segfile:
            push_to_s3(upload, segfile)

    if opts.cleanup:
        import shutil
        shutil.rmtree(sampledir)
        shutil.rmtree(cndir)