def __init__(
    self, workdir, betadir="beta", mu=.003, sigma=10, step=.1, threshold=.2
):
    """Build the model and remember the settings it was created with.

    ``self.initialize`` is called with ``mu``, ``sigma`` and ``step`` and
    its result is stored as ``self.model``. If ``betadir`` is not present
    on disk, it is populated through ``sync_from_s3`` from a fixed S3
    location before construction finishes.
    """
    # The model is created first, before any other attribute is assigned.
    self.model = self.initialize(mu=mu, sigma=sigma, step=step)

    self.workdir, self.betadir = workdir, betadir

    # Only hit S3 when there is no local copy of the beta folder.
    beta_present = op.exists(betadir)
    if not beta_present:
        sync_from_s3(
            "s3://hli-mv-data-science/htang/ccn/beta", target_dir=betadir
        )

    # Keep the remaining constructor arguments on the instance.
    self.mu = mu
    self.sigma = sigma
    self.step = step
    self.threshold = threshold
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is left verbatim; implementation notes live in comments instead.
    p = OptionParser(cn.__doc__)
    p.add_option(
        "--binsize", default=1000, type="int", help="Window size along chromosome"
    )
    p.add_option(
        "--cleanup",
        default=False,
        action="store_true",
        help="Clean up downloaded s3 folder",
    )
    p.add_option(
        "--hmm",
        default=False,
        action="store_true",
        help="Run HMM caller after computing CN",
    )
    p.add_option(
        "--upload",
        default="s3://hli-mv-data-science/htang/ccn",
        help="Upload cn and seg results to s3",
    )
    p.add_option("--rebuildgc", help="Rebuild GC directory rather than pulling from S3")
    opts, args = p.parse_args(args)

    # Two positionals: reuse a sample folder already under workdir.
    # Three positionals: the last one is the S3 folder to download first.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not p.print_help())
    workdir, sample_key = args[0], args[1]
    s3dir = args[2] if nargs == 3 else None

    binsize = opts.binsize
    fastafile = opts.rebuildgc

    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)
    assert op.exists(sampledir), "Directory {} doesn't exist!".format(sampledir)

    # An existing output folder means this sample was already processed;
    # nothing below (HMM, upload, cleanup) runs in that case.
    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    # GC arrays are either rebuilt from the given FASTA or pulled from S3
    # when no local "gc" folder exists.
    gcdir = "gc"
    if fastafile:
        build_gc_array(fastafile=fastafile, n=binsize, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc", target_dir=gcdir)

    # Build GC correction table
    depth_by_gc = defaultdict(list)
    coverage = []
    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, binsize))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        cib = load_cib(cibfile)
        print(seqid, gc.shape[0], cib.shape[0], file=sys.stderr)
        coverage.append((seqid, gc, cib))
        # Every loaded sequence is corrected later, but only autosomes
        # contribute to the correction table itself.
        if seqid not in autosomes:
            continue
        for gc_value, depth in zip(gc, cib):
            depth_by_gc[gc_value].append(depth)

    gc_med = {}
    for gc_value, depths in depth_by_gc.items():
        nonzero = [d for d in depths if d]
        # Half of the median non-zero value per GC bin; presumably the
        # single-copy depth for diploid autosomes -- TODO confirm.
        med = np.median(nonzero) / 2
        gc_med[gc_value] = med
        print(gc_value, len(nonzero), med, file=sys.stderr)

    mkdir(cndir)
    lookup_beta = np.vectorize(gc_med.get)
    # Apply the GC correction over coverage
    for seqid, gc, cib in coverage:
        # The GC array is trimmed to the length of the cib array.
        beta = lookup_beta(gc[: cib.shape[0]])
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        (cib / beta).tofile(cnfile)

    # Run HMM caller if asked
    segfile = None
    if opts.hmm:
        segfile = hmm([workdir, sample_key])

    upload = opts.upload
    if upload:
        push_to_s3(upload, cndir)
        if segfile:
            push_to_s3(upload, segfile)

    if opts.cleanup:
        import shutil

        for folder in (sampledir, cndir):
            shutil.rmtree(folder)
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    # NOTE: the docstring is used as the OptionParser usage text, so it must
    # stay as-is; everything else is documented with comments.
    p = OptionParser(cn.__doc__)
    p.add_option("--binsize", default=1000, type="int",
                 help="Window size along chromosome")
    p.add_option("--cleanup", default=False, action="store_true",
                 help="Clean up downloaded s3 folder")
    p.add_option("--hmm", default=False, action="store_true",
                 help="Run HMM caller after computing CN")
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    p.add_option("--rebuildgc",
                 help="Rebuild GC directory rather than pulling from S3")
    opts, args = p.parse_args(args)

    # workdir and sample_key are required; the S3 source folder is optional
    # (without it the sample folder must already exist under workdir).
    if len(args) == 2:
        workdir, sample_key = args
        s3dir = None
    elif len(args) == 3:
        workdir, sample_key, s3dir = args
    else:
        sys.exit(not p.print_help())

    n = opts.binsize
    rebuildgc = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), \
        "Directory {} doesn't exist!".format(sampledir)

    # If the output folder is already there, return early: no HMM, upload or
    # cleanup is performed for this call.
    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    # --rebuildgc carries the FASTA file used to regenerate the GC arrays;
    # otherwise a missing local "gc" folder is fetched from S3.
    gcdir = "gc"
    if rebuildgc:
        build_gc_array(fastafile=rebuildgc, n=n, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc", target_dir=gcdir)

    # Build GC correction table
    gc_bin = defaultdict(list)  # GC value -> cib values observed at that GC
    gc_med = {}                 # GC value -> half the median non-zero cib value
    coverage = []               # (seqid, gc, cib) for every sequence loaded
    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        cib = load_cib(cibfile)
        # Diagnostics go to stderr through the print() function; the old
        # "print >> stream" statement form raises TypeError once print is a
        # function.
        print(seqid, gc.shape[0], cib.shape[0], file=sys.stderr)
        # Only autosomes feed the correction table; all loaded sequences are
        # still corrected below.
        if seqid in autosomes:
            for gci, k in zip(gc, cib):
                gc_bin[gci].append(k)
        coverage.append((seqid, gc, cib))

    for gci, k in gc_bin.items():
        nonzero_k = [x for x in k if x]
        gc_med[gci] = med = np.median(nonzero_k) / 2
        print(gci, len(nonzero_k), med, file=sys.stderr)

    mkdir(cndir)
    apply_fun = np.vectorize(gc_med.get)
    # Apply the GC correction over coverage
    for seqid, gc, cib in coverage:
        nitems = cib.shape[0]
        # Trim the GC array to the cib length before the table lookup.
        beta = apply_fun(gc[:nitems])
        beta_cn = cib / beta
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        beta_cn.tofile(cnfile)

    # Run HMM caller if asked
    segfile = hmm([workdir, sample_key]) if opts.hmm else None

    upload = opts.upload
    if upload:
        push_to_s3(upload, cndir)
        if segfile:
            push_to_s3(upload, segfile)

    if opts.cleanup:
        import shutil

        shutil.rmtree(sampledir)
        shutil.rmtree(cndir)