def __init__(self, **kwargs):
    """Load chromosome statistics and expand the target regions per contig.

    Keyword arguments read (all are required keys):
        low_dep_cut: stored as ``self.LowDepCut`` after ``float()``.
        correct_win_len: stored as ``self.CorrectWinLen`` after ``int()``.
        correct_shift_len: stored as ``self.CorrectShiftLen`` after ``int()``.
        chromstat: path given to ``SaveLoad``; the loaded object is indexed
            as ``chrom_stat[sample][contig]`` below.
        chrom: optional comma-separated list of contigs to keep; a falsy
            value keeps every contig.
        indir: directory, stored as an absolute path in ``self.indir``.
        region: tab-separated text file whose first three columns are
            contig, start and stop.

    After construction ``self.bed[contig]`` is the sorted list of every
    position in ``start..stop`` (both ends included) over all region lines
    of that contig; contigs without regions map to an empty list.
    """
    self.LowDepCut = float(kwargs["low_dep_cut"])
    self.CorrectWinLen = int(kwargs["correct_win_len"])
    self.CorrectShiftLen = int(kwargs["correct_shift_len"])
    chroms = SaveLoad(os.path.abspath(kwargs["chromstat"]))
    s_chrom = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
    self.chrom_stat = chroms.load()
    self.samples = sorted(self.chrom_stat.keys())
    # The contig list is taken from the first sample only.
    self.contigs = sorted(self.chrom_stat[self.samples[0]].keys(), key=_chrom_valued)
    if s_chrom:
        # Build a real list (not a lazy filter object) because self.contigs
        # is iterated again below.
        wanted = set(s_chrom)
        self.contigs = [contig for contig in self.contigs if contig in wanted]
    self.indir = os.path.abspath(kwargs["indir"])
    self.bed = dict()
    regions = defaultdict(list)
    # Set lookup keeps the per-line membership test O(1).
    known_contigs = set(self.contigs)
    # 'with' guarantees the region file is closed even if a line fails to parse.
    with open(os.path.abspath(kwargs["region"]), 'r') as bed:
        for line in bed:
            if line.startswith("#"):
                continue
            rows = line.strip().split("\t")
            if len(rows) < 3:
                continue
            chrom = str(rows[0])
            if chrom not in known_contigs:
                continue
            start = int(rows[1])
            stop = int(rows[2])
            # Both interval ends are included.
            regions[chrom].extend(range(start, stop + 1))
    for chrom in self.contigs:
        self.bed[chrom] = sorted(regions[chrom])
def __init__(self, **kwargs):
    """Collect per-chromosome CNV results and set up the annotation helpers.

    Keyword arguments read:
        indir: directory scanned for ``chr*.cnv.args`` files (stored in
            ``self.outdir``).
        correct_win_len / correct_shift_len: converted with ``int()``; a
            value of 0 falls back to 30 / 25 respectively.
        chrom: optional comma-separated contig whitelist.
        sample: optional comma-separated sample whitelist.
        dbdir: database root; every ``*/*.cnvdb.config`` below it is
            registered in the module-level ``_AnnotationDB`` under the name
            of its parent directory.
        transdb: optional path passed to ``HGVS``; when missing or falsy the
            default below ``dbdir`` is used.
        reference: optional reference FASTA path; when falsy the default
            below ``dbdir`` is used.
    """
    self.outdir = os.path.abspath(kwargs["indir"])
    self.win_len = int(kwargs["correct_win_len"]) or 30
    self.shift_len = int(kwargs["correct_shift_len"]) or 25
    self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1
    chroms = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
    samples = str(kwargs["sample"]).split(",") if kwargs["sample"] else None
    all_samples = set()
    contigs = list()
    self.cnvdata = defaultdict(dict)
    self.sample_win_data = defaultdict(dict)
    for cnv_data in glob(os.path.join(self.outdir, "chr*.cnv.args")):
        # File name minus its last two dot-separated fields ("cnv.args").
        chrom = ".".join(os.path.basename(cnv_data).split(".")[0:-2])
        if chroms is not None and chrom not in chroms:
            continue
        cnvdata = SaveLoad(cnv_data).load()
        contigs.append(chrom)
        for sample in cnvdata.keys():
            dep_f = os.path.join(self.outdir, sample,
                                 "%s.W%iS%i.fixdep.gz" % (chrom, self.win_len, self.shift_len))
            # The depth file is recorded for every sample found in the CNV
            # data, before the sample whitelist is applied.
            if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
                self.sample_win_data[chrom][sample] = dep_f
            if samples is not None and sample not in samples:
                continue
            all_samples.add(sample)
            self.cnvdata[sample][chrom] = cnvdata[sample]
    self.samples = sorted(all_samples)
    self.contigs = sorted(contigs, key=_chrom_valued)
    databases = os.path.abspath(kwargs["dbdir"])
    # Truthiness test (same as for "reference") so that a key that is present
    # but None/empty falls back to the default instead of reaching abspath().
    if kwargs.get("transdb"):
        t_db = os.path.abspath(kwargs["transdb"])
    else:
        t_db = os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")
    for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
        db = os.path.abspath(db)
        dbname = os.path.basename(os.path.dirname(db))
        _AnnotationDB[dbname].add(db)
    if kwargs["reference"]:
        self.reference = os.path.abspath(kwargs["reference"])
    else:
        self.reference = os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
    self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
    self.HGVS = HGVS(t_db)
def gc_correct(**kwargs):
    """Write a GC-corrected per-position depth table for one sample.

    Keyword arguments read:
        input: tabix-indexed depth file; the function returns ``None``
            without doing anything when the file or its ``.tbi`` is missing.
        sample: sample name; when falsy, the part of the input file name
            before the first dot is used.
        outdir: output root; results go to ``<outdir>/<sample>/``.
        wingc: ``SaveLoad`` file whose items are
            ``((chrom, start, stop), gc_content)``.
        posgc: ``SaveLoad`` file indexed as ``posgc[chrom][pos]``.
        chromstat: ``SaveLoad`` file indexed as ``chrom_stat[sample][chrom]``;
            entries expose ``.average`` and ``.ploid``.

    Steps: the mean depth of every GC window is computed (capped at six
    times the chromosome average and scaled by ``2 / ploid``), a lowess
    curve of depth against GC is fitted, and the curve is turned into a
    table of correction factors ``median / fitted_depth`` with one slot per
    0.0001 of GC. Every input position whose GC is known is then written to
    ``<sample>.Fixdep.tsv`` with its depth multiplied by the factor, and the
    result is indexed with ``pysam.tabix_index``.

    Raises:
        ValueError: if the median of the fitted depths is not positive.
    """
    from contextlib import closing

    depthf = os.path.abspath(kwargs["input"])
    if not os.path.isfile(depthf) or not os.path.isfile(depthf + '.tbi'):
        return
    if kwargs['sample']:
        sample = str(kwargs['sample'])
    else:
        sample = os.path.basename(depthf).split(".")[0]
    outdir = os.path.join(os.path.abspath(kwargs['outdir']), sample)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    out = os.path.join(outdir, "%s.Fixdep.tsv" % sample)
    wingc = SaveLoad(os.path.abspath(kwargs["wingc"])).load()
    posgc = SaveLoad(os.path.abspath(kwargs["posgc"])).load()
    chrom_stat = SaveLoad(os.path.abspath(kwargs["chromstat"])).load()

    # closing() makes sure both handles are released even when the fit
    # fails and ValueError is raised below.
    with closing(smart_open(out, 'w')) as f_out, \
            closing(pysam.TabixFile(depthf)) as dep_f:
        f_out.write("#Chrom\tPos\tFixDepth\n")

        # ---- 1. mean depth per GC window ---------------------------------
        gc_depth = list()
        windows = sorted(wingc.iteritems(),
                         key=lambda x: (_chrom_valued(x[0][0]), x[0][1]))
        for rows, gc_content in windows:
            chrom = rows[0]
            start = rows[1]
            stop = rows[2] - 1
            if chrom not in chrom_stat[sample] or chrom_stat[sample][chrom] < 1:
                continue
            try:
                depths = [int(line.strip().split("\t")[-1])
                          for line in dep_f.fetch(chrom, start, stop)]
                win_mean_dep = min(sum(depths) / float(len(depths)),
                                   6.0 * chrom_stat[sample][chrom].average)
                win_mean_dep *= 2.0 / chrom_stat[sample][chrom].ploid
            except Exception:
                # Deliberate best effort: a window that cannot be read or has
                # no records (division by zero) counts as depth 0 and is
                # dropped by the 5%-of-median filter below.
                win_mean_dep = 0.0
            gc_depth.append([gc_content, win_mean_dep])

        # ---- 2. fit depth against GC -------------------------------------
        gc_depth = DescribeArray(gc_depth, col=1)
        gcdep = gc_depth.array[gc_depth.array[:, 1] > 0.05 * gc_depth.median]
        # NOTE(review): prd is used as rows of (gc, fitted depth); presumably
        # sorted by gc -- confirm against unique_rows/lowess.
        prd = unique_rows(lowess(gcdep[:, 1], gcdep[:, 0], frac=0.25))
        mdp = np.median(prd[:, 1])
        if mdp <= 0.0:
            raise ValueError("Sample %s depth file Error !" % depthf)

        # ---- 3. correction-factor table, one slot per 0.0001 GC ----------
        # "+ 1" so that the highest bin (e.g. 10000 for a GC of exactly 1.0)
        # is a valid index; negative entries mark "not yet filled".
        size = max(10000, int(round(prd[:, 0].max(), 4) * 10000)) + 1
        loe = [-0.0001, ] * size
        lo_bin = size
        hi_bin = -1
        for gc, dp in prd:
            gc_bin = int(round(gc, 4) * 10000)
            # Track both ends explicitly instead of relying on row order.
            lo_bin = min(lo_bin, gc_bin)
            hi_bin = max(hi_bin, gc_bin)
            loe[gc_bin] = mdp / float(dp) if dp > 0 else 1.0

        # Below the lowest fitted bin: repeat its factor, capped at 10.
        low_fill = min(loe[lo_bin], 10.0)
        for i in xrange(lo_bin):
            loe[i] = low_fill

        # Gaps between fitted bins: blend the left neighbour (already
        # filled) with the next fitted bin on the right, capped at 10.
        for i in xrange(lo_bin + 1, hi_bin):
            if loe[i] < 0:
                ls = i - 1
                lv = loe[ls]
                rs = i + 1
                # Bounds test first; loe[hi_bin] is always >= 0, so the scan
                # stops there at the latest.
                while rs < hi_bin and loe[rs] < 0:
                    rs += 1
                rv = loe[rs]
                loe[i] = min((lv + (rs - float(ls)) * rv) / (rs - float(ls) + 1.0), 10.0)

        # Above the highest fitted bin: repeat its factor, capped at 10.
        high_fill = min(loe[hi_bin], 10.0)
        for i in xrange(hi_bin + 1, size):
            loe[i] = high_fill

        # ---- 4. apply the factors to every position ----------------------
        last_bin = size - 1
        for line in dep_f.fetch():
            rows = line.strip().split("\t")
            chrom = str(rows[0])
            pos = int(rows[1])
            deps = int(rows[-1])
            try:
                pos_gc = posgc[chrom][pos]
            except KeyError:
                # Positions without a known GC value are not written.
                continue
            # Clamp so that an out-of-range GC value uses the edge factor
            # instead of raising IndexError or wrapping to a negative index.
            gc_bin = max(0, min(int(round(pos_gc, 4) * 10000), last_bin))
            fixdeps = int(deps * loe[gc_bin])
            f_out.write("\t".join(map(str, [chrom, pos, fixdeps])) + '\n')

    _ = pysam.tabix_index(out, seq_col=0, start_col=1, end_col=1, force=True)
def gc_correct(**kwargs):
    """Write a GC-corrected per-position depth table for one sample.

    Keyword arguments read:
        input: tabix-indexed depth file; the function returns ``None``
            without doing anything when the file or its ``.tbi`` is missing.
        sample: sample name; when falsy, the part of the input file name
            before the first dot is used.
        outdir: output root; results go to ``<outdir>/<sample>/``.
        wingc: ``SaveLoad`` file whose items are
            ``((chrom, start, stop), gc_content)``.
        posgc: ``SaveLoad`` file indexed as ``posgc[chrom][pos]``.
        chromstat: ``SaveLoad`` file indexed as ``chrom_stat[sample][chrom]``;
            entries expose ``.average`` and ``.ploid``.

    Steps: the mean depth of every GC window is computed (capped at six
    times the chromosome average and scaled by ``2 / ploid``), a lowess
    curve of depth against GC is fitted, and the curve is turned into a
    table of correction factors ``median / fitted_depth`` with one slot per
    0.0001 of GC. Every input position whose GC is known is then written to
    ``<sample>.Fixdep.tsv`` with its depth multiplied by the factor, and the
    result is indexed with ``pysam.tabix_index``.

    Raises:
        ValueError: if the median of the fitted depths is not positive.
    """
    from contextlib import closing

    depthf = os.path.abspath(kwargs["input"])
    if not os.path.isfile(depthf) or not os.path.isfile(depthf + '.tbi'):
        return
    if kwargs['sample']:
        sample = str(kwargs['sample'])
    else:
        sample = os.path.basename(depthf).split(".")[0]
    outdir = os.path.join(os.path.abspath(kwargs['outdir']), sample)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    out = os.path.join(outdir, "%s.Fixdep.tsv" % sample)
    wingc = SaveLoad(os.path.abspath(kwargs["wingc"])).load()
    posgc = SaveLoad(os.path.abspath(kwargs["posgc"])).load()
    chrom_stat = SaveLoad(os.path.abspath(kwargs["chromstat"])).load()

    # closing() makes sure both handles are released even when the fit
    # fails and ValueError is raised below.
    with closing(smart_open(out, 'w')) as f_out, \
            closing(pysam.TabixFile(depthf)) as dep_f:
        f_out.write("#Chrom\tPos\tFixDepth\n")

        # ---- 1. mean depth per GC window ---------------------------------
        gc_depth = list()
        windows = sorted(wingc.iteritems(),
                         key=lambda x: (_chrom_valued(x[0][0]), x[0][1]))
        for rows, gc_content in windows:
            chrom = rows[0]
            start = rows[1]
            stop = rows[2] - 1
            if chrom not in chrom_stat[sample] or chrom_stat[sample][chrom] < 1:
                continue
            try:
                depths = [int(line.strip().split("\t")[-1])
                          for line in dep_f.fetch(chrom, start, stop)]
                win_mean_dep = min(sum(depths) / float(len(depths)),
                                   6.0 * chrom_stat[sample][chrom].average)
                win_mean_dep *= 2.0 / chrom_stat[sample][chrom].ploid
            except Exception:
                # Deliberate best effort: a window that cannot be read or has
                # no records (division by zero) counts as depth 0 and is
                # dropped by the 5%-of-median filter below.
                win_mean_dep = 0.0
            gc_depth.append([gc_content, win_mean_dep])

        # ---- 2. fit depth against GC -------------------------------------
        gc_depth = DescribeArray(gc_depth, col=1)
        gcdep = gc_depth.array[gc_depth.array[:, 1] > 0.05 * gc_depth.median]
        # NOTE(review): prd is used as rows of (gc, fitted depth); presumably
        # sorted by gc -- confirm against unique_rows/lowess.
        prd = unique_rows(lowess(gcdep[:, 1], gcdep[:, 0], frac=0.25))
        mdp = np.median(prd[:, 1])
        if mdp <= 0.0:
            raise ValueError("Sample %s depth file Error !" % depthf)

        # ---- 3. correction-factor table, one slot per 0.0001 GC ----------
        # "+ 1" so that the highest bin (e.g. 10000 for a GC of exactly 1.0)
        # is a valid index; negative entries mark "not yet filled".
        size = max(10000, int(round(prd[:, 0].max(), 4) * 10000)) + 1
        loe = [-0.0001, ] * size
        lo_bin = size
        hi_bin = -1
        for gc, dp in prd:
            gc_bin = int(round(gc, 4) * 10000)
            # Track both ends explicitly instead of relying on row order.
            lo_bin = min(lo_bin, gc_bin)
            hi_bin = max(hi_bin, gc_bin)
            loe[gc_bin] = mdp / float(dp) if dp > 0 else 1.0

        # Below the lowest fitted bin: repeat its factor, capped at 10.
        low_fill = min(loe[lo_bin], 10.0)
        for i in xrange(lo_bin):
            loe[i] = low_fill

        # Gaps between fitted bins: blend the left neighbour (already
        # filled) with the next fitted bin on the right, capped at 10.
        for i in xrange(lo_bin + 1, hi_bin):
            if loe[i] < 0:
                ls = i - 1
                lv = loe[ls]
                rs = i + 1
                # Bounds test first; loe[hi_bin] is always >= 0, so the scan
                # stops there at the latest.
                while rs < hi_bin and loe[rs] < 0:
                    rs += 1
                rv = loe[rs]
                loe[i] = min((lv + (rs - float(ls)) * rv) / (rs - float(ls) + 1.0), 10.0)

        # Above the highest fitted bin: repeat its factor, capped at 10.
        high_fill = min(loe[hi_bin], 10.0)
        for i in xrange(hi_bin + 1, size):
            loe[i] = high_fill

        # ---- 4. apply the factors to every position ----------------------
        last_bin = size - 1
        for line in dep_f.fetch():
            rows = line.strip().split("\t")
            chrom = str(rows[0])
            pos = int(rows[1])
            deps = int(rows[-1])
            try:
                pos_gc = posgc[chrom][pos]
            except KeyError:
                # Positions without a known GC value are not written.
                continue
            # Clamp so that an out-of-range GC value uses the edge factor
            # instead of raising IndexError or wrapping to a negative index.
            gc_bin = max(0, min(int(round(pos_gc, 4) * 10000), last_bin))
            fixdeps = int(deps * loe[gc_bin])
            f_out.write("\t".join(map(str, [chrom, pos, fixdeps])) + '\n')

    _ = pysam.tabix_index(out, seq_col=0, start_col=1, end_col=1, force=True)