Пример #1
0
 def __init__(self, **kwargs):
     """Load per-contig statistics and expand the target regions.

     Keyword arguments (all are read with ``kwargs[...]``, so every key
     must be present):
       low_dep_cut       -- stored as ``self.LowDepCut`` (float).
       correct_win_len   -- stored as ``self.CorrectWinLen`` (int).
       correct_shift_len -- stored as ``self.CorrectShiftLen`` (int).
       chromstat         -- path given to ``SaveLoad``; the loaded object
                            is indexed as ``chrom_stat[sample][contig]``.
       chrom             -- optional comma-separated contig names; a
                            falsy value keeps every contig.
       indir             -- directory, stored as an absolute path.
       region            -- tab-separated file whose first three columns
                            are contig, start and stop.

     After construction ``self.bed[contig]`` holds the sorted list of all
     positions from start to stop (both ends included) of every accepted
     region line; contigs without regions map to an empty list.
     """
     self.LowDepCut = float(kwargs["low_dep_cut"])
     self.CorrectWinLen = int(kwargs["correct_win_len"])
     self.CorrectShiftLen = int(kwargs["correct_shift_len"])
     chroms = SaveLoad(os.path.abspath(kwargs["chromstat"]))
     s_chrom = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
     self.chrom_stat = chroms.load()
     self.samples = sorted(self.chrom_stat.keys())
     # The contig list is taken from the first sample (in sorted order).
     self.contigs = sorted(self.chrom_stat[self.samples[0]].keys(),
                           key=_chrom_valued)
     if s_chrom:
         # A list comprehension (not filter()) keeps self.contigs a real
         # list on Python 3 as well, so it can be tested and iterated
         # repeatedly below.
         wanted = set(s_chrom)
         self.contigs = [c for c in self.contigs if c in wanted]
     self.indir = os.path.abspath(kwargs["indir"])
     self.bed = dict()
     regions = defaultdict(list)
     # Set lookup instead of scanning the contig list for every line.
     known_contigs = set(self.contigs)
     # The context manager closes the region file even if parsing fails.
     with open(os.path.abspath(kwargs["region"]), 'r') as bed:
         for line in bed:
             if line.startswith("#"):
                 continue
             rows = line.strip().split("\t")
             if len(rows) < 3:
                 continue
             chrom = str(rows[0])
             if chrom not in known_contigs:
                 continue
             start = int(rows[1])
             stop = int(rows[2])
             # Both interval ends are included in the expanded positions.
             regions[chrom].extend(range(start, stop + 1))
     for chrom in self.contigs:
         self.bed[chrom] = sorted(regions[chrom])
Пример #2
0
	def __init__(self, **kwargs):
		"""Load per-contig statistics and expand the target regions.

		Keyword arguments (all are read with ``kwargs[...]``, so every key
		must be present):
		  low_dep_cut       -- stored as ``self.LowDepCut`` (float).
		  correct_win_len   -- stored as ``self.CorrectWinLen`` (int).
		  correct_shift_len -- stored as ``self.CorrectShiftLen`` (int).
		  chromstat         -- path given to ``SaveLoad``; the loaded object
		                       is indexed as ``chrom_stat[sample][contig]``.
		  chrom             -- optional comma-separated contig names; a
		                       falsy value keeps every contig.
		  indir             -- directory, stored as an absolute path.
		  region            -- tab-separated file whose first three columns
		                       are contig, start and stop.

		After construction ``self.bed[contig]`` holds the sorted list of all
		positions from start to stop (both ends included) of every accepted
		region line; contigs without regions map to an empty list.
		"""
		self.LowDepCut = float(kwargs["low_dep_cut"])
		self.CorrectWinLen = int(kwargs["correct_win_len"])
		self.CorrectShiftLen = int(kwargs["correct_shift_len"])
		chroms = SaveLoad(os.path.abspath(kwargs["chromstat"]))
		s_chrom = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
		self.chrom_stat = chroms.load()
		self.samples = sorted(self.chrom_stat.keys())
		# The contig list is taken from the first sample (in sorted order).
		self.contigs = sorted(self.chrom_stat[self.samples[0]].keys(), key=_chrom_valued)
		if s_chrom:
			# A list comprehension (not filter()) keeps self.contigs a real
			# list on Python 3 as well, so it can be tested and iterated
			# repeatedly below.
			wanted = set(s_chrom)
			self.contigs = [c for c in self.contigs if c in wanted]
		self.indir = os.path.abspath(kwargs["indir"])
		self.bed = dict()
		regions = defaultdict(list)
		# Set lookup instead of scanning the contig list for every line.
		known_contigs = set(self.contigs)
		# The context manager closes the region file even if parsing fails.
		with open(os.path.abspath(kwargs["region"]), 'r') as bed:
			for line in bed:
				if line.startswith("#"):
					continue
				rows = line.strip().split("\t")
				if len(rows) < 3:
					continue
				chrom = str(rows[0])
				if chrom not in known_contigs:
					continue
				start = int(rows[1])
				stop = int(rows[2])
				# Both interval ends are included in the expanded positions.
				regions[chrom].extend(range(start, stop + 1))
		for chrom in self.contigs:
			self.bed[chrom] = sorted(regions[chrom])
Пример #3
0
	def __init__(self, **kwargs):
		"""Collect per-contig CNV results and set up the annotation helpers.

		Keyword arguments:
		  indir             -- directory scanned for ``chr*.cnv.args`` files
		                       (stored, as an absolute path, in ``self.outdir``).
		  correct_win_len   -- window length; 0 falls back to 30.
		  correct_shift_len -- window shift; 0 falls back to 25.
		  chrom             -- optional comma-separated contigs to keep.
		  sample            -- optional comma-separated samples to keep.
		  dbdir             -- database root; every ``*/*.cnvdb.config`` below
		                       it is registered in ``_AnnotationDB`` under the
		                       name of its parent directory.
		  transdb           -- optional path passed to ``HGVS``; when missing
		                       or falsy the default below ``dbdir`` is used.
		  reference         -- optional reference path; when falsy the default
		                       below ``dbdir`` is used.

		Populates ``self.cnvdata[sample][contig]``,
		``self.sample_win_data[contig][sample]`` (path of the sample's
		``.fixdep.gz`` file when it and its ``.tbi`` index exist),
		``self.samples`` and ``self.contigs``.
		"""
		self.outdir = os.path.abspath(kwargs["indir"])
		self.win_len = int(kwargs["correct_win_len"]) or 30
		self.shift_len = int(kwargs["correct_shift_len"]) or 25
		self.contral_wins = int(100.0 / self.shift_len + 0.5) + 1
		chroms = str(kwargs["chrom"]).split(",") if kwargs["chrom"] else None
		samples = str(kwargs["sample"]).split(",") if kwargs["sample"] else None
		all_samples = set()
		contigs = list()
		self.cnvdata = defaultdict(dict)
		self.sample_win_data = defaultdict(dict)
		for cnv_file in glob(os.path.join(self.outdir, "chr*.cnv.args")):
			# "<contig>.cnv.args" -> "<contig>" (the contig name may contain dots).
			chrom = ".".join(os.path.basename(cnv_file).split(".")[0:-2])
			if chroms is not None and chrom not in chroms:
				continue
			cnvdata = SaveLoad(cnv_file).load()
			contigs.append(chrom)
			for sample in cnvdata.keys():
				dep_f = os.path.join(self.outdir, sample, "%s.W%iS%i.fixdep.gz" % (chrom, self.win_len, self.shift_len))
				# The depth file is recorded before the sample filter is applied,
				# i.e. also for samples that are not selected via ``sample``.
				if os.path.isfile(dep_f) and os.path.isfile(dep_f + '.tbi'):
					self.sample_win_data[chrom][sample] = dep_f
				if samples is not None and sample not in samples:
					continue
				all_samples.add(sample)
				self.cnvdata[sample][chrom] = cnvdata[sample]
		self.samples = sorted(all_samples)
		self.contigs = sorted(contigs, key=_chrom_valued)
		databases = os.path.abspath(kwargs["dbdir"])
		# Treat a missing, None or empty ``transdb`` alike (same convention as
		# ``reference``); previously a present-but-None value was passed to
		# os.path.abspath() and raised.
		transdb = kwargs.get("transdb")
		if transdb:
			t_db = os.path.abspath(transdb)
		else:
			t_db = os.path.join(databases, "transdb", "ncbi_anno_rel104.dbref.db")
		for db in glob(os.path.join(databases, "*", "*.cnvdb.config")):
			db = os.path.abspath(db)
			dbname = os.path.basename(os.path.dirname(db))
			# NOTE: _AnnotationDB is defined outside this method and is mutated here.
			_AnnotationDB[dbname].add(db)
		self.reference = os.path.abspath(kwargs["reference"]) if kwargs["reference"] else \
			os.path.join(databases, 'aln_db/hg19/hg19_chM_male_mask.fa')
		self.DBAnno = CNVAnnotation(self.reference, _AnnotationDB)
		self.HGVS = HGVS(t_db)
Пример #4
0
def gc_correct(**kwargs):
	"""Write a GC-corrected per-position depth table for one sample.

	Keyword arguments:
	  input     -- tabix-indexed depth file; the last column of each record
	               is read as an integer depth, the second as the position.
	  sample    -- sample name; when falsy, the part of the input file name
	               before the first "." is used.
	  outdir    -- output root; results go to ``<outdir>/<sample>/``.
	  wingc     -- path loaded with ``SaveLoad``; iterated as
	               ``(chrom, start, stop, ...) -> GC content`` per window.
	  posgc     -- path loaded with ``SaveLoad``; read as
	               ``posgc[chrom][pos] -> GC content``.
	  chromstat -- path loaded with ``SaveLoad``; read as
	               ``chrom_stat[sample][chrom]`` with ``average`` and
	               ``ploid`` attributes.

	Returns None. Nothing is done when the depth file or its ``.tbi`` index
	is missing. Otherwise ``<sample>.Fixdep.tsv`` is written and passed to
	``pysam.tabix_index``.

	Raises ValueError when the median of the fitted depth is not positive.
	"""
	depthf = os.path.abspath(kwargs["input"])
	if not os.path.isfile(depthf) or not os.path.isfile(depthf + '.tbi'):
		return
	sample = str(kwargs['sample']) if kwargs['sample'] else os.path.basename(depthf).split(".")[0]
	outdir = os.path.join(os.path.abspath(kwargs['outdir']), sample)
	if not os.path.exists(outdir):
		os.makedirs(outdir)
	out = os.path.join(outdir, "%s.Fixdep.tsv" % sample)
	wingc = SaveLoad(os.path.abspath(kwargs["wingc"])).load()
	posgc = SaveLoad(os.path.abspath(kwargs["posgc"])).load()
	chrom_stat = SaveLoad(os.path.abspath(kwargs["chromstat"])).load()
	f_out = smart_open(out, 'w')
	dep_f = None
	try:
		f_out.write("#Chrom\tPos\tFixDepth\n")
		dep_f = pysam.TabixFile(depthf)

		# 1. Mean depth per GC window, capped at 6x the contig average and
		#    scaled by 2 / ploid.
		gc_depth = list()
		for rows, gc_content in sorted(wingc.items(), key=lambda x: (_chrom_valued(x[0][0]), x[0][1])):
			chrom = rows[0]
			start = rows[1]
			stop = rows[2] - 1
			if chrom not in chrom_stat[sample] or chrom_stat[sample][chrom] < 1:
				continue
			try:
				depths = [int(line.strip().split("\t")[-1]) for line in dep_f.fetch(chrom, start, stop)]
				win_mean_dep = min(sum(depths) / float(len(depths)), 6.0 * chrom_stat[sample][chrom].average)
				win_mean_dep *= 2.0 / chrom_stat[sample][chrom].ploid
			except Exception:
				# Deliberate best effort: any failure (e.g. a window without
				# records gives a division by zero) counts as depth 0, which
				# the median-based mask below then drops.
				win_mean_dep = 0.0
			gc_depth.append([gc_content, win_mean_dep])

		# 2. Fit depth against GC content on windows above 5% of the median.
		described = DescribeArray(gc_depth, col=1)
		gcdep = described.array[described.array[:, 1] > 0.05 * described.median]
		prd = unique_rows(lowess(gcdep[:, 1], gcdep[:, 0], frac=0.25))
		mdp = np.median(prd[:, 1])
		if mdp <= 0.0:
			raise ValueError("Sample %s depth file Error !" % depthf)

		# 3. Correction-factor table indexed by int(round(gc, 4) * 10000).
		#    It gets one slot more than the largest index in use, so that a GC
		#    value rounding to the top of the table (e.g. 1.0 -> 10000) is
		#    addressable; the table used to be one entry short there.
		gc_max = prd[:, 0].max()
		lgc = max(10000, int(gc_max * 10000), int(round(gc_max, 4) * 10000)) + 1
		loe = [-0.0001, ] * lgc  # negative marks "no fitted value yet"
		gcl = lgc  # lowest fitted index
		gcj = 0  # index of the last fitted row (assumes prd is ordered by GC -- TODO confirm)
		for gc, dp in prd:
			gcj = int(round(gc, 4) * 10000)
			if gcj < gcl:
				gcl = gcj
			loe[gcj] = mdp / float(dp) if dp > 0 else 1.0
		# Below the fitted range: reuse the lowest fitted factor, capped at 10.
		for i in range(gcl):
			loe[i] = min(loe[gcl], 10.0)
		# Inside the fitted range: fill gaps from the left neighbour and the
		# next fitted value on the right.
		# NOTE(review): the weights differ from plain linear interpolation;
		# kept exactly as in the original.
		for i in range(gcl + 1, gcj):
			if loe[i] < 0:
				ls = i - 1
				lv = loe[i - 1]
				rs = i + 1
				# loe[gcj] is always a fitted (non-negative) value, so the scan
				# stops at gcj at the latest; the bound is checked before
				# indexing.
				while rs < gcj and loe[rs] < 0:
					rs += 1
				rv = loe[rs]
				loe[i] = min((lv + (rs - float(ls)) * rv) / (rs - float(ls) + 1.0), 10.0)
		# Above the fitted range: reuse the last fitted factor, capped at 10.
		for i in range(gcj + 1, lgc):
			loe[i] = min(loe[gcj], 10.0)

		# 4. Apply the factor to every position that has a GC value.
		for line in dep_f.fetch():
			rows = line.strip().split("\t")
			chrom = str(rows[0])
			pos = int(rows[1])
			deps = int(rows[-1])
			try:
				fixdeps = int(deps * loe[int(round(posgc[chrom][pos], 4) * 10000)])
			except KeyError:
				# No GC value known for this contig/position: skip it.
				continue
			f_out.write("\t".join(map(str, [chrom, pos, fixdeps])) + '\n')
	finally:
		# Release both handles on every path, including the ValueError above.
		if dep_f is not None:
			dep_f.close()
		f_out.close()
	_ = pysam.tabix_index(out, seq_col=0, start_col=1, end_col=1, force=True)
Пример #5
0
def gc_correct(**kwargs):
    """Write a GC-corrected per-position depth table for one sample.

    Keyword arguments:
      input     -- tabix-indexed depth file; the last column of each record
                   is read as an integer depth, the second as the position.
      sample    -- sample name; when falsy, the part of the input file name
                   before the first "." is used.
      outdir    -- output root; results go to ``<outdir>/<sample>/``.
      wingc     -- path loaded with ``SaveLoad``; iterated as
                   ``(chrom, start, stop, ...) -> GC content`` per window.
      posgc     -- path loaded with ``SaveLoad``; read as
                   ``posgc[chrom][pos] -> GC content``.
      chromstat -- path loaded with ``SaveLoad``; read as
                   ``chrom_stat[sample][chrom]`` with ``average`` and
                   ``ploid`` attributes.

    Returns None. Nothing is done when the depth file or its ``.tbi`` index
    is missing. Otherwise ``<sample>.Fixdep.tsv`` is written and passed to
    ``pysam.tabix_index``.

    Raises ValueError when the median of the fitted depth is not positive.
    """
    depthf = os.path.abspath(kwargs["input"])
    if not os.path.isfile(depthf) or not os.path.isfile(depthf + '.tbi'):
        return
    sample = str(kwargs['sample']) if kwargs['sample'] else os.path.basename(
        depthf).split(".")[0]
    outdir = os.path.join(os.path.abspath(kwargs['outdir']), sample)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    out = os.path.join(outdir, "%s.Fixdep.tsv" % sample)
    wingc = SaveLoad(os.path.abspath(kwargs["wingc"])).load()
    posgc = SaveLoad(os.path.abspath(kwargs["posgc"])).load()
    chrom_stat = SaveLoad(os.path.abspath(kwargs["chromstat"])).load()
    f_out = smart_open(out, 'w')
    dep_f = None
    try:
        f_out.write("#Chrom\tPos\tFixDepth\n")
        dep_f = pysam.TabixFile(depthf)

        # 1. Mean depth per GC window, capped at 6x the contig average and
        #    scaled by 2 / ploid.
        gc_depth = list()
        for rows, gc_content in sorted(wingc.items(),
                                       key=lambda x:
                                       (_chrom_valued(x[0][0]), x[0][1])):
            chrom = rows[0]
            start = rows[1]
            stop = rows[2] - 1
            if chrom not in chrom_stat[sample] or chrom_stat[sample][chrom] < 1:
                continue
            try:
                depths = [
                    int(line.strip().split("\t")[-1])
                    for line in dep_f.fetch(chrom, start, stop)
                ]
                win_mean_dep = min(
                    sum(depths) / float(len(depths)),
                    6.0 * chrom_stat[sample][chrom].average)
                win_mean_dep *= 2.0 / chrom_stat[sample][chrom].ploid
            except Exception:
                # Deliberate best effort: any failure (e.g. a window without
                # records gives a division by zero) counts as depth 0, which
                # the median-based mask below then drops.
                win_mean_dep = 0.0
            gc_depth.append([gc_content, win_mean_dep])

        # 2. Fit depth against GC content on windows above 5% of the median.
        described = DescribeArray(gc_depth, col=1)
        gcdep = described.array[described.array[:, 1] > 0.05 * described.median]
        prd = unique_rows(lowess(gcdep[:, 1], gcdep[:, 0], frac=0.25))
        mdp = np.median(prd[:, 1])
        if mdp <= 0.0:
            raise ValueError("Sample %s depth file Error !" % depthf)

        # 3. Correction-factor table indexed by int(round(gc, 4) * 10000).
        #    It gets one slot more than the largest index in use, so that a
        #    GC value rounding to the top of the table (e.g. 1.0 -> 10000) is
        #    addressable; the table used to be one entry short there.
        gc_max = prd[:, 0].max()
        lgc = max(10000, int(gc_max * 10000),
                  int(round(gc_max, 4) * 10000)) + 1
        loe = [
            -0.0001,
        ] * lgc  # negative marks "no fitted value yet"
        gcl = lgc  # lowest fitted index
        gcj = 0  # index of the last fitted row (assumes prd is ordered by GC -- TODO confirm)
        for gc, dp in prd:
            gcj = int(round(gc, 4) * 10000)
            if gcj < gcl:
                gcl = gcj
            loe[gcj] = mdp / float(dp) if dp > 0 else 1.0
        # Below the fitted range: reuse the lowest fitted factor, capped at 10.
        for i in range(gcl):
            loe[i] = min(loe[gcl], 10.0)
        # Inside the fitted range: fill gaps from the left neighbour and the
        # next fitted value on the right.
        # NOTE(review): the weights differ from plain linear interpolation;
        # kept exactly as in the original.
        for i in range(gcl + 1, gcj):
            if loe[i] < 0:
                ls = i - 1
                lv = loe[i - 1]
                rs = i + 1
                # loe[gcj] is always a fitted (non-negative) value, so the
                # scan stops at gcj at the latest; the bound is checked
                # before indexing.
                while rs < gcj and loe[rs] < 0:
                    rs += 1
                rv = loe[rs]
                loe[i] = min((lv + (rs - float(ls)) * rv) / (rs - float(ls) + 1.0),
                             10.0)
        # Above the fitted range: reuse the last fitted factor, capped at 10.
        for i in range(gcj + 1, lgc):
            loe[i] = min(loe[gcj], 10.0)

        # 4. Apply the factor to every position that has a GC value.
        for line in dep_f.fetch():
            rows = line.strip().split("\t")
            chrom = str(rows[0])
            pos = int(rows[1])
            deps = int(rows[-1])
            try:
                fixdeps = int(deps * loe[int(round(posgc[chrom][pos], 4) * 10000)])
            except KeyError:
                # No GC value known for this contig/position: skip it.
                continue
            f_out.write("\t".join(map(str, [chrom, pos, fixdeps])) + '\n')
    finally:
        # Release both handles on every path, including the ValueError above.
        if dep_f is not None:
            dep_f.close()
        f_out.close()
    _ = pysam.tabix_index(out, seq_col=0, start_col=1, end_col=1, force=True)