Example #1
import os
import sys
import numpy as np
from itertools import groupby

# NOTE: smart_open, ViterbiTraining, NegativeBinomial and join_ranges are helpers
# assumed to be defined elsewhere in the same module.
# Calls copy-number variants (CNVs) from windowed read depths with a negative-binomial
# HMM and writes segments that deviate from the expected ploidy as tab-separated rows.
def hmm_cnv(dep_data, regions, best_probability, trials, ploid=2.0, output=None, contral_wins=5):
	output = os.path.abspath(output) if output is not None else sys.stdout
	final_cnv = smart_open(output, 'w')
	cne = ViterbiTraining(dep_data, ploid, best_probability, trials)
	copy_est = cne.train()
	# Iteratively refit the negative-binomial parameters when the first pass calls
	# fewer than 10% of (at least 800) windows as deviating from the expected ploidy.
	if np.count_nonzero(copy_est != ploid) / float(len(copy_est)) < 0.1 and len(copy_est) > 800:
		iterations = 0
		lastDif = differences = len(copy_est)
		n_copy_est = np.copy(copy_est)
		tmp_arg = [[0, 0, 0], [0, 0, 0]]
		while differences > 0 and iterations < 100:
			# Rescale each window's depth back to the expected ploidy using the current copy-number call.
			ndep = [int(ploid * dep_data[i] / float(copy_est[i]) + 0.5) for i in range(len(copy_est)) if copy_est[i]]
			nbarg = NegativeBinomial(ndep)
			nbarg.nbinom_fit()
			tmp_b = float(nbarg.best_probability)
			tmp_t = int(nbarg.trials)
			devi = float(nbarg.mindevi)
			cne = ViterbiTraining(dep_data, ploid, tmp_b, tmp_t)
			copy_est = cne.train(copy_est)
			iterations += 1
			differences = np.count_nonzero(np.array(n_copy_est) != copy_est)
			n_copy_est = np.copy(copy_est)
			# Stop if the number of changed calls stalls and the fitted parameters match
			# those from two iterations earlier (i.e. the training is oscillating).
			if differences == lastDif:
				if (tmp_arg[0][2] == devi) and (tmp_arg[0][1] == tmp_t) and (tmp_arg[0][0] == tmp_b):
					break
			lastDif = differences
			tmp_arg = [[tmp_arg[1][0], tmp_arg[1][1], tmp_arg[1][2]], [tmp_b, tmp_t, devi]]
	# Final maximum-likelihood estimate of transition probabilities and posterior decoding.
	trans_p = cne.mlEstimate(copy_est)
	cnvs = cne.posterior_decoding(trans_p, ploid)
	chrom = str(regions[0][0])
	# Group consecutive windows with the same copy-number call and report segments
	# whose call differs from the expected ploidy.
	for k, g in groupby(zip(regions, cnvs), lambda x: x[1][1]):
		if k == ploid:
			continue
		elems = list(g)
		for start, stop in join_ranges([d[0][1:] for d in elems]):
			p = [float(d[1][0]) for d in elems if d[0][1] >= start and d[0][2] <= stop]
			mpp = round(sum(p) / len(p), 3)
			# Require a mean posterior probability >= 0.95 and at least contral_wins supporting windows.
			if mpp < 0.95 or len(p) < contral_wins:
				continue
			bp_len = stop - start + 1
			mut_type = "gain" if k > ploid else "loss"
			final_cnv.write("\t".join(map(str, [chrom, ploid, start, stop, bp_len, k, mut_type, mpp])) + '\n')
	final_cnv.close()
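A minimal call sketch for hmm_cnv, not taken from the source project: it assumes dep_data is a list of per-window read depths, regions is an equally long list of (chrom, start, stop) tuples, and best_probability / trials come from an earlier negative-binomial fit such as the one written by win_correct in Example #2 below. All values and the output file name are illustrative.

# Hypothetical usage sketch; every value below is illustrative, not taken from the repo.
dep_data = [58, 61, 60, 121, 118, 59, 57]              # per-window read depths
regions = [("chr1", i * 1000 + 1, (i + 1) * 1000)      # matching (chrom, start, stop) windows
           for i in range(len(dep_data))]
hmm_cnv(dep_data, regions,
        best_probability=0.5,   # negative-binomial p, e.g. from a NegativeBinomial(...).nbinom_fit()
        trials=60,              # negative-binomial trials from the same fit
        ploid=2.0,
        output="cnv_calls.tsv") # illustrative output path; CNV segments are written as TSV rows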
Example #2
 # Python 2 method of a depth-correction class. It expects module-level imports of
 # os, numpy as np, pysam and collections.defaultdict, plus the helpers DescribeArray,
 # join_ranges, join_numbers and NegativeBinomial defined elsewhere in the package.
 # For one chromosome it builds sliding windows, corrects per-sample window depths,
 # writes them to tabix-indexed files, and fits a negative binomial to the result.
 def win_correct(self, chrom):
     pos_filter = defaultdict(int)
     filterSampleInChrom = set()
     bed_chrom = self.bed[chrom]
     depth_dict = dict()
     regions = list()
     sample_dep = defaultdict(list)
     win_sift_dep = defaultdict(list)
     wsdep = defaultdict(list)
     # Keep only samples with an estimated ploidy on this chromosome.
     samples_filter = [i for i in self.samples
                       if self.chrom_stat[i][chrom].ploid > 0]
     if len(samples_filter) < 2:
         return
     nbinom_data = list()
     nbinom_out = open(os.path.join(self.indir, "%s.nbinom.arg" % chrom),
                       'w')
     chr_cor = open(
         os.path.join(
             self.indir, "%s_W%dS%d.cor" %
             (chrom, self.CorrectWinLen, self.CorrectShiftLen)), 'w')
     ws_deps = {
         sample: os.path.join(
             self.indir, sample, "%s.W%dS%d.fixdep" %
             (chrom, self.CorrectWinLen, self.CorrectShiftLen))
         for sample in samples_filter
     }
     ws_dep = {
         sample: open(ws_deps[sample], 'w')
         for sample in samples_filter
     }
     for sample in samples_filter:
         chrom_d = list()
         fixdeps = pysam.TabixFile(
             os.path.join(self.indir,
                          "{0}/{0}.Fixdep.tsv.gz".format(sample)))
         depth_dict[sample] = fixdeps
         for line in fixdeps.fetch(chrom):
             rows = line.strip().split("\t")
             pos = int(rows[1])
             c_d = int(rows[-1])
             chrom_d.append([pos, c_d])
         depths = DescribeArray(chrom_d, col=1)
         # Flag low-depth positions; depth below 60% of the cutoff counts double.
         for (pos, dep) in chrom_d:
             if dep < 0.6 * self.LowDepCut * depths.average:
                 pos_filter[pos] += 2
             elif dep < self.LowDepCut * depths.average:
                 pos_filter[pos] += 1
     # Remove positions whose low-depth score reaches the number of retained samples.
     for pos, number in pos_filter.iteritems():
         if number >= len(samples_filter):
             try:
                 bed_chrom.remove(pos)
             except ValueError:
                 continue
     # Merge the remaining target positions into intervals, then tile each interval
     # with sliding windows below.
     bed_chrom = list(join_ranges(join_numbers(bed_chrom), offset=self.CorrectWinLen))
     for s, e in bed_chrom:
         if e - s < self.CorrectShiftLen:
             continue
         for win in xrange(s, e, self.CorrectShiftLen):
             end_p = min(win + self.CorrectWinLen - 1, e)
             regions.append((chrom, win, end_p))
             for sample in samples_filter:
                 depth = depth_dict[sample]
                 lines = list(depth.fetch(chrom, win,
                                          end_p)) or ["-1\t-1\t0\n"]
                 mdep = sum(
                     [int(line.strip().split("\t")[-1])
                      for line in lines]) / float(len(lines))
                 mdep = round(mdep, 2)
                 sample_dep[sample].append(mdep)
                 win_sift_dep[(chrom, win, end_p)].append(mdep)
     mdep_chr = {
         i: sum(j) / len(j)
         for i, j in sample_dep.iteritems() if len(j)
     }
     # Pairwise correlation of window depths between samples; samples whose mean
     # correlation with the others falls below 0.6 are excluded for this chromosome.
     cormtx = np.corrcoef([sample_dep[i] for i in samples_filter])
     chr_cor.write(chrom + "\t" + "\t".join(samples_filter) + '\n')
     for sn in range(len(samples_filter)):
         chr_cor.write("\t".join([samples_filter[sn]] +
                                 map(str, cormtx[sn])))
         mcor = (cormtx[sn].sum() - 1.0) / (len(samples_filter) - 1.0)
         if mcor < 0.6:
             filterSampleInChrom.add(samples_filter[sn])
             chr_cor.write("\tLow correlation\n")
         else:
             chr_cor.write("\n")
     chr_cor.write("\n")
     # Give up on this chromosome if more than 60% of the samples correlate poorly.
     if len(filterSampleInChrom) / float(len(samples_filter)) > 0.6:
         return
     # Normalize each window by the median depth ratio across well-covered samples.
     for r in regions:
         chrom, start, stop = r
         d = win_sift_dep[r]
         dr = list()
         ndr = list()
         for sn in range(len(samples_filter)):
             sample = samples_filter[sn]
             if sample in filterSampleInChrom:
                 continue
             if mdep_chr[sample] <= 0:
                 continue
             control = self.LowDepCut * mdep_chr[sample]
             # dr keeps all ratios; ndr keeps only windows above the low-depth cutoff.
             dr.append(d[sn] / mdep_chr[sample])
             if d[sn] > control:
                 ndr.append(d[sn] / mdep_chr[sample])
         mdr = np.median(np.array(ndr)) if len(ndr) > 3 else 1.0
         if mdr < self.LowDepCut:
             mdr = 1.0
         for sn in range(len(samples_filter)):
             d[sn] /= mdr
             wsdep[samples_filter[sn]].append(d[sn])
     # Cap extreme windows at 5x the sample mean, rescale so the mean corrected depth
     # is 60, and write per-window corrected depths for each retained sample.
     for s, d in wsdep.iteritems():
         if s in filterSampleInChrom:
             continue
         tmp_des = DescribeArray(d)
         d = [min(5.0 * tmp_des.average, i) for i in d]
         tmp_des = DescribeArray(d)
         d = [int(i * 60.0 / tmp_des.average + 0.5) for i in d]
         if len(regions) == len(d):
             for i in range(len(regions)):
                 reg = "\t".join(map(str, regions[i]))
                 ws_dep[s].write("%s\t%i\n" % (reg, d[i]))
                 nbinom_data.append(d[i])
     chr_cor.close()
     for s, d in ws_deps.iteritems():
         ws_dep[s].close()
         depth_dict[s].close()
         if os.path.isfile(d):
             _ = pysam.tabix_index(d,
                                   seq_col=0,
                                   start_col=1,
                                   end_col=2,
                                   force=True)
     # Fit a negative binomial to all corrected depths and record its parameters
     # (the same parameter names that hmm_cnv in Example #1 takes).
     nbarg = NegativeBinomial(nbinom_data)
     nbarg.nbinom_fit()
     trials = nbarg.trials
     best_probability = nbarg.best_probability
     min_devi = nbarg.mindevi
     nbinom_out.write(
         "\t".join(map(str, [trials, best_probability, min_devi])) + '\n')
     nbinom_out.close()
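For orientation, a hypothetical stub showing the attributes win_correct reads from self, inferred from the method body above. The attribute names come from the code; the class name and example values are illustrative guesses, not defaults from the source project.

# Hypothetical host-class sketch; it only documents the attributes win_correct uses.
class DepthCorrector(object):                # class name is an assumption, not from the source
    def __init__(self, indir, samples, bed, chrom_stat):
        self.indir = indir                   # working directory holding <sample>/<sample>.Fixdep.tsv.gz
        self.samples = samples               # list of sample names
        self.bed = bed                       # dict: chrom -> list of target positions on that chromosome
        self.chrom_stat = chrom_stat         # dict: sample -> chrom -> object with a .ploid attribute
        self.CorrectWinLen = 1000            # correction window length in bp (illustrative value)
        self.CorrectShiftLen = 200           # window step length in bp (illustrative value)
        self.LowDepCut = 0.3                 # low-depth cutoff as a fraction of mean depth (illustrative)

    # win_correct (the method above) would be defined here on the class.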