def depths(self, bed, prefix=None, read_filter='all', qual_threshold=15, count_threshold=None,
           uncover_threshold=5):
    """Write per-base depth and coverage statistics for the regions of a BED file.

    Each region is widened by one base on both sides (clipped to ``0`` and to the
    reference length), per-base A/C/G/T coverage is counted for it, and these
    files are written (``<name>`` is the basename of ``prefix``):

    * ``<name>.depth.tsv``   - one row per position: reference base, A/C/G/T coverage, depth
    * ``<name>.bed.stat``    - average/median/max/min depth of every region
    * ``<name>.chrom.stat``  - the same statistics plus threshold coverage per chromosome
    * ``<name>.stat``        - overall statistics and threshold coverage percentages
    * ``<name>.uncover.bed`` - ranges whose depth is below ``uncover_threshold``
      (written only when ``read_filter == 'all'``)

    Finally the depth table is handed to ``CreatIndex(...).check_index``.

    Args:
        bed: Path of the region file; lines are split on whitespace and need at
            least three columns (chromosome, start, stop).
        prefix: Output path prefix. Defaults to ``Rhea_Chip`` in the current
            working directory.
        read_filter: Forwarded as ``read_callback`` to ``self._reader.count_coverage``.
        qual_threshold: Forwarded (as ``int``) as ``quality_threshold`` to
            ``self._reader.count_coverage``.
        count_threshold: List, tuple, set or frozenset of depth thresholds to
            report coverage for. Any other value selects ``{1, 4, 10, 20, 30, 100}``.
            The caller's collection is never modified.
        uncover_threshold: Depth below which a base counts as uncovered; values
            smaller than 1 are raised to 1. It is always added to the thresholds.

    Returns:
        None. All results are written to the files listed above.
    """
    # Work on a private copy: ``uncover_threshold`` is added below and that must
    # not leak into a set owned by the caller.
    if isinstance(count_threshold, (list, tuple, set, frozenset)):
        count_threshold = set(count_threshold)
    else:
        count_threshold = {1, 4, 10, 20, 30, 100}
    uncover_threshold = max(int(uncover_threshold), 1)
    count_threshold.add(uncover_threshold)
    count_threshold = sorted(count_threshold, key=int)
    # Integer view of the thresholds, converted once instead of once per base.
    int_thresholds = [int(num) for num in count_threshold]

    outdir, name = os.path.split(os.path.abspath(prefix)) if prefix else (os.getcwd(), 'Rhea_Chip')
    depth = os.path.join(outdir, '%s.depth.tsv' % name)
    bedstat = os.path.join(outdir, '%s.bed.stat' % name)
    stats = os.path.join(outdir, '%s.stat' % name)
    chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
    uncover = os.path.join(outdir, '%s.uncover.bed' % name)

    chroms = defaultdict(dict)      # chromosome -> {position: [ref, A, C, G, T, depth]}
    rangedict = defaultdict(int)    # threshold -> number of bases with depth >= threshold
    regiondict = defaultdict(int)   # threshold -> number of regions with average >= threshold
    total_base = 0
    region_num = 0

    handles = list()
    try:
        for path in (depth, bedstat, stats, chromstat):
            handles.append(smart_open(path, 'w'))
        _depth, _bedstat, _stats, _chromstat = handles

        _depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
        _bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
        _stats.write("##A Simple introduction about %s\n" % self.name)
        _chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
        _chromstat.write("\t".join(["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

        with smart_open(bed) as regions:
            for region in regions:
                rows = region.strip().split()
                if len(rows) < 3:
                    continue
                try:
                    contig = rows[0]
                    start = max(int(rows[1]) - 1, 0)
                    stop = min(int(rows[2]) + 1, self.reference.get_reference_length(contig))
                except Exception:
                    # Best effort: header lines, bad coordinates or contigs the
                    # reference cannot resolve are skipped.
                    continue
                cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
                    contig, start, stop, read_callback=read_filter,
                    quality_threshold=int(qual_threshold))
                bases = self.reference.fetch(contig, start, stop).upper()

                # Output always uses a "chr" prefix; every name starting with
                # chrM is reported under one fixed label.
                chrom = "chr" + re.sub("^chr", "", contig)
                if chrom.startswith("chrM"):
                    chrom = 'chrM_NC_012920.1'

                reg = list()
                for n in xrange(start, stop):
                    offset = n - start
                    dep = [bases[offset], cov_a[offset], cov_c[offset], cov_g[offset], cov_t[offset]]
                    base_depth = sum(dep[1:])
                    dep.append(base_depth)
                    chroms[chrom][n] = dep
                    for num in int_thresholds:
                        if base_depth >= num:
                            rangedict[num] += 1
                    total_base += 1
                    reg.append(base_depth)
                region_num += 1

                array = DescribeArray(reg)
                averages = str(round(array.average, 2))
                mediandepth = str(round(array.median, 2))
                maxdepth = str(round(array.max, 2))
                mindepth = str(round(array.min, 2))
                _bedstat.write("\t".join([chrom, rows[1], rows[2], averages, mediandepth,
                                          maxdepth, mindepth]) + '\n')
                for num in int_thresholds:
                    if array.average >= num:
                        regiondict[num] += 1

        uncover_range = list()
        n_array = list()
        for chrom in sorted(chroms, key=_chrom_valued):
            dep = sorted(chroms[chrom].iteritems(), key=lambda x: int(x[0]))
            chrom_rows = list()
            for p, d in dep:
                _depth.write("\t".join([chrom, str(p)] + [str(i) for i in d]) + '\n')
                chrom_rows.append(d)
                n_array.append(d[5])  # index 5 is the total depth appended above
            array = DescribeArray(chrom_rows, col=5)
            averages = str(round(array.average, 2))
            mediandepth = str(round(array.median, 2))
            maxdepth = str(round(array.max, 2))
            mindepth = str(round(array.min, 2))
            chromcover = [array.get_frequece(thre, col=5) for thre in count_threshold]
            _chromstat.write("\t".join([chrom, averages, mediandepth, maxdepth, mindepth]
                                       + chromcover) + '\n')
            if read_filter == 'all':
                uncover_bases = [int(p) for p, d in dep if d[5] < uncover_threshold]
                uncover_range.extend(formact_number_list_to_range(uncover_bases, tag=chrom))

        if read_filter == 'all':
            uncoverout = smart_open(uncover, 'w')
            try:
                uncoverout.write("#Chr\tStart\tStop\n")
                uncoverout.writelines(uncover_range)
            finally:
                uncoverout.close()

        array = DescribeArray(n_array)
        _stats.write("Average depth : %.2f\n" % array.average)
        _stats.write("Median depth : %.2f\n" % array.median)
        _stats.write("Max depth : %.2f\n" % array.max)
        _stats.write("Min depth : %.2f\n" % array.min)
        for number in int_thresholds:
            # Report 0.00 rather than dividing by zero when no base was counted.
            percent = float(rangedict[number]) / total_base * 100 if total_base else 0.0
            _stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(number, percent))
        for number in int_thresholds:
            percent = float(regiondict[number]) / region_num * 100 if region_num else 0.0
            _stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(number, percent))
    finally:
        # Close whatever was opened, also on failure, so the indexer below never
        # sees a half-flushed depth table.
        for handle in handles:
            handle.close()

    dep_f = CreatIndex(depth)
    _ = dep_f.check_index(seq_col=0, start_col=1, end_col=1)
def parse(self, outdir=os.getcwd(), kickout_function=None, follow_function=None):
    """Annotate every sample of the VCF and write one tabix-indexed table per sample.

    For each record with a parsable genotype, every called allele is normalised
    through ``self.closet_anno``, combined with the matching rows returned by
    ``self.get_snpeff_info`` for the record's ``ANN`` field, extended with the
    columns returned by ``self.DBAnno.dbanno`` and collected. The rows of a
    sample are sorted by chromosome, start and stop, written to
    ``<outdir>/<sample>.anno.tsv`` and indexed with ``pysam.tabix_index``.

    Args:
        outdir: Directory receiving the output tables.
            NOTE(review): the default is evaluated once, when the method is
            defined, not at call time.
        kickout_function: Comma separated function names to drop (compared after
            the ``_variant`` suffix has been removed). ``None``/empty drops nothing.
        follow_function: Comma separated function names to keep; when given,
            every other function is dropped. ``None``/empty keeps everything.

    Returns:
        None. Results are written to disk and progress is logged.
    """
    logger.info("General VCF Phrasing begin !!!")
    self.vcf.parse_metainfo()
    titles = ["Chrom", "Start", "Stop", "Refer", "Call", "Zygosity", "VarType", "Filter", "ADepth",
              "ARatio", "PL", "NeighborGID", "PhasedGID", "MutationName", "GeneSym", "EntrezGeneID",
              "Transcript", "TransBioType", "cHGVS", "Protein", "pHGVS", "Strand", "PrimaryTag",
              "FunctionName", "Impact", "ExInID", "cDNAPos", "AAPos", "CDSPos", "AnnoTag",
              "StandardMutation"]
    # The namedtuple is built from the core columns only; the database columns
    # are appended to the header afterwards.
    annotation = namedtuple("Annotation", titles)
    dbtitle = self.DBAnno.dbtitle.split("\t")
    titles.extend(dbtitle)
    kickouts = set(kickout_function.split(",")) if kickout_function else set()
    follows = set(follow_function.split(",")) if follow_function else None
    # Compiled once; only single-digit allele indices separated by "|" or "/" match.
    genotype_pattern = re.compile(r"(\d)[|/](\d)")

    for name, samples in self.vcf.readlines.iteritems():
        logger.info("Start to annotate sample %s" % name)
        message_num = 0
        result_out = list()
        for messages in samples:
            chrom = messages.Chrom
            pos = int(messages.Pos)
            gt = genotype_pattern.match(messages.GT)
            if not gt:
                continue
            message_num += 1
            if message_num % 250 == 0:
                logger.info("Complete " + str(message_num) + " articles")
            l1, l2 = int(gt.group(1)), int(gt.group(2))
            refer = messages.Ref
            # Allele index 0 is the reference; index i > 0 is the (i-1)-th ALT.
            alter_1 = messages.Alter[l1 - 1] if l1 > 0 else refer
            alter_2 = messages.Alter[l2 - 1] if l2 > 0 else refer
            alter_out_set = set()
            eff_info_dict = self.get_snpeff_info(messages.Info['ANN'])
            nb_id = getattr(messages, "NB", ".")
            pb_id = getattr(messages, "PB", ".")
            filter_tag = messages.Filter
            if l1 == l2:
                zygosity = "hom-ref" if l1 == 0 else "hom-alt"
                alter_out_set.add(alter_1)
            else:
                zygosity = "het-alt"
                if l1 != 0:
                    alter_out_set.add(alter_1)
                if l2 != 0:
                    alter_out_set.add(alter_2)

            for alters in alter_out_set:
                offset_s, offset_e, nor_refer, nor_alter, vartype, close_anno = \
                    self.closet_anno(chrom, pos, refer, alters)
                start = pos + offset_s
                stop = pos + offset_e
                for info_rows in eff_info_dict[alters]:
                    if not len(info_rows):
                        info_rows = (".",) * 16
                    genesym, gene_id, trans, trans_bio, chgvs, protein_id, phgvs, strand, primary, \
                        functions, impact, exon_num, cdna_pos, aa_pos, cds_pos, anno_tag = info_rows
                    functions = re.sub("_variant", "", functions)
                    # Exact-name membership in the parsed filters; testing against
                    # the raw comma separated string would accept any substring.
                    if (functions in kickouts) or (follows and functions not in follows):
                        continue
                    if chgvs != ".":
                        mutation_name = "{0}({1}): {2}".format(trans, genesym, chgvs)
                        mutation_name += " (%s)" % phgvs if phgvs != "." else ""
                    else:
                        mutation_name = "."
                    # NOTE(review): AD and PL are indexed by the second genotype
                    # allele (l2) for every reported allele, including alter_1 of
                    # a heterozygous call - confirm this is intended.
                    try:
                        alle_depth = int(messages.AD.split(",")[l2])
                    except Exception:
                        alle_depth = "."
                        alle_radio = "."
                    else:
                        try:
                            base_depth = int(messages.DP)
                            alle_radio = round(float(alle_depth) / base_depth, 2)
                        except Exception:
                            alle_radio = "."
                    try:
                        phred = ",".join(messages.PL.split(",")[3 * (l2 - 1): 3 * l2])
                    except Exception:
                        phred = "."
                    varinfo = [chrom, start, stop, nor_refer, nor_alter, zygosity, vartype,
                               filter_tag, alle_depth, alle_radio, phred, nb_id, pb_id,
                               mutation_name, genesym, gene_id, trans, trans_bio, chgvs,
                               protein_id, phgvs, strand, primary, functions, impact, exon_num,
                               cdna_pos, aa_pos, cds_pos, anno_tag, close_anno]
                    variation = annotation._make(varinfo)
                    dbinfo = self.DBAnno.dbanno(variation)
                    for i in dbtitle:
                        if i in dbinfo:
                            varinfo.append("|".join(dbinfo[i]))
                        else:
                            varinfo.append(".")
                    result_out.append(varinfo)

        sample_out = os.path.join(outdir, "%s.anno.tsv" % name)
        # The file must be closed (flushed) before tabix reads it.
        with open(sample_out, "w") as f_out:
            f_out.write("#" + "\t".join(titles) + '\n')
            for anno_message in sorted(result_out,
                                       key=lambda x: (_chrom_valued(x[0]), int(x[1]), int(x[2]))):
                f_out.write("\t".join(map(str, anno_message)) + '\n')
        fileout = pysam.tabix_index(sample_out, seq_col=0, start_col=1, end_col=2, force=True)
        logger.info(
            "Sample {0} [{1}]: Annotation completed, total {2} articles !".format(name, fileout,
                                                                                 message_num))
def depths(self, bed, prefix=None, read_filter='all', qual_threshold=15, count_threshold=None,
           uncover_threshold=5):
    """Write per-base depth and coverage statistics for the regions of a BED file.

    Each region is widened by one base on both sides (clipped to ``0`` and to the
    reference length), per-base A/C/G/T coverage is counted for it, and these
    files are written (``<name>`` is the basename of ``prefix``):

    * ``<name>.depth.tsv``   - one row per position: reference base, A/C/G/T coverage, depth
    * ``<name>.bed.stat``    - average/median/max/min depth of every region
    * ``<name>.chrom.stat``  - the same statistics plus threshold coverage per chromosome
    * ``<name>.stat``        - overall statistics and threshold coverage percentages
    * ``<name>.uncover.bed`` - ranges whose depth is below ``uncover_threshold``
      (written only when ``read_filter == 'all'``)

    Finally the depth table is handed to ``CreatIndex(...).check_index``.

    Args:
        bed: Path of the region file; lines are split on whitespace and need at
            least three columns (chromosome, start, stop).
        prefix: Output path prefix. Defaults to ``Rhea_Chip`` in the current
            working directory.
        read_filter: Forwarded as ``read_callback`` to ``self._reader.count_coverage``.
        qual_threshold: Forwarded (as ``int``) as ``quality_threshold`` to
            ``self._reader.count_coverage``.
        count_threshold: List, tuple, set or frozenset of depth thresholds to
            report coverage for. Any other value selects ``{1, 4, 10, 20, 30, 100}``.
            The caller's collection is never modified.
        uncover_threshold: Depth below which a base counts as uncovered; values
            smaller than 1 are raised to 1. It is always added to the thresholds.

    Returns:
        None. All results are written to the files listed above.
    """
    # Work on a private copy: ``uncover_threshold`` is added below and that must
    # not leak into a set owned by the caller.
    if isinstance(count_threshold, (list, tuple, set, frozenset)):
        count_threshold = set(count_threshold)
    else:
        count_threshold = {1, 4, 10, 20, 30, 100}
    uncover_threshold = max(int(uncover_threshold), 1)
    count_threshold.add(uncover_threshold)
    count_threshold = sorted(count_threshold, key=int)
    # Integer view of the thresholds, converted once instead of once per base.
    int_thresholds = [int(num) for num in count_threshold]

    outdir, name = os.path.split(os.path.abspath(prefix)) if prefix else (os.getcwd(), 'Rhea_Chip')
    depth = os.path.join(outdir, '%s.depth.tsv' % name)
    bedstat = os.path.join(outdir, '%s.bed.stat' % name)
    stats = os.path.join(outdir, '%s.stat' % name)
    chromstat = os.path.join(outdir, '%s.chrom.stat' % name)
    uncover = os.path.join(outdir, '%s.uncover.bed' % name)

    chroms = defaultdict(dict)      # chromosome -> {position: [ref, A, C, G, T, depth]}
    rangedict = defaultdict(int)    # threshold -> number of bases with depth >= threshold
    regiondict = defaultdict(int)   # threshold -> number of regions with average >= threshold
    total_base = 0
    region_num = 0

    handles = list()
    try:
        for path in (depth, bedstat, stats, chromstat):
            handles.append(smart_open(path, 'w'))
        _depth, _bedstat, _stats, _chromstat = handles

        _depth.write("#Chrom\tPos\tRef\tCov_A\tCov_C\tCov_G\tCov_T\tDepth\n")
        _bedstat.write("#Chr\tStart\tStop\tAverage\tMedian\tMax\tMin\n")
        _stats.write("##A Simple introduction about %s\n" % self.name)
        _chromstat.write("#Chr\tAverage\tMedian\tMax\tMin\t")
        _chromstat.write("\t".join(["Coverage (>=%sX)" % str(key) for key in count_threshold]) + '\n')

        with smart_open(bed) as regions:
            for region in regions:
                rows = region.strip().split()
                if len(rows) < 3:
                    continue
                try:
                    contig = rows[0]
                    start = max(int(rows[1]) - 1, 0)
                    stop = min(int(rows[2]) + 1, self.reference.get_reference_length(contig))
                except Exception:
                    # Best effort: header lines, bad coordinates or contigs the
                    # reference cannot resolve are skipped.
                    continue
                cov_a, cov_c, cov_g, cov_t = self._reader.count_coverage(
                    contig, start, stop, read_callback=read_filter,
                    quality_threshold=int(qual_threshold))
                bases = self.reference.fetch(contig, start, stop).upper()

                # Output always uses a "chr" prefix; every name starting with
                # chrM is reported under one fixed label.
                chrom = "chr" + re.sub("^chr", "", contig)
                if chrom.startswith("chrM"):
                    chrom = 'chrM_NC_012920.1'

                reg = list()
                for n in xrange(start, stop):
                    offset = n - start
                    dep = [bases[offset], cov_a[offset], cov_c[offset], cov_g[offset], cov_t[offset]]
                    base_depth = sum(dep[1:])
                    dep.append(base_depth)
                    chroms[chrom][n] = dep
                    for num in int_thresholds:
                        if base_depth >= num:
                            rangedict[num] += 1
                    total_base += 1
                    reg.append(base_depth)
                region_num += 1

                array = DescribeArray(reg)
                averages = str(round(array.average, 2))
                mediandepth = str(round(array.median, 2))
                maxdepth = str(round(array.max, 2))
                mindepth = str(round(array.min, 2))
                _bedstat.write("\t".join([chrom, rows[1], rows[2], averages, mediandepth,
                                          maxdepth, mindepth]) + '\n')
                for num in int_thresholds:
                    if array.average >= num:
                        regiondict[num] += 1

        uncover_range = list()
        n_array = list()
        for chrom in sorted(chroms, key=_chrom_valued):
            dep = sorted(chroms[chrom].iteritems(), key=lambda x: int(x[0]))
            chrom_rows = list()
            for p, d in dep:
                _depth.write("\t".join([chrom, str(p)] + [str(i) for i in d]) + '\n')
                chrom_rows.append(d)
                n_array.append(d[5])  # index 5 is the total depth appended above
            array = DescribeArray(chrom_rows, col=5)
            averages = str(round(array.average, 2))
            mediandepth = str(round(array.median, 2))
            maxdepth = str(round(array.max, 2))
            mindepth = str(round(array.min, 2))
            chromcover = [array.get_frequece(thre, col=5) for thre in count_threshold]
            _chromstat.write("\t".join([chrom, averages, mediandepth, maxdepth, mindepth]
                                       + chromcover) + '\n')
            if read_filter == 'all':
                uncover_bases = [int(p) for p, d in dep if d[5] < uncover_threshold]
                uncover_range.extend(formact_number_list_to_range(uncover_bases, tag=chrom))

        if read_filter == 'all':
            uncoverout = smart_open(uncover, 'w')
            try:
                uncoverout.write("#Chr\tStart\tStop\n")
                uncoverout.writelines(uncover_range)
            finally:
                uncoverout.close()

        array = DescribeArray(n_array)
        _stats.write("Average depth : %.2f\n" % array.average)
        _stats.write("Median depth : %.2f\n" % array.median)
        _stats.write("Max depth : %.2f\n" % array.max)
        _stats.write("Min depth : %.2f\n" % array.min)
        for number in int_thresholds:
            # Report 0.00 rather than dividing by zero when no base was counted.
            percent = float(rangedict[number]) / total_base * 100 if total_base else 0.0
            _stats.write("Coverage (>={0:d}X) : {1:.2f}%\n".format(number, percent))
        for number in int_thresholds:
            percent = float(regiondict[number]) / region_num * 100 if region_num else 0.0
            _stats.write("Region Coverage (>={0:d}X) : {1:.2f}%\n".format(number, percent))
    finally:
        # Close whatever was opened, also on failure, so the indexer below never
        # sees a half-flushed depth table.
        for handle in handles:
            handle.close()

    dep_f = CreatIndex(depth)
    _ = dep_f.check_index(seq_col=0, start_col=1, end_col=1)