def _depth_at_position(bam_path, region):
    """Return the depth ``samtools depth`` reports for the first position of *region*.

    ``pysam.depth`` has returned either a list of lines or a single
    newline-joined string depending on the pysam version, so both shapes are
    accepted.  0 is returned when nothing is reported for the region.
    """
    depth_info = pysam.depth(bam_path, "-r", region)
    if not isinstance(depth_info, (list, tuple)):
        depth_info = depth_info.splitlines()
    if len(depth_info) == 0:
        return 0
    return int(depth_info[0].rstrip('\n').split('\t')[2])


def filter_by_allele_freq(input_file, output_file, tumor_bam, matched_control_bam,
                          tumor_AF_thres, control_AF_thres, max_fisher_pvalue):
    """Filter candidate records by tumor/control allele frequency and Fisher test.

    Each tab-separated line of *input_file* supplies chromosome (col 0),
    position (col 1), the tumor variant read count (col 4) and the control
    variant read count (col 5).  The total depth at the position is looked up
    with ``samtools depth`` and the record is written to *output_file* only if:

    * the tumor allele frequency is at least ``tumor_AF_thres``;
    * when a matched control BAM is given, the control allele frequency is at
      most ``control_AF_thres`` (a control with zero depth counts as 1.0) and
      the one-sided ('less') Fisher exact p-value does not exceed
      ``max_fisher_pvalue``.

    Without a control (``matched_control_bam == ""``) the control depth and the
    p-value columns are written as ``"---"``.

    :param input_file: path to the tab-separated candidate file.
    :param output_file: path of the table to write (header + kept records).
    :param tumor_bam: tumor BAM path.
    :param matched_control_bam: control BAM path, or ``""`` for tumor-only.
    :param tumor_AF_thres: minimum tumor allele frequency.
    :param control_AF_thres: maximum control allele frequency.
    :param max_fisher_pvalue: maximum accepted Fisher p-value (converted with ``float``).
    """
    header = [
        "Chr", "Pos", "Dir", "Junc_Seq", "Num_Tumor_Total_Read",
        "Num_Tumor_Var_Read", "Num_Control_Total_Read",
        "Num_Control_Var_Read", "Minus_Log_Fisher_P_value",
    ]
    has_control = matched_control_bam != ""

    # Both handles are closed even if a record fails to parse.
    with open(output_file, 'w') as hout, open(input_file, 'r') as hin:
        hout.write('\t'.join(header) + '\n')

        for line in hin:
            F = line.rstrip('\n').split('\t')
            tumor_num = int(F[4])
            control_num = int(F[5])
            region = F[0] + ':' + F[1] + '-' + F[1]

            depth_tumor = _depth_at_position(tumor_bam, region)
            if depth_tumor == 0:
                # No tumor coverage: the allele frequency is undefined, so the
                # record cannot satisfy the tumor threshold.
                continue
            AF_tumor = float(tumor_num) / depth_tumor
            if AF_tumor < tumor_AF_thres:
                continue

            depth_control = "---"
            lpvalue = "---"
            if has_control:
                depth_control = _depth_at_position(matched_control_bam, region)
                control_AF = float(control_num) / depth_control if depth_control > 0 else 1.0
                if control_AF > control_AF_thres:
                    continue

                # The Fisher test is only meaningful with real control counts;
                # the original guard (`control_AF != ""`) was always true and
                # crashed in tumor-only mode.
                oddsratio, pvalue = stats.fisher_exact(
                    [[depth_tumor - tumor_num, tumor_num],
                     [depth_control - control_num, control_num]], 'less')
                if pvalue < 1e-100:
                    pvalue = 1e-100  # clamp so the log below stays finite
                lpvalue = (-math.log(pvalue, 10) if pvalue < 1 else 0)
                lpvalue = round(lpvalue, 4)
                if 10 ** (-lpvalue) > float(max_fisher_pvalue):
                    continue

            fields = F[:4] + [str(depth_tumor), str(tumor_num), str(depth_control),
                              str(control_num), str(lpvalue)]
            hout.write('\t'.join(fields) + '\n')
def calculate_coverage_score(self, bam_file, threshold):
    """Calculate a coverage score from a .bam file using ``pysam.depth``.

    Every position reported by ``samtools depth`` whose depth is at least
    **threshold** counts as covered; the score is the percentage of covered
    positions among all reported positions.

    NOTE(review): ``samtools depth`` is called without ``-a`` here, so
    positions with zero depth are presumably absent from the denominator -
    confirm this is intended.

    :param bam_file: Path to a .bam file
    :type bam_file: str
    :param threshold: Minimum number of reads per residue to be considered
        valid
    :type threshold: int

    :return: Percentage of reported residues whose depth is greater than or
        equal to the given threshold; ``0.0`` when no position is reported
    :return type: float
    """
    raw_output = pysam.depth(bam_file)

    covered, length = 0, 0
    # splitlines() copes with a missing trailing newline, where the previous
    # `split('\n')[:-1]` silently dropped the last record.
    for record in raw_output.splitlines():
        if not record:
            continue
        length += 1
        if int(record.split('\t')[2]) >= threshold:
            covered += 1

    if length == 0:
        # Nothing reported (e.g. empty BAM): avoid dividing by zero.
        return 0.0
    return float(covered) / length * 100
def unicov(bed_file, bam_file: List[str], bam_dir: Optional[str] = None, bam_list: Optional[str] = None, **kwargs):
    """Summarise uniformity of coverage over the positions of a BED file.

    Runs ``samtools depth -a -b <bed_file>`` on ``kwargs["input_files"]`` and,
    for every sample and every cutoff in ``COVERAGES``, computes the
    percentage of positions whose depth is at least that cutoff.

    Args:
        bed_file: BED file restricting the positions that are counted.
        bam_file, bam_dir, bam_list: not read directly here; the BAM paths
            actually used come from ``kwargs["input_files"]``.
        **kwargs: must contain ``input_files`` (list of BAM paths).

    Returns:
        str: the result table rendered with ``DataFrame.to_string()`` plus a
        trailing newline; one row per cutoff, one column per sample.
    """
    input_files = kwargs["input_files"]
    names = [sm_tag(x) for x in input_files]

    depth_text = pysam.depth("-a", "-b", bed_file, *input_files)
    # samtools depth emits no header line; without header=None pandas would
    # swallow the first position as column names.
    df = pd.read_table(StringIO(depth_text), header=None)
    df.columns = ["chr", "pos"] + names
    size = df.shape[0]

    dat = {"coverage": COVERAGES}
    for name in names:
        dat[name] = [
            (df[name] >= coverage).sum() / size * 100
            for coverage in COVERAGES
        ]

    return pd.DataFrame(dat).to_string() + "\n"
def samdepth(samfile, sitesfile):
    """Return the depth column of ``samtools depth -aa -b <sitesfile>`` as an array.

    :param samfile: alignment file passed to ``samtools depth``.
    :param sitesfile: BED/position file passed with ``-b``.
    :return: 1-D NumPy array holding the third (depth) column, in output order.
    """
    depth_text = pysam.depth(samfile, '-aa', '-b', sitesfile)
    # `squeeze=True` was removed from read_csv in pandas 2.0; selecting the
    # single parsed column yields the same Series on every pandas version.
    depth_table = pd.read_csv(StringIO(depth_text), sep='\t', header=None,
                              usecols=[2])
    return depth_table.iloc[:, 0].values
def get_one_coverage(bam_fn, region, region_pad=500):
    """Return ``samtools depth -a`` output for *region* widened by *region_pad*.

    :param bam_fn: path of the BAM file.
    :param region: ``(chrom, start, end)`` triple.
    :param region_pad: number of bases added on each side of the region.
    :return: whatever ``pysam.depth`` returns for the padded region.
    """
    contig, left, right = region

    # The left edge is clamped at 0; the right edge is deliberately not
    # clamped (the original author notes running off the end is harmless).
    padded_left = max(left - region_pad, 0)
    padded_right = right + region_pad

    region_spec = "%s:%d-%d" % (contig, padded_left, padded_right)
    return pysam.depth(bam_fn, "-a", "-r", region_spec)
def coverage_over_region(input_bam, region, reference, min_mapq=40, min_baseq=15, min_anchor=11):
    """Compute coverage over a 1-indexed, closed region.

    Returns a ``(mean_coverage, total_coverage, region_length)`` tuple.  When
    ``samtools depth`` reports nothing, or the region has no positive length,
    the first two entries are ``0.``.  *reference* is only forwarded
    (``--reference``) when it is truthy.
    """
    depth_args = [
        "-Q", str(min_mapq),
        "-q", str(min_baseq),
        "-l", str(min_anchor),
        "-r", region,
    ]
    if reference:
        depth_args += ["--reference", reference]
    depth_args.append(input_bam)
    depth_result = pysam.depth(*depth_args)  # pylint: disable=no-member

    # parse_region yields 0-indexed, half-open coordinates
    _, start, end = pysam.libcutils.parse_region(region=region)
    region_length = end - start

    if len(depth_result) == 0 or region_length <= 0:
        return (0., 0., region_length)

    depths = np.loadtxt(io.StringIO(depth_result), dtype=int, usecols=2)
    total_coverage = np.sum(depths)
    return (total_coverage / region_length, total_coverage, region_length)
def getBedCovFile(this, bamFile, bedFile, outFile, bp="20", mpQ="20"):
    """Write ``samtools depth -aa`` output for the BED positions to *outFile*.

    *bp* is passed as ``-q`` and *mpQ* as ``-Q``.  Any exception is reported
    on stdout together with the BAM name and turned into a ``False`` return
    value; ``True`` means the file was written.
    """
    depth_args = ("-aa", "-q", bp, "-Q", mpQ, "-b", bedFile, bamFile)
    try:
        # The output file is opened first, so it exists (possibly empty)
        # even when the depth call itself fails.
        with open(outFile, 'w') as fileHandle:
            depth_text = pysam.depth(*depth_args)
            fileHandle.write(str(depth_text))
    except Exception as e:
        print("error: %s %s" % (bamFile, e))
        return False
    return True
def getCoverage(row, bam):
    """Return ``{position: depth}`` for the region described by *row*.

    *row* must offer ``.loc['chr']``, ``.loc['start']`` and ``.loc['end']``.
    Depth comes from ``samtools depth -aa -d 0`` on *bam*.
    """
    region = "%s:%s-%s" % (row.loc['chr'], row.loc['start'], row.loc['end'])
    depth_text = pysam.depth('-aa', '-d', '0', '-r', region, bam)

    coverage = {}
    # The element after the final newline is dropped, as before.
    for record in depth_text.split('\n')[:-1]:
        fields = record.split('\t')
        depth = int(fields[2])
        coverage[int(fields[1])] = depth
    return coverage
def _write_reference_stats(statOut, ref_id, cov, sorted_bam, taxonomyDict, refDict):
    """Write one statistics row for reference *ref_id* given its per-base depths."""
    species = refDict[ref_id][1].split(",")[0]
    tax = taxonomyDict[species] if species in taxonomyDict else ["NA", "NA"]
    seq = len(cov)
    zero_count = cov.count(0)
    read_count = pysam.view("-c", sorted_bam, ref_id).strip("\n")
    covered_percent = (float(seq) - float(zero_count)) / float(seq) * float(100)
    statOut.write("\t".join([
        ref_id, species, tax[0], tax[1], read_count, str(seq),
        str(numpy.mean(cov)), str(numpy.median(cov)), str(covered_percent),
    ]) + "\n")


def stats(out, loop, core, taxonomyDict, refDict, delSam):
    """Convert a SAM file to a sorted BAM and write per-reference coverage statistics.

    The SAM file ``<out><loop>.sam`` is converted, sorted and indexed with the
    ``samtools`` command line tool (intermediate files ``<loop>tmp.bam``,
    ``<loop>sorted.bam`` in the working directory).  For every reference in
    the ``samtools depth -a`` output one row is written to
    ``<out><loop>_stats`` with species, taxonomy, read count, length,
    mean/median per-base coverage and the percentage of covered bases.

    Fixes relative to the previous implementation:

    * per-base depths are collected per reference (they used to accumulate
      across references, skewing mean/median);
    * the first position of a reference takes part in the zero-depth count;
    * the last reference gets its own taxonomy lookup instead of reusing the
      previous one (or crashing when there was only one reference);
    * ``reference[...]`` was an undefined name; ``refDict`` is used, matching
      the taxonomy lookup that reads the same field;
    * an empty depth output no longer crashes on the final write.

    :param out: output prefix (string).
    :param loop: iteration id, used in file names via ``str(loop)``.
    :param core: thread count as a string (concatenated into the commands).
    :param taxonomyDict: maps species name to a two-element taxonomy record.
    :param refDict: maps reference id to a record whose item 1 starts with
        the species name followed by a comma.
    :param delSam: ``"y"`` removes the SAM and BAM files afterwards.
    """
    loop_tag = str(loop)
    sorted_bam = loop_tag + "sorted.bam"

    bamCmd = "samtools view -bS -@ " + core + " " + out + loop_tag + ".sam > " + loop_tag + "tmp.bam"
    sortCmd = "samtools sort -@ " + core + " " + loop_tag + "tmp.bam > " + loop_tag + "sorted.bam"
    indexCmd = "samtools index -@ " + core + " " + loop_tag + "sorted.bam"
    for cmd in (bamCmd, sortCmd, indexCmd):
        subprocess.Popen([cmd], shell=True, stdout=subprocess.PIPE).communicate()

    with open(out + loop_tag + "_stats", "w") as statOut:
        statOut.write("Species_ID\tSpecies\tPhylum\tSubphylum\tDNA.read_count\tgenome_length\tmean_coverage_per_base\tmedian_coverage_per_base\tcoverage.percent\n")

        current_id = None
        cov = []
        for line in pysam.depth("-a", sorted_bam).split("\n"):
            fields = line.split("\t")
            if len(fields) < 3:
                continue  # trailing empty line
            if fields[0] != current_id:
                if current_id is not None:
                    _write_reference_stats(statOut, current_id, cov, sorted_bam,
                                           taxonomyDict, refDict)
                current_id = fields[0]
                cov = []
            cov.append(int(fields[2]))

        if current_id is not None:
            _write_reference_stats(statOut, current_id, cov, sorted_bam,
                                   taxonomyDict, refDict)

    if delSam == "y":
        os.remove(loop_tag + "sorted.bam")
        os.remove(loop_tag + "tmp.bam")
        os.remove(loop_tag + "sorted.bam.bai")
        os.remove(out + loop_tag + ".sam")
def get_depth(ref_name, ref_pos, bam_file):
    """Return the read depth at position *ref_pos* of *ref_name*, or 0.

    ``samtools depth`` is queried for ``ref_name:(ref_pos-1)-ref_pos`` as
    before, but only the record whose position equals *ref_pos* is used.
    Previously the last output line was taken blindly, so the depth of
    ``ref_pos - 1`` was returned whenever *ref_pos* itself had no coverage.

    NOTE(review): assumes *ref_pos* is 1-based like the positions printed by
    ``samtools depth`` - confirm against callers.

    :param ref_name: reference/contig name.
    :param ref_pos: position (int or numeric string).
    :param bam_file: BAM path.
    :return: depth as ``int``; 0 when the position is not reported.
    """
    target = int(ref_pos)
    pos_str = ref_name + ':' + str(target - 1) + '-' + str(ref_pos)
    res = pysam.depth("-r", pos_str, bam_file)

    for record in res.splitlines():
        fields = record.split('\t')
        if len(fields) >= 3 and int(fields[1]) == target:
            return int(fields[2])
    return 0
def get_coverage_df(bam_file):
    """Build a data frame of per-position depth for *bam_file*.

    The index is ``"<chrom>-<position>"`` and the single column ``depth``
    holds the depth exactly as emitted by ``samtools depth`` (a string).
    """
    depth_lines = pysam.depth(bam_file, split_lines=True)

    # Key every reported position by "<chrom>-<position>".
    depth_by_site = {}
    for chrom, position, depth in (entry.split('\t') for entry in depth_lines):
        depth_by_site["%s-%s" % (chrom, position)] = depth

    return pandas.DataFrame.from_dict(depth_by_site,
                                      orient='index',
                                      columns=["depth"])
def getBedCoverage(this, bamFile, bedFile, bp="20", mpQ="20"):
    """Return per-position depth for the BED positions as a nested mapping.

    The result comes from ``createDict()`` and is filled as
    ``result[chrom][position] = depth`` from ``samtools depth -aa``
    (*bp* passed as ``-q``, *mpQ* as ``-Q``).  On any exception the error is
    printed and ``False`` is returned instead.
    """
    allSiteCover = createDict()
    blank_line = re.compile(r"^\s*$")
    try:
        depth_text = pysam.depth("-aa", "-q", bp, "-Q", mpQ, "-b", bedFile, bamFile)
        for record in depth_text.split("\n"):
            if blank_line.search(record) is not None:
                continue  # whitespace-only line, e.g. after the last newline
            cols = record.split("\t")
            allSiteCover[cols[0]][int(cols[1])] = int(cols[2])
    except Exception as e:
        print("error: %s" % e)
        return False
    return allSiteCover
def coverage(self, sv, length=0):
    """Return ``{sv_id: median depth}`` over the target intervals of each SV.

    For every key of ``sv._svdict`` the depths of all its targets
    (``(chrom, start, end)``-like records) are pooled and reduced with
    ``median(depths, 1)``.  Depth is taken from ``samtools depth -a -Q10``
    with ``-l <length>`` on ``self._fn``; ``self._chr_prefix`` is prepended
    to the chromosome name.
    """
    result = {}
    for sv_id in sv._svdict:
        depths = []
        for target in sv._svdict[sv_id]:
            region = '{}:{}-{}'.format(self._chr_prefix + target[0],
                                       target[1], target[2])
            # "-Q10" is what the original adjacent literals "-Q" "10" produce.
            raw = pysam.depth("-a", "-Q10", "-r", region,
                              "-l", str(length), self._fn)
            for row in raw.strip().split('\n'):
                fields = row.strip().split('\t')
                if len(fields) >= 3:
                    depths.append(int(fields[2]))
        result[sv_id] = median(depths, 1)
    return result
def calc_depth_coeff(args, params):
    """Fill the module-level ``coeff`` dict with depth-normalisation factors.

    Depth is read with ``samtools depth -a`` from ``args.b`` or, when that is
    ``None``, from ``args.c`` with ``--reference args.fa``.  Only records of
    ``args.refseq_id`` are kept, re-keyed to 0-based positions.  For every
    window start ``i`` in ``[refseq_start, refseq_end - k]`` that is a
    multiple of ``params.slide_bin``, ``coeff[i]`` is the mean depth of the
    whole ``[refseq_start, refseq_end)`` range divided by the mean depth of
    the ``params.k`` positions starting at ``i``.
    """
    global coeff

    if args.b is not None:
        raw = pysam.depth(args.b, '-a')
    else:
        raw = pysam.depth(args.c, '-a', '--reference', args.fa)
    records = raw.strip().split('\n')

    coeff = {}
    depth_at = {}
    for record in records:
        fields = record.split()
        if fields[0] == args.refseq_id:
            depth_at[int(fields[1]) - 1] = int(fields[2])  # 1-based to 0-based

    whole_range = [depth_at[pos] for pos in range(args.refseq_start, args.refseq_end)]
    mean_depth = np.mean(whole_range)

    last_window_start = args.refseq_end - params.k + 1
    for i in range(args.refseq_start, last_window_start):
        if i % params.slide_bin != 0:
            continue
        window = [depth_at[i + offset] for offset in range(params.k)]
        coeff[i] = mean_depth / np.mean(window)
def getSam():
    """Convert ``normalRun.sam`` to a sorted BAM and write per-reference coverage.

    ``samtools`` (command line) produces ``normalRun.bam`` and
    ``normalRunsorted.bam`` plus its index; the ``samtools depth -a`` output
    is then summarised into the tab-separated file ``normalRun`` with one row
    per reference: read count, length, mean and median per-base coverage and
    the percentage of covered bases.

    Fixes relative to the previous implementation:

    * the last reference is written (it used to be silently dropped);
    * per-base depths are collected per reference instead of accumulating
      across references;
    * the first position of a reference takes part in the zero-depth count;
    * the output file is closed even when an error occurs.
    """
    import subprocess
    import numpy
    import pysam

    bamCmd = "samtools view -bS -@ 20 normalRun.sam > normalRun.bam"
    sortCmd = "samtools sort -@ 20 normalRun.bam > normalRunsorted.bam"
    indexCmd = "samtools index -@ 20 normalRunsorted.bam"
    for cmd in (bamCmd, sortCmd, indexCmd):
        subprocess.Popen([cmd], shell=True, stdout=subprocess.PIPE).communicate()

    depth = pysam.depth("-a", "normalRunsorted.bam").split("\n")

    def write_row(handle, ref_id, cov):
        # One output row for a finished reference.
        seq = len(cov)
        zero_count = cov.count(0)
        handle.write(
            ref_id + "\t" +
            pysam.view("-c", "normalRunsorted.bam", ref_id).strip("\n") +
            "\t" + str(seq) + "\t" + str(numpy.mean(cov)) + "\t" +
            str(numpy.median(cov)) + "\t" +
            str((float(seq) - float(zero_count)) / float(seq) * float(100)) +
            "\n")

    with open("normalRun", "w") as out:
        out.write(
            "reference\tDNA.read_count\tsequence_length\tmeanCov\tmedianCov\tcovPer\n"
        )
        current_id = None
        cov = []
        for line in depth:
            fields = line.split("\t")
            if len(fields) < 3:
                continue  # trailing empty line
            if fields[0] != current_id:
                if current_id is not None:
                    write_row(out, current_id, cov)
                current_id = fields[0]
                cov = []
            cov.append(int(fields[2]))
        if current_id is not None:
            write_row(out, current_id, cov)
def bamdepthtobed(bamfile, outbed='mindepth.bed', mindepth=1, minlength=1):
    """Write regions whose depth is at least *mindepth* to a BED-like file.

    For every contig returned by ``Baminfo.Baminfo(bamfile).getchrlen()`` the
    positions with depth >= *mindepth* are collected from ``samtools depth``
    and merged by ``continueregion(points, minlength)``; each resulting
    region is written as ``chrom<TAB>start_site<TAB>end_site``.

    :param bamfile: input BAM path.
    :param outbed: output file path.
    :param mindepth: minimum depth for a position to be kept.
    :param minlength: forwarded to ``continueregion``.
    """
    baminfor = Baminfo.Baminfo(bamfile)
    # `with` guarantees the output is flushed and closed on errors too.
    with open(outbed, 'w') as outio:
        for chrom in baminfor.getchrlen():
            print(chrom)
            depthstr = pysam.depth('-r', chrom, bamfile)

            points = []
            for reg in depthstr.split('\n'):
                if not reg:
                    # The empty string after the final newline is expected
                    # and not worth a warning.
                    continue
                try:
                    # The contig name in the record is not needed; `chrom`
                    # from the loop is no longer overwritten.
                    _, site, depthnow = reg.split('\t')
                    site = int(site)
                    depthnow = int(depthnow)
                except ValueError:
                    print("warnning:", reg)
                    continue
                if depthnow >= mindepth:
                    points.append(site)

            for nowregion in continueregion(points, minlength):
                print(chrom, nowregion['start_site'], nowregion['end_site'],
                      sep='\t', file=outio)


# if __name__ == '__main__':
#
#     bamdepthtobed(bamfile='/mnt/e/Data/Solanum/Solanum_etuberosum/Solanum_etuberosum_map_to_DM.bam',
#                   outbed='/mnt/e/Data/Solanum/Solanum_etuberosum/Solanum_etuberosum_map_to_DM_min_depth.bed',
#                   mindepth=3,
#                   minlength=150)
def realdepth(bamfh, region, cut):
    """Return the per-position depths of *region* as a list of floats.

    Depth comes from ``samtools depth -a -Q40``.  When pysam hands back one
    string, records that do not have exactly three tab-separated fields are
    skipped; when it hands back an iterable of lines, every line is parsed.
    *cut* is accepted but not used here.
    """
    # "-Q40" is what the original adjacent literals "-Q" "40" produce.
    depth_result = pysam.depth("-a", "-Q40", "-r", region, bamfh)

    if isinstance(depth_result, str):
        rows = (entry.rstrip('\n').split('\t')
                for entry in depth_result.split('\n'))
        return [float(row[2]) for row in rows if len(row) == 3]

    return [float(entry.rstrip('\n').split('\t')[2]) for entry in depth_result]
def create_depth_file(self, bamfile_name, core_num):
    """Sort a BAM file by coordinate and return its ``samtools depth`` output.

    bamfile_name: name of the BAM file to compute depth for
    core_num: thread count used for sorting only; a falsy value means 2

    The sorted copy is written next to the input as
    ``<name before the first '.bam'>.sortedByCoord.bam``.  The depth output
    is returned as the value of ``pysam.depth`` (a single string), not saved
    to a file.
    """
    threads = int(core_num) if core_num else 2

    stem = bamfile_name.split('.bam')[0]
    sorted_bamfile = stem + '.sortedByCoord.bam'

    pysam.sort("--threads", str(threads), "-m", "2G", "-o", sorted_bamfile,
               bamfile_name)
    return pysam.depth(sorted_bamfile)
def getDepths(in_aln, region, min_base_qual=13, excluded_flags="UNMAP,SECONDARY,QCFAIL,DUP"):
    """
    Return list of depths on region.

    :param in_aln: Path to the alignments file.
    :type in_aln: str
    :param region: The evaluated region.
    :type region: anacore.region.Region
    :param min_base_qual: Minimum base quality to count this read base on depth.
    :type min_base_qual: int
    :param excluded_flags: Discard any read that has any of the flags specified in the comma-separated list.
    :type excluded_flags: str
    :return: Depths on region (empty list when samtools reports nothing).
    :rtype: list
    """
    out_str = pysam.depth(
        "-a", "-G", excluded_flags, "-J", "-q", str(min_base_qual), "-r",
        "{}:{}-{}".format(region.reference.name, region.start, region.end),
        in_aln)
    # Blank lines are skipped so that an empty output gives [] instead of an
    # IndexError on ''.split()[2].
    return [int(line.split()[2]) for line in out_str.splitlines() if line.strip()]
def coverage(self, region):
    """Return ``(mean depth, breadth)`` of ``self.data_file`` over a reference.

    *region* is ``(name, lengths)`` where ``lengths[0]`` is the reference
    length; depth is queried for ``name:1-<length>``.  Mean depth is the sum
    of reported depths divided by the length; breadth is the number of
    reported positions divided by the length.

    The previous implementation also accumulated every depth value in a list
    that was never read and split each line twice; both are removed.
    """
    print(
        "-->Determining read coverage and depth for \033[1;35m{0}\033[m.".
        format(region[0]))

    pysam_depth = pysam.depth("-r{0}:1-{1}".format(region[0], region[1][0]),
                              self.data_file,
                              split_lines=True)

    depth_counts = 0
    breadth_counts = 0
    for line in pysam_depth:
        depth_counts += int(line.split("\t")[2])
        breadth_counts += 1

    print(
        " -->Read coverage and depth analysis complete for \033[1;35m{0}\033[m."
        .format(region[0]))

    region_length = int(region[1][0])
    return depth_counts / region_length, breadth_counts / region_length
def _chrom_sort_key(chrom):
    """Sort key putting numeric chromosomes first (by number), then the rest by name."""
    name = str(chrom)
    if name.startswith("chr"):
        name = name[3:]
    if name.isdigit():
        return (0, int(name), "")
    return (1, 0, name)


def calculate_read_depth(target_gene,
                         control_gene,
                         bam_path,
                         output_file,
                         genome_build="hg19"):
    """Create a GDF (GATK DepthOfCoverage Format) file for Stargazer from
    BAM files by computing read depth.

    Parameters
    ----------
    target_gene : str
        Name of the target gene. Choices: {'abcb1', 'cacna1s', 'cftr',
        'cyp1a1', 'cyp1a2', 'cyp1b1', 'cyp2a6', 'cyp2a13', 'cyp2b6',
        'cyp2c8', 'cyp2c9', 'cyp2c19', 'cyp2d6', 'cyp2e1', 'cyp2f1',
        'cyp2j2', 'cyp2r1', 'cyp2s1', 'cyp2w1', 'cyp3a4', 'cyp3a5',
        'cyp3a7', 'cyp3a43', 'cyp4a11', 'cyp4a22', 'cyp4b1', 'cyp4f2',
        'cyp17a1', 'cyp19a1', 'cyp26a1', 'dpyd', 'g6pd', 'gstm1', 'gstp1',
        'gstt1', 'ifnl3', 'nat1', 'nat2', 'nudt15', 'por', 'ptgis', 'ryr1',
        'slc15a2', 'slc22a2', 'slco1b1', 'slco1b3', 'slco2b1', 'sult1a1',
        'tbxas1', 'tpmt', 'ugt1a1', 'ugt1a4', 'ugt2b7', 'ugt2b15',
        'ugt2b17', 'vkorc1', 'xpc'}.
    control_gene : str
        Name of a preselected control gene. Used for intrasample
        normalization during copy number analysis by Stargazer.
        Choices: {'egfr', 'ryr1', 'vdr'}. Alternatively, you can provide a
        custom genomic region with the 'chr:start-end' format
        (e.g. chr12:48232319-48301814).
    bam_path : str
        Read BAM files from ``bam_path``, one file path per line. Also
        accepts single BAM file.
    output_file : str
        Path to the output file.
    genome_build : str, default: 'hg19'
        Build of the reference genome assembly. Choices: {'hg19', 'hg38'}.

    Raises
    ------
    ValueError
        If a BAM file has no SM tag or more than one SM tag.
    """
    bam_files = []
    if bam_path.endswith('.bam'):
        bam_files.append(bam_path)
    else:
        with open(bam_path) as f:
            for line in f:
                path = line.strip()
                if path:
                    # blank lines used to become empty BAM paths
                    bam_files.append(path)

    sn_tags = []
    sm_tags = []
    for bam_file in bam_files:
        sn_tags += get_sn_tags(bam_file)
        _sm_tags = get_sm_tags(bam_file)
        if not _sm_tags:
            raise ValueError(f"SM tags not found: {bam_file}")
        elif len(_sm_tags) > 1:
            raise ValueError(f"Multiple SM tags ({_sm_tags}) "
                             f"found: {bam_file}")
        else:
            sm_tags.append(list(_sm_tags)[0])

    # Use the 'chr' prefix only when the BAM contig names carry it.
    chr_prefix = "chr" if any("chr" in x for x in sn_tags) else ""

    loci = [
        Locus.from_input(target_gene, genome_build),
        Locus.from_input(control_gene, genome_build)
    ]

    # int(x.chrom) raised ValueError for non-numeric chromosomes (e.g. a
    # locus on X); the key below keeps the numeric order and sorts the rest
    # after it.
    ordered_loci = sorted(
        loci, key=lambda x: (_chrom_sort_key(x.chrom), x.region_start))

    depth_data = "".join(
        pysam.depth("-a", "-Q", '1', "-r", f"{chr_prefix}{locus.region}",
                    *bam_files) for locus in ordered_loci)

    df = pd.read_csv(StringIO(depth_data), sep="\t", header=None)
    df.columns = ["chrom", "pos"] + ["Depth_for_" + x for x in sm_tags]
    df.insert(0, "Locus",
              df["chrom"].astype(str) + ":" + df["pos"].astype(str))
    df.drop(columns=["chrom", "pos"], inplace=True)
    # After dropping chrom/pos, column 0 is Locus and the rest are samples.
    df.insert(1, "Total_Depth", df.iloc[:, 1:].sum(axis=1))
    # Total_Depth now sits at index 1, so samples start at index 2.
    df.insert(2, "Average_Depth_sample", df.iloc[:, 2:].mean(axis=1))
    df.to_csv(output_file, sep='\t', index=False)
def get_coverage_and_snp_count(task_queue, reference, output_metrics, output_vcf, timeout):
    """Worker loop: compute coverage metrics and a zero-coverage-annotated VCF.

    Repeatedly takes ``(bam_file, vcf_file)`` tuples from *task_queue* until
    ``get`` times out after *timeout* seconds with ``queue.Empty``.  For each
    pair it

    * computes per-position depth of the BAM with ``samtools depth`` and
      fills in depth 0 for every position of *reference* (FASTA) that is
      not reported;
    * counts "good" SNPs in the VCF (single-base REF and ALT, QUAL > 150);
    * writes an output VCF: a copy of the input when no position has zero
      coverage, otherwise the input header followed by the single-base
      records merged with placeholder records (REF ``N``, genotype ``./.``)
      for zero-coverage positions;
    * writes a three-line tab-separated metrics file.

    Output paths default to ``OUTPUT_VCF_DIR`` / ``OUTPUT_METRICS_DIR`` plus
    the base file name when *output_vcf* / *output_metrics* is ``None``.

    NOTE(review): ``task_done()`` is only reached when the whole iteration
    succeeds; an exception leaves the task unacknowledged.
    """
    while True:
        try:
            tup = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            # No more work arrived within the timeout: stop this worker.
            break
        bam_file, vcf_file = tup
        # Create a coverage dictionary keyed by "<chrom>-<position>".
        coverage_dict = {}
        coverage_list = pysam.depth(bam_file, split_lines=True)
        for line in coverage_list:
            chrom, position, depth = line.split('\t')
            coverage_dict["%s-%s" % (chrom, position)] = depth
        # Convert it to a data frame.
        coverage_df = pandas.DataFrame.from_dict(coverage_dict, orient='index', columns=["depth"])
        # Create a zero coverage dictionary with one entry per reference base.
        zero_dict = {}
        for record in SeqIO.parse(reference, "fasta"):
            chrom = record.id
            total_len = len(record.seq)
            for pos in list(range(1, total_len + 1)):
                zero_dict["%s-%s" % (str(chrom), str(pos))] = 0
        # Convert it to a data frame; the outer merge below yields depth_x
        # (zeros) and depth_y (observed depth, NaN where nothing was reported).
        zero_df = pandas.DataFrame.from_dict(zero_dict, orient='index', columns=["depth"])
        coverage_df = zero_df.merge(coverage_df, left_index=True, right_index=True, how='outer')
        # depth_x "0" column no longer needed.
        coverage_df = coverage_df.drop(columns=['depth_x'])
        coverage_df = coverage_df.rename(columns={'depth_y': 'depth'})
        # Convert the NaN to 0 coverage and get some metrics.
        coverage_df = coverage_df.fillna(0)
        coverage_df['depth'] = coverage_df['depth'].apply(int)
        total_length = len(coverage_df)
        average_coverage = coverage_df['depth'].mean()
        # From here on zero_df holds only the positions with depth 0.
        zero_df = coverage_df[coverage_df['depth'] == 0]
        total_zero_coverage = len(zero_df)
        total_coverage = total_length - total_zero_coverage
        genome_coverage = "{:.2%}".format(total_coverage / total_length)
        # Process the associated VCF input.
        column_names = [
            "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"
        ]
        vcf_df = pandas.read_csv(vcf_file, sep='\t', header=None, names=column_names, comment='#')
        # "Good" SNPs: single-base REF and ALT with QUAL above 150.
        good_snp_count = len(
            vcf_df[(vcf_df['ALT'].str.len() == 1) & (vcf_df['REF'].str.len() == 1) & (vcf_df['QUAL'] > 150)])
        base_file_name = get_base_file_name(vcf_file)
        if total_zero_coverage > 0:
            # Copy the '#' header lines of the input VCF to a separate file.
            # NOTE(review): header_file and body_file are not removed in this
            # function.
            header_file = "%s_header.csv" % base_file_name
            with open(header_file, 'w') as outfile:
                with open(vcf_file) as infile:
                    for line in infile:
                        if re.search('^#', line):
                            outfile.write("%s" % line)
            # Keep single-base records only and index them like zero_df.
            vcf_df_snp = vcf_df[vcf_df['REF'].str.len() == 1]
            vcf_df_snp = vcf_df_snp[vcf_df_snp['ALT'].str.len() == 1]
            vcf_df_snp['ABS_VALUE'] = vcf_df_snp['CHROM'].map(
                str) + "-" + vcf_df_snp['POS'].map(str)
            vcf_df_snp = vcf_df_snp.set_index('ABS_VALUE')
            # Column-wise concat aligns on the "<chrom>-<pos>" index, adding
            # all-NaN rows for zero-coverage positions without a VCF record.
            cat_df = pandas.concat([vcf_df_snp, zero_df], axis=1, sort=False)
            cat_df = cat_df.drop(columns=['CHROM', 'POS', 'depth'])
            # Fill the placeholder values for those added rows.
            cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']] = cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']].fillna('.')
            cat_df['REF'] = cat_df['REF'].fillna('N')
            cat_df['FORMAT'] = cat_df['FORMAT'].fillna('GT')
            cat_df['Sample'] = cat_df['Sample'].fillna('./.')
            # Rebuild CHROM and POS from the index; rsplit on the last '-'
            # so contig names containing '-' stay intact.
            cat_df['temp'] = cat_df.index.str.rsplit('-', n=1)
            cat_df[['CHROM', 'POS']] = pandas.DataFrame(cat_df.temp.values.tolist(), index=cat_df.index)
            cat_df = cat_df[[
                'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample'
            ]]
            cat_df['POS'] = cat_df['POS'].astype(int)
            cat_df = cat_df.sort_values(['CHROM', 'POS'])
            body_file = "%s_body.csv" % base_file_name
            cat_df.to_csv(body_file, sep='\t', header=False, index=False)
            if output_vcf is None:
                output_vcf_file = os.path.join(OUTPUT_VCF_DIR, "%s.vcf" % base_file_name)
            else:
                output_vcf_file = output_vcf
            # Final VCF = header lines followed by the annotated body.
            with open(output_vcf_file, "w") as outfile:
                for cf in [header_file, body_file]:
                    with open(cf, "r") as infile:
                        for line in infile:
                            outfile.write("%s" % line)
        else:
            # Every position is covered: the input VCF is used unchanged.
            if output_vcf is None:
                output_vcf_file = os.path.join(OUTPUT_VCF_DIR,
                                               "%s.vcf" % base_file_name)
            else:
                output_vcf_file = output_vcf
            shutil.copyfile(vcf_file, output_vcf_file)
        # Two data rows share the four metrics columns: the BAM row leaves
        # the SNP count empty, the VCF row leaves the coverage values empty.
        bam_metrics = [
            base_file_name, "", "%4f" % average_coverage, genome_coverage
        ]
        vcf_metrics = [base_file_name, str(good_snp_count), "", ""]
        if output_metrics is None:
            output_metrics_file = os.path.join(OUTPUT_METRICS_DIR, "%s.tabular" % base_file_name)
        else:
            output_metrics_file = output_metrics
        metrics_columns = [
            "File", "Number of Good SNPs", "Average Coverage", "Genome Coverage"
        ]
        with open(output_metrics_file, "w") as fh:
            fh.write("# %s\n" % "\t".join(metrics_columns))
            fh.write("%s\n" % "\t".join(bam_metrics))
            fh.write("%s\n" % "\t".join(vcf_metrics))
        task_queue.task_done()
def bam2sdf(genome_build: str, target_gene: str, control_gene: str, bam_file: List[str], **kwargs) -> str:
    """
    Create SDF file from BAM file(s).

    The target and control regions are looked up in the gene table (a control
    containing "chr" or ":" is taken as a literal region), sorted with
    ``sort_regions`` and passed one by one to ``samtools depth -a -Q 1``.
    The "chr" prefix is added to the regions only if some contig name in the
    BAM headers contains "chr".

    Returns:
        str: SDF file (concatenated depth output of all regions).

    Args:
        genome_build (str): Genome build (hg19, hg38).
        target_gene (str): Target gene.
        control_gene (str): Control gene or region.
        bam_file (list[str]): BAM file(s).

    Raises:
        ValueError: if the target or the control gene is not in the gene table
            with the required role.
    """
    gene_table = get_gene_table()
    region_key = f"{genome_build}_region"

    targets = [k for k, v in gene_table.items() if v["type"] == "target"]
    if target_gene not in targets:
        raise ValueError(
            f"'{target_gene}' is not among target genes: {targets}")
    tr = gene_table[target_gene][region_key].replace("chr", "")

    looks_like_region = "chr" in control_gene or ":" in control_gene
    if looks_like_region:
        cr = control_gene.replace("chr", "")
    else:
        controls = [k for k, v in gene_table.items() if v["control"] == "yes"]
        if control_gene not in controls:
            raise ValueError(
                f"'{control_gene}' is not among control genes: {controls}")
        cr = gene_table[control_gene][region_key].replace("chr", "")

    regions = sort_regions([tr, cr])

    # Collect sample names and the distinct contig names (in first-seen
    # order) from the BAM headers.
    sample_ids = []
    contigs = []
    for path in bam_file:
        sample_ids.append(sm_tag(path))
        for header_line in pysam.view("-H", path).strip().split("\n"):
            fields = header_line.split("\t")
            if fields[0] != "@SQ":
                continue
            for field in fields:
                if "SN:" not in field:
                    continue
                contig = field.replace("SN:", "")
                if contig not in contigs:
                    contigs.append(contig)

    logger.info(f"Sample IDs: {sample_ids}")
    logger.info(f"Contigs: {contigs}")

    # Determine whether the "chr" string should be used.
    chr_str = "chr" if any("chr" in contig for contig in contigs) else ""

    return "".join(
        pysam.depth("-a", "-Q", "1", "-r", f"{chr_str}{region}", *bam_file)
        for region in regions)
def add_zero_coverage(self, sample_name, sample_reference, nodupbam, hapall, zero_coverage_vcf):
    """Write a VCF that also lists positions without coverage, plus metrics.

    Per-position depth of *nodupbam* is taken from ``samtools depth``;
    every position of the FASTA *sample_reference* that is not reported gets
    depth 0.  If no position has zero depth, *hapall* (input VCF) is copied
    to *zero_coverage_vcf*.  Otherwise the output is the header of *hapall*
    followed by its single-base records merged with placeholder records
    (REF ``N``, genotype ``./.``) for the zero-coverage positions.

    *sample_name* is not used in this method.

    NOTE(review): the intermediate files ``v_header.csv`` and
    ``v_annotated_body.csv`` are created in the current working directory
    under fixed names.

    Returns a tuple ``(zero_coverage_vcf, good_snp_count, ave_coverage,
    genome_coverage)`` where *good_snp_count* counts single-base records
    with QUAL > 150 and *genome_coverage* is a percentage string.
    """
    # Observed depth keyed by "<chrom>-<position>".
    coverage_dict = {}
    coverage_list = pysam.depth(nodupbam, split_lines=True)
    for line in coverage_list:
        chrom, position, depth = line.split('\t')
        coverage_dict[chrom + "-" + position] = depth
    coverage_df = pd.DataFrame.from_dict(coverage_dict, orient='index', columns=["depth"])
    # One zero entry for every base of the reference.
    zero_dict = {}
    for record in SeqIO.parse(sample_reference, "fasta"):
        chrom = record.id
        total_len = len(record.seq)
        for pos in list(range(1, total_len + 1)):
            zero_dict[str(chrom) + "-" + str(pos)] = 0
    zero_df = pd.DataFrame.from_dict(zero_dict, orient='index', columns=["depth"])
    # df with depth_x and depth_y columns; depth_y is NaN where samtools
    # reported nothing
    coverage_df = zero_df.merge(coverage_df, left_index=True, right_index=True, how='outer')
    # depth_x "0" column no longer needed
    coverage_df = coverage_df.drop(columns=['depth_x'])
    coverage_df = coverage_df.rename(columns={'depth_y': 'depth'})
    # convert the NaN to 0 coverage
    coverage_df = coverage_df.fillna(0)
    coverage_df['depth'] = coverage_df['depth'].apply(int)
    total_length = len(coverage_df)
    ave_coverage = coverage_df['depth'].mean()
    # From here on zero_df holds only the positions with depth 0.
    zero_df = coverage_df[coverage_df['depth'] == 0]
    total_zero_coverage = len(zero_df)
    print("\tPositions with no coverage: {:,}".format(total_zero_coverage))
    total_coverage = total_length - total_zero_coverage
    genome_coverage = "{:.2%}".format(total_coverage / total_length)
    vcf_df = pd.read_csv(hapall, sep='\t', header=None, names=[
        "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"
    ], comment='#')
    # "Good" SNPs: single-base REF and ALT with QUAL above 150.
    good_snp_count = len(
        vcf_df[(vcf_df['ALT'].str.len() == 1) & (vcf_df['REF'].str.len() == 1) & (vcf_df['QUAL'] > 150)])
    if total_zero_coverage > 0:
        # Copy the '#' header lines of the input VCF to a scratch file.
        header_out = open('v_header.csv', 'w+')
        with open(hapall) as fff:
            for line in fff:
                if re.search('^#', line):
                    print(line.strip(), file=header_out)
        header_out.close()
        # Keep single-base records only and index them like zero_df.
        vcf_df_snp = vcf_df[vcf_df['REF'].str.len() == 1]
        vcf_df_snp = vcf_df_snp[vcf_df_snp['ALT'].str.len() == 1]
        vcf_df_snp['ABS_VALUE'] = vcf_df_snp['CHROM'].map(
            str) + '-' + vcf_df_snp['POS'].map(str)
        vcf_df_snp = vcf_df_snp.set_index('ABS_VALUE')
        # Column-wise concat aligns on the "<chrom>-<pos>" index, adding
        # all-NaN rows for zero-coverage positions without a VCF record.
        cat_df = pd.concat([vcf_df_snp, zero_df], axis=1, sort=False)
        cat_df = cat_df.drop(columns=['CHROM', 'POS', 'depth'])
        # Fill the placeholder values for those added rows.
        cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']] = cat_df[['ID', 'ALT', 'QUAL', 'FILTER', 'INFO']].fillna('.')
        cat_df['REF'] = cat_df['REF'].fillna('N')
        cat_df['FORMAT'] = cat_df['FORMAT'].fillna('GT')
        cat_df['Sample'] = cat_df['Sample'].fillna('./.')
        # Rebuild CHROM and POS from the index; rsplit on the last '-' so
        # contig names containing '-' stay intact.
        cat_df['temp'] = cat_df.index.str.rsplit('-', n=1)
        cat_df[['CHROM', 'POS']] = pd.DataFrame(cat_df.temp.values.tolist(), index=cat_df.index)
        cat_df = cat_df[[
            'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'Sample'
        ]]
        cat_df['POS'] = cat_df['POS'].astype(int)
        cat_df = cat_df.sort_values(['CHROM', 'POS'])
        cat_df.to_csv('v_annotated_body.csv', sep='\t', header=False, index=False)
        # Final VCF = header file followed by the annotated body.
        cat_files = ['v_header.csv', 'v_annotated_body.csv']
        with open(zero_coverage_vcf, "wb") as outfile:
            for cf in cat_files:
                with open(cf, "rb") as infile:
                    outfile.write(infile.read())
        os.remove('v_header.csv')
        os.remove('v_annotated_body.csv')
    else:
        # Every position is covered: the input VCF is used unchanged.
        shutil.copyfile(hapall, zero_coverage_vcf)
    return (zero_coverage_vcf, good_snp_count, ave_coverage, genome_coverage)
parser.add_argument('-g', '--gff', required=True, help='gene annotation in gff format') parser.add_argument('-mf', '--min_frac', required=False, type=float, default=0.3, help='minimum fraction of gene covered by <min_cov> reads (0.3)') parser.add_argument('-mc', '--min_cov', required=True, type=int, default=2, help='minimum coverage per base to be considered covered (2)') parser.add_argument('-o', '--out', required=False, default='genes_pav.tsv', help='output table') args=parser.parse_args() gff = GffIO.GffIO(args.gff) samfile = pysam.AlignmentFile(args.inbam, "rb") genes = {} for gene in gff.nextGene(): ID = gene.ID coords = gene.extractCoords("exon") contig = gene.seqid genes[ID] = {"tot":0, "pass":0} for i in range(len(coords["starts"])): region=contig+":"+coords["starts"][i]+"-"+coords["ends"][i] for pos in pysam.depth('-aa', '-r', region, args.inbam, split_lines=True): genes[ID]["tot"] += 1 depth = pos.split("\t")[2] if int(depth) >= args.min_cov: genes[ID]["pass"] += 1 # for column in samfile.pileup(contig, int(coords["starts"][i]), int(coords["ends"][i]), truncate=True): # genes[ID]["tot"] += 1 # if column.nsegments >= args.min_cov: # genes[ID]["pass"] += 1 log="Analyzed "+str(len(genes))+" genes in file "+args.inbam+"\n" print(log) samfile.close() outfile = open(args.out, "w") header = "Gene\tPresence\tPassed\tTotal\n" outfile.write(header) for gene in genes.keys():
def from_bam(cls, bams, regions=None, zero=False, map_qual=None, names=None):
    """
    Construct CovFrame from BAM files.

    Under the hood, the method computes read depth using the
    :command:`samtools depth` command.

    Parameters
    ----------
    bams : str or list
        One or more input BAM files. Alternatively, you can provide a
        text file (.txt, .tsv, .csv, or .list) containing one BAM file
        per line.
    regions : str, list, or pybed.BedFrame, optional
        By default (``regions=None``), the method counts all reads in BAM
        files, which can be excruciatingly slow for large files (e.g.
        whole genome sequencing). Therefore, use this argument to only
        output positions in given regions. Each region must have the
        format chrom:start-end and be a half-open interval with (start,
        end]. This means, for example, chr1:100-103 will extract
        positions 101, 102, and 103. Alternatively, you can provide a BED
        file (compressed or uncompressed) or a :class:`pybed.BedFrame`
        object to specify regions. Note that the 'chr' prefix in contig
        names (e.g. 'chr1' vs. '1') will be automatically added or
        removed as necessary to match the input BAM's contig names.
    zero : bool, default: False
        If True, output all positions including those with zero depth.
    map_qual: int, optional
        Only count reads with mapping quality greater than or equal to
        this number.
    names : list, optional
        By default (``names=None``), sample name is extracted using SM
        tag in BAM files. If the tag is missing, the method will set the
        filename as sample name. Use this argument to manually provide
        sample names.

    Returns
    -------
    CovFrame
        CovFrame object.

    Raises
    ------
    TypeError
        If ``regions`` is not a str, list, or pybed.BedFrame.
    ValueError
        If a BAM file carries more than one sample name.

    See Also
    --------
    CovFrame
        CovFrame object creation using constructor.
    CovFrame.from_dict
        Construct CovFrame from dict of array-like or dicts.
    CovFrame.from_file
        Construct CovFrame from a text file containing read depth data.

    Examples
    --------

    >>> from fuc import pycov
    >>> cf = pycov.CovFrame.from_bam(bam)
    >>> cf = pycov.CovFrame.from_bam([bam1, bam2])
    >>> cf = pycov.CovFrame.from_bam(bam, regions='19:41497204-41524301')
    """
    # Parse input BAM files.
    bams = common.parse_list_or_file(bams)

    # Check the 'chr' prefix.
    if all([pybam.has_chr_prefix(x) for x in bams]):
        chr_prefix = 'chr'
    else:
        chr_prefix = ''

    # Run the 'samtools depth' command.
    args = []
    if zero:
        args += ['-a']
    if map_qual is not None:
        args += ['-Q', str(map_qual)]
    results = ''
    if regions is None:
        results += pysam.depth(*(bams + args))
    else:
        if isinstance(regions, str):
            regions = [regions]
        elif isinstance(regions, list):
            pass
        elif isinstance(regions, pybed.BedFrame):
            # Previously this called `bf.to_regions()` before `bf` existed,
            # raising NameError for every BedFrame input.
            regions = regions.to_regions()
        else:
            raise TypeError("Incorrect type of argument 'regions'")
        if '.bed' in regions[0]:
            # The first entry names a BED file: load the regions from it.
            bf = pybed.BedFrame.from_file(regions[0])
            regions = bf.to_regions()
        else:
            regions = common.sort_regions(regions)
        for region in regions:
            # Normalise the prefix to whatever the BAM contigs use.
            region = chr_prefix + region.replace('chr', '')
            results += pysam.depth(*(bams + args + ['-r', region]))

    headers = ['Chromosome', 'Position']
    dtype = {'Chromosome': str, 'Position': int}

    for i, bam in enumerate(bams):
        if names:
            name = names[i]
        else:
            samples = pybam.tag_sm(bam)
            if not samples:
                basename = Path(bam).stem
                message = (f'SM tags were not found for {bam}, will use '
                           f'file name as sample name ({basename})')
                samples = [basename]
                warnings.warn(message)
            if len(samples) > 1:
                m = f'multiple sample names detected: {bam}'
                raise ValueError(m)
            name = samples[0]
        headers.append(name)
        dtype[name] = int

    df = pd.read_csv(StringIO(results), sep='\t', header=None,
                     names=headers, dtype=dtype)

    return cls(df)
def get_genome_metrics(bam_file, ref_length=29903, platform='ILLUMINA', read_length=148, verbose=False, temp=None):
    """Return recovery and coverage metrics for a BAM file.

    For ``platform == 'ILLUMINA'`` the reads are first filtered into a
    temporary BAM (created under *temp* if given): only mapped reads whose
    summed ``cigar[1] - cigar[0]`` values reach *read_length* are kept.
    Read counts come from ``samtools idxstats`` (reference ``MN908947.3``),
    depth from ``samtools depth -a -d 0``.

    :param bam_file: path of the input BAM.
    :param ref_length: expected number of reference positions; a mismatch
        only produces a warning.
    :param platform: ``'ILLUMINA'`` enables the pre-filtering step.
    :param read_length: minimum summed length for a read to be kept.
    :param verbose: switch the module logger to DEBUG.
    :param temp: directory for the temporary BAM, or ``None`` for the default.
    :return: ``(recovery_10, recovery_20, mean_depth, reads)`` where
        recovery_10 is the percentage of positions with depth >= 10 and
        recovery_20 the percentage with depth > 20; ``(0, 0, 0, 0)`` when
        the BAM cannot be processed or nothing is reported.

    NOTE(review): the two recovery thresholds use different comparisons
    (``>= 10`` vs ``> 20``); kept as is - confirm which one is intended.
    """
    if verbose:
        log.setLevel(logging.DEBUG)
    recovery_20 = 0
    recovery_10 = 0
    total_bases = 0
    reads = 0
    coverage = []
    unmapped_reads = 0
    # Defined up front so the `finally` clause never sees an unbound name
    # when creating the temporary file itself fails.
    fd = None
    try:
        # If illumina create temp bam of full mapped reads
        if platform == 'ILLUMINA':
            log.debug('Creating temp file and filtering for ' + os.path.basename(bam_file))
            if temp:
                fd = tempfile.NamedTemporaryFile(prefix='ronaldo', dir=temp)
            else:
                fd = tempfile.NamedTemporaryFile(prefix='ronaldo')
            bam = pysam.AlignmentFile(bam_file, "rb")
            if not bam.has_index():
                # Release the first handle before re-opening with an index.
                bam.close()
                pysam.index(bam_file)
                bam = pysam.AlignmentFile(bam_file, "rb")
            # Pre filter full mapped reads .
            temp_bam = pysam.AlignmentFile(fd, 'wb', template=bam)
            for read in bam.fetch():
                if not read.is_unmapped:
                    match_lengths = [match[1] - match[0] for match in read.cigar]
                    if sum(match_lengths) >= read_length:
                        temp_bam.write(read)
            bam.close()
            temp_bam.close()
            clean_bam_file = fd.name
        else:
            clean_bam_file = bam_file
        log.debug('Fetching read stats for ' + os.path.basename(bam_file))
        log.debug('Path: ' + clean_bam_file)
        pysam.index(clean_bam_file)
        for stat_line in pysam.idxstats(clean_bam_file).split('\n'):
            if stat_line.startswith('*'):
                unmapped_reads = int(stat_line.split('\t')[2]) + int(stat_line.split('\t')[3])
            elif stat_line.startswith('MN908947.3'):
                reads = int(stat_line.split('\t')[2])
            else:
                if len(stat_line) > 1:
                    ref_name = stat_line.split('\t')[0]
                    log.warning(f'Other reference file detected, {ref_name}.')
        log.debug('Calculating depth for ' + os.path.basename(bam_file))
        for coord_line in pysam.depth(clean_bam_file, '-a', '-d', '0').split('\n'):
            coord = coord_line.split('\t')
            if len(coord) > 2:
                total_bases += 1
                coverage.append(float(coord[2]))
                if int(coord[2]) >= 10:
                    recovery_10 += 1
                if int(coord[2]) > 20:
                    recovery_20 += 1
    except (pysam.utils.SamtoolsError, OSError) as ex:
        log.error(f'Error opening BAM file {bam_file}')
        log.error(ex)
        return 0, 0, 0, 0
    finally:
        if fd is not None:
            index_file = fd.name + '.bai'
            fd.close()
            # The index written next to the temporary BAM is not removed
            # with it; delete it as well (best effort).
            if os.path.exists(index_file):
                try:
                    os.remove(index_file)
                except OSError:
                    log.debug('Could not remove temporary index ' + index_file)
    if total_bases != ref_length and total_bases > 0:
        log.warning(f'SARSCOV reference genome not expected size, found {total_bases}')
    if total_bases == 0 or not coverage:
        log.warning(f'No reads mapped at all in {bam_file}')
        return 0, 0, 0, 0
    return (round(float(recovery_10) / total_bases * 100, 2),
            round(float(recovery_20) / total_bases * 100, 2),
            round(float(sum(coverage)) / len(coverage), 3),
            reads)