def test_load_cov_long_contig_name():
    """Long chromosome labels size the chrom field automatically,
    unless an explicit dtype override is supplied."""
    contig = 'AS2_scf7180000696055'
    path = 'fixture/longcontignames.bam'
    # Without an override the chrom field grows to fit the longest label.
    auto = pysamstats.load_coverage(path, chrom=contig)
    assert auto.dtype["chrom"].itemsize == len(contig)
    # An explicit dtype wins over the auto-sizing.
    forced = pysamstats.load_coverage(Samfile(path), chrom=contig,
                                      dtype={"chrom": "a10"})
    assert forced.dtype["chrom"].itemsize == 10
def cnv_calling(bam_file, output):
    # Scan every contig of the BAM for runs of elevated properly-paired
    # read depth and write each merged run as a putative CNV (TSV) to `output`.
    mybam = pysam.AlignmentFile(bam_file)
    for contig in mybam.header.references:
        stats = pysamstats.load_coverage(mybam, chrom=contig)
        reads = stats.reads_pp      # properly-paired depth per position
        position = stats.pos        # reference positions
        chrom = stats.chrom         # contig name per position (bytes)
        # threshold() is defined elsewhere in this file; presumably returns
        # (upper, lower) depth cutoffs derived from `reads` -- TODO confirm.
        thresh_up, thresh_down = threshold(reads)
        start = 0
        big_start = 0
        last_end = 0
        state = "base"
        # Walk the contig in 1 kb windows; consecutive windows whose mean depth
        # exceeds the upper threshold are merged into one event big_start..prev_i.
        for i in range(1000, len(reads), 1000):
            mean = mean_of_array(reads, start, i)
            if mean >= thresh_up:
                if last_end != start:
                    # Gap since the previous elevated window: new event starts here.
                    big_start = start
                last_end = i
                state = "on"
                prev_i = i
            else:
                if state == "on":
                    # Event just ended -- emit it as one tab-separated row.
                    print(chrom[big_start].decode("utf-8"), position[big_start],
                          position[prev_i], 'cnv', sep='\t', file=output)
                    state = "off"
                else:
                    state = "off"
            start = i
        # NOTE(review): an event still "on" when the window loop ends is never
        # printed -- confirm whether trailing events should be flushed.
def get_coverage(bamfile: str, chrom: str, start: int, end: int) -> pd.DataFrame:
    """Get read depth from a bam file at the given location.

    Args:
        bamfile (str): path to an indexed bam file
        chrom (str): chromosome name
        start (int): start position of interest
        end (int): end position of interest

    Returns:
        pd.DataFrame: a dataframe with chromosome, position and depth columns
    """
    # NOTE: the original annotated `bamfile` as list, but the value is passed
    # straight to pysam.AlignmentFile, which takes a single path.
    mybam = pysam.AlignmentFile(bamfile)
    df = pd.DataFrame(
        pysamstats.load_coverage(
            mybam,
            chrom=chrom,
            start=start,
            end=end,
            truncate=True,  # restrict rows to exactly [start, end]
            pad=True,       # emit rows for zero-coverage positions too
            fields=["chrom", "pos", "reads_all"],
        )
    )
    # pysamstats returns chrom as bytes; decode to a plain string column.
    df["chrom"] = df["chrom"].apply(lambda x: x.decode())
    df.rename({"reads_all": "depth"}, inplace=True, axis=1)
    return df
def check_mapping_stats(bam, out_csv, sep="\t"):
    """Compute mapping depth/coverage summary stats for one BAM and write a CSV.

    Args:
        bam: path to a BAM file (indexed on the fly if the .bai is missing).
        out_csv: path of the CSV file to write.
        sep: column separator for the output CSV.
    """
    from numpy import median, mean
    from pysamstats import load_coverage
    dico_size_ref_genome = {}
    dicoResume = defaultdict(OrderedDict)
    # for bam in bam_files:
    # Make sure the BAM is indexed before opening it for pileup.
    if not os.path.exists(Path(bam).as_posix() + ".bai"):
        pysam.index(Path(bam).as_posix())
    sample = Path(bam).stem
    print(f"\n\n{'*'*30}\nSAMPLE NAME: {sample}\n{'*'*30}\n\n")
    bam_file = pysam.AlignmentFile(bam, "r")
    # Recover the reference FASTA name from the aligner command line stored in
    # the @PG header.  Raw string: "\." in a plain literal is an invalid escape
    # sequence (SyntaxWarning/DeprecationWarning on modern Python).
    name_fasta_ref = Path(
        re.findall(r"[/].*\.fasta", bam_file.header["PG"][0]["CL"],
                   flags=re.IGNORECASE)[0]).stem
    if name_fasta_ref not in dico_size_ref_genome:
        # Genome size = sum of the @SQ contig lengths.
        dico_size_ref_genome[name_fasta_ref] = sum(
            dico["LN"] for dico in bam_file.header["SQ"])
    a = load_coverage(bam_file, pad=True)
    df = pd.DataFrame(a)
    df.chrom = df.chrom.str.decode(encoding='UTF-8')
    # Only positions with at least one mapped read count towards depth stats.
    listMap = df[df.reads_all >= 1].reads_all
    dicoResume[sample]["Mean mapping Depth coverage"] = f"{mean(listMap):.2f}"
    dicoResume[sample][
        "Median mapping Depth coverage"] = f"{median(listMap):.2f}"
    dicoResume[sample][
        "Mean Genome Coverage"] = f"{(len(listMap)/dico_size_ref_genome[name_fasta_ref])*100:.2f}%"
    dataframe_mapping_stats = pd.DataFrame.from_dict(dicoResume, orient='index')
    with open(out_csv, "w") as out_csv_file:
        # print(f"Library size:\n{dataframe_mapping_stats}\n")
        dataframe_mapping_stats.to_csv(out_csv_file, index=True, sep=sep)
def test_load_cov_using_steppers():
    """Each pileup stepper yields its own expected counts; bad names raise.

    This fixture is the only one whose counts differ between the
    'all' and 'nofilter' steppers.
    """
    path = "fixture/longcontignames.bam"
    contig = 'AS2_scf7180000695891'
    site = 14311
    expected = [
        ("all", 7, 4),
        ("nofilter", 8, 5),
        ("samtools", 4, 4),
    ]
    for stepper, want_all, want_pp in expected:
        cov = pysamstats.load_coverage(Samfile(path), chrom=contig,
                                       stepper=stepper, pad=True)
        eq_(want_all, cov[site]["reads_all"])
        eq_(want_pp, cov[site]["reads_pp"])
    # An unknown stepper name is rejected outright.
    with assert_raises(ValueError):
        pysamstats.load_coverage(Samfile(path), chrom=contig,
                                 stepper="notastepper")
def main():
    # Scan each contig of the UMB alignment for runs of elevated
    # properly-paired coverage and report them to stdout and a text file.
    mybam = pysam.AlignmentFile('UMB.aligned.sorted.bam')
    file = 'output_std_umb_contig_final.txt'
    with open(file, "w") as f:
        print("Index Chrom Pos Thresh Diff", file=f)
        for contig in mybam.header.references:
            # Do the analysis per contig
            stats = pysamstats.load_coverage(mybam, chrom=contig)
            reads = stats.reads_pp      # properly-paired depth per position
            position = stats.pos
            chrom = stats.chrom         # contig name per position (bytes)
            #print(len(reads))
            #print(len(position))
            #print(len(chrom))
            # threshold() is defined elsewhere; presumably returns
            # (upper, lower) depth cutoffs -- TODO confirm.
            thresh_up, thresh_down = threshold(reads)
            #print(thresh_up)
            #print(thresh_down)
            start = 0
            big_start = 0
            last_end = 0
            state = "base"
            # 1 kb windows; consecutive windows whose mean depth exceeds the
            # upper threshold are merged into one event big_start..prev_i.
            for i in range(1000, len(reads), 1000):
                mean = mean_of_array(reads, start, i)
                if mean >= thresh_up:
                    if last_end != start:
                        # Gap since the last elevated window: new event starts here.
                        big_start = start
                    last_end = i
                    state = "on"
                    prev_i = i
                else:
                    if state == "on":
                        # Event ended: report its span on screen and in the file.
                        print(
                            f'{chrom[big_start].decode("utf-8")}:{position[big_start]} {chrom[prev_i].decode("utf-8"):>20}:{position[prev_i]}'
                        )
                        print(
                            f'{chrom[big_start].decode("utf-8")}:{position[big_start]} {chrom[prev_i].decode("utf-8"):>20}:{position[prev_i]}',
                            file=f)
                        state = "off"
                    else:
                        state = "off"
                start = i
            # NOTE(review): an event still "on" at the end of a contig is never
            # reported -- confirm whether trailing events should be flushed.
def CalculateCoverage(in_bamFile, in_bamLegend, in_selectTrans, in_transLengthDict,
                      in_startCodonCoorDict, in_stopCodonCoorDict, in_readLengths,
                      in_readOffset, output_prefix):
    # Write per-transcript raw and RPM-normalised read-depth tables
    # (<prefix>_raw_depth.txt / <prefix>_RPM_depth.txt), one tab-separated
    # row per transcript: name followed by a depth value per position.
    pysamFile = pysam.AlignmentFile(in_bamFile, "rb")
    pysamFile_trans = pysamFile.references
    # Only keep transcripts that are actually present in the BAM header.
    in_selectTrans = set(pysamFile_trans).intersection(in_selectTrans)
    trans_set = set()
    all_counts = 0
    if output_prefix:
        outputFileName = output_prefix + "_" + in_bamLegend
    else:
        outputFileName = in_bamLegend
    # First pass: accumulate the library-wide read total used for RPM scaling.
    for trans in in_startCodonCoorDict.keys():
        leftCoor = int(in_startCodonCoorDict[trans]) - 1
        rightCoor = int(in_stopCodonCoorDict[trans]) - 3
        (trans_counts, read_counts_frameSum, total_reads, cds_reads) = get_trans_frame_counts(
            pysamFile, trans, in_readLengths, in_readOffset,
            in_transLengthDict[trans], leftCoor, rightCoor)
        all_counts += total_reads  ## total_reads for transcript level
    # Second pass: per transcript, dump raw depth and RPM depth side by side.
    with open(outputFileName + "_raw_depth.txt", 'w') as f1, open(outputFileName + "_RPM_depth.txt", 'w') as f2:
        for trans in in_selectTrans:
            # print(trans)
            tmpTrans = pysamstats.load_coverage(pysamFile, chrom=trans, pad=True)
            # Positional field index 2 of each coverage record is assumed to be
            # reads_all -- TODO confirm against the pysamstats coverage dtype.
            tmpTransRaw = np.array([i[2] for i in tmpTrans])
            # Reads-per-million normalisation against the whole library.
            tmpTransRPM = 10**6 * (tmpTransRaw / all_counts)
            trans_set.add(trans)
            f1.write("%s\t" % (trans))
            f2.write("%s\t" % (trans))
            for i in range(len(tmpTrans)):
                f1.write("%s\t" % (str(tmpTransRaw[i])))
                f2.write("%s\t" % (str(tmpTransRPM[i])))
            f1.write("\n")
            f2.write("\n")
    print("There are about " + str(len(trans_set)) + " transcripts used for coverage calculation!", file=sys.stderr)
def main():
    """Scan genome-wide coverage for abrupt depth jumps and log them.

    Writes flagged positions to output9.txt and echoes them to stdout.
    """
    mybam = pysam.AlignmentFile('UMB.aligned.sorted.bam')
    # iterate over statistics, one record at a time
    #stats = pysamstats.load_coverage(mybam, chrom='NZ_CP023386.1', start=(2500000), end=3000000)
    stats = pysamstats.load_coverage(mybam)
    reads = stats.reads_all
    position = stats.pos
    chrom = stats.chrom
    print(len(reads))
    print(len(position))
    print(len(chrom))
    with open("output9.txt", "w") as f:
        print("Index Chrom Pos Thresh Diff", file=f)
        for i in range(len(reads)):
            # Refresh the local threshold every 200 positions.
            if (i % 200) == 0:
                thresh = threshold(reads, i)
            # Progress heartbeat every 50k positions.
            # FIX: the original used str(chrom[i]).lstrip("b'").rstrip("'"),
            # but lstrip/rstrip strip *character sets*, not prefixes, so
            # contig names beginning with 'b' or a quote were mangled.
            # Decoding the bytes value directly is the correct conversion.
            if (i % 50000) == 0:
                print('currently: ' + str(i) + ' ' + chrom[i].decode() + ' ' +
                      str(position[i]) + ' ' + str(datetime.datetime.now().time()))
            if i == (len(reads)) - 1:
                # Last position: wrap around and compare against the first.
                current = reads[i]
                next_one = reads[0]
            else:
                current = reads[i]
                next_one = reads[i + 1]
            if abs(next_one - current) > thresh:
                # Same record goes to the file and to stdout.
                record = (str(i) + ' ' + chrom[i].decode() + ' ' + str(position[i]) +
                          ' ' + str(thresh) + ' ' + str(next_one - current))
                print(record, file=f)
                print(record)
def get_coverage(bamfile: str, chrom: str, start: int, end: int,
                 sample_rate=1) -> pd.DataFrame:
    """Get read depth from a bam file at the given location.

    Args:
        bamfile (str): path to an indexed bam file
        chrom (str): chromosome name
        start (int): start position of interest
        end (int): end position of interest
        sample_rate (int): keep only every sample_rate-th row (1 keeps all)

    Returns:
        pd.DataFrame: a dataframe with chromosome, position and depth columns
    """
    mybam = pysam.AlignmentFile(bamfile)
    df = pd.DataFrame(
        pysamstats.load_coverage(
            mybam,
            chrom=chrom,
            start=start,
            end=end,
            truncate=True,  # restrict rows to exactly [start, end]
            pad=True,       # emit rows for zero-coverage positions too
            fields=["chrom", "pos", "reads_all"],
        ))
    # pysamstats returns chrom as bytes; decode to a plain string column.
    df["chrom"] = df["chrom"].apply(lambda x: x.decode())
    df.rename({"reads_all": "depth"}, inplace=True, axis=1)
    # keep line every sample_rate row
    df = df[df.index % sample_rate == 0]
    return df
def _av_depth(self, chrom, start, stop):
    """Return the mean read depth across [start, stop) of *chrom*."""
    coverage = pysamstats.load_coverage(
        self.aln,
        chrom=chrom,
        start=start,
        end=stop,
        truncate=True,      # only positions inside the requested window
        max_depth=300000,   # raise the pileup depth cap for deep regions
    )
    return mean(coverage.reads_all)
samples = [] with open(inputfile, 'r') as f: for line in csv.reader(f, delimiter='\t'): samples.append(line) suspectdel = [] for i in range(0, len(samples)): bam = pysam.AlignmentFile(samples[i][1], 'rb') #Coverage of HRP and neighboring gene# exoncov = [] for k in range(0, len(exons)): ecoverage = pss.load_coverage(bam, reffile, pad=True, truncate=True, chrom=exons[k][0], start=exons[k][1], end=exons[k][2]) exonmean = np.mean(ecoverage.reads_all) exoncov.append([exonmean, exonmean / float(samples[i][2])]) with open('%s/HRPcov_%s.txt' % (outfolder, samples[i][0]), 'w') as f: f.write( 'chromosome\tstart\tend\tgeneID\tname\tcoverage\tnormalizedcov\n') for m in range(0, len(exons)): f.write('%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\n' % (exons[m][0], exons[m][1], exons[m][2], exons[m][3], exons[m][4], exoncov[m][0], exoncov[m][1])) #Flag suspected deletion# for n in range(0, len(exoncov)): if exoncov[n][1] < cutoff:
def pull_down(name, row):
    # Load per-position coverage for one miRNA precursor from several BAMs,
    # sum the read counts, plot them as a colour-coded bar chart (mature /
    # star / loop regions), and return the coverage for `mybam`.
    # NOTE(review): relies on module-level globals (csvwriter, rows, mybam,
    # G006_bam, G002_bam, BTIRed_bam, plt, np) -- confirm they are defined
    # before this is called.
    row[0] = '{0}_{1}'.format(name, row[0])
    csvwriter.writerow(row)
    rows.append(row)
    chrom = row[0]
    start = 1
    # chrom = row[12].split(':')[0]
    # start = int(row[12].split(':')[1].split('-')[0])
    # end = int(row[12].split(':')[1].split('-')[1])
    # mybam_lc = pysamstats.load_coverage(mybam, chrom = chrom, start = start,
    #                                     end = end)
    mybam_lc = pysamstats.load_coverage(mybam, chrom = chrom)
    G006_bam_lc = pysamstats.load_coverage(G006_bam, chrom = chrom)
    G002_bam_lc = pysamstats.load_coverage(G002_bam, chrom = chrom)
    BTIRed_bam_lc = pysamstats.load_coverage(BTIRed_bam, chrom = chrom)
    # Sequence columns of the input row: mature/star arm and full precursor.
    mature = row[3]
    star = row[5]
    precursor = row[10]
    end = len(precursor)
    # Lower-case letters in mature/star appear to mark entries to skip --
    # TODO confirm against the upstream table format.
    if any(m.islower() for m in mature) or any(s.islower() for s in star):
        pass
    else:
        # Locate the mature and star arms inside the precursor sequence.
        m_start = precursor.index(mature)
        m_end = m_start + len(mature)
        s_start = precursor.index(star)
        s_end = s_start + len(star)
        # if len(mybam_lc.pos) > 0:
        if (len(G006_bam_lc.pos) > 0) or (len(G002_bam_lc.pos) > 0) or \
                (len(BTIRed_bam_lc.pos) > 0):
            positions = np.arange(start, end + 1)
            read_counts = []
            # Base counts from G006; positions absent from the pileup get 0.
            G006_bam_lc_pos = list(G006_bam_lc.pos)
            for entry in positions:
                if entry in G006_bam_lc_pos:
                    read_counts.append(G006_bam_lc.reads_all[G006_bam_lc_pos.index(entry)])
                else:
                    read_counts.append(0)
            # Add G002 counts in place (positions are 1-based, list is 0-based).
            G002_bam_lc_pos = list(G002_bam_lc.pos)
            for entry in positions:
                if entry in G002_bam_lc_pos:
                    pos_ind = entry - 1
                    read_counts[pos_ind] += G002_bam_lc.reads_all[G002_bam_lc_pos.index(entry)]
                else:
                    pass
            # Add BTIRed counts in place.
            BTIRed_bam_lc_pos = list(BTIRed_bam_lc.pos)
            for entry in positions:
                if entry in BTIRed_bam_lc_pos:
                    pos_ind = entry - 1
                    read_counts[pos_ind] += BTIRed_bam_lc.reads_all[BTIRed_bam_lc_pos.index(entry)]
                else:
                    pass
            # print(read_counts)
            # (A large commented-out plt.plot alternative rendering was here.)
            fig = plt.figure(figsize = (15, 15))
            ax = plt.subplot(1, 1, 1)
            # One bar per precursor position; arms are recoloured below.
            barlist = plt.bar(positions, read_counts, width = 1.0, color = 'black')
            for bar in barlist[m_start:m_end + 1]:
                bar.set_color('red')
            for bar in barlist[s_start:s_end + 1]:
                bar.set_color('green')
            # Colour the loop region (between whichever arm comes first).
            if m_start > s_start:
                for bar in barlist[s_end + 1:m_start]:
                    bar.set_color('yellow')
            elif s_start > m_start:
                for bar in barlist[m_end + 1:s_start]:
                    bar.set_color('yellow')
            else:
                pass
            # plt.legend()
            ind = np.arange(max(read_counts) + 1)
            plt.yticks(ind, ind)
            plt.xlabel('Position')
            plt.ylabel('Reads Mapped')
            plt.title('{}\n'.format(row[0]))
            ax.spines['right'].set_visible(False)  # Removes right axis
            ax.spines['top'].set_visible(False)  # Removes top axis
            ax.yaxis.set_ticks_position('none')  # Keeps vertical ticks hidden
            ax.xaxis.set_ticks_position('bottom')  # Keeps horizontal ticks hidden on top
            plt.savefig('/nethome/mct30/mapping_svgs/{0}_plant-precursors_bowtie1_a_T__mm0_gap0_split.svg'.format(row[12]),
                        bbox_inches = 'tight', format = 'svg')
            plt.show()
            plt.close('all')
        else:
            pass
        # NOTE(review): if the branch above was not taken, read_counts is
        # unbound here and the next line raises NameError.
        mature_read_counts = read_counts[m_start : m_end + 1]
        star_read_counts = read_counts[s_start : s_end + 1]
        # NOTE(review): the six assignments below are bare slice literals
        # ([a : b] with no sequence), which is a SyntaxError -- they almost
        # certainly were meant to slice read_counts, e.g.
        # read_counts[s_end + 1 : m_start].  Confirm intent before fixing.
        if m_start > s_start:
            loop_read_counts = [s_end + 1 : m_start]
            mature_tail_read_counts = [m_end + 1 : end]
            star_tail_read_counts = [start : s_start]
        elif s_start > m_start:
            loop_read_counts = [m_end + 1 : s_start]
            star_tail_read_counts = [s_end + 1: end]
            mature_tail_read_counts = [start : m_start]
        else:
            pass
    return mybam_lc