def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)
def merge_delete(ext_size, merge, peak_list, pvalue_list): # peaks_gain = read_diffpeaks(path) regions_plus = GenomicRegionSet('regions') #pot. mergeable regions_minus = GenomicRegionSet('regions') #pot. mergeable regions_unmergable = GenomicRegionSet('regions') last_orientation = "" for i, t in enumerate(peak_list): chrom, start, end, c1, c2, strand, ratio = t[0], t[1], t[2], t[3], t[4], t[5], t[6] r = GenomicRegion(chrom = chrom, initial = start, final = end, name = '', \ orientation = strand, data = str((c1, c2, pvalue_list[i], ratio))) if end - start > ext_size: if strand == '+': if last_orientation == '+': region_plus.add(r) else: regions_unmergable.add(r) elif strand == '-': if last_orientation == '-': region_mins.add(r) else: regions_unmergable.add(r) if merge: regions_plus.extend(ext_size/2, ext_size/2) regions_plus.merge() regions_plus.extend(-ext_size/2, -ext_size/2) merge_data(regions_plus) regions_minus.extend(ext_size/2, ext_size/2) regions_minus.merge() regions_minus.extend(-ext_size/2, -ext_size/2) merge_data(regions_minus) results = GenomicRegionSet('regions') for el in regions_plus: results.add(el) for el in regions_minus: results.add(el) for el in regions_unmergable: results.add(el) results.sort() return results
def merge_delete(ext_size, merge, peak_list, pvalue_list): # peaks_gain = read_diffpeaks(path) regions_plus = GenomicRegionSet('regions') #pot. mergeable regions_minus = GenomicRegionSet('regions') #pot. mergeable regions_unmergable = GenomicRegionSet('regions') last_orientation = "" for i, t in enumerate(peak_list): chrom, start, end, c1, c2, strand, ratio = t[0], t[1], t[2], t[3], t[ 4], t[5], t[6] r = GenomicRegion(chrom = chrom, initial = start, final = end, name = '', \ orientation = strand, data = str((c1, c2, pvalue_list[i], ratio))) if end - start > ext_size: if strand == '+': if last_orientation == '+': region_plus.add(r) else: regions_unmergable.add(r) elif strand == '-': if last_orientation == '-': region_mins.add(r) else: regions_unmergable.add(r) if merge: regions_plus.extend(ext_size / 2, ext_size / 2) regions_plus.merge() regions_plus.extend(-ext_size / 2, -ext_size / 2) merge_data(regions_plus) regions_minus.extend(ext_size / 2, ext_size / 2) regions_minus.merge() regions_minus.extend(-ext_size / 2, -ext_size / 2) merge_data(regions_minus) results = GenomicRegionSet('regions') for el in regions_plus: results.add(el) for el in regions_minus: results.add(el) for el in regions_unmergable: results.add(el) results.sort() return results
def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
def get_bc_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() bam = Samfile(args.input_files[0], "rb") genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) hmm_data = HmmData() if args.bias_table: bias_table_list = args.bias_table.split(",") bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) else: table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) if args.strand_specific: fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix)) fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix)) f_forward = open(fname_forward, "a") f_reverse = open(fname_reverse, "a") for region in regions: signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=True) if args.norm: signal_f = reads_file.boyle_norm(signal_f) perc = scoreatpercentile(signal_f, 98) std = np.std(signal_f) signal_f = reads_file.hon_norm_atac(signal_f, perc, std) signal_r = reads_file.boyle_norm(signal_r) perc = scoreatpercentile(signal_r, 98) std = np.std(signal_r) signal_r = reads_file.hon_norm_atac(signal_r, perc, std) f_forward.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n") f_reverse.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n") f_forward.close() f_reverse.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}_forward.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_forward) bw_filename = os.path.join(args.output_location, "{}_reverse.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_reverse) else: output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) with open(output_fname, "a") as output_f: for region in regions: signal = reads_file.get_bc_signal_by_fragment_length(ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=False) if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
def get_bc_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() bam = Samfile(args.input_files[0], "rb") genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) hmm_data = HmmData() if args.bias_table: bias_table_list = args.bias_table.split(",") bias_table = BiasTable().load_table( table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) else: table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) if args.strand_specific: fname_forward = os.path.join( args.output_location, "{}_forward.wig".format(args.output_prefix)) fname_reverse = os.path.join( args.output_location, "{}_reverse.wig".format(args.output_prefix)) f_forward = open(fname_forward, "a") f_reverse = open(fname_reverse, "a") for region in regions: signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=True) if args.norm: signal_f = reads_file.boyle_norm(signal_f) perc = scoreatpercentile(signal_f, 98) std = np.std(signal_f) signal_f = reads_file.hon_norm_atac(signal_f, perc, std) signal_r = reads_file.boyle_norm(signal_r) perc = scoreatpercentile(signal_r, 98) std = np.std(signal_r) signal_r = reads_file.hon_norm_atac(signal_r, perc, std) f_forward.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n") f_reverse.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n") f_forward.close() f_reverse.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join( args.output_location, "{}_forward.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(fname_forward) bw_filename = os.path.join( args.output_location, "{}_reverse.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(fname_reverse) else: output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) with open(output_fname, "a") as output_f: for region in regions: signal = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=False) if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)
def call_peaks(bam, csizes, pval, min_reads, binsize, cfile=None): ''' Call peaks on bam file using pvalue and binomial model. Returns GenomeRegionSet with peaks, and CoverageSet with signal. ''' # make chromsizes region set rs = get_chrom_sizes_as_genomicregionset(csizes) print("calculating extension sizes...") # calculate ext size ext, _ = get_extension_size(bam, start=0, end=300, stepsize=5) print("calculating coverage...") # calc coverage cov = CoverageSet('coverageset', rs) cov.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True) # calculate cov2 for output bw cov2 = CoverageSet('coverageset2', rs) cov2.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True, binsize=binsize, stepsize=binsize // 2) if cfile is not None: print(f"Using control file: {cfile}") control = CoverageSet('contorl', rs) control.coverage_from_bam(bam_file=cfile, extension_size=ext) with np.errstate(divide='ignore', invalid='ignore'): norm_igg(cov, control) # recalc overall coverage cov.overall_cov = reduce(lambda x, y: np.concatenate( (x, y)), [cov.coverage[i] for i in range(len(cov.genomicRegions))]) # total coverage s = np.sum(cov.overall_cov) # probability of event, a read in a bin, (avg reads/bin )/libsize p = np.mean(cov.overall_cov[cov.overall_cov > 0]) / s # what is the max coverage maxcov = np.max(cov.overall_cov) # create dict with probability for each count value mc = np.arange(0, maxcov + 1, dtype="object") d = {count: binom_test((count, s - count), p=p) for count in mc} # create GenomicRegionSet to hold peaks res = GenomicRegionSet('identified_peaks') print("calculating peaks...") # iterate through bins in genome, store peaks for i, c in enumerate(cov.overall_cov): if filter_bins(c, d, min_reads): chrom, s, e = cov.index2coordinates(i, rs) res.add(GenomicRegion(chrom, s, e + 1, data=d[c])) # merge ol peaks res.merge() # merge peaks within ext dist rc = res.cluster(ext) return rc, cov, cov2