def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)
def main(): cArgs = args() root = os.environ["RGTDATA"] if "RGTDATA" in os.environ else "/rgtdata" if not os.path.exists(root + "/{assembly}/genome_{assembly}.fa".format(assembly = cArgs.assembly)): print( "WARNING: genomic data is not present for {assembly}. We will attempt to download it.".format(assembly = cArgs.assembly), file = sys.stderr ) print( "If you are running many jobs, they might run faster if you mount the appropriate data at {root}/{assembly}.".format(root = root, assembly = cArgs.assembly), file = sys.stderr ) result = os.system("python3 /reg-gen/data/setupGenomicData.py --{assembly}".format(assembly = cArgs.assembly)) if result != 0: print("FATAL: Unable to load genome data for {assembly}.".format(assembly = cArgs.assembly), file = sys.stderr) return 1 if cArgs.occurrence_threshold is None: signal = footprint(cArgs.bam, cArgs.bed, cArgs.assembly, cArgs.ext_size, cArgs.dnase, cArgs.bias_type) else: g = GenomeData(organism = cArgs.assembly) with FilteredRegions(cArgs.bed, cArgs.occurrence_threshold, g.get_genome(), g.get_chromosome_sizes()) as b: signal = footprint(cArgs.bam, b.name, cArgs.assembly, cArgs.ext_size, cArgs.dnase, cArgs.bias_type) if cArgs.aggregate or cArgs.plot_output is not None: signal = aggregate(signal, (lambda x: "all") if cArgs.occurrence_threshold is None else lambda x: x["name"], cArgs.ext_size) if cArgs.plot_output is not None: plot(signal["all"]["forward"], signal["all"]["reverse"], cArgs.font, cArgs.plot_output) if cArgs.output_file is None: if not cArgs.output_as_tsv or not cArgs.aggregate: print(json.dumps(signal)) else: for k, v in signal.items(): if k != "all": print("%s\t%s\t%s" % (k, ','.join([ str(x) for x in v["forward"] ]), ','.join([ str(x) for x in v["reverse"] ]))) else: with open(cArgs.output_file, 'w') as o: if not cArgs.output_as_tsv or not cArgs.aggregate: o.write(json.dumps(signal) + '\n') else: for k, v in signal.items(): if k != "all": o.write("%s\t%s\t%s\n" % (k, ','.join([ str(x) for x in v["forward"] ]), ','.join([ str(x) for x in v["reverse"] ]))) return 0
def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
def get_bc_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() bam = Samfile(args.input_files[0], "rb") genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) hmm_data = HmmData() if args.bias_table: bias_table_list = args.bias_table.split(",") bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) else: table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) if args.strand_specific: fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix)) fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix)) f_forward = open(fname_forward, "a") f_reverse = open(fname_reverse, "a") for region in regions: signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=True) if args.norm: signal_f = reads_file.boyle_norm(signal_f) perc = scoreatpercentile(signal_f, 98) std = np.std(signal_f) signal_f = reads_file.hon_norm_atac(signal_f, perc, std) signal_r = reads_file.boyle_norm(signal_r) perc = scoreatpercentile(signal_r, 98) std = np.std(signal_r) signal_r = reads_file.hon_norm_atac(signal_r, perc, std) f_forward.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n") f_reverse.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n") f_forward.close() f_reverse.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}_forward.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_forward) bw_filename = os.path.join(args.output_location, "{}_reverse.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_reverse) else: output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) with open(output_fname, "a") as output_f: for region in regions: signal = reads_file.get_bc_signal_by_fragment_length(ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=False) if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
import pandas as pd from pysam import Samfile from rgt.GenomicRegionSet import GenomicRegionSet from rgt.Util import GenomeData tf_file = sys.argv[1] bam_file = sys.argv[2] output_file = sys.argv[3] gr_tfs = GenomicRegionSet(name="TFs") gr_tfs.read(filename=tf_file) gr_genes = gr_tfs.gene_association(organism="hg38") # Fetching chromosome sizes genome_data = GenomeData("hg38") chrom_sizes_file_name = genome_data.get_chromosome_sizes() chrom_sizes_file = open(chrom_sizes_file_name, "r") chrom_sizes_dict = dict() for chrom_sizes_entry_line in chrom_sizes_file: chrom_sizes_entry_vec = chrom_sizes_entry_line.strip().split("\t") chrom_sizes_dict[chrom_sizes_entry_vec[0]] = int(chrom_sizes_entry_vec[1]) chrom_sizes_file.close() bam = Samfile(bam_file, "rb") tf_list = list() gene_list = list() tc_list = list() for i, r in enumerate(gr_tfs): tf = r.name.split(".")[-1]
def get_bc_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() bam = Samfile(args.input_files[0], "rb") genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) hmm_data = HmmData() if args.bias_table: bias_table_list = args.bias_table.split(",") bias_table = BiasTable().load_table( table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) else: table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) if args.strand_specific: fname_forward = os.path.join( args.output_location, "{}_forward.wig".format(args.output_prefix)) fname_reverse = os.path.join( args.output_location, "{}_reverse.wig".format(args.output_prefix)) f_forward = open(fname_forward, "a") f_reverse = open(fname_reverse, "a") for region in regions: signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=True) if args.norm: signal_f = reads_file.boyle_norm(signal_f) perc = scoreatpercentile(signal_f, 98) std = np.std(signal_f) signal_f = reads_file.hon_norm_atac(signal_f, perc, std) signal_r = reads_file.boyle_norm(signal_r) perc = scoreatpercentile(signal_r, 98) std = np.std(signal_r) signal_r = reads_file.hon_norm_atac(signal_r, perc, std) f_forward.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n") f_reverse.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n") f_forward.close() f_reverse.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join( args.output_location, "{}_forward.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(fname_forward) bw_filename = os.path.join( args.output_location, "{}_reverse.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(fname_reverse) else: output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) with open(output_fname, "a") as output_f: for region in regions: signal = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=False) if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write( "fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)