def filter_deadzones(bed_deadzones, peak_regions): """Filter by peaklist by deadzones""" deadzones = GenomicRegionSet('deadzones') deadzones.read_bed(bed_deadzones) peak_regions = peak_regions.subtract(deadzones, whole_region=True) return peak_regions
def __init__(self, rna_fasta, rna_name, dna_region, organism, showdbs=False): self.organism = organism genome = GenomeData(organism) self.genome_path = genome.get_genome() # RNA: Path to the FASTA file self.rna_fasta = rna_fasta self.showdbs = showdbs rnas = SequenceSet(name="rna", seq_type=SequenceType.RNA) rnas.read_fasta(self.rna_fasta) if rna_name: self.rna_name = rna_name else: self.rna_name = rnas[0].name # DNA: GenomicRegionSet self.dna_region = GenomicRegionSet(name="target") self.dna_region.read_bed(dna_region) self.dna_region = self.dna_region.gene_association( organism=self.organism, show_dis=True) self.topDBD = [] self.stat = OrderedDict(name=rna_name, genome=organism) self.stat["target_regions"] = str(len(self.dna_region))
def estimate_bias_vom(args): regions = GenomicRegionSet("regions") regions.read(args.regions_file) create_signal(args, regions) hmm_data = HmmData() learn_dependency_model = hmm_data.get_dependency_model() slim_dimont_predictor = hmm_data.get_slim_dimont_predictor() test_fa = hmm_data.get_default_test_fa() shutil.copy(test_fa, args.output_location) os.chdir(args.output_location) print((os.getcwd())) output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb))) output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb))) output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb))) output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb))) infix = "{}_f_obs".format(str(args.k_nb)) create_model(args, output_fname_f_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_f_exp".format(str(args.k_nb)) create_model(args, output_fname_f_exp, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_obs".format(str(args.k_nb)) create_model(args, output_fname_r_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_exp".format(str(args.k_nb)) create_model(args, output_fname_r_exp, infix, learn_dependency_model, slim_dimont_predictor) os.remove(os.path.join(args.output_location, "test.fa")) compute_bias(args)
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts, \ inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content, \ tracker, debug, norm_regions, scaling_factors_ip, save_wig, housekeeping_genes, \ test, report, chrom_sizes_dict, counter, end, gc_content_cov=None, avg_gc_content=None, \ gc_hist=None, output_bw=True, save_input=False, m_threshold=80, a_threshold=95, rmdup=False): """Initialize the MultiCoverageSet""" regionset = regions regionset.sequences.sort() if norm_regions: norm_regionset = GenomicRegionSet('norm_regions') norm_regionset.read_bed(norm_regions) else: norm_regionset = None exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, report) multi_cov_set = MultiCoverageSet(name=name, regions=regionset, dims=dims, genome_path=genome_path, binsize=binsize, stepsize=stepsize, rmdup=rmdup, path_bamfiles=bamfiles, path_inputs=inputs, exts=exts, exts_inputs=exts_inputs, factors_inputs=factors_inputs, chrom_sizes=chrom_sizes, verbose=verbose, no_gc_content=no_gc_content, chrom_sizes_dict=chrom_sizes_dict, debug=debug, norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip, save_wig=save_wig, strand_cov=True, housekeeping_genes=housekeeping_genes, tracker=tracker, gc_content_cov=gc_content_cov, avg_gc_content=avg_gc_content, gc_hist=gc_hist, end=end, counter=counter, output_bw=output_bw, folder_report=FOLDER_REPORT, report=report, save_input=save_input, m_threshold=m_threshold, a_threshold=a_threshold) return multi_cov_set
def estimate_bias_vom(args): regions = GenomicRegionSet("regions") regions.read(args.regions_file) create_signal(args, regions) hmm_data = HmmData() learn_dependency_model = hmm_data.get_dependency_model() slim_dimont_predictor = hmm_data.get_slim_dimont_predictor() test_fa = hmm_data.get_default_test_fa() shutil.copy(test_fa, args.output_location) os.chdir(args.output_location) print(os.getcwd()) output_fname_f_obs = os.path.join(args.output_location, "{}_f_obs.fa".format(str(args.k_nb))) output_fname_f_exp = os.path.join(args.output_location, "{}_f_exp.fa".format(str(args.k_nb))) output_fname_r_obs = os.path.join(args.output_location, "{}_r_obs.fa".format(str(args.k_nb))) output_fname_r_exp = os.path.join(args.output_location, "{}_r_exp.fa".format(str(args.k_nb))) infix = "{}_f_obs".format(str(args.k_nb)) create_model(args, output_fname_f_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_f_exp".format(str(args.k_nb)) create_model(args, output_fname_f_exp, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_obs".format(str(args.k_nb)) create_model(args, output_fname_r_obs, infix, learn_dependency_model, slim_dimont_predictor) infix = "{}_r_exp".format(str(args.k_nb)) create_model(args, output_fname_r_exp, infix, learn_dependency_model, slim_dimont_predictor) os.remove(os.path.join(args.output_location, "test.fa")) compute_bias(args)
def fisher_table(motif_name, regions, mpbs, gene_set=False, mpbs_set=False): """ TODO Keyword arguments: motif_name -- TODO regions -- TODO mpbs -- TODO gene_set -- TODO mpbs_set -- TODO Return: a -- TODO b -- TODO gene_set -- TODO mpbs_set -- TODO """ # Fetching motif mpbs_motif = GenomicRegionSet(name="mpbs_motif") for region in mpbs.sequences: if motif_name in region.name: mpbs_motif.add(region) # Performing intersections if len(mpbs_motif) > 0: # regions which are overlapping with mpbs_motif intersect_original = regions.intersect(mpbs_motif, mode=OverlapType.ORIGINAL, rm_duplicates=True) # regions which are not overlapping with regions from mpbs_motif subtract_overlap = regions.subtract(mpbs_motif, whole_region=True) # Fetching genes if gene_set: gene_set_res = GeneSet(motif_name) for genomic_region in intersect_original.sequences: if genomic_region.name: gene_list = [e if e[0] != "." else e[1:] for e in genomic_region.name.split(":")] for g in gene_list: gene_set_res.genes.append(g) gene_set_res.genes = list(set(gene_set_res.genes)) # Keep only unique genes else: gene_set_res = None # Fetching mpbs if mpbs_set: mpbs_set_res = mpbs_motif.intersect(regions, mode=OverlapType.ORIGINAL, rm_duplicates=True) else: mpbs_set_res = None return len(intersect_original), len(subtract_overlap), gene_set_res, mpbs_set_res else: gene_set_res = GeneSet(motif_name) if gene_set else None mpbs_set_res = GenomicRegionSet(mpbs_motif.name) if mpbs_set else None return 0, len(regions), gene_set_res, mpbs_set_res
def dbd_regions(self, sig_region, output): """Generate the BED file of significant DBD regions and FASTA file of the sequences""" dbd_regions(exons=self.rna_regions, sig_region=sig_region, rna_name=self.rna_name, output=output) self.stat["DBD_all"] = str(len(self.rbss)) self.stat["DBD_sig"] = str(len(self.data["region"]["sig_region"])) sigDBD = GenomicRegionSet("DBD_sig") sigDBD.sequences = self.data["region"]["sig_region"] rbss = self.txp.get_rbs() overlaps = rbss.intersect(y=sigDBD, mode=OverlapType.ORIGINAL) self.stat["DBSs_target_DBD_sig"] = str(len(overlaps))
def main(): options, vcf_list = input() #thres_mq = 20 #thres_dp = 20 #filter_dbSNP = True #tfbs_motifs_path = '/home/manuel/workspace/cluster_p/human_genetics/exp/exp01_motifsearch_sox2/humangenetics_motifs/Match/chr11_mpbs.bed' sample_data = load_data(vcf_list) print("##Filter variants of samples", file=sys.stderr) pipeline(sample_data, options) if options.list_wt: wt_data = load_data(options.list_wt) print("##Filter variants of wildtypes", file=sys.stderr) pipeline(wt_data, options) union_wt = GenomicVariantSet(name = "union_wt") for wt in wt_data: union_wt.sequences += wt.sequences print("#wildtype variants:", file=sys.stderr) print("union WT", len(union_wt), file=sys.stderr, sep="\t") #delete Wildtype for sample in sample_data: sample.subtract(union_wt) print_length(sample_data, "#variants after subtracting wildtypes") else: print("#Do not filter by wildtype", file=sys.stderr) if options.max_density: get_max_density(GenomicVariantSets=sample_data, lowerBound=options.lower_bound, upperBound=options.upper_bound) else: print("#Do not perform max. density search", file=sys.stderr) if options.list_bed: tfbs_motifs = GenomicRegionSet('tfbs_motifs') tfbs_motifs.read_bed(options.list_bed) for sample in sample_data: sample.intersect(tfbs_motifs) print_length(sample_data, "#variants after filtering by BED file") else: print("#Do not filter by BED file", file=sys.stderr) print("#Compute intersection of sample's subsets (give intersection's name and size)") output_intersections(sample_data) print("#Write filtered sample files") for sample in sample_data: sample.write_vcf("%s-filtered.vcf" %sample.name)
def read_bed(self, bedfile, genome_file_dir): """Read the sequences defined by BED file on the given genomce. *Keyword arguments:* - bedfile -- The path to the BED file which defines the regions. - genome_file_dir -- A directory which contains the FASTA files for each chromosome. """ # Read BED into GenomicRegionSet from rgt.GenomicRegionSet import GenomicRegionSet bed = GenomicRegionSet(os.path.basename(bedfile)) bed.read_bed(bedfile) self.read_genomic_set(bed, genome_file_dir)
def merge_DBD_regions(path): """Merge all available DBD regions in BED format. """ for t in os.listdir(path): if os.path.isdir(os.path.join(path, t)): dbd_pool = GenomicRegionSet(t) for rna in os.listdir(os.path.join(path,t)): f = os.path.join(path, t, rna, "DBD_"+rna+".bed") if os.path.exists(f): dbd = GenomicRegionSet(rna) dbd.read_bed(f) for r in dbd: r.name = rna+"_"+r.name dbd_pool.combine(dbd) dbd_pool.write_bed(os.path.join(path, t, "DBD_"+t+".bed"))
def __init__(self, reads_file=None, regions_file=None, genome_file=None, k_nb=None, forward_shift=None, reverse_shift=None, bias_type=None, output_location=None): """ Initializes BiasTable. """ self.regions = GenomicRegionSet("Bias Regions") self.reads_file = reads_file if regions_file is not None: self.regions.read(regions_file) self.genome_file = genome_file self.k_nb = k_nb self.forward_shift = forward_shift self.reverse_shift = reverse_shift self.bias_type = bias_type self.output_location = output_location
class BiasTable: """ Represent a bias table. Authors: Eduardo G. Gusmao. """ def __init__(self, reads_file=None, regions_file=None, genome_file=None, k_nb=None, forward_shift=None, reverse_shift=None, bias_type=None, output_location=None): """ Initializes BiasTable. """ self.regions = GenomicRegionSet("Bias Regions") self.reads_file = reads_file if regions_file is not None: self.regions.read(regions_file) self.genome_file = genome_file self.k_nb = k_nb self.forward_shift = forward_shift self.reverse_shift = reverse_shift self.bias_type = bias_type self.output_location = output_location def load_table(self, table_file_name_F, table_file_name_R): """ Creates a bias table from a tab separated file with a k-mer and bias estimate in each line. Keyword arguments: table_file_name -- Table file name. Return: bias_table_F, bias_table_R -- Bias tables. """ bias_table_F = dict() table_file_F = open(table_file_name_F, "r") for line in table_file_F: ll = line.strip().split("\t") bias_table_F[ll[0]] = float(ll[1]) table_file_F.close() bias_table_R = dict() table_file_R = open(table_file_name_R, "r") for line in table_file_R: ll = line.strip().split("\t") bias_table_R[ll[0]] = float(ll[1]) table_file_R.close() return [bias_table_F, bias_table_R]
def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
def get_training_regionset(self): r = GenomicRegionSet('') r.add(self.regionset[self.counter]) if self.counter == len(self.chrom_sizes_dict): return None else: self.counter += 1 return r #if regions option is set, take the values, otherwise the whole set of #chromosomes as region to search for DPs # if test: # contained_chrom = ['chr1', 'chr2'] # else: # #contained_chrom = get_all_chrom(bamfiles) # contained_chrom = ['chr1', 'chr2']
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts, \ inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content, \ tracker, debug, norm_regions, scaling_factors_ip, save_wig): """Initialize the MultiCoverageSet""" regionset = GenomicRegionSet(name) chrom_sizes_dict = {} #if regions option is set, take the values, otherwise the whole set of #chromosomes as region to search for DPs if regions is not None: print("Call DPs on specified regions.", file=sys.stderr) with open(regions) as f: for line in f: line = line.strip() line = line.split('\t') c, s, e = line[0], int(line[1]), int(line[2]) regionset.add(GenomicRegion(chrom=c, initial=s, final=e)) chrom_sizes_dict[c] = e else: print("Call DPs on whole genome.", file=sys.stderr) with open(chrom_sizes) as f: for line in f: line = line.strip() line = line.split('\t') chrom, end = line[0], int(line[1]) regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end)) chrom_sizes_dict[chrom] = end if norm_regions: norm_regionset = GenomicRegionSet('norm_regions') norm_regionset.read_bed(norm_regions) else: norm_regionset = None regionset.sequences.sort() exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose) tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)") multi_cov_set = MultiCoverageSet(name=name, regions=regionset, dims=dims, genome_path=genome_path, binsize=binsize, stepsize=stepsize,rmdup=True,\ path_bamfiles = bamfiles, path_inputs = inputs, exts = exts, exts_inputs = exts_inputs, factors_inputs = factors_inputs, \ chrom_sizes=chrom_sizes, verbose=verbose, no_gc_content=no_gc_content, chrom_sizes_dict=chrom_sizes_dict, debug=debug, \ norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip, save_wig=save_wig) return multi_cov_set
def get_bc_signal(arguments): (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, window_size, forward_shift, reverse_shift, bias_table1, bias_table2) = arguments mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() bam1 = Samfile(reads_file1, "rb") bam2 = Samfile(reads_file2, "rb") genome_data = GenomeData(organism) fasta = Fastafile(genome_data.get_genome()) signal_1 = np.zeros(window_size) signal_2 = np.zeros(window_size) motif_len = None pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), ("G", [0.0] * window_size), ("T", [0.0] * window_size), ("N", [0.0] * window_size)]) mpbs_regions = mpbs.by_names([mpbs_name]) num_motif = len(mpbs_regions) # Fetch bias corrected signal for region in mpbs_regions: if motif_len is None: motif_len = region.final - region.initial mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 if p1 <= 0: continue # Fetch raw signal signal1 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam1, bias_table=bias_table1, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) signal2 = bias_correction(chrom=region.chrom, start=p1, end=p2, bam=bam2, bias_table=bias_table2, genome_file_name=genome_data.get_genome(), forward_shift=forward_shift, reverse_shift=reverse_shift) if len(signal1) != len(signal_1) or len(signal2) != len(signal_2): continue # smooth the signal signal_1 = np.add(signal_1, np.array(signal1)) signal_2 = np.add(signal_2, np.array(signal2)) update_pwm(pwm, fasta, region, p1, p2) return signal_1, signal_2, motif_len, pwm, num_motif
def get_experimental_matrix(bams, bed): """Load artificially experimental matrix. Only genes in BED file are needed.""" m = ExperimentalMatrix() m.fields = ['name', 'type', 'file'] m.fieldsDict = {} names = [] for bam in bams: n, _ = os.path.splitext(os.path.basename(bam)) m.files[n] = bam names.append(n) m.names = np.array(['housekeep'] + names) m.types = np.array(['regions'] + ['reads']*len(names)) g = GenomicRegionSet('RegionSet') g.read_bed(bed) m.objectsDict['housekeep'] = g return m
def subtract(self, x): """ Subtract GenomicVariantSet. *Keyword arguments:* - x -- instance of GenomicVariantSet which is subtracted """ tmp = GenomicRegionSet.subtract(self, x, whole_region=False) self.sequences = self._reconstruct_info(tmp)
def rna_associated_gene(rna_regions, name, organism): if rna_regions: s = [ rna_regions[0][0], min([e[1] for e in rna_regions]), max([e[2] for e in rna_regions]), rna_regions[0][3] ] g = GenomicRegionSet("RNA associated genes") g.add( GenomicRegion(chrom=s[0], initial=s[1], final=s[2], name=name, orientation=s[3]) ) asso_genes = g.gene_association(organism=organism, promoterLength=1000, show_dis=True) genes = asso_genes[0].name.split(":") closest_genes = [] for n in genes: if name not in n: closest_genes.append(n) closest_genes = set(closest_genes) if len(closest_genes) == 0: return "." else: return ":".join(closest_genes) else: return "."
def read_bed(self, bedfile, genome_file_dir): """Read the sequences defined by BED file on the given genomce""" # Read BED into GenomicRegionSet bed = GenomicRegionSet(os.path.basename(bedfile)) bed.read_bed(bedfile) # Parse each chromosome and fetch the defined region in this chromosome chroms = list(set(bed.get_chrom())) chro_files = [x.split(".")[0] for x in os.listdir(genome_file_dir)] for ch in chroms: if ch not in chro_files: print(" *** There is no genome FASTA file for: "+ch) # Read genome in FASTA according to the given chromosome ch_seq = SequenceSet(name=ch, seq_type=SequenceType.DNA) try: ch_seq.read_fasta(os.path.join(genome_file_dir, ch+".fa")) except: continue # Regions in given chromosome beds = bed.any_chrom(chrom=ch) for s in beds: seq = ch_seq[0].seq[s.initial:s.final] try: strand = s.strand except: strand = "+" self.sequences.append(Sequence(seq=seq, name=s.__repr__(), strand=strand))
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts, \ inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content, \ tracker, debug, norm_regions, scaling_factors_ip, save_wig, housekeeping_genes): """Initialize the MultiCoverageSet""" regionset = GenomicRegionSet(name) chrom_sizes_dict = {} #if regions option is set, take the values, otherwise the whole set of #chromosomes as region to search for DPs if regions is not None: print("Call DPs on specified regions.", file=sys.stderr) with open(regions) as f: for line in f: line = line.strip() line = line.split('\t') c, s, e = line[0], int(line[1]), int(line[2]) regionset.add(GenomicRegion(chrom=c, initial=s, final=e)) chrom_sizes_dict[c] = e else: print("Call DPs on whole genome.", file=sys.stderr) with open(chrom_sizes) as f: for line in f: line = line.strip() line = line.split('\t') chrom, end = line[0], int(line[1]) regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end)) chrom_sizes_dict[chrom] = end if norm_regions: norm_regionset = GenomicRegionSet('norm_regions') norm_regionset.read_bed(norm_regions) else: norm_regionset = None if housekeeping_genes: scaling_factors_ip, _ = norm_gene_level(bamfiles, housekeeping_genes, name, verbose=True) if scaling_factors_ip: tracker.write(text=map(lambda x: str(x), scaling_factors_ip), header="Scaling factors") regionset.sequences.sort() exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose) tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)") multi_cov_set = MultiCoverageSet(name=name, regions=regionset, dims=dims, genome_path=genome_path, binsize=binsize, stepsize=stepsize,rmdup=True,\ path_bamfiles = bamfiles, path_inputs = inputs, exts = exts, exts_inputs = exts_inputs, factors_inputs = factors_inputs, \ chrom_sizes=chrom_sizes, verbose=verbose, no_gc_content=no_gc_content, chrom_sizes_dict=chrom_sizes_dict, debug=debug, \ norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip, save_wig=save_wig) return multi_cov_set
def get_raw_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) bam = Samfile(args.input_files[0], "rb") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() with open(output_fname, "a") as output_f: for region in regions: # Raw counts signal = [0.0] * (region.final - region.initial) for read in bam.fetch(region.chrom, region.initial, region.final): if not read.is_reverse: cut_site = read.pos + args.forward_shift if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 else: cut_site = read.aend + args.reverse_shift - 1 if region.initial <= cut_site < region.final: signal[cut_site - region.initial] += 1.0 if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join([ "wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0" ])) os.remove(output_fname)
def test_fisher_table(self): regions = GenomicRegionSet("regions") regions.read(os.path.join(os.path.dirname(__file__), "test.bed")) mpbs = GenomicRegionSet("mpbs") mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed")) i, ni, gs, ms = fisher_table("GGT1", regions, mpbs) self.assertEqual(i, 0) self.assertEqual(ni, 36) i, ni, gs, ms = fisher_table("HIC2", regions, mpbs) self.assertEqual(i, 8) self.assertEqual(ni, 28) i, ni, gs, ms = fisher_table("RAC2", regions, mpbs, gene_set=True, mpbs_set=True) self.assertEqual(len(gs), 0) self.assertEqual(len(ms), 0)
def __init__(self, rna_fasta, rna_name, dna_region, organism, showdbs=False): self.organism = organism genome = GenomeData(organism) self.genome_path = genome.get_genome() # RNA: Path to the FASTA file self.rna_fasta = rna_fasta self.showdbs = showdbs rnas = SequenceSet(name="rna", seq_type=SequenceType.RNA) rnas.read_fasta(self.rna_fasta) if rna_name: self.rna_name = rna_name else: self.rna_name = rnas[0].name # DNA: GenomicRegionSet self.dna_region = GenomicRegionSet(name="target") self.dna_region.read_bed(dna_region) self.dna_region = self.dna_region.gene_association(organism=self.organism, show_dis=True) self.topDBD = [] self.stat = OrderedDict(name=rna_name, genome=organism) self.stat["target_regions"] = str(len(self.dna_region))
def get_dbss(input_BED,output_BED,rna_fasta,output_rbss,organism,l,e,c,fr,fm,of,mf,rm,temp): regions = GenomicRegionSet("Target") regions.read_bed(input_BED) regions.gene_association(organism=organism, show_dis=True) connect_rna(rna_fasta, temp=temp, rna_name="RNA") rnas = SequenceSet(name="rna", seq_type=SequenceType.RNA) rnas.read_fasta(os.path.join(temp,"rna_temp.fa")) rna_regions = get_rna_region_str(os.path.join(temp,rna_fasta)) # print(rna_regions) genome = GenomeData(organism) genome_path = genome.get_genome() txp = find_triplex(rna_fasta=rna_fasta, dna_region=regions, temp=temp, organism=organism, remove_temp=False, l=l, e=e, c=c, fr=fr, fm=fm, of=of, mf=mf, genome_path=genome_path, prefix="targeted_region", dna_fine_posi=True) print("Total binding events:\t",str(len(txp))) txp.write_bed(output_BED) txp.write_txp(filename=output_BED.replace(".bed",".txp")) rbss = txp.get_rbs() dbd_regions(exons=rna_regions, sig_region=rbss, rna_name="rna", output=output_rbss, out_file=True, temp=temp, fasta=False)
def get_raw_signal(arguments): (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism, window_size, forward_shift, reverse_shift) = arguments mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() bam1 = Samfile(reads_file1, "rb") bam2 = Samfile(reads_file2, "rb") genome_data = GenomeData(organism) fasta = Fastafile(genome_data.get_genome()) signal_1 = np.zeros(window_size) signal_2 = np.zeros(window_size) motif_len = None pwm = dict([("A", [0.0] * window_size), ("C", [0.0] * window_size), ("G", [0.0] * window_size), ("T", [0.0] * window_size), ("N", [0.0] * window_size)]) mpbs_regions = mpbs.by_names([mpbs_name]) num_motif = len(mpbs_regions) for region in mpbs_regions: if motif_len is None: motif_len = region.final - region.initial mid = (region.final + region.initial) / 2 p1 = mid - window_size / 2 p2 = mid + window_size / 2 if p1 <= 0: continue # Fetch raw signal for read in bam1.fetch(region.chrom, p1, p2): # check if the read is unmapped, according to issue #112 if read.is_unmapped: continue if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_1[cut_site - p1] += 1.0 for read in bam2.fetch(region.chrom, p1, p2): # check if the read is unmapped, according to issue #112 if read.is_unmapped: continue if not read.is_reverse: cut_site = read.pos + forward_shift if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 else: cut_site = read.aend + reverse_shift - 1 if p1 <= cut_site < p2: signal_2[cut_site - p1] += 1.0 update_pwm(pwm, fasta, region, p1, p2) return signal_1, signal_2, motif_len, pwm, num_motif
def diff_analysis_run(args): # Initializing Error Handler err = ErrorHandler() output_location = os.path.join(args.output_location, "Lineplots") try: if not os.path.isdir(output_location): os.makedirs(output_location) except Exception: err.throw_error("MM_OUT_FOLDER_CREATION") # Check if the index file exists base_name1 = "{}.bai".format(args.reads_file1) if not os.path.exists(base_name1): pysam.index(args.reads_file1) base_name2 = "{}.bai".format(args.reads_file2) if not os.path.exists(base_name2): pysam.index(args.reads_file2) mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1") mpbs1.read(args.mpbs_file1) mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2") mpbs2.read(args.mpbs_file2) mpbs = mpbs1.combine(mpbs2, output=True) mpbs.sort() mpbs.remove_duplicates() mpbs_name_list = list(set(mpbs.get_names())) signal_dict_by_tf_1 = dict() signal_dict_by_tf_2 = dict() motif_len_dict = dict() motif_num_dict = dict() pwm_dict_by_tf = dict() pool = Pool(processes=args.nc) # differential analysis using bias corrected signal if args.bc: hmm_data = HmmData() table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table1 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) bias_table2 = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) mpbs_list = list() for mpbs_name in mpbs_name_list: mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, args.organism, args.window_size, args.forward_shift, args.reverse_shift, bias_table1, bias_table2)) try: res = pool.map(get_bc_signal, mpbs_list) except Exception: logging.exception("get bias corrected signal failed") # differential analysis using raw signal else: mpbs_list = list() for mpbs_name in mpbs_name_list: mpbs_list.append((mpbs_name, args.mpbs_file1, args.mpbs_file2, args.reads_file1, args.reads_file2, args.organism, args.window_size, args.forward_shift, args.reverse_shift)) try: res = pool.map(get_raw_signal, mpbs_list) except Exception: logging.exception("get raw signal failed") for idx, mpbs_name in enumerate(mpbs_name_list): signal_dict_by_tf_1[mpbs_name] = res[idx][0] signal_dict_by_tf_2[mpbs_name] = res[idx][1] motif_len_dict[mpbs_name] = res[idx][2] pwm_dict_by_tf[mpbs_name] = res[idx][3] motif_num_dict[mpbs_name] = res[idx][4] if args.factor1 is None or args.factor2 is None: args.factor1, args.factor2 = compute_factors(signal_dict_by_tf_1, signal_dict_by_tf_2) output_factor(args, args.factor1, args.factor2) if args.output_profiles: output_profiles(mpbs_name_list, signal_dict_by_tf_1, output_location, args.condition1) output_profiles(mpbs_name_list, signal_dict_by_tf_2, output_location, args.condition2) ps_tc_results_by_tf = dict() plots_list = list() for mpbs_name in mpbs_name_list: plots_list.append((mpbs_name, motif_num_dict[mpbs_name], signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, args.condition1, args.condition2, pwm_dict_by_tf[mpbs_name], output_location, args.window_size, args.standardize)) pool.map(line_plot, plots_list) for mpbs_name in mpbs_name_list: res = get_ps_tc_results(signal_dict_by_tf_1[mpbs_name], signal_dict_by_tf_2[mpbs_name], args.factor1, args.factor2, motif_num_dict[mpbs_name], motif_len_dict[mpbs_name]) # # # only use the factors whose protection scores are greater than 0 # if res[0] > 0 and res[1] < 0: ps_tc_results_by_tf[mpbs_name] = res # # stat_results_by_tf = 
get_stat_results(ps_tc_results_by_tf) ps_tc_results_by_tf = scatter_plot(args, ps_tc_results_by_tf) output_stat_results(args, ps_tc_results_by_tf, motif_num_dict)
def initialize(name, genome_path, regions, stepsize, binsize, bam_file_1, bam_file_2, ext_1, ext_2, \ input_1, input_factor_1, ext_input_1, input_2, input_factor_2, ext_input_2, chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,\ factor_input_1, factor_input_2, debug, tracker): regionset = GenomicRegionSet(name) chrom_sizes_dict = {} #if regions option is set, take the values, otherwise the whole set of #chromosomes as region to search for DPs if regions is not None: with open(regions) as f: for line in f: line = line.strip() line = line.split('\t') c, s, e = line[0], int(line[1]), int(line[2]) regionset.add(GenomicRegion(chrom=c, initial=s, final=e)) chrom_sizes_dict[c] = e else: with open(chrom_sizes) as f: for line in f: line = line.strip() line = line.split('\t') chrom, end = line[0], int(line[1]) regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end)) chrom_sizes_dict[chrom] = end regionset.sequences.sort() start = 0 end = 600 ext_stepsize = 5 #TODO: maybe for-loops? #compute extension size if [ext_1, ext_2, ext_input_1, ext_input_2].count(None) > 0: print("Computing read extension sizes...", file=sys.stderr) if ext_1 is None: ext_1, values_1 = get_extension_size(bam_file_1, start=start, end=end, stepsize=ext_stepsize) print("Read extension for first file: %s" %ext_1, file=sys.stderr) if ext_2 is None: ext_2, values_2 = get_extension_size(bam_file_2, start=start, end=end, stepsize=ext_stepsize) print("Read extension for second file: %s" %ext_2, file=sys.stderr) if input_1 is not None and ext_input_1 is None: ext_input_1, values_input_1 = get_extension_size(input_1, start=start, end=end, stepsize=ext_stepsize) print("Read extension for first input file: %s" %ext_input_1, file=sys.stderr) if input_1 is not None and input_2 is not None and input_1 == input_2 and 'ext_input_1' in locals() and 'values_input_1' in locals(): ext_input_2, values_input_2 = ext_input_1, values_input_1 elif input_2 is not None and ext_input_2 is None: ext_input_2, values_input_2 = get_extension_size(input_2, start=start, end=end, stepsize=ext_stepsize) print("Read extension for second input file: %s" %ext_input_2, file=sys.stderr) tracker.write(text=str(ext_1) + "," + str(ext_2), header="Extension size IP1, IP2") if input_1 is not None and input_2 is not None: tracker.write(text=str(ext_input_1) + "," + str(ext_input_2), header="Extension size Control1, Control2") if verbose: if 'values_1' in locals() and values_1 is not None: with open(name + '-read-ext-1', 'w') as f: for v, i in values_1: print(i, v, sep='\t', file=f) if 'values_2' in locals() and values_2 is not None: with open(name + '-read-ext-2', 'w') as f: for v, i in values_2: print(i, v, sep='\t', file=f) if 'values_input_1' in locals() and values_input_1 is not None: with open(name + '-read-ext-input-1', 'w') as f: for v, i in values_input_1: print(i, v, sep='\t', file=f) if 'values_input_2' in locals() and values_input_2 is not None: with open(name + '-read-ext-input-2', 'w') as f: for v, i in values_input_2: print(i, v, sep='\t', file=f) cov_cdp_mpp = DualCoverageSet(name=name, region=regionset, genome_path=genome_path, binsize=binsize, stepsize=stepsize,rmdup=True,\ file_1=bam_file_1, ext_1=ext_1,\ file_2=bam_file_2, ext_2=ext_2, \ input_1=input_1, ext_input_1=ext_input_1, input_factor_1=input_factor_1, \ input_2=input_2, ext_input_2=ext_input_2, input_factor_2=input_factor_2, \ chrom_sizes=chrom_sizes, verbose=verbose, norm_strategy=norm_strategy, no_gc_content=no_gc_content, deadzones=deadzones,\ factor_input_1=factor_input_1, 
factor_input_2=factor_input_2, chrom_sizes_dict=chrom_sizes_dict, debug=debug, tracker=tracker) return cov_cdp_mpp, [ext_1, ext_2]
def test_get_fisher_dict(self): motif_names = ["TFIP11", "ACO2", "HIC2", "HAT5"] regions = GenomicRegionSet("regions") regions.read(os.path.join(os.path.dirname(__file__), "test.bed")) mpbs = GenomicRegionSet("mpbs") mpbs.read(os.path.join(os.path.dirname(__file__), "test_mpbs.bed")) regions2 = GenomicRegionSet("regions2") regions2.read(os.path.join(os.path.dirname(__file__), "test2.bed")) mpbs2 = GenomicRegionSet("mpbs2") mpbs2.read(os.path.join(os.path.dirname(__file__), "test2_mpbs.bed")) regions3 = GenomicRegionSet("regions3") regions3.read(os.path.join(os.path.dirname(__file__), "test3.bed")) mpbs3 = GenomicRegionSet("mpbs3") mpbs3.read(os.path.join(os.path.dirname(__file__), "test3_mpbs.bed")) result = get_fisher_dict(motif_names, regions, mpbs) intersecting = result[0] not_intersecting = result[1] for mn in ["TFIP11", "ACO2", "HAT5"]: self.assertEqual(intersecting[mn], 0) self.assertEqual(intersecting["HIC2"], 8) for mn in motif_names: self.assertEqual(intersecting[mn]+not_intersecting[mn], 36) result = get_fisher_dict(motif_names, regions, mpbs, gene_set=True, mpbs_set=True) gs = result[2] ms = result[3] for mn in ["TFIP11", "ACO2", "HAT5"]: self.assertEqual(len(gs[mn]), 0) self.assertEqual(len(gs["HIC2"]), 8) self.assertEqual(len(ms["HIC2"]), 8)
def chip_evaluate(args): # Evaluate Statistics fpr = dict() tpr = dict() roc_auc_1 = dict() roc_auc_10 = dict() roc_auc_50 = dict() roc_auc_100 = dict() recall = dict() precision = dict() prc_auc_1 = dict() prc_auc_10 = dict() prc_auc_50 = dict() prc_auc_100 = dict() footprint_file = args.footprint_file.split(",") footprint_name = args.footprint_name.split(",") footprint_type = args.footprint_type.split(",") max_score = 0 if "SEG" in footprint_type: mpbs_regions = GenomicRegionSet("TFBS") mpbs_regions.read(args.tfbs_file) # Verifying the maximum score of the MPBS file for region in iter(mpbs_regions): score = int(region.data.split("\t")[0]) if score > max_score: max_score = score max_score += 1 max_points = [] for i in range(len(footprint_file)): footprints_regions = GenomicRegionSet("Footprints Prediction") footprints_regions.read(footprint_file[i]) footprints_regions.sort() if footprint_type[i] == "SEG": # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints. increased_score_mpbs_regions = GenomicRegionSet("Increased Regions") intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL) for region in iter(intersect_regions): region.data = str(int(region.data.split("\t")[0]) + max_score) increased_score_mpbs_regions.add(region) # Keep the score of remained MPBS entry unchanged without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True) for region in iter(without_intersect_regions): increased_score_mpbs_regions.add(region) increased_score_mpbs_regions.sort_score() fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \ roc_curve(increased_score_mpbs_regions) recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \ precision_recall_curve(increased_score_mpbs_regions) max_points.append(len(intersect_regions)) elif footprint_type[i] == "SC": footprints_regions.sort_score() fpr[i], tpr[i], roc_auc_1[i], roc_auc_10[i], roc_auc_50[i], roc_auc_100[i] = \ roc_curve(footprints_regions) recall[i], precision[i], prc_auc_1[i], prc_auc_10[i], prc_auc_50[i], prc_auc_100[i] = \ precision_recall_curve(footprints_regions) max_points.append(len(footprints_regions)) # Output the statistics results into text stats_fname = os.path.join(args.output_location, "{}_stats.txt".format(args.output_prefix)) stats_header = ["METHOD", "AUC_100", "AUC_50", "AUC_10", "AUC_1", "AUPR_100", "AUPR_50", "AUPR_10", "AUPR_1"] with open(stats_fname, "w") as stats_file: stats_file.write("\t".join(stats_header) + "\n") for i in range(len(footprint_name)): stats_file.write(footprint_name[i] + "\t" + str(roc_auc_100[i]) + "\t" + str(roc_auc_50[i]) + "\t" + str(roc_auc_10[i]) + "\t" + str(roc_auc_1[i]) + "\t" + str(prc_auc_100[i]) + "\t" + str(prc_auc_50[i]) + "\t" + str(prc_auc_10[i]) + "\t" + str(prc_auc_1[i]) + "\n") # Output the curves if args.print_roc_curve: label_x = "False Positive Rate" label_y = "True Positive Rate" curve_name = "ROC" plot_curve(footprint_name, args.output_location, fpr, tpr, roc_auc_100, label_x, label_y, args.output_prefix, curve_name, max_points=max_points) if args.print_pr_curve: label_x = "Recall" label_y = "Precision" curve_name = "PRC" plot_curve(footprint_name, args.output_location, recall, precision, prc_auc_100, label_x, label_y, args.output_prefix, curve_name, max_points=max_points) output_points(footprint_name, args.output_location, args.output_prefix, fpr, tpr, recall, precision)
def __init__(self, name): """*Keyword arguments:* - name -- Define the name of this BindingSiteSet. """ GenomicRegionSet.__init__(self, name = name)
class RandomTest: def __init__(self, rna_fasta, rna_name, dna_region, organism, showdbs=False): self.organism = organism genome = GenomeData(organism) self.genome_path = genome.get_genome() # RNA: Path to the FASTA file self.rna_fasta = rna_fasta self.showdbs = showdbs rnas = SequenceSet(name="rna", seq_type=SequenceType.RNA) rnas.read_fasta(self.rna_fasta) if rna_name: self.rna_name = rna_name else: self.rna_name = rnas[0].name # DNA: GenomicRegionSet self.dna_region = GenomicRegionSet(name="target") self.dna_region.read_bed(dna_region) self.dna_region = self.dna_region.gene_association( organism=self.organism, show_dis=True) self.topDBD = [] self.stat = OrderedDict(name=rna_name, genome=organism) self.stat["target_regions"] = str(len(self.dna_region)) def get_rna_region_str(self, rna): """Getting the rna region from the information header with the pattern: REGION_chr3_51978050_51983935_-_""" self.rna_regions = get_rna_region_str(rna) if self.rna_regions and len(self.rna_regions[0]) == 5: self.rna_expression = float(self.rna_regions[0][-1]) else: self.rna_expression = "n.a." def connect_rna(self, rna, temp): d = connect_rna(rna, temp, self.rna_name) self.stat["exons"] = str(d[0]) self.stat["seq_length"] = str(d[1]) self.rna_len = d[1] def target_dna(self, temp, remove_temp, cutoff, l, e, c, fr, fm, of, mf, par, obed=False): """Calculate the true counts of triplexes on the given dna regions""" self.triplexator_p = [l, e, c, fr, fm, of, mf] txp = find_triplex(rna_fasta=os.path.join(temp, "rna_temp.fa"), dna_region=self.dna_region, temp=temp, organism=self.organism, remove_temp=remove_temp, l=l, e=e, c=c, fr=fr, fm=fm, of=of, mf=mf, par=par, genome_path=self.genome_path, prefix="targeted_region", dna_fine_posi=False) txp.merge_rbs(rm_duplicate=True, region_set=self.dna_region, asgene_organism=self.organism, cutoff=cutoff) self.txp = txp self.stat["DBSs_target_all"] = str(len(self.txp)) txp.remove_duplicates() self.rbss = txp.merged_dict.keys() # if len(self.rbss) == 0: # print("ERROR: No potential binding event. 
Please change the parameters.") # sys.exit(1) txpf = find_triplex(rna_fasta=os.path.join(temp, "rna_temp.fa"), dna_region=self.dna_region, temp=temp, organism=self.organism, remove_temp=remove_temp, l=l, e=e, c=c, fr=fr, fm=fm, of=of, mf=mf, par=par, genome_path=self.genome_path, prefix="dbs", dna_fine_posi=True) txpf.remove_duplicates() txpf.merge_rbs(rbss=self.rbss, rm_duplicate=True, asgene_organism=self.organism) self.txpf = txpf self.stat["DBSs_target_all"] = str(len(self.txpf)) self.counts_tr = OrderedDict() self.counts_dbs = OrderedDict() for rbs in self.rbss: tr = len(self.txp.merged_dict[rbs]) self.counts_tr[rbs] = [tr, len(self.dna_region) - tr] self.counts_dbs[rbs] = len(self.txpf.merged_dict[rbs]) self.region_dbd = self.txpf.sort_rbs_by_regions(self.dna_region) self.region_dbs = self.txpf.sort_rd_by_regions( regionset=self.dna_region) self.region_dbsm = {} self.region_coverage = {} for region in self.dna_region: self.region_dbsm[region.toString()] = self.region_dbs[ region.toString()].get_dbs().merge(w_return=True) self.region_coverage[region.toString()] = float(self.region_dbsm[region.toString()].total_coverage()) / len \ (region) self.stat["target_regions"] = str(len(self.dna_region)) if obed: # btr = self.txp.get_dbs() # btr = btr.gene_association(organism=self.organism, show_dis=True) # btr.write_bed(os.path.join(temp, obed + "_target_region_dbs.bed")) # dbss = txpf.get_dbs() # dbss.write_bed(os.path.join(temp, obed + "_dbss.bed")) # output = self.dna_region.gene_association(organism=self.organism, show_dis=True) self.txp.write_bed(filename=os.path.join( temp, obed + "_target_region_dbs.bed"), dbd_tag=False, remove_duplicates=False, associated=self.organism) self.txpf.write_bed(filename=os.path.join(temp, obed + "_dbss.bed"), remove_duplicates=False) def random_test(self, repeats, temp, remove_temp, l, e, c, fr, fm, of, mf, rm, par, filter_bed, alpha): """Perform randomization for the given times""" self.repeats = repeats marks = numpy.round(numpy.linspace(0, repeats - 1, num=41)).tolist() print("random_test") print(par) # Prepare the input lists for multiprocessing mp_input = [] for i in range(repeats): mp_input.append([ str(i), os.path.join(temp, "rna_temp.fa"), self.dna_region, temp, self.organism, self.rbss, str(marks.count(i)), str(l), str(e), str(c), str(fr), str(fm), str(of), str(mf), str(rm), filter_bed, self.genome_path, par ]) # Multiprocessing print("\t\t|0% | 100%|") print("\t\t[", end="") pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() - 2) mp_output = pool.map(random_each, mp_input) # print(mp_output) pool.close() pool.join() print("]") # Processing the result self.region_matrix = [] self.dbss_matrix = [] self.data = { "region": { "ave": [], "sd": [], "p": [], "sig_region": [], "sig_boolean": [] }, "dbs": { "ave": [], "sd": [], "p": [], "sig_region": [], "sig_boolean": [] } } region_counts = [v[0] for v in mp_output] dbss_counts = [v[1] for v in mp_output] for i, rbs in enumerate(self.rbss): counts_regions = [v[i] for v in region_counts] self.data["region"]["ave"].append(numpy.mean(counts_regions)) self.data["region"]["sd"].append(numpy.std(counts_regions)) num_sig = len( [h for h in counts_regions if h > self.counts_tr[rbs][0]]) p_region = float(num_sig) / repeats self.data["region"]["p"].append(p_region) self.region_matrix.append(counts_regions) if p_region < alpha: self.data["region"]["sig_region"].append(rbs) self.data["region"]["sig_boolean"].append(True) else: self.data["region"]["sig_boolean"].append(False) try: if p_region < 
self.topDBD[1]: self.topDBD = [rbs.str_rna(pa=False), p_region] except: self.topDBD = [rbs.str_rna(pa=False), p_region] # Analysis based on DBSs if self.showdbs: counts_dbss = [v[i] for v in dbss_counts] self.data["dbs"]["ave"].append(numpy.mean(counts_dbss)) self.data["dbs"]["sd"].append(numpy.std(counts_dbss)) num_sig = len( [h for h in counts_dbss if h > self.counts_dbs[rbs]]) p_dbs = float(num_sig) / repeats self.data["dbs"]["p"].append(p_dbs) self.dbss_matrix.append(counts_dbss) if p_dbs < alpha: self.data["dbs"]["sig_region"].append(rbs) self.data["dbs"]["sig_boolean"].append(True) else: self.data["dbs"]["sig_boolean"].append(False) try: self.stat["p_value"] = str(min(self.data["region"]["p"])) except: self.stat["p_value"] = "1" self.region_matrix = numpy.array(self.region_matrix) if self.showdbs: self.dbss_matrix = numpy.array(self.dbss_matrix) counts_dbss = [v[i] for v in dbss_counts] self.stat["DBSs_random_ave"] = numpy.mean(counts_dbss) try: self.stat["p_value"] = str(min(self.data["region"]["p"])) except: self.stat["p_value"] = "1" def dbd_regions(self, sig_region, output): """Generate the BED file of significant DBD regions and FASTA file of the sequences""" dbd_regions(exons=self.rna_regions, sig_region=sig_region, rna_name=self.rna_name, output=output) self.stat["DBD_all"] = str(len(self.rbss)) self.stat["DBD_sig"] = str(len(self.data["region"]["sig_region"])) sigDBD = GenomicRegionSet("DBD_sig") sigDBD.sequences = self.data["region"]["sig_region"] rbss = self.txp.get_rbs() overlaps = rbss.intersect(y=sigDBD, mode=OverlapType.ORIGINAL) self.stat["DBSs_target_DBD_sig"] = str(len(overlaps)) def lineplot(self, txp, dirp, ac, cut_off, log, ylabel, linelabel, showpa, sig_region, filename): """Generate lineplot for RNA""" lineplot(txp=txp, rnalen=self.rna_len, rnaname=self.rna_name, dirp=dirp, sig_region=sig_region, cut_off=cut_off, log=log, ylabel=ylabel, linelabel=linelabel, filename=filename, ac=ac, showpa=showpa) def boxplot(self, dir, matrix, sig_region, truecounts, sig_boolean, ylabel, filename): """Generate the visualized plot""" tick_size = 8 label_size = 9 f, ax = plt.subplots(1, 1, dpi=300, figsize=(6, 4)) max_y = int(max([matrix.max()] + truecounts) * 1.1) + 1 min_y = max(int(matrix.min() * 0.9) - 1, 0) # Significant DBD rect = patches.Rectangle(xy=(1, 0), width=0.8, height=max_y, facecolor=sig_color, edgecolor="none", alpha=0.5, lw=None, label="Significant DBD") for i, r in enumerate(sig_boolean): if r: rect = patches.Rectangle(xy=(i + 0.6, min_y), width=0.8, height=max_y, facecolor=sig_color, edgecolor="none", alpha=0.5, lw=None, label="Significant DBD") ax.add_patch(rect) # Plotting bp = ax.boxplot(matrix.transpose(), notch=False, sym='o', vert=True, whis=1.5, positions=None, widths=None, patch_artist=True, bootstrap=None) z = 10 plt.setp(bp['boxes'], color=nontarget_color, alpha=1, edgecolor="none") plt.setp(bp['whiskers'], color='black', linestyle='-', linewidth=1, zorder=z, alpha=1) plt.setp(bp['fliers'], markerfacecolor='gray', color='white', alpha=0.3, markersize=1.8, zorder=z) plt.setp(bp['caps'], color='white', zorder=-1) plt.setp(bp['medians'], color='black', linewidth=1.5, zorder=z + 1) # Plot target regions plt.plot(range(1, len(self.rbss) + 1), truecounts, markerfacecolor=target_color, marker='o', markersize=5, linestyle='None', markeredgecolor="white", zorder=z + 5) ax.set_xlabel(self.rna_name + " DNA Binding Domains", fontsize=label_size) ax.set_ylabel(ylabel, fontsize=label_size, rotation=90) ax.set_ylim([min_y, max_y]) 
ax.yaxis.set_major_locator(MaxNLocator(integer=True)) ax.set_xticklabels([dbd.str_rna(pa=False) for dbd in self.rbss], rotation=35, ha="right", fontsize=tick_size) for tick in ax.yaxis.get_major_ticks(): tick.label.set_fontsize(tick_size) for spine in ['top', 'right']: ax.spines[spine].set_visible(False) ax.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='on') ax.tick_params(axis='y', which='both', left='on', right='off', labelbottom='off') # Legend dot_legend, = plt.plot([1, 1], color=target_color, marker='o', markersize=5, markeredgecolor="white", linestyle='None') bp_legend, = plt.plot([1, 1], color=nontarget_color, linewidth=6, alpha=1) ax.legend([dot_legend, bp_legend, rect], ["Target Regions", "Non-target regions", "Significant DBD"], bbox_to_anchor=(0., 1.02, 1., .102), loc=2, mode="expand", borderaxespad=0., prop={'size': 9}, ncol=3, numpoints=1) bp_legend.set_visible(False) dot_legend.set_visible(False) # f.tight_layout(pad=1.08, h_pad=None, w_pad=None) f.savefig(os.path.join(dir, filename + ".png"), facecolor='w', edgecolor='w', bbox_extra_artists=(plt.gci()), bbox_inches='tight', dpi=300) # PDF for tick in ax.xaxis.get_major_ticks(): tick.label.set_fontsize(12) for tick in ax.yaxis.get_major_ticks(): tick.label.set_fontsize(12) ax.xaxis.label.set_size(14) ax.yaxis.label.set_size(14) pp = PdfPages(os.path.join(dir, filename + '.pdf')) pp.savefig(f, bbox_extra_artists=(plt.gci()), bbox_inches='tight') pp.close() def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False): """Generate the HTML file""" dir_name = os.path.basename(directory) html_header = "Genomic Region Test: " + dir_name link_ds = OrderedDict() link_ds["RNA"] = "index.html" link_ds["Sig Target Regions"] = "starget_regions.html" link_ds["Target Regions"] = "target_regions.html" link_ds["Parameters"] = "parameters.html" ################################################## # index.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") # Plots html.add_figure("lineplot_region.png", align="left", width="45%", more_images=["boxplot_regions.png"]) if self.showdbs: html.add_figure("lineplot_dbs.png", align="left", width="45%", more_images=["boxplot_dbs.png"]) if self.showdbs: header_list = [[ "#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics", "Target Regions", "Non-target Regions", None, "Statistics" ], [ "", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value", "NO. DBSs", "NO. 
DBSs (average)", "s.d.", "<i>p</i>-value" ]] header_titles = [ [ "Rank", "DNA Binding Domain", "Given target regions on DNA", None, "Regions from randomization", None, "Statistics based on target regions", "Given target regions on DNA", "Regions from randomization", None, "Statistics based on DNA Binding Sites" ], [ "", "", "Number of target regions with DBS binding", "Number of target regions without DBS binding", "Average number of regions from randomization with DBS binding", "Standard deviation", "P value", "Number of related DNA Binding Sites binding to target regions", "Average number of DNA Binding Sites binding to random regions", "Standard deviation", "P-value" ] ] border_list = [ " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", " style=\"border-right:2pt solid gray\"", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"" ] else: header_list = [[ "#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics", None ], [ "", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value", "z-score" ]] header_titles = [ [ "Rank", "DNA Binding Domain", "Given target regions on DNA", None, "Regions from randomization", None, "Statistics based on target regions", None ], [ "", "", "Number of target regions with DBS binding", "Number of target regions without DBS binding", "Average number of regions from randomization with DBS binding", "Standard deviation", "P value", "Z-score" ] ] border_list = [ " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"", "" ] type_list = 'ssssssssssssssss' col_size_list = [ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50 ] data_table = [] for i, rbs in enumerate(self.rbss): if self.data["region"]["p"][i] < alpha: p_region = "<font color=\"red\">" + value2str( self.data["region"]["p"][i]) + "</font>" else: p_region = value2str(self.data["region"]["p"][i]) zs = (self.counts_tr[rbs][0] - self.data["region"]["ave"][i]) / self.data["region"]["sd"][i] new_line = [ str(i + 1), rbs.str_rna(pa=False), '<a href="dbd_region.html#' + rbs.str_rna() + '" style="text-align:left">' + str(self.counts_tr[rbs][0]) + '</a>', str(self.counts_tr[rbs][1]), value2str(self.data["region"]["ave"][i]), value2str(self.data["region"]["sd"][i]), p_region, value2str(zs) ] if self.showdbs: if self.data["dbs"]["p"][i] < alpha: p_dbs = "<font color=\"red\">" + value2str( self.data["dbs"]["p"][i]) + "</font>" else: p_dbs = value2str(self.data["dbs"]["p"][i]) new_line += [ str(self.counts_dbs[rbs]), value2str(self.data["dbs"]["ave"][i]), value2str(self.data["dbs"]["sd"][i]), p_dbs ] data_table.append(new_line) data_table = natsort.natsorted(data_table, key=lambda x: x[6]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True, header_titles=header_titles, border_list=border_list, sortable=True) html.add_heading("Notes") html.add_list([ "RNA name: " + self.rna_name, "Randomization is performed for " + str(self.repeats) + " times.", "DBD stands for DNA Binding Domain on RNA.", "DBS stands for DNA Binding Site on DNA." 
]) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "index.html")) ############################################################# # RNA subpage: Profile of targeted regions for each merged DNA Binding Domain ############################################################# header_list = [ "#", "Target Region", "Associated Gene", "No. of DBSs", "DBS coverage" ] header_titles = [ "Rank", "Given target regions from BED files", "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)", "Number of DNA Binding Sites locate within the region", "The proportion of the region covered by DBS binding" ] ######################################################### # dbd_region.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") for rbsm in self.rbss: html.add_heading("DNA Binding Domain: " + rbsm.str_rna(), idtag=rbsm.str_rna()) data_table = [] for i, region in enumerate(self.txp.merged_dict[rbsm]): # Add information data_table.append([ str(i + 1), '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) + '" style="text-align:left">' + region.toString(space=True) + '</a>', split_gene_name(gene_name=region.name, org=self.organism), str(len(self.region_dbs[region.toString()])), value2str(self.region_coverage[region.toString()]) ]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True, header_titles=header_titles, sortable=True) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "dbd_region.html")) ############################################################# # Targeted regions centered ############################################################# ############################################################################################## # target_regions.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") if score: header_list = [ "#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage", "Score", "Sum of ranks" ] header_titles = [ "Rank", "Target regions loaded from the given BED file", "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)", "Number of DNA Binding Sites within the region", "The proportion of the region covered by DBS binding", "Scores from BED file", "Sum of all the left-hand-side ranks" ] else: header_list = [ "#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage", "Sum of ranks" ] header_titles = [ "Rank", "Target regions loaded from the given BED file", "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)", "Number of DNA Binding Sites within the region", "The proportion of the region covered by DBS binding", "Sum of all the left-hand-side ranks" ] html.add_heading("Target Regions") data_table = [] if not self.dna_region.sorted: self.dna_region.sort() # Calculate the ranking rank_count = len(self.dna_region) - rank_array( [len(self.region_dbs[p.toString()]) for p in self.dna_region]) rank_coverage = len(self.dna_region) - rank_array( [self.region_coverage[p.toString()] for p in self.dna_region]) if score: try: score_list = [ float(p.data.split("\t")[0]) for p in self.dna_region ] 
rank_score = len(self.dna_region) - rank_array( [abs(s) for s in score_list]) rank_sum = [ x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score) ] # sum_rank = rank_array(rank_sum) # method='min' except ImportError: print( "There is no score in BED file, please don't use '-score' argument." ) else: rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)] sum_rank = rank_array(rank_sum) for i, region in enumerate(self.dna_region): dbs_counts = str(len(self.region_dbs[region.toString()])) dbs_cover = value2str(self.region_coverage[region.toString()]) newline = [ str(i + 1), '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) + '" style="text-align:left">' + region.toString(space=True) + '</a>', split_gene_name(gene_name=region.name, org=self.organism), '<a href="region_dbs.html#' + region.toString() + '" style="text-align:left">' + dbs_counts + '</a>', dbs_cover ] if score: dbs_score = value2str(score_list[i]) region.data = "\t".join( [dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])]) newline.append(dbs_score) newline.append(str(rank_sum[i])) else: region.data = "\t".join( [dbs_counts, dbs_cover, str(rank_sum[i])]) newline.append(str(rank_sum[i])) data_table.append(newline) data_table = natsort.natsorted(data_table, key=lambda x: x[-1]) # data_table = sorted(data_table, key=lambda x: x[-1]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True, header_titles=header_titles, sortable=True) html.add_heading("Notes") html.add_list(["All target regions without any bindings are ignored."]) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "target_regions.html")) self.dna_region.sort_score() self.dna_region.write_bed( os.path.join(directory, obed + "_target_regions.bed")) ############################################################################################## # starget_regions.html for significant target regions stargets = GenomicRegionSet("sig_targets") sig_dbs = {} sig_dbs_coverage = {} for i, r in enumerate(self.dna_region): sig_bindings = self.region_dbs[r.toString()].overlap_rbss( rbss=self.data["region"]["sig_region"]) dbs = sig_bindings.get_dbs() if len(dbs) > 0: stargets.add(r) m_dbs = dbs.merge(w_return=True) sig_dbs[r] = len(dbs) # self.promoter["de"]["merged_dbs"][promoter.toString()] = len(m_dbs) sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r) html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") # Select promoters in sig DBD if len(self.data["region"]["sig_region"]) == 0: html.add_heading("There is no significant DBD.") else: html.add_heading("Target regions bound by significant DBD") data_table = [] # Calculate the ranking rank_count = len(stargets) - rank_array( [sig_dbs[p] for p in stargets]) rank_coverage = len(stargets) - rank_array( [sig_dbs_coverage[p] for p in stargets]) if score: score_list = [float(p.data.split("\t")[0]) for p in stargets] rank_score = len(stargets) - rank_array( [abs(s) for s in score_list]) rank_sum = [ x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score) ] sum_rank = rank_array(rank_sum) # method='min' else: rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)] sum_rank = rank_array(rank_sum) for i, region in enumerate(stargets): dbssount = '<a href="region_dbs.html#' + region.toString() 
+ \ '" style="text-align:left">' + str(sig_dbs[region]) + '</a>' region_link = region_link_internet(self.organism, region) newline = [ str(i + 1), region_link, split_gene_name(gene_name=region.name, org=self.organism), dbssount, value2str(sig_dbs_coverage[region]) ] if score: dbs_score = value2str(score_list[i]) # region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(sum_rank[i])]) newline.append(dbs_score) newline.append(str(rank_sum[i])) # print([dbs_score, str(sum_rank[i])]) else: # region.data = "\t".join([dbs_counts, dbs_cover, str(sum_rank[i])]) newline.append(str(rank_sum[i])) # newline += ["<i>" + str(rank_sum[i]) + "</i>"] # print(newline) data_table.append(newline) # print(data_table) # data_table = sorted(data_table, key=lambda x: x[-1]) data_table = natsort.natsorted(data_table, key=lambda x: x[-1]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", header_titles=header_titles, border_list=None, sortable=True) html.add_heading("Notes") html.add_list([ "DBS stands for DNA Binding Site on DNA.", "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA." ]) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "starget_regions.html")) ############################ # Subpages for targeted region centered page # region_dbs.html header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"] html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") for i, region in enumerate(self.dna_region): if len(self.region_dbs[region.toString()]) == 0: continue else: html.add_heading( "Associated gene: " + split_gene_name(gene_name=region.name, org=self.organism), idtag=region.toString()) html.add_free_content([ '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) + '" style="margin-left:50">' + region.toString(space=True) + '</a>' ]) data_table = [] for rd in self.region_dbs[region.toString()]: rbs = rd.rna.str_rna(pa=False) for rbsm in self.data["region"]["sig_region"]: # rbsm = rbsm.partition(":")[2].split("-") if rd.rna.overlap(rbsm): rbs = "<font color=\"red\">" + rbs + "</font>" data_table.append([ rbs, '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + rd.dna.chrom + "%3A" + str(rd.dna.initial) + "-" + str(rd.dna.final) + '" style="text-align:left">' + rd.dna.toString(space=True) + '</a>', rd.dna.orientation, rd.score, rd.motif, rd.orient ]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True) html.write(os.path.join(directory, "region_dbs.html")) ###############################################################################33 ################ Parameters.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") html.add_heading("Parameters") header_list = ["Description", "Arguments", "Value"] data_table = [ ["RNA sequence name", "-rn", parameters.rn], ["Input RNA sequence file", "-r", os.path.basename(parameters.r)], ["Input BED file", "-bed", os.path.basename(parameters.bed)], ["Output directory", "-o", os.path.basename(parameters.o)], ["Organism", "-organism", parameters.organism], ["Number of repitetion of 
andomization", "-n", str(parameters.n)], ["Alpha level for rejection p value", "-a", str(parameters.a)], [ "Cut off value for filtering out the low counts of DBSs", "-ccf", str(parameters.ccf) ], ["Remove temporary files", "-rt", str(parameters.rt)], [ "Input BED file for masking in randomization", "-f", str(parameters.f) ], ["Input file for RNA accecibility", "-ac", str(parameters.ac)], [ "Cut off value for RNA accecibility", "-accf", str(parameters.accf) ], [ "Output the BED files for DNA binding sites.", "-obed", str(parameters.obed) ], [ "Show parallel and antiparallel bindings in the plot separately.", "-showpa", str(parameters.showpa) ], ["Minimum length", "-l", str(self.triplexator_p[0])], ["Maximum error rate", "-e", str(self.triplexator_p[1])], [ "Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2]) ], ["Filtering repeats", "-fr", str(self.triplexator_p[3])], ["Filtering mode", "-fm", str(self.triplexator_p[4])], ["Output format", "-of", str(self.triplexator_p[5])], ["Merge features", "-mf", str(self.triplexator_p[6])] ] html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True) html.add_free_content( ['<a href="summary.txt" style="margin-left:100">See details</a>']) html.write(os.path.join(directory, "parameters.html"))
def merge_delete(ext_size, merge, peak_list, pvalue_list):
    # peaks_gain = read_diffpeaks(path)
    regions_plus = GenomicRegionSet('regions')       # potentially mergeable, '+' strand
    regions_minus = GenomicRegionSet('regions')      # potentially mergeable, '-' strand
    regions_unmergable = GenomicRegionSet('regions')
    last_orientation = ""

    for i, t in enumerate(peak_list):
        chrom, start, end, c1, c2, strand, ratio = t[0], t[1], t[2], t[3], t[4], t[5], t[6]
        r = GenomicRegion(chrom=chrom, initial=start, final=end, name='',
                          orientation=strand, data=str((c1, c2, pvalue_list[i], ratio)))
        # Peaks shorter than ext_size are dropped; the rest are grouped by strand, and
        # only runs of consecutive peaks on the same strand become merge candidates.
        if end - start > ext_size:
            if strand == '+':
                if last_orientation == '+':
                    regions_plus.add(r)
                else:
                    regions_unmergable.add(r)
            elif strand == '-':
                if last_orientation == '-':
                    regions_minus.add(r)
                else:
                    regions_unmergable.add(r)
        # remember the strand of this peak for the next iteration
        last_orientation = strand

    if merge:
        regions_plus.extend(ext_size // 2, ext_size // 2)
        regions_plus.merge()
        regions_plus.extend(-(ext_size // 2), -(ext_size // 2))
        merge_data(regions_plus)

        regions_minus.extend(ext_size // 2, ext_size // 2)
        regions_minus.merge()
        regions_minus.extend(-(ext_size // 2), -(ext_size // 2))
        merge_data(regions_minus)

    results = GenomicRegionSet('regions')
    for el in regions_plus:
        results.add(el)
    for el in regions_minus:
        results.add(el)
    for el in regions_unmergable:
        results.add(el)
    results.sort()
    return results
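# Illustrative call only (peak tuples and p-values are made up; GenomicRegionSet and
# merge_data come from the surrounding module): peaks shorter than ext_size are
# dropped, and consecutive peaks on the same strand become merge candidates when
# merge=True.
example_peaks = [
    ("chr1", 1000, 1600, 35, 12, '+', 2.9),
    ("chr1", 1700, 2300, 40, 10, '+', 4.1),
    ("chr1", 5000, 5600, 8, 30, '-', -1.8),
]
example_pvalues = [1e-4, 1e-5, 1e-3]
merged_peaks = merge_delete(ext_size=200, merge=True,
                            peak_list=example_peaks, pvalue_list=example_pvalues)
for r in merged_peaks:
    print(r.chrom, r.initial, r.final, r.orientation, r.data)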
metavar=' ', type=str, help="Define the organism") parser.add_argument( '-rmcoding', metavar=' ', type=float, help="Define the cutoff to remove the entries with coding potential") parser.add_argument('-mafdir', metavar=' ', type=str, help="Define the directory to MAF files") # python /projects/reg-gen/tools/phylocsf_check.py -i args = parser.parse_args() bed = GenomicRegionSet("input") bed.read(args.i) num = len(bed) organisms = { "hg18": "Human", "panTro2": "Chimp", "rheMac2": "Rhesus", "tarSyr1": "Tarsier", "micMur1": "Mouse_lemur", "otoGar1": "Bushbaby", "tupBel1": "Shrew", "mm9": "Mouse", "rn4": "Rat", "dipOrd1": "Kangaroo_Rat", "cavPor2": "Guinea_Pig",
def find(s, ch): return [i for i, ltr in enumerate(s) if ltr == ch] ################################################################################## parser = argparse.ArgumentParser(description='Check the coding potential by PhyloCSF', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-i', metavar=' ', type=str, help="Input BED file") parser.add_argument('-o', metavar=' ', type=str, help="Output BED file with the coding-potential score") parser.add_argument('-organism', metavar=' ', type=str, help="Define the organism") parser.add_argument('-rmcoding', metavar=' ', type=float, help="Define the cutoff to remove the entries with coding potential") parser.add_argument('-mafdir', metavar=' ', type=str, help="Define the directory to MAF files") # python /projects/reg-gen/tools/phylocsf_check.py -i args = parser.parse_args() bed = GenomicRegionSet("input") bed.read_bed(args.i) num = len(bed) organisms = { "hg18": "Human", "panTro2": "Chimp", "rheMac2": "Rhesus", "tarSyr1": "Tarsier", "micMur1": "Mouse_lemur", "otoGar1": "Bushbaby", "tupBel1": "Shrew", "mm9": "Mouse", "rn4": "Rat", "dipOrd1": "Kangaroo_Rat", "cavPor2": "Guinea_Pig", "speTri1": "Squirrel",
def chip_evaluate(self): """ This evaluation methodology uses motif-predicted binding sites (MPBSs) together with TF ChIP-seq data to evaluate the footprint predictions. return: """ # Evaluate Statistics fpr = dict() tpr = dict() roc_auc = dict() roc_auc_1 = dict() roc_auc_2 = dict() recall = dict() precision = dict() prc_auc = dict() if "SEG" in self.footprint_type: mpbs_regions = GenomicRegionSet("TFBS") mpbs_regions.read_bed(self.tfbs_file) mpbs_regions.sort() # Verifying the maximum score of the MPBS file max_score = -99999999 for region in iter(mpbs_regions): score = int(region.data) if score > max_score: max_score = score max_score += 1 for i in range(len(self.footprint_file)): footprints_regions = GenomicRegionSet("Footprints Prediction") footprints_regions.read_bed(self.footprint_file[i]) # Sort footprint prediction bed files footprints_regions.sort() if self.footprint_type[i] == "SEG": # Increasing the score of MPBS entry once if any overlaps found in the predicted footprints. increased_score_mpbs_regions = GenomicRegionSet("Increased Regions") intersect_regions = mpbs_regions.intersect(footprints_regions, mode=OverlapType.ORIGINAL) for region in iter(intersect_regions): region.data = str(int(region.data) + max_score) increased_score_mpbs_regions.add(region) # Keep the score of remained MPBS entry unchanged without_intersect_regions = mpbs_regions.subtract(footprints_regions, whole_region=True) for region in iter(without_intersect_regions): increased_score_mpbs_regions.add(region) increased_score_mpbs_regions.sort_score() fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[i] = self.roc_curve(increased_score_mpbs_regions) recall[i], precision[i], prc_auc[i] = self.precision_recall_curve(increased_score_mpbs_regions) elif self.footprint_type[i] == "SC": footprints_regions.sort_score() fpr[i], tpr[i], roc_auc[i], roc_auc_1[i], roc_auc_2[i] = self.roc_curve(footprints_regions) recall[i], precision[i], prc_auc[i] = self.precision_recall_curve(footprints_regions) # Output the statistics results into text stats_fname = self.output_location + self.tf_name + "_stats.txt" stats_header = ["METHOD", "AUC_100", "AUC_10", "AUC_1", "AUPR"] with open(stats_fname, "w") as stats_file: stats_file.write("\t".join(stats_header) + "\n") for i in range(len(self.footprint_name)): stats_file.write(self.footprint_name[i] + "\t" + str(roc_auc[i]) + "\t" + str(roc_auc_1[i]) + "\t" + str(roc_auc_2[i]) + "\t" + str(prc_auc[i]) + "\n") # Output the curves if self.print_roc_curve: label_x = "False Positive Rate" label_y = "True Positive Rate" curve_name = "ROC" self.plot_curve(fpr, tpr, roc_auc, label_x, label_y, self.tf_name, curve_name) if self.print_pr_curve: label_x = "Recall" label_y = "Precision" curve_name = "PRC" self.plot_curve(recall, precision, prc_auc, label_x, label_y, self.tf_name, curve_name) self.output_points(self.tf_name, fpr, tpr, recall, precision)
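# Hedged sketch, not the class's actual helper: self.roc_curve is not shown in this
# excerpt. Assuming a GenomicRegionSet whose names carry a ":Y"/":N" label (see
# create_file below) and whose data field holds the prediction score, scikit-learn
# can produce the same kind of ROC curve and area.
from sklearn.metrics import roc_curve, auc

def roc_from_labeled_regions(regions):
    # ":Y" marks motif-predicted binding sites supported by ChIP-seq, ":N" the rest.
    labels = [1 if region.name.endswith(":Y") else 0 for region in regions]
    scores = [float(region.data) for region in regions]
    fpr, tpr, _ = roc_curve(labels, scores)
    return fpr, tpr, auc(fpr, tpr)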
def posi2region(self, regions, p):
    """Combine the entries of `regions` at the positions listed in `p` into a new GenomicRegionSet."""
    new_r = GenomicRegionSet(name="")
    for r in p:
        new_r.combine(regions[r])
    return new_r
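# Hypothetical usage (indices are illustrative): collect the entries of `regions`
# at positions 0 and 2 into a single GenomicRegionSet.
selected = self.posi2region(regions, p=[0, 2])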
def gen_html(self, directory, parameters, obed, align=50, alpha=0.05, score=False): """Generate the HTML file""" dir_name = os.path.basename(directory) html_header = "Genomic Region Test: " + dir_name link_ds = OrderedDict() link_ds["RNA"] = "index.html" link_ds["Sig Target Regions"] = "starget_regions.html" link_ds["Target Regions"] = "target_regions.html" link_ds["Parameters"] = "parameters.html" ################################################## # index.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") # Plots html.add_figure("lineplot_region.png", align="left", width="45%", more_images=["boxplot_regions.png"]) if self.showdbs: html.add_figure("lineplot_dbs.png", align="left", width="45%", more_images=["boxplot_dbs.png"]) if self.showdbs: header_list = [[ "#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics", "Target Regions", "Non-target Regions", None, "Statistics" ], [ "", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value", "NO. DBSs", "NO. DBSs (average)", "s.d.", "<i>p</i>-value" ]] header_titles = [ [ "Rank", "DNA Binding Domain", "Given target regions on DNA", None, "Regions from randomization", None, "Statistics based on target regions", "Given target regions on DNA", "Regions from randomization", None, "Statistics based on DNA Binding Sites" ], [ "", "", "Number of target regions with DBS binding", "Number of target regions without DBS binding", "Average number of regions from randomization with DBS binding", "Standard deviation", "P value", "Number of related DNA Binding Sites binding to target regions", "Average number of DNA Binding Sites binding to random regions", "Standard deviation", "P-value" ] ] border_list = [ " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", " style=\"border-right:2pt solid gray\"", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"" ] else: header_list = [[ "#", "DBD", "Target Regions", None, "Non-target Regions", None, "Statistics", None ], [ "", "", "with DBS", "without DBS", "with DBS (average)", "s.d.", "<i>p</i>-value", "z-score" ]] header_titles = [ [ "Rank", "DNA Binding Domain", "Given target regions on DNA", None, "Regions from randomization", None, "Statistics based on target regions", None ], [ "", "", "Number of target regions with DBS binding", "Number of target regions without DBS binding", "Average number of regions from randomization with DBS binding", "Standard deviation", "P value", "Z-score" ] ] border_list = [ " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", "", " style=\"border-right:1pt solid gray\"", " style=\"border-right:1pt solid gray\"", "" ] type_list = 'ssssssssssssssss' col_size_list = [ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50 ] data_table = [] for i, rbs in enumerate(self.rbss): if self.data["region"]["p"][i] < alpha: p_region = "<font color=\"red\">" + value2str( self.data["region"]["p"][i]) + "</font>" else: p_region = value2str(self.data["region"]["p"][i]) zs = (self.counts_tr[rbs][0] - self.data["region"]["ave"][i]) / self.data["region"]["sd"][i] new_line = [ str(i + 1), rbs.str_rna(pa=False), '<a href="dbd_region.html#' + rbs.str_rna() + 
'" style="text-align:left">' + str(self.counts_tr[rbs][0]) + '</a>', str(self.counts_tr[rbs][1]), value2str(self.data["region"]["ave"][i]), value2str(self.data["region"]["sd"][i]), p_region, value2str(zs) ] if self.showdbs: if self.data["dbs"]["p"][i] < alpha: p_dbs = "<font color=\"red\">" + value2str( self.data["dbs"]["p"][i]) + "</font>" else: p_dbs = value2str(self.data["dbs"]["p"][i]) new_line += [ str(self.counts_dbs[rbs]), value2str(self.data["dbs"]["ave"][i]), value2str(self.data["dbs"]["sd"][i]), p_dbs ] data_table.append(new_line) data_table = natsort.natsorted(data_table, key=lambda x: x[6]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True, header_titles=header_titles, border_list=border_list, sortable=True) html.add_heading("Notes") html.add_list([ "RNA name: " + self.rna_name, "Randomization is performed for " + str(self.repeats) + " times.", "DBD stands for DNA Binding Domain on RNA.", "DBS stands for DNA Binding Site on DNA." ]) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "index.html")) ############################################################# # RNA subpage: Profile of targeted regions for each merged DNA Binding Domain ############################################################# header_list = [ "#", "Target Region", "Associated Gene", "No. of DBSs", "DBS coverage" ] header_titles = [ "Rank", "Given target regions from BED files", "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)", "Number of DNA Binding Sites locate within the region", "The proportion of the region covered by DBS binding" ] ######################################################### # dbd_region.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") for rbsm in self.rbss: html.add_heading("DNA Binding Domain: " + rbsm.str_rna(), idtag=rbsm.str_rna()) data_table = [] for i, region in enumerate(self.txp.merged_dict[rbsm]): # Add information data_table.append([ str(i + 1), '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) + '" style="text-align:left">' + region.toString(space=True) + '</a>', split_gene_name(gene_name=region.name, org=self.organism), str(len(self.region_dbs[region.toString()])), value2str(self.region_coverage[region.toString()]) ]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True, header_titles=header_titles, sortable=True) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "dbd_region.html")) ############################################################# # Targeted regions centered ############################################################# ############################################################################################## # target_regions.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") if score: header_list = [ "#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage", "Score", "Sum of ranks" ] header_titles = [ "Rank", "Target regions loaded from the given BED file", "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)", "Number of DNA 
Binding Sites within the region", "The proportion of the region covered by DBS binding", "Scores from BED file", "Sum of all the left-hand-side ranks" ] else: header_list = [ "#", "Target region", "Associated Gene", "DBSs Count", "DBS coverage", "Sum of ranks" ] header_titles = [ "Rank", "Target regions loaded from the given BED file", "Associated genes which is overlapping with the given region or close to it (less than 50000 bp)", "Number of DNA Binding Sites within the region", "The proportion of the region covered by DBS binding", "Sum of all the left-hand-side ranks" ] html.add_heading("Target Regions") data_table = [] if not self.dna_region.sorted: self.dna_region.sort() # Calculate the ranking rank_count = len(self.dna_region) - rank_array( [len(self.region_dbs[p.toString()]) for p in self.dna_region]) rank_coverage = len(self.dna_region) - rank_array( [self.region_coverage[p.toString()] for p in self.dna_region]) if score: try: score_list = [ float(p.data.split("\t")[0]) for p in self.dna_region ] rank_score = len(self.dna_region) - rank_array( [abs(s) for s in score_list]) rank_sum = [ x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score) ] # sum_rank = rank_array(rank_sum) # method='min' except ImportError: print( "There is no score in BED file, please don't use '-score' argument." ) else: rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)] sum_rank = rank_array(rank_sum) for i, region in enumerate(self.dna_region): dbs_counts = str(len(self.region_dbs[region.toString()])) dbs_cover = value2str(self.region_coverage[region.toString()]) newline = [ str(i + 1), '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) + '" style="text-align:left">' + region.toString(space=True) + '</a>', split_gene_name(gene_name=region.name, org=self.organism), '<a href="region_dbs.html#' + region.toString() + '" style="text-align:left">' + dbs_counts + '</a>', dbs_cover ] if score: dbs_score = value2str(score_list[i]) region.data = "\t".join( [dbs_counts, dbs_cover, dbs_score, str(rank_sum[i])]) newline.append(dbs_score) newline.append(str(rank_sum[i])) else: region.data = "\t".join( [dbs_counts, dbs_cover, str(rank_sum[i])]) newline.append(str(rank_sum[i])) data_table.append(newline) data_table = natsort.natsorted(data_table, key=lambda x: x[-1]) # data_table = sorted(data_table, key=lambda x: x[-1]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True, header_titles=header_titles, sortable=True) html.add_heading("Notes") html.add_list(["All target regions without any bindings are ignored."]) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "target_regions.html")) self.dna_region.sort_score() self.dna_region.write_bed( os.path.join(directory, obed + "_target_regions.bed")) ############################################################################################## # starget_regions.html for significant target regions stargets = GenomicRegionSet("sig_targets") sig_dbs = {} sig_dbs_coverage = {} for i, r in enumerate(self.dna_region): sig_bindings = self.region_dbs[r.toString()].overlap_rbss( rbss=self.data["region"]["sig_region"]) dbs = sig_bindings.get_dbs() if len(dbs) > 0: stargets.add(r) m_dbs = dbs.merge(w_return=True) sig_dbs[r] = len(dbs) # self.promoter["de"]["merged_dbs"][promoter.toString()] = len(m_dbs) sig_dbs_coverage[r] = float(m_dbs.total_coverage()) / len(r) html = Html( 
name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") # Select promoters in sig DBD if len(self.data["region"]["sig_region"]) == 0: html.add_heading("There is no significant DBD.") else: html.add_heading("Target regions bound by significant DBD") data_table = [] # Calculate the ranking rank_count = len(stargets) - rank_array( [sig_dbs[p] for p in stargets]) rank_coverage = len(stargets) - rank_array( [sig_dbs_coverage[p] for p in stargets]) if score: score_list = [float(p.data.split("\t")[0]) for p in stargets] rank_score = len(stargets) - rank_array( [abs(s) for s in score_list]) rank_sum = [ x + y + z for x, y, z in zip(rank_count, rank_coverage, rank_score) ] sum_rank = rank_array(rank_sum) # method='min' else: rank_sum = [x + y for x, y in zip(rank_count, rank_coverage)] sum_rank = rank_array(rank_sum) for i, region in enumerate(stargets): dbssount = '<a href="region_dbs.html#' + region.toString() + \ '" style="text-align:left">' + str(sig_dbs[region]) + '</a>' region_link = region_link_internet(self.organism, region) newline = [ str(i + 1), region_link, split_gene_name(gene_name=region.name, org=self.organism), dbssount, value2str(sig_dbs_coverage[region]) ] if score: dbs_score = value2str(score_list[i]) # region.data = "\t".join([dbs_counts, dbs_cover, dbs_score, str(sum_rank[i])]) newline.append(dbs_score) newline.append(str(rank_sum[i])) # print([dbs_score, str(sum_rank[i])]) else: # region.data = "\t".join([dbs_counts, dbs_cover, str(sum_rank[i])]) newline.append(str(rank_sum[i])) # newline += ["<i>" + str(rank_sum[i]) + "</i>"] # print(newline) data_table.append(newline) # print(data_table) # data_table = sorted(data_table, key=lambda x: x[-1]) data_table = natsort.natsorted(data_table, key=lambda x: x[-1]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", header_titles=header_titles, border_list=None, sortable=True) html.add_heading("Notes") html.add_list([ "DBS stands for DNA Binding Site on DNA.", "DBS coverage is the proportion of the region where has potential to form triple helices with the given RNA." 
]) html.add_fixed_rank_sortable() html.write(os.path.join(directory, "starget_regions.html")) ############################ # Subpages for targeted region centered page # region_dbs.html header_list = ["RBS", "DBS", "Strand", "Score", "Motif", "Orientation"] html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") for i, region in enumerate(self.dna_region): if len(self.region_dbs[region.toString()]) == 0: continue else: html.add_heading( "Associated gene: " + split_gene_name(gene_name=region.name, org=self.organism), idtag=region.toString()) html.add_free_content([ '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + region.chrom + "%3A" + str(region.initial) + "-" + str(region.final) + '" style="margin-left:50">' + region.toString(space=True) + '</a>' ]) data_table = [] for rd in self.region_dbs[region.toString()]: rbs = rd.rna.str_rna(pa=False) for rbsm in self.data["region"]["sig_region"]: # rbsm = rbsm.partition(":")[2].split("-") if rd.rna.overlap(rbsm): rbs = "<font color=\"red\">" + rbs + "</font>" data_table.append([ rbs, '<a href="http://genome.ucsc.edu/cgi-bin/hgTracks?db=' + self.organism + "&position=" + rd.dna.chrom + "%3A" + str(rd.dna.initial) + "-" + str(rd.dna.final) + '" style="text-align:left">' + rd.dna.toString(space=True) + '</a>', rd.dna.orientation, rd.score, rd.motif, rd.orient ]) html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True) html.write(os.path.join(directory, "region_dbs.html")) ###############################################################################33 ################ Parameters.html html = Html( name=html_header, links_dict=link_ds, # fig_dir=os.path.join(directory,"style"), fig_rpath="../style", RGT_header=False, other_logo="TDF", homepage="../index.html") html.add_heading("Parameters") header_list = ["Description", "Arguments", "Value"] data_table = [ ["RNA sequence name", "-rn", parameters.rn], ["Input RNA sequence file", "-r", os.path.basename(parameters.r)], ["Input BED file", "-bed", os.path.basename(parameters.bed)], ["Output directory", "-o", os.path.basename(parameters.o)], ["Organism", "-organism", parameters.organism], ["Number of repitetion of andomization", "-n", str(parameters.n)], ["Alpha level for rejection p value", "-a", str(parameters.a)], [ "Cut off value for filtering out the low counts of DBSs", "-ccf", str(parameters.ccf) ], ["Remove temporary files", "-rt", str(parameters.rt)], [ "Input BED file for masking in randomization", "-f", str(parameters.f) ], ["Input file for RNA accecibility", "-ac", str(parameters.ac)], [ "Cut off value for RNA accecibility", "-accf", str(parameters.accf) ], [ "Output the BED files for DNA binding sites.", "-obed", str(parameters.obed) ], [ "Show parallel and antiparallel bindings in the plot separately.", "-showpa", str(parameters.showpa) ], ["Minimum length", "-l", str(self.triplexator_p[0])], ["Maximum error rate", "-e", str(self.triplexator_p[1])], [ "Tolerated number of consecutive errors", "-c", str(self.triplexator_p[2]) ], ["Filtering repeats", "-fr", str(self.triplexator_p[3])], ["Filtering mode", "-fm", str(self.triplexator_p[4])], ["Output format", "-of", str(self.triplexator_p[5])], ["Merge features", "-mf", str(self.triplexator_p[6])] ] html.add_zebra_table(header_list, col_size_list, type_list, data_table, align=align, cell_align="left", auto_width=True) 
html.add_free_content( ['<a href="summary.txt" style="margin-left:100">See details</a>']) html.write(os.path.join(directory, "parameters.html"))
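# Hedged illustration of the "Sum of ranks" column computed above: rank_array is not
# shown in this excerpt, so scipy.stats.rankdata stands in for it here. Higher DBS
# counts and higher coverage yield smaller (better) summed ranks, which is why the
# tables are sorted by the last column.
from scipy.stats import rankdata

dbs_counts = [5, 0, 12]           # illustrative values
coverages = [0.10, 0.00, 0.35]
rank_count = len(dbs_counts) - rankdata(dbs_counts)
rank_coverage = len(coverages) - rankdata(coverages)
rank_sum = [c + v for c, v in zip(rank_count, rank_coverage)]
# -> [2.0, 4.0, 0.0]; the third region (most DBSs, highest coverage) ranks first.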
def create_file(self):
    # Expanding summits: the last field of region.data holds the summit offset, so
    # each summit is centered in a window of peak_ext bp (integer division keeps
    # the coordinates integral).
    tfbs_summit_regions = GenomicRegionSet("TFBS Summit Regions")
    tfbs_summit_regions.read_bed(self.tfbs_summit_fname)
    for region in iter(tfbs_summit_regions):
        summit = int(region.data.split()[-1]) + region.initial
        region.initial = max(summit - (self.peak_ext // 2), 0)
        region.final = summit + (self.peak_ext // 2)

    # Calculating intersections: MPBSs overlapping a summit window are labelled ":Y",
    # the remaining MPBSs ":N".
    mpbs_regions = GenomicRegionSet("MPBS Regions")
    mpbs_regions.read_bed(self.mpbs_fname)
    tfbs_summit_regions.sort()
    mpbs_regions.sort()
    with_overlap_regions = mpbs_regions.intersect(tfbs_summit_regions, mode=OverlapType.ORIGINAL)
    without_overlap_regions = mpbs_regions.subtract(tfbs_summit_regions, whole_region=True)
    tfbs_regions = GenomicRegionSet("TFBS Regions")
    for region in iter(with_overlap_regions):
        region.name = region.name.split(":")[0] + ":Y"
        tfbs_regions.add(region)
    for region in iter(without_overlap_regions):
        region.name = region.name.split(":")[0] + ":N"
        tfbs_regions.add(region)
    tfbs_regions.sort()

    tfbs_fname = os.path.join(self.output_location, "{}.bed".format(self.mpbs_name))
    tfbs_regions.write_bed(tfbs_fname)
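# Worked example of the summit expansion above (numbers are illustrative): the last
# field of region.data is assumed to hold the summit offset, as in narrowPeak-like
# files.
peak_ext = 100
region_initial = 10000
summit_offset = 180
summit = region_initial + summit_offset        # 10180
new_initial = max(summit - peak_ext // 2, 0)   # 10130
new_final = summit + peak_ext // 2             # 10230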
def get_bc_tracks(args): # Initializing Error Handler err = ErrorHandler() if len(args.input_files) != 2: err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.") regions = GenomicRegionSet("Interested regions") regions.read(args.input_files[1]) regions.merge() reads_file = GenomicSignal() bam = Samfile(args.input_files[0], "rb") genome_data = GenomeData(args.organism) fasta = Fastafile(genome_data.get_genome()) hmm_data = HmmData() if args.bias_table: bias_table_list = args.bias_table.split(",") bias_table = BiasTable().load_table(table_file_name_F=bias_table_list[0], table_file_name_R=bias_table_list[1]) else: table_F = hmm_data.get_default_bias_table_F_ATAC() table_R = hmm_data.get_default_bias_table_R_ATAC() bias_table = BiasTable().load_table(table_file_name_F=table_F, table_file_name_R=table_R) if args.strand_specific: fname_forward = os.path.join(args.output_location, "{}_forward.wig".format(args.output_prefix)) fname_reverse = os.path.join(args.output_location, "{}_reverse.wig".format(args.output_prefix)) f_forward = open(fname_forward, "a") f_reverse = open(fname_reverse, "a") for region in regions: signal_f, signal_r = reads_file.get_bc_signal_by_fragment_length( ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=True) if args.norm: signal_f = reads_file.boyle_norm(signal_f) perc = scoreatpercentile(signal_f, 98) std = np.std(signal_f) signal_f = reads_file.hon_norm_atac(signal_f, perc, std) signal_r = reads_file.boyle_norm(signal_r) perc = scoreatpercentile(signal_r, 98) std = np.std(signal_r) signal_r = reads_file.hon_norm_atac(signal_r, perc, std) f_forward.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal_f)]) + "\n") f_reverse.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(-e) for e in np.nan_to_num(signal_r)]) + "\n") f_forward.close() f_reverse.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}_forward.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_forward, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_forward) bw_filename = os.path.join(args.output_location, "{}_reverse.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", fname_reverse, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(fname_reverse) else: output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix)) with open(output_fname, "a") as output_f: for region in regions: signal = reads_file.get_bc_signal_by_fragment_length(ref=region.chrom, start=region.initial, end=region.final, bam=bam, fasta=fasta, bias_table=bias_table, forward_shift=args.forward_shift, reverse_shift=args.reverse_shift, min_length=None, max_length=None, strand=False) if args.norm: signal = reads_file.boyle_norm(signal) perc = scoreatpercentile(signal, 98) std = np.std(signal) signal = reads_file.hon_norm_atac(signal, perc, std) output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" + "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n") output_f.close() if args.bigWig: genome_data = GenomeData(args.organism) chrom_sizes_file = 
genome_data.get_chromosome_sizes() bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix)) os.system(" ".join(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])) os.remove(output_fname)
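# Minimal sketch of the fixedStep WIG blocks written above: one header line per
# region (WIG coordinates are 1-based, hence start + 1) followed by one signal
# value per base; NaNs are zeroed exactly as in get_bc_tracks.
import numpy as np

def write_fixedstep_block(handle, chrom, start_0based, values):
    handle.write("fixedStep chrom={} start={} step=1\n".format(chrom, start_0based + 1))
    handle.write("\n".join(str(v) for v in np.nan_to_num(values)) + "\n")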