def parse_barcode(bamfile):
    """Split a sorted, indexed BAM file into one BAM file per barcode.

    Every alignment returned by ``fetch()`` is copied to the first output
    whose barcode tag occurs in the read's query name:

    * ``#GAGT`` -> ``MutantOne.bam``
    * ``#TTAG`` -> ``MutantTwo.bam``
    * ``#ACCC`` -> ``WildtypeOne.bam``
    * ``#CGTA`` -> ``WildtypeTwo.bam``

    Reads matching none of the tags are dropped.  ``MultiHit.bam`` is still
    created from the input header but nothing is written to it: the
    secondary-alignment filter that used to feed it is disabled.

    All output files are written to the current working directory.

    :param bamfile: path of the BAM file to split.
    """
    samfile = Samfile(bamfile, "rb")
    # Every handle is tracked so that all of them are closed (and the BAM
    # outputs finalised) even when reading or writing fails part-way.
    handles = [samfile]

    def open_output(name):
        handle = Samfile(name, "wb", template=samfile)
        handles.append(handle)
        return handle

    try:
        # Kept for backward compatibility: callers may expect the file to exist.
        open_output("MultiHit.bam")
        # Ordered list: the first matching tag wins, like the original elif chain.
        routes = [
            ("#GAGT", open_output("MutantOne.bam")),
            ("#TTAG", open_output("MutantTwo.bam")),
            ("#ACCC", open_output("WildtypeOne.bam")),
            ("#CGTA", open_output("WildtypeTwo.bam")),
        ]
        for line in samfile.fetch():
            qname = line.qname
            for tag, out in routes:
                if tag in qname:
                    out.write(line)
                    break
    finally:
        # Close the outputs first, the input last.
        for handle in reversed(handles):
            handle.close()
def get_raw_signal(arguments):
    """Accumulate the cut-site signal around the centre of every region.

    :param arguments: tuple ``(mpbs_region, reads_file, organism,
        window_size, forward_shift, reverse_shift)``.  ``mpbs_region`` is an
        iterable of regions exposing ``chrom``, ``initial`` and ``final``;
        ``reads_file`` is the path of the BAM file.  ``organism`` is unpacked
        but not used by this function.
    :return: numpy array of length ``window_size``; position ``i`` holds the
        number of cut sites found ``i`` bases after the window start, summed
        over all regions.
    """
    (mpbs_region, reads_file, organism, window_size, forward_shift, reverse_shift) = arguments

    half_window = window_size // 2
    signal = np.zeros(window_size)

    bam = Samfile(reads_file, "rb")
    try:
        for region in mpbs_region:
            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows touching or crossing the chromosome start are skipped.
            if p1 <= 0:
                continue

            # Fetch raw signal
            for read in bam.fetch(region.chrom, p1, p2):
                # check if the read is unmapped, according to issue #112
                if read.is_unmapped:
                    continue

                if read.is_reverse:
                    cut_site = read.aend + reverse_shift - 1
                else:
                    cut_site = read.pos + forward_shift

                if p1 <= cut_site < p2:
                    signal[cut_site - p1] += 1.0
    finally:
        # The handle used to be leaked; release it even when fetch() raises.
        bam.close()

    return signal
def bam_fill_seq(args):
    """Fill empty read sequences in ``args.bam`` from a source alignment file.

    The source file (``args.source_bam``, or ``args.bam`` itself when no
    source is given) is scanned once; for every primary record that carries a
    sequence, the sequence, qualities and strand are remembered per read name
    and mate number.  Then every record of ``args.bam`` (restricted to
    ``args.region`` when set) is written to ``args.output``; records without
    a sequence are refilled from the remembered data, reverse-complemented
    when the strands differ and trimmed by ``Cigar.hard_clip_seq``.

    The output is BAM when ``args.output`` ends with ``.bam``, otherwise SAM
    with header.

    :param args: namespace with ``bam``, ``source_bam``, ``output`` and
        ``region`` attributes.
    """
    source_bam = args.source_bam if args.source_bam else args.bam

    logging.info('Loading samfile: %s', source_bam)
    # mate number (1 or 2) -> read name -> (sequence, qualities, is_reverse)
    src_seqs = {1: {}, 2: {}}

    with pysam.Samfile(source_bam) as src:
        for rec in src:
            # Only primary alignments are trusted as a sequence source.
            if rec.is_supplementary or rec.is_secondary:
                continue
            if rec.query_sequence is None:  # nothing to learn from this record
                continue
            mate = 2 if rec.is_read2 else 1
            src_seqs[mate][rec.qname] = (rec.query_sequence, rec.query_qualities, rec.is_reverse)

    logging.info('Loaded read1 : %s', len(src_seqs[1]))
    logging.info('Loaded read2 : %s', len(src_seqs[2]))

    mode = 'wb' if args.output.endswith('.bam') else 'wh'

    # Both files are managed by `with` so the output is always flushed and
    # finalised; previously neither handle was closed.
    with pysam.Samfile(args.bam) as sam:
        with pysam.Samfile(args.output, mode=mode, template=sam) as out:
            it = sam.fetch(region=args.region) if args.region else sam

            for rec in it:
                if rec.query_sequence is None:  # only fill when empty
                    ret = src_seqs[2 if rec.is_read2 else 1].get(rec.qname)
                    if ret is not None:
                        seq, qual, is_rev = ret
                        if is_rev != rec.is_reverse:
                            # Stored on the opposite strand: flip both the
                            # bases and the order of the qualities.
                            seq = dna_revcomp(seq)
                            if qual is not None:
                                qual = list(reversed(qual))
                        cigar = Cigar(rec.cigartuples)
                        seq = cigar.hard_clip_seq(seq)
                        if qual is not None:
                            qual = cigar.hard_clip_seq(qual)

                        # Sequence first, then qualities (same order as before).
                        rec.query_sequence = seq  # refill
                        rec.query_qualities = qual

                out.write(rec)
def bam_variant_aln(args):
    # NOTE(review): this function looks like an unfinished stub. The source
    # was collapsed onto a single line, so the nesting shown below is a
    # reconstruction -- confirm against the original file.
    # It opens args.bam and loops over args.vcf, but nothing is returned or
    # written.
    samfile = Samfile(args.bam)
    for rec in args.vcf:
        # NOTE(review): `vcf` is never bound in this function; unless a
        # module-level `vcf` exists this raises NameError. The loop variable
        # above is presumably what was meant -- TODO confirm.
        fp = open(vcf)
        reader = pyvcf.Reader(fp)
        # NOTE(review): `self` is not a parameter of this function, so this
        # line raises NameError when reached.
        self.positions = []
        # NOTE(review): this inner loop rebinds `rec`, shadowing the outer
        # loop variable; `fp` is never closed and `reader` is never used.
        for rec in samfile.fetch(vcf):
            samfile.getrname(rec.tid)
            rec
def main(args):
    """Call variants by assembling a haplotype in every active region.

    Reads come from ``args.input_bam`` (optionally restricted to
    ``args.start``/``args.stop``) or, when no BAM is given, from
    ``get_sorted_aligned_reads``.  For each active region a haplotype is
    built and aligned back to the reference with ``banded_sw``; the
    haplotype is written to ``args.haplotype_out`` when set, and every
    variant derived from it is printed and, when ``args.out_vcf`` is set,
    written to that VCF file.

    Regions whose haplotype reports a ``fail_reason`` are reported on stdout
    and skipped.

    :param args: parsed command-line namespace.
    """
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    # Reference name: the FASTA header without the leading '>'.
    chrom = ref_header[1:].strip()

    input_bam = None
    haplo_out = None
    vcf_stream = None
    try:
        if args.input_bam:
            input_bam = Samfile(args.input_bam)
            reads = input_bam
            if args.start and args.stop:
                reads = input_bam.fetch(chrom, args.start, args.stop)
        else:
            reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)

        if args.haplotype_out:
            haplo_out = Samfile(args.haplotype_out, 'wb', header=SAM_HEADER(ref_header, ref_sequence))
        if args.out_vcf:
            vcf_stream = VCFWriter(open(args.out_vcf, 'wb'), make_vcf_header(args))

        for region, region_reads in active_regions(reads, ref_sequence, chrom, start_offset=0, flank=30, dfrac=1.0):
            haplotype = build_haplotype(region.reference, region_reads, k=11, min_kmer_count=2)
            if haplotype.fail_reason:
                print('Failure {} at window\n{}'.format(haplotype.fail_reason, region))
                continue

            # align the haplotype to the reference sequence
            offset, cigar, score, mismatch = banded_sw(region.reference, haplotype.seq)
            haplotype_start = region.start + offset
            _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)

            haplo_seq = SeqRecord(Seq(haplotype.seq, DNA), id='Haplotype{}'.format(region.start))
            # Written through dict.__setitem__, which bypasses whatever
            # SeqRecord's own annotation setter would do.
            dict.__setitem__(haplo_seq._per_letter_annotations, 'phred_quality', [40] * len(haplotype.seq))
            haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate', None, 'hw2_rg', False)
            if haplo_out is not None:
                haplo_out.write(haplo_read)

            for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME, chrom):
                if vcf_stream is not None:
                    vcf_stream.write_record(variant)
                print(vcf2m260(variant))
    finally:
        # Outputs are finalised on every exit path; the haplotype BAM and the
        # input BAM used to be left open.
        if vcf_stream is not None:
            vcf_stream.flush()
            vcf_stream.close()
        if haplo_out is not None:
            haplo_out.close()
        if input_bam is not None:
            input_bam.close()
def get_raw_tracks(args):
    """Write the raw (optionally normalised) cut-site signal as a wig track.

    ``args.input_files`` must hold exactly two paths: the reads BAM file and
    the regions file.  The regions are merged, and for each one a
    ``fixedStep`` block with one value per base is appended to
    ``<output_location>/<output_prefix>.wig``.  With ``args.bigWig`` the wig
    file is converted with the external ``wigToBigWig`` tool and then
    removed.

    :param args: namespace with ``input_files``, ``output_location``,
        ``output_prefix``, ``forward_shift``, ``reverse_shift``, ``norm``,
        ``bigWig`` and ``organism``.
    """
    import subprocess  # local import: only needed for the bigWig conversion

    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    try:
        regions = GenomicRegionSet("Interested regions")
        regions.read(args.input_files[1])
        regions.merge()
        reads_file = GenomicSignal()

        # NOTE: the file is opened in append mode, so an existing wig file
        # with the same name is extended rather than overwritten.
        with open(output_fname, "a") as output_f:
            for region in regions:
                # Raw counts
                signal = [0.0] * (region.final - region.initial)
                for read in bam.fetch(region.chrom, region.initial, region.final):
                    # Unmapped reads have no alignment end; skip them as the
                    # sibling get_raw_signal functions do.
                    if read.is_unmapped:
                        continue
                    if read.is_reverse:
                        cut_site = read.aend + args.reverse_shift - 1
                    else:
                        cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

                if args.norm:
                    signal = reads_file.boyle_norm(signal)
                    perc = scoreatpercentile(signal, 98)
                    std = np.std(signal)
                    signal = reads_file.hon_norm_atac(signal, perc, std)

                # wig coordinates are 1-based, hence initial + 1.
                output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                               "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
    finally:
        bam.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
        # Argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through unchanged.
        subprocess.call(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])
        os.remove(output_fname)
def main(args):
    """Group all alignments by reference name and print each group.

    The input is opened as SAM when ``args.samformat`` is set and as BAM
    otherwise.  Every record up to the end of the file is collected under
    the name of the reference it is assigned to, then each group is handed
    to ``print_reads`` together with the file header.

    :param args: namespace with ``bamfile`` and ``samformat``.
    """
    option = "r" if args.samformat else "rb"
    # `option` used to be computed and then ignored ("rb" was hard-coded),
    # so SAM input could not be read.
    samfile = Samfile(args.bamfile, option)
    try:
        # Iterates over each read instead of each contig
        outputs = defaultdict(list)
        for aln in samfile.fetch(until_eof=True):
            outputs[samfile.getrname(aln.tid)].append(aln)

        # .items() works on both Python 2 and Python 3.
        for ref, alns in outputs.items():
            print_reads(alns, ref, samfile.header)
    finally:
        samfile.close()
class GenomicSignal:
    """Read-pileup signal backed by a BAM file opened at construction."""

    def __init__(self, file_name):
        """Open ``file_name`` as a BAM file and keep the handle in ``self.bam``."""
        self.bam = Samfile(file_name, "rb")

    def get_signal(self, ref, start, end, ext, initial_clip = 1000, ext_both_directions=False):
        """Return the clipped pileup signal of ``ref:start-end`` as a list.

        Every alignment fetched from the interval is fed to a
        ``PileupRegion(start, end, ext)`` -- through ``__call2__`` when
        ``ext_both_directions`` is true, through ``__call__`` otherwise.
        The resulting vector is capped at ``initial_clip`` and then at
        ``mean + 10 * std`` of the capped values.
        """
        pileup_region = PileupRegion(start, end, ext)
        alignments = self.bam.fetch(reference=ref, start=start, end=end)

        if ext_both_directions:
            for aln in alignments:
                pileup_region.__call2__(aln)
        else:
            for aln in alignments:
                pileup_region.__call__(aln)

        # First cap: a fixed ceiling on every position.
        capped = [min(value, initial_clip) for value in pileup_region.vector]
        raw_signal = array(capped)

        # Second cap: ten standard deviations above the mean.
        mean = raw_signal.mean()
        std = raw_signal.std()
        ceiling = mean + (10 * std)
        return [min(value, ceiling) for value in raw_signal]
def get_raw_tracks(args):
    """Write the raw (optionally normalised) cut-site signal as a wig track.

    ``args.input_files`` must hold exactly two paths: the reads BAM file and
    the regions file.  The regions are merged, and for each one a
    ``fixedStep`` block with one value per base is appended to
    ``<output_location>/<output_prefix>.wig``.  With ``args.bigWig`` the wig
    file is converted with the external ``wigToBigWig`` tool and then
    removed.

    :param args: namespace with ``input_files``, ``output_location``,
        ``output_prefix``, ``forward_shift``, ``reverse_shift``, ``norm``,
        ``bigWig`` and ``organism``.
    """
    import subprocess  # local import: only needed for the bigWig conversion

    # Initializing Error Handler
    err = ErrorHandler()

    if len(args.input_files) != 2:
        err.throw_error("ME_FEW_ARG", add_msg="You must specify reads and regions file.")

    output_fname = os.path.join(args.output_location, "{}.wig".format(args.output_prefix))

    bam = Samfile(args.input_files[0], "rb")
    try:
        regions = GenomicRegionSet("Interested regions")
        regions.read(args.input_files[1])
        regions.merge()
        reads_file = GenomicSignal()

        # NOTE: the file is opened in append mode, so an existing wig file
        # with the same name is extended rather than overwritten.
        with open(output_fname, "a") as output_f:
            for region in regions:
                # Raw counts
                signal = [0.0] * (region.final - region.initial)
                for read in bam.fetch(region.chrom, region.initial, region.final):
                    # Unmapped reads have no alignment end; skip them as the
                    # sibling get_raw_signal functions do.
                    if read.is_unmapped:
                        continue
                    if read.is_reverse:
                        cut_site = read.aend + args.reverse_shift - 1
                    else:
                        cut_site = read.pos + args.forward_shift
                    if region.initial <= cut_site < region.final:
                        signal[cut_site - region.initial] += 1.0

                if args.norm:
                    signal = reads_file.boyle_norm(signal)
                    perc = scoreatpercentile(signal, 98)
                    std = np.std(signal)
                    signal = reads_file.hon_norm_atac(signal, perc, std)

                # wig coordinates are 1-based, hence initial + 1.
                output_f.write("fixedStep chrom=" + region.chrom + " start=" + str(region.initial + 1) + " step=1\n" +
                               "\n".join([str(e) for e in np.nan_to_num(signal)]) + "\n")
    finally:
        bam.close()

    if args.bigWig:
        genome_data = GenomeData(args.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()
        bw_filename = os.path.join(args.output_location, "{}.bw".format(args.output_prefix))
        # Argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through unchanged.
        subprocess.call(["wigToBigWig", output_fname, chrom_sizes_file, bw_filename, "-verbose=0"])
        os.remove(output_fname)
def main(args):
    """Print the reads whose pair passes ``pair_is_aligned``.

    The input is opened as SAM when ``args.samformat`` is set and as BAM
    otherwise.  ``args.read_pair`` selects which mates are kept: ``1`` keeps
    first mates, ``2`` keeps second mates and ``0`` keeps every read of an
    accepted pair.  Reads are handed to ``print_reads`` in batches so the
    whole file is never held in memory.

    :param args: namespace with ``bamfile``, ``samformat`` and ``read_pair``.
    """
    batch_size = 10000

    option = "r" if args.samformat else "rb"
    # `option` used to be computed and then ignored ("rb" was hard-coded),
    # so SAM input could not be read.
    samfile = Samfile(args.bamfile, option)
    try:
        ref_ids = [samfile.gettid(r) for r in samfile.references]

        # Iterates over each read instead of each contig
        reads_to_print = []
        for aln in samfile.fetch(until_eof=True):
            if pair_is_aligned(aln, ref_ids):
                wanted = (
                    (args.read_pair == 1 and aln.is_read1)
                    or (args.read_pair == 2 and aln.is_read2)
                    or args.read_pair == 0
                )
                if wanted:
                    reads_to_print.append(aln)

            if len(reads_to_print) >= batch_size:
                # Flush the reads collected
                print_reads(reads_to_print)
                reads_to_print = []

        # Remaining reads of the last, partial batch.
        print_reads(reads_to_print)
    finally:
        samfile.close()
def main(args):
    """Split aligned reads into first-mate, second-mate and unpaired outputs.

    The input is opened as SAM when ``args.samformat`` is set and as BAM
    otherwise.  A read aligned to a known reference goes to ``args.R1`` or
    ``args.R2`` when its mate is aligned to a known reference too, and to
    ``args.u`` otherwise.  Each list is passed through ``flush_reads`` after
    every append and whatever remains is printed at the end.

    :param args: namespace with ``bamfile``, ``samformat``, ``R1``, ``R2``
        and ``u``.
    """
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    try:
        # A set makes the two membership tests per read O(1) instead of a
        # scan over every reference.
        ref_ids = set(samfile.gettid(r) for r in samfile.references)

        # Iterates over each read instead of each contig
        reads_to_print_1 = []
        reads_to_print_2 = []
        reads_to_print_u = []
        for aln in samfile.fetch(until_eof=True):
            if aln.tid not in ref_ids:
                continue  # this read is not aligned

            # NOTE(review): the source was collapsed onto one line; the
            # unpaired branch is read here as the `else` of the mate check
            # (mate not aligned) -- confirm against the original file.
            if aln.rnext in ref_ids:
                # The mate is also aligned
                if aln.is_read1:
                    reads_to_print_1.append(aln)
                    reads_to_print_1 = flush_reads(reads_to_print_1, args.R1)
                elif aln.is_read2:
                    reads_to_print_2.append(aln)
                    reads_to_print_2 = flush_reads(reads_to_print_2, args.R2)
            else:
                reads_to_print_u.append(aln)
                reads_to_print_u = flush_reads(reads_to_print_u, args.u)

        print_reads(reads_to_print_1, args.R1)
        print_reads(reads_to_print_2, args.R2)
        print_reads(reads_to_print_u, args.u)
    finally:
        samfile.close()
spr = 0.0 counter = 0.0 for line in intFile: # Fetching signal ll = line.strip().split("\t") mLen = int(ll[2]) - int(ll[1]) mid = (int(ll[1]) + int(ll[2])) / 2 p1 = max(mid - halfWindow, 0) p2 = mid + halfWindow # Fetch raw signal pileup_region = PileupRegion(p1, p2, 1) if (ps_version == "0.7.5"): bam.fetch(reference=ll[0], start=p1, end=p2, callback=pileup_region) else: iter = bam.fetch(reference=ll[0], start=p1, end=p2) for alignment in iter: pileup_region.__call__(alignment) raw_signal = array( [min(e, initial_clip) for e in pileup_region.vector]) # Std-based clipping mean = raw_signal.mean() std = raw_signal.std() clip_signal = [min(e, mean + (10 * std)) for e in raw_signal] # Bias Correction correctedSignal = bias_correction(bam, clip_signal, biasTableF,
# Count the reads in a 200 bp window centred on every TF binding site and
# write the per-(TF, gene) totals as a headerless tab-separated file.
tf_list = []
gene_list = []
tc_list = []
for i, r in enumerate(gr_tfs):
    # The TF name is the last dot-separated field of the region name.
    tf = r.name.split(".")[-1]
    gene = gr_genes[i].name
    # Skip sites without a single, plain gene name.
    if gene == "." or "+" in gene or "-" in gene or ":" in gene:
        continue

    # Floor division keeps the coordinates integral on Python 3 as well.
    mid = (r.initial + r.final) // 2
    p1 = max(mid - 100, 0)
    p2 = min(mid + 100, chrom_sizes_dict[r.chrom])

    # Tag count: number of alignments overlapping the window.
    tc = sum(1 for _ in bam.fetch(reference=r.chrom, start=p1, end=p2))

    tf_list.append(tf)
    gene_list.append(gene)
    tc_list.append(tc)

# Building the frame from named columns keeps 'TF', 'Gene' and 'TC' present
# even when no site was kept, so the groupby below cannot fail on a missing
# column.
df = pd.DataFrame({'TF': tf_list, 'Gene': gene_list, 'TC': tc_list}, columns=['TF', 'Gene', 'TC'])
df = df.groupby(['TF', 'Gene']).sum().reset_index()
df.to_csv(output_file, header=False, index=False, sep='\t')
# Evaluating Overall TC try: regionTagCount = tag_count(chrName, p1, p2, dnaseBam, tcHalfWindow) except Exception: print "Exception TC raised in "+line writeOutput(ll,regionTagCount,resVec,outFile) continue # Fetching sequence try: sequence = str(genomeFile.fetch(chrName, p1, p2)) except Exception: print "Exception SEQUENCE raised in "+line writeOutput(ll,regionTagCount,resVec,outFile) continue # Fetching footprints try: footprints = fpBam.fetch(reference=chrName, start=p1, end=p2) except Exception: print "Exception FOOTPRINTS raised in "+line writeOutput(ll,regionTagCount,resVec,outFile) continue # Best mpbs maxPos = -99999 maxValue = globalMin maxMotifLen = -1 # Performing motif matching and footprint overlapping for res in search(sequence, [e.pssm_list for e in motifList], [e.min for e in motifList], absolute_threshold=True, both_strands=True): for (position, score) in res:
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

    For every k-mer the bias is the ratio between its observed frequency
    (k-mers around the read cut sites, per strand) and its expected
    frequency (all k-mers of the regions, per strand).

    Keyword arguments:
    regions -- DNase-seq HS regions (objects with chrom, initial, final).
    dnase_file_name -- DNase-seq file name; must end in ".bam" (any case).
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- Size of the k-mer.
    shift -- Offset applied to the cut site (added on the forward strand,
             subtracted on the reverse strand).

    Return:
    [bias_table_F, bias_table_R] -- Bias tables (k-mer -> bias), or None
    when dnase_file_name is not a BAM file.
    """

    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    half_k = k_nb // 2

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_f = 0
    ct_reads_r = 0
    ct_kmers = 0

    try:
        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ################################
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    p1 = r.pos - half_k - 1 + shift
                else:
                    p1 = r.aend - half_k + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts: more than maxDuplicates consecutive
                # reads at the same position are ignored.
                if p1 == prevPos:
                    trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if trueCounter > maxDuplicates:
                    continue

                # Fetching k-mer
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary.  Each strand updates its OWN
                # read counter; these two counters used to be swapped, which
                # normalised each table by the other strand's read total.
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)
                    ct_reads_r += 1
                    obsDictR[currStr] = obsDictR.get(currStr, 0) + 1
                else:
                    ct_reads_f += 1
                    obsDictF[currStr] = obsDictF.get(currStr, 0) + 1

            # Evaluating expected frequencies ################################
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                ct_kmers += 1
                # Counting k-mer in dictionary
                s = currStr[i:i + k_nb]
                expDictF[s] = expDictF.get(s, 0) + 1
                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                expDictR[s] = expDictR.get(s, 0) + 1
    finally:
        # Closing files, also when a fetch raises
        bamFile.close()
        fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        # Without reads or k-mers there is nothing to normalise by; fall back
        # to a neutral bias of 1 instead of dividing by zero.
        if ct_reads_f == 0 or ct_kmers == 0:
            bias_table_F[kmer] = 1.0
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)

        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0 or ct_kmers == 0:
            bias_table_R[kmer] = 1.0
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    # Return
    return [bias_table_F, bias_table_R]
def get_raw_signal(arguments):
    """Accumulate cut-site signals of two conditions around one motif's sites.

    :param arguments: tuple ``(mpbs_name, mpbs_file1, mpbs_file2,
        reads_file1, reads_file2, organism, window_size, forward_shift,
        reverse_shift)``.  The two MPBS files are combined and sorted, and
        only the sites named ``mpbs_name`` are used.
    :return: tuple ``(signal_1, signal_2, motif_len, pwm, num_motif)``:
        the two numpy arrays of length ``window_size`` (one per reads file),
        the length of the first site (``None`` when there is no site), the
        per-base count table filled by ``update_pwm`` and the number of
        sites selected.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([(base, [0.0] * window_size) for base in ("A", "C", "G", "T", "N")])

    def add_cut_sites(bam, signal, chrom, p1, p2):
        # Add one count per read whose shifted cut site falls in [p1, p2).
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if read.is_reverse:
                cut_site = read.aend + reverse_shift - 1
            else:
                cut_site = read.pos + forward_shift
            if p1 <= cut_site < p2:
                signal[cut_site - p1] += 1.0

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    half_window = window_size // 2
    try:
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            # Floor division keeps the window bounds usable as array indices
            # on Python 3 as well.
            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows touching or crossing the chromosome start are skipped.
            if p1 <= 0:
                continue

            # Fetch raw signal
            add_cut_sites(bam1, signal_1, region.chrom, p1, p2)
            add_cut_sites(bam2, signal_2, region.chrom, p1, p2)

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # The three handles used to be leaked.
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
def estimate_bias_kmer(args):
    """Estimate per-strand k-mer cleavage bias and write it with write_table.

    Observed k-mers are taken around the shifted cut site of every read in
    the regions of ``args.regions_file``; expected k-mers are all k-mers of
    those regions (and of their reverse complement).  The bias of a k-mer is
    its observed frequency divided by its expected frequency, rounded to six
    decimals; a strand without any read gets a bias of 1 for every k-mer.
    """
    # Parameters
    max_duplicates = 100
    pseudocount = 1.0
    half_k = int(floor(args.k_nb / 2))

    # Inputs
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # k-mer -> count, per strand
    observed_f, observed_r = {}, {}
    expected_f, expected_r = {}, {}

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    for region in regions:
        last_start = -1
        repeats = 0

        # Observed frequencies: one k-mer per read, around its cut site
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            if r.is_reverse:
                cut_site = r.aend + args.reverse_shift + 1
            else:
                cut_site = r.pos + args.forward_shift - 1
            p1 = cut_site - half_k
            p2 = p1 + args.k_nb

            # PCR artifacts: drop reads once the same start position has
            # repeated more than max_duplicates times in a row
            if p1 == last_start:
                repeats += 1
            else:
                last_start = p1
                repeats = 0
            if repeats > max_duplicates:
                continue

            try:
                kmer = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue

            if r.is_reverse:
                kmer = AuxiliaryFunctions.revcomp(kmer)
                ct_reads_r += 1
                observed_r[kmer] = observed_r.get(kmer, 0) + 1
            else:
                ct_reads_f += 1
                observed_f[kmer] = observed_f.get(kmer, 0) + 1

        # Expected frequencies: every k-mer of the region, both strands
        try:
            sequence = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        sequence_rc = AuxiliaryFunctions.revcomp(sequence)

        for start in range(0, len(sequence) - args.k_nb):
            ct_kmers += 1
            stop = start + args.k_nb
            fwd = sequence[start:stop]
            expected_f[fwd] = expected_f.get(fwd, 0) + 1
            rev = sequence_rc[start:stop]
            expected_r[rev] = expected_r.get(rev, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Bias tables over every possible k-mer
    kmerComb = ["".join(e) for e in product(["A", "C", "G", "T"], repeat=args.k_nb)]
    bias_table_F = {}
    bias_table_R = {}
    for kmer in kmerComb:
        obsF = observed_f.get(kmer, 0) + pseudocount
        expF = expected_f.get(kmer, 0) + pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)

        obsR = observed_r.get(kmer, 0) + pseudocount
        expR = expected_r.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def create_signal(args, regions):
    """Count observed and expected k-mers per strand and write four tables.

    Observed k-mers are taken around the shifted cut site of every mapped
    read in ``regions``; expected k-mers are all k-mers of the regions and
    of their reverse complement.  Only k-mers made of A, C, G and T are
    counted.  Each table is written to ``args.output_location`` as
    ``<k>_f_obs.fa``, ``<k>_f_exp.fa``, ``<k>_r_obs.fa`` and
    ``<k>_r_exp.fa``, one ``kmer<TAB>count`` line per k-mer with a non-zero
    count.

    :param args: namespace with ``k_nb``, ``reads_file``, ``organism``,
        ``forward_shift``, ``reverse_shift`` and ``output_location``.
    :param regions: iterable of regions exposing ``chrom``, ``initial`` and
        ``final``.
    """
    complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    def revcomp(s):
        # Any symbol outside ACGTN (e.g. an IUPAC ambiguity code) maps to N
        # instead of raising KeyError.
        return "".join([complement.get(e, "N") for e in reversed(s)])

    k_nb = args.k_nb
    half_k = k_nb // 2

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    f_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    r_obs_dict = dict([(e, 0.0) for e in kmer_comb])
    f_exp_dict = dict([(e, 0.0) for e in kmer_comb])
    r_exp_dict = dict([(e, 0.0) for e in kmer_comb])

    bam_file = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fasta_file = Fastafile(genome_data.get_genome())

    try:
        for region in regions:
            # Fetching observed reads
            reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
            for read in reads:
                # Unmapped reads have no alignment end to derive a cut site from.
                if read.is_unmapped:
                    continue
                if read.is_reverse:
                    p1 = read.aend - half_k + args.reverse_shift + 1
                else:
                    p1 = read.pos - half_k + args.forward_shift - 1
                p2 = p1 + k_nb

                try:
                    dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    obs_table = r_obs_dict
                else:
                    obs_table = f_obs_dict
                # The membership test rejects k-mers containing N, k-mers
                # with other ambiguity codes and k-mers truncated at a
                # sequence end; the latter two used to raise KeyError.
                if dna_sequence_obs in obs_table:
                    obs_table[dna_sequence_obs] += 1

            # Fetching whole sequence
            try:
                dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            dna_sequence_exp_rev = revcomp(dna_sequence_exp)

            for i in range(0, len(dna_sequence_exp) - k_nb):
                s = dna_sequence_exp[i:i + k_nb]
                if s in f_exp_dict:
                    f_exp_dict[s] += 1
                s = dna_sequence_exp_rev[i:i + k_nb]
                if s in r_exp_dict:
                    r_exp_dict[s] += 1
    finally:
        bam_file.close()
        fasta_file.close()

    def write_counts(file_name, counts):
        # One "kmer<TAB>count" line per k-mer seen at least once.
        path = os.path.join(args.output_location, file_name)
        with open(path, "w") as handle:
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    handle.write(kmer + "\t" + str(counts[kmer]) + "\n")

    write_counts("{}_f_obs.fa".format(str(k_nb)), f_obs_dict)
    write_counts("{}_f_exp.fa".format(str(k_nb)), f_exp_dict)
    write_counts("{}_r_obs.fa".format(str(k_nb)), r_obs_dict)
    write_counts("{}_r_exp.fa".format(str(k_nb)), r_exp_dict)
def estimate_bias_pwm(args):
    """Estimate per-strand cleavage bias from position frequency matrices.

    Four position frequency matrices of width ``args.k_nb`` are counted:
    observed (k-mers around the shifted cut site of every read) and expected
    (all k-mers of the regions), each for the forward and reverse strand.
    They are written to ``<output_location>/pfm``, read back with
    ``motifs.read``, drawn as logos in ``<output_location>/logo``, and used
    to score every possible k-mer; the bias of a k-mer is its observed score
    divided by its expected score, rounded to six decimals.  The two bias
    tables are written with ``write_table``.

    :param args: namespace with ``reads_file``, ``organism``,
        ``regions_file``, ``k_nb``, ``forward_shift``, ``reverse_shift``,
        ``output_location`` and ``output_prefix``.
    """
    # Parameters
    max_duplicates = 100
    k_nb = args.k_nb
    half_k = k_nb // 2

    def new_pfm():
        # base -> per-position counts
        return dict([(base, [0.0] * k_nb) for base in ("A", "C", "G", "T", "N")])

    def ensure_dir(path):
        # Replaces the former `mkdir -p` shell call.
        if not os.path.isdir(path):
            os.makedirs(path)

    def read_pfm(path):
        # The file is closed as soon as the motif has been parsed.
        with open(path) as handle:
            return motifs.read(handle, "pfm")

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = new_pfm()
    exp_f_pwm_dict = new_pfm()
    obs_r_pwm_dict = new_pfm()
    exp_r_pwm_dict = new_pfm()

    try:
        # Iterating on HS regions
        for region in regions:
            # Initialization
            prev_pos = -1
            true_counter = 0

            # Evaluating observed frequencies
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):
                # Calculating positions
                if r.is_reverse:
                    cut_site = r.aend + args.reverse_shift + 1
                else:
                    cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if p1 == prev_pos:
                    true_counter += 1
                else:
                    prev_pos = p1
                    true_counter = 0
                if true_counter > max_duplicates:
                    continue

                # Fetching k-mer
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)
                    obs_table = obs_r_pwm_dict
                else:
                    obs_table = obs_f_pwm_dict
                for pos, base in enumerate(currStr):
                    obs_table[base][pos] += 1

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # Iterating on each sequence position
            for start in range(0, len(currStr) - k_nb):
                # Counting k-mer in dictionary
                s = currStr[start:start + k_nb]
                for pos, base in enumerate(s):
                    exp_f_pwm_dict[base][pos] += 1

                # Counting k-mer in dictionary for reverse complement
                s = AuxiliaryFunctions.revcomp(s)
                for pos, base in enumerate(s):
                    exp_r_pwm_dict[base][pos] += 1
    finally:
        # Closing files, also when a fetch raises
        bamFile.close()
        fastaFile.close()

    # Output pwms
    ensure_dir(os.path.join(args.output_location, "pfm"))
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(k_nb)))

    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

    for pwm_dict, pwm_path in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_path, "w") as pwm_file:
            # Only the four real bases are written; the N row is dropped.
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

    motif_obs_f = read_pfm(pwm_obs_f)
    motif_obs_r = read_pfm(pwm_obs_r)
    motif_exp_f = read_pfm(pwm_exp_f)
    motif_exp_r = read_pfm(pwm_exp_r)

    # Output logos
    ensure_dir(os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def get_raw_signal(arguments):
    """Accumulate cut-site signals of two conditions around one motif's sites.

    :param arguments: tuple ``(mpbs_name, mpbs_file1, mpbs_file2,
        reads_file1, reads_file2, organism, window_size, forward_shift,
        reverse_shift)``.  The two MPBS files are combined and sorted, and
        only the sites named ``mpbs_name`` are used.
    :return: tuple ``(signal_1, signal_2, motif_len, pwm, num_motif)``:
        the two numpy arrays of length ``window_size`` (one per reads file),
        the length of the first site (``None`` when there is no site), the
        per-base count table filled by ``update_pwm`` and the number of
        sites selected.
    """
    (mpbs_name, mpbs_file1, mpbs_file2, reads_file1, reads_file2, organism,
     window_size, forward_shift, reverse_shift) = arguments

    mpbs1 = GenomicRegionSet("Motif Predicted Binding Sites of Condition1")
    mpbs1.read(mpbs_file1)

    mpbs2 = GenomicRegionSet("Motif Predicted Binding Sites of Condition2")
    mpbs2.read(mpbs_file2)

    mpbs = mpbs1.combine(mpbs2, output=True)
    mpbs.sort()

    bam1 = Samfile(reads_file1, "rb")
    bam2 = Samfile(reads_file2, "rb")

    genome_data = GenomeData(organism)
    fasta = Fastafile(genome_data.get_genome())

    signal_1 = np.zeros(window_size)
    signal_2 = np.zeros(window_size)
    motif_len = None
    pwm = dict([(base, [0.0] * window_size) for base in ("A", "C", "G", "T", "N")])

    def add_cut_sites(bam, signal, chrom, p1, p2):
        # Add one count per read whose shifted cut site falls in [p1, p2).
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if read.is_reverse:
                cut_site = read.aend + reverse_shift - 1
            else:
                cut_site = read.pos + forward_shift
            if p1 <= cut_site < p2:
                signal[cut_site - p1] += 1.0

    mpbs_regions = mpbs.by_names([mpbs_name])
    num_motif = len(mpbs_regions)

    half_window = window_size // 2
    try:
        for region in mpbs_regions:
            if motif_len is None:
                motif_len = region.final - region.initial

            # Floor division keeps the window bounds usable as array indices
            # on Python 3 as well.
            mid = (region.final + region.initial) // 2
            p1 = mid - half_window
            p2 = mid + half_window

            # Windows touching or crossing the chromosome start are skipped.
            if p1 <= 0:
                continue

            # Fetch raw signal
            add_cut_sites(bam1, signal_1, region.chrom, p1, p2)
            add_cut_sites(bam2, signal_2, region.chrom, p1, p2)

            update_pwm(pwm, fasta, region, p1, p2)
    finally:
        # The three handles used to be leaked.
        bam1.close()
        bam2.close()
        fasta.close()

    return signal_1, signal_2, motif_len, pwm, num_motif
try: # Initialization ll = line.strip().split("\t") chrName = ll[0] p1 = int(ll[1]) p2 = int(ll[2]) p1_w = p1 - (window / 2) p2_w = p2 + (window / 2) p1_wk = p1_w - (k_nb / 2) p2_wk = p2_w + (k_nb / 2) # Raw counts nf = [0.0] * (p2_w - p1_w) nr = [0.0] * (p2_w - p1_w) for r in bamFile.fetch(chrName, p1_w, p2_w): if ((not r.is_reverse) and (r.pos > p1_w)): nf[r.pos - p1_w] += 1.0 if ((r.is_reverse) and ((r.aend - 1) < p2_w)): nr[r.aend - 1 - p1_w] += 1.0 #for i in range(p1_w, p2_w): # print i+1, nf[i-p1_w], nr[i-p1_w] # Smoothed counts Nf = [] Nr = [] fSum = sum(nf[:window]) rSum = sum(nr[:window]) fLast = nf[0] rLast = nr[0] for i in range((window / 2), len(nf) - (window / 2)):
# Opening files bamFile = Samfile(bamFileName, "rb") hsFile = open(hsFileName, "r") # Resulting statistics O_Plus = 0.0 O_Minus = 0.0 R = 0.0 # Iterating on HS regions for line in hsFile: # Fetching signal ll = line.strip().split("\t") pileup_region = PileupRegion(int(ll[1]), int(ll[2]), ext) iter = bamFile.fetch(reference=ll[0], start=int(ll[1]), end=int(ll[2])) for alignment in iter: pileup_region.__call__(alignment) raw_signalF = [min(e, initial_clip) for e in pileup_region.vectorF] raw_signalR = [min(e, initial_clip) for e in pileup_region.vectorR] # Updating statistics O_Plus += sum(raw_signalF) O_Minus += sum(raw_signalR) R += int(ll[2]) - int(ll[1]) # Writing results outputFile = open(outputFileName, "w") outputFile.write("\t".join(["O+", "O-", "R", "NormFactor+", "NormFactor-"]) + "\n") outputFile.write("\t".join(
def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name, k_nb, forward_shift, reverse_shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences,
    scoring every k-mer against observed and expected position weight matrices.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name (must end in ".bam", any case).
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- Length of the k-mers.
    forward_shift -- Shift added to the start of forward-strand reads.
    reverse_shift -- Shift added to the end of reverse-strand reads.

    Return:
    bias_table_F, bias_table_R -- Bias tables (as a two-element list), or None
                                  when dnase_file_name is not a BAM file.

    Side effects:
    Writes four logo PDFs under <output_loc>/Bias/logo and four PWM text files
    under <output_loc>/Bias/pwm.
    """

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    obsSeqsF = []
    obsSeqsR = []
    expSeqsF = []
    expSeqsR = []
    half_k = int(floor(k_nb / 2))

    try:
        # Iterating on HS regions
        for region in regions:

            # Evaluating observed frequencies
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + forward_shift - 1
                else:
                    cut_site = r.aend + reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Fetching k-mer; windows the genome cannot serve are skipped
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)

                # Collecting k-mer
                if 'N' not in currStr:
                    if not r.is_reverse:
                        obsSeqsF.append(Seq(currStr))
                    else:
                        obsSeqsR.append(Seq(currStr))

            # Evaluating expected frequencies
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue

            # The N test looks at the whole region sequence, so it is evaluated
            # once per region instead of once per position.
            # NOTE(review): indentation of the original was lost; this assumes
            # the reverse-complement append sat under the same N guard - confirm.
            if 'N' in currStr:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                expSeqsF.append(Seq(currStr[i:i + k_nb]))
                expSeqsR.append(Seq(currRevComp[i:i + k_nb]))
    finally:
        # Closing files even when something above raises
        bamFile.close()
        fastaFile.close()

    obsMotifsF = motifs.create(obsSeqsF)
    obsMotifsR = motifs.create(obsSeqsR)
    expMotifsF = motifs.create(expSeqsF)
    expMotifsR = motifs.create(expSeqsR)

    obsPwmF = obsMotifsF.pwm
    obsPwmR = obsMotifsR.pwm
    expPwmF = expMotifsF.pwm
    expPwmR = expMotifsR.pwm

    # Output logos (file names of both strands carry forward_shift)
    logo_dir = os.path.join(self.output_loc, "Bias", "logo")
    logo_jobs = [
        (obsMotifsF, "obs_{}_{}_f.pdf", 0.2, 0.1),
        (obsMotifsR, "obs_{}_{}_r.pdf", 0.2, 0.1),
        (expMotifsF, "exp_{}_{}_f.pdf", 0.02, 0.01),
        (expMotifsR, "exp_{}_{}_r.pdf", 0.02, 0.01),
    ]
    for motif, pattern, scale, tic in logo_jobs:
        logo_path = os.path.join(logo_dir, pattern.format(str(k_nb), str(forward_shift)))
        motif.weblogo(logo_path, format="pdf", stack_width="large",
                      color_scheme="color_classic",
                      yaxis_scale=scale, yaxis_tic_interval=tic)

    # Output pwms
    pwm_dir = os.path.join(self.output_loc, "Bias", "pwm")
    pwm_jobs = [
        (obsPwmF, "obs_{}_{}_f.pwm"),
        (obsPwmR, "obs_{}_{}_r.pwm"),
        (expPwmF, "exp_{}_{}_f.pwm"),
        (expPwmR, "exp_{}_{}_r.pwm"),
    ]
    for pwm_data, pattern in pwm_jobs:
        pwm_path = os.path.join(pwm_dir, pattern.format(str(k_nb), str(forward_shift)))
        with open(pwm_path, "w") as f:
            f.write(str(pwm_data))

    # Creating bias dictionary: observed score over expected score per k-mer
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = {}
    bias_table_R = {}
    for k_mer in k_mer_comb:
        obsF = self.get_pwm_score(k_mer, obsPwmF, k_nb)
        expF = self.get_pwm_score(k_mer, expPwmF, k_nb)
        bias_table_F[k_mer] = round(obsF / expF, 6)
        obsR = self.get_pwm_score(k_mer, obsPwmR, k_nb)
        expR = self.get_pwm_score(k_mer, expPwmR, k_nb)
        bias_table_R[k_mer] = round(obsR / expR, 6)

    # Return
    return [bias_table_F, bias_table_R]
higherStrand = "NA" for k in ctcfIndexList: motifFile = motifFileList[k] motifFetch = motifFile.fetch(chrom, start, end) for read in motifFetch: rr = read.qname.split(":") motifScore = float(rr[1]) if (motifScore > higherScore): higherScore = motifScore higherStrand = "+" if (read.is_reverse): higherStrand = "-" vectorTable1.append(higherStrand) # Genomic region regionVec = [] regionFetch = regionsFile.fetch(chrom, start, end) for read in regionFetch: qName = "NA" if (read.qname and read.qname != "."): qName = read.qname try: geneSymbol = aliasDict[read.qname.split(":")[1]] except Exception: geneSymbol = "NA" if (qName == "INTERGENIC:."): qName = "INTERGENIC:NA" startx = "NA" if (read.pos and read.pos != "."): startx = str(read.pos) endx = "NA" if (read.aend and read.aend != "."): endx = str(read.aend) strand = "+" if (read.is_reverse): strand = "-" if (qName == "INTERGENIC:NA"): strand = "NA"
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, forward_shift, reverse_shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences,
    by comparing how often each k-mer is observed at cut sites with how often
    it occurs in the region sequences.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq file name (must end in ".bam", any case).
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- Length of the k-mers.
    forward_shift -- Shift added to the start of forward-strand reads.
    reverse_shift -- Shift added to the end of reverse-strand reads.

    Return:
    bias_table_F, bias_table_R -- Bias tables (as a two-element list), or None
                                  when dnase_file_name is not a BAM file.
    """

    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    # Initializing dictionaries
    obsDictF = {}
    obsDictR = {}
    expDictF = {}
    expDictR = {}

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0
    half_k = int(floor(k_nb / 2))

    try:
        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################
            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if not r.is_reverse:
                    cut_site = r.pos + forward_shift - 1
                else:
                    cut_site = r.aend + reverse_shift + 1
                p1 = cut_site - half_k
                p2 = p1 + k_nb

                # Verifying PCR artifacts: consecutive reads giving the same
                # window are ignored once more than maxDuplicates were seen
                if p1 == prevPos:
                    trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if trueCounter > maxDuplicates:
                    continue

                # Fetching k-mer; windows the genome cannot serve are skipped
                try:
                    currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    continue

                # Counting k-mer in dictionary
                if r.is_reverse:
                    currStr = AuxiliaryFunctions.revcomp(currStr)
                    ct_reads_r += 1
                    obsDictR[currStr] = obsDictR.get(currStr, 0) + 1
                else:
                    ct_reads_f += 1
                    obsDictF[currStr] = obsDictF.get(currStr, 0) + 1

            # Evaluating expected frequencies ####################################
            # Fetching whole sequence
            try:
                currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0, len(currStr) - k_nb):
                ct_kmers += 1
                # Counting k-mer in dictionary
                s = currStr[i:i + k_nb]
                expDictF[s] = expDictF.get(s, 0) + 1
                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                expDictR[s] = expDictR.get(s, 0) + 1
    finally:
        # Closing files even when something above raises
        bamFile.close()
        fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = {}
    bias_table_R = {}
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        # Without reads (or without any expected k-mer, which would otherwise
        # divide by zero) the neutral bias value 1 is used.
        if ct_reads_f == 0 or ct_kmers == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(
                float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)

        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0 or ct_kmers == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(
                float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    # Return
    return [bias_table_F, bias_table_R]
chromList = sorted(csDict.keys()) ################################################# # Fetching observed frequencies ################################################# if (allTagsFg == "Y"): # Iterating on chromosomes for chrom in chromList: prevPos = -1 trueCounter = 0 # Iterating on chromosome reads for r in bamFile.fetch(chrom, (k_nb / 2), csDict[chrom] - (k_nb / 2)): # Calculating positions if (not r.is_reverse): p1 = r.pos - (k_nb / 2) - 1 # The -1 is because He is wrong else: p1 = r.aend - (k_nb / 2) + 1 # The +1 is because He is wrong p2 = p1 + k_nb # Verifying PCR artifacts if (p1 == prevPos): trueCounter += 1 else: prevPos = p1 trueCounter = 0 if (trueCounter > maxDuplicates): continue
chromList = ["chr" + str(e) for e in range(1, 23) + ["X"]] command = "mkdir -p " + tempLoc os.system(command) # Iterating throught the regions chipRegionFile = open(chipRegionFileName, "rU") motifBamFile = Samfile(motifBamFileName, "rb") tempBedFileName = tempLoc + "tempBedFileName" tempBedFile = open(tempBedFileName, "w") for line in chipRegionFile: ll = line.strip().split("\t") chrom = ll[0] start = int(ll[1]) end = int(ll[2]) if (chrom not in chromList): continue mfetch = motifBamFile.fetch(chrom, start, end) bestScore = -9999 bestMotif = None for read in mfetch: reference_start = str(read.reference_start) reference_end = str(read.reference_end) rr = read.query_name.split(":") name = rr[0] score = float(rr[1]) strand = "+" if (read.is_reverse): strand = "-" if (score > bestScore): bestScore = score bestMotif = [ chrom, reference_start, reference_end, name, str(score), strand
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope
    signals from a bam file.

    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.
    """

    def __init__(self, file_name):
        """
        Initializes GenomicSignal.

        Keyword arguments:
        file_name -- Name of the bam file; it is opened immediately.
        """
        self.file_name = file_name
        self.sg_coefs = None
        self.bam = Samfile(file_name, "rb")

    def load_sg_coefs(self, slope_window_size):
        """
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def _fetch_clipped_signal(self, ref, start, end, downstream_ext, upstream_ext,
                              forward_shift, reverse_shift, initial_clip):
        """
        Piles up the reads of self.bam between start and end and clips the result.

        Return:
        raw_signal -- Array with the pileup vector capped at initial_clip.
        clip_signal -- List with raw_signal further capped at mean + 10 * std.
        """
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext,
                                     forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            # This pysam version drives the callback itself
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]
        return raw_signal, clip_signal

    def _append_fixed_step(self, file_name, ref, start, values):
        """
        Appends to file_name a fixedStep header line followed by one value per line.
        NaN entries are replaced through nan_to_num before writing.
        """
        with open(file_name, "a") as signal_file:
            signal_file.write(
                "fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" +
                "\n".join([str(e) for e in nan_to_num(values)]) + "\n")

    def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext,
                      forward_shift, reverse_shift, initial_clip=1000):
        """
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        downstream_ext -- Number of bps to extend towards the downstream region
                          (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
                        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
                         Can be a positive number for a shift towards the downstream region
                         (towards the inside of the aligned read) and a negative number for
                         a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
                         Can be a positive number for a shift towards the upstream region
                         and a negative number for a shift towards the downstream region
                         (towards the inside of the aligned read).
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.

        Return:
        tag_count -- Total signal.
        """
        _, clip_signal = self._fetch_clipped_signal(
            ref, start, end, downstream_ext, upstream_ext,
            forward_shift, reverse_shift, initial_clip)

        # Tag count
        try:
            tag_count = sum(clip_signal)
        except Exception:
            tag_count = 0

        return tag_count

    def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift,
                   reverse_shift, initial_clip=1000, per_norm=98, per_slope=98,
                   bias_table=None, genome_file_name=None, print_raw_signal=False,
                   print_bc_signal=False, print_norm_signal=False,
                   print_slope_signal=False, strands_specific=False):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during
        the normalization and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Genome to perform bias correction.
        downstream_ext -- Number of bps to extend towards the downstream region
                          (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
                        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
                         Can be a positive number for a shift towards the downstream region
                         (towards the inside of the aligned read) and a negative number for
                         a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
                         Can be a positive number for a shift towards the upstream region
                         and a negative number for a shift towards the downstream region
                         (towards the inside of the aligned read).
        print_raw_signal, print_bc_signal, print_norm_signal, print_slope_signal --
                         When set, name of the file the corresponding signal is
                         appended to.
        strands_specific -- Passed through to bias_correction.

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """
        # Fetch raw signal and apply std-based clipping
        raw_signal, clip_signal = self._fetch_clipped_signal(
            ref, start, end, downstream_ext, upstream_ext,
            forward_shift, reverse_shift, initial_clip)

        # Cleavage bias correction
        bias_corrected_signal = self.bias_correction(
            clip_signal, bias_table, genome_file_name, ref, start, end,
            forward_shift, reverse_shift, strands_specific)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bias_corrected_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        std = abs_seq.std()
        slopehon_signal = self.hon_norm(slope_signal, perc, std)

        # Writing signal
        if print_raw_signal:
            self._append_fixed_step(print_raw_signal, ref, start, raw_signal)
        if print_bc_signal:
            self._append_fixed_step(print_bc_signal, ref, start, bias_corrected_signal)
        if print_norm_signal:
            self._append_fixed_step(print_norm_signal, ref, start, hon_signal)
        if print_slope_signal:
            self._append_fixed_step(print_slope_signal, ref, start, slope_signal)

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                        forward_shift, reverse_shift, strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal. It is only used as the return value when no
                  correction is performed; the corrected signal is rebuilt from
                  the reads of self.bam.
        bias_table -- Bias table: pair (forward dict, reverse dict) keyed by k-mer.
        genome_file_name -- Genome to fetch the k-mers from.
        chrName -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        forward_shift -- Shift applied to forward-strand reads.
        reverse_shift -- Shift applied to reverse-strand reads.
        strands_specific -- When true, return one shifted signal per strand.

        Return:
        bias_corrected_signal -- Bias-corrected sequence, or the pair
                                 (forward, reverse) when strands_specific is set,
                                 or `signal` itself when there is no bias table
                                 or the window reaches the chromosome start.
        """
        if not bias_table:
            return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0

        # Initialization
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(next(iter(fBiasDict)))
        # Integer halves: they are used as range bounds and list indices
        half_window = window // 2
        k_floor = int(floor(k_nb / 2.))
        k_ceil = int(ceil(k_nb / 2.))
        p1 = start
        p2 = end
        p1_w = p1 - half_window
        p2_w = p2 + half_window
        p1_wk = p1_w - k_floor
        p2_wk = p2_w + k_ceil
        if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
            return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if start <= cut_site < end:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if start <= cut_site < end:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts (running sums over `window` positions)
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range(half_window, len(nf) - half_window):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + half_window]
            fLast = nf[i - half_window + 1]
            rSum -= rLast
            rSum += nr[i + half_window]
            rLast = nr[i - half_window + 1]

        # Fetching sequence; the genome is opened only once it is really needed
        # and closed even if a fetch fails
        fastaFile = Fastafile(genome_file_name)
        try:
            currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(
                str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())
        finally:
            fastaFile.close()

        # Iterating on sequence to create signal
        af = []
        ar = []
        seq_len = len(currStr)
        for i in range(k_ceil, seq_len - k_floor + 1):
            fseq = currStr[i - k_floor:i + k_ceil]
            rseq = currRevComp[seq_len - k_ceil - i:seq_len + k_floor - i]
            # k-mers missing from the tables (e.g. containing N) get the default
            af.append(fBiasDict.get(fseq, defaultKmerValue))
            ar.append(rBiasDict.get(rseq, defaultKmerValue))

        # Calculating bias
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range(half_window, len(af) - half_window):
            nhatf = Nf[i - half_window] * (af[i] / fSum)
            nhatr = Nr[i - half_window] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + half_window]
            fLast = af[i - half_window + 1]
            rSum -= rLast
            rSum += ar[i + half_window]
            rLast = ar[i - half_window + 1]

        # Fixing the negative number in bias corrected signal
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]
        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]

        if not strands_specific:
            return bias_corrected_signal
        return bias_fixed_signal_forward, bias_fixed_signal_reverse

    def hon_norm(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Zeros stay zero, positive values are mapped through a logistic function
        and negative values through the mirrored (negated) logistic function.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence.
        """
        norm_seq = []
        for e in sequence:
            if e == 0.0:
                norm_seq.append(0.0)
            elif e > 0.0:
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            else:
                norm_seq.append(-1.0 / (1.0 + (exp(-(-e - mean) / std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Every value is divided by the mean of the strictly positive values; the
        sequence is returned untouched when it has no positive value.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence.
        """
        mean = array([e for e in sequence if e > 0]).mean()
        if isnan(mean):
            return sequence
        return [(float(e) / mean) for e in sequence]

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the
        signal. It uses a window_size (of the interpolation), order (of the polynomial),
        deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """
        # Get statistics
        # TODO ERRORS: window_size should be a positive odd number >= order + 2
        window_size = abs(int(window_size))
        order = abs(int(order))
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients (plain ndarray: numpy.mat is gone in NumPy 2)
        b = array([[k ** i for i in order_range]
                   for k in range(-half_window, half_window + 1)])
        m = linalg.pinv(b)[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence.
        """
        slope_seq = convolve(sequence, sg_coefs)
        # Trim half a kernel on each side of the convolution output
        half_kernel = len(sg_coefs) // 2
        return [e for e in slope_seq[half_kernel:(len(slope_seq) - half_kernel)]]

    def get_signal_per_strand(self, ref, start, end, downstream_ext, upstream_ext,
                              forward_shift, reverse_shift, initial_clip=1000,
                              per_norm=98, per_slope=98, bias_table=None,
                              genome_file_name=None, print_raw_signal=False,
                              print_bc_signal=False, print_norm_signal=False,
                              print_slope_signal=False, strands_specific=True):
        """
        Gets the normalized and slope signals of each strand separately.

        :param ref: Chromosome name.
        :param start: Initial genomic coordinate of signal.
        :param end: Final genomic coordinate of signal.
        :param downstream_ext: Number of bps to extend towards the downstream region
        :param upstream_ext: Number of bps to extend towards the upstream region
        :param forward_shift: Number of bps to shift the reads aligned to the forward strand.
        :param reverse_shift: Number of bps to shift the reads aligned to the reverse strand.
        :param initial_clip: Signal will be initially clipped at this level to avoid outliers.
        :param per_norm: Percentile value for 'hon_norm' function of the normalized signal.
        :param per_slope: Percentile value for 'hon_norm' function of the slope signal.
        :param bias_table: Bias table to perform bias correction.
        :param genome_file_name: Genome to perform bias correction.
        :param print_raw_signal: not used by this method.
        :param print_bc_signal: not used by this method.
        :param print_norm_signal: not used by this method.
        :param print_slope_signal: not used by this method.
        :param strands_specific: passed through to bias_correction.
        :return: normalized and slope signal for each strand.
        """
        raw_signal_forward = [0.0] * (end - start)
        raw_signal_reverse = [0.0] * (end - start)

        for read in self.bam.fetch(reference=ref, start=start, end=end):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if start <= cut_site < end:
                    raw_signal_forward[cut_site - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if start <= cut_site < end:
                    raw_signal_reverse[cut_site - start] += 1.0

        raw_signal_forward = array([min(e, initial_clip) for e in raw_signal_forward])
        raw_signal_reverse = array([min(e, initial_clip) for e in raw_signal_reverse])

        # Std-based clipping
        mean = raw_signal_forward.mean()
        std = raw_signal_forward.std()
        clip_signal_forward = [min(e, mean + (10 * std)) for e in raw_signal_forward]
        mean = raw_signal_reverse.mean()
        std = raw_signal_reverse.std()
        clip_signal_reverse = [min(e, mean + (10 * std)) for e in raw_signal_reverse]

        # Cleavage bias correction
        if bias_table:
            # NOTE(review): bias_correction hands back its `signal` argument
            # unchanged when the window reaches the chromosome start, which
            # would not unpack into two strands here - confirm with callers.
            bc_signal_forward, bc_signal_reverse = self.bias_correction(
                raw_signal_forward, bias_table, genome_file_name, ref, start, end,
                forward_shift, reverse_shift, strands_specific)
        else:
            bc_signal_forward = clip_signal_forward
            bc_signal_reverse = clip_signal_reverse

        # Boyle normalization (within-dataset normalization)
        boyle_signal_forward = array(self.boyle_norm(bc_signal_forward))
        boyle_signal_reverse = array(self.boyle_norm(bc_signal_reverse))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal_forward, per_norm)
        std = boyle_signal_forward.std()
        hon_signal_forward = self.hon_norm(boyle_signal_forward, perc, std)

        perc = scoreatpercentile(boyle_signal_reverse, per_norm)
        std = boyle_signal_reverse.std()
        hon_signal_reverse = self.hon_norm(boyle_signal_reverse, perc, std)

        # Slope signal
        slope_signal_forward = self.slope(hon_signal_forward, self.sg_coefs)
        slope_signal_reverse = self.slope(hon_signal_reverse, self.sg_coefs)

        # Returning normalized and slope sequences
        return hon_signal_forward, slope_signal_forward, hon_signal_reverse, slope_signal_reverse
if(not os.path.exists(args.outdir)): os.makedirs(args.outdir) #________________________________________________________________________________________________________________ #get error dict; #________________________________________________________________________________________________________________ errors = defaultdict(set) with open(args.errors, 'r') as f: for l in f: a = l.strip().split("\t"); if(not args.etype or ",".join(a[:2]) in args.etype): errors[tuple(a[:2])].add(a[2]); errors2segments = defaultdict(lambda: defaultdict(list)); samfile = Samfile(args.path) for segment in samfile.fetch(until_eof=True): num = segment.query_name.split("|")[0] for etype, eset in errors.iteritems(): if(num in eset): errors2segments[etype][num].append(segment); break; additional = defaultdict(list); for fname in args.additional: tsamfile = Samfile(fname); for segment in tsamfile.fetch(until_eof=True): num = segment.query_name.split("|")[0] additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid))) tsamfile.close();
intFile = open(intFileName,"r") spr = 0.0 counter = 0.0 for line in intFile: # Fetching signal ll = line.strip().split("\t") mLen = int(ll[2]) - int(ll[1]) mid = (int(ll[1])+int(ll[2]))/2 p1 = max(mid - halfWindow,0) p2 = mid + halfWindow # Fetch raw signal pileup_region = PileupRegion(p1,p2,1) if(ps_version == "0.7.5"): bam.fetch(reference=ll[0], start=p1, end=p2, callback = pileup_region) else: iter = bam.fetch(reference=ll[0], start=p1, end=p2) for alignment in iter: pileup_region.__call__(alignment) raw_signal = array([min(e,initial_clip) for e in pileup_region.vector]) # Std-based clipping mean = raw_signal.mean() std = raw_signal.std() clip_signal = [min(e, mean + (10 * std)) for e in raw_signal] # Bias Correction correctedSignal = bias_correction(bam, clip_signal, biasTableF, biasTableR, genomeFileName, ll[0], p1, p2) # Summing min value to signal stdzSignal = [e+minValue for e in correctedSignal]
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope signals from a bam file.

    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.
    """

    def __init__(self, file_name=None):
        """
        Initializes GenomicSignal.

        Keyword arguments:
        file_name -- Path of the bam file to open. When None no file is opened and self.bam is None.
        """
        self.file_name = file_name
        self.sg_coefs = None
        self.bam = None
        if file_name is not None:
            self.bam = Samfile(file_name, "rb")

    # ------------------------------------------------------------------
    # Private helpers shared by the public methods below
    # ------------------------------------------------------------------

    def _clipped_pileup(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                        initial_clip):
        """Accumulates self.bam reads of [start, end) in a PileupRegion and clips each value at initial_clip."""
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            # This pysam version takes the accumulator as a fetch callback.
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region(alignment)
        return array([min(e, initial_clip) for e in pileup_region.vector])

    def _count_cut_sites(self, bam, ref, start, end, forward_shift, reverse_shift, keep_length=None):
        """
        Counts cut sites per strand inside [start, end).

        The cut site is read.pos + forward_shift for forward reads and read.aend + reverse_shift - 1 for
        reverse reads. Unmapped reads are skipped (issue #112). keep_length, when given, is called with
        abs(read.template_length) and reads for which it returns False are skipped.

        Return:
        (forward, reverse) -- Two lists of floats of length end - start.
        """
        forward = [0.0] * (end - start)
        reverse = [0.0] * (end - start)
        for read in bam.fetch(ref, start, end):
            if read.is_unmapped:
                continue
            if keep_length is not None and not keep_length(abs(read.template_length)):
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                counts = forward
            else:
                cut_site = read.aend + reverse_shift - 1
                counts = reverse
            if start <= cut_site < end:
                counts[cut_site - start] += 1.0
        return forward, reverse

    def _window_sums(self, values, window):
        """
        Returns the running sums of every `window`-wide slice of values.

        Entry k is the sum of values[k:k + window]; the result has len(values) - 2 * (window // 2) entries.
        The sum is updated incrementally (subtract the value leaving, add the value entering).
        """
        half = window // 2
        total = sum(values[:window])
        leaving = values[0]
        sums = []
        for i in range(half, len(values) - half):
            sums.append(total)
            total -= leaving
            total += values[i + half]
            leaving = values[i - half + 1]
        return sums

    def _kmer_bias(self, forward_seq, reverse_seq, forward_table, reverse_table, left, right, default):
        """
        Looks up the bias value of the k-mer around every position of the fetched sequences.

        left / right are the number of bases taken before / after each position. K-mers missing from a
        table get `default`.

        Return:
        (bias_f, bias_r) -- Per-position bias values for the forward and reverse strand.
        """
        bias_f = []
        bias_r = []
        seq_len = len(forward_seq)
        for i in range(right, seq_len - left + 1):
            fseq = forward_seq[i - left:i + right]
            rseq = reverse_seq[seq_len - right - i:seq_len + left - i]
            try:
                bias_f.append(forward_table[fseq])
            except KeyError:
                bias_f.append(default)
            try:
                bias_r.append(reverse_table[rseq])
            except KeyError:
                bias_r.append(default)
        return bias_f, bias_r

    def _expected_cleavage(self, raw_f, raw_r, bias_f, bias_r, window):
        """
        Computes the bias-expected cleavage per position and strand.

        For output position k (input position k + window // 2) the value is the window sum of raw counts
        multiplied by the position's bias divided by the window sum of bias values.

        Return:
        (expected_f, expected_r) -- Lists with one entry per bias window.
        """
        half = window // 2
        smooth_f = self._window_sums(raw_f, window)
        smooth_r = self._window_sums(raw_r, window)
        bias_sum_f = self._window_sums(bias_f, window)
        bias_sum_r = self._window_sums(bias_r, window)
        expected_f = []
        expected_r = []
        for k in range(len(bias_sum_f)):
            i = k + half
            expected_f.append(smooth_f[k] * (bias_f[i] / bias_sum_f[k]))
            expected_r.append(smooth_r[k] * (bias_r[i] / bias_sum_r[k]))
        return expected_f, expected_r

    def _fetch_strand_sequences(self, fasta, ref, start, end):
        """Returns the upper-cased sequence of [start, end - 1) and the reverse complement of [start + 1, end)."""
        forward_seq = str(fasta.fetch(ref, start, end - 1)).upper()
        reverse_seq = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, start + 1, end)).upper())
        return forward_seq, reverse_seq

    def _corrected_counts(self, bias_table, genome_file_name, chrName, start, end, forward_shift, reverse_shift):
        """
        Shared core of bias_correction_dnase / bias_correction_atac / bias_correction_atac2.

        Return:
        None when the extended window would reach position 0 or below, otherwise the tuple
        (raw_f, raw_r, expected_f, expected_r). The raw lists cover the region extended by window // 2 on
        each side; the expected lists cover [start, end).
        """
        window = 50
        default_kmer_value = 1.0
        forward_table = bias_table[0]
        reverse_table = bias_table[1]
        # next(iter(...)) reads one key on both Python 2 and Python 3 (dict.keys() is not indexable on 3).
        k_nb = len(next(iter(forward_table)))
        left = int(floor(k_nb / 2.))
        right = int(ceil(k_nb / 2.))
        p1_w = start - (window // 2)
        p2_w = end + (window // 2)
        p1_wk = p1_w - left
        p2_wk = p2_w + right
        if start <= 0 or p1_w <= 0 or p1_wk <= 0:
            return None

        raw_f, raw_r = self._count_cut_sites(self.bam, chrName, p1_w, p2_w, forward_shift, reverse_shift)

        # Open the genome only when it is actually needed and always close it again.
        fasta = Fastafile(genome_file_name)
        try:
            forward_seq, reverse_seq = self._fetch_strand_sequences(fasta, chrName, p1_wk, p2_wk)
        finally:
            fasta.close()

        bias_f, bias_r = self._kmer_bias(forward_seq, reverse_seq, forward_table, reverse_table,
                                         left, right, default_kmer_value)
        expected_f, expected_r = self._expected_cleavage(raw_f, raw_r, bias_f, bias_r, window)
        return raw_f, raw_r, expected_f, expected_r

    def _append_wig(self, file_name, ref, start, values):
        """Appends values (NaNs replaced through nan_to_num) to file_name as a fixedStep wig block."""
        with open(file_name, "a") as f:
            f.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" + "\n".join(
                [str(e) for e in nan_to_num(values)]) + "\n")

    def _boyle_hon_atac(self, values, percentile):
        """Applies boyle_norm followed by hon_norm_atac (percentile score and np.std of the Boyle signal)."""
        norm_values = self.boyle_norm(values)
        perc = scoreatpercentile(norm_values, percentile)
        std = np.std(norm_values)
        return self.hon_norm_atac(norm_values, perc, std)

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def load_sg_coefs(self, slope_window_size):
        """
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                      initial_clip=1000):
        """
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.

        Return:
        tag_count -- Total signal.
        """
        raw_signal = self._clipped_pileup(ref, start, end, downstream_ext, upstream_ext,
                                          forward_shift, reverse_shift, initial_clip)
        try:
            tag_count = sum(raw_signal)
        except Exception:
            # Best effort kept from the original implementation: an unsummable signal counts as 0.
            tag_count = 0
        return tag_count

    def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                   initial_clip=1000, per_norm=98, per_slope=98,
                   bias_table=None, genome_file_name=None, print_raw_signal=False):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip and per_norm are used as normalization factors during the normalization procedure.
        per_slope and print_raw_signal are accepted but not read by this method.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Genome to perform bias correction.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """
        # Fetch raw signal
        raw_signal = self._clipped_pileup(ref, start, end, downstream_ext, upstream_ext,
                                          forward_shift, reverse_shift, initial_clip)

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]

        # Cleavage bias correction
        bc_signal = self.bias_correction_dnase(clip_signal, bias_table, genome_file_name, ref, start, end,
                                               forward_shift, reverse_shift)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bc_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm_dnase(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Returning normalized and slope sequences
        return hon_signal, slope_signal

    def get_signal_atac(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                        initial_clip=50, per_norm=98, per_slope=98,
                        bias_table=None, genome_file_name=None):
        """
        Gets the strand-specific normalized and slope signals for [start, end).

        downstream_ext, upstream_ext, initial_clip and per_slope are accepted but not read by this method;
        per_norm is used for both the signal and the slope normalization.

        Return:
        hon_signal_forward, slope_signal_forward, hon_signal_reverse, slope_signal_reverse
        """
        # Cleavage bias correction
        bc_signal_forward, bc_signal_reverse = self.bias_correction_atac(bias_table, genome_file_name,
                                                                         ref, start, end,
                                                                         forward_shift, reverse_shift)

        # Boyle normalization (within-dataset normalization)
        boyle_signal_forward = array(self.boyle_norm(bc_signal_forward))
        boyle_signal_reverse = array(self.boyle_norm(bc_signal_reverse))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal_forward, per_norm)
        std = boyle_signal_forward.std()
        hon_signal_forward = self.hon_norm_atac(boyle_signal_forward, perc, std)

        perc = scoreatpercentile(boyle_signal_reverse, per_norm)
        std = boyle_signal_reverse.std()
        hon_signal_reverse = self.hon_norm_atac(boyle_signal_reverse, perc, std)

        # Slope signal
        slope_signal_forward = self.slope(hon_signal_forward, self.sg_coefs)
        slope_signal_reverse = self.slope(hon_signal_reverse, self.sg_coefs)

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(slope_signal_forward, per_norm)
        std = np.std(slope_signal_forward)
        slope_signal_forward = self.hon_norm_atac(slope_signal_forward, perc, std)

        perc = scoreatpercentile(slope_signal_reverse, per_norm)
        # The reverse slope is normalized with its own spread (it previously reused the forward slope).
        std = np.std(slope_signal_reverse)
        slope_signal_reverse = self.hon_norm_atac(slope_signal_reverse, perc, std)

        # Returning normalized and slope sequences
        return hon_signal_forward, slope_signal_forward, hon_signal_reverse, slope_signal_reverse

    def get_signal_atac2(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                         initial_clip=50, per_norm=98, per_slope=98,
                         bias_table=None, genome_file_name=None):
        """
        Gets the strand-merged normalized and slope signals for [start, end).

        downstream_ext, upstream_ext and initial_clip are accepted but not read by this method.

        Return:
        hon_signal -- Normalized signal.
        slope_signal -- Normalized slope signal.
        """
        # Cleavage bias correction
        bc_signal = self.bias_correction_atac2(bias_table, genome_file_name,
                                               ref, start, end, forward_shift, reverse_shift)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bc_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm_atac(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Boyle + Hon normalization of the slope
        slope_signal = self.boyle_norm(slope_signal)
        perc = scoreatpercentile(slope_signal, per_slope)
        std = np.std(slope_signal)
        slope_signal = self.hon_norm_atac(slope_signal, perc, std)

        # Returning normalized and slope sequences
        return hon_signal, slope_signal

    def bias_correction_dnase(self, signal, bias_table, genome_file_name, chrName, start, end,
                              forward_shift, reverse_shift):
        """
        Computes the bias-corrected cleavage signal of [start, end) from self.bam.

        Each output value is log(observed + 1) - log(expected + 1) summed over both strands.

        Return:
        signal unchanged when bias_table is empty/None or when the extended window would reach position 0
        or below; otherwise the list of bias-corrected values.
        """
        if not bias_table:
            return signal

        corrected = self._corrected_counts(bias_table, genome_file_name, chrName, start, end,
                                           forward_shift, reverse_shift)
        if corrected is None:
            return signal
        nf, nr, expected_f, expected_r = corrected

        half = 50 // 2  # raw counts carry window // 2 extra positions on each side
        bias_corrected_signal = []
        for k in range(len(expected_f)):
            i = k + half
            zf = log(nf[i] + 1) - log(expected_f[k] + 1)
            zr = log(nr[i] + 1) - log(expected_r[k] + 1)
            bias_corrected_signal.append(zf + zr)
        return bias_corrected_signal

    def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end,
                             forward_shift, reverse_shift):
        """
        Computes the bias-expected cleavage per strand for [start, end) from self.bam.

        Return:
        (forward, reverse) -- Bias-expected values per strand. When the extended window would reach
        position 0 or below, the raw per-strand cut-site counts of [start, end) are returned instead.
        """
        corrected = self._corrected_counts(bias_table, genome_file_name, chrName, start, end,
                                           forward_shift, reverse_shift)
        if corrected is None:
            # Return raw counts
            return self._count_cut_sites(self.bam, chrName, start, end, forward_shift, reverse_shift)
        return corrected[2], corrected[3]

    def bias_correction_atac2(self, bias_table, genome_file_name, chrName, start, end,
                              forward_shift, reverse_shift):
        """
        Computes the strand-merged bias-expected cleavage for [start, end) from self.bam.

        Return:
        List with forward + reverse expected values per position. When the extended window would reach
        position 0 or below, the strand-merged raw cut-site counts of [start, end) are returned instead.
        """
        corrected = self._corrected_counts(bias_table, genome_file_name, chrName, start, end,
                                           forward_shift, reverse_shift)
        if corrected is None:
            # Return raw counts
            raw_f, raw_r = self._count_cut_sites(self.bam, chrName, start, end, forward_shift, reverse_shift)
            return [f + r for f, r in zip(raw_f, raw_r)]
        return [f + r for f, r in zip(corrected[2], corrected[3])]

    def hon_norm_atac(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization. Zero entries are kept as they are.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence (the input sequence itself when std is 0).
        """
        if std == 0:
            return sequence
        norm_seq = []
        for e in sequence:
            if e == 0:
                norm_seq.append(e)
            else:
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
        return norm_seq

    def hon_norm_dnase(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization. Zeros stay 0.0, positive entries go through the
        logistic function and negative entries are mirrored (normalized by magnitude, sign restored).

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence.
        """
        # NOTE(review): unlike hon_norm_atac there is no std == 0 guard here; confirm that is intended.
        norm_seq = []
        for e in sequence:
            if e == 0.0:
                norm_seq.append(0.0)
            elif e > 0.0:
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            else:
                norm_seq.append(-1.0 / (1.0 + (exp(-(-e - mean) / std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization: every entry is divided by the mean of the
        strictly positive entries.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence (the input sequence itself when it has no positive entry).
        """
        positives = [e for e in sequence if e > 0]
        if not positives:
            # No positive value means no defined mean; checked up front instead of testing for NaN.
            return sequence
        mean = array(positives).mean()
        return [(float(e) / mean) for e in sequence]

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """
        # TODO: validate that window_size is a positive odd number larger than order + 1.
        window_size = abs(int(window_size))
        order = abs(int(order))
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute coefficients. A plain ndarray is used because numpy.mat was removed in NumPy 2.0;
        # row `deriv` of the pseudo-inverse is the same either way.
        b = np.array([[k ** i for i in order_range] for k in range(-half_window, half_window + 1)])
        m = np.linalg.pinv(b)[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence.
        """
        # Floor division keeps the trim width usable as a slice index on Python 3.
        trim = len(sg_coefs) // 2
        slope_seq = convolve(sequence, sg_coefs)
        return list(slope_seq[trim:(len(slope_seq) - trim)])

    def print_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                     initial_clip=1000, per_norm=98, per_slope=98, bias_table=None, genome_file_name=None,
                     raw_signal_file=None, bc_signal_file=None, norm_signal_file=None, strand_specific=False):
        """
        Appends the raw, bias-corrected and/or normalized signal of [start, end) to wig files.

        Keyword arguments:
        raw_signal_file -- When given, the clipped pileup signal is appended to this file.
        bc_signal_file -- When given, the strand-merged bias-expected signal is appended to this file.
        norm_signal_file -- When given, the Boyle/Hon normalized bias-expected signal is appended to it.
        strand_specific -- When True, per-strand "<prefix>_Forward" / "<prefix>_Reverse" files are also
        written, where prefix is the text before the first "." of bc_signal_file (or of norm_signal_file
        when bc_signal_file is not given).

        per_norm and per_slope are accepted but not read; normalization uses the 98th percentile.

        Return:
        None -- Output is written to the given files.
        """
        if raw_signal_file:
            raw_signal = self._clipped_pileup(ref, start, end, downstream_ext, upstream_ext,
                                              forward_shift, reverse_shift, initial_clip)
            self._append_wig(raw_signal_file, ref, start, raw_signal)

        if not (bc_signal_file or norm_signal_file):
            return

        # Parameters
        window = 50
        default_kmer_value = 1.0

        # Initialization
        forward_table = bias_table[0]
        reverse_table = bias_table[1]
        k_nb = len(next(iter(forward_table)))
        half_k = int(k_nb / 2.)
        p1_w = start - (window // 2)
        p2_w = end + (window // 2)
        p1_wk = p1_w - half_k
        p2_wk = p2_w + half_k

        fasta = Fastafile(genome_file_name)
        try:
            forward_seq, reverse_seq = self._fetch_strand_sequences(fasta, ref, p1_wk, p2_wk)
        finally:
            fasta.close()

        # Bias signal, raw counts (unmapped reads skipped) and bias-expected cleavage
        signal_bias_f, signal_bias_r = self._kmer_bias(forward_seq, reverse_seq, forward_table, reverse_table,
                                                       half_k, half_k, default_kmer_value)
        signal_raw_f, signal_raw_r = self._count_cut_sites(self.bam, ref, p1_w, p2_w,
                                                           forward_shift, reverse_shift)
        signal_bc_f, signal_bc_r = self._expected_cleavage(signal_raw_f, signal_raw_r,
                                                           signal_bias_f, signal_bias_r, window)
        signal_bc = [f + r for f, r in zip(signal_bc_f, signal_bc_r)]

        if bc_signal_file:
            self._append_wig(bc_signal_file, ref, start, signal_bc)
            if strand_specific:
                prefix = bc_signal_file.split(".")[0]
                self._append_wig(prefix + "_Forward" + ".bc.wig", ref, start, signal_bc_f)
                self._append_wig(prefix + "_Reverse" + ".bc.wig", ref, start, signal_bc_r)

        if norm_signal_file:
            self._append_wig(norm_signal_file, ref, start, self._boyle_hon_atac(signal_bc, 98))
            if strand_specific:
                # Keep the historical bc-based prefix when available; fall back to the norm file name
                # instead of failing on None when only norm_signal_file was requested.
                prefix = (bc_signal_file or norm_signal_file).split(".")[0]
                signal_norm_f = self._boyle_hon_atac(signal_bc_f, 98)
                signal_norm_r = self._boyle_hon_atac(signal_bc_r, 98)
                self._append_wig(prefix + "_Forward" + ".norm.wig", ref, start, signal_norm_f)
                self._append_wig(prefix + "_Reverse" + ".norm.wig", ref, start, signal_norm_r)

    def get_raw_signal_by_fragment_length(self, ref, start, end, bam, forward_shift, reverse_shift,
                                          min_length=None, max_length=None, strand=True):
        """
        Counts cut sites of `bam` inside [start, end), optionally filtered by abs(template_length).

        Length filter: only max_length -> length <= max_length; only min_length -> length > min_length;
        both -> min_length <= length <= max_length.

        Return:
        (forward, reverse) numpy arrays when strand is True, otherwise their element-wise sum.
        """
        # NOTE(review): the lower bound is exclusive with only min_length but inclusive with both limits.
        if min_length is None and max_length is None:
            keep_length = None
        elif min_length is None:
            keep_length = lambda length: length <= max_length
        elif max_length is None:
            keep_length = lambda length: length > min_length
        else:
            keep_length = lambda length: min_length <= length <= max_length

        raw_f, raw_r = self._count_cut_sites(bam, ref, start, end, forward_shift, reverse_shift, keep_length)

        if strand:
            return np.array(raw_f), np.array(raw_r)
        else:
            return np.add(np.array(raw_f), np.array(raw_r))

    def get_bc_signal_by_fragment_length(self, ref, start, end, bam, fasta, bias_table, forward_shift,
                                         reverse_shift, min_length=None, max_length=None, strand=True):
        """
        Computes the bias-expected cleavage of `bam` inside [start, end), optionally filtered by
        abs(template_length).

        Length filter: only max_length -> length <= max_length; only min_length -> length > min_length;
        both -> min_length < length <= max_length.

        Return:
        (forward, reverse) numpy arrays when strand is True, otherwise their element-wise sum.
        When the extended window would reach position 0 or below, a plain list with the strand-merged,
        unfiltered raw counts of [start, end) is returned instead.
        """
        # Parameters
        window = 50
        default_kmer_value = 1.0

        # Initialization
        forward_table = bias_table[0]
        reverse_table = bias_table[1]
        k_nb = len(next(iter(forward_table)))
        half_k = int(k_nb / 2.)
        p1_w = start - (window // 2)
        p2_w = end + (window // 2)
        p1_wk = p1_w - half_k
        p2_wk = p2_w + half_k

        if start <= 0 or p1_w <= 0 or p1_wk <= 0:
            # Return raw counts, read from the same `bam` handle the rest of this method uses.
            # NOTE(review): this fallback ignores `strand` and the length limits; callers unpacking two
            # values should be checked.
            fallback_f, fallback_r = self._count_cut_sites(bam, ref, start, end, forward_shift, reverse_shift)
            return [f + r for f, r in zip(fallback_f, fallback_r)]

        forward_seq, reverse_seq = self._fetch_strand_sequences(fasta, ref, p1_wk, p2_wk)

        # Iterating on sequence to create the bias signal
        signal_bias_f, signal_bias_r = self._kmer_bias(forward_seq, reverse_seq, forward_table, reverse_table,
                                                       half_k, half_k, default_kmer_value)

        # Raw counts
        if min_length is None and max_length is None:
            keep_length = None
        elif min_length is None:
            keep_length = lambda length: length <= max_length
        elif max_length is None:
            keep_length = lambda length: length > min_length
        else:
            keep_length = lambda length: min_length < length <= max_length
        raw_f, raw_r = self._count_cut_sites(bam, ref, p1_w, p2_w, forward_shift, reverse_shift, keep_length)

        # Smoothed counts and bias-expected cleavage
        bc_f, bc_r = self._expected_cleavage(raw_f, raw_r, signal_bias_f, signal_bias_r, window)

        if strand:
            return np.array(bc_f), np.array(bc_r)
        else:
            return np.add(np.array(bc_f), np.array(bc_r))

    def get_bias_raw_bc_signal(self, ref, start, end, bam, fasta, bias_table, forward_shift, reverse_shift,
                               strand=False):
        """
        Returns the bias, raw and bias-expected signals of `bam` inside [start, end).

        The bias-expected signal is computed with k-mers fetched at shift-adjusted coordinates, while the
        returned bias signal is recomputed from the unshifted window.

        Return:
        (bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r) when strand is True, otherwise
        (bias_f, bias_r, raw, bc). When the extended window would reach position 0 or below, a plain list
        with the strand-merged raw counts of [start, end) is returned instead.
        """
        # Parameters
        window = 50
        default_kmer_value = 1.0
        half = window // 2

        # Initialization
        forward_table = bias_table[0]
        reverse_table = bias_table[1]
        k_nb = len(next(iter(forward_table)))
        half_k = int(k_nb / 2.)
        p1_w = start - half
        p2_w = end + half
        p1_wk = p1_w - half_k
        p2_wk = p2_w + half_k

        if start <= 0 or p1_w <= 0 or p1_wk <= 0:
            # Return raw counts, read from the same `bam` handle the rest of this method uses.
            # NOTE(review): this fallback does not follow the tuple shape of the normal return.
            fallback_f, fallback_r = self._count_cut_sites(bam, ref, start, end, forward_shift, reverse_shift)
            return [f + r for f, r in zip(fallback_f, fallback_r)]

        # Shift-adjusted sequences used for the correction itself
        shifted_seq = str(fasta.fetch(ref, p1_wk - 1 + forward_shift, p2_wk - 2 + forward_shift)).upper()
        shifted_rev = AuxiliaryFunctions.revcomp(str(fasta.fetch(ref, p1_wk + reverse_shift + 2,
                                                                 p2_wk + reverse_shift + 1)).upper())
        shifted_bias_f, shifted_bias_r = self._kmer_bias(shifted_seq, shifted_rev, forward_table,
                                                         reverse_table, half_k, half_k, default_kmer_value)

        # Raw counts
        signal_raw_f, signal_raw_r = self._count_cut_sites(bam, ref, p1_w, p2_w, forward_shift, reverse_shift)

        # Smoothed counts and bias-expected cleavage
        bc_f, bc_r = self._expected_cleavage(signal_raw_f, signal_raw_r, shifted_bias_f, shifted_bias_r, window)
        bc = [f + r for f, r in zip(bc_f, bc_r)]
        raw_f = [signal_raw_f[k + half] for k in range(len(bc_f))]
        raw_r = [signal_raw_r[k + half] for k in range(len(bc_f))]
        raw = [f + r for f, r in zip(raw_f, raw_r)]

        # Bias signal reported to the caller: recomputed from the unshifted window
        forward_seq, reverse_seq = self._fetch_strand_sequences(fasta, ref, p1_wk, p2_wk)
        signal_bias_f, signal_bias_r = self._kmer_bias(forward_seq, reverse_seq, forward_table, reverse_table,
                                                       half_k, half_k, default_kmer_value)
        bias_f = [signal_bias_f[i] for i in range(half, len(signal_bias_f) - half)]
        bias_r = [signal_bias_r[i] for i in range(half, len(signal_bias_f) - half)]

        if strand:
            return bias_f, bias_r, raw, raw_f, raw_r, bc, bc_f, bc_r
        else:
            return bias_f, bias_r, raw, bc
# Input
# Temporary score tables and the output PDF paths used by this script section.
inputStag1FileName = tempLocation + "inputStag1FileName.txt"
inputStag2FileName = tempLocation + "inputStag2FileName.txt"
outputRegionFileName = ol + "region_fold_change.pdf"
outputSignalFileName = ol + "signal_fold_change.pdf"

# Iterating on STAG1
# Both handles are managed by `with` so they are closed even when a malformed
# line or a failing fetch raises half-way through the table.
# NOTE(review): the "rU" mode is kept unchanged; it was removed in Python 3.11,
# switch to "r" if this script is run on Python 3.
with open(stag1FileName, "rU") as stag1File, open(inputStag1FileName, "w") as inputStag1File:
    stag1File.readline()  # skip the header line
    for line in stag1File:
        ll = line.strip().split("\t")
        sp = ll[7].split(":")
        if sp[0] == "ENHANCER":
            # Drop enhancers that overlap a region whose name starts with "SUPERENHANCER".
            regionsFetch = regionsFile.fetch(ll[0], int(ll[5]), int(ll[6]))
            seFlag = any(read.qname.split(":")[0] == "SUPERENHANCER" for read in regionsFetch)
            if seFlag:
                continue
        if sp[2] == ".":
            name = sp[0]
        else:
            name = sp[2] + "_" + sp[0]
        score = ll[3]
        ctcfScore = ll[4]  # read but not written in this section
        inputStag1File.write("\t".join([name, score]) + "\n")

# Iterating on STAG2
start = ll[4] end = ll[5] tss1 = str(max(int(ll[5]) - 0, 0)) tss2 = str(int(ll[5]) + ext) tes1 = str(max(int(ll[4]) - ext, 0)) tes2 = str(int(ll[4]) + 0) p1 = start p2 = tss2 try: geneSymbol = aliasDict[ensg] except Exception: geneSymbol = ensg # Active status activeStatus = "INACTIVE" h3k4me3Fetch = h3k4me3File.fetch(chrom, int(p1), int(p2)) mSum = sum(1 for _ in h3k4me3Fetch) h3k27acFetch = h3k27acFile.fetch(chrom, int(p1), int(p2)) aSum = sum(1 for _ in h3k27acFetch) if (mSum > 0 or aSum > 0): activeStatus = "ACTIVE" geneDict[ensg] = [ chrom, start, end, "GENE:" + geneSymbol + ":" + activeStatus, str(int(float(score))), strand ] tssDict[ensg] = [ chrom, tss1, tss2, "PROMOTER:" + geneSymbol + ":" + activeStatus, str(int(float(score))), strand ] ttsDict[ensg] = [ chrom, tes1, tes2, "TTS:" + geneSymbol + ":" + activeStatus,
def _gem_3c_group_rows(read1, read2, crm_dict, frags, frag_chunk):
    """
    Build the output rows for one read group (a read1 plus its collected mates).

    :param read1: first-in-pair alignment that names the group
    :param read2: list of alignments collected after read1; entries whose
       query name differs from read1's are ignored
    :param crm_dict: dictionary mapping reference ids (tid) to chromosome names
    :param frags: restriction-site positions, indexed as
       frags[chromosome][position // frag_chunk]
    :param frag_chunk: size of the position chunks used to index *frags*

    :returns: a list of rows, each one being the read name followed by the seven
       fields (tid, chromosome, position, strand, length, previous site, next
       site) of both ends of one pairwise combination of the group
    """
    reads_grp = []
    read_id = read1.query_name
    for read in [read1] + read2:
        if read.query_name != read_id:
            continue
        positive = not read.is_reverse
        crm = crm_dict[read.tid]
        len_seq = read.reference_end - read.pos
        if positive:
            pos = read.pos + 1
        else:
            pos = read.pos + len_seq
        try:
            frag_piece = frags[crm][pos // frag_chunk]
        except KeyError:
            # Chromosome not in hash: stop here and keep what was collected so far
            break
        idx = bisect(frag_piece, pos)
        try:
            next_re = frag_piece[idx]
        except IndexError:
            # case where part of the read is mapped outside chromosome:
            # walk the position back until a following site exists
            count = 0
            while idx >= len(frag_piece) and count < len_seq:
                pos -= 1
                count += 1
                frag_piece = frags[crm][pos // frag_chunk]
                idx = bisect(frag_piece, pos)
            if count >= len_seq:
                raise Exception('Read mapped mostly outside ' +
                                'chromosome\n')
            next_re = frag_piece[idx]
        prev_re = frag_piece[idx - 1 if idx else 0]
        reads_grp.append([read.tid, crm, pos, positive, len_seq, prev_re, next_re])

    if len(reads_grp) > 2:
        _merge_multis(reads_grp)
    elif len(reads_grp) < 2:
        # a single mapped end cannot form a pair
        reads_grp = []

    reads_multi = []
    for paired_reads in combinations(reads_grp, 2):
        # within a pair, the end with the lowest (tid, position) goes first
        read_multi = [item
                      for sublist in sorted(paired_reads, key=lambda x: (x[0], x[2]))
                      for item in sublist]
        if read_multi:
            reads_multi.append(read_multi)

    paired_total = len(reads_multi)
    rows = []
    for paired_nbr, pair_read in enumerate(reads_multi, 1):
        read_name_id = read_id
        if paired_total > 1:
            # multi-contacts get a "#k/total" suffix
            read_name_id += '#%d/%d' % (paired_nbr, paired_total)
        rows.append([read_name_id] + pair_read)
    return rows


def _flush_gem_3c_rows(reads, out_file, tmp_files, nfile):
    """
    Sort the buffered rows, hand them to write_paired_reads_to_file and empty
    the buffer in place.

    :param reads: list of rows as produced by _gem_3c_group_rows
    :param out_file: output path, forwarded to write_paired_reads_to_file
    :param tmp_files: list of temporary files, forwarded to
       write_paired_reads_to_file
    :param nfile: index of the temporary file being written
    """
    # sort on (tid1, pos1, tid2, pos2)
    reads.sort(key=lambda x: (x[1], x[3], x[8], x[10]))
    read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n' % tuple(read)
                  for read in reads]
    write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)
    del reads[:]


def parse_gem_3c(f_name, out_file, genome_lengths, frags, verbose=False,
                 tmp_format=False, **kwargs):
    """
    Parse gem 3c sam file using pysam tools.

    Reads are expected grouped by name, read1 first and then its mates
    (read2 and supplementary alignments). Unpaired, unmapped and mapq < 4
    alignments are ignored.

    :param f_name: path to sam file corresponding to the mapping of reads
    :param out_file: path to outfile tab separated format containing paired
       read information
    :param genome_lengths: a dictionary generated containing the length of the
       genomic sequence per chromosome
    :param frags: restriction-site positions, indexed as
       frags[chromosome][position // frag_chunk]
    :param False verbose: if True, report the progress of the merge sort on
       stdout
    :param False tmp_format: If True leave the file prepared to be merged with
       other map files.
    :param 100000 frag_chunk: (keyword only) chunk size used to index *frags*

    :returns: *out_file*, or None when the SAM file contains no alignment
    """
    frag_chunk = kwargs.get('frag_chunk', 100000)
    try:
        fhandler = Samfile(f_name)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % f_name)

    # max number of reads in buffer
    max_size = 1000000

    # getrname chromosome names
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
        except ValueError:
            break
        i += 1

    # iteration over reads
    sub_count = 0
    nfile = 0
    tmp_files = []
    reads = []
    cur_name = ''
    read1 = None
    read2 = []
    try:
        samiter = fhandler.fetch(until_eof=True)
        pending = next(samiter, None)
        if pending is None:
            # empty SAM file
            return None
        while True:
            r = pending
            if r is not None:
                pending = next(samiter, None)
                if not r.is_paired or r.is_unmapped or r.mapq < 4:
                    continue
                if not (r.is_read1 and cur_name != r.qname):
                    # mate (or extra alignment) of the current group
                    if r.is_read2 or r.is_supplementary:
                        read2.append(r)
                    continue
                if read1 is None:
                    # first usable read1 of the file
                    read1 = r
                    cur_name = r.qname
                    continue
            # Group boundary: either a new read1 arrived or the input is
            # exhausted (r is None). Emitting here also covers the last group
            # of the file, and a read1 without mates no longer swallows the
            # read1 that follows it.
            if read1 is not None and read2:
                rows = _gem_3c_group_rows(read1, read2, crm_dict, frags, frag_chunk)
                reads.extend(rows)
                sub_count += len(rows)
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    _flush_gem_3c_rows(reads, out_file, tmp_files, nfile)
            if r is None:
                break
            read1 = r
            cur_name = r.qname
            del read2[:]
    finally:
        fhandler.close()

    if reads:
        nfile += 1
        _flush_gem_3c_rows(reads, out_file, tmp_files, nfile)

    # we have now sorted temporary files
    # we do merge sort for each pair
    if verbose:
        stdout.write('Merge sort')
        stdout.flush()
    while len(tmp_files) > 1:
        file1 = tmp_files.pop(0)
        file2 = tmp_files.pop(0)
        if verbose:
            stdout.write('.')
            stdout.flush()
        nfile += 1
        tmp_files.append(merge_sort(file1, file2, out_file, nfile, paired=True))
    if verbose:
        stdout.write('\n')

    # NOTE(review): when no pair at all was produced tmp_files is empty and
    # tmp_files[0] raises IndexError (unchanged from before) -- confirm what
    # callers expect in that case.
    if tmp_format:
        os.rename(tmp_files[0], out_file)
    else:
        # text mode: the lines are split on a str tab below
        with open(out_file, 'w') as map_out, open(tmp_files[0], 'r') as tmp_reads_fh:
            for crm in genome_lengths:
                map_out.write('# CRM %s\t%d\n' % (crm, genome_lengths[crm]))
            for read_line in tmp_reads_fh:
                read = read_line.split('\t')
                # drop the two numeric reference-id columns used only for sorting
                map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        os.remove(tmp_files[0])
    return out_file
# Iterating on AB locations for tline in abTreatFile: # Initialization cline = abControlFile.readline() tt = tline.strip().split("\t") cc = cline.strip().split("\t") chrom = tt[0] p1 = int(tt[1]) p2 = int(tt[2]) tcomp = tt[3] ccomp = cc[3] # Fetching TADs tfetch = treatTadFile.fetch(chrom, p1, p2) ttadList = [] for read in tfetch: ttadList.append([read.reference_start, read.reference_end]) cfetch = treatTadFile.fetch(chrom, p1, p2) ctadList = [] for read in cfetch: ctadList.append([read.reference_start, read.reference_end]) # Iterating on TADs treatment treatIntraCount = 0.0 treatInterCount = 0.0 flagTtad = True try: prevTad = ttadList[0] prevSum, prevAvg = summ_avg_tad_interaction(
class GenomicSignal:
    """
    Represents a genomic signal. It should be used to fetch normalized and slope signals from a bam file.
    Usage:

    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.
    """

    def __init__(self, file_name):
        """
        Initializes GenomicSignal.

        Keyword arguments:
        file_name -- Path of the bam file; it is opened immediately with Samfile.
        """
        self.file_name = file_name
        self.sg_coefs = None
        self.bam = Samfile(file_name, "rb")

    def load_sg_coefs(self, slope_window_size):
        """
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def _clipped_pileup(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                        initial_clip):
        """
        Piles up the reads of self.bam on [start, end) and clips the result.

        Return:
        raw_signal -- numpy array of the pileup, each value capped at initial_clip.
        clip_signal -- List with raw_signal further capped at mean + 10 * std.
        """
        pileup_region = PileupRegion(start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift)
        if ps_version == "0.7.5":
            # this pysam version takes the pileup object as a fetch callback
            self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
        else:
            for alignment in self.bam.fetch(reference=ref, start=start, end=end):
                pileup_region(alignment)
        raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        mean = raw_signal.mean()
        std = raw_signal.std()
        clip_signal = [min(e, mean + (10 * std)) for e in raw_signal]
        return raw_signal, clip_signal

    def _append_fixed_step(self, file_name, ref, start, values):
        """
        Appends values to file_name as one wiggle fixedStep block (step=1, 1-based start).
        """
        with open(file_name, "a") as signal_file:
            signal_file.write("fixedStep chrom=" + ref + " start=" + str(start + 1) + " step=1\n" +
                              "\n".join([str(e) for e in nan_to_num(values)]) + "\n")

    def get_tag_count(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                      initial_clip=1000):
        """
        Gets the tag count associated with self.bam based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.

        Return:
        tag_count -- Total signal.
        """
        _, clip_signal = self._clipped_pileup(ref, start, end, downstream_ext, upstream_ext,
                                              forward_shift, reverse_shift, initial_clip)

        # Tag count (best effort, as before: any failure while summing yields 0)
        try:
            tag_count = sum(clip_signal)
        except Exception:
            tag_count = 0

        return tag_count

    def get_signal(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                   initial_clip=1000, per_norm=98, per_slope=98,
                   bias_table=None, genome_file_name=None, print_raw_signal=False,
                   print_bc_signal=False, print_norm_signal=False, print_slope_signal=False,
                   strands_specific=False):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction.
        genome_file_name -- Genome to perform bias correction.
        downstream_ext -- Number of bps to extend towards the downstream region
        (right for forward strand and left for reverse strand).
        upstream_ext -- Number of bps to extend towards the upstream region
        (left for forward strand and right for reverse strand).
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        Can be a positive number for a shift towards the downstream region
        (towards the inside of the aligned read) and a negative number for a shift towards the upstream region.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        Can be a positive number for a shift towards the upstream region and a negative number
        for a shift towards the downstream region (towards the inside of the aligned read).
        print_raw_signal, print_bc_signal, print_norm_signal, print_slope_signal -- When set, the
        path of a file to which the corresponding signal is appended in wiggle fixedStep format.
        strands_specific -- Forwarded unchanged to bias_correction.

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """
        # Fetch raw signal and apply the std-based clipping
        raw_signal, clip_signal = self._clipped_pileup(ref, start, end, downstream_ext, upstream_ext,
                                                       forward_shift, reverse_shift, initial_clip)

        # Cleavage bias correction
        bias_corrected_signal = self.bias_correction(clip_signal, bias_table, genome_file_name,
                                                     ref, start, end, forward_shift, reverse_shift,
                                                     strands_specific)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bias_corrected_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        std = boyle_signal.std()
        hon_signal = self.hon_norm(boyle_signal, perc, std)

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        std = abs_seq.std()
        slopehon_signal = self.hon_norm(slope_signal, perc, std)

        # Writing signal
        if print_raw_signal:
            self._append_fixed_step(print_raw_signal, ref, start, raw_signal)
        if print_bc_signal:
            self._append_fixed_step(print_bc_signal, ref, start, bias_corrected_signal)
        if print_norm_signal:
            self._append_fixed_step(print_norm_signal, ref, start, hon_signal)
        if print_slope_signal:
            self._append_fixed_step(print_slope_signal, ref, start, slope_signal)

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                        forward_shift, reverse_shift, strands_specific):
        """
        Performs bias correction.

        Keyword arguments:
        signal -- Input signal. It is returned untouched when no correction is performed.
        bias_table -- Bias table: pair (forward k-mer dictionary, reverse k-mer dictionary).
        genome_file_name -- Fasta file from which the k-mers are read.
        chrName -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        forward_shift -- Number of bps to shift the reads aligned to the forward strand.
        reverse_shift -- Number of bps to shift the reads aligned to the reverse strand.
        strands_specific -- Selects the return value (see below).

        Return:
        signal -- The input itself, when bias_table is empty or when the extended window
        would start at or before coordinate 0.
        bias_corrected_signal -- Bias-corrected sequence (both strands summed), when
        strands_specific is false.
        (forward, reverse) -- Per-strand corrected signals, each shifted so that its
        minimum is not negative, when strands_specific is true.
        """
        if not bias_table:
            return signal

        # Parameters
        window = 50
        half_window = window // 2
        defaultKmerValue = 1.0

        # Initialization
        fBiasDict = bias_table[0]
        rBiasDict = bias_table[1]
        k_nb = len(next(iter(fBiasDict)))
        k_floor = int(floor(k_nb / 2.))
        k_ceil = int(ceil(k_nb / 2.))
        p1 = start
        p2 = end
        p1_w = p1 - half_window
        p2_w = p2 + half_window
        p1_wk = p1_w - k_floor
        p2_wk = p2_w + k_ceil
        if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
            # the fasta file is opened only further below, so nothing is left open here
            return signal

        # Raw counts
        nf = [0.0] * (p2_w - p1_w)
        nr = [0.0] * (p2_w - p1_w)
        for read in self.bam.fetch(chrName, p1_w, p2_w):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if start <= cut_site < end:
                    nf[cut_site - p1_w] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if start <= cut_site < end:
                    nr[cut_site - p1_w] += 1.0

        # Smoothed counts: running sum of `window` consecutive raw counts
        Nf = []
        Nr = []
        fSum = sum(nf[:window])
        rSum = sum(nr[:window])
        fLast = nf[0]
        rLast = nr[0]
        for i in range(half_window, len(nf) - half_window):
            Nf.append(fSum)
            Nr.append(rSum)
            fSum -= fLast
            fSum += nf[i + half_window]
            fLast = nf[i - half_window + 1]
            rSum -= rLast
            rSum += nr[i + half_window]
            rLast = nr[i - half_window + 1]

        # Fetching sequence
        fastaFile = Fastafile(genome_file_name)
        try:
            currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 2,
                                                                         p2_wk + 1)).upper())
        finally:
            fastaFile.close()

        # Iterating on sequence to create signal
        af = []
        ar = []
        for i in range(k_ceil, len(currStr) - k_floor + 1):
            fseq = currStr[i - k_floor:i + k_ceil]
            rseq = currRevComp[len(currStr) - k_ceil - i:len(currStr) + k_floor - i]
            # k-mers missing from the table get the neutral default value
            try:
                af.append(fBiasDict[fseq])
            except KeyError:
                af.append(defaultKmerValue)
            try:
                ar.append(rBiasDict[rseq])
            except KeyError:
                ar.append(defaultKmerValue)

        # Calculating bias-corrected signal
        fSum = sum(af[:window])
        rSum = sum(ar[:window])
        fLast = af[0]
        rLast = ar[0]
        bias_corrected_signal = []
        bias_corrected_signal_forward = []
        bias_corrected_signal_reverse = []
        for i in range(half_window, len(af) - half_window):
            # expected counts: smoothed counts weighted by the local k-mer bias
            nhatf = Nf[i - half_window] * (af[i] / fSum)
            nhatr = Nr[i - half_window] * (ar[i] / rSum)
            zf = log(nf[i] + 1) - log(nhatf + 1)
            zr = log(nr[i] + 1) - log(nhatr + 1)
            bias_corrected_signal_forward.append(zf)
            bias_corrected_signal_reverse.append(zr)
            bias_corrected_signal.append(zf + zr)
            fSum -= fLast
            fSum += af[i + half_window]
            fLast = af[i - half_window + 1]
            rSum -= rLast
            rSum += ar[i + half_window]
            rLast = ar[i - half_window + 1]

        if not strands_specific:
            return bias_corrected_signal

        # Fixing the negative number in bias corrected signal
        min_value = abs(min(bias_corrected_signal_forward))
        bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]
        min_value = abs(min(bias_corrected_signal_reverse))
        bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]
        return bias_fixed_signal_forward, bias_fixed_signal_reverse

    def hon_norm(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Zeros stay zero, positive values go through a logistic function centered at
        mean and scaled by std, and negative values get the mirrored (negated) result.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence.
        """
        norm_seq = []
        for e in sequence:
            if e == 0.0:
                norm_seq.append(0.0)
            elif e > 0.0:
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            else:
                norm_seq.append(-1.0 / (1.0 + (exp(-(-e - mean) / std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Sequence divided by the mean of its positive values; the input
        itself when it has no positive value.
        """
        mean = array([e for e in sequence if e > 0]).mean()
        if isnan(mean):
            return sequence
        return [(float(e) / mean) for e in sequence]

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """
        window_size = abs(int(window_size))
        order = abs(int(order))
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients (plain ndarray; numpy's matrix class is not needed)
        b = array([[k ** i for i in order_range] for k in range(-half_window, half_window + 1)])
        m = linalg.pinv(b)[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence.
        """
        slope_seq = convolve(sequence, sg_coefs)
        # trim the borders added by the full convolution
        half = len(sg_coefs) // 2
        return list(slope_seq[half:(len(slope_seq) - half)])

    def get_signal_per_strand(self, ref, start, end, downstream_ext, upstream_ext, forward_shift, reverse_shift,
                              initial_clip=1000, per_norm=98, per_slope=98,
                              bias_table=None, genome_file_name=None, print_raw_signal=False,
                              print_bc_signal=False, print_norm_signal=False, print_slope_signal=False,
                              strands_specific=True):
        """
        Gets the normalized and slope signals of each strand separately.

        :param ref: Chromosome name.
        :param start: Initial genomic coordinate of signal.
        :param end: Final genomic coordinate of signal.
        :param downstream_ext: Number of bps to extend towards the downstream region (not used here).
        :param upstream_ext: Number of bps to extend towards the upstream region (not used here).
        :param forward_shift: Number of bps to shift the reads aligned to the forward strand.
        :param reverse_shift: Number of bps to shift the reads aligned to the reverse strand.
        :param initial_clip: Signal will be initially clipped at this level to avoid outliers.
        :param per_norm: Percentile value for 'hon_norm' function of the normalized signal.
        :param per_slope: Percentile value for 'hon_norm' function of the slope signal (not used here).
        :param bias_table: Bias table to perform bias correction.
        :param genome_file_name: Genome to perform bias correction.
        :param print_raw_signal: not used here.
        :param print_bc_signal: not used here.
        :param print_norm_signal: not used here.
        :param print_slope_signal: not used here.
        :param strands_specific: Forwarded to bias_correction.
        :return: normalized and slope signal for each strand.
        """
        raw_signal_forward = [0.0] * (end - start)
        raw_signal_reverse = [0.0] * (end - start)

        for read in self.bam.fetch(reference=ref, start=start, end=end):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if start <= cut_site < end:
                    raw_signal_forward[cut_site - start] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if start <= cut_site < end:
                    raw_signal_reverse[cut_site - start] += 1.0

        raw_signal_forward = array([min(e, initial_clip) for e in raw_signal_forward])
        raw_signal_reverse = array([min(e, initial_clip) for e in raw_signal_reverse])

        # Std-based clipping
        mean = raw_signal_forward.mean()
        std = raw_signal_forward.std()
        clip_signal_forward = [min(e, mean + (10 * std)) for e in raw_signal_forward]
        mean = raw_signal_reverse.mean()
        std = raw_signal_reverse.std()
        clip_signal_reverse = [min(e, mean + (10 * std)) for e in raw_signal_reverse]

        # Cleavage bias correction
        bc_signal_forward = clip_signal_forward
        bc_signal_reverse = clip_signal_reverse
        if bias_table:
            corrected = self.bias_correction(raw_signal_forward, bias_table, genome_file_name,
                                             ref, start, end, forward_shift, reverse_shift,
                                             strands_specific)
            # bias_correction hands back its input when the region is too close to the
            # chromosome start; keep the clipped signals instead of unpacking it
            if corrected is not raw_signal_forward:
                bc_signal_forward, bc_signal_reverse = corrected

        # Boyle normalization (within-dataset normalization)
        boyle_signal_forward = array(self.boyle_norm(bc_signal_forward))
        boyle_signal_reverse = array(self.boyle_norm(bc_signal_reverse))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal_forward, per_norm)
        std = boyle_signal_forward.std()
        hon_signal_forward = self.hon_norm(boyle_signal_forward, perc, std)

        perc = scoreatpercentile(boyle_signal_reverse, per_norm)
        std = boyle_signal_reverse.std()
        hon_signal_reverse = self.hon_norm(boyle_signal_reverse, perc, std)

        # Slope signal
        slope_signal_forward = self.slope(hon_signal_forward, self.sg_coefs)
        slope_signal_reverse = self.slope(hon_signal_reverse, self.sg_coefs)

        # Returning normalized and slope sequences
        return hon_signal_forward, slope_signal_forward, hon_signal_reverse, slope_signal_reverse
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above doubles as the optparse usage text, so the
    # implementation notes live in comments instead.
    #
    # args is the command-line argument list (options, bamfile, fastafile).
    # Output files are named after the part of bamfile before its first ".":
    # <prefix>.ace always, <prefix>.astat with --astat and <prefix>.reads with
    # --readids. Nothing is returned.
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
                 help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
                 help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
                 help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
                 help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
                 help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
                 help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    # Keep the two flags under names that are never rebound: the per-contig
    # A-statistic used to overwrite the --astat flag inside the loop.
    want_astat = opts.astat
    want_readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    def emit(handle, text=""):
        # one output line; behaves the same on Python 2 and Python 3
        handle.write("{0}\n".format(text))

    fw = open(acefile, "w")
    astatfw = open(astatfile, "w") if want_astat else None
    readsfw = open(readsfile, "w") if want_readids else None
    try:
        emit(fw, "AS {0} {1}".format(ncontigs, totalreads))
        emit(fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            emit(fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments))
            emit(fw, fill(str(cseq.seq)))
            emit(fw)

            if want_astat:
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                emit(astatfw, "{0}\t{1:.1f}".format(contig, astat_value))

            text = fill([qual] * nbases, delimiter=" ", width=30)
            emit(fw, "BQ\n{0}".format(text))
            emit(fw)

            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if want_readids:
                    emit(readsfw, readname)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                emit(fw, "AF {0} {1} {2}".format(rname, strand, paddedstart))

            emit(fw)

            for a, rname in zip(mapped_reads, rnames):
                aseq, npadded = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags, fill(aseq))
                qs = "QA 1 {0} 1 {0}".format(alen)

                emit(fw, rd)
                emit(fw)
                emit(fw, qs)
                emit(fw)
    finally:
        # flush and release every output file, and the bam, even on failure
        for handle in (fw, astatfw, readsfw):
            if handle is not None:
                handle.close()
        s.close()
def ace(args):
    """
    %prog ace bamfile fastafile

    convert bam format to ace format. This often allows the remapping to be
    assessed as a denovo assembly format. bam file needs to be indexed. also
    creates a .mates file to be used in amos/bambus, and .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    assembler.
    """
    # NOTE: the docstring above doubles as the optparse usage text, so the
    # implementation notes live in comments instead.
    #
    # args is the command-line argument list (options, bamfile, fastafile).
    # Output files are named after the part of bamfile before its first ".":
    # <prefix>.ace always, <prefix>.astat with --astat and <prefix>.reads with
    # --readids. Nothing is returned.
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    from contextlib import ExitStack

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    # These flags are never rebound below: the per-contig A-statistic used to
    # overwrite the --astat flag inside the contig loop.
    write_astat = opts.astat
    write_readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))

    # The stack closes the bam and every output file, also on failure.
    with ExitStack() as stack:
        s = Samfile(bamfile, "rb")
        stack.callback(s.close)

        ncontigs = s.nreferences
        genomesize = sum(x for a, x in f.itersizes())
        logging.debug(
            "Total {0} contigs with size {1} base".format(ncontigs, genomesize)
        )
        qual = "20"  # default qual

        totalreads = sum(s.count(x) for x in s.references)
        logging.debug("Total {0} reads mapped".format(totalreads))

        fw = stack.enter_context(open(acefile, "w"))
        astatfw = stack.enter_context(open(astatfile, "w")) if write_astat else None
        readsfw = stack.enter_context(open(readsfile, "w")) if write_readids else None

        print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
        print(file=fw)

        for contig in s.references:
            cseq = f[contig]
            nbases = len(cseq)

            mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
            nreads = len(mapped_reads)

            nsegments = 0
            print(
                "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
                file=fw,
            )
            print(fill(str(cseq.seq)), file=fw)
            print(file=fw)

            if write_astat:
                astat_value = Astat(nbases, nreads, genomesize, totalreads)
                print("{0}\t{1:.1f}".format(contig, astat_value), file=astatfw)

            text = fill([qual] * nbases, delimiter=" ", width=30)
            print("BQ\n{0}".format(text), file=fw)
            print(file=fw)

            rnames = []
            for a in mapped_reads:
                readname = a.qname
                rname = readname

                if write_readids:
                    print(readname, file=readsfw)
                rnames.append(rname)

                strand = "C" if a.is_reverse else "U"
                paddedstart = a.pos + 1  # 0-based to 1-based
                af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
                print(af, file=fw)

            print(file=fw)

            for a, rname in zip(mapped_reads, rnames):
                aseq, _ = cigar_to_seq(a)
                if aseq is None:
                    continue

                ninfos = 0
                ntags = 0
                alen = len(aseq)
                rd = "RD {0} {1} {2} {3}\n{4}".format(
                    rname, alen, ninfos, ntags, fill(aseq)
                )
                qs = "QA 1 {0} 1 {0}".format(alen)

                print(rd, file=fw)
                print(file=fw)
                print(qs, file=fw)
                print(file=fw)
class GenomicSignal:
    """
    Represents a genomic signal backed by a BAM or bigWig file.

    It is used to fetch normalized and slope signals for a genomic interval.

    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.

    Methods:

    load_sg_coefs(self, slope_window_size):
    Loads Savitzky-Golay coefficients into self.sg_coefs.

    get_tag_count(self, ref, start, end, ext, ...):
    Returns the summed, clipped raw signal of an interval.

    get_signal(self, ref, start, end, ext, ...):
    Returns the normalized signal and the normalized slope of an interval.

    bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end):
    Corrects the signal using k-mer bias tables (no-op without a table).

    hon_norm(self, sequence, mean, std):
    Between-dataset normalization (logistic squashing around mean/std).

    boyle_norm(self, sequence):
    Within-dataset normalization (division by the mean of positive values).

    savitzky_golay_coefficients(self, window_size, order, deriv):
    Evaluates the Savitzky-Golay coefficients used to compute the slope.

    slope(self, sequence, sg_coefs):
    Evaluates the slope of sequence given the sg_coefs loaded.
    """

    def __init__(self, file_name):
        """
        Initializes GenomicSignal.

        The backend is chosen from the file extension (case-insensitive):
        "bam" opens a Samfile, "bw"/"bigwig" opens a BigWigFile. For any other
        extension no backend is opened and both is_bam and is_bw stay False.

        Keyword arguments:
        file_name -- Path of the BAM or bigWig file.
        """
        self.file_name = file_name
        self.bam = None
        self.bw = None
        self.sg_coefs = None
        self.is_bam = False
        self.is_bw = False
        extension = self.file_name.split(".")[-1].upper()
        if extension == "BAM":
            self.is_bam = True
            self.bam = Samfile(file_name, "rb")
        elif extension in ("BW", "BIGWIG"):
            self.is_bw = True
            self.bw = BigWigFile(file_name)
        # TODO ERROR: unsupported extensions are accepted silently here; the
        # signal-fetching methods raise ValueError for such instances.

    def load_sg_coefs(self, slope_window_size):
        """
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        # Order-2 polynomial, first derivative.
        self.sg_coefs = self.savitzky_golay_coefficients(slope_window_size, 2, 1)

    def _fetch_clipped_signal(self, ref, start, end, ext, initial_clip, ext_both_directions):
        """
        Fetches the raw signal of an interval and clips it twice: first at
        initial_clip, then at mean + 10 * std of the initially-clipped signal.

        Return:
        clip_signal -- List with one clipped value per position.

        Raises:
        ValueError -- If the instance has neither a BAM nor a bigWig backend.
        """
        pileup_region = PileupRegion(start, end, ext)
        if self.is_bam:
            if ps_version == "0.7.5":
                # This pysam version takes the pileup object as a callback;
                # ext_both_directions is not consulted on this path.
                self.bam.fetch(reference=ref, start=start, end=end, callback=pileup_region)
            else:
                alignments = self.bam.fetch(reference=ref, start=start, end=end)
                if not ext_both_directions:
                    for alignment in alignments:
                        pileup_region.__call__(alignment)
                else:
                    for alignment in alignments:
                        pileup_region.__call2__(alignment)
            raw_signal = array([min(e, initial_clip) for e in pileup_region.vector])
        elif self.is_bw:
            signal = self.bw.pileup(ref, start, end)
            raw_signal = array([min(e, initial_clip) for e in signal])
        else:
            raise ValueError(
                "Cannot fetch a signal from '%s': neither a BAM nor a bigWig file"
                % self.file_name
            )

        # Std-based clipping
        ceiling = raw_signal.mean() + (10 * raw_signal.std())
        return [min(e, ceiling) for e in raw_signal]

    def get_tag_count(self, ref, start, end, ext, initial_clip = 1000, ext_both_directions=False):
        """
        Gets the tag count associated with the underlying file based on start, end and ext.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extention. Eg. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        ext_both_directions -- If True, alignments are piled up with
                               PileupRegion.__call2__ instead of __call__.

        Return:
        tag_count -- Total (clipped) signal; 0 if the values cannot be summed.
        """
        clip_signal = self._fetch_clipped_signal(ref, start, end, ext, initial_clip, ext_both_directions)

        # Tag count (best effort: an unsummable signal counts as zero)
        try:
            tag_count = sum(clip_signal)
        except Exception:
            tag_count = 0

        return tag_count

    def get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 99.5, per_slope = 98,
                   bias_table = None, genome_file_name = None, ext_both_directions=False, print_wig = None):
        """
        Gets the signal associated with the underlying file based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors during the normalization
        and slope evaluation procedures. load_sg_coefs must have been called before.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extention. Eg. 1 for DNase and 200 for histone modifications.
        initial_clip -- Signal will be initially clipped at this level to avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope signal.
        bias_table -- Bias table to perform bias correction (None disables it).
        genome_file_name -- Genome FASTA path, only used for bias correction.
        ext_both_directions -- If True, alignments are piled up with
                               PileupRegion.__call2__ instead of __call__.
        print_wig -- Path prefix; when set, the clipped, normalized and slope
                     signals are appended to <prefix>signal.wig,
                     <prefix>norm.wig and <prefix>slope.wig.

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """
        clip_signal = self._fetch_clipped_signal(ref, start, end, ext, initial_clip, ext_both_directions)

        # Bias correction
        bias_corrected_signal = self.bias_correction(clip_signal, bias_table, genome_file_name, ref, start, end)

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(bias_corrected_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        hon_signal = self.hon_norm(boyle_signal, perc, boyle_signal.std())

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        slopehon_signal = self.hon_norm(slope_signal, perc, abs_seq.std())

        # Writing signal
        if print_wig:
            header = "fixedStep chrom="+ref+" start="+str(start+1)+" step=1\n"
            tracks = (("signal.wig", clip_signal), ("norm.wig", hon_signal), ("slope.wig", slopehon_signal))
            for suffix, values in tracks:
                with open(print_wig+suffix, "a") as wig_file:
                    wig_file.write(header+"\n".join([str(e) for e in values])+"\n")

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end):
        """
        Performs bias correction.

        NOTE: read counts are fetched from self.bam, so a bias table can only
        be used with a BAM-backed instance.

        Keyword arguments:
        signal -- Input signal (returned unchanged when bias_table is falsy).
        bias_table -- Bias table; bias_table.table[0] / [1] are the forward /
                      reverse k-mer -> bias mappings.
        genome_file_name -- Genome FASTA path.
        chrName -- Chromosome name.
        start -- Initial genomic coordinate.
        end -- Final genomic coordinate.

        Return:
        bias_corrected_signal -- Bias-corrected sequence.
        """
        if not bias_table:
            return signal

        # Parameters
        window = 50
        defaultKmerValue = 1.0
        # Floor division keeps every offset an int on Python 2 and 3 alike.
        half_window = window // 2

        # Initialization
        fastaFile = Fastafile(genome_file_name)
        try:
            fBiasDict = bias_table.table[0]
            rBiasDict = bias_table.table[1]
            # k-mer length, taken from any key of the forward table.
            k_nb = len(next(iter(fBiasDict)))
            half_k = k_nb // 2
            p1 = start
            p2 = end
            p1_w = p1 - half_window
            p2_w = p2 + half_window
            p1_wk = p1_w - half_k
            p2_wk = p2_w + half_k

            # Raw counts
            nf = [0.0] * (p2_w-p1_w)
            nr = [0.0] * (p2_w-p1_w)
            for r in self.bam.fetch(chrName, p1_w, p2_w):
                if (not r.is_reverse) and (r.pos > p1_w):
                    nf[r.pos-p1_w] += 1.0
                if r.is_reverse and ((r.aend-1) < p2_w):
                    nr[r.aend-1-p1_w] += 1.0

            # Smoothed counts (running window sums)
            Nf = []
            Nr = []
            fSum = sum(nf[:window])
            rSum = sum(nr[:window])
            fLast = nf[0]
            rLast = nr[0]
            for i in range(half_window, len(nf)-half_window):
                Nf.append(fSum)
                Nr.append(rSum)
                fSum -= fLast
                fSum += nf[i+half_window]
                fLast = nf[i-half_window+1]
                rSum -= rLast
                rSum += nr[i+half_window]
                rLast = nr[i-half_window+1]

            # Fetching sequence
            currStr = str(fastaFile.fetch(chrName, p1_wk-1, p2_wk-2)).upper()
            currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk+2, p2_wk+1)).upper())

            # Iterating on sequence to create signal
            af = []
            ar = []
            for i in range(half_k, len(currStr)-half_k+1):
                fseq = currStr[i-half_k:i+half_k]
                rseq = currRevComp[len(currStr)-half_k-i:len(currStr)+half_k-i]
                # k-mers missing from a table get the neutral default bias.
                af.append(fBiasDict.get(fseq, defaultKmerValue))
                ar.append(rBiasDict.get(rseq, defaultKmerValue))

            # Calculating bias
            fSum = sum(af[:window])
            rSum = sum(ar[:window])
            fLast = af[0]
            rLast = ar[0]
            bias_corrected_signal = []
            for i in range(half_window, len(af)-half_window):
                nhatf = Nf[i-half_window]*(af[i]/fSum)
                nhatr = Nr[i-half_window]*(ar[i]/rSum)
                zf = log(nf[i]+1)-log(nhatf+1)
                zr = log(nr[i]+1)-log(nhatr+1)
                bias_corrected_signal.append(zf+zr)
                fSum -= fLast
                fSum += af[i+half_window]
                fLast = af[i-half_window+1]
                rSum -= rLast
                rSum += ar[i+half_window]
                rLast = ar[i-half_window+1]
        finally:
            # Termination: release the FASTA handle even on failure.
            fastaFile.close()

        return bias_corrected_signal

    def hon_norm(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Zeros stay zero; other values are mapped through a logistic function of
        (|e| - mean) / std and keep their sign.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence (list).
        """
        norm_seq = []
        for e in sequence:
            if e == 0.0:
                norm_seq.append(0.0)
            elif e > 0.0:
                norm_seq.append(1.0/(1.0+(exp(-(e-mean)/std))))
            else:
                norm_seq.append(-1.0/(1.0+(exp(-(-e-mean)/std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Every value is divided by the mean of the strictly positive values.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence (list).
        """
        mean = array([e for e in sequence if e > 0]).mean()
        norm_seq = [(float(e)/mean) for e in sequence]
        return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the slope of the signal.
        It uses a window_size (of the interpolation), order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """
        # TODO Errors: window_size is not validated (odd, >= order + 2).
        window_size = abs(int(window_size))
        order = abs(int(order))
        order_range = range(order+1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients: row `deriv` of the pseudo-inverse of the
        # Vandermonde matrix of the window offsets. A plain ndarray is used
        # because numpy.mat is no longer available in NumPy 2.
        b = array([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
        m = linalg.pinv(b)[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence (list).
        """
        convolved = convolve(sequence, sg_coefs)
        # Trim the borders added by the convolution. Floor division keeps the
        # slice bounds integral on Python 3.
        margin = len(sg_coefs) // 2
        return list(convolved[margin:len(convolved)-margin])
# Iterating on coordinates coordFile = open(coordFileName,"r") for line in coordFile: try: # Initialization ll = line.strip().split("\t") chrName = ll[0] p1 = int(ll[1]); p2 = int(ll[2]) p1_w = p1 - (window/2); p2_w = p2 + (window/2) p1_wk = p1_w - (k_nb/2); p2_wk = p2_w + (k_nb/2) # Raw counts pileup_region = PileupRegion(p1_w,p2_w) iter = bamFile.fetch(reference=chrName, start=p1_w, end=p2_w) for alignment in iter: pileup_region.__call__(alignment) nf = pileup_region.vectorF nr = pileup_region.vectorR outputFileRaw.write("fixedStep chrom="+chrName+" start="+str(p1+1)+" step=1\n") for i in range(0,len(nf)): outputFileRaw.write(str(nf[i]+nr[i])+"\n") #print "RAW reads" #for i in range(p1_w, p2_w): # print i+1, nf[i-p1_w], nr[i-p1_w] # Smoothed counts Nf = []; Nr = []; fSum = sum(nf[:window]); rSum = sum(nr[:window]); fLast = nf[0]; rLast = nr[0]
def bam_surject_msa(args):
    """
    Rewrite the records of a BAM file onto the references named in
    args.refnames (one per MSA FASTA in args.msa_fastas) and write them to
    args.output (BAM when the name ends with '.bam', SAM with header otherwise).

    Reads from args: bam, skip_flag, refnames, msa_fastas, keep_rest, output,
    check.

    Caveats:
      - flags are remained as original statuses
      - remaining original values for MD, NM, and AS tags
      - mate are given as unmapped
      - same records are emited

    Raises:
      AssertionError -- refnames and msa_fastas differ in length.
      Exception      -- an MSA FASTA has no contigs, or a converted CIGAR has
                        a different query length than the original one.
    """
    skip_flag = args.skip_flag
    with Samfile(args.bam) as sam:
        mapped_ref_set = set(sam.references)

        # setup output
        if args.refnames is None:
            refnames = [
                'consensus{0}'.format(i) for i in range(len(args.msa_fastas))
            ]
        else:
            # Copy so the extend() below never mutates the caller's list.
            refnames = list(args.refnames)
        assert len(refnames) == len(
            args.msa_fastas
        ), 'The number of refnames should be the same as that of msa_fastas.'

        logging.info('Loading MSA fastas')
        logging.info('Skip flag: %s', args.skip_flag)
        fastas = []
        ref_lens = []
        target_ref_set = set()
        for fn in args.msa_fastas:
            with open(fn) as fp:
                fasta = Fasta(fp)
                fastas.append(fasta)
                if len(fasta.contigs) == 0:
                    logging.error('Fasta file %s has no contigs', fn)
                    raise Exception('No contigs')
                # The output reference length is taken from the first contig.
                ref_lens.append(len(fasta.contigs[0]))
                target_ref_set.update(fasta.names)

        rest_refs = [r for r in sam.references if r not in target_ref_set]
        logging.info('%s are included in surjection targets.', len(target_ref_set))
        logging.info('%s are not included in surjection targets.', len(rest_refs))
        if args.keep_rest:
            logging.info('Rest of reference will be kept in surjected BAM file')
            org_ref_len_map = dict(zip(sam.references, sam.lengths))
            refnames.extend(rest_refs)
            ref_lens.extend([org_ref_len_map[r] for r in rest_refs])
            # None marks "copy the records over without surjection".
            fastas.extend([None] * len(rest_refs))

        logging.info('Setting up output BAMs')
        if args.output.endswith('.bam'):
            mode = 'wb'
        else:
            mode = 'wh'
        # The header lists every output reference built above.
        with pysam.Samfile(args.output, mode=mode,
                           reference_names=refnames,
                           reference_lengths=ref_lens) as out:
            # iteration
            for refname, fasta in zip(refnames, fastas):
                out_tid = out.gettid(refname)
                if fasta is None:
                    logging.info('Transfering %s', refname)
                    src_tid = sam.gettid(refname)
                    for rec in sam.fetch(reference=refname):
                        if rec.flag & skip_flag:
                            continue
                        a = rec.__copy__()
                        a.reference_id = out_tid
                        if a.next_reference_id == src_tid:  # pair on the same refs
                            a.next_reference_id = out_tid
                        else:
                            a.next_reference_id = -1  # unpair
                            a.next_reference_start = -1
                        out.write(a)
                    continue

                logging.info('Surjecing to %s', refname)
                query_refs = fasta.names
                cc = _CigarChecker() if args.check else None

                for qref in query_refs:
                    if qref not in mapped_ref_set:
                        logging.warning('%s is not found in original BAM file', qref)
                        continue

                    # NOTE(review): `rec`, `mc` and `q_aln` are not bound by
                    # any visible statement in this branch. Presumably a
                    # per-`qref` converter/alignment setup and a loop over
                    # sam.fetch(reference=qref) belong here -- confirm against
                    # the upstream implementation before relying on this path.
                    a = rec.__copy__()
                    if not rec.is_unmapped:
                        org_cigar = Cigar(rec.cigartuples)
                        pos, cigar = mc.convert(rec.pos, org_cigar)
                        if org_cigar.query_length != cigar.query_length:
                            logging.error('Invalid cigar conversion for %s', rec.qname)
                            logging.error('org %s %s %s', rec.pos, org_cigar, org_cigar.query_length)
                            logging.error('new %s %s %s', pos, cigar, cigar.query_length)
                            s1 = pos
                            e1 = mc.get_pos(rec.pos + cigar.ref_length)
                            logging.error('ref %s-%s %s', s1, e1, mc.get_ref_cigar(s1, e1))
                            logging.error('read %s', rec.seq)
                            logging.error('qref %s', q_aln.seq[s1:e1])
                            raise Exception('Incompatible Cigar')
                        if cc:
                            cc.check(rec, pos, cigar, org_cigar, mc, q_aln)
                        a.cigar = cigar.values
                        a.reference_start = pos
                    a.reference_id = out_tid
                    a.next_reference_id = -1  # this is required
                    a.next_reference_start = -1  # this is required
                    out.write(a)
def create_signal(args, regions):
    """
    Count observed and expected k-mers on both strands over `regions` and
    write the non-zero counts to four tab-separated files in
    args.output_location: <k>_f_obs.fa, <k>_f_exp.fa, <k>_r_obs.fa and
    <k>_r_exp.fa.

    Observed k-mers are taken from the genome around each read of
    args.reads_file (shifted by args.forward_shift / args.reverse_shift);
    expected k-mers are every window of the region sequence and of its reverse
    complement. k-mers that are not plain A/C/G/T words of length args.k_nb
    (ambiguity codes, sequences truncated at a contig end) are skipped.

    Keyword arguments:
    args -- Namespace providing k_nb, reads_file, organism, forward_shift,
            reverse_shift and output_location.
    regions -- Iterable of regions exposing chrom, initial and final.

    Return:
    None -- Results are written to disk.
    """
    complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    def revcomp(s):
        # Unknown symbols become "N" so the resulting k-mer is skipped below
        # instead of aborting the whole run with a KeyError.
        return "".join([complement.get(e, "N") for e in s[::-1]])

    k_nb = args.k_nb
    half_k = k_nb // 2

    alphabet = ["A", "C", "G", "T"]
    kmer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    f_obs_dict = {e: 0.0 for e in kmer_comb}
    r_obs_dict = {e: 0.0 for e in kmer_comb}
    f_exp_dict = {e: 0.0 for e in kmer_comb}
    r_exp_dict = {e: 0.0 for e in kmer_comb}

    bam_file = Samfile(args.reads_file, "rb")
    fasta_file = None
    try:
        genome_data = GenomeData(args.organism)
        fasta_file = Fastafile(genome_data.get_genome())

        for region in regions:
            # Fetching observed reads
            reads = bam_file.fetch(reference=region.chrom, start=region.initial, end=region.final)
            for read in reads:
                if not read.is_reverse:
                    p1 = read.pos - half_k + args.forward_shift - 1
                else:
                    p1 = read.aend - half_k + args.reverse_shift + 1
                p2 = p1 + k_nb
                try:
                    dna_sequence_obs = str(fasta_file.fetch(region.chrom, p1, p2)).upper()
                except Exception:
                    # Best effort: windows the genome cannot serve are ignored.
                    continue
                if read.is_reverse:
                    dna_sequence_obs = revcomp(dna_sequence_obs)
                    obs_dict = r_obs_dict
                else:
                    obs_dict = f_obs_dict
                if dna_sequence_obs in obs_dict:
                    obs_dict[dna_sequence_obs] += 1

            # Fetching whole sequence
            try:
                dna_sequence_exp = str(fasta_file.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception:
                continue
            dna_sequence_exp_rev = revcomp(dna_sequence_exp)
            # NOTE(review): this bound leaves out the last full window
            # (start index len - k_nb); kept as-is to preserve the counts.
            for i in range(0, len(dna_sequence_exp) - k_nb):
                s = dna_sequence_exp[i:i + k_nb]
                if s in f_exp_dict:
                    f_exp_dict[s] += 1
                s = dna_sequence_exp_rev[i:i + k_nb]
                if s in r_exp_dict:
                    r_exp_dict[s] += 1
    finally:
        if fasta_file is not None:
            fasta_file.close()
        bam_file.close()

    # One file per (strand, kind) table; only k-mers that were seen are listed.
    tables = (
        ("{}_f_obs.fa", f_obs_dict),
        ("{}_f_exp.fa", f_exp_dict),
        ("{}_r_obs.fa", r_obs_dict),
        ("{}_r_exp.fa", r_exp_dict),
    )
    for name_template, counts in tables:
        output_fname = os.path.join(args.output_location, name_template.format(str(k_nb)))
        with open(output_fname, "w") as output_file:
            for kmer in kmer_comb:
                if counts[kmer] > 0:
                    output_file.write(kmer + "\t" + str(counts[kmer]) + "\n")
class BamFile:
    """
    Represents a bam file. It should be used to fetch normalized and slope
    signals from a bam file.

    Usage:
    1. Initialize class.
    2. Call load_sg_coefs once.
    3. Call get_signal as many times as needed.

    Authors: Eduardo G. Gusmao.

    Methods:

    load_sg_coefs(self, slope_window_size):
    Loads Savitzky-Golay coefficients into self.sg_coefs.

    get_signal(self, ref, start, end, ext, initial_clip = 1000, per_norm = 98, per_slope = 98)
    Returns the normalized signal and the normalized slope of an interval.

    hon_norm(self, sequence, mean, std):
    Between-dataset normalization (logistic squashing around mean/std).

    boyle_norm(self, sequence):
    Within-dataset normalization (division by the mean of positive values).

    savitzky_golay_coefficients(self, window_size, order, deriv):
    Evaluates the Savitzky-Golay coefficients used to compute the slope.

    slope(self, sequence, sg_coefs):
    Evaluates the slope of sequence given the sg_coefs loaded.
    """

    def __init__(self, file_name):
        """
        Initializes BamFile.

        Variables:
        bam -- Pysam's bam representation.
        sg_coefs -- Savitzky-Golay coefficients (list). Should be loaded after
                    class initialization.
        """
        self.file_name = file_name
        self.bam = Samfile(file_name, "rb")
        self.sg_coefs = None

    def load_sg_coefs(self, slope_window_size):
        """
        Loads Savitzky-Golay coefficients into self.sg_coefs based on a
        slope_window_size.

        Keyword arguments:
        slope_window_size -- Window size of Savitzky-Golay coefficients.

        Return:
        None -- It updates self.sg_coefs.
        """
        # Order-2 polynomial, first derivative.
        self.sg_coefs = self.savitzky_golay_coefficients(
            slope_window_size, 2, 1)

    def get_signal(self, ref, start, end, ext, initial_clip=1000,
                   per_norm=98, per_slope=98):
        """
        Gets the signal associated with self.bam based on start, end and ext.
        initial_clip, per_norm and per_slope are used as normalization factors
        during the normalization and slope evaluation procedures.
        load_sg_coefs must have been called before.

        Keyword arguments:
        ref -- Chromosome name.
        start -- Initial genomic coordinate of signal.
        end -- Final genomic coordinate of signal.
        ext -- Fragment extention. Eg. 1 for DNase and 200 for histone
               modifications.
        initial_clip -- Signal will be initially clipped at this level to
                        avoid outliers.
        per_norm -- Percentile value for 'hon_norm' function of the
                    normalized signal.
        per_slope -- Percentile value for 'hon_norm' function of the slope
                     signal.

        Return:
        hon_signal -- Normalized signal.
        slopehon_signal -- Slope signal.
        """
        # Fetch raw signal
        pileup_region = PileupRegion(start, end, ext)
        if ps_version == "0.7.5":
            # This pysam version takes the pileup object as a callback.
            self.bam.fetch(reference=ref, start=start, end=end,
                           callback=pileup_region)
        else:
            alignments = self.bam.fetch(reference=ref, start=start, end=end)
            for alignment in alignments:
                pileup_region.__call__(alignment)
        raw_signal = array(
            [min(e, initial_clip) for e in pileup_region.vector])

        # Std-based clipping
        ceiling = raw_signal.mean() + (10 * raw_signal.std())
        clip_signal = [min(e, ceiling) for e in raw_signal]

        # Boyle normalization (within-dataset normalization)
        boyle_signal = array(self.boyle_norm(clip_signal))

        # Hon normalization (between-dataset normalization)
        perc = scoreatpercentile(boyle_signal, per_norm)
        hon_signal = self.hon_norm(boyle_signal, perc, boyle_signal.std())

        # Slope signal
        slope_signal = self.slope(hon_signal, self.sg_coefs)

        # Hon normalization on slope signal (between-dataset slope smoothing)
        abs_seq = array([abs(e) for e in slope_signal])
        perc = scoreatpercentile(abs_seq, per_slope)
        slopehon_signal = self.hon_norm(slope_signal, perc, abs_seq.std())

        # Returning normalized and slope sequences
        return hon_signal, slopehon_signal

    def hon_norm(self, sequence, mean, std):
        """
        Normalizes a sequence according to hon's criterion using mean and std.
        This represents a between-dataset normalization.

        Zeros stay zero; other values are mapped through a logistic function of
        (|e| - mean) / std and keep their sign.

        Keyword arguments:
        sequence -- Input sequence.
        mean -- Global mean.
        std -- Global std.

        Return:
        norm_seq -- Normalized sequence (list).
        """
        norm_seq = []
        for e in sequence:
            if e == 0.0:
                norm_seq.append(0.0)
            elif e > 0.0:
                norm_seq.append(1.0 / (1.0 + (exp(-(e - mean) / std))))
            else:
                norm_seq.append(-1.0 / (1.0 + (exp(-(-e - mean) / std))))
        return norm_seq

    def boyle_norm(self, sequence):
        """
        Normalizes a sequence according to Boyle's criterion.
        This represents a within-dataset normalization.

        Every value is divided by the mean of the strictly positive values.

        Keyword arguments:
        sequence -- Input sequence.

        Return:
        norm_seq -- Normalized sequence (list).
        """
        mean = array([e for e in sequence if e > 0]).mean()
        norm_seq = [(float(e) / mean) for e in sequence]
        return norm_seq

    def savitzky_golay_coefficients(self, window_size, order, deriv):
        """
        Evaluate the Savitzky-Golay coefficients in order to evaluate the
        slope of the signal. It uses a window_size (of the interpolation),
        order (of the polynomial), deriv (derivative needed).

        Keyword arguments:
        window_size -- Size of the window for function interpolation.
        order -- Order of polynomial.
        deriv -- Derivative.

        Return:
        m[::-1] -- The Savitzky-Golay coefficients.
        """
        # TODO Errors: window_size is not validated (odd, >= order + 2).
        window_size = abs(int(window_size))
        order = abs(int(order))
        order_range = range(order + 1)
        half_window = (window_size - 1) // 2

        # Precompute Coefficients: row `deriv` of the pseudo-inverse of the
        # Vandermonde matrix of the window offsets. A plain ndarray is used
        # because numpy.mat is no longer available in NumPy 2.
        b = array([[k**i for i in order_range]
                   for k in range(-half_window, half_window + 1)])
        m = linalg.pinv(b)[deriv]
        return m[::-1]

    def slope(self, sequence, sg_coefs):
        """
        Evaluates the slope of sequence given the sg_coefs loaded.

        Keyword arguments:
        sequence -- Input sequence.
        sg_coefs -- Savitzky-Golay coefficients.

        Return:
        slope_seq -- Slope sequence (list).
        """
        convolved = convolve(sequence, sg_coefs)
        # Trim the borders added by the convolution. Floor division keeps the
        # slice bounds integral on Python 3.
        margin = len(sg_coefs) // 2
        return list(convolved[margin:len(convolved) - margin])