def count_reads(transcripts, bam_iter, number_of_counts=1):
    """
    Count the reads compatible with each transcript.

    :TODO rename
    :TODO change to cython

    A read is credited to a transcript when (a) every aligned block of the
    read is fully contained in an exon of that transcript and (b) every
    junction length of the read is also an intron length of the transcript.

    Arguments
    ---------
    transcripts : list
        list of transcripts, each a position-sorted list of exons indexable
        as (start, end, ...)
    bam_iter : pysam.BamFileIterator
        gotten after pysam.BamFile.fetch() call
    number_of_counts : int
        unused; kept so existing callers keep working

    Returns
    -------
    Array of length len(transcripts) with the number of compatible reads
    per transcript.
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # One set of intron lengths per transcript, built once instead of being
    # re-created for every read.
    intron_sets = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        introns = set()
        for j, exon in enumerate(transcript):
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                # previous exon end minus this exon start (sign is kept so
                # it compares directly with the read junction lengths)
                introns.add(transcript[j - 1][1] - exon[0])
        intron_sets.append(introns)

    for read in bam_iter:
        blocks = read.get_blocks()
        if not blocks:
            # A read without aligned blocks cannot support any transcript;
            # without this guard it would trivially satisfy both tests.
            continue
        block_counter = zeros((n_transcripts, ))
        intron_match = zeros((n_transcripts, ))
        junction_lengths = set(
            prev[1] - nxt[0] for prev, nxt in zip(blocks, blocks[1:]))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # A block outside every exon rules the read out entirely.
                break
            for hit in overlap:
                if block[0] >= hit.start and block[1] <= hit.end:
                    block_counter[hit.value['anno']] += 1
        for ti, introns in enumerate(intron_sets):
            if junction_lengths.issubset(introns):
                intron_match[ti] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        out_counts += logical_and(block_counter == smatch, intron_match)
    # The per-read match matrix of the previous version was never used; it
    # grew with every read and failed on an empty iterator, so it is gone.
    return out_counts
def get_reads_and_ranges(bam_region, cid, chrom, region_start, region_end,
                         strand, options):
    """
    Collect the usable fragments of a region and their per-position tallies.

    Reads come from filter_reads(); each one accepted by is_valid_paired()
    or, failing that, is_valid_single() is stored as an Interval and
    reported to inc_pr() (whole span) and inc_pr_at() (selected endpoints).

    Returns a tuple (filtered_reads, pos_range): the Intersecter holding the
    fragments and the defaultdict of two-element lists updated by the
    inc_pr helpers. `cid` and `strand` are accepted but not used here.
    """
    pos_range = defaultdict(lambda: [0, 0])
    filtered_reads = Intersecter()
    contig = options.chrom_prefix + chrom
    for read in filter_reads(bam_region, contig, region_start, region_end,
                             options):
        paired = is_valid_paired(read, region_start, options)
        if not paired and not is_valid_single(read, options):
            continue

        # 1-based, end-inclusive fragment coordinates
        if paired:
            frag_start = min(read.pos, read.pnext) + 1
            frag_end = frag_start + abs(read.isize) - 1
        else:
            frag_start = read.pos + 1
            frag_end = frag_start + aln_length(read.cigar) - 1

        filtered_reads.add_interval(Interval(frag_start, frag_end))
        inc_pr(pos_range, frag_start, frag_end, region_start, region_end)

        # Paired, merged and trimmed reads report both ends; other single
        # reads report only one end, chosen by orientation.
        if paired or as_merged(read, options) or as_trimmed(read, options):
            endpoints = (frag_start, frag_end)
        elif read.is_reverse:
            endpoints = (frag_end, )
        else:
            endpoints = (frag_start, )
        for endpoint in endpoints:
            inc_pr_at(pos_range, endpoint, region_start, region_end)
    return filtered_reads, pos_range
def buildIntervalTree(exons):
    '''Build interval tree from exon annotations.

    Each exon contributes one Interval spanning exon.start..exon.end whose
    value is a dict carrying the exon's cStart and cEnd attributes.
    '''
    exon_tree = Intersecter()
    for ex in exons:
        span = Interval(ex.start, ex.end,
                        value={'cStart': ex.cStart, 'cEnd': ex.cEnd})
        exon_tree.add_interval(span)
    return exon_tree
def calculate_score(chrom, start, end, ctype="WPS"):
    """
    Compute a per-position score over the inclusive region start..end.

    Reads come from readIterator(args, chrom, start, end); duplicates,
    QC-failed, unmapped and soft-clipped reads are skipped. Paired reads
    are turned into one fragment per pair (mate on the same reference,
    non-zero insert size); unpaired reads use their aligned length.
    Fragments whose length is outside options.minLength..options.maxLength
    are ignored, and options.downsample (when not None) randomly drops
    fragments.

    ctype selects the score:
      "COV"    - number of fragments covering each position
      "STARTS" - number of fragment endpoints at each position
      "WPS"    - for a window of options.protection // 2 on each side of
                 the position: fragments spanning the whole window minus
                 fragments that overlap it only partially
    Any other value leaves every position at 0.

    Relies on the module-level names `args` and `options`.

    Returns a list with one score per position from start to end.
    """
    filteredReads = Intersecter()
    posRange = defaultdict(int)

    def dropped_by_downsampling():
        # Draw a random number only when downsampling is enabled.
        return (options.downsample is not None
                and random.random() >= options.downsample)

    def record_fragment(frag_start, frag_end):
        # Shared bookkeeping for paired and single-read fragments.
        frag_len = frag_end - frag_start + 1
        if not (options.minLength <= frag_len <= options.maxLength):
            return
        filteredReads.add_interval(Interval(frag_start, frag_end))
        if ctype == "COV":
            # Only walk the part of the fragment inside the region.
            for i in range(max(frag_start, start), min(frag_end, end) + 1):
                posRange[i] += 1
        elif ctype == "STARTS":
            if start <= frag_start <= end:
                posRange[frag_start] += 1
            if start <= frag_end <= end:
                posRange[frag_end] += 1

    for read in readIterator(args, chrom, start, end):
        if read.is_duplicate or read.is_qcfail or read.is_unmapped:
            continue
        if isSoftClipped(read.cigar):
            continue
        if read.is_paired:
            if read.mate_is_unmapped:
                continue
            if read.rnext != read.tid:
                continue
            # Count each pair once: via read1, or via read2 when its mate
            # lies entirely before the region.
            if not (read.is_read1 or
                    (read.is_read2 and read.pnext + read.qlen < start)):
                continue
            if read.isize == 0:
                continue
            if dropped_by_downsampling():
                continue
            rstart = min(read.pos, read.pnext) + 1  # 1-based
            rend = rstart + abs(read.isize) - 1  # end included
        else:
            if dropped_by_downsampling():
                continue
            rstart = read.pos + 1  # 1-based
            rend = rstart + aln_length(read.cigar) - 1  # end included
        record_fragment(rstart, rend)

    if ctype == "WPS":
        protection = options.protection // 2
        for pos in range(start, end + 1):
            win_start, win_end = pos - protection, pos + protection
            gcount, bcount = 0, 0
            for frag in filteredReads.find(win_start, win_end):
                if (frag.start > win_start) or (frag.end < win_end):
                    bcount += 1
                else:
                    gcount += 1
            posRange[pos] += gcount - bcount

    return [posRange[pos] for pos in range(start, end + 1)]
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """
    Count the reads compatible with each transcript.

    :TODO rename
    :TODO change to cython

    A read is credited to a transcript when (a) every aligned block of the
    read is fully contained in an exon of that transcript and (b) every
    junction length of the read is also an intron length of the transcript.

    Arguments
    ---------
    transcripts : list
        list of transcripts, each a position-sorted list of exons indexable
        as (start, end, ...)
    bam_iter : pysam.BamFileIterator
        gotten after pysam.BamFile.fetch() call
    number_of_counts : int
        unused; kept so existing callers keep working

    Returns
    -------
    Array of length len(transcripts) with the number of compatible reads
    per transcript.
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    tree = Intersecter()
    # One set of intron lengths per transcript, built once instead of being
    # re-created for every read.
    intron_sets = []
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        introns = set()
        for j, exon in enumerate(transcript):
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                # previous exon end minus this exon start (sign is kept so
                # it compares directly with the read junction lengths)
                introns.add(transcript[j - 1][1] - exon[0])
        intron_sets.append(introns)

    for read in bam_iter:
        blocks = read.get_blocks()
        if not blocks:
            # A read without aligned blocks cannot support any transcript;
            # without this guard it would trivially satisfy both tests.
            continue
        block_counter = zeros((n_transcripts, ))
        intron_match = zeros((n_transcripts, ))
        junction_lengths = set(
            prev[1] - nxt[0] for prev, nxt in zip(blocks, blocks[1:]))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # A block outside every exon rules the read out entirely.
                break
            for hit in overlap:
                if block[0] >= hit.start and block[1] <= hit.end:
                    block_counter[hit.value['anno']] += 1
        for ti, introns in enumerate(intron_sets):
            if junction_lengths.issubset(introns):
                intron_match[ti] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        out_counts += logical_and(block_counter == smatch, intron_match)
    # The per-read match matrix of the previous version was never used; it
    # grew with every read and failed on an empty iterator, so it is gone.
    return out_counts
def init_intersecter(hits):
    '''
    Return an Intersecter holding one Interval per hit.

    Each hit is indexed as hit[0] (start) and hit[1] (end); any further
    elements are ignored.
    '''
    tree = Intersecter()
    for hit in hits:
        lo, hi = hit[0], hit[1]
        tree.add_interval(Interval(lo, hi))
    return tree
def init_intersecter(hits):
    '''
    Build an interval tree from a sequence of hits.

    Only the first two items of every hit are read: hit[0] is used as the
    interval start and hit[1] as the interval end.
    '''
    hit_tree = Intersecter()
    for entry in hits:
        span = Interval(entry[0], entry[1])
        hit_tree.add_interval(span)
    return hit_tree
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    This function follows the function of the same name from Hi-C Pro
    (https://github.com/nservant/HiC-Pro/blob/master/scripts/mapped_2hic_fragments.py)
    by Nicolas Servant, Eric Viara.

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Every stored interval carries a value
    dict with the fragment 'name' and its 'midPoint' ((start + end) / 2).

    Lines with fewer than four tab-separated columns are reported and
    skipped, as are fragments shorter than minfragsize or longer than
    maxfragsize (when those limits are given).

    in_file = input file [character]
    minfragsize = minimum fragment length to keep, or None [integer]
    maxfragsize = maximum fragment length to keep, or None [integer]
    verbose = verbose mode [logical]

    Returns a dict mapping chromosome name to its Intersecter.
    """
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals '" + in_file + "'...")

    # `with` guarantees the handle is released even if a line fails to parse
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # str() is required: concatenating the int line number
                # directly raised TypeError instead of printing the warning
                print("Warning : wrong input format in line " + str(nline) + ". Not a BED file !?")
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            midPoint = (start + end)/2
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            too_small = minfragsize is not None and fragl < int(minfragsize)
            too_large = maxfragsize is not None and fragl > int(maxfragsize)
            if too_small or too_large:
                print("Warning : fragment " + name + " [" + str(fragl) + "] outside of range. Discarded")
                continue

            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name, 'midPoint': midPoint}))
    return resFrag
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Every interval carries a value dict with
    the fragment 'name' and a boolean 'filter' that is True when the
    fragment length is below minfragsize or above maxfragsize. Such
    fragments are still stored in the tree, although the summary warning
    words them as "discarded".

    Lines with fewer than four tab-separated columns are reported and
    skipped.

    in_file = input file [character]
    minfragsize = minimum fragment length, or None [integer]
    maxfragsize = maximum fragment length, or None [integer]
    verbose = verbose mode [logical]

    Returns a dict mapping chromosome name to its Intersecter.
    """
    # Messages are built as single strings so the function runs unchanged on
    # Python 2 and 3 and prints the same text the print statements did.
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    nline = 0
    nfilt = 0
    # `with` guarantees the handle is released even if a line fails to parse
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Flag fragments outside the size range
            filt = ((minfragsize is not None and fragl < int(minfragsize)) or
                    (maxfragsize is not None and fragl > int(maxfragsize)))
            if filt:
                nfilt += 1

            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name, 'filter': filt}))

    if nfilt > 0:
        print("Warning :  %s fragment(s) outside of range and discarded.  %s  remaining."
              % (nfilt, nline - nfilt))
    return resFrag
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Every interval carries a value dict with
    the fragment 'name'.

    Lines with fewer than four tab-separated columns are reported and
    skipped, as are fragments shorter than minfragsize or longer than
    maxfragsize (when those limits are given).

    in_file = input file [character]
    minfragsize = minimum fragment length to keep, or None [integer]
    maxfragsize = maximum fragment length to keep, or None [integer]
    verbose = verbose mode [logical]

    Returns a dict mapping chromosome name to its Intersecter.
    """
    # Messages are built as single strings so the function runs unchanged on
    # Python 2 and 3 and prints the same text the print statements did.
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    # `with` guarantees the handle is released even if a line fails to parse
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            too_small = minfragsize is not None and fragl < int(minfragsize)
            too_large = maxfragsize is not None and fragl > int(maxfragsize)
            if too_small or too_large:
                print("Warning : fragment  %s  [ %s ] outside of range. Discarded" % (name, fragl))
                continue

            # Direct dict membership: no throwaway key list per line
            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name}))
    return resFrag
def load_BED(in_file, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    BED file are half-open, meaning that a bin ]100, 200] covered the bases
    101 to 200

    Each stored interval carries value {'pos': <zero-based line index>}.
    Lines with fewer than four tab-separated columns are reported on stderr
    and skipped; they still advance the line index but add no feature name.

    in_file = input file [character]
    verbose = verbose mode [logical]

    Returns a tuple (trees, featureNames): a dict mapping chromosome name to
    its Intersecter, and the list of stripped feature names in file order.
    """
    # Messages are built as single strings so the function runs unchanged on
    # Python 2 and 3 and emits the same text the print statements did.
    x = {}
    if verbose:
        print("## Loading BED file ' %s '..." % (in_file,))

    featureNames = []
    nline = 0
    with open(in_file) as bed_handle:
        for line in bed_handle:
            if nline > 0 and nline % 5000 == 0 and verbose:
                print("## %d features loaded ..." % nline)
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                sys.stderr.write(
                    "Warning : wrong input format in line %s . Not a BED file !?\n" % (nline,))
                continue

            # BED files are zero-based, half-open as Intervals objects
            start = int(start)
            end = int(end)
            featureNames.append(name.strip())

            # NOTE(review): 'pos' is the line index, so it stops matching the
            # index into featureNames once a malformed line has been
            # skipped - confirm which one callers expect.
            if chromosome not in x:
                x[chromosome] = Intersecter()
            x[chromosome].add_interval(
                Interval(start, end, value={'pos': nline - 1}))
    # The `with` block has already closed the file; no explicit close needed.
    return (x, featureNames)
def load_bed(in_file, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    Only the first three tab-separated columns (chromosome, start, end) are
    used. Lines with fewer than three columns are reported on stderr and
    skipped.

    in_file = input file [character]
    verbose = verbose mode [logical]

    Returns a dict mapping chromosome name to its Intersecter.
    """
    # Messages are written as single strings so the function runs unchanged
    # on Python 2 and 3 and emits the same text the print statements did.
    intervals = {}
    if verbose:
        sys.stderr.write("## Loading BED file ' %s '...\n" % (in_file,))

    # `with` guarantees the handle is released even if a line fails to parse
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.strip().split("\t")
            try:
                chromosome, start, end = bedtab[:3]
            except ValueError:
                sys.stderr.write(
                    "Warning : wrong input format in line %s . Not a BED file !?\n" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)

            if chromosome not in intervals:
                intervals[chromosome] = Intersecter()
            intervals[chromosome].add_interval(Interval(start, end))
    return intervals
def load_restriction_fragment(in_file, verbose):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Every interval carries a value dict with
    the fragment 'name'.

    Lines with fewer than four tab-separated columns are skipped; a warning
    naming the line number is written to stderr for each of them.

    in_file = input file [character]
    verbose = verbose mode [logical]

    Returns a dict mapping chromosome name to its Intersecter.
    """
    import sys  # local import: only needed for the malformed-line warning

    # Messages are built as single strings so the function runs unchanged on
    # Python 2 and 3 and prints the same text the print statement did.
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    # `with` guarantees the handle is released even if a line fails to parse
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # Previously skipped in silence; report it on stderr so
                # stdout output is unaffected.
                sys.stderr.write(
                    "Warning : wrong input format in line %d . Not a BED file !?\n" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()

            # Direct dict membership: no throwaway key list per line
            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name}))
    return resFrag
def maps_gene(mapped):
    '''Determine if the mapped alignment falls within a gene.

    The gene tree of each genome is built on first use from the 'gene'
    features in db.feature and cached in the module-level `intersecters`
    dict. Returns whatever the tree's find() yields for
    mapped['refStart']..mapped['refEnd'].
    '''
    global intersecters
    try:
        gene_tree = intersecters[mapped['genome']]
    except KeyError:
        query = {'genome': mapped['genome'], 'type': 'gene'}
        genes = db.feature.find(query)
        gene_tree = Intersecter()
        for gene in genes:
            # Interval end is exclusive, need to +1 to line up with actual position
            gene_tree.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))
        intersecters[mapped['genome']] = gene_tree
    return gene_tree.find(mapped['refStart'], mapped['refEnd'])
def load_BED(in_file, exclusionSize=0, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    When exclusionSize > 0 a second set of trees is filled with two flanking
    intervals per feature: [start - exclusionSize, start] named
    "<name>_up" and [end, end + exclusionSize] named "<name>_dwn".

    Lines with fewer than four tab-separated columns are reported and
    skipped.

    in_file = input file [character]
    exclusionSize = size of the flanking exclusion regions [integer]
    verbose = verbose mode [logical]

    Returns a tuple (x, x_ex) of dicts mapping chromosome name to the
    Intersecter of features and of exclusion regions respectively.
    """
    # Messages are built as single strings so the function runs unchanged on
    # Python 2 and 3 and prints the same text the print statements did.
    x = {}
    x_ex = {}
    if verbose:
        print("## Loading BED file ' %s '..." % (in_file,))

    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()

            # Direct dict membership: no throwaway key list per line
            if chromosome not in x:
                x[chromosome] = Intersecter()
            x[chromosome].add_interval(
                Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                if chromosome not in x_ex:
                    x_ex[chromosome] = Intersecter()
                tree_ex = x_ex[chromosome]
                tree_ex.add_interval(
                    Interval(start - flank, start,
                             value={'name': str(name) + "_up"}))
                tree_ex.add_interval(
                    Interval(end, end + flank,
                             value={'name': str(name) + "_dwn"}))
    # The `with` block has already closed the file; no explicit close needed.
    return (x, x_ex)
def maps_gene(mapped):
    """Determine if the mapped alignment falls within a gene.

    Looks up (or lazily builds and caches in the module-level
    `intersecters` dict) the gene interval tree for mapped['genome'], then
    returns the result of querying it with mapped['refStart'] and
    mapped['refEnd'].
    """
    global intersecters
    try:
        cached_tree = intersecters[mapped['genome']]
    except KeyError:
        gene_cursor = db.feature.find({
            'genome': mapped['genome'],
            'type': 'gene'
        })
        cached_tree = Intersecter()
        for record in gene_cursor:
            # Interval end is exclusive, need to +1 to line up with actual position
            gene_span = Interval(record['start'], record['end'] + 1,
                                 record['uid'])
            cached_tree.add_interval(gene_span)
        intersecters[mapped['genome']] = cached_tree
    return cached_tree.find(mapped['refStart'], mapped['refEnd'])
def load_BED(in_file, exclusionSize=0, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    When exclusionSize > 0 a second set of trees is filled with two flanking
    intervals per feature: [start - exclusionSize, start] named
    "<name>_up" and [end, end + exclusionSize] named "<name>_dwn".

    Lines with fewer than four tab-separated columns are reported and
    skipped.

    in_file = input file [character]
    exclusionSize = size of the flanking exclusion regions [integer]
    verbose = verbose mode [logical]

    Returns a tuple (x, x_ex) of dicts mapping chromosome name to the
    Intersecter of features and of exclusion regions respectively.
    """
    # Messages are built as single strings so the function runs unchanged on
    # Python 2 and 3 and prints the same text the print statements did.
    x = {}
    x_ex = {}
    if verbose:
        print("## Loading BED file ' %s '..." % (in_file,))

    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()

            if chromosome not in x:
                x[chromosome] = Intersecter()
            x[chromosome].add_interval(
                Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                if chromosome not in x_ex:
                    x_ex[chromosome] = Intersecter()
                tree_ex = x_ex[chromosome]
                tree_ex.add_interval(
                    Interval(start - flank, start,
                             value={'name': str(name) + "_up"}))
                tree_ex.add_interval(
                    Interval(end, end + flank,
                             value={'name': str(name) + "_dwn"}))
    # The `with` block has already closed the file; no explicit close needed.
    return (x, x_ex)
def compare_two_transcripts(trans1, trans2, transcript_dict, afe=False):
    """
    Returns the splice differences between two transcripts. Single
    exon-comparisons are ignored.

    The transcript whose first exon starts earliest is indexed in an
    interval tree (s1); the exons of the other transcript (s2) are then
    walked in position order and classified by how they overlap s1.

    Parameters
    ----------
    trans1 : string
        transcript of interest
    trans2 : string
        second transcript of interest
    transcript_dict : a dictionary of transcript names with values being a
        list of exons, each indexed as (start, end, exon number)
    afe : bool
        whether to include alternate start and ends
        (not referenced in the body below)

    :TODO make a better return
    :TODO maybe include something similar to to_plot

    Returns
    -------
    ([], []) when either transcript has at most one exon, otherwise a tuple
    (matching_exons, skipped_exons):
    matching_exons : list of (start, end, (s1 exon number, s2 exon number),
        (0, 0)) for exons with identical coordinates
    skipped_exons : EventCollection of the 'skipped_exon' DiffEvents with the
        'AE' / 'AS' DiffEvents appended to its events

    Exclusive junctions are collected in a local list but not returned.
    The exon lists stored in transcript_dict are sorted in place.
    """
    # NOTE(review): block nesting below was reconstructed from a copy of this
    # function that had lost its line breaks - verify against the repository.
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    reverse = False
    # s1 is the transcript that starts first (ties go to trans1)
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # assigned but not used below
    else:
        s1 = t2
        s2 = t1
        reverse = True
        s2_beg = min(starts1)  # assigned but not used below
    # torder holds the transcript names in (s1, s2) order
    if reverse:
        torder = (trans2, trans1)
    else:
        torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return([], [])
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]),
                                   value={'anno':i[2]}))
    matching_exons = []
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # maps an s2 exon number to the s1 exon number it overlapped
    exon_match = {}
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    max_exon_1 = s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None
    # strand is -1 when exon numbers decrease along the position-sorted s1
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            if prev_match and (start < s1_end):
                #skipped exons
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            # next s2 exon overlaps s1: skip runs up to it
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # otherwise use the s1 exon following prev_match
                            nm=s1[_get_by_exonn(prev_match.value['anno']+strand,s1)]
                            ocigar = [(3,nm[0] - prev_match.end)]
                            nexon = nm[2]
                        skipped_exons.append(DiffEvent('skipped_exon', start, end,
                            torder, cigar2=cigar, cigar1 = ocigar,
                            exon_num = (None, exon_n),
                            exon2=(prev_match.value['anno'], nexon))
                            )
                except KeyError:
                    # Multiple skipped exons
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end:
                # s2 exon lies beyond the last s1 exon: alternate end
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    for i in range(pexon, max_exon_1+strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg+1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(DiffEvent('AE', start, end,
                        torder, cigar2=cigar, cigar1=ocigar,
                        exon_num = (None, exon_n)))
                else:
                    pass
            else:
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover interactive-debugging hook; once
                    # embed() returns, nm is either unbound or left over from
                    # an earlier iteration.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                          (3, nm.start - pmatch[1])]
                altends.append(DiffEvent('AS', start, end,
                    torder, cigar2=cigar, cigar1 = ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # identical coordinates in both transcripts
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append((start, end, (s1_exon_n, exon_n),
                                       (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno'] == strand:
                        pass
                    else:
                        # Difference in exon matches
                        mskip = abs(s1_exon_n - prev_match.value['anno'] ) - 1
                        narg = _get_by_exonn(prev_match.value['anno']+strand,
                                             s1)
                        s_s1 = s1[narg]
                        # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr-1][1])]
                        # Remove previous one
                        skipped_exons.append(
                            DiffEvent('skipped_exon', s_s1[0], s_s1[1],
                                torder, cigar2 = ocigar, cigar1 = cigar,
                                exon_num = (s_s1[2], None),
                                exon2 = (exon_n-strand, exon_n)))
                prev_match = overlap[0]
            else:
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2 and
                        overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                        (sstart, ssend,
                         (overlap[0].value['anno'], exon_n),
                         (overlap[0].start - start, overlap[0].end - end)
                         ))
                # Deal with partial matches
                prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # s2 exons overlapping more than one s1 exon are not handled
            pass
    skipped_exons = EventCollection(transcript_ids = [s1, s2],
                                    events=skipped_exons)
    skipped_exons.events.extend(altends)
    return(matching_exons, skipped_exons)
The threshold can also be adjusted (thresh_max). """ for line in smooth_list: if line > 0 and not started: started = True start_base = pos + startpos if line < 0 and started: thresh += 1 if thresh >= thresh_max: thresh = 0 started = False end_base = pos + startpos region_len = end_base - start_base if region_len >= nucl_min and region_len <= nucl_max: nucl.add_interval(Interval(start_base, end_base)) count += 1 pos += 1 ## initialize a list for data from find_contig() method contig_list = [] for read in nucl.find(0, end_base + 1): ## for each potential nucleosome interval, call find contig method. find_contig(read, smooth_list, startpos, contig_list) ## lastly, call this method to print all probable nucleosome ranges to a bed file calc_nucl_distance(contig_list, smooth_list, startpos, chrom_num)
if read.rnext != read.tid: continue if read.is_read1 or (read.is_read2 and read.pnext + read.qlen < regionStart - protection - 1): if read.isize == 0: continue if options.downsample != None and random.random( ) >= options.downsample: continue rstart = min(read.pos, read.pnext) + 1 # 1-based lseq = abs(read.isize) rend = rstart + lseq - 1 # end included if minInsSize != None and ((lseq < minInsSize) or (lseq > maxInsSize)): continue filteredReads.add_interval(Interval(rstart, rend)) #print read.qname,rstart,rend,rend-rstart,abs(read.isize) for i in range(rstart, rend + 1): if i >= regionStart and i <= regionEnd: posRange[i][0] += 1 if rstart >= regionStart and rstart <= regionEnd: posRange[rstart][1] += 1 if rend >= regionStart and rend <= regionEnd: posRange[rend][1] += 1 else: if options.downsample != None and random.random( ) >= options.downsample: continue rstart = read.pos + 1 # 1-based lseq = aln_length(read.cigar) rend = rstart + lseq - 1 # end included
def regionTree1(mylist):
    """Return an Intersecter with one Interval per (start, end) pair in mylist."""
    tree = Intersecter()
    for begin, stop in mylist:
        region = Interval(begin, stop)
        tree.add_interval(region)
    return tree
def compare_two_transcripts(trans1, trans2, transcript_dict, afe=False):
    """
    Returns the splice differences between two transcripts. Single
    exon-comparisons are ignored.

    The transcript whose first exon starts earliest is indexed in an
    interval tree (s1); the exons of the other transcript (s2) are then
    walked in position order and classified by how they overlap s1.

    Parameters
    ----------
    trans1 : string
        transcript of interest
    trans2 : string
        second transcript of interest
    transcript_dict : a dictionary of transcript names with values being a
        list of exons, each indexed as (start, end, exon number)
    afe : bool
        whether to include alternate start and ends
        (not referenced in the body below)

    :TODO make a better return
    :TODO maybe include something similar to to_plot

    Returns
    -------
    ([], []) when either transcript has at most one exon, otherwise a tuple
    (matching_exons, skipped_exons):
    matching_exons : list of (start, end, (s1 exon number, s2 exon number),
        (0, 0)) for exons with identical coordinates
    skipped_exons : EventCollection of the 'skipped_exon' DiffEvents with the
        'AE' / 'AS' DiffEvents appended to its events

    Exclusive junctions are collected in a local list but not returned.
    The exon lists stored in transcript_dict are sorted in place.
    """
    # NOTE(review): block nesting below was reconstructed from a copy of this
    # function that had lost its line breaks - verify against the repository.
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    reverse = False
    # s1 is the transcript that starts first (ties go to trans1)
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # assigned but not used below
    else:
        s1 = t2
        s2 = t1
        reverse = True
        s2_beg = min(starts1)  # assigned but not used below
    # torder holds the transcript names in (s1, s2) order
    if reverse:
        torder = (trans2, trans1)
    else:
        torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return ([], [])
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]), value={'anno': i[2]}))
    matching_exons = []
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # maps an s2 exon number to the s1 exon number it overlapped
    exon_match = {}
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    max_exon_1 = s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None
    # strand is -1 when exon numbers decrease along the position-sorted s1
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            if prev_match and (start < s1_end):
                #skipped exons
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            # next s2 exon overlaps s1: skip runs up to it
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # otherwise use the s1 exon following prev_match
                            nm = s1[_get_by_exonn(
                                prev_match.value['anno'] + strand, s1)]
                            ocigar = [(3, nm[0] - prev_match.end)]
                            nexon = nm[2]
                        skipped_exons.append(
                            DiffEvent('skipped_exon',
                                      start,
                                      end,
                                      torder,
                                      cigar2=cigar,
                                      cigar1=ocigar,
                                      exon_num=(None, exon_n),
                                      exon2=(prev_match.value['anno'], nexon)))
                except KeyError:
                    # Multiple skipped exons
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end:
                # s2 exon lies beyond the last s1 exon: alternate end
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    for i in range(pexon, max_exon_1 + strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg + 1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(
                        DiffEvent('AE',
                                  start,
                                  end,
                                  torder,
                                  cigar2=cigar,
                                  cigar1=ocigar,
                                  exon_num=(None, exon_n)))
                else:
                    pass
            else:
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover interactive-debugging hook; once
                    # embed() returns, nm is either unbound or left over from
                    # an earlier iteration.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                          (3, nm.start - pmatch[1])]
                altends.append(
                    DiffEvent('AS',
                              start,
                              end,
                              torder,
                              cigar2=cigar,
                              cigar1=ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # identical coordinates in both transcripts
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append(
                    (start, end, (s1_exon_n, exon_n), (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno'] == strand:
                        pass
                    else:
                        # Difference in exon matches
                        mskip = abs(s1_exon_n - prev_match.value['anno']) - 1
                        narg = _get_by_exonn(prev_match.value['anno'] + strand,
                                             s1)
                        s_s1 = s1[narg]
                        # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr - 1][1])]
                        # Remove previous one
                        skipped_exons.append(
                            DiffEvent('skipped_exon',
                                      s_s1[0],
                                      s_s1[1],
                                      torder,
                                      cigar2=ocigar,
                                      cigar1=cigar,
                                      exon_num=(s_s1[2], None),
                                      exon2=(exon_n - strand, exon_n)))
                prev_match = overlap[0]
            else:
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                if (exon_n == max_exon_2
                        and overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                        (sstart, ssend, (overlap[0].value['anno'], exon_n),
                         (overlap[0].start - start, overlap[0].end - end)))
                # Deal with partial matches
                prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # s2 exons overlapping more than one s1 exon are not handled
            pass
    skipped_exons = EventCollection(transcript_ids=[s1, s2],
                                    events=skipped_exons)
    skipped_exons.events.extend(altends)
    return (matching_exons, skipped_exons)
if isSoftClipped(read.cigar): continue if read.is_paired: if read.mate_is_unmapped: continue if read.rnext != read.tid: continue if read.is_read1 or (not options.pipe and read.is_read2 and read.pnext+read.qlen < start): if read.isize == 0: continue if options.downsample != None and random.random() >= options.downsample: continue if options.random: rstart = min(read.pos,read.pnext)+1+random.randint(-5,5) # 1-based rend = rstart+abs(read.isize)-1+random.randint(-5,5) # end included else: rstart = min(read.pos,read.pnext)+1 # 1-based rend = rstart+abs(read.isize)-1 # end included filteredReads.add_interval(Interval(rstart,rend)) #print read.qname,rstart,rend,rend-rstart,abs(read.isize) for i in range(rstart,rend+1): if i >= start and i <= end: posRange[i][0]+=1 if rstart >= start and rstart <= end: posRange[rstart][1]+=1 if rend >= start and rend <= end: posRange[rend][1]+=1 else: if options.downsample != None and random.random() >= options.downsample: continue if options.random: rstart = read.pos+1+random.randint(-5,5) # 1-based rend = rstart+aln_length(read.cigar)-1+random.randint(-5,5) # end included else: rstart = read.pos+1 # 1-based
def calculate_score(chrom, start, end, ctype="WPS"):
    """
    Compute a per-position score over the inclusive region start..end.

    Reads come from readIterator(args, chrom, start, end); duplicates,
    QC-failed, unmapped and soft-clipped reads are skipped. Paired reads
    are turned into one fragment per pair (mate on the same reference,
    non-zero insert size); unpaired reads use their aligned length.
    Fragments whose length is outside options.minLength..options.maxLength
    are ignored, and options.downsample (when not None) randomly drops
    fragments.

    ctype selects the score:
      "COV"    - number of fragments covering each position
      "STARTS" - number of fragment endpoints at each position
      "WPS"    - for a window of options.protection // 2 on each side of
                 the position: fragments spanning the whole window minus
                 fragments that overlap it only partially
    Any other value leaves every position at 0.

    Relies on the module-level names `args` and `options`.

    Returns a list with one score per position from start to end.
    """
    filteredReads = Intersecter()
    posRange = defaultdict(int)

    def dropped_by_downsampling():
        # Draw a random number only when downsampling is enabled.
        return (options.downsample is not None
                and random.random() >= options.downsample)

    def record_fragment(frag_start, frag_end):
        # Shared bookkeeping for paired and single-read fragments.
        frag_len = frag_end - frag_start + 1
        if not (options.minLength <= frag_len <= options.maxLength):
            return
        filteredReads.add_interval(Interval(frag_start, frag_end))
        if ctype == "COV":
            # Only walk the part of the fragment inside the region.
            for i in range(max(frag_start, start), min(frag_end, end) + 1):
                posRange[i] += 1
        elif ctype == "STARTS":
            if start <= frag_start <= end:
                posRange[frag_start] += 1
            if start <= frag_end <= end:
                posRange[frag_end] += 1

    for read in readIterator(args, chrom, start, end):
        if read.is_duplicate or read.is_qcfail or read.is_unmapped:
            continue
        if isSoftClipped(read.cigar):
            continue
        if read.is_paired:
            if read.mate_is_unmapped:
                continue
            if read.rnext != read.tid:
                continue
            # Count each pair once: via read1, or via read2 when its mate
            # lies entirely before the region.
            if not (read.is_read1 or
                    (read.is_read2 and read.pnext + read.qlen < start)):
                continue
            if read.isize == 0:
                continue
            if dropped_by_downsampling():
                continue
            rstart = min(read.pos, read.pnext) + 1  # 1-based
            rend = rstart + abs(read.isize) - 1  # end included
        else:
            if dropped_by_downsampling():
                continue
            rstart = read.pos + 1  # 1-based
            rend = rstart + aln_length(read.cigar) - 1  # end included
        record_fragment(rstart, rend)

    if ctype == "WPS":
        protection = options.protection // 2
        for pos in range(start, end + 1):
            win_start, win_end = pos - protection, pos + protection
            gcount, bcount = 0, 0
            for frag in filteredReads.find(win_start, win_end):
                if (frag.start > win_start) or (frag.end < win_end):
                    bcount += 1
                else:
                    gcount += 1
            posRange[pos] += gcount - bcount

    return [posRange[pos] for pos in range(start, end + 1)]
def load_BED(in_file, exclusionSize=0, chroms=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output is a pair of hash tables,
    each with one search tree per chromosome.

    in_file = input file [character]
    exclusionSize = when > 0, also index the flanks
        [start - exclusionSize, start] (named "<name>_up") and
        [end, end + exclusionSize] (named "<name>_dwn") of every interval
    chroms = if not None, only chromosomes contained in it are loaded;
        each other chromosome is reported once on stderr and skipped
    verbose = verbose mode [logical]

    Returns (x, x_ex): x maps chromosome -> tree of the BED intervals,
    x_ex maps chromosome -> tree of the exclusion flanks (empty when
    exclusionSize is not > 0).

    Blank lines are ignored. A line with fewer than four tab-separated
    columns prints a message to stderr and terminates the program with
    exit status 1. Non-integer start/end values raise ValueError.
    """
    x = {}
    x_ex = {}
    skipped_chrom = set()

    if verbose:
        print("## Loading BED file {} ...".format(in_file), file=sys.stderr)

    nline = 0
    with open(in_file) as bed_handle:
        for line in bed_handle:
            if len(line.strip()) == 0:
                continue
            if nline % 1000000 == 0 and nline != 0 and verbose:
                sys.stderr.write("{} million lines loaded\n".format(int(nline/1000000)))
            nline += 1

            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line {}. Not a BED file !?".format(nline), file=sys.stderr)
                sys.exit(1)

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            name = name.strip()

            if chroms is not None and chromosome not in chroms:
                if chromosome not in skipped_chrom:
                    print("Warning : Restrict to cis interactions - {} skipped".format(chromosome), file=sys.stderr)
                    skipped_chrom.add(chromosome)
                continue

            # Create the per-chromosome tree on first use, then insert.
            if chromosome not in x:
                x[chromosome] = Intersecter()
            x[chromosome].add_interval(Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                if chromosome not in x_ex:
                    x_ex[chromosome] = Intersecter()
                tree_ex = x_ex[chromosome]
                tree_ex.add_interval(Interval(start - flank, start, value={'name': str(name) + "_up"}))
                tree_ex.add_interval(Interval(end, end + flank, value={'name': str(name) + "_dwn"}))

    return (x, x_ex)
def build_resFragBin_tree(in_file, resolution=None, verbose=False):
    """
    Build restriction fragment bin trees at the user defined resolution,
    e.g. resolution = 10 groups 10 RE fragments into one bin.

    in_file = BED-like file (tab separated: chromosome, start, end); the
        verbose message describes it as ordered
    resolution = number of consecutive lines forming one bin; converted
        with int(). When None, a usage message is printed and the program
        exits via sys.exit().
    verbose = verbose mode [logical]

    Returns a hash table with one search tree per chromosome. Every stored
    interval spans a bin [startBin, endBin] and carries
    value={'midPoint': round((startBin + endBin) / 2)}.

    A line that cannot be parsed (fewer than three columns or non-integer
    coordinates) is reported with a warning and skipped; it still counts
    towards the line counter used for binning, but never alters the
    coordinates of the bin being built.
    """
    binTree = {}

    if resolution is None:
        print(
            "Please provide suitable resolution to bin contact! For example resolution = 10 for 10 RE fragment as one bin."
        )
        sys.exit()
    resolution = int(resolution)

    if verbose:
        print("## Building RE fragment bins tree from ordered'" + in_file + "'...")

    def add_bin(bin_chrom, bin_start, bin_end):
        # Insert one finished bin, creating the chromosome tree on demand.
        mid_bin = round((bin_start + bin_end) / 2)
        if bin_chrom not in binTree:
            binTree[bin_chrom] = Intersecter()
        binTree[bin_chrom].add_interval(
            Interval(bin_start, bin_end, value={'midPoint': mid_bin}))

    nline = 0
    seen_valid_line = False

    # The context manager closes the file even if parsing raises.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chrom_field, start_field, end_field = bedtab[:3]
                parsed_start = int(start_field)
                parsed_end = int(end_field)
            except ValueError:
                # str() is required: concatenating the int counter directly
                # raised TypeError instead of emitting the warning.
                print("Warning : wrong input format in line " + str(nline) +
                      ". Not a BED file !?")
                continue

            # Commit the parsed values only once the whole line is valid so a
            # bad line cannot leave strings in start/end.
            chromosome, start, end = chrom_field, parsed_start, parsed_end

            if not seen_valid_line:
                # First usable line: open the first bin.
                startBin = start
                chromosomeBin = chromosome
                seen_valid_line = True
            elif chromosomePre != chromosome:
                # Start of another chromosome: restart the line counter and
                # close the bin left open on the previous chromosome.
                nline = 1
                if startBin != endPre:
                    add_bin(chromosomeBin, startBin, endPre)
                startBin = start
                chromosomeBin = chromosome
            elif nline % resolution == 0:
                # Enough fragments collected: close the bin at this fragment's
                # end and open the next one there.
                add_bin(chromosomeBin, startBin, end)
                startBin = end
                chromosomeBin = chromosome

            # Remember the previous fragment for the chromosome-change branch.
            endPre = end
            chromosomePre = chromosome

    # for the last bin (skipped when no valid line was read, in which case the
    # bin variables were never assigned)
    if (seen_valid_line and nline % resolution != 0 and startBin != start
            and chromosomeBin == chromosome):
        add_bin(chromosomeBin, startBin, end)

    return binTree
continue ## only want proper pair reads if read.is_proper_pair: ## if read is proper pair, add entire PE read start/end to list if abs( read.template_length ) < 10000: ## precaution for weird reads - sometimes have huge length if read.is_read1: ## get start/end of read read_start = min(read.reference_start, read.next_reference_start) + 1 read_end = read_start + abs(read.template_length) ## add start/end to interval list start_end_list.add_interval(Interval(read_start, read_end)) ## get last position from last read final_pos = min(read.reference_start, read.next_reference_start) + abs( read.template_length) if options.region != None: chrom = options.region.split(':')[0] start, end = map(int, options.region.split(':')[1].split('-')) else: chrom = read.reference_name start, end = init_pos, final_pos window_size = 120 ## define the window size (could be made an option) prot_region = window_size // 2 ## definitely a parameter worth messing with
def generate(x):
    """Generates random interval over a size and span.

    The argument is ignored; it only lets the function be applied to each
    element of a range. Returns ``(lo, hi)`` with ``lo`` drawn from
    ``[10000, SIZE]`` and ``hi - lo`` between 1 and 10**4.
    """
    lo = randint(10000, SIZE)
    hi = lo + randint(1, randint(1, 10**4))
    return (lo, hi)


def generate_point(x):
    """Generates a random zero-length interval ``(lo, lo)``.

    The argument is ignored, as in ``generate``.
    """
    lo = randint(10000, SIZE)
    return (lo, lo)


# use this to force both examples to generate the same data
seed(10)

# generate N random intervals (built eagerly so that all of them are drawn
# before any query point, whatever the Python version)
data = [generate(i) for i in range(N)]

# generate the intervals to query over
query = [generate_point(i) for i in range(1000)]

# create the interval tree
tree = Intersecter()

# build an interval tree from the rest of the data
for start, end in data:
    tree.add_interval(Interval(start, end))

# perform the query; both members of a query pair are the same point
for q, _ in query:
    overlap = tree.find(q, q)
    out = [(hit.start, hit.end) for hit in overlap]
    print('(%s) -> %s' % (q, out))