def process_file(chr_features, pc_file):
    """Stream a fixedStep score file and intersect every block with features.

    The file is read line by line.  A line starting with ``fixedStep`` opens a
    new block; its second and third whitespace-separated tokens supply the
    chromosome and start (the first six characters of each token, i.e.
    ``chrom=`` / ``start=``, are stripped).  Every other line is parsed as one
    float score of the current block.  Each completed block is handed to
    ``intersect_scores`` together with the feature tree of its chromosome (an
    empty ``IntervalTree`` when the chromosome has no features).

    :param chr_features: mapping of chromosome name to interval tree
    :param pc_file: path to the score file; ``pc_file + '.gz'`` is tried
        when the plain path does not exist
    :raises IOError: if neither ``pc_file`` nor ``pc_file + '.gz'`` exists
    """
    if pc_file[-2:] == 'gz':
        pc_f = gzip.open(pc_file)
    elif os.path.isfile(pc_file):
        pc_f = open(pc_file)
    elif os.path.isfile(pc_file + '.gz'):
        pc_f = gzip.open(pc_file + '.gz')
    else:
        # Previously this fell through and failed later with an unbound name.
        raise IOError('Cannot find %s or %s.gz' % (pc_file, pc_file))

    # Until the first fixedStep header is seen, the chromosome name is taken
    # from the file name (everything before the first dot).
    chrom = os.path.split(pc_file)[1].split('.')[0]
    sys.stderr.write('Processing %s ... ' % chrom)

    block_start = 0
    block_scores = []
    try:
        for line in pc_f:
            if line.startswith('fixedStep'):
                # Flush the block collected so far before starting a new one.
                if block_scores:
                    intersect_scores(chr_features.get(chrom, IntervalTree()),
                                     block_start, block_scores)
                a = line.split()
                chrom = a[1][6:]
                block_start = int(a[2][6:])
                block_scores = []
            else:
                block_scores.append(float(line.rstrip()))

        # Flush the final block.
        intersect_scores(chr_features.get(chrom, IntervalTree()),
                         block_start, block_scores)
    finally:
        # Close the handle even when parsing or intersecting fails.
        pc_f.close()

    sys.stderr.write('Done\n')
def __init__(self, gtf_filename):
    """Set up empty lookup structures, then populate them via ``readGTF``.

    :param gtf_filename: path handed to ``self.readGTF``
    """
    self.gtf_filename = gtf_filename

    # chr --> IntervalTree --> (0-start, 1-end, transcript ID)
    self.genome = defaultdict(IntervalTree)

    # tID --> IntervalTree --> (0-start, 1-end, {'ith': i-th exon, 'eID': exon ID})
    self.transcript = defaultdict(IntervalTree)

    # (0start,1end) --> list of (tID, ith-exon, chr)
    self.exon = defaultdict(list)

    # tID --> chr
    self.transcript_info = {}

    self.readGTF(self.gtf_filename)
def load_exons_and_genes(genesF):
    """Build per-reference interval trees of exons and of gene (cluster) spans.

    Each data line of the tab-separated file is split into fields; the first
    six are ``cluster_id, tname, refid, strand, tstart, tend``, fields 9 and 10
    hold comma-separated exon starts and ends, and field 12 an alignment id.
    A leading ``chr`` is removed from the reference name.

    :param genesF: path to the tab-separated gene/transcript table
    :return: tuple ``(etrees, gtrees)``; both map reference name to an
        ``IntervalTree``.  Exon intervals carry
        ``[cluster_id, alignID, strand]``; gene intervals carry ``cluster_id``
        and cover the widest start/end seen over a cluster's transcripts.
    """
    etrees = {}
    # cluster_id -> [smallest start, largest end, refid of first transcript]
    t_to_gene_map = {}

    # "with" guarantees the handle is released even if a line fails to parse.
    with open(genesF) as genesIN:
        for line in genesIN:
            # skip header
            # NOTE(review): this skips ANY line starting with 'c', not only
            # the header - confirm no cluster id starts with that letter.
            if line[0] == 'c':
                continue
            fields = line.rstrip().split('\t')
            (cluster_id, _tname, refid, strand, tstart, tend) = fields[:6]
            refid = refid.replace("chr", "")
            eStarts = fields[9].split(',')
            eEnds = fields[10].split(',')
            alignID = fields[12]

            # now save the exons as intervals
            if refid not in etrees:
                etrees[refid] = IntervalTree()

            # use 1-based closed interval
            tstart = int(tstart) + 1
            for (eStart, eEnd) in zip(eStarts, eEnds):
                # a trailing comma in the list yields an empty last element
                if not eStart:
                    continue
                # use 1-based closed interval
                eStart = int(eStart) + 1
                # must adjust for the open intervals (both) of the interval tree
                itv = Interval(eStart - 1, int(eEnd) + 1,
                               value=[cluster_id, alignID, strand])
                etrees[refid].insert_interval(itv)

            # now map to the cluster_id and widen the cluster's coordinate
            # span if this transcript extends it
            tend = int(tend)
            span = t_to_gene_map.get(cluster_id)
            if span is None:
                t_to_gene_map[cluster_id] = [tstart, tend, refid]
            else:
                if tstart < span[0]:
                    span[0] = tstart
                if tend > span[1]:
                    span[1] = tend

    # now convert the cluster (gene) coordinate extents to intervals
    gtrees = {}
    # .items() works on both Python 2 and Python 3 (iteritems does not).
    for (cluster_id, (st, en, refid)) in t_to_gene_map.items():
        if refid not in gtrees:
            gtrees[refid] = IntervalTree()
        # must adjust for the open intervals (both) of the interval tree
        itv = Interval(int(st) - 1, int(en) + 1, value=cluster_id)
        gtrees[refid].insert_interval(itv)

    return (etrees, gtrees)
def main(argv):
    """reads in two haploid bam files so odd ploidy can be modelled

    Reads the command-line options, loads the truth-set copy-number calls,
    optionally loads the somatic SNVs to force into the reads, and then
    builds the synthetic BAM file.

    :param argv: not used; options come from ``get_commandline_options()``
    """
    # get input params
    options = get_commandline_options()
    output_bam_file = options.output_bam_file
    bam_file_haplotype1 = options.bam_file_haplotype1
    bam_file_haplotype2 = options.bam_file_haplotype2
    query_chr = options.query_chr
    truth_file = options.truth_file
    mutate_somatic_variants = options.mutate_somatic_variants
    somatic_snv_files = options.somatic_snv_files
    clonal_percs = options.clonal_percs
    subsample_somatic_snvs = options.subsample_somatic_snvs
    ploidy_depth = options.ploidy_depth
    bam_depth = options.bam_depth

    truth_set_cn_calls = read_in_truth_set(truth_file)

    # os.path.join keeps the directory next to the output BAM.  Plain string
    # concatenation produced "/forced_somatic_snv_frequencies" (filesystem
    # root) whenever the output path had no directory component.
    snv_out_dir = os.path.join(os.path.dirname(output_bam_file),
                               "forced_somatic_snv_frequencies")

    somatic_variants = IntervalTree()

    # NOTE(review): a seed of 0 is falsy and therefore ignored here - confirm
    # whether that is intended before changing it.
    if options.rand_seed:
        random.seed(options.rand_seed)

    if mutate_somatic_variants:
        somatic_variants = read_in_somatic_vcf_file(
            somatic_snv_files, clonal_percs, query_chr, truth_set_cn_calls,
            snv_out_dir, subsample_somatic_snvs)
    # somatic_indel_file = '' if not defined and not used anyway at the mo

    create_synthetic_bam(bam_file_haplotype1, bam_file_haplotype2,
                         output_bam_file, query_chr, mutate_somatic_variants,
                         somatic_variants, truth_set_cn_calls, ploidy_depth,
                         bam_depth)
    print("finished first haplotype")
def load_repeats(repeatsF):
    """Load repeat annotations into one interval tree per reference.

    The first line of the tab-separated file is treated as a header and
    skipped.  For every other line, fields 5-7 give the reference name, start
    and end, and fields 10 and 11 the two values stored on the interval.

    :param repeatsF: path to the tab-separated repeats table
    :return: dict mapping reference name to an ``IntervalTree`` whose
        intervals carry ``[tname, gtype]``
    """
    rtrees = {}
    seen = set()

    # "with" guarantees the handle is released even if a line fails to parse.
    with open(repeatsF) as repeatsIN:
        # skip header
        next(repeatsIN, None)
        for line in repeatsIN:
            fields = line.rstrip().split('\t')
            (refid, st, en) = fields[5:8]
            tname = fields[10]
            gtype = fields[11]

            # present already in interval tree
            # The reference name is part of the key: trees are per reference,
            # so equal coordinates on another reference are NOT duplicates.
            key = (refid, st, en)
            if key in seen:
                continue
            seen.add(key)

            if refid not in rtrees:
                rtrees[refid] = IntervalTree()
            itv = Interval(int(st), int(en), value=[tname, gtype])
            rtrees[refid].insert_interval(itv)

    return rtrees
def add_intervals(genes, exons):
    '''Insert every exon into the interval tree of its chromosome.

    The chromosome is read from ``exon.value['chrom']``; a new
    ``IntervalTree`` is stored in ``genes`` the first time a chromosome is
    seen.  ``genes`` is modified in place and nothing is returned.
    '''
    for exon in exons:
        chrom = exon.value['chrom']
        if chrom not in genes:
            genes[chrom] = IntervalTree()
        tree = genes[chrom]
        tree.insert_interval(exon)
def BED_to_interval_tree(BED_file):
    """
    Creates an index of intervals, using an interval tree, for each BED entry

    Lines starting with ``#`` and blank lines are skipped.  Regions that
    overlap an earlier region are NOT filtered out (an earlier check for
    that is disabled).

    :param BED_file: file handler of a BED file
    :return: dict mapping chromosome name to its interval tree
    """
    from bx.intervals.intersection import IntervalTree, Interval

    bed_interval_tree = {}
    for line in BED_file:
        if line.startswith("#"):
            continue
        fields = line.split()
        # A blank (or whitespace-only) line has no fields; indexing it used
        # to raise IndexError.
        if not fields:
            continue
        chrom, start_bed, end_bed = fields[0], int(fields[1]), int(fields[2])

        if chrom not in bed_interval_tree:
            bed_interval_tree[chrom] = IntervalTree()

        bed_interval_tree[chrom].add_interval(Interval(start_bed, end_bed))

    return bed_interval_tree
def intersect_scores(chr_features, interval2lnc, lnc_cons, chrom, block_start, block_scores):
    """Append the overlapping part of a score block to each transcript.

    The block covers positions ``block_start`` to
    ``block_start + len(block_scores) - 1``.  For every feature interval
    returned by ``find`` for that range, the slice of ``block_scores`` lying
    inside the interval (both interval ends treated as inclusive) is appended
    to ``lnc_cons[tid]`` for each transcript id listed in
    ``interval2lnc[(chrom, interval.start, interval.end)]``.

    :param chr_features: mapping of chromosome name to interval tree
    :param interval2lnc: mapping ``(chrom, start, end)`` to iterable of ids
    :param lnc_cons: mapping of id to list of scores; extended in place
    :param chrom: chromosome of the block
    :param block_start: coordinate of the first score in the block
    :param block_scores: list of scores, one per position
    """
    features = chr_features.get(chrom)
    if features is None:
        # No features on this chromosome: nothing can overlap.  (Avoids
        # building a throwaway empty tree on every call.)
        return

    n_scores = len(block_scores)
    for overlap_interval in features.find(block_start, block_start + n_scores):
        # Clip the interval to the block, expressed as slice offsets into
        # block_scores.  This single clamp covers all four cases: block
        # inside interval, interval inside block, overlap on the left and
        # overlap on the right.
        start = max(0, overlap_interval.start - block_start)
        end = min(n_scores, overlap_interval.end - block_start + 1)

        key = (chrom, overlap_interval.start, overlap_interval.end)
        for tid in interval2lnc[key]:
            lnc_cons[tid] += block_scores[start:end]
def create_inttree_from_file(infile):
    """Index annotation intervals by chromosome.

    Every line is split on whitespace; the first four fields are used as
    chromosome, start, stop and name.

    Args:
        infile: handle of open BED file with annotations
    Return:
        dictionary {chromosome name : interval tree with coordinates}
    """
    trees_by_chrom = {}
    for raw_line in infile:
        fields = raw_line.strip().split()
        chrom = fields[0]
        start = int(fields[1])
        stop = int(fields[2])
        name = fields[3]

        # first time we encounter a chromosome, create its interval tree
        if chrom not in trees_by_chrom:
            trees_by_chrom[chrom] = IntervalTree()

        # add interval to the chromosome's tree
        trees_by_chrom[chrom].add(start, stop, name)
    return trees_by_chrom
def index_gff3(gff3_file_path):
    """Index the features of a GFF3 file in one interval tree per seqid.

    Reads the module-level ``args``: rows whose type column differs from
    ``args.tag`` are skipped when a tag is set.  When neither
    ``args.attribute`` nor ``args.join`` is set the whole row is stored on
    the interval; otherwise the stored value is the comma-joined list of the
    values of those attributes found in column 9.

    Based on the example at
    https://malariageninformatics.wordpress.com/2011/07/07/using-interval-trees-to-query-genome-annotations-by-position/

    :param gff3_file_path: path of the GFF3 file
    :return: dict mapping seqid to ``IntervalTree``
    """
    # dictionary mapping chromosome names to interval trees
    genome = dict()

    # parse the annotations file (GFF3) and build the interval trees
    gff = pd.read_csv(gff3_file_path, sep="\t", header=None, comment="#")

    # These do not change while iterating, so decide once.
    store_whole_row = args.attribute is None and args.join is None

    for idx, row in gff.iterrows():
        if args.tag is not None and row[2] != args.tag:
            continue
        seqid = row[0]
        start = int(row[3])
        end = int(row[4])

        # one interval tree per chromosome
        if seqid not in genome:
            # first time we've encountered this chromosome
            genome[seqid] = IntervalTree()
        tree = genome[seqid]

        # index the feature
        if store_whole_row:
            tree.add(start, end, row)
            continue

        selected = []
        for pair in row[8].split(";"):
            # partition splits on the FIRST '=' only, so values containing
            # '=' are kept intact; entries without '=' (e.g. the empty
            # string after a trailing ';') are skipped instead of raising
            # ValueError on unpacking.
            key, sep, value = pair.partition("=")
            if not sep:
                continue
            if key == args.attribute or key == args.join:
                selected.append(value)
        tree.add(start, end, ",".join(selected))

    return genome
def load_repeats(repeatsF):
    """Load repeat annotations into one interval tree per reference.

    The first line of the tab-separated file is treated as a header and
    skipped.  For every other line, fields 5-7 give the reference name (a
    leading ``chr`` is removed), start and end; fields 9, 10 and 11 supply
    the strand and the two other values stored on the interval.

    :param repeatsF: path to the tab-separated repeats table
    :return: dict mapping reference name to an ``IntervalTree`` whose
        intervals carry ``[tname, gtype, strand]``
    """
    rtrees = {}
    seen = set()

    # "with" guarantees the handle is released even if a line fails to parse.
    with open(repeatsF) as repeatsIN:
        # skip header
        next(repeatsIN, None)
        for line in repeatsIN:
            fields = line.rstrip().split('\t')
            (refid, st, en) = fields[5:8]
            refid = refid.replace("chr", "")
            strand = fields[9]
            tname = fields[10]
            gtype = fields[11]

            # use 1-based closed interval
            st = int(st) + 1

            # present already in interval tree
            # The reference name is part of the key: trees are per reference,
            # so equal coordinates on another reference are NOT duplicates.
            key = (refid, st, en)
            if key in seen:
                continue
            seen.add(key)

            if refid not in rtrees:
                rtrees[refid] = IntervalTree()
            # must adjust for the open intervals (both) of the interval tree
            itv = Interval(st - 1, int(en) + 1, value=[tname, gtype, strand])
            rtrees[refid].insert_interval(itv)

    return rtrees
def read_in_somatic_vcf_file(somatic_snv_files, clonal_percs, query_chr, truth_set_cn_calls, output_dir, subsample_somatic_snvs):
    """ read clonal somatic SNV vcf files

    Every non-header line of each VCF is kept with probability governed by
    ``subsample_somatic_snvs`` (a line is dropped when a uniform random draw
    exceeds it), restricted to ``query_chr``, randomly assigned to one of the
    two haplotypes, and given a frequency of
    ``assign_freq_based_on_ploidy(region copy number) * clonal_perc``.  The
    copy number comes from the last truth-set region containing the
    position and defaults to 2.

    The selected variants are also pretty-printed and written as JSON to
    ``<output_dir>/forced_somatic_snv_frequencies_<query_chr>.json``.

    :param somatic_snv_files: VCF paths, paired by position with clonal_percs
    :param clonal_percs: clonal fraction applied to each file's variants
    :param query_chr: only variants on this chromosome are used
    :param truth_set_cn_calls: mapping of chromosome to a list of region
        dicts with 'start', 'end', 'firsthaplotype_CN', 'secondhaplotype_CN'
    :param output_dir: directory for the JSON summary (must already exist)
    :param subsample_somatic_snvs: fraction of variant lines to keep
    :return: ``IntervalTree`` with one point interval per selected variant
    """
    expected_format = 'DP:FDP:SDP:SUBDP:AU:CU:GU:TU'
    json_path = os.path.join(
        output_dir,
        'forced_somatic_snv_frequencies_' + str(query_chr) + '.json')

    h = IntervalTree()
    h2 = {}

    # Context managers make sure the JSON file is flushed/closed and every
    # VCF handle is released, also when sys.exit() or a parse error occurs.
    with open(json_path, 'w') as fsf:
        print("ri ", query_chr)
        for (somatic_snv_file, clonal_perc) in zip(somatic_snv_files, clonal_percs):
            # for now just do SNVs as adding in indels involve increasing the
            # size of reads which could cause issues; thinking about it
            # probably wouldnt - quite faffy though
            with open(somatic_snv_file, 'r') as FH:
                for line in FH:
                    if line.startswith('#'):
                        continue

                    # Drawn for every variant line BEFORE the chromosome
                    # filter, so the sequence of random numbers for a given
                    # seed is unchanged.
                    if random.random() > subsample_somatic_snvs:
                        continue

                    # Exactly eleven columns are required; names with a
                    # leading underscore are not used (and no longer shadow
                    # the builtins id / filter / format).
                    (chrom, pos, _id, ref, alt, _qual, _filter, _info,
                     vcf_format, _normal, tumor) = line.strip().split()
                    pos = int(pos)
                    if chrom != query_chr:
                        continue
                    if vcf_format != expected_format:
                        sys.exit('vcf format not the usual' + expected_format)
                    print("tumor ", tumor)
                    # The per-allele counts in the tumor column are currently
                    # not used; the former parsing of them was dead code that
                    # could fail on a reference base outside A/C/G/T.

                    if random.random() > 0.5:
                        somatic_haplotypeCN = 'firsthaplotype_CN'
                    else:
                        somatic_haplotypeCN = 'secondhaplotype_CN'
                    print("pos ", pos, "shcn ", somatic_haplotypeCN, " r ")

                    # Copy number of the chosen haplotype; the last region
                    # containing the position wins, default is 2.
                    region_CN = 2
                    for region in truth_set_cn_calls[query_chr]:
                        if region['start'] <= pos <= region['end']:
                            region_CN = region[somatic_haplotypeCN]

                    somatic_mutation_freq = float(assign_freq_based_on_ploidy(region_CN))
                    somatic_mutation_freq *= float(clonal_perc)

                    record = {'pos': pos, 'ref': ref, 'alt': alt, 'line': line,
                              'freq': somatic_mutation_freq,
                              'somatic_haplotype': somatic_haplotypeCN}
                    # theoretically bug: could have snv and indel at same pos
                    # also bug if snp or indel last/first on read
                    h.add(pos, pos, record)
                    # separate copy so the tree and the summary do not share
                    # one mutable dict
                    h2[pos] = dict(record)

        pprint.pprint(h2)
        json.dump(h2, fsf, indent=4, sort_keys=True)

    return h
def setUp(self):
    """Build a tree of width-9 intervals starting at 0, 10, ..., 100.

    The interval starting at 50 is inserted first; the rest follow in
    ascending order.  The tree is stored as ``self.intervals``.
    """
    tree = IntervalTree()
    starts = [50] + [s for s in range(0, 110, 10) if s != 50]
    for s in starts:
        tree.add_interval(Interval(s, s + 9))
    self.intervals = tree
def main(argv):
    """Load the peaks of a wig file into an interval tree and report on a BED file.

    ``argv[1]`` is the BED file and ``argv[2]`` the wig file.  Every peak
    yielded by ``parse_wig`` is inserted into one tree, which is then passed
    to ``report`` together with the BED file.
    """
    bedfile = argv[1]
    wigfile = argv[2]

    peaks = IntervalTree()
    for peak in parse_wig(wigfile):
        peaks.insert_interval(peak)

    report(peaks, bedfile)
def test_empty(self):
    """Every query on an empty tree yields an empty list; traverse yields None."""
    tree = IntervalTree()

    self.assertEqual([], tree.find(100, 300))

    # All single-position queries are expected to come back empty.
    point_queries = (
        tree.after,
        tree.before,
        tree.after_interval,
        tree.before_interval,
        tree.upstream_of_interval,
        tree.downstream_of_interval,
    )
    for query in point_queries:
        self.assertEqual([], query(100))

    self.assertEqual(None, tree.traverse(lambda x: x.append(1)))
def add(self, chrom, element):
    """insert an element. use this method as the IntervalTree one.
    this will simply call the IntervalTree.insert_interval method on the
    right tree, creating the tree for ``chrom`` on first use

    :param chrom: chromosome
    :param element: the argument of IntervalTree.insert_interval
    :return: None
    """
    # Only build a new tree when the chromosome is really new; setdefault
    # with an IntervalTree() argument allocated a tree on every single call.
    if chrom not in self._trees:
        self._trees[chrom] = IntervalTree()
    self._trees[chrom].insert_interval(element)
def btab_reclist_to_interval_list_0basedStart(recs):
    """
    Return chr, list of IntervalNode

    Each record contributes the interval ``(rStart1 - 1, rEnd1)``.  The list
    is whatever ``IntervalTree.traverse`` visits; the chromosome is taken
    from the first record, so an empty ``recs`` raises IndexError.
    """
    tree = IntervalTree()
    for rec in recs:
        start0 = rec['rStart1'] - 1
        tree.insert(start0, rec['rEnd1'])

    nodes = []
    tree.traverse(nodes.append)

    return recs[0]['chr'], nodes
def insert(self, chrom, start, end, gene_id, gene_name):
    """Store a gene interval in the tree of ``chrom``.

    A new ``IntervalTree`` is created for the chromosome on first use.  The
    value stored on the interval is
    ``MyInterval(start, end, [gene_id, gene_name])``.
    """
    from bx.intervals.intersection import Interval
    from bx.intervals.intersection import IntervalNode
    from bx.intervals.intersection import IntervalTree

    if chrom not in self.chroms:
        self.chroms[chrom] = IntervalTree()

    payload = MyInterval(start, end, [gene_id, gene_name])
    self.chroms[chrom].insert(start, end, payload)
def btab_reclist_to_interval_list_0basedStart(recs):
    """
    Return chr, list of IntervalNode

    Every record is inserted as ``(rStart1 - 1, rEnd1)``; the returned list
    holds what ``IntervalTree.traverse`` visits.  The sequence name comes
    from the first record (IndexError when ``recs`` is empty).
    """
    interval_tree = IntervalTree()
    for record in recs:
        interval_tree.insert(record["rStart1"] - 1, record["rEnd1"])

    visited = []
    interval_tree.traverse(visited.append)

    first_record = recs[0]
    return first_record["chr"], visited
def main():
    """Index gene coordinates per chromosome and look up overlaps for a query file.

    ``sys.argv[1]`` is passed to ``parse_gene_coordinate`` and
    ``sys.argv[2]`` to ``find_overlap``.
    """
    infile = sys.argv[1]
    qfile = sys.argv[2]

    genes = {}
    for chrom, gene in parse_gene_coordinate(infile):
        try:
            tree = genes[chrom]
        except KeyError:
            # first gene on this chromosome
            tree = genes[chrom] = IntervalTree()
        tree.insert_interval(gene)

    find_overlap(genes, qfile)
def get_nearest_gene_intervall_tree(depict_gene_annotation_file, depictgenes):
    """Index the transcription start site of selected genes per chromosome.

    The first line of the tab-separated annotation file is skipped.  A line
    is used when its gene id (column 0) is in ``depictgenes`` and its
    chromosome (column 6) is one of "1".."22".  The indexed position is
    column 2 when column 1 equals '1', otherwise column 3.

    :param depict_gene_annotation_file: path of the annotation file
    :param depictgenes: container of gene ids to index
    :return: dict mapping "1".."22" to an ``IntervalTree`` of zero-length
        intervals whose value is the gene id
    """
    ens_col = 0
    chr_col = 6
    str_col = 1
    sta_col = 2
    end_col = 3

    # One tree per autosome; the keys double as the set of accepted
    # chromosome names, so the list is no longer rebuilt for every line.
    trees = {}
    for i in range(1, 23):
        trees[str(i)] = IntervalTree()

    with open(depict_gene_annotation_file, 'r') as infile:
        # skip header
        next(infile, None)
        # Iterate lazily instead of loading the whole file with readlines().
        for line in infile:
            words = line.strip().split('\t')
            # Gene id is tested first so that short lines for unwanted
            # genes never touch the chromosome column.
            if words[ens_col] not in depictgenes or words[chr_col] not in trees:
                continue
            if words[str_col] == '1':
                tss = int(words[sta_col])
            else:
                tss = int(words[end_col])
            trees[words[chr_col]].insert_interval(
                Interval(tss, tss, value=words[ens_col]))

    return trees
def index_genes(G, window=0): G = G.GroupBy(_.seqname).Sort(_.start) G = G.Get(_.seqname, _.name, _.start, _.end).Flat() chrs = {} for (seqname, name, start, end) in zip(*G()): if seqname not in chrs: chrs[seqname] = IntervalTree() #fi chrs[seqname].add(start - window, end + window, (name, start, end)) print seqname, start, end, name #efor return chrs
def read_genes(filename):
    """Index genes from a tab-separated table in one tree per chromosome.

    The table needs the columns ``chromosome``, ``GRCh37 start``,
    ``GRCh37 end``, ``symbol`` and ``tier``.  Rows in which one of them is
    missing or not numeric where a number is required are skipped.

    :param filename: path of the tab-separated file (with header line)
    :return: dict mapping chromosome to an ``IntervalTree`` whose intervals
        carry ``(symbol, tier)``
    """
    chroms = {}
    with open(filename) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            # Only the parsing is best-effort.  A bare "except: pass" around
            # the whole body also hid real errors (and KeyboardInterrupt).
            try:
                chrom = row['chromosome']
                start = int(row['GRCh37 start'])
                end = int(row['GRCh37 end'])
                symbol = row['symbol']
                tier = int(row['tier'])
            except (KeyError, ValueError, TypeError):
                # KeyError: column absent; ValueError: non-numeric text;
                # TypeError: short row, DictReader filled the cell with None.
                continue

            if chrom not in chroms:
                chroms[chrom] = IntervalTree()
            chroms[chrom].insert(start, end, (symbol, tier))
    return chroms
def setUp(self):
    """Fill a tree with four intervals per step using all four insert APIs.

    For each ``i`` in ``range(1, 1000, 80)`` intervals are added at
    ``i``, ``i + 20``, ``i + 40`` and ``i + 60`` (each 10 wide).  The tree
    is stored as ``self.intervals`` and ``self.iv``; the number of inserted
    intervals as ``self.nintervals``.
    """
    tree = IntervalTree()
    count = 0
    for i in range(1, 1000, 80):
        square = i * i

        # positional start/end plus an arbitrary payload
        tree.insert(i, i + 10, {'value': square})
        # add is synonym for insert.
        tree.add(i + 20, i + 30, {'astr': str(square)})

        # or insert/add an interval object with start, end attrs.
        third = Interval(i + 40, i + 50, value={'astr': str(square)})
        tree.insert_interval(third)
        fourth = Interval(i + 60, i + 70, value={'astr': str(square)})
        tree.add_interval(fourth)

        count += 4

    self.iv = tree
    self.intervals = tree
    self.nintervals = count
def merge_gene_into_cluster(args):
    """
    Merge external genes into clusters of genes from clusterGenes

    Loads the clustered genes, groups them into ``Cluster`` objects, indexes
    the clusters per (chrom, strand) in interval trees, assigns the external
    genes to clusters, and writes the mapping.  Progress is printed.
    """
    opts = parse_args(args)
    f_gl = opts.f_gl
    f_gl_gene = opts.f_gl_gene
    f_ext_gene = opts.f_ext_gene
    f_out = opts.f_out
    f_out_no_overlap = opts.f_out_no_overlap

    print("Loading gl gene ...")
    gl_gene_dict = load_gene(f_gl_gene)

    print("Loading gl ...")
    cluster_dict = dict()
    for cluster, gene in load_gl(f_gl):
        if cluster not in cluster_dict:
            cluster_dict[cluster] = Cluster(cluster)
        assert gene in gl_gene_dict, "Cannot find {0} in {1}".format(
            gene, f_gl_gene)
        cluster_dict[cluster].add_gene(gl_gene_dict[gene])
    cluster_list = list(cluster_dict.values())

    # Build Chrom:Strand IntervalTree
    ctree = dict()
    for cluster in cluster_list:
        cluster.build_exon_block()
        key = (cluster.chrom, cluster.strand)
        tree = ctree.get(key)
        if key not in ctree:
            tree = ctree[key] = IntervalTree()
        tree.insert(cluster.start, cluster.end, cluster)

    print("Loading external gene ...")
    ext_gene = list(load_gene(f_ext_gene).values())

    print("Assigning gene into clusters ...")
    # The no-overlap file is written while assigning; the mapping afterwards.
    with open(f_out_no_overlap, "w") as no_overlap_handle:
        assign_gene_to_cluster(ext_gene, ctree, no_overlap_handle)

    with open(f_out, "w") as mapping_handle:
        for cluster in cluster_list:
            cluster.write_mapping(mapping_handle)
def plot_coverage(coords, bams):
    '''Given the name of a DNA coordinates file and a list of bam file names,
    plot the read alignment coverage for each bam file for each coordinate.
    One graph per coordinate will be generated. The coverage for each
    BAM file for a given coordinate will be plotted on the same graph.
    The coordinates file should be in TSV format.

    Coordinates are 1-based and inclusive at both ends.
    '''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end + 1)
        for bam_filename in bams:
            # interval tree tracks the start and end mapped coordinates
            # of each read in the bam file that lies within our region
            # of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # Collect all the reads from the BAM file which lie in
                # the region of interest.
                # fetch uses 0-based, half-open coordinates while our input
                # is 1-based and inclusive, so [start, end] becomes
                # [start - 1, end).  Using end - 1 as the stop dropped reads
                # that only overlap the last base of the region.
                reads = bam.fetch(chrom, start - 1, end)
                # Insert the start and end of each aligned read into the
                # interval tree.
                for read in reads:
                    # Read the aligned positions once per read.
                    positions = read.positions
                    if len(positions) > 0:
                        # Add 1 to convert from 0-based to 1-based coordinates
                        first_pos = positions[0] + 1
                        last_pos = positions[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest,
            # count the number of reads which overlap this position.
            # This computes the coverage for each position in the region.
            counts = [len(interval_tree.find(pos, pos))
                      for pos in coords_range]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
def make_intervals(hindiii_genome):
    '''
    Index HindIII fragments in one interval tree per chromosome.

    Need to convert to 0-based for bx-python overlaps: each fragment is
    added as (start - 1, end) with its fragment_id as the value.
    Returns a dict mapping chromosome to IntervalTree.
    '''
    trees_by_chrom = dict()
    for fragment in hindiii_genome.values():
        chrom = fragment.chrom
        if chrom not in trees_by_chrom:
            # first fragment seen on this chromosome
            trees_by_chrom[chrom] = IntervalTree()

        zero_based_start = int(fragment.start) - 1
        trees_by_chrom[chrom].add(zero_based_start, int(fragment.end),
                                  fragment.fragment_id)
    return trees_by_chrom
def index_gff3_id(G, features=["exon"]): G = G[_.feature.In(Rep(features))]; #G = G[_.strand == '+'].GroupBy(_.parent).Sort(_.start, descend=False) | Stack | G[_.strand == '-'].GroupBy(_.parent).Sort(_.start, descend=True); G = G.GroupBy(_.parent).Sort(_.start); G = G.Get(_.parent, _.start, _.end, _.strand).Flat(); genes = {}; for (parent, start, end, strand) in zip(*G()): if parent not in genes: genes[parent] = [{}, IntervalTree()]; #fi exon_n = len(genes[parent][0].keys()); genes[parent][0][exon_n+1] = (start, end); genes[parent][1].add(start, end, (start, end, exon_n+1)); print parent, start, end, exon_n+1; #efor return genes;
def index_gtf(gtf_file_path):
    """Index the rows of a GTF file in one interval tree per seqid.

    Only tab-separated rows with exactly nine columns whose first column
    does not start with '##' are indexed.  Columns 4 and 5 give start and
    end; the whole row (as a tuple) is stored on the interval.

    Returns a dict mapping seqid to IntervalTree.
    """
    trees_by_seqid = dict()

    with open(gtf_file_path, "r") as handle:
        for record in csv.reader(handle, delimiter='\t'):
            # skip malformed rows and '##' directives
            if len(record) != 9 or record[0].startswith('##'):
                continue

            seqid = record[0]
            start = int(record[3])
            end = int(record[4])

            # build one interval tree per chromosome
            if seqid not in trees_by_seqid:
                trees_by_seqid[seqid] = IntervalTree()

            # index the feature
            trees_by_seqid[seqid].add(start, end, tuple(record))

    return trees_by_seqid
def get_intervals(intervalsFile):
    """
    Creates an index of intervals for each restriction site

    Every line is split on whitespace; the first three fields are the
    chromosome, start and end.  Intervals that cannot be added to the tree
    are reported on stderr and skipped.

    :param intervalsFile: path of a BED file
    :return: dict mapping chromosome to ``IntervalTree``
    """
    intervals_tree = {}

    # "with" closes the file even if a line cannot be parsed.
    with open(intervalsFile, 'r') as ff:
        for line in ff:
            fields = line.strip().split()
            chrom, start_int, end_int = fields[0], int(fields[1]), int(fields[2])

            if chrom not in intervals_tree:
                intervals_tree[chrom] = IntervalTree()

            try:
                intervals_tree[chrom].add_interval(Interval(start_int, end_int))
            except Exception:
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit still propagate.
                sys.stderr.write("Problem with line:{}\n".format(line))
                # write() needs a string; passing the list itself raised
                # TypeError from inside this handler.
                sys.stderr.write("{}\n".format(fields))

    return intervals_tree