def analyze_deletions(bam_file, threshold=50, out_file=None):
    '''Report the reference regions that no read in a BAM file covers.

    The chromosome names and lengths are taken from the bundled C. albicans
    haplotype A reference FASTA. Every stretch of a chromosome that is not
    covered by any read is written to ``out_file`` and summarised on stdout,
    together with the features found at each uncovered stretch.

    Parameters
    ----------
    bam_file : path of the BAM file to scan (opened with pysam in "rb" mode)
    threshold : a range is counted as "long" when ``stop - start >= threshold``
    out_file : CSV path handed to ``write_ranges``; when None, the historical
        hard-coded location is used so existing callers behave as before

    Writes
    ------
    All unseen ranges (unfiltered) to ``out_file``; a textual report to stdout.
    '''
    if out_file is None:
        # Historical default, kept for backward compatibility.
        out_file = "/Users/bermanlab/dev/transposon-pipeline/dependencies/albicans/deleted_regions.csv"

    bam_reader = pysam.AlignmentFile(bam_file, "rb")
    try:
        fasta_file = Shared.get_dependency(
            os.path.join(
                "albicans", "reference genome",
                "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta"))

        chrom_names = []
        chrom_lens = {}
        for record in SeqIO.parse(fasta_file, "fasta"):
            chrom_names.append(record.id)
            chrom_lens[record.id] = RangeSet([(1, len(record))])

        seen = {chrom: [] for chrom in chrom_names}
        for read in bam_reader.fetch():
            chrom_name = bam_reader.getrname(read.reference_id)
            # Reads on a "chrM" reference are skipped, so such a chromosome
            # (if present in the FASTA) is reported as entirely unseen.
            if "chrM" in chrom_name:
                continue
            # reference_start is 0-based and reference_end is one past the
            # last aligned base, so (start + 1, end) is the 1-based
            # inclusive interval.
            seen[chrom_name].append(
                (read.reference_start + 1, read.reference_end))
    finally:
        # The reader was previously left open; release the file handle.
        bam_reader.close()

    unseen = {
        chrom: chrom_lens[chrom] - RangeSet(seen[chrom])
        for chrom in chrom_names
    }
    write_ranges(unseen, out_file)

    # NOTE: the filter compares stop - start (i.e. length - 1) to the
    # threshold, while the reported lengths below use stop - start + 1.
    ranges = {
        chrom: [r for r in unseen[chrom] if r[1] - r[0] >= threshold]
        for chrom in chrom_names
    }
    pprint(ranges)

    # Single-argument print calls produce the same output whether print is a
    # statement or a function.
    print("Total unseen: %s" % sum(r.coverage for r in unseen.values()))
    print("Total filtered unseen: %s" % sum(
        sum(r[1] - r[0] + 1 for r in rs) for rs in ranges.values()))

    # Imported locally, as in the original code; the feature database is
    # loop-invariant, so it is built once instead of once per chromosome.
    import GenomicFeatures
    alb_db = GenomicFeatures.default_alb_db()

    for chrom in chrom_names:
        print(chrom)
        print("Total subranges: %s" % len(unseen[chrom]))
        print("Total length: %s" % unseen[chrom].coverage)
        print("Ignored long subranges: %s" % len(ranges[chrom]))
        print("Total length: %s" % sum(r[1] - r[0] + 1 for r in ranges[chrom]))
        print("\n")

        for r in unseen[chrom]:
            fs = alb_db.get_features_at_range(chrom, r)
            if fs:
                print("%s %s %s" % (
                    chrom, r, ", ".join(f.standard_name for f in fs)))
        print("\n")
def _get_ignored_regions(self):
    '''Combine the deleted and the homologous regions per chromosome.

    Returns
    -------
    dict mapping every chromosome present in ``self.deleted_regions`` or
    ``self.homologous_regions`` to the ``+`` combination of its two range
    sets. A chromosome missing from one of the sources contributes an empty
    ``RangeSet`` for that source.
    '''
    deleted = self.deleted_regions
    homologous = self.homologous_regions

    combined = {}
    for chrom in set(deleted) | set(homologous):
        deleted_ranges = deleted.get(chrom, RangeSet())
        homologous_ranges = homologous.get(chrom, RangeSet())
        combined[chrom] = deleted_ranges + homologous_ranges
    return combined
def _get_ignored_features(self):
    '''Collect the standard names of the features that should be ignored.

    A feature is ignored when the fraction of its length that overlaps the
    ignored regions of its chromosome exceeds
    ``1.0 - self._min_feature_coverage``, or when it is not an ORF.
    (A check for "dubious" feature types existed here but is disabled.)

    Returns
    -------
    set of ``standard_name`` values.
    '''
    ignored_names = set()
    for feature in self.feature_db.get_all_features():
        ignored_on_chrom = self.ignored_regions.get(feature.chromosome, RangeSet())
        feature_span = RangeSet([(feature.start, feature.stop)])
        ignored_fraction = \
            (ignored_on_chrom & feature_span).coverage / float(len(feature))

        too_much_ignored = ignored_fraction > (1.0 - self._min_feature_coverage)
        if too_much_ignored or not feature.is_orf:
            ignored_names.add(feature.standard_name)
    return ignored_names
def analyze_hom_regions(bam_file, fasta_file, out_file, threshold=50,
                        feature_db=None, min_mapq=20):
    '''Find reference regions that no confidently-mapped read covers.

    Reads whose mapping quality is below ``min_mapq`` are only counted; all
    other reads mark their aligned interval as seen. Whatever part of each
    chromosome stays unseen is reported, and the long stretches are written
    to ``out_file``.

    Parameters
    ----------
    bam_file : path of the BAM file to scan (opened with pysam in "rb" mode)
    fasta_file : reference FASTA providing chromosome names and lengths
    out_file : path handed to ``write_ranges`` for the long unseen ranges
    threshold : a range is kept as "long" when ``stop - start >= threshold``
    feature_db : optional feature database; when given, the features found at
        each long range are printed
    min_mapq : reads with a mapping quality below this value are treated as
        low-quality (default 20, the previously hard-coded cutoff)

    Writes
    ------
    The long unseen ranges to ``out_file``; a textual report to stdout.
    '''
    bam_reader = pysam.AlignmentFile(bam_file, "rb")
    try:
        chrom_names = []
        chrom_lens = {}
        for record in SeqIO.parse(fasta_file, "fasta"):
            chrom_names.append(record.id)
            chrom_lens[record.id] = RangeSet([(1, len(record))])

        seen = {chrom: [] for chrom in chrom_names}
        # Histogram of the mapping qualities of the rejected reads.
        mapq_count = {i: 0 for i in range(min_mapq)}
        low_map_reads = 0
        for read in bam_reader.fetch():
            if read.mapq < min_mapq:
                low_map_reads += 1
                mapq_count[read.mapq] += 1
            else:
                # We use reference_end because Crick reads can align
                # beginning from the other side.
                seen[bam_reader.getrname(read.reference_id)].append(
                    (read.reference_start + 1, read.reference_end))
    finally:
        # The reader was previously left open; release the file handle.
        bam_reader.close()

    all_ranges = {
        chrom: chrom_lens[chrom] - RangeSet(seen[chrom])
        for chrom in chrom_names
    }
    # NOTE: the filter compares stop - start (i.e. length - 1) to the
    # threshold, while the reported lengths below use stop - start + 1.
    ranges = {
        chrom: [r for r in all_ranges[chrom] if r[1] - r[0] >= threshold]
        for chrom in chrom_names
    }
    write_ranges({chrom: RangeSet(ranges[chrom]) for chrom in chrom_names},
                 out_file)

    # Single-argument print calls produce the same output whether print is a
    # statement or a function.
    print(low_map_reads)
    pprint(mapq_count)
    pprint(ranges)

    for chrom in chrom_names:
        print(chrom)
        print("Ignored subranges: %s" % len(all_ranges[chrom]))
        print("Total length: %s" % all_ranges[chrom].coverage)
        print("Ignored long subranges: %s" % len(ranges[chrom]))
        print("Total length: %s" % sum(r[1] - r[0] + 1 for r in ranges[chrom]))
        print("\n")

        if feature_db is not None:
            for r in ranges[chrom]:
                fs = feature_db.get_features_at_range(chrom, r)
                if fs:
                    print("%s %s %s" % (
                        chrom, r, ", ".join(f.standard_name for f in fs)))
            print("\n")
def _get_homologous_regions(self):
    '''Load the homologous regions that are long enough to be ignored.

    Reads ``albicans/homologous_regions.csv`` from the dependencies and keeps,
    per chromosome, only the ranges whose inclusive length
    (``stop - start + 1``) is at least ``self._ignore_region_threshold``.

    Returns
    -------
    dict mapping chromosome name to a ``RangeSet`` of the retained ranges.
    '''
    csv_path = Shared.get_dependency(
        os.path.join("albicans", "homologous_regions.csv"))
    ranges_by_chrom = self._read_range_data(csv_path)

    long_regions = {}
    for chrom, range_set in ranges_by_chrom.items():
        long_regions[chrom] = RangeSet(
            r for r in range_set
            if r[1] - r[0] + 1 >= self._ignore_region_threshold)
    return long_regions
def _read_range_data(self, file_name):
    '''Read per-chromosome ranges from a CSV file.

    The first row is skipped as a header. Every following row must hold
    exactly three columns: chromosome, start and stop. Chromosome names are
    normalised through ``self.feature_db.get_std_chrom_name``.

    Parameters
    ----------
    file_name : path of the CSV file to read

    Returns
    -------
    dict mapping standardised chromosome name to a ``RangeSet`` built from
    the ``(start, stop)`` integer pairs listed for it.
    '''
    ranges_by_chrom = {}
    with open(file_name, 'r') as in_file:
        rows = csv.reader(in_file)
        next(rows)  # Skip the header row.
        for raw_chrom, start, stop in rows:
            std_chrom = self.feature_db.get_std_chrom_name(raw_chrom)
            ranges_by_chrom.setdefault(std_chrom, []).append(
                (int(start), int(stop)))

    range_sets = {}
    for chrom, ranges in ranges_by_chrom.items():
        range_sets[chrom] = RangeSet(ranges)
    return range_sets
def draw_genomic_region(
    organism, chromosome, region_start, region_end, hits,
    draw_domains, draw_directions, out_file,
    highlighted_genes=frozenset(), exclude_genes=frozenset(),
    label=None, absolute_pixel_size=0, rna_bam=None
):
    '''Draw the hit tracks and annotated features of a genomic region to a PNG.

    From top to bottom the figure holds one hit track per entry of ``hits``,
    the feature track (features in blue, with optional domains and direction
    notches), a track marking introns, a track marking ignored regions, an
    optional RNA coverage track and a label row.

    Parameters
    ----------
    organism : object providing ``ignored_regions`` (chromosome -> RangeSet)
        and ``feature_db``
    chromosome : chromosome of the region being drawn
    region_start : bp position to start drawing figure on chromosome
    region_end : bp position to end drawing figure on chromosome
    hits : sequence of tracks; each track maps a chromosome to a sorted
        collection of hit records with "hit_pos" and "hit_count" keys
    draw_domains : GENES_ALL or GENES_HIGHLIGHTED to draw domains on all /
        only the highlighted features; any other value draws none
    draw_directions : same choices as ``draw_domains``, for the direction
        notches
    out_file : path of the PNG file to write
    highlighted_genes : set of standard names of the genes to highlight
    exclude_genes : set of standard names of features that are not drawn
    label : text drawn centred in the label row; when None, every drawn
        feature is labelled with its own name instead
    absolute_pixel_size : when positive, the figure width is
        ``int(region_len / absolute_pixel_size)`` pixels; otherwise the
        width is 350 pixels
    rna_bam : optional path of a BAM file; when given, a per-base read
        coverage track is drawn from it

    Writes
    ------
    Figure to the png image file ``out_file``
    '''
    # TODO: there's an open issue of what to do when the label isn't provided.
    # Consider all usages and do it right.

    # A chromosome without any ignored region has no entry in the mapping, so
    # fall back to an empty set instead of raising KeyError.
    ignored_regions = organism.ignored_regions.get(chromosome, RangeSet())
    region_len = region_end - region_start

    track_height = 5
    feature_height = 20
    # label_height = 20 if label else 0
    rna_track_height = 20
    label_height = 20

    width = 350 if absolute_pixel_size <= 0 else int(region_len / absolute_pixel_size)
    height = track_height * len(hits) + feature_height + label_height + track_height * 2 + \
        (rna_track_height if rna_bam else 0)

    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
    ctx = cairo.Context(surface)
    ctx.scale(width, height)  # Normalizing the canvas

    # Draw white background
    ctx.rectangle(0, 0, 1, 1)
    ctx.set_source_rgb(1, 1, 1)
    ctx.fill()

    # Draw the tracks
    track_height_scaled = float(track_height) / height
    for track_ix, track_hits in enumerate(hits):
        track_y = track_ix * track_height / float(height)
        chrom_track = track_hits[chromosome]

        # Narrow the hits down to the slice that can fall inside the region.
        try:
            left_hit_ix = chrom_track.index(chrom_track.find_ge(region_start))
        except ValueError:
            left_hit_ix = 0
        try:
            right_hit_ix = chrom_track.index(chrom_track.find_gt(region_end))
        except ValueError:
            right_hit_ix = len(chrom_track)
        relevant_hits = chrom_track[left_hit_ix:right_hit_ix]

        for pixel in range(width):
            left_bp = region_start + int(ceil(pixel * (region_len / float(width))))
            right_bp = region_start + int(floor((pixel+1) * (region_len / float(width))))
            reads_in_pixel = sum(h["hit_count"] for h in relevant_hits
                                 if left_bp <= h["hit_pos"] <= right_bp)
            if reads_in_pixel:
                ctx.rectangle((pixel / float(width)), track_y, 1.0 / width, track_height_scaled)
                # More reads give a darker pixel; black from 90 reads up.
                color = max(0, 0.9 - reads_in_pixel / 100.0)
                ctx.set_source_rgb(color, color, color)
                ctx.fill()

    # Draw the feature:
    features_in_range = organism.feature_db.get_features_at_range(
        chromosome, (region_start, region_end))
    feature_track_y = float(height - feature_height - label_height - track_height*2 -
                            (rna_track_height if rna_bam else 0)) / height
    feature_track_height_scaled = float(feature_height) / height

    def scale_bp(bp):
        # Map a bp position to a normalized x coordinate, clamped to [0, 1].
        return min(1.0, max(0.0, float(bp - region_start)) / region_len)

    for feature in features_in_range:
        if feature.standard_name in exclude_genes:
            continue

        # Draw the feature as a blue rectangle:
        feature_start = scale_bp(feature.start)
        feature_end = scale_bp(feature.stop)
        ctx.rectangle(feature_start, feature_track_y,
                      feature_end - feature_start, feature_track_height_scaled)
        ctx.set_source_rgb(0.0, 100/255.0, 180/255.0)
        ctx.fill()

        # Draw introns:
        intron_track_y = feature_track_y + feature_track_height_scaled
        for intron_start, intron_end in feature.exons.complement(feature.start, feature.stop):
            intron_start = scale_bp(intron_start)
            intron_end = scale_bp(intron_end)
            ctx.rectangle(intron_start, intron_track_y,
                          intron_end - intron_start, track_height_scaled)
            ctx.set_source_rgb(1, 1, 0)
            ctx.fill()

        # Draw its domains:
        if (feature.standard_name in highlighted_genes and draw_domains == GENES_HIGHLIGHTED) or \
                draw_domains == GENES_ALL:
            domain_pad = 1.0/6 * feature_height
            domain_y = feature_track_y + domain_pad / height
            domain_height = 2.0/3 * feature_track_height_scaled
            for domain in feature.domains:
                domain_start = scale_bp(domain[0])
                domain_end = scale_bp(domain[1])
                ctx.rectangle(domain_start, domain_y,
                              domain_end - domain_start, domain_height)
                ctx.set_source_rgb(0, 0, 0)
                ctx.fill()

        # Chop off a rectangle to indicate directionality:
        if (feature.standard_name in highlighted_genes and draw_directions == GENES_HIGHLIGHTED) or \
                draw_directions == GENES_ALL:
            # 10 pixels or the length of the gene
            arrow_width = min(10.0 / width, feature_end - feature_start)
            if feature.strand == 'W':
                end = scale_bp(feature.stop)
                ctx.move_to(end - arrow_width, feature_track_y)
                ctx.line_to(end, feature_track_y)
                ctx.line_to(end, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()

                ctx.move_to(end - arrow_width, feature_track_y + feature_track_height_scaled)
                ctx.line_to(end, feature_track_y + feature_track_height_scaled)
                ctx.line_to(end, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()
            else:
                start = scale_bp(feature.start)
                ctx.move_to(start, feature_track_y)
                ctx.line_to(start + arrow_width, feature_track_y)
                ctx.line_to(start, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()

                ctx.move_to(start, feature_track_y + feature_track_height_scaled)
                ctx.line_to(start + arrow_width, feature_track_y + feature_track_height_scaled)
                ctx.line_to(start, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()

        # Without an explicit figure label, label each feature by its name.
        if label is None:  # and feature.standard_name in highlighted_genes:
            feature_name = feature.name
            ctx.set_font_matrix(cairo.Matrix(xx=15/float(width), yy=15/float(height)))
            (_x, _y, label_text_width, label_text_height, _dx, _dy) = \
                ctx.text_extents(feature_name)
            label_track_y = (height - label_height) / float(height)
            ctx.move_to(feature_start + (feature_end - feature_start - label_text_width) / 2,
                        label_track_y + label_text_height * 1.25)
            ctx.set_source_rgb(0, 0, 0)
            ctx.show_text(feature_name)

    # Draw ignored regions:
    ignored_track_y = feature_track_y + feature_track_height_scaled + track_height_scaled
    for ignored_start, ignored_stop in ignored_regions & RangeSet([(region_start, region_end)]):
        ignored_start = scale_bp(ignored_start)
        ignored_stop = scale_bp(ignored_stop)
        ctx.rectangle(ignored_start, ignored_track_y,
                      ignored_stop - ignored_start, track_height_scaled)
        ctx.set_source_rgb(1, 0, 0)
        ctx.fill()

    # Add RNA if Calbicans:
    if rna_bam:
        rna_seq = pysam.AlignmentFile(rna_bam, "rb")
        try:
            # Per-base count of the reads aligned over the region.
            aligned_reads = [0] * int(region_end - region_start)
            for aligned_read in rna_seq.fetch(chromosome, region_start, region_end):
                for p in aligned_read.get_reference_positions():
                    target_ix = int(p - region_start)
                    if not (0 <= target_ix < len(aligned_reads)):
                        continue
                    aligned_reads[target_ix] += 1
        finally:
            # The reader was previously left open; release the file handle.
            rna_seq.close()

        max_aligned = float(max(aligned_reads))
        if max_aligned > 0:
            rna_track_y = ignored_track_y + track_height_scaled
            rna_track_height_scaled = float(rna_track_height) / height
            for ix, nreads in enumerate(aligned_reads):
                ref_pos = ix + int(region_start)
                # Bars grow upwards from the bottom of the RNA track and are
                # normalized to the deepest position in the region.
                ctx.rectangle(
                    scale_bp(ref_pos),
                    rna_track_y + (rna_track_height_scaled -
                                   rna_track_height_scaled * (nreads / max_aligned)),
                    scale_bp(ref_pos+1) - scale_bp(ref_pos),
                    rna_track_height_scaled * (nreads / max_aligned)
                )
                ctx.set_source_rgb(0, 0, 0)
                ctx.fill()

    # Draw the text:
    if label:
        ctx.set_font_matrix(cairo.Matrix(xx=15/float(width), yy=15/float(height)))
        (_x, _y, label_text_width, label_text_height, _dx, _dy) = ctx.text_extents(label)
        label_track_y = (height - label_height) / float(height)
        ctx.move_to((1.0 - label_text_width) / 2.0, label_track_y + label_text_height * 1.25)
        ctx.set_source_rgb(0, 0, 0)

        old_font = ctx.get_font_face()
        slanted_font = cairo.ToyFontFace(old_font.get_family(),
                                         cairo.FONT_SLANT_OBLIQUE,
                                         old_font.get_weight())
        for element in label.split(" "):
            # Words with a "Ca", "Sp" or "Sc" prefix are drawn in an oblique
            # face.
            if element.startswith(("Ca", "Sp", "Sc")):
                ctx.set_font_face(slanted_font)
            ctx.show_text(element)
            ctx.show_text(" ")
            ctx.set_font_face(old_font)
        ctx.set_font_face(old_font)

    # Output to PNG
    surface.write_to_png(out_file)