示例#1
0
def analyze_deletions(bam_file, threshold=50):
    bam_reader = pysam.AlignmentFile(bam_file, "rb")

    fasta_file = Shared.get_dependency(
        os.path.join(
            "albicans", "reference genome",
            "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta")
    )
    chrom_names = []
    chrom_lens = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        chrom_names.append(record.id)
        chrom_lens[record.id] = RangeSet([(1, len(record))])

    seen = {chrom: [] for chrom in chrom_names}

    for read in bam_reader.fetch():
        chrom_name = bam_reader.getrname(read.reference_id)
        if "chrM" in chrom_name:
            continue
        seen[chrom_name].append(
            (read.reference_start + 1, read.reference_end - 1 + 1))

    unseen = {
        chrom: chrom_lens[chrom] - RangeSet(seen[chrom])
        for chrom in chrom_names
    }
    write_ranges(
        unseen,
        "/Users/bermanlab/dev/transposon-pipeline/dependencies/albicans/deleted_regions.csv"
    )
    ranges = {
        chrom: [r for r in unseen[chrom] if r[1] - r[0] >= threshold]
        for chrom in chrom_names
    }

    pprint(ranges)

    print "Total unseen:", sum(r.coverage for r in unseen.values())
    print "Total filtered unseen:", sum(
        sum(r[1] - r[0] + 1 for r in rs) for rs in ranges.values())

    for chrom in chrom_names:
        print chrom
        print "Total subranges:", len(unseen[chrom])
        print "Total length:", unseen[chrom].coverage
        print "Ignored long subranges:", len(ranges[chrom])
        print "Total length:", sum(r[1] - r[0] + 1 for r in ranges[chrom])
        print "\n"

        import GenomicFeatures
        alb_db = GenomicFeatures.default_alb_db()
        for r in unseen[chrom]:
            fs = alb_db.get_features_at_range(chrom, r)
            if fs:
                print chrom, r, ", ".join(f.standard_name for f in fs)

        print "\n"
示例#2
0
    def _get_ignored_regions(self):
        '''Combine the deleted and homologous regions of every chromosome.

        Returns
        -------
            Dict mapping each chromosome present in self.deleted_regions or
            self.homologous_regions to the sum of its two RangeSets; an empty
            RangeSet stands in for whichever side lacks the chromosome.
        '''
        deleted_regions = self.deleted_regions
        homologous_regions = self.homologous_regions

        # A set union of the mappings' keys avoids concatenating two key
        # lists, which only works where keys() returns a list.
        chroms = set(deleted_regions) | set(homologous_regions)

        return {
            chrom: deleted_regions.get(chrom, RangeSet()) +
            homologous_regions.get(chrom, RangeSet())
            for chrom in chroms
        }
示例#3
0
    def _get_ignored_features(self):
        '''Collect the standard names of the features to leave out.

        A feature is collected when the share of its length that overlaps the
        ignored regions of its chromosome exceeds
        1.0 - self._min_feature_coverage, or when it is not an ORF.

        Returns
        -------
            Set of feature standard names.
        '''
        ignored_names = set()

        for feature in self.feature_db.get_all_features():
            regions = self.ignored_regions.get(feature.chromosome, RangeSet())
            feature_span = RangeSet([(feature.start, feature.stop)])
            ignored_fraction = \
                (regions & feature_span).coverage / float(len(feature))
            max_ignored_fraction = 1.0 - self._min_feature_coverage

            # is_orf is consulted only when the overlap test has not already
            # decided the outcome.
            if ignored_fraction > max_ignored_fraction or not feature.is_orf:
                ignored_names.add(feature.standard_name)

        return ignored_names
示例#4
0
def analyze_hom_regions(bam_file,
                        fasta_file,
                        out_file,
                        threshold=50,
                        feature_db=None):
    bam_reader = pysam.AlignmentFile(bam_file, "rb")
    chrom_names = []
    chrom_lens = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        chrom_names.append(record.id)
        chrom_lens[record.id] = RangeSet([(1, len(record))])

    seen = {chrom: [] for chrom in chrom_names}
    mapq_count = {i: 0 for i in range(20)}

    low_map_reads = 0
    for read in bam_reader.fetch():
        if read.mapq < 20:
            low_map_reads += 1
            mapq_count[read.mapq] += 1
        else:
            # We use reference_end because Crick reads can align beginning from the other side.
            seen[bam_reader.getrname(read.reference_id)].append(
                (read.reference_start + 1, read.reference_end))

    all_ranges = {
        chrom: chrom_lens[chrom] - RangeSet(seen[chrom])
        for chrom in chrom_names
    }
    ranges = {
        chrom: [r for r in all_ranges[chrom] if r[1] - r[0] >= threshold]
        for chrom in chrom_names
    }
    write_ranges({chrom: RangeSet(ranges[chrom])
                  for chrom in chrom_names}, out_file)

    print low_map_reads
    pprint(mapq_count)
    pprint(ranges)

    for chrom in chrom_names:
        print chrom
        print "Ignored subranges:", len(all_ranges[chrom])
        print "Total length:", all_ranges[chrom].coverage
        print "Ignored long subranges:", len(ranges[chrom])
        print "Total length:", sum(r[1] - r[0] + 1 for r in ranges[chrom])
        print "\n"

        if feature_db is not None:
            for r in ranges[chrom]:
                fs = feature_db.get_features_at_range(chrom, r)
                if fs:
                    print chrom, r, ", ".join(f.standard_name for f in fs)

            print "\n"
示例#5
0
    def _get_homologous_regions(self):
        '''Load the homologous regions that are long enough to be ignored.

        Reads albicans/homologous_regions.csv from the dependencies and keeps
        only the ranges whose inclusive length (stop - start + 1) is at least
        self._ignore_region_threshold.

        Returns
        -------
            Dict mapping chromosome to a RangeSet of the kept ranges.
        '''
        ranges = self._read_range_data(
            Shared.get_dependency(
                os.path.join("albicans", "homologous_regions.csv")))
        min_length = self._ignore_region_threshold

        # items() rather than iteritems(): same result, and consistent with
        # _read_range_data.
        return {
            chrom:
            RangeSet(r for r in range_set if r[1] - r[0] + 1 >= min_length)
            for chrom, range_set in ranges.items()
        }
示例#6
0
    def _read_range_data(self, file_name):
        '''Read per-chromosome ranges from a CSV file.

        The first row is treated as a header and skipped. Every other
        non-empty row must hold exactly three columns: chromosome, start and
        stop. Chromosome names are normalized with
        self.feature_db.get_std_chrom_name.

        Parameters
        ----------
            file_name   :   path of the CSV file to read

        Returns
        -------
            Dict mapping standard chromosome name to a RangeSet built from
            that chromosome's (start, stop) integer pairs. An empty file
            yields an empty dict.
        '''
        result = {}

        with open(file_name, 'r') as in_file:
            reader = csv.reader(in_file)
            # Skip the header; the default keeps an empty file from raising
            # StopIteration.
            next(reader, None)
            for row in reader:
                # csv.reader yields an empty list for a blank line.
                if not row:
                    continue
                chrom, start, stop = row
                chrom = self.feature_db.get_std_chrom_name(chrom)
                result.setdefault(chrom, []).append((int(start), int(stop)))

        return {chrom: RangeSet(ranges) for chrom, ranges in result.items()}
示例#7
0
def draw_genomic_region(
        organism,
        chromosome,
        region_start,
        region_end,
        hits,
        draw_domains,
        draw_directions,
        out_file,
        highlighted_genes=frozenset(),
        exclude_genes=frozenset(),
        label=None,
        absolute_pixel_size=0,
        rna_bam=None
    ):
    '''Draw the hit tracks and features of a genomic region into a PNG image.

    From top to bottom the image holds one track per entry of hits, the
    feature track (with introns, domains and direction notches), the ignored
    regions track, an optional RNA coverage track and the label area.

    Parameters
    ----------
        organism    :       object providing ignored_regions (chromosome ->
                            RangeSet) and feature_db
        chromosome  :       chromosome for feature being drawn
        region_start:       bp position to start drawing figure on chromosome
        region_end  :       bp position to end drawing figure on chromosome
        hits    :           sequence of tracks; track[chromosome] is a sorted
                            collection of dicts with "hit_pos" and "hit_count"
        draw_domains    :   GENES_ALL or GENES_HIGHLIGHTED to draw domains for
                            all or only the highlighted genes
        draw_directions :   GENES_ALL or GENES_HIGHLIGHTED to draw direction
                            notches for all or only the highlighted genes
        out_file    :       path of the png file to write
        highlighted_genes:  set of genes to highlight in drawing
        exclude_genes   :   set of standard names of features not to draw
        label   :           label for figure; when None each feature's name is
                            drawn under the feature instead
        absolute_pixel_size: bp per pixel; when <= 0 the width is 350 pixels
        rna_bam :           optional path of a BAM file drawn as a coverage
                            track

    Writes
    ------
        Figure to png image file
    '''

    # TODO: there's an open issue of what to do when the label isn't provided.
    # Consider all usages and do it right.

    # Not every chromosome necessarily has ignored regions; fall back to an
    # empty set instead of raising KeyError.
    ignored_regions = organism.ignored_regions.get(chromosome, RangeSet())

    region_len = region_end - region_start

    track_height = 5
    feature_height = 20
    rna_track_height = 20
    label_height = 20

    width = 350 if absolute_pixel_size <= 0 else int(region_len / absolute_pixel_size)
    height = track_height * len(hits) + feature_height + label_height + track_height * 2 + \
        (rna_track_height if rna_bam else 0)

    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
    ctx = cairo.Context(surface)

    ctx.scale(width, height) # Normalizing the canvas

    # Draw white background
    ctx.rectangle(0, 0, 1, 1)
    ctx.set_source_rgb(1, 1, 1)
    ctx.fill()

    # Draw the tracks
    track_height_scaled = float(track_height) / height
    bp_per_pixel = region_len / float(width)
    for track_ix, track_hits in enumerate(hits):
        track_y = track_ix * track_height / float(height)
        chrom_track = track_hits[chromosome]
        # Narrow the hits down to the region; when a bound cannot be found the
        # slice simply extends to that end of the track.
        try:
            left_hit_ix = chrom_track.index(chrom_track.find_ge(region_start))
        except ValueError:
            left_hit_ix = 0
        try:
            right_hit_ix = chrom_track.index(chrom_track.find_gt(region_end))
        except ValueError:
            right_hit_ix = len(chrom_track)
        relevant_hits = chrom_track[left_hit_ix:right_hit_ix]

        for pixel in range(width):
            left_bp = region_start + int(ceil(pixel * bp_per_pixel))
            right_bp = region_start + int(floor((pixel+1) * bp_per_pixel))
            reads_in_pixel = sum(h["hit_count"] for h in relevant_hits if left_bp <= h["hit_pos"] <= right_bp)
            if reads_in_pixel:
                ctx.rectangle((pixel / float(width)),
                              track_y,
                              1.0 / width,
                              track_height_scaled)
                # More reads give a darker pixel; black from 90 reads up.
                color = max(0, 0.9 - reads_in_pixel / 100.0)
                ctx.set_source_rgb(color, color, color)
                ctx.fill()

    # Draw the feature:
    features_in_range = organism.feature_db.get_features_at_range(chromosome, (region_start, region_end))

    feature_track_y = float(height - feature_height - label_height - track_height*2 -
                            (rna_track_height if rna_bam else 0)) / height
    feature_track_height_scaled = float(feature_height) / height
    # Maps a bp position to a horizontal canvas coordinate clamped to [0, 1].
    scale_bp = lambda bp: min(1.0, max(0.0, float(bp - region_start)) / region_len)
    for feature in features_in_range:
        if feature.standard_name in exclude_genes:
            continue

        # Draw the feature as a blue rectangle:
        feature_start = scale_bp(feature.start)
        feature_end = scale_bp(feature.stop)

        ctx.rectangle(feature_start, feature_track_y, feature_end - feature_start, feature_track_height_scaled)
        ctx.set_source_rgb(0.0, 100/255.0, 180/255.0)
        ctx.fill()

        # Draw introns:
        intron_track_y = feature_track_y + feature_track_height_scaled
        for intron_start, intron_end in feature.exons.complement(feature.start, feature.stop):
            intron_start = scale_bp(intron_start)
            intron_end = scale_bp(intron_end)
            ctx.rectangle(intron_start, intron_track_y, intron_end - intron_start, track_height_scaled)
            ctx.set_source_rgb(1, 1, 0)
            ctx.fill()

        # Draw its domains:
        if (feature.standard_name in highlighted_genes and draw_domains == GENES_HIGHLIGHTED) or \
            draw_domains == GENES_ALL:
            domain_pad = 1.0/6 * feature_height
            domain_y = feature_track_y + domain_pad / height
            domain_height = 2.0/3 * feature_track_height_scaled
            for domain in feature.domains:
                domain_start = scale_bp(domain[0])
                domain_end = scale_bp(domain[1])

                ctx.rectangle(domain_start, domain_y,
                              domain_end - domain_start, domain_height)
                ctx.set_source_rgb(0, 0, 0)
                ctx.fill()

        # Chop off a rectangle to indicate directionality:
        if (feature.standard_name in highlighted_genes and draw_directions == GENES_HIGHLIGHTED) or \
            draw_directions == GENES_ALL:
            arrow_width = min(10.0 / width, feature_end - feature_start) # 10 pixels or the length of the gene
            if feature.strand == 'W':
                # Cut both corners at the stop end with white triangles.
                end = scale_bp(feature.stop)
                ctx.move_to(end - arrow_width, feature_track_y)
                ctx.line_to(end, feature_track_y)
                ctx.line_to(end, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()

                ctx.move_to(end - arrow_width, feature_track_y + feature_track_height_scaled)
                ctx.line_to(end, feature_track_y + feature_track_height_scaled)
                ctx.line_to(end, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()
            else:
                # Cut both corners at the start end with white triangles.
                start = scale_bp(feature.start)
                ctx.move_to(start, feature_track_y)
                ctx.line_to(start + arrow_width, feature_track_y)
                ctx.line_to(start, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()

                ctx.move_to(start, feature_track_y + feature_track_height_scaled)
                ctx.line_to(start + arrow_width, feature_track_y + feature_track_height_scaled)
                ctx.line_to(start, feature_track_y + 0.5 * feature_track_height_scaled)
                ctx.close_path()
                ctx.set_source_rgb(1, 1, 1)
                ctx.fill()

        # Without a figure label, write each feature's name centered under it.
        if label is None:
            feature_name = feature.name
            ctx.set_font_matrix(cairo.Matrix(xx=15/float(width), yy=15/float(height)))
            (_x, _y, label_text_width, label_text_height, _dx, _dy) = ctx.text_extents(feature_name)
            label_track_y = (height - label_height) / float(height)
            ctx.move_to(feature_start + (feature_end - feature_start - label_text_width) / 2, label_track_y + label_text_height * 1.25)
            ctx.set_source_rgb(0, 0, 0)
            ctx.show_text(feature_name)

    # Draw ignored regions:
    ignored_track_y = feature_track_y + feature_track_height_scaled + track_height_scaled
    for ignored_start, ignored_stop in ignored_regions & RangeSet([(region_start, region_end)]):
        ignored_start = scale_bp(ignored_start)
        ignored_stop = scale_bp(ignored_stop)
        ctx.rectangle(ignored_start, ignored_track_y, ignored_stop - ignored_start, track_height_scaled)
        ctx.set_source_rgb(1, 0, 0)
        ctx.fill()

    # Add RNA if Calbicans:
    if rna_bam:
        # Per-bp count of reads aligned over each position of the region.
        aligned_reads = [0] * int(region_end - region_start)
        rna_seq = pysam.AlignmentFile(rna_bam, "rb")
        try:
            for aligned_read in rna_seq.fetch(chromosome, region_start, region_end):
                for p in aligned_read.get_reference_positions():
                    target_ix = int(p - region_start)
                    # Reads may extend beyond the region; ignore that part.
                    if 0 <= target_ix < len(aligned_reads):
                        aligned_reads[target_ix] += 1
        finally:
            # Release the BAM handle even if fetching fails.
            rna_seq.close()

        max_aligned = float(max(aligned_reads))

        if max_aligned > 0:
            rna_track_y = ignored_track_y + track_height_scaled
            rna_track_height_scaled = float(rna_track_height) / height

            # One bar per bp, its height relative to the deepest position and
            # anchored to the bottom of the RNA track.
            for ix, nreads in enumerate(aligned_reads):
                ref_pos = ix + int(region_start)
                bar_height = rna_track_height_scaled * (nreads / max_aligned)
                ctx.rectangle(
                    scale_bp(ref_pos),
                    rna_track_y + (rna_track_height_scaled - bar_height),
                    scale_bp(ref_pos+1) - scale_bp(ref_pos),
                    bar_height
                )
                ctx.set_source_rgb(0, 0, 0)
                ctx.fill()

    # Draw the text:
    if label:
        ctx.set_font_matrix(cairo.Matrix(xx=15/float(width), yy=15/float(height)))
        (_x, _y, label_text_width, label_text_height, _dx, _dy) = ctx.text_extents(label)
        label_track_y = (height - label_height) / float(height)
        ctx.move_to((1.0 - label_text_width) / 2.0, label_track_y + label_text_height * 1.25)
        ctx.set_source_rgb(0, 0, 0)

        old_font = ctx.get_font_face()
        slanted_font = cairo.ToyFontFace(old_font.get_family(), cairo.FONT_SLANT_OBLIQUE, old_font.get_weight())
        for element in label.split(" "):
            # Words with one of these prefixes are drawn in the oblique face.
            if element.startswith(("Ca", "Sp", "Sc")):
                ctx.set_font_face(slanted_font)
            ctx.show_text(element)
            ctx.show_text(" ")
            ctx.set_font_face(old_font)
        ctx.set_font_face(old_font)

    # Output to PNG
    surface.write_to_png(out_file)