Example No. 1
    def _load_bam(self, filename):
        # Collect 3' end spans per (reference name, strand), tagged with AN/AG values.
        spans = {}

        # Accept either a BAM file or a sample directory containing one.
        if os.path.isdir(filename):
            filename = os.path.join(filename, "alignments_filtered_sorted.bam")
        
        for alignment in sam.Bam_reader(filename):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue
        
            # With self.polya set, keep only reads that carry an AA:i: tag.
            if self.polya and not any(item.startswith("AA:i:") for item in alignment.extra):
                continue
                
            # Pull the AN and AG integer tags, if present.
            AN = 0.0
            AG = 0.0
            for item in alignment.extra:
                if item.startswith("AN:i:"): AN = float(item[5:])
                if item.startswith("AG:i:"): AG = float(item[5:])
        
            strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
        
            start = alignment.reference_start
            end = alignment.reference_end

            # Collapse the alignment to a single base at its 3' end.
            if strand >= 0:
                start = end - 1
            else:
                end = start + 1
            
            # Skip spans that would be empty after the self.lap adjustment.
            if end + self.lap - start <= 0: continue
            
            rname = alignment.reference_name
            spans.setdefault((rname, strand), []).append((start, end + self.lap, AN, AG))
                
        return spans
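
The AN and AG handling above scans the raw SAM optional fields exposed by alignment.extra. A minimal standalone sketch of that tag-scanning idiom, using a hypothetical get_int_tag helper that is not part of the code above:

def get_int_tag(fields, tag, default=0):
    # fields: raw SAM optional fields, e.g. ["NM:i:2", "AN:i:17"]
    wanted = tag + ":i:"
    for field in fields:
        if field.startswith(wanted):
            return int(field[len(wanted):])
    return default

print(get_int_tag(["NM:i:2", "AN:i:17"], "AN"))  # 17
print(get_int_tag(["NM:i:2"], "AG"))             # 0 (default)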
Example No. 2
    def run(self):
        assert self.extension is not None, '--extension must be specified'

        #workspace = self.get_workspace()
        workspace = working_directory.Working(self.working_dir,
                                              must_exist=True)
        if self.annotations is None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [item.lower() for item in self.types.split(',')]

        parts = self.parts or self.types
        parts = [item.lower() for item in parts.split(',')]

        all_annotations = list(
            annotation.read_annotations(annotations_filename))
        annotation.link_up_annotations(all_annotations)
        for item in all_annotations:
            item.primary = None

        annotations = [
            item for item in all_annotations if item.type.lower() in types
        ]

        # Walk each selected annotation's subtree, linking every matching part
        # back to its top-level (primary) feature.
        part_annotations = []
        seen = set()
        queue = [(item, item) for item in annotations]
        while queue:
            primary, item = queue.pop()
            if item.type.lower() in parts:
                assert item.primary is None, "Feature with multiple parents"
                item.primary = primary
                key = (id(primary), item.start, item.end, item.seqid,
                       item.strand)
                # Ignore duplicate exons (many isoforms will have the same exons)
                if key not in seen:
                    seen.add(key)
                    part_annotations.append(item)
            queue.extend((primary, item2) for item2 in item.children)

        del seen
        del all_annotations

        self.log.log('%d annotations\n' % len(annotations))
        self.log.log('%d part annotations\n' % len(part_annotations))

        #assert annotations, 'No annotations of specified types in file'

        for item in part_annotations:
            this_extension = self.extension
            if "max_extension" in item.attr:
                this_extension = min(this_extension,
                                     int(item.attr["max_extension"]))

            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += this_extension
            else:
                item.tail_pos = item.start
                item.start -= this_extension

        for item in annotations:
            item.hits = []  # [ (tail_length, adaptor_bases) ]

        index = span_index.index_annotations(part_annotations)

        for alignment in sam.Bam_reader(workspace /
                                        'alignments_filtered_sorted.bam'):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue

            start = alignment.reference_start
            end = alignment.reference_end
            alignment_length = end - start
            strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
            fragment_feature = annotation.Annotation(
                seqid=alignment.reference_name,
                start=start,
                end=end,
                strand=strand)

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in alignment.extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            hits = index.get(fragment_feature, same_strand=True)
            if hits:
                gene = min(
                    hits,
                    key=lambda gene:
                    (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                # Nearest by tail_pos
                # failing that, by id to ensure a deterministic choice

                gene.primary.hits.append((tail_length, adaptor_bases))

        for item in annotations:
            del item.parents
            del item.children
            del item.primary

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        pickle.dump((workspace.name, workspace.get_tags(), annotations), f,
                    pickle.HIGHEST_PROTOCOL)
        f.close()
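
The min(...) call in run() picks a feature by a tuple key: first the distance between tail positions, then the primary feature's id, so ties are broken deterministically. A self-contained toy version of that idiom, with made-up gene names and positions:

candidates = [("geneC", 105), ("geneA", 95), ("geneB", 105)]
tail_pos = 100
# Nearest by tail position; ties broken by name for a deterministic result.
nearest = min(candidates, key=lambda g: (abs(tail_pos - g[1]), g[0]))
print(nearest)  # ('geneA', 95): all three are 5 away, and 'geneA' sorts first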
Example No. 3
def make_ambiguity_bigwig(prefix, bam_filenames, stop_after=None, subsample=1):
    #import pysam

    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])

    with open(prefix + "-chrom.sizes", "wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"], entry["LN"]))

    chrom_names = [entry["SN"] for entry in header["SQ"]]
    chrom_sizes = [int(entry["LN"]) for entry in header["SQ"]]

    #alf.close()

    unambiguous = dict((name, Piler(size))
                       for name, size in zip(chrom_names, chrom_sizes))
    total = dict((name, Piler(size))
                 for name, size in zip(chrom_names, chrom_sizes))

    for filename in bam_filenames:
        #alf = pysam.AlignmentFile(filename)
        alf = sam.Bam_reader(filename)
        n = 0

        sub = subsample - 1
        for item in alf:
            if item.is_unmapped or item.is_supplementary:
                continue

            # Subsample: keep only every subsample-th alignment.
            sub = (sub + 1) % subsample
            if sub: continue

            #spanner = fragment_split_coverage([item])
            spanner = fragment_coverage([item])  # TODO fixme when blocks available
            total[item.reference_name].add(spanner)

            # NH tag: number of reported alignments for this read (NH == 1 means unambiguous).
            NH = 1
            for item2 in item.extra:
                if item2.startswith("NH:i:"):
                    NH = int(item2[5:])
            if NH == 1:
                unambiguous[item.reference_name].add(spanner)

            n += 1
            if stop_after is not None and n > stop_after: break
            if n % 1000000 == 0: print prefix, filename, grace.pretty_number(n)

        alf.close()

    ambiguities = []
    for name in chrom_names:
        # Real part carries unambiguous coverage, imaginary part total coverage;
        # the ambiguous fraction is then (total - unambiguous) / total.
        u = unambiguous[name].get()
        t = map_spanner(lambda x: x * 1j, total[name].get())
        c = pile([u, t], initial=0.0)
        c = map_spanner(lambda x: max(0.0, x.imag - x.real) / max(x.imag, 1.0), c)
        ambiguities.append(c)

    bedgraph(prefix + ".bedgraph", zip(chrom_names, ambiguities))
    subprocess.check_call([
        "wigToBigWig", prefix + ".bedgraph", prefix + "-chrom.sizes",
        prefix + ".bw"
    ])
    os.unlink(prefix + ".bedgraph")
    os.unlink(prefix + "-chrom.sizes")
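
The ambiguity computation above packs two coverage tracks into one complex-valued pile: unambiguous counts ride in the real part and total counts (multiplied by 1j) in the imaginary part, so a single pass over the combined spanner yields (total - unambiguous) / total. A minimal sketch of that arithmetic at a single position, with made-up counts:

unambiguous = 3.0
total = 5.0
packed = unambiguous + total * 1j  # real = unambiguous, imag = total
ambiguity = max(0.0, packed.imag - packed.real) / max(packed.imag, 1.0)
print(ambiguity)  # 0.4: 2 of the 5 overlapping reads are ambiguous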
Example No. 4
def make_ambiguity_bigwig_by_readname(prefix,
                                      bam_filenames,
                                      stop_after=None,
                                      subsample=1):
    #import pysam

    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])

    with open(prefix + "-chrom.sizes", "wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"], entry["LN"]))

    chrom_names = [entry["SN"] for entry in header["SQ"]]
    chrom_sizes = [int(entry["LN"]) for entry in header["SQ"]]

    #alf.close()

    unambiguous = dict((name, Piler(size))
                       for name, size in zip(chrom_names, chrom_sizes))
    total = dict((name, Piler(size))
                 for name, size in zip(chrom_names, chrom_sizes))

    old = grace.status("Ambiguity bigwig")

    for filename in bam_filenames:
        #alf = pysam.AlignmentFile(filename)
        alf = sam.Bam_reader(filename)
        n = 0

        sub = subsample - 1
        # groupby only merges consecutive records, so this assumes the BAM is
        # grouped by read name.
        for (key, items) in itertools.groupby(alf,
                                              lambda item: item.query_name):
            # Subsample: keep only every subsample-th read name.
            sub = (sub + 1) % subsample
            if sub: continue

            items = [
                item for item in items
                if not item.is_unmapped and not item.is_supplementary
            ]
            if not items:
                continue

            # Only use top scoring alignments
            AS = [item.get_AS() for item in items]
            best_AS = max(AS)
            items = [
                item for item, this_AS in zip(items, AS) if this_AS >= best_AS
            ]

            for item in items:
                #spanner = fragment_split_coverage([item])
                spanner = fragment_coverage([item])  # TODO fixme when blocks available
                spanner = scale_spanner(1.0 / len(items), spanner)
                total[item.reference_name].add(spanner)
                if len(items) == 1:
                    unambiguous[item.reference_name].add(spanner)

            n += 1
            if stop_after is not None and n > stop_after: break
            if n % 1000000 == 0:
                grace.status(
                    os.path.basename(prefix) + " " + filename + " " +
                    grace.pretty_number(n))

        alf.close()

    ambiguities = []
    for name in chrom_names:
        # As above: real part = unambiguous coverage, imaginary part = total.
        u = unambiguous[name].get()
        t = map_spanner(lambda x: x * 1j, total[name].get())
        c = pile([u, t], initial=0.0)
        c = map_spanner(lambda x: max(0.0, x.imag - x.real) / max(x.imag, 1.0), c)
        ambiguities.append(c)

    bedgraph(prefix + ".bedgraph", zip(chrom_names, ambiguities))
    subprocess.check_call([
        "wigToBigWig", prefix + ".bedgraph", prefix + "-chrom.sizes",
        prefix + ".bw"
    ])
    os.unlink(prefix + ".bedgraph")
    os.unlink(prefix + "-chrom.sizes")

    grace.status(old)
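
Because itertools.groupby only merges consecutive items, the by-readname variant assumes the BAM is name-grouped (collated or name-sorted). A standalone sketch of the group-then-keep-top-score step, using (read name, score) tuples in place of real alignments:

import itertools

records = [("r1", 60), ("r1", 60), ("r1", 10), ("r2", 42)]  # grouped by name
for name, group in itertools.groupby(records, key=lambda r: r[0]):
    group = list(group)
    best = max(score for _, score in group)
    kept = [r for r in group if r[1] >= best]
    weight = 1.0 / len(kept)  # each retained alignment gets an equal share
    print((name, kept, weight))
# ('r1', [('r1', 60), ('r1', 60)], 0.5)
# ('r2', [('r2', 42)], 1.0)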
Example No. 5
def make_bigwig(prefix,
                bam_filenames,
                make_spanner,
                fragments=False,
                stop_after=None,
                scale=1.0,
                polya=False):
    have_pysam = False
    try:
        import pysam
        have_pysam = True
    except ImportError:
        pass

    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])

    with open(prefix + "-chrom.sizes", "wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"], entry["LN"]))

    chrom_names = [entry["SN"] for entry in header["SQ"]]
    chrom_sizes = [int(entry["LN"]) for entry in header["SQ"]]

    #alf.close()

    forward = dict((name, Piler(size))
                   for name, size in zip(chrom_names, chrom_sizes))
    reverse = dict((name, Piler(size))
                   for name, size in zip(chrom_names, chrom_sizes))

    old = grace.status("Bigwig")

    for filename in bam_filenames:
        if have_pysam:
            alf = pysam.AlignmentFile(filename)
        else:
            alf = sam.Bam_reader(filename)

        n = 0

        if not fragments:
            for item in alf:
                if item.is_unmapped or item.is_secondary or item.is_supplementary:
                    continue

                if polya and not alignment_is_polya(item):
                    continue

                # Assume --> <-- oriented read pairs
                which = forward if bool(item.is_reverse) == bool(
                    item.is_read2) else reverse
                which[item.reference_name].add(make_spanner(item))

                n += 1
                if stop_after is not None and n > stop_after: break
                if n % 1000000 == 0:
                    grace.status(
                        os.path.basename(prefix) + " " + filename + " " +
                        grace.pretty_number(n))

        else:
            for item in iter_fragments(alf):
                if polya and not any(alignment_is_polya(al) for al in item):
                    continue

                # Assume --> <-- oriented read pairs
                which = forward if bool(item[0].is_reverse) == bool(
                    item[0].is_read2) else reverse
                which[item[0].reference_name].add(make_spanner(item))

                n += 1
                if stop_after is not None and n > stop_after: break
                if n % 1000000 == 0:
                    grace.status(
                        os.path.basename(prefix) + " " + filename + " " +
                        grace.pretty_number(n))

        if have_pysam:
            alf.close()

    bedgraph(
        prefix + "-fwd.bedgraph",
        zip(chrom_names, [
            scale_spanner(scale, forward[item].get()) for item in chrom_names
        ]))
    subprocess.check_call([
        "wigToBigWig", prefix + "-fwd.bedgraph", prefix + "-chrom.sizes",
        prefix + "-fwd.bw"
    ])
    os.unlink(prefix + "-fwd.bedgraph")

    bedgraph(
        prefix + "-rev.bedgraph",
        zip(chrom_names, [
            scale_spanner(scale, reverse[item].get()) for item in chrom_names
        ]))
    subprocess.check_call([
        "wigToBigWig", prefix + "-rev.bedgraph", prefix + "-chrom.sizes",
        prefix + "-rev.bw"
    ])
    os.unlink(prefix + "-rev.bedgraph")

    os.unlink(prefix + "-chrom.sizes")

    grace.status(old)
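
The strand test bool(item.is_reverse) == bool(item.is_read2) encodes the assumed --> <-- pair orientation: read 1 maps on the fragment's strand, read 2 on the opposite one. A hypothetical helper spelling out that rule:

def fragment_strand(is_reverse, is_read2):
    # Under --> <-- orientation, read 2 maps opposite to its fragment, so a
    # reverse-mapped read 2 still indicates a forward fragment.
    return "forward" if bool(is_reverse) == bool(is_read2) else "reverse"

print(fragment_strand(False, False))  # forward: read 1 mapped forward
print(fragment_strand(True, True))    # forward: read 2 mapped reverse
print(fragment_strand(True, False))   # reverse: read 1 mapped reverse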