Exemplo n.º 1
0
def getTargets(in_aln, in_targets=None):
    """
    Return the list of targeted regions.

    Without targets file, one region spanning positions 1 to LN is returned for each sequence declared in the "SQ" section of the alignment header. With a targets file, the regions returned by getAreas() are sorted by reference name, start and end, then checked for overlap.

    :param in_aln: Path to the alignment file (format: SAM/BAM).
    :type in_aln: str
    :param in_targets: Path to the targeted regions (format: BED). They must not contain any overlap.
    :type in_targets: str
    :return: List of targeted regions.
    :rtype: anacore.region.RegionList
    :raises Exception: If two targeted regions located on the same reference overlap.
    """
    if in_targets is None:
        # No targets: every reference sequence of the alignment is a target
        selected_regions = RegionList()
        with pysam.AlignmentFile(in_aln, "rb") as FH_bam:
            for ref_info in FH_bam.header["SQ"]:
                selected_regions.append(
                    Region(1, ref_info["LN"], "+", ref_info["SN"], ref_info["SN"])
                )
        return selected_regions
    # sorted() returns a plain list: wrap it to honour the documented return type
    selected_regions = RegionList(
        sorted(getAreas(in_targets), key=lambda x: (x.reference.name, x.start, x.end))
    )
    # Check lack of overlap. After the sort, only consecutive regions have to
    # be compared; pairing with zip also copes with an empty targets file.
    for prev_region, curr_region in zip(selected_regions, selected_regions[1:]):
        if curr_region.reference.name == prev_region.reference.name:
            if prev_region.end >= curr_region.start:
                raise Exception("The regions {} and {} contains an overlap.".format(prev_region, curr_region))
    return selected_regions
Exemplo n.º 2
0
def shallowFromAlignment(aln_path, selected_regions, depth_mode, min_depth, log):
    """
    Return the list of shallow regions from the alignment file.

    Each targeted region is scanned position by position. A position is shallow when its depth is lower than min_depth or when no pileup column exists for it. Secondary alignments, duplicates and reference skips are not counted in the depth.

    :param aln_path: Path to the alignment file (format: SAM/BAM).
    :type aln_path: str
    :param selected_regions: Targeted regions. They must not contain any overlap between them.
    :type selected_regions: anacore.region.RegionList
    :param depth_mode: How to count the depth: by reads (each read is added independently) or by fragment (the R1 and R2 coming from the same pair are counted only once). The fragment counting is enabled by the value "fragment", any other value counts the reads.
    :type depth_mode: str
    :param min_depth: All the locations with a depth under this value are reported in shallow areas.
    :type min_depth: int
    :param log: Logger of the script.
    :type log: logging.Logger
    :return: List of shallow regions.
    :rtype: anacore.region.RegionList
    """
    shallow = RegionList()
    nb_selected_regions = len(selected_regions)
    idx_in_part = 1
    with pysam.AlignmentFile(aln_path, "rb") as FH_bam:
        for idx_region, region in enumerate(selected_regions):
            # Progress message each time more than a tenth of the regions has
            # been visited since the previous message
            if idx_in_part > nb_selected_regions / 10:
                idx_in_part = 0
                log.info("Processed regions {}/{}.".format(idx_region + 1, nb_selected_regions))
            idx_in_part += 1
            # State shared with addToShallow() (defined elsewhere in the file);
            # presumably the shallow area currently being extended - TODO confirm
            prev_opened = {"start": None, "end": None}
            # Next position to evaluate (0-based) whereas region.start and region.end are 1-based
            curr_checked = region.start - 1
            # pysam coordinates are 0-based and half-open: the stop must be
            # region.end to include the last base of the target. With
            # region.end - 1 the reads starting on the last base were not
            # loaded and a target of one base produced an empty window.
            for pileupcolumn in FH_bam.pileup(region.reference.name, region.start - 1, region.end, max_depth=100000000):
                ref_pos = pileupcolumn.reference_pos
                # Columns out of the target can be returned: they are skipped
                if region.start <= ref_pos + 1 <= region.end:
                    # Missing positions: no column means no read on them
                    while curr_checked < ref_pos:
                        addToShallow(region.reference, curr_checked, prev_opened, shallow)
                        curr_checked += 1
                    # Current position
                    curr_reads_depth = 0
                    curr_frag = set()
                    for pileupread in pileupcolumn.pileups:
                        alignment = pileupread.alignment
                        if not alignment.is_secondary and not alignment.is_duplicate and not pileupread.is_refskip:
                            curr_reads_depth += 1
                            # Mates share their query name: the set keeps one entry by fragment
                            curr_frag.add(alignment.query_name)
                    curr_depth = curr_reads_depth
                    if depth_mode == "fragment":
                        curr_depth = len(curr_frag)
                    if min_depth > curr_depth:
                        addToShallow(region.reference, ref_pos, prev_opened, shallow)
                    curr_checked = ref_pos + 1
            # Missing positions between the last column and the end of the target
            while curr_checked < region.end:
                addToShallow(region.reference, curr_checked, prev_opened, shallow)
                curr_checked += 1
            # Store the area still opened at the end of the target (0-based to 1-based)
            if prev_opened["start"] is not None:
                shallow.append(
                    Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", region.reference)
                )
    return shallow
Exemplo n.º 3
0
    def getCDSFromTranscript(self):
        """
        Return CDS of the protein from the transcript and its exons. This function is used when CDS are not defined in the protein but exons and protein start and end are defined.

        Each exon overlapping the interval [self.start, self.end] produces one CDS clipped to this interval. The list is empty when the transcript has no exon or when no exon reaches the protein.

        :return: The list of CDS of the protein in protein strand order.
        :rtype: region.RegionList
        :raises Exception: If the protein is not linked to a transcript or if its start or end is missing.
        """
        # Check information completion
        if self.transcript is None:
            raise Exception(
                "A link with a transcript is required to return CDS for {}.".
                format(self))
        if self.start is None or self.end is None:
            raise Exception(
                "Start and end for {} are required to return CDS from transcript {}."
                .format(self, self.transcript))
        # Exons to CDS
        exons = sorted(self.transcript.children,
                       key=lambda x: (x.start, x.end))
        nb_exons = len(exons)
        # Skip the exons ending before the protein start. The index is bounded
        # to prevent an IndexError when there is no exon or when the protein
        # starts after the last exon.
        idx_exon = 0
        while idx_exon < nb_exons and self.start > exons[idx_exon].end:
            idx_exon += 1
        # Clip the following exons to the protein until one starts after the protein end
        cds = RegionList()
        while idx_exon < nb_exons and self.end >= exons[idx_exon].start:
            curr_exon = exons[idx_exon]
            cds_start = max(self.start, curr_exon.start)
            cds_end = min(self.end, curr_exon.end)
            cds.append(CDS(cds_start, cds_end, self.strand, self.reference))
            idx_exon += 1
        # Sort by strand order
        if self.strand == "-":
            cds = RegionList(
                sorted(cds, key=lambda x: (x.end, x.start), reverse=True))
        # Return
        return cds