Exemplo n.º 1
0
def getPrimersByChr(in_regions):
    """
    @summary: Returns the primers of the amplicons grouped by chromosome.
    @param in_regions: [str] Path to the amplicons design with their primers (format: BED). The zone of interest is defined by thickStart and thickEnd.
    @return: [dict] By chromosome an instance of RegionList containing the primers. Each primer has a "location" annotation: "upstream" or "downstream" (this information is strand + based).
    @note: An Exception is raised when an amplicon has no thickStart or no thickEnd.
    """
    primers_by_chr = {}
    with BEDIO(in_regions) as FH_in:
        for amplicon in FH_in:
            if amplicon.chrom not in primers_by_chr:
                primers_by_chr[amplicon.chrom] = RegionList()
            thick_is_missing = amplicon.thickStart is None or amplicon.thickEnd is None
            if thick_is_missing:
                raise Exception(
                    'The BED file "' + in_regions +
                    '" does not contains thickStart and thickEnd for all the amplicons.'
                )
            # Primer located before the zone of interest: from the amplicon start to the position preceding thickStart
            primers_by_chr[amplicon.chrom].append(
                Region(amplicon.start, amplicon.thickStart - 1,
                       amplicon.strand, amplicon.reference, None,
                       {"location": "upstream"})
            )
            # Primer located after the zone of interest: from the position following thickEnd to the amplicon end
            primers_by_chr[amplicon.chrom].append(
                Region(amplicon.thickEnd + 1, amplicon.end,
                       amplicon.strand, amplicon.reference, None,
                       {"location": "downstream"})
            )
    return primers_by_chr
Exemplo n.º 2
0
def mergedOverlapped(regions,
                     padding=0,
                     trace=False):
    """
    Merge the regions that overlap after each one has been extended by the padding.

    The regions are processed in (start, end) order. When a region overlaps the
    previously kept one, the kept region is extended in place (its ``end`` is
    updated) and the current region is dropped. The reference of the regions is
    not compared: all the regions are expected to come from the same reference.

    :param regions: Regions to merge.
    :type regions: iterable of Region
    :param padding: Number of positions virtually added on each side of every region before testing the overlap.
    :type padding: int
    :param trace: With True, each kept region that absorbed other regions stores in ``annot["merge_traceback"]`` a copy of itself before merge followed by a copy of every absorbed region.
    :type trace: bool
    :return: The kept regions, sorted by (start, end).
    :rtype: list
    """
    sorted_regions = sorted(regions, key=lambda x: (x.start, x.end))
    merged_regions = []
    prev_region = None  # No sentinel region: it could be merged with the first real region when padding is large
    for curr_region in sorted_regions:
        curr_start = max(1, curr_region.start - padding)
        # The overlap must be evaluated against the end of the PREVIOUS kept region
        if prev_region is not None and curr_start <= prev_region.end + padding:
            if trace:
                if "merge_traceback" not in prev_region.annot:
                    # First merge on this region: keep a copy of its original coordinates
                    prev_region.annot["merge_traceback"] = [
                        Region(prev_region.start, prev_region.end,
                               prev_region.strand, prev_region.reference,
                               prev_region.name)
                    ]
                prev_region.annot["merge_traceback"].append(
                    Region(curr_region.start, curr_region.end,
                           curr_region.strand, curr_region.reference,
                           curr_region.name))
            # Max to manage regions included in the previous one
            prev_region.end = max(curr_region.end, prev_region.end)
        else:
            merged_regions.append(curr_region)
            prev_region = curr_region
    return merged_regions
Exemplo n.º 3
0
    def testShallowFromAlignment(self):
        """
        Check the shallow areas reported by shallowFromAlignment() with a minimum depth of 1 in "reads" mode.

        The drawings below show, for each reference, the reads written in the
        SAM (stars: matches, dots: mismatches) and, on the dashed line, the
        selected region.

        art_chr1:
                10        20        30        40        50        60        70        80        90       100       110       120
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|12345678
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT
                   *******.************************************************** ******************************************.*********
                TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA
                                                                              ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG
                 ------------------------------------------------------------------------------------------------------------------

        art_chr2:
                10        20        30        40        50        60        70        80        90       100       110       120
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|123456789|12345678
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT               **********************         **********************
               ********************************************        ***************************************************
               AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAA        CATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC
                 -------------------------------------------------------------------------------------------

        art_chr3:
                10        20        30        40        50        60        70
        123456789|123456789|123456789|123456789|123456789|123456789|123456789|12
        ATGACTGAATATAAACTTGTGGTAGTTGGAGCTGGTGGCGTAGGCAAGAGTGCCTTGACGATACAGCTAAAT

                 -------------------------------
        """
        # Write the alignments: two reads on art_chr1, one read with a 8 nt deletion on art_chr2 and no read on art_chr3
        with open(self.tmp_sam, "w") as writer:
            writer.write("""@SQ	SN:art_chr1	LN:128
@SQ	SN:art_chr2	LN:128
@SQ	SN:art_chr3	LN:72
@PG	ID:bwa	PN:bwa	VN:0.7.17-r1188	CL:bwa mem ref.fasta reads.fasta
read_1	0	art_chr1	12	60	3S58M	*	0	0	TCGTAAACTTCTGGTAGTTGGAGCTGGTGTTTGCCATAAATAATACTAAATCATTTGAAGA	*	NM:i:1	MD:Z:7G50	AS:i:53	XS:i:0
read_2	0	art_chr1	71	60	52M	*	0	0	ATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCCTTTACGATACAG	*	NM:i:1	MD:Z:42G9	AS:i:47	XS:i:0
read_3	0	art_chr2	8	60	44M8D51M	*	0	0	AATATAAACTTGTGGTAGTTGGAGCTGGTGTTTGCCATAAATAACATTTGAAGATATTCACCATTATAGAGAACAAATGCGTAGGCAAGAGTGCC	*	NM:i:8	MD:Z:44^TACTAAAT51	AS:i:81	XS:i:0
""")
        # The function under test receives the BAM path: convert the SAM and index the result
        samToBam(self.tmp_sam, self.tmp_bam)
        pysam.index(self.tmp_bam)

        # Logger stub exposing only info(): the messages are discarded
        class FakeLogger:
            def info(self, msg):
                pass

        # One target by reference (1-based coordinates, see dashed lines in the docstring)
        selected_regions = RegionList([
            Region(10, 123, None, "art_chr1"),
            Region(10, 100, None, "art_chr2"),
            Region(10, 40, None, "art_chr3"),
        ])

        # No shallow area is expected on art_chr2; the whole target of art_chr3 is expected
        expected = [
            "art_chr1:10-11",
            "art_chr1:70-70",
            "art_chr1:123-123",
            "art_chr3:10-40",
        ]
        observed = [
            str(elt) for elt in shallowFromAlignment(
                self.tmp_bam, selected_regions, "reads", 1, FakeLogger())
        ]
        # The comparison does not depend on the order of the returned regions
        self.assertEqual(sorted(expected), sorted(observed))
Exemplo n.º 4
0
 def testGetPosOnRef(self):
     """Check Region.getPosOnRef() on the first, an inner and the last nucleotide of a region for both strands."""
     # By strand: pairs (position on region, expected position on reference)
     cases_by_strand = [
         ("+", [(1, 9), (3, 11), (7, 15)]),
         ("-", [(1, 15), (3, 13), (7, 9)]),
     ]
     for strand, pos_pairs in cases_by_strand:
         stranded_region = Region(9, 15, strand)
         for pos_on_region, expected_pos_on_ref in pos_pairs:
             self.assertEqual(
                 stranded_region.getPosOnRef(pos_on_region),
                 expected_pos_on_ref
             )
     # The out of region case (position 8 on the region) is not evaluated by this test
Exemplo n.º 5
0
 def testSplittedByRef(self):
     """Check that splittedByRef() groups the regions by reference and keeps them all."""
     reg_by_chr = splittedByRef(
         RegionList([
             Region(10, 30, "-", "chr1", "region1"),
             Region(40, 70, "-", "chr1", "region2"),
             Region(80, 100, "-", "chr2", "region3")
         ])
     )
     # Flatten the groups as "chrom:region_name", references being visited in sorted order
     observed = [
         "{}:{}".format(chrom, curr_region.name)
         for chrom, regions in sorted(reg_by_chr.items())
         for curr_region in regions
     ]
     self.assertEqual(
         ["chr1:region1", "chr1:region2", "chr2:region3"],
         observed
     )
Exemplo n.º 6
0
def getFragmentRegion(chrom_seq, target, target_seq, start_pos, fragment_len):
    """
    Return the region covered by a fragment and, in its annotation "seq", the sequence of this fragment.

    Outside the target the sequence is read from chrom_seq. On the target it is
    provided by getPartialFragment() applied to target_seq.

    :param chrom_seq: Sequence of the chromosome (positions are 1-based, indexes are 0-based).
    :param target: The targeted region (start, end and reference are used).
    :param target_seq: Sequence used for the part of the fragment located on the target.
    :param start_pos: Start position of the fragment on the chromosome (1-based).
    :param fragment_len: Length of the fragment.
    :return: Region from start_pos to the fragment end, on the target reference, annotated with {"seq": fragment sequence}.
    """
    no_overlap_end = start_pos + fragment_len - 1
    if start_pos > target.end or no_overlap_end < target.start:
        # The fragment starts after the target or ends before it: only the chromosome sequence is used
        end_pos = no_overlap_end
        fragment_seq = chrom_seq[start_pos - 1:end_pos]
    else:
        # The fragment overlaps the target
        before_target_seq = ""
        start_idx_on_target = start_pos - target.start
        if start_pos < target.start:
            # Part of the fragment located before the target
            start_idx_on_target = 0
            before_target_seq = chrom_seq[start_pos - 1:target.start - 1]
        # Part of the fragment located on the target
        on_target_seq, end_idx, missing_len = getPartialFragment(
            target_seq, start_idx_on_target,
            fragment_len - len(before_target_seq))
        fragment_seq = before_target_seq + on_target_seq
        end_pos = target.start + end_idx + missing_len
        if missing_len > 0:
            # Part of the fragment located after the target: from position target.end + 1 on missing_len nt
            fragment_seq += chrom_seq[target.end:target.end + missing_len]
    return Region(start_pos, end_pos, None, target.reference, None,
                  {"seq": fragment_seq})
Exemplo n.º 7
0
def getTargets(in_aln, in_targets=None):
    """
    Return the list of targeted regions.

    Without targets file, one region covering the whole sequence is returned
    for each reference declared in the header of the alignment file. With a
    targets file, the regions are returned sorted by (reference name, start,
    end) after checking that they do not overlap.

    :param in_aln: Path to the alignment file (format: SAM/BAM).
    :type in_aln: str
    :param in_targets: Path to the targeted regions (format: BED). They must not contains any overlap.
    :type in_targets: str
    :return: List of targeted regions.
    :rtype: anacore.region.RegionList
    :raises Exception: When two targets of the same reference overlap.
    """
    if in_targets is None:
        selected_regions = RegionList()
        with pysam.AlignmentFile(in_aln, "rb") as FH_bam:
            for ref_info in FH_bam.header["SQ"]:
                selected_regions.append(
                    Region(1, ref_info["LN"], "+", ref_info["SN"], ref_info["SN"])
                )
        return selected_regions
    sorted_regions = sorted(
        getAreas(in_targets),
        key=lambda x: (x.reference.name, x.start, x.end)
    )
    # Check lack of overlap: after the sort, any overlap implies an overlap
    # between two consecutive regions. zip() yields nothing for an empty or a
    # single-region list, so an empty targets file no longer raises IndexError.
    for prev_region, curr_region in zip(sorted_regions, sorted_regions[1:]):
        if curr_region.reference.name == prev_region.reference.name:
            if prev_region.end >= curr_region.start:
                raise Exception("The regions {} and {} contains an overlap.".format(prev_region, curr_region))
    # sorted() returns a plain list: wrap it to return the documented type in both branches
    return RegionList(sorted_regions)
Exemplo n.º 8
0
def addToShallow(curr_chr, curr_pos, prev_opened, shallows):
    """
    Extend the opened shallow frame with the current position when they are consecutive. Otherwise store the opened frame as a shallow area and open a new frame on the current position.

    :param curr_chr: Name of the current region.
    :type curr_chr: str
    :param curr_pos: The current position with low DP (0-based).
    :type curr_pos: int
    :param prev_opened: The previous shallow frame ({"start": x, "end": y}). It is updated in place.
    :type prev_opened: dict
    :param shallows: The list of shallows areas. The closed frame is appended to it with 1-based coordinates.
    :type shallows: anacore.region.RegionList
    """
    frame_is_opened = prev_opened["start"] is not None
    if frame_is_opened and prev_opened["end"] == curr_pos - 1:
        # Consecutive position: the opened frame is only extended
        prev_opened["end"] = curr_pos
        return
    if frame_is_opened:
        # Gap with the opened frame: it is closed and stored (0-based to 1-based)
        closed_area = Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", curr_chr)
        shallows.append(closed_area)
    # Open a new frame limited to the current position
    prev_opened["start"] = curr_pos
    prev_opened["end"] = curr_pos
Exemplo n.º 9
0
 def testGetMinDist(self):
     """Check Region.getMinDist() with an overlapping, a contiguous and a distant region, and with another reference."""
     region = Region(9, 15, "+", "chr1")
     # (start of the other region, end of the other region, expected distance)
     same_ref_cases = [
         (14, 18, 0),
         (16, 18, 1),
         (1, 5, 4),
     ]
     for other_start, other_end, expected_dist in same_ref_cases:
         other = Region(other_start, other_end, "+", "chr1")
         self.assertEqual(region.getMinDist(other), expected_dist)
     # The distance between regions from different references must raise
     with self.assertRaises(Exception):
         region.getMinDist(Region(1, 5, "+", "chr2"))
Exemplo n.º 10
0
def shallowFromAlignment(aln_path, selected_regions, depth_mode, min_depth, log):
    """
    Return the list of shallow regions from the alignment file.

    :param aln_path: Path to the alignment file (format: SAM/BAM).
    :type aln_path: str
    :param selected_regions: Targeted regions. They must not contains any overlap between them.
    :type selected_regions: anacore.region.RegionList
    :param depth_mode: How count the depth: by reads (each reads is added independently) or by fragment (the R1 and R2 coming from the same pair are counted only once).
    :type depth_mode: str
    :param min_depth: All the locations with a depth under this value are reported in shallows areas.
    :type min_depth: int
    :param log: Logger of the script.
    :type log: logging.Logger
    :return: List of shallow regions.
    :rtype: anacore.region.RegionList
    """
    shallow = RegionList()
    nb_selected_regions = len(selected_regions)
    idx_in_part = 1
    with pysam.AlignmentFile(aln_path, "rb") as FH_bam:
        for idx_region, region in enumerate(selected_regions):
            # Progress is logged roughly every tenth of the regions
            if idx_in_part > nb_selected_regions / 10:
                idx_in_part = 0
                log.info("Processed regions {}/{}.".format(idx_region + 1, nb_selected_regions))
            idx_in_part += 1
            prev_opened = {"start": None, "end": None}
            curr_checked = region.start - 1  # Next position to evaluate (0-based)
            # pysam takes a 0-based start and an exclusive stop: the 1-based
            # region [start, end] is [start - 1, end). With "end - 1" the reads
            # starting on the last position of the target were not retrieved.
            for pileupcolumn in FH_bam.pileup(region.reference.name, region.start - 1, region.end, max_depth=100000000):
                ref_pos = pileupcolumn.reference_pos  # 0-based
                # The columns can extend beyond the target: they are skipped
                if ref_pos + 1 < region.start or ref_pos + 1 > region.end:
                    continue
                # Missing positions: positions without column have no read
                while curr_checked < ref_pos:
                    addToShallow(region.reference, curr_checked, prev_opened, shallow)
                    curr_checked += 1
                # Current position
                curr_reads_depth = 0
                curr_frag = set()
                for pileupread in pileupcolumn.pileups:
                    alignment = pileupread.alignment
                    if not alignment.is_secondary and not alignment.is_duplicate and not pileupread.is_refskip:
                        curr_reads_depth += 1
                        curr_frag.add(alignment.query_name)
                curr_depth = curr_reads_depth
                if depth_mode == "fragment":
                    # The reads with the same name are counted once
                    curr_depth = len(curr_frag)
                if min_depth > curr_depth:
                    addToShallow(region.reference, ref_pos, prev_opened, shallow)
                curr_checked = ref_pos + 1
            # Missing positions between the last column and the end of the target
            while curr_checked < region.end:
                addToShallow(region.reference, curr_checked, prev_opened, shallow)
                curr_checked += 1
            # Close the frame still opened at the end of the target (0-based to 1-based)
            if prev_opened["start"] is not None:
                shallow.append(
                    Region(prev_opened["start"] + 1, prev_opened["end"] + 1, "+", region.reference)
                )
    return shallow
Exemplo n.º 11
0
 def testGetPosOnRegion(self):
     """Check Region.getPosOnRegion() inside and outside of the region for both strands."""
     # By strand: pairs (position on reference, expected position on region)
     cases_by_strand = [
         ("+", [(9, 1), (11, 3), (15, 7)]),
         ("-", [(15, 1), (13, 3), (9, 7)]),
     ]
     out_of_region = (8, 16)  # Positions just before and just after the region
     for strand, pos_pairs in cases_by_strand:
         stranded_region = Region(9, 15, strand, "chr1")
         for pos_on_ref, expected_pos_on_region in pos_pairs:
             self.assertEqual(
                 stranded_region.getPosOnRegion(pos_on_ref),
                 expected_pos_on_region
             )
         for pos_on_ref in out_of_region:
             with self.assertRaises(ValueError):
                 stranded_region.getPosOnRegion(pos_on_ref)
Exemplo n.º 12
0
    def _parseLine(self):
        """
        Return a structured record from the TopHatFusionIO current line.

        The line must contain 7 sections separated by '@'. Only the first one
        is used: it must contain 11 tab-separated fields.

        :return: The record: the two breakpoints as 1 nt regions ("partner_a", "partner_b") and the supports as integers.
        :rtype: dict
        """
        sections = [section.strip() for section in self.current_line.split('@')]
        # The unpacking enforces the number of sections even if the last six are ignored
        fusion, _trash, _contig_a, _contig_b, _depth_a, _depth_b, _mate_distances = sections
        fusion_fields = [field.strip() for field in fusion.split("\t")]
        (chrom, break_a, break_b, orientation, nb_splitted_reads,
         nb_splitted_pairs, nb_pairs_splitted_reads, nb_contradict,
         base_cover_left, base_cover_right, _trash) = fusion_fields
        chrom_a, chrom_b = chrom.split("-")
        pos_a = int(break_a)
        pos_b = int(break_b)
        # One character by partner: "f" for the forward strand, anything else for the reverse strand
        strand_a, strand_b = [("+" if elt == "f" else "-") for elt in orientation]
        record = {
            "partner_a": Region(pos_a, pos_a, strand_a, chrom_a),
            "partner_b": Region(pos_b, pos_b, strand_b, chrom_b)
        }
        supports = (
            ("nb_splitted_reads", nb_splitted_reads),
            ("nb_splitted_pairs", nb_splitted_pairs),
            ("nb_pairs_splitted_reads", nb_pairs_splitted_reads),
            ("nb_contradict", nb_contradict),
            ("base_cover_left", base_cover_left),
            ("base_cover_right", base_cover_right)
        )
        for key, raw_value in supports:
            record[key] = int(raw_value)
        return record
Exemplo n.º 13
0
def getVariantRegion(variant):
    """
    @summary: Returns the region covered by the reference allele of the variant after normalization.
    @param variant: [VCFRecord] The variant. It is not modified: the normalization is applied on a copy.
    @return: [Region] The region object corresponding to the variant.
    @warnings: This function can only be used on variant with only one alternative allele.
    """
    std_variant = deepcopy(variant)
    std_variant.normalizeSingleAllele()
    start = std_variant.pos
    # Works also with normalized insertion
    end = start + len(std_variant.ref) - 1
    return Region(start, end, None, std_variant.chrom)
Exemplo n.º 14
0
 def testConsolidate(self):
     """Check consolidated() when only overlapping regions are merged and when contiguous regions are merged too."""
     # (start, end, strand, reference, name)
     region_specs = [
         (5, 9, "-", "chr1", "region1"),
         (10, 30, "-", "chr1", "region2"),
         (30, 40, "-", "chr1", "region3"),
         (35, 39, "-", "chr1", "region4"),
         (40, 70, "-", "chr1", "region5"),
         (71, 90, "-", "chr1", "region6"),
         (92, 100, "-", "chr1", "region7"),
         (100, 100, "+", "chr1", "region8"),
         (80, 100, "-", "chr2", "region9")
     ]
     reg_list = RegionList([Region(*spec) for spec in region_specs])
     # (second argument of consolidated(), expected coordinates)
     cases = [
         # Merge overlapping
         (False, ["chr1:5-9[-]", "chr1:10-70[-]", "chr1:71-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]),
         # Merge overlapping and contiguous
         (True, ["chr1:5-90[-]", "chr1:92-100[None]", "chr2:80-100[-]"]),
     ]
     for merge_contiguous, expected in cases:
         consolidated_reg = consolidated(reg_list, merge_contiguous)
         observed = [curr.getCoordinatesStr() for curr in consolidated_reg]
         self.assertEqual(expected, observed)
Exemplo n.º 15
0
def variantsRegionFromVCF(vcf_path, min_count=1, symbol="GENE", hgvsc="CDS", hgvsp="AA", count="CNT"):
    """
    Return the region object corresponding to the known variants in a VCF.

    A record is skipped when its symbol contains "_ENST" or when its count is
    lower than min_count. A record without symbol or without count is kept.

    :param vcf_path: Path to the variants file (format: VCF).
    :type vcf_path: str
    :param min_count: Minimum number of samples where the variant is known in the databases to use its information.
    :type min_count: int
    :param symbol: Tag used in VCF.info to store the symbol of the gene.
    :type symbol: str
    :param hgvsc: Tag used in VCF.info to store the HGVSc.
    :type hgvsc: str
    :param hgvsp: Tag used in VCF.info to store the HGVSp.
    :type hgvsp: str
    :param count: Tag used in VCF.info to store the number of database's samples with this variant.
    :type count: str
    :return: List of variants regions.
    :rtype: anacore.region.RegionList
    """
    def infoOrDefault(record, tag, default):
        """Return the value of the tag in record.info, or default when the tag is missing."""
        if tag not in record.info:
            return default
        return record.info[tag]

    variants_region = []
    with VCFIO(vcf_path) as FH_in:
        for record in FH_in:
            if symbol in record.info and "_ENST" in record.info[symbol]:
                continue
            if count in record.info and int(record.info[count]) < min_count:
                continue
            annotations = {
                "id": record.id,
                "gene": infoOrDefault(record, symbol, ""),
                "HGVSp": infoOrDefault(record, hgvsp, ""),
                "HGVSc": infoOrDefault(record, hgvsc, ""),
                "count": (None if count not in record.info else int(record.info[count]))
            }
            # NOTE(review): the end is pos + len(ref) and not pos + len(ref) - 1 — confirm that the additional position is intended
            variants_region.append(
                Region(
                    record.pos,
                    record.pos + len(record.ref),
                    None,
                    record.chrom,
                    record.id,
                    annotations
                )
            )
    return RegionList(variants_region)
Exemplo n.º 16
0
def exonsPos(record, genes_by_chr):
    """
    Return by positions of exons boundaries overlapped by the breakend, the number of alternative transcripts with this exon boundaries.

    Only the exons on the same strand as the breakend are evaluated.

    :param record: Breakend record with CIPOS.
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: By positions of exons boundaries overlapped by the breakend, the number of alternative transcripts with this exon boundaries.
    :rtype: dict
    """
    record_strand = getStrand(record)
    exons_pos = {}
    start, end = getBNDInterval(record)
    interval_region = Region(start, end, None, record.chrom, record.getName())
    if record.chrom not in genes_by_chr:
        return exons_pos
    for curr_gene in genes_by_chr[record.chrom].getOverlapped(interval_region):
        for curr_transcript in curr_gene.children.getOverlapped(interval_region):
            for subregion in curr_transcript.children.getOverlapped(interval_region):
                is_exon = issubclass(subregion.__class__, Exon) if record_strand == subregion.strand else False
                if not is_exon:
                    continue
                # The start then the end of the exon are counted when they are located in the breakend interval
                for boundary in (subregion.start, subregion.end):
                    if interval_region.start <= boundary and interval_region.end >= boundary:
                        exons_pos[boundary] = exons_pos.get(boundary, 0) + 1
    return exons_pos
Exemplo n.º 17
0
 def testStrandedContains(self):
     """Check that Region.strandedContains() is true for an included region only when the strands are identical."""
     for container_strand in ("+", "-"):
         container = Region(9, 15, container_strand, "chr1")
         for query_strand in ("+", "-"):
             included = Region(9, 12, query_strand, "chr1")
             self.assertEqual(
                 container.strandedContains(included),
                 container_strand == query_strand
             )
Exemplo n.º 18
0
 def testHasStrandedOverlap(self):
     """Check that Region.hasStrandedOverlap() is true for an overlapping region only when the strands are identical."""
     for evaluated_strand in ("+", "-"):
         evaluated = Region(9, 15, evaluated_strand, "chr1")
         for query_strand in ("+", "-"):
             overlapping = Region(5, 12, query_strand, "chr1")
             self.assertEqual(
                 evaluated.hasStrandedOverlap(overlapping),
                 evaluated_strand == query_strand
             )
Exemplo n.º 19
0
 def testHasOverlap(self):
     """Check hasOverlap() of the forward and reverse protein fixtures against the same series of query regions."""
     # (start of the query, end of the query, expected overlap)
     queries = [
         (8, 9, False),
         (9, 9, False),
         (9, 10, True),
         (10, 10, True),
         (12, 12, True),
         (12, 29, True),
         (30, 30, True),
         (30, 31, True),
         (31, 31, False),
         (37, 39, False),
         (39, 39, False),
         (39, 40, True),
         (40, 40, True),
         (98, 99, True),
         (100, 102, False),
     ]
     # Forward then reverse: the query has the strand of the evaluated fixture
     for strand, fixture_name in (("+", "fwd"), ("-", "rvs")):
         for query_start, query_end, expected in queries:
             protein = getattr(self, fixture_name)["protein"]
             self.assertEqual(
                 protein.hasOverlap(Region(query_start, query_end, strand, "chr1")),
                 expected
             )
Exemplo n.º 20
0
    )
    args = parser.parse_args()

    # Get transcripts
    gene_by_tr = getGeneByRefTr(args.input_reference_tr)
    selected_transcripts = getTranscriptAnnot(args.input_annotation,
                                              gene_by_tr)
    tr_by_chr = splittedByRef(selected_transcripts)
    # Write renamed regions
    out_nb_col = BEDIO.getMaxNbCol(args.input_regions)
    if out_nb_col == 3:
        out_nb_col = 4
    with BEDIO(args.input_regions) as FH_regions:
        with BEDIO(args.output_regions, "w", out_nb_col) as FH_out:
            for record_idx, record in enumerate(FH_regions):
                target = Region(record.start, record.end, record.strand,
                                record.chrom)
                if args.is_thick_based and record.thickStart is not None and record.thickEnd is not None:
                    target.start = record.thickStart
                    target.end = record.thickEnd
                overlapped_tr = list()
                if record.chrom in tr_by_chr:
                    overlapped_tr = tr_by_chr[record.chrom].getOverlapped(
                        target)
                if len(overlapped_tr) > 1:
                    warnings.warn(
                        "The region {} overlaps several transcripts ({}).".
                        format(target, [str(tr) for tr in overlapped_tr]))
                if len(overlapped_tr) >= 1:
                    overlapped_exons = overlapped_tr[0].children.getOverlapped(
                        target)
                    features = list()
Exemplo n.º 21
0
 def testHasOverlap(self):
     """Check Region.hasOverlap() with queries inside, on the borders, around, outside and on another reference."""
     region = Region(9, 15, "+", "chr1")
     # (start of the query, end of the query, reference of the query, expected overlap)
     queries = [
         (9, 9, "chr1", True),
         (15, 15, "chr1", True),
         (12, 13, "chr1", True),
         (9, 15, "chr1", True),
         (8, 14, "chr1", True),
         (10, 16, "chr1", True),
         (8, 16, "chr1", True),
         (8, 8, "chr1", False),
         (16, 16, "chr1", False),
         (12, 13, "chr2", False),
     ]
     for query_start, query_end, query_ref, expected in queries:
         query = Region(query_start, query_end, "+", query_ref)
         self.assertEqual(region.hasOverlap(query), expected)
Exemplo n.º 22
0
    def testGetTranscriptsAnnot_withoutUTR_threeExons(self):
        """
        Check getTranscriptsAnnot() on a transcript with three exons and no UTR.

        The same nine query regions are annotated twice against the same
        transcript: first with exons and CDS set on strand "+", then on
        strand "-". For each query the single returned annotation must be
        equal to the expected one.
        """
        # Transcript model: each CDS has the same coordinates as its exon
        exons = [
            Exon(10, 40, "+", "chr1", "fwd_exon_1"),
            Exon(91, 150, "+", "chr1", "fwd_exon_2"),
            Exon(201, 361, "+", "chr1", "fwd_exon_3"),
        ]
        coding_regions = [
            CDS(10, 40, "+", "chr1", "fwd_cds_1"),
            CDS(91, 150, "+", "chr1", "fwd_cds_2"),
            CDS(201, 361, "+", "chr1", "fwd_cds_3"),
        ]
        gene_1 = Gene(10, 350, None, "chr1", "gene_1", {"id": "g_1"})
        transcrit_1 = Transcript(
            None, None, None, "chr1", "transcrit_1", {"id": "tr_1"},
            parent=gene_1,
            children=exons
        )
        protein_1 = Protein(
            None, None, None, "chr1", "protein_1",
            children=coding_regions,
            transcript=transcrit_1
        )

        # Queries: (start, end, name, description)
        queries = [
            Region(q_start, q_end, None, "chr1", q_name, {"desc": q_desc})
            for q_start, q_end, q_name, q_desc in [
                (80, 100, "query_1", "starts before exon_2 ; ends in exon_2."),
                (100, 180, "query_2", "starts in exon_2 ; ends after exon_2."),
                (91, 150, "query_3", "starts at the start of exon_2 ; ends at the end of exon_2."),
                (80, 170, "query_4", "starts before exon_2 ; ends after exon_2."),
                (80, 230, "query_5", "starts before exon_2 ; ends in exon_3."),
                (100, 400, "query_6", "starts in exon_2 ; ends after exon_3."),
                (100, 250, "query_7", "starts in exon_2 ; ends in exon_3."),
                (80, 370, "query_8", "starts before exon_2 ; ends after exon_3."),
                (90, 151, "query_9", "starts just before exon_2 ; ends just after exon_2."),
            ]
        ]

        # Expected values are stored as tuples following this fields order
        position_fields = (
            "start_EXON", "start_INTRON", "start_Protein_position",
            "end_EXON", "end_INTRON", "end_Protein_position"
        )
        # Fields shared by every expected annotation (STRAND is added by scenario)
        shared_fields = {
            "SYMBOL": "gene_1",
            "Gene": "g_1",
            "Feature": "tr_1",
            "Feature_type": "Transcript"
        }
        # Expected forward 3 exons
        forward_positions = {
            "query_1": (None, "1/2", 11, "2/3", None, 14),
            "query_2": ("2/3", None, 14, None, "2/2", 31),
            "query_3": ("2/3", None, 11, "2/3", None, 31),
            "query_4": (None, "1/2", 11, None, "2/2", 31),
            "query_5": (None, "1/2", 11, "3/3", None, 41),
            "query_6": ("2/3", None, 14, "3/3", None, 84),
            "query_7": ("2/3", None, 14, "3/3", None, 47),
            "query_8": (None, "1/2", 11, "3/3", None, 84),
            "query_9": (None, "1/2", 11, None, "2/2", 31),
        }
        # Expected reverse 3 exons
        reverse_positions = {
            "query_1": ("2/3", None, 71, None, "2/2", 74),
            "query_2": (None, "1/2", 54, "2/3", None, 71),
            "query_3": ("2/3", None, 54, "2/3", None, 74),
            "query_4": (None, "1/2", 54, None, "2/2", 74),
            "query_5": ("1/3", None, 44, None, "2/2", 74),
            "query_6": ("1/3", None, 1, "2/3", None, 71),
            "query_7": ("1/3", None, 38, "2/3", None, 71),
            "query_8": ("1/3", None, 1, None, "2/2", 74),
            "query_9": (None, "1/2", 54, None, "2/2", 74),
        }

        scenarios = [
            ("+", "1", forward_positions),
            ("-", "-1", reverse_positions),
        ]
        for strand, strand_tag, positions_by_query in scenarios:
            # Build the complete expected annotations for this strand
            expected = {}
            for q_name, positions in positions_by_query.items():
                annot = dict(zip(position_fields, positions))
                annot.update(shared_fields)
                annot["STRAND"] = strand_tag
                expected[q_name] = annot
            # Apply strand on the model
            for exon in transcrit_1.children:
                exon.strand = strand
            for cds in protein_1.children:
                cds.strand = strand
            transcrit_1.sortChildren()
            protein_1.sortChildren()
            # Assert
            for curr_query in queries:
                annotations = getTranscriptsAnnot(curr_query, [transcrit_1])
                self.assertEqual([expected[curr_query.name]], annotations)
Exemplo n.º 23
0
    def testGetTranscriptsAnnot_withoutUTR_oneExon(self):
        """
        Check getTranscriptsAnnot() on a transcript with one exon and no UTR.

        The same five query regions are annotated twice against the same
        transcript: first with the exon and the CDS set on strand "+", then
        on strand "-". For each query the single returned annotation must be
        equal to the expected one.
        """
        # Transcript model: the CDS has the same coordinates as the exon
        exon_1 = Exon(91, 150, "+", "chr1", "exon_2")
        cds_1 = CDS(91, 150, "+", "chr1", "cds_1")
        gene_1 = Gene(10, 350, None, "chr1", "gene_1", {"id": "g_1"})
        transcrit_1 = Transcript(
            None, None, None, "chr1", "transcrit_1", {"id": "tr_1"},
            parent=gene_1,
            children=[exon_1]
        )
        protein_1 = Protein(
            None, None, None, "chr1", "protein_2",
            children=[cds_1],
            transcript=transcrit_1
        )

        # Queries: (start, end, name, description)
        queries = [
            Region(q_start, q_end, None, "chr1", q_name, {"desc": q_desc})
            for q_start, q_end, q_name, q_desc in [
                (80, 160, "query_1", "starts before exon_1 ; ends after exon_1."),
                (91, 150, "query_2", "starts at start of exon_1 ; ends at end of exon_1."),
                (100, 110, "query_3", "starts in exon_1 ; ends in exon_1."),
                (80, 100, "query_4", "starts before exon_1 ; ends in exon_1."),
                (110, 200, "query_5", "starts in exon_1 ; ends after exon_1."),
            ]
        ]

        # Every expected annotation is on exon 1/1 and never on intron: only
        # the protein positions (start, end) change between queries.
        # Expected forward 1 exon
        forward_prot_pos = {
            "query_1": (1, 20),
            "query_2": (1, 20),
            "query_3": (4, 7),
            "query_4": (1, 4),
            "query_5": (7, 20),
        }
        # Expected reverse 1 exon
        reverse_prot_pos = {
            "query_1": (1, 20),
            "query_2": (1, 20),
            "query_3": (14, 17),
            "query_4": (17, 20),
            "query_5": (1, 14),
        }

        scenarios = [
            ("+", "1", forward_prot_pos),
            ("-", "-1", reverse_prot_pos),
        ]
        for strand, strand_tag, prot_pos_by_query in scenarios:
            # Build the complete expected annotations for this strand
            expected = {
                q_name: {
                    "start_EXON": "1/1",
                    "start_INTRON": None,
                    "start_Protein_position": start_prot_pos,
                    "end_EXON": "1/1",
                    "end_INTRON": None,
                    "end_Protein_position": end_prot_pos,
                    "SYMBOL": "gene_1",
                    "Gene": "g_1",
                    "Feature": "tr_1",
                    "Feature_type": "Transcript",
                    "STRAND": strand_tag
                }
                for q_name, (start_prot_pos, end_prot_pos) in prot_pos_by_query.items()
            }
            # Apply strand on the model
            for exon in transcrit_1.children:
                exon.strand = strand
            for cds in protein_1.children:
                cds.strand = strand
            transcrit_1.sortChildren()
            protein_1.sortChildren()
            # Assert
            for curr_query in queries:
                annotations = getTranscriptsAnnot(curr_query, [transcrit_1])
                self.assertEqual([expected[curr_query.name]], annotations)
Exemplo n.º 24
0
def isReadthrough(up, down, annotation_field, genes, rt_max_dist, annCmpName, regCmpName):
    """
    Return True if the two breakends can be a readthrough.

    The fusion is kept as a possible readthrough when: the two breakends are
    on the same chromosome and on the same strand, the interval covering them
    is not larger than rt_max_dist, each breakend is annotated by at least one
    gene on the fusion strand that does not annotate the other breakend, and
    no other gene of the interval on this strand is found without overlap with
    both the start gene and the end gene.

    :param up: The breakend of the first shard in fusion.
    :type up: anacore.vcf.VCFRecord
    :param down: The breakend of the second shard in fusion.
    :type down: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param genes: The genes regions by chr.
    :type genes: AnnotGetter
    :param rt_max_dist: Maximum distance to evaluate if the fusion is a readthrough.
    :type rt_max_dist: int
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends can be a readthrough.
    :rtype: boolean
    """
    if up.chrom != down.chrom:
        return False
    up_strand = getStrand(up, True)
    down_strand = getStrand(down, False)
    if (up_strand, down_strand) not in (("+", "+"), ("-", "-")):  # Readthrough are +/+ or -/-
        return False
    # Order the breakends by position on the reference
    first, second = (down, up) if up.pos > down.pos else (up, down)
    # Interval covering the two breakends
    first_start, first_end = getBNDInterval(first)
    second_start, second_end = getBNDInterval(second)
    interval_start = min(first_start, second_start)
    interval_end = max(first_end, second_end) + 1
    if interval_end - interval_start > rt_max_dist:
        return False
    # Genes specific to each breakend
    first_annots = first.info[annotation_field]
    second_annots = second.info[annotation_field]
    first_bp_gene = {annCmpName(annot) for annot in first_annots}
    second_bp_gene = {annCmpName(annot) for annot in second_annots}
    full_overlapping_gene = first_bp_gene & second_bp_gene
    only_first_bp_gene = first_bp_gene - second_bp_gene
    only_second_bp_gene = second_bp_gene - first_bp_gene
    if not only_first_bp_gene or not only_second_bp_gene:
        return False
    # Keep only the specific genes located on the strand of the fusion
    strand_by_gene = {annCmpName(annot): annot["STRAND"] for annot in first_annots + second_annots}
    only_first_bp_gene = {name for name in only_first_bp_gene if strand_by_gene[name] == up_strand}
    only_second_bp_gene = {name for name in only_second_bp_gene if strand_by_gene[name] == up_strand}
    if not only_first_bp_gene or not only_second_bp_gene:
        return False
    # Genes of the interval on the fusion strand, except the ones annotating both breakends
    interval_region = Region(interval_start, interval_end, up_strand, first.chrom)
    candidates = genes.getChr(first.chrom).getOverlapped(interval_region)
    overlapped_genes = RegionList([
        gene for gene in candidates
        if regCmpName(gene) not in full_overlapping_gene and gene.strand == up_strand
    ])
    overlapped_genes_by_id = {regCmpName(gene): gene for gene in overlapped_genes}
    # A third gene without overlap with the start gene and with the end gene contradicts the readthrough
    contradict_readthrough = False
    for start_gene_id in only_first_bp_gene:
        start_gene = overlapped_genes_by_id[start_gene_id]
        for end_gene_id in only_second_bp_gene:
            end_gene = overlapped_genes_by_id[end_gene_id]
            for interval_gene in overlapped_genes:
                if regCmpName(interval_gene) == regCmpName(start_gene):
                    continue
                if regCmpName(interval_gene) == regCmpName(end_gene):
                    continue
                if not (interval_gene.hasOverlap(start_gene) or interval_gene.hasOverlap(end_gene)):
                    contradict_readthrough = True
    return not contradict_readthrough
Exemplo n.º 25
0
 def testLength(self):
     """Check Region.length() for each strand value, for a region without end and for a one-position region."""
     # (arguments of Region, expected length)
     cases = [
         ((9, 15, None), 7),
         ((9, 15, "+"), 7),
         ((9, 15, "-"), 7),
         ((9, None, "-"), 1),
         ((9, 9, "-"), 1),
     ]
     for region_args, expected_length in cases:
         self.assertEqual(Region(*region_args).length(), expected_length)
Exemplo n.º 26
0
def _addUTRAnnot(curr_annot, curr_protein, bnd_pos):
    """
    Tag an annotation as located in UTR: add "&utr" to RNA_ELT_TYPE, add the UTR side (5prim or 3prim) to RNA_ELT_POS and, only for 5prim, set CDS_DIST.

    :param curr_annot: Annotation updated in place. The keys RNA_ELT_TYPE and RNA_ELT_POS must already contain a string.
    :type curr_annot: dict
    :param curr_protein: Protein of the annotated transcript.
    :type curr_protein: Protein
    :param bnd_pos: Position of the breakend on the reference.
    :type bnd_pos: int
    """
    curr_annot["RNA_ELT_TYPE"] += "&utr"
    if curr_protein.strand == "+":
        # On forward strand the 5' UTR is before the start of the protein
        is_5prim = curr_protein.start > bnd_pos
        curr_annot["RNA_ELT_POS"] += "&" + ("5prim" if is_5prim else "3prim")
        if is_5prim:
            curr_annot["CDS_DIST"] = getDistBeforeCDSForward(bnd_pos, curr_protein)
    else:
        # On the other strand the 5' UTR is after the end of the protein
        is_5prim = curr_protein.end < bnd_pos
        curr_annot["RNA_ELT_POS"] += "&" + ("5prim" if is_5prim else "3prim")
        if is_5prim:
            curr_annot["CDS_DIST"] = getDistBeforeCDSReverse(bnd_pos, curr_protein)


def getGeneAnnot(record, genes_by_chr):
    """
    Return genomic items overlapped by the BND record.

    Each annotation is a dict with the keys: SYMBOL, Gene, Feature, Feature_type, STRAND, Protein, RNA_ELT_TYPE, RNA_ELT_POS, CDS_position, Protein_position and Codon_position. The key CDS_DIST is added only when the breakend is in 5' UTR.

    :param record: The BND record. The annotated position is read from record.info["ANNOT_POS"].
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: The list of annotations (one annotation by overlapped transcript).
    :rtype: list
    """
    record_strand = getStrand(record)
    shard_before_bnd = shardIsBeforeBND(record)
    bnd_region = Region(record.info["ANNOT_POS"], None, None, record.chrom, record.getName())
    annotations = []
    if record.chrom not in genes_by_chr:
        return annotations
    overlapped_genes = genes_by_chr[record.chrom].getOverlapped(bnd_region)
    for curr_gene in overlapped_genes:
        overlapped_transcripts = curr_gene.children.getOverlapped(bnd_region)
        if len(overlapped_transcripts) == 0:
            # Logger.warn is a deprecated alias of Logger.warning
            log.warning(
                "The breakpoint {} is contained by gene {} but by 0 of these transcripts.".format(
                    bnd_region, curr_gene
                )
            )
            continue
        for curr_transcript in overlapped_transcripts:
            # The two errors below were logged with the keyword "exec_info", which
            # logging.Logger.error does not accept (the valid keyword is
            # "exc_info"): the call raised a TypeError instead of logging. No
            # exception is being handled here, so there is no traceback to attach
            # and only the message is logged. This assumes log is a standard
            # logging.Logger.
            if len(curr_transcript.proteins) > 1:
                log.error(
                    "The management of several proteins for one transcript is not implemented. The transcript {} contains several proteins {}.".format(
                        curr_transcript, curr_transcript.proteins
                    )
                )
            if curr_transcript.strand is None:
                log.error("The transcript {} has no strand.".format(curr_transcript))
            has_protein = len(curr_transcript.proteins) > 0
            curr_annot = {
                "SYMBOL": curr_gene.name,
                "Gene": curr_gene.annot["id"],
                "Feature": curr_transcript.annot["id"],
                "Feature_type": "Transcript",
                "STRAND": curr_transcript.strand,
                "Protein": curr_transcript.proteins[0].annot["id"] if has_protein else "",
                "RNA_ELT_TYPE": None,
                "RNA_ELT_POS": None,
                "CDS_position": None,
                "Protein_position": None,
                "Codon_position": None
            }
            # Intron, exon and CDS position
            subregion, subregion_idx = curr_transcript.getSubFromRefPos(bnd_region.start)
            if issubclass(subregion.__class__, Intron):  # On intron
                curr_annot["RNA_ELT_TYPE"] = "intron"
                curr_annot["RNA_ELT_POS"] = "{}/{}".format(
                    subregion_idx,
                    len(curr_transcript.children) - 1  # Number of introns
                )
                if has_protein and curr_transcript.strand == record_strand:
                    curr_protein = curr_transcript.proteins[0]
                    # Get CDS on last implicated exon for first shard and first implicated exon on second shard
                    ref_pos = subregion.start - 1 if shard_before_bnd else subregion.end + 1
                    cds_pos = curr_protein.getNtPosFromRefPos(ref_pos)
                    # NOTE(review): length is called as a method on Region elsewhere
                    # in this file (testLength); if Protein.length is also a method
                    # the last comparison is always False - confirm.
                    if cds_pos is None or cds_pos == 1 or cds_pos == curr_protein.length:
                        curr_annot["CDS_position"] = None
                        _addUTRAnnot(curr_annot, curr_protein, bnd_region.start)
                    else:
                        curr_annot["CDS_position"] = cds_pos
                        curr_annot["Protein_position"], curr_annot["Codon_position"] = curr_protein.getPosOnRegion(ref_pos)
            else:  # On exon
                nb_exon = len(curr_transcript.children)
                curr_annot["RNA_ELT_TYPE"] = "exon"
                curr_annot["RNA_ELT_POS"] = "{}/{}".format(subregion_idx, nb_exon)
                # Breakend on one border of the exon: transcript boundary or splice site
                if bnd_region.start == subregion.start:
                    if subregion_idx == 1 and subregion.strand == "+":  # Start of the first exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptStart"
                    elif subregion_idx == nb_exon and subregion.strand == "-":  # End of the last exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptEnd"
                    else:
                        curr_annot["RNA_ELT_TYPE"] += "&splice" + ("End" if subregion.strand == "+" else "Start")
                elif bnd_region.start == subregion.end:
                    if subregion_idx == 1 and subregion.strand == "-":  # Start of the first exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptStart"
                    elif subregion_idx == nb_exon and subregion.strand == "+":  # End of the last exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptEnd"
                    else:
                        curr_annot["RNA_ELT_TYPE"] += "&splice" + ("End" if subregion.strand == "-" else "Start")
                if has_protein:
                    curr_protein = curr_transcript.proteins[0]
                    curr_annot["CDS_position"] = curr_protein.getNtPosFromRefPos(bnd_region.start)
                    if curr_annot["CDS_position"] is None:  # UTR
                        _addUTRAnnot(curr_annot, curr_protein, bnd_region.start)
                    else:  # Protein position
                        curr_annot["Protein_position"], curr_annot["Codon_position"] = curr_protein.getPosOnRegion(bnd_region.start)
            # Add to annotations
            annotations.append(curr_annot)
    return annotations
Exemplo n.º 27
0
 def testIterOverlapped_3(self):
     """Case where a subject is included in another."""
     # Init test data: sbjct_3 is included in sbjct_2
     sbjct_1 = Region(7, 10, "+", "chr1", "sbjct_1")
     sbjct_2 = Region(14, 20, "+", "chr1", "sbjct_2")
     sbjct_3 = Region(16, 18, "+", "chr1", "sbjct_3")
     sbjct_4 = Region(24, 29, "+", "chr1", "sbjct_4")
     subjects = RegionList([sbjct_1, sbjct_2, sbjct_3, sbjct_4])
     # (query start, query end, query name, expected overlapped subjects)
     evaluations = [
         (11, 11, "query_l1_01", []),
         (12, 12, "query_l1_02", []),
         (13, 13, "query_l1_03", []),
         (14, 14, "query_l1_04", [sbjct_2]),
         (15, 15, "query_l1_05", [sbjct_2]),
         (16, 16, "query_l1_06", [sbjct_2, sbjct_3]),
         (17, 17, "query_l1_07", [sbjct_2, sbjct_3]),
         (18, 18, "query_l1_08", [sbjct_2, sbjct_3]),
         (19, 19, "query_l1_09", [sbjct_2]),
         (20, 20, "query_l1_10", [sbjct_2]),
         (21, 21, "query_l1_11", []),
         (22, 22, "query_l1_12", []),
         (11, 13, "query_l3_01", []),
         (12, 14, "query_l3_02", [sbjct_2]),
         (13, 15, "query_l3_03", [sbjct_2]),
         (14, 16, "query_l3_04", [sbjct_2, sbjct_3]),
         (15, 17, "query_l3_05", [sbjct_2, sbjct_3]),
         (16, 18, "query_l3_06", [sbjct_2, sbjct_3]),
         (17, 19, "query_l3_07", [sbjct_2, sbjct_3]),
         (18, 20, "query_l3_08", [sbjct_2, sbjct_3]),
         (19, 21, "query_l3_09", [sbjct_2]),
         (20, 22, "query_l3_10", [sbjct_2]),
         (21, 23, "query_l3_11", []),
         (13, 17, "query_l5_01", [sbjct_2, sbjct_3]),
         (15, 19, "query_l5_02", [sbjct_2, sbjct_3]),
         (17, 21, "query_l5_03", [sbjct_2, sbjct_3]),
         (18, 22, "query_l5_04", [sbjct_2, sbjct_3]),
         (19, 23, "query_l5_05", [sbjct_2]),
         (20, 24, "query_l5_06", [sbjct_2, sbjct_4]),
         (13, 18, "query_l6_01", [sbjct_2, sbjct_3]),
         (14, 19, "query_l6_02", [sbjct_2, sbjct_3]),
         (15, 20, "query_l6_03", [sbjct_2, sbjct_3]),
         (16, 21, "query_l6_04", [sbjct_2, sbjct_3]),
         (17, 22, "query_l6_05", [sbjct_2, sbjct_3]),
         (18, 23, "query_l6_06", [sbjct_2, sbjct_3]),
         (19, 24, "query_l6_07", [sbjct_2, sbjct_4]),
         (13, 19, "query_l7_01", [sbjct_2, sbjct_3]),
         (14, 20, "query_l7_02", [sbjct_2, sbjct_3]),
         (15, 21, "query_l7_03", [sbjct_2, sbjct_3]),
         (16, 22, "query_l7_04", [sbjct_2, sbjct_3]),
         (17, 23, "query_l7_05", [sbjct_2, sbjct_3]),
         (18, 24, "query_l7_06", [sbjct_2, sbjct_3, sbjct_4]),
         (19, 24, "query_l7_07", [sbjct_2, sbjct_4]),
         (13, 20, "query_l8_01", [sbjct_2, sbjct_3]),
         (14, 21, "query_l8_02", [sbjct_2, sbjct_3]),
         (15, 22, "query_l8_03", [sbjct_2, sbjct_3]),
         (13, 21, "query_l9_01", [sbjct_2, sbjct_3]),
         (14, 22, "query_l9_02", [sbjct_2, sbjct_3]),
         (13, 22, "query_l10_01", [sbjct_2, sbjct_3]),
     ]
     queries_info = [
         {"query": Region(q_start, q_end, "+", "chr1", q_name), "overlapped": q_overlapped}
         for q_start, q_end, q_name, q_overlapped in evaluations
     ]
     # Queries are evaluated in coordinates order
     queries_info.sort(key=lambda info: (info["query"].start, info["query"].end))
     queries = [info["query"] for info in queries_info]
     expec_overlapped = [info["overlapped"] for info in queries_info]
     # Independant evaluation
     for curr_eval in queries_info:
         obs_overlapped = [found for _, found in iterOverlapped([curr_eval["query"]], subjects)]
         self.assertEqual(obs_overlapped, [curr_eval["overlapped"]])
     # Grouped evaluation
     obs_overlapped = [found for _, found in iterOverlapped(queries, subjects)]
     self.assertEqual(obs_overlapped, expec_overlapped)
     # Grouped evaluation and inclusion between subjects starts the list of subjects
     shifted_subjects = subjects[1:]
     obs_overlapped = [found for _, found in iterOverlapped(queries, shifted_subjects)]
     self.assertEqual(obs_overlapped, expec_overlapped)
     # Grouped evaluation and inclusion between subjects ends the list of subjects
     poped_subjects = subjects[:-1]
     expec_without_last = [
         [elt for elt in info["overlapped"] if elt != sbjct_4]
         for info in queries_info
     ]
     obs_overlapped = [found for _, found in iterOverlapped(queries, poped_subjects)]
     self.assertEqual(obs_overlapped, expec_without_last)
Exemplo n.º 28
0
 def testContains(self):
     """Check Region.contains() with regions inside, on the borders, crossing the borders, outside and on another chromosome."""
     container_region = Region(9, 15, "+", "chr1")
     # (arguments of the evaluated region, expected result)
     cases = [
         ((9, 9, "+", "chr1"), True),
         ((15, 15, "+", "chr1"), True),
         ((12, 13, "+", "chr1"), True),
         ((9, 15, "+", "chr1"), True),
         ((8, 14, "+", "chr1"), False),
         ((10, 16, "+", "chr1"), False),
         ((8, 16, "+", "chr1"), False),
         ((8, 8, "+", "chr1"), False),
         ((16, 16, "+", "chr1"), False),
         ((12, 13, "+", "chr2"), False),
     ]
     for region_args, expected in cases:
         self.assertEqual(
             container_region.contains(Region(*region_args)),
             expected
         )
Exemplo n.º 29
0
def groupBNDByFusions(bnd_by_id, annotation_field):
    """
    Return by chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).

    A breakend with several alternatives is split in one record per mate and the ID of each splitted record is suffixed by "_<index>". The ID and MATEID of the processed breakends are rewritten accordingly; records with only one alternative are modified in place in bnd_by_id.

    :param bnd_by_id: Breakend by ID coming from one fusion caller.
    :type bnd_by_id: dict
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: By chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).
    :rtype: dict
    :raises Exception: When none of the two breakends of a fusion has the tag RNA_FIRST.
    """
    caller_fusions = dict()
    processed_fusions = set()
    fusion_by_name = {}
    for record in bnd_by_id.values():
        for alt_idx, _ in enumerate(record.alt):
            # Select the first breakend
            alt_first_bnd = record
            first_new_id = alt_first_bnd.id
            if len(record.alt) > 1:  # Record must be splitted for each mate
                first_new_id += "_" + str(alt_idx)
                alt_first_bnd = getAlleleRecord(record, alt_idx)
                alt_first_bnd.info["MATEID"] = [record.info["MATEID"][alt_idx]]
            # Select the mate
            mate_id = alt_first_bnd.info["MATEID"][0]
            mate_record = bnd_by_id[mate_id]
            alt_second_bnd = mate_record
            second_new_id = alt_second_bnd.id
            if len(mate_record.alt) > 1:  # Record must be splitted for each mate
                first_idx = mate_record.info["MATEID"].index(alt_first_bnd.id)
                # The index is an int: it must be converted before the
                # concatenation, otherwise a TypeError is raised.
                second_new_id += "_" + str(first_idx)
                alt_second_bnd = getAlleleRecord(mate_record, first_idx)
                alt_second_bnd.info["MATEID"] = [
                    mate_record.info["MATEID"][first_idx]
                ]
            # The fusion ID is built from the IDs before renaming and is
            # independent of the order of the two breakends
            fusion_id = " @@ ".join(
                sorted([alt_first_bnd.id, alt_second_bnd.id]))
            # Rename the breakends and link each one to the new ID of its mate
            # NOTE(review): when a breakend was not splitted it is the very
            # object stored in bnd_by_id, so its MATEID now holds the renamed
            # ID of its mate. If this ID has a "_<index>" suffix it is not a
            # key of bnd_by_id and a later lookup from this record would fail
            # - confirm that callers cannot reach this ordering.
            alt_first_bnd.id = first_new_id
            alt_second_bnd.info["MATEID"] = [first_new_id]
            alt_second_bnd.id = second_new_id
            alt_first_bnd.info["MATEID"] = [second_new_id]
            if fusion_id not in processed_fusions:
                processed_fusions.add(fusion_id)
                if "RNA_FIRST" not in alt_first_bnd.info and "RNA_FIRST" not in alt_second_bnd.info:
                    raise Exception(
                        "Tag RNA_FIRST must be present in one of the breakend {} or {}."
                        .format(alt_first_bnd.id, mate_id))
                if "RNA_FIRST" in alt_second_bnd.info:
                    # The breakend tagged RNA_FIRST becomes the first one
                    alt_first_bnd, alt_second_bnd = alt_second_bnd, alt_first_bnd
                interval_first_bnd = getBNDInterval(alt_first_bnd)
                fusion_name = " @@ ".join(
                    sorted([alt_first_bnd.getName(),
                            alt_second_bnd.getName()]))
                if fusion_name not in fusion_by_name:
                    region_first_bnd = Region(interval_first_bnd[0],
                                              interval_first_bnd[1],
                                              reference=alt_first_bnd.chrom,
                                              annot={
                                                  "first": alt_first_bnd,
                                                  "second": alt_second_bnd
                                              })
                    if alt_first_bnd.chrom not in caller_fusions:
                        caller_fusions[alt_first_bnd.chrom] = RegionList()
                    caller_fusions[alt_first_bnd.chrom].append(
                        region_first_bnd)
                    fusion_by_name[fusion_name] = region_first_bnd
                else:  # Caller contains several entries for the same pair of breakends (same fusion but several annotations)
                    stored_annot = fusion_by_name[fusion_name].annot
                    stored_annot["first"].info[
                        annotation_field] += alt_first_bnd.info[
                            annotation_field]
                    stored_annot["second"].info[
                        annotation_field] += alt_second_bnd.info[
                            annotation_field]
    return caller_fusions
Exemplo n.º 30
0
 def testFromStr(self):
     # Region built from its string form "reference:start-end"
     parsed_region = Region.fromStr("12:1534187-1534287")
     # Same region built with the constructor (no strand, reference "12")
     wanted_region = Region(1534187, 1534287, None, "12")
     # The two regions are compared through their string representation
     parsed_repr = str(parsed_region)
     wanted_repr = str(wanted_region)
     self.assertEqual(parsed_repr, wanted_repr)