Пример #1
0
def isInner(first, second, annotation_field, annCmpName, regCmpName):
    """
    Tell whether the two breakends of a fusion can be located in the same gene.

    The answer is True when at least one gene is annotated on both breakends
    and either the strands of the two shards differ or one annotation of a
    shared gene is on the strand of the first shard.

    :param first: The breakend of the first shard in fusion.
    :type first: anacore.vcf.VCFRecord
    :param second: The breakend of the second shard in fusion.
    :type second: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region. It is accepted but not used by this function.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends fall in the same gene (in at least one overlapped gene).
    :rtype: boolean
    """
    genes_on_first = {annCmpName(annot) for annot in first.info[annotation_field]}
    genes_on_second = {annCmpName(annot) for annot in second.info[annotation_field]}
    shared_genes = genes_on_first.intersection(genes_on_second)
    if not shared_genes:  # No gene is annotated on both breakends
        return False
    strand_of_first = getStrand(first, True)
    if strand_of_first != getStrand(second, False):
        return True
    # Same strand for the two shards: at least one annotation of a shared gene must be on this strand
    shared_strands = set()
    for annot in first.info[annotation_field] + second.info[annotation_field]:
        if annCmpName(annot) in shared_genes:
            shared_strands.add(annot["STRAND"])
    return strand_of_first in shared_strands
Пример #2
0
def isInner(first, second, annotation_field):
    """
    Tell whether the two breakends of a fusion can be located in the same gene.

    Genes are compared by their "SYMBOL". The answer is True when at least one
    symbol is annotated on both breakends and either the strands of the two
    shards differ or one annotation of a shared gene is on the strand of the
    first shard.

    :param first: The breakend of the first shard in fusion.
    :type first: anacore.vcf.VCFRecord
    :param second: The breakend of the second shard in fusion.
    :type second: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: True if the two breakends fall in the same gene (in at least one overlapped gene).
    :rtype: boolean
    """
    symbols_on_first = {annot["SYMBOL"] for annot in first.info[annotation_field]}
    symbols_on_second = {annot["SYMBOL"] for annot in second.info[annotation_field]}
    shared_symbols = symbols_on_first.intersection(symbols_on_second)
    if not shared_symbols:  # No gene is annotated on both breakends
        return False
    strand_of_first = getStrand(first, True)
    if strand_of_first != getStrand(second, False):
        return True
    # Same strand for the two shards: at least one annotation of a shared gene must be on this strand
    shared_strands = set()
    for annot in first.info[annotation_field] + second.info[annotation_field]:
        if annot["SYMBOL"] in shared_symbols:
            shared_strands.add(annot["STRAND"])
    return strand_of_first in shared_strands
Пример #3
0
def getBreakendInfo(record, annot_field="ANN", assembly_id=None):
    """
    Return the coordinates and the annotations of a breakend as a dictionary.

    The record must contain "ANNOT_POS" in its info and each annotation must
    contain "IN_FRAME": a KeyError is raised otherwise. The annotations of the
    record are not modified (copies are returned).

    :param record: The breakend record.
    :type record: anacore.vcf.VCFRecord
    :param annot_field: Field used to store annotations.
    :type annot_field: str
    :param assembly_id: Identifier of the assembly, stored as is under "assembly".
    :type assembly_id: str
    :return: Dictionary with two keys: "coordinates" (region, pos, cipos, annot_pos, ref, alt, assembly and strand) and "features_by_id" (by index as string, the copy of the annotation without "IN_FRAME").
    :rtype: dict
    """
    breakend_coord = {
        "region": record.chrom,
        "pos": record.pos,
        "cipos": record.info["CIPOS"] if "CIPOS" in record.info else None,
        "annot_pos": record.info["ANNOT_POS"],
        "ref": record.ref,
        "alt": record.alt[0],
        "assembly": assembly_id,
        "strand": getStrand(record)
    }
    features_by_id = {}
    for feature_idx, annot in enumerate(record.info[annot_field]):
        annot_copy = deepcopy(annot)
        annot_copy.pop("IN_FRAME")  # IN_FRAME is excluded from the returned features
        features_by_id[str(feature_idx)] = annot_copy
    return {"coordinates": breakend_coord, "features_by_id": features_by_id}
Пример #4
0
def exonsPos(record, genes_by_chr):
    """
    Count the exon boundaries located in the breakend interval.

    Only the exons on the strand of the breakend are taken into account. Each
    exon start or end contained in the interval returned by getBNDInterval()
    adds one to the support of its position.

    :param record: Breakend record with CIPOS.
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: By positions of exons boundaries overlapped by the breakend, the number of alternative transcripts with this exon boundaries.
    :rtype: dict
    """
    bnd_strand = getStrand(record)
    support_by_pos = {}
    ci_start, ci_end = getBNDInterval(record)
    ci_region = Region(ci_start, ci_end, None, record.chrom, record.getName())
    if record.chrom not in genes_by_chr:  # No annotation on this chromosome
        return support_by_pos
    for gene in genes_by_chr[record.chrom].getOverlapped(ci_region):
        for transcript in gene.children.getOverlapped(ci_region):
            for elt in transcript.children.getOverlapped(ci_region):
                if bnd_strand != elt.strand or not issubclass(elt.__class__, Exon):
                    continue
                # The exon start is evaluated first then the exon end
                for boundary in (elt.start, elt.end):
                    if ci_region.start <= boundary <= ci_region.end:
                        support_by_pos[boundary] = support_by_pos.get(boundary, 0) + 1
    return support_by_pos
Пример #5
0
def isReadthrough(up, down, annotation_field, genes, rt_max_dist, annCmpName, regCmpName):
    """
    Tell whether the fusion described by the two breakends can be a readthrough.

    The fusion is retained as a possible readthrough when: the breakends are on
    the same chromosome, the two shards are +/+ or -/-, the interval covering
    the two breakends is not longer than rt_max_dist, each breakend has at
    least one gene on the strand of the fusion that is not annotated on the
    other breakend, and no other gene on this strand in the interval is
    disjoint from both the start gene and the end gene.

    :param up: The breakend of the first shard in fusion.
    :type up: anacore.vcf.VCFRecord
    :param down: The breakend of the second shard in fusion.
    :type down: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param genes: The genes regions by chr.
    :type genes: AnnotGetter
    :param rt_max_dist: Maximum distance to evaluate if the fusion is a readthrough.
    :type rt_max_dist: int
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends can be a readthrough.
    :rtype: boolean
    """
    if up.chrom != down.chrom:
        return False
    up_strand = getStrand(up, True)
    down_strand = getStrand(down, False)
    if (up_strand, down_strand) not in (("+", "+"), ("-", "-")):  # Readthrough are +/+ or -/-
        return False
    # Order the breakends by position on the reference
    first, second = (down, up) if up.pos > down.pos else (up, down)
    first_start, first_end = getBNDInterval(first)
    second_start, second_end = getBNDInterval(second)
    interval_start = min(first_start, second_start)
    interval_end = max(first_end, second_end) + 1
    if interval_end - interval_start > rt_max_dist:  # Breakends are too distant
        return False
    first_genes = {annCmpName(annot) for annot in first.info[annotation_field]}
    second_genes = {annCmpName(annot) for annot in second.info[annotation_field]}
    shared_genes = first_genes & second_genes
    first_only = first_genes - second_genes
    second_only = second_genes - first_genes
    if not first_only or not second_only:  # Each breakend must have at least one specific gene
        return False
    strand_by_gene = {annCmpName(annot): annot["STRAND"] for annot in first.info[annotation_field] + second.info[annotation_field]}
    first_only = {gene for gene in first_only if strand_by_gene[gene] == up_strand}
    second_only = {gene for gene in second_only if strand_by_gene[gene] == up_strand}
    if not first_only or not second_only:  # No specific gene on the strand of the fusion for one breakend
        return False
    interval_region = Region(interval_start, interval_end, up_strand, first.chrom)
    candidates = RegionList([
        gene for gene in genes.getChr(first.chrom).getOverlapped(interval_region)
        if regCmpName(gene) not in shared_genes and gene.strand == up_strand
    ])
    gene_by_name = {regCmpName(gene): gene for gene in candidates}
    contradicted = False
    # All the pairs are evaluated (no early exit) to keep every gene lookup
    for start_name in first_only:
        start_gene = gene_by_name[start_name]
        for end_name in second_only:
            end_gene = gene_by_name[end_name]
            for between in candidates:
                between_name = regCmpName(between)
                if between_name == regCmpName(start_gene) or between_name == regCmpName(end_gene):
                    continue
                if not between.hasOverlap(start_gene) and not between.hasOverlap(end_gene):
                    contradicted = True  # A distinct gene is located between the two fused genes
    return not contradicted
Пример #6
0
def selectedPos(first, first_exons_sup_by_pos, second,
                second_exons_sup_by_pos):
    """
    Choose the positions to retain for the two breakends of a fusion from the exons boundaries located in their CIPOS interval.

    Rules: (1) without exon boundary on the two breakends the positions of the
    records are returned; (2) with exon boundaries on only one breakend, its
    most supported boundary is retained and the same shift is applied to the
    other breakend; (3) with exon boundaries on the two breakends, the offset
    shared by the two breakends is retained (the most supported one if several
    are shared) and, without shared offset, the most supported boundary of the
    first breakend is retained. When the strands of the two shards differ, the
    shift of the second breakend is counted from the end of its CIPOS.

    :param first: Breakend of the 5' shard of the fusion.
    :type first: anacore.vcf.VCFRecord
    :param first_exons_sup_by_pos: By positions of exons boundaries overlapped by the first breakend, the number of alternative transcripts with this exon boundaries.
    :type first_exons_sup_by_pos: dict
    :param second: Breakend of the 3' shard of the fusion.
    :type second: anacore.vcf.VCFRecord
    :param second_exons_sup_by_pos: By positions of exons boundaries overlapped by the second breakend, the number of alternative transcripts with this exon boundaries.
    :type second_exons_sup_by_pos: dict
    :return: Retained spot positions for the first and the second breakend when they contain CIPOS.
    :rtype: (int, int)
    """
    if not first_exons_sup_by_pos and not second_exons_sup_by_pos:  # No shard contains an exon at breakend pos
        return (first.pos, second.pos)
    same_strand = getStrand(first, True) == getStrand(second, False)

    if not second_exons_sup_by_pos:  # Only the 5' shard contains at least one exon at breakend pos
        retained = getMostSupported(first_exons_sup_by_pos)
        shift = retained - first.pos
        if same_strand:
            return (retained, second.pos + shift)
        ci_upper = second.info["CIPOS"][1] if "CIPOS" in second.info else 0
        return (retained, second.pos + ci_upper - shift)

    if not first_exons_sup_by_pos:  # Only the 3' shard contains at least one exon at breakend pos
        retained = getMostSupported(second_exons_sup_by_pos)
        shift = retained - second.pos
        if same_strand:
            return (first.pos + shift, retained)
        ci_upper = first.info["CIPOS"][1] if "CIPOS" in first.info else 0
        return (first.pos + ci_upper - shift, retained)

    # The two shards contain at least one exon at breakend pos
    second_cipos = second.info["CIPOS"][1] if "CIPOS" in second.info else 0

    def secondOffset(pos):
        """Return the offset of a position of the second breakend in the referential of the first breakend."""
        raw_offset = pos - second.pos
        return raw_offset if same_strand else abs(second_cipos - raw_offset)

    first_offsets = {pos - first.pos for pos in first_exons_sup_by_pos}
    second_offsets = {secondOffset(pos) for pos in second_exons_sup_by_pos}
    common = first_offsets & second_offsets
    if len(common) > 1:  # Several common offsets: the one with the highest sum of support (first + second) is retained
        support_by_offset = {
            pos - first.pos: support
            for pos, support in first_exons_sup_by_pos.items()
            if pos - first.pos in common
        }
        for pos, support in second_exons_sup_by_pos.items():
            shifted = secondOffset(pos)
            if shifted in common:
                support_by_offset[shifted] += support
        offset = getMostSupported(support_by_offset)
    elif len(common) == 1:  # Only one common offset between two breakends
        offset = next(iter(common))
    else:  # No common offset: the most supported exon boundary of the first breakend is retained
        offset = getMostSupported(first_exons_sup_by_pos) - first.pos
    if same_strand:
        return (first.pos + offset, second.pos + offset)
    return (first.pos + offset, second.pos + second_cipos - offset)
Пример #7
0
def annotModelRetIntron(first, second, annotation_field):
    """
    Add GENE_SHARD and IN_FRAME in annotations in a context where introns may
    have been retained.
    GENE_SHARD determines if the part of the transcript implicated on the fusion
    RNA is the 5' of the original (up) or the 3' (down).
    IN_FRAME determines if the transcript in the 3' shard of the fusion transcript
    expresses a part of the original protein: 5' shard imports promoter and start
    of the first transcript in right strand, 3' shard imports the end of the
    second transcript in right strand and the phase of the second transcript is
    kept.
    The following table details IN_FRAME values for all the analysed configurations:
    5' shard     3' shard   Inframe   Note
    5'UTR        5'UTR      1         The first does not start
    CDS          5'UTR      ?/1       1 if first BP is on intron or on end of exon and the second is on intron or start of exon and UTR of second has length compatible to phase.
    3'UTR        5'UTR      ?         If the transcription terminator is cut and the sequence continues to exon and splices to the next or if it is a readthrough
    5'UTR        CDS        ?         Can use an other TSS and translation start.
    CDS          CDS        0/1       Check the phase (end of first shard and start of second shard)
    3'UTR        CDS        ?         If the transcription terminator is cut and the sequence continues to exon and splices to the next or if it is a readthrough
    *            3'UTR      0         The second is not expressed
    non-coding   5'UTR      1         1 if first and second BND are in intron or in splice site.

    In the code the in-frame status of one pair of transcripts is "0", "1" or
    "." (value kept when the second transcript is coding and none of the rules
    settles the case). The annotations are modified in place: at the end of the
    function IN_FRAME of each annotation is a string where the elements
    "<Feature of the mate annotation>:<status>" are joined by "&". The function
    returns nothing.

    :param first: Breakend of the 5' shard of the fusion.
    :type first: anacore.vcf.VCFRecord
    :param second: Breakend of the 3' shard of the fusion.
    :type second: anacore.vcf.VCFRecord
    :param annotation_field: Field used for store annotations.
    :type annotation_field: str
    """
    first_strand = getStrand(first, True)
    second_strand = getStrand(second, False)
    # Presumably adds GENE_SHARD on each annotation (helper not visible here): GENE_SHARD is read below
    annotGeneShard(first, annotation_field)
    annotGeneShard(second, annotation_field)
    # IN_FRAME is accumulated as a list by annotation and converted to string at the end
    for second_annot in second.info[annotation_field]:
        if "IN_FRAME" not in second_annot:
            second_annot["IN_FRAME"] = []
    # Evaluate each pair (annotation of first, annotation of second)
    for first_annot in first.info[annotation_field]:
        if "IN_FRAME" not in first_annot:
            first_annot["IN_FRAME"] = []
        for second_annot in second.info[annotation_field]:
            inframe = "0"  # Default: the pair cannot produce an in-frame fusion
            # The first must bring the 5' part of its transcript and the second the 3' part of its transcript
            if first_annot["GENE_SHARD"] == "up" and second_annot[
                    "GENE_SHARD"] == "down":
                # The two transcripts must be on the strand of their shard
                if first_strand == first_annot[
                        "STRAND"] and second_strand == second_annot["STRAND"]:
                    if second_annot[
                            "Protein"] != "":  # The second RNA is coding
                        inframe = "."  # Undetermined unless one of the following rules applies
                        if first_annot[
                                "Protein"] != "":  # First and second transcripts are coding
                            if not first_annot["RNA_ELT_TYPE"].endswith(
                                    "utr"
                            ) and not second_annot["RNA_ELT_TYPE"].endswith(
                                    "utr"):  # first: CDS and second: CDS
                                # The phase is only evaluated for intron/intron and exon/exon junctions
                                is_intron_jct = "intron" in first_annot[
                                    "RNA_ELT_TYPE"] and "intron" in second_annot[
                                        "RNA_ELT_TYPE"]
                                is_exon_jct = None
                                if not is_intron_jct:
                                    is_exon_jct = "exon" in first_annot[
                                        "RNA_ELT_TYPE"] and "exon" in second_annot[
                                            "RNA_ELT_TYPE"]
                                if is_intron_jct or is_exon_jct:
                                    # TODO: check stop codon ?
                                    inframe = "0"
                                    # In frame if the codon position of the second directly follows the codon position of the first (3 -> 1, 1 -> 2 or 2 -> 3)
                                    if (first_annot["Codon_position"] == 3 and
                                            second_annot["Codon_position"] == 1
                                        ) or (second_annot["Codon_position"] -
                                              first_annot["Codon_position"]
                                              == 1):
                                        inframe = "1"
                            else:  # At least one breakend falls in UTR
                                if second_annot["RNA_ELT_TYPE"].endswith(
                                        "utr"
                                ) and second_annot["RNA_ELT_POS"].endswith(
                                        "3prim"):  # first: * and second: 3'UTR
                                    inframe = "0"
                                elif first_annot["RNA_ELT_TYPE"].endswith(
                                        "utr"
                                ) and second_annot["RNA_ELT_TYPE"].endswith(
                                        "utr"):  # first: UTR and second: UTR
                                    if first_annot["RNA_ELT_POS"].endswith(
                                            "5prim"
                                    ) and second_annot["RNA_ELT_POS"].endswith(
                                            "5prim"
                                    ):  # first: 5'UTR and second: 5'UTR
                                        inframe = "1"
                                elif not first_annot["RNA_ELT_TYPE"].endswith(
                                        "utr"
                                ) and second_annot["RNA_ELT_TYPE"].endswith(
                                        "utr"):  # first: CDS and second: UTR
                                    if second_annot["RNA_ELT_POS"].endswith(
                                            "5prim"
                                    ):  # first: CDS and second: 5'UTR
                                        # skip: the exon/intron junction is not on a splice site, the status stays undetermined
                                        skip = False
                                        if "exon" in first_annot[
                                                "RNA_ELT_TYPE"] and "intron" in second_annot[
                                                    "RNA_ELT_TYPE"]:
                                            skip = True
                                            if "spliceStart" in first_annot[
                                                    "RNA_ELT_TYPE"] or "transcriptEnd" in first_annot[
                                                        "RNA_ELT_TYPE"]:
                                                skip = False
                                        elif "intron" in first_annot[
                                                "RNA_ELT_TYPE"] and "exon" in second_annot[
                                                    "RNA_ELT_TYPE"]:
                                            skip = True
                                            # NOTE(review): in this branch the first is on intron and the splice tags seem to be
                                            # set only on exons; this test probably targets second_annot (as in the non-coding
                                            # case below) - to confirm
                                            if "spliceEnd" in first_annot[
                                                    "RNA_ELT_TYPE"] or "transcriptStart" in first_annot[
                                                        "RNA_ELT_TYPE"]:
                                                skip = False
                                        if not skip:
                                            inframe = "0"
                                            # Codon position of the first plus the phase brought by the UTR of the second before its CDS
                                            first_and_utr_codon_pos = first_annot[
                                                "Codon_position"] + (
                                                    second_annot["CDS_DIST"] %
                                                    3)
                                            # TODO: check stop codon ?
                                            if first_and_utr_codon_pos == 3:
                                                inframe = "1"
                        else:  # First is not coding
                            if second_annot["RNA_ELT_TYPE"].endswith(
                                    "utr"
                            ) and second_annot["RNA_ELT_POS"].endswith(
                                    "3prim"):  # first: * and second: 3'UTR
                                inframe = "0"
                            elif second_annot["RNA_ELT_POS"].endswith(
                                    "5prim"
                            ):  # first: non-coding and second: 5'UTR
                                if first_annot["RNA_ELT_TYPE"].startswith(
                                        "intron"):
                                    if second_annot["RNA_ELT_TYPE"].startswith(
                                            "intron"
                                    ):  # from non-coding first in intron to second 5'UTR in intron
                                        inframe = "1"
                                elif "spliceStart" in first_annot[
                                        "RNA_ELT_TYPE"] or "transcriptEnd" in first_annot[
                                            "RNA_ELT_TYPE"]:
                                    if "spliceEnd" in second_annot[
                                            "RNA_ELT_TYPE"] or "transcriptStart" in second_annot[
                                                "RNA_ELT_TYPE"]:  # from non-coding first on splice donor to second 5'UTR on splice acceptor
                                        inframe = "1"
            # Store the status of the pair on the two annotations, identified by the transcript of the mate
            first_annot["IN_FRAME"].append("{}:{}".format(
                second_annot["Feature"], inframe))
            second_annot["IN_FRAME"].append("{}:{}".format(
                first_annot["Feature"], inframe))
    # Convert the lists of status to strings
    for first_annot in first.info[annotation_field]:
        first_annot["IN_FRAME"] = "&".join(first_annot["IN_FRAME"])
    for second_annot in second.info[annotation_field]:
        second_annot["IN_FRAME"] = "&".join(second_annot["IN_FRAME"])
Пример #8
0
def getGeneAnnot(record, genes_by_chr):
    """
    Return genomic items overlapped by the BND record.

    The evaluated position is record.info["ANNOT_POS"]. Each annotation is a
    dictionary with the keys: SYMBOL, Gene, Feature, Feature_type, STRAND,
    Protein ("" for a transcript without protein), RNA_ELT_TYPE ("intron" or
    "exon" optionally followed by "&transcriptStart", "&transcriptEnd",
    "&spliceStart", "&spliceEnd" and/or "&utr"), RNA_ELT_POS ("<index>/<total>"
    optionally followed by "&5prim" or "&3prim"), CDS_position,
    Protein_position and Codon_position. CDS_DIST is added only when the
    position is classified as 5' UTR.

    :param record: The BND record.
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: The list of annotations (one annotation by overlapped transcript).
    :rtype: list
    """
    record_strand = getStrand(record)
    shard_before_bnd = shardIsBeforeBND(record)
    # Region limited to the annotated position (no end is provided)
    bnd_region = Region(record.info["ANNOT_POS"], None, None, record.chrom,
                        record.getName())
    annotations = []
    if record.chrom in genes_by_chr:
        overlapped_genes = genes_by_chr[record.chrom].getOverlapped(bnd_region)
        for curr_gene in overlapped_genes:
            overlapped_transcripts = curr_gene.children.getOverlapped(
                bnd_region)
            if len(overlapped_transcripts) == 0:
                # NOTE(review): if log is a standard logging.Logger, warn() is a deprecated alias of warning() - to confirm
                log.warn(
                    "The breakpoint {} is contained by gene {} but by 0 of these transcripts."
                    .format(bnd_region, curr_gene))
            else:
                for curr_transcript in overlapped_transcripts:
                    # NOTE(review): "exec_info" looks like a typo of "exc_info"; with a standard logging.Logger
                    # an unknown keyword raises a TypeError instead of logging - to confirm against the type of log
                    if len(curr_transcript.proteins) > 1:
                        log.error(
                            "The management of several proteins for one transcript is not implemented. The transcript {} contains several proteins {}."
                            .format(curr_transcript, curr_transcript.proteins),
                            exec_info=True)
                    if curr_transcript.strand is None:
                        log.error("The transcript {} has no strand.".format(
                            curr_transcript),
                                  exec_info=True)
                    # Fields filled later are initialised to None; only the first protein of the transcript is used
                    curr_annot = {
                        "SYMBOL":
                        curr_gene.name,
                        "Gene":
                        curr_gene.annot["id"],
                        "Feature":
                        curr_transcript.annot["id"],
                        "Feature_type":
                        "Transcript",
                        "STRAND":
                        curr_transcript.strand,
                        "Protein":
                        "" if len(curr_transcript.proteins) == 0 else
                        curr_transcript.proteins[0].annot["id"],
                        "RNA_ELT_TYPE":
                        None,
                        "RNA_ELT_POS":
                        None,
                        "CDS_position":
                        None,
                        "Protein_position":
                        None,
                        "Codon_position":
                        None
                    }
                    # Intron, exon and CDS position
                    subregion, subregion_idx = curr_transcript.getSubFromRefPos(
                        bnd_region.start)
                    if issubclass(subregion.__class__, Intron):  # On intron
                        curr_annot["RNA_ELT_TYPE"] = "intron"
                        # Total is the number of children of the transcript minus one (presumably the number of introns when children are the exons)
                        curr_annot["RNA_ELT_POS"] = "{}/{}".format(
                            subregion_idx,
                            len(curr_transcript.children) - 1)
                        # Protein positions are evaluated only for coding transcripts on the strand of the breakend
                        if len(
                                curr_transcript.proteins
                        ) > 0 and curr_transcript.strand == record_strand:
                            curr_protein = curr_transcript.proteins[0]
                            # Get CDS on last implicated exon for first shard and first implicated exon on second shard
                            # The reference position is the nucleotide adjacent to the intron: after its end by default, before its start if the shard is before the breakend
                            ref_pos = subregion.end + 1
                            if shard_before_bnd:
                                ref_pos = subregion.start - 1
                            curr_annot[
                                "CDS_position"] = curr_protein.getNtPosFromRefPos(
                                    ref_pos)
                            # Without CDS position or on the first/last nucleotide of the CDS the intron is classified as UTR
                            if curr_annot["CDS_position"] is None or curr_annot[
                                    "CDS_position"] == 1 or curr_annot[
                                        "CDS_position"] == curr_protein.length:
                                curr_annot["CDS_position"] = None
                                curr_annot["RNA_ELT_TYPE"] += "&utr"
                                # 5' or 3' is determined from the position relative to the protein boundaries and its strand
                                if curr_protein.strand == "+":
                                    curr_annot["RNA_ELT_POS"] += "&" + (
                                        "5prim" if curr_protein.start >
                                        bnd_region.start else "3prim")
                                    if curr_protein.start > bnd_region.start:
                                        curr_annot[
                                            "CDS_DIST"] = getDistBeforeCDSForward(
                                                bnd_region.start, curr_protein)
                                else:
                                    curr_annot["RNA_ELT_POS"] += "&" + (
                                        "5prim" if curr_protein.end <
                                        bnd_region.start else "3prim")
                                    if curr_protein.end < bnd_region.start:
                                        curr_annot[
                                            "CDS_DIST"] = getDistBeforeCDSReverse(
                                                bnd_region.start, curr_protein)
                            else:
                                curr_annot["Protein_position"], curr_annot[
                                    "Codon_position"] = curr_protein.getPosOnRegion(
                                        ref_pos)
                    else:  # On exon
                        nb_exon = len(curr_transcript.children)
                        curr_annot["RNA_ELT_TYPE"] = "exon"
                        curr_annot["RNA_ELT_POS"] = "{}/{}".format(
                            subregion_idx, nb_exon)
                        # Tag the position when it is on a boundary of the exon; the meaning of the boundary depends on the strand
                        if bnd_region.start == subregion.start:
                            if subregion_idx == 1 and subregion.strand == "+":  # Start of the first exon
                                curr_annot[
                                    "RNA_ELT_TYPE"] += "&transcriptStart"
                            elif subregion_idx == nb_exon and subregion.strand == "-":  # End of the last exon
                                curr_annot["RNA_ELT_TYPE"] += "&transcriptEnd"
                            else:
                                curr_annot["RNA_ELT_TYPE"] += "&splice" + (
                                    "End"
                                    if subregion.strand == "+" else "Start")
                        elif bnd_region.start == subregion.end:
                            if subregion_idx == 1 and subregion.strand == "-":  # Start of the first exon
                                curr_annot[
                                    "RNA_ELT_TYPE"] += "&transcriptStart"
                            elif subregion_idx == nb_exon and subregion.strand == "+":  # End of the last exon
                                curr_annot["RNA_ELT_TYPE"] += "&transcriptEnd"
                            else:
                                curr_annot["RNA_ELT_TYPE"] += "&splice" + (
                                    "End"
                                    if subregion.strand == "-" else "Start")
                        # Unlike the intron case, the strand of the breakend is not checked here
                        if len(curr_transcript.proteins) > 0:
                            curr_protein = curr_transcript.proteins[0]
                            curr_annot[
                                "CDS_position"] = curr_protein.getNtPosFromRefPos(
                                    bnd_region.start)
                            # UTR
                            if curr_annot["CDS_position"] is None:
                                curr_annot["RNA_ELT_TYPE"] += "&utr"
                                if curr_transcript.proteins[0].strand == "+":
                                    curr_annot["RNA_ELT_POS"] += "&" + (
                                        "5prim" if curr_protein.start >
                                        bnd_region.start else "3prim")
                                    if curr_protein.start > bnd_region.start:
                                        curr_annot[
                                            "CDS_DIST"] = getDistBeforeCDSForward(
                                                bnd_region.start, curr_protein)
                                else:
                                    curr_annot["RNA_ELT_POS"] += "&" + (
                                        "5prim" if curr_protein.end <
                                        bnd_region.start else "3prim")
                                    if curr_protein.end < bnd_region.start:
                                        curr_annot[
                                            "CDS_DIST"] = getDistBeforeCDSReverse(
                                                bnd_region.start, curr_protein)
                            # Protein position
                            else:
                                curr_annot["Protein_position"], curr_annot[
                                    "Codon_position"] = curr_protein.getPosOnRegion(
                                        bnd_region.start)
                    # Add to annotations
                    annotations.append(curr_annot)
    return annotations
Пример #9
0
def _getCommonPrefixLen(first_seq, second_seq):
    """
    Return the number of leading positions on which the two sequences are identical.

    :param first_seq: The first sequence.
    :type first_seq: str
    :param second_seq: The second sequence.
    :type second_seq: str
    :return: Number of consecutive identical characters from the start of the two sequences (comparison stops at the end of the shortest one).
    :rtype: int
    """
    nb_common = 0
    for nt_first, nt_second in zip(first_seq, second_seq):
        if nt_first != nt_second:
            break
        nb_common += 1
    return nb_common


def fastStandardize(first, second, seq_handler, padding=50):
    """
    Each breakend of the pair is placed at the left most position, and the uncertainty is represented with the CIPOS tag. The ALT string is then constructed assuming this choice.

    The two records are modified in place (pos, ref, alt[0] and info["CIPOS"]) only when the breakends can be moved; nothing is returned.

    :param first: The breakend of the first shard in fusion (donor).
    :type first: anacore.vcf.VCFRecord
    :param second: The breakend of the second shard in fusion (acceptor).
    :type second: anacore.vcf.VCFRecord
    :param seq_handler: Indexed reader for the reference genome used in fusion calling.
    :type seq_handler: anacore.sequenceIO.IdxFastaIO
    :param padding: Number of nucleotides to inspect before and after the breakends: upstream and downstream movements are limited to this number of nucleotides.
    :type padding: int
    """
    first_strand = getStrand(first, True)
    second_strand = getStrand(second, False)
    # Reference sequences around each breakend (the breakend position is included in both windows)
    before_first = seq_handler.getSub(first.chrom, max(first.pos - padding, 1),
                                      first.pos)
    before_second = seq_handler.getSub(second.chrom,
                                       max(second.pos - padding, 1),
                                       second.pos)
    after_first = seq_handler.getSub(first.chrom, first.pos,
                                     first.pos + padding)
    after_second = seq_handler.getSub(second.chrom, second.pos,
                                      second.pos + padding)
    if first_strand == second_strand:  # Same strand
        # Move to upstream
        if first_strand == "-":
            before_first_seq = before_first[:-1]
            before_second_seq = before_second
        else:
            before_first_seq = before_first
            before_second_seq = before_second[:-1]
        cipos_start = -_getCommonPrefixLen(before_first_seq[::-1],
                                           before_second_seq[::-1])
        # Move to downstream
        if first_strand == "-":
            after_first_seq = after_first
            after_second_seq = after_second[1:]
        else:
            after_first_seq = after_first[1:]
            after_second_seq = after_second
        cipos_end = _getCommonPrefixLen(after_first_seq, after_second_seq)
        # Update records
        if cipos_start != 0 or cipos_end != 0:
            ci_width = cipos_end - cipos_start
            first.pos = first.pos + cipos_start
            first.ref = seq_handler.getSub(first.chrom, first.pos, first.pos)
            first.info["CIPOS"] = [0, ci_width]
            second.pos = second.pos + cipos_start
            second.ref = seq_handler.getSub(second.chrom, second.pos,
                                            second.pos)
            second.info["CIPOS"] = [0, ci_width]
            first_alt, second_alt = getAltFromCoord(
                getCoordStr(first, True), getCoordStr(second, False))
            # str.replace() returns a new string: the result must be stored back in the record
            # NOTE(review): every "N" of the ALT is replaced, including any "N" in the mate chromosome name - confirm against getAltFromCoord() output
            first.alt[0] = first_alt.replace("N", first.ref)
            second.alt[0] = second_alt.replace("N", second.ref)
    else:  # Different strand
        before_second = getComplement(before_second)
        after_second = getComplement(after_second)
        # Move before first containing breakend and after second excluding breakend
        if first_strand == "-":
            before_first_seq = before_first[:-1]
            after_second_seq = after_second
        else:
            before_first_seq = before_first
            after_second_seq = after_second[1:]
        cipos_start = -_getCommonPrefixLen(before_first_seq[::-1],
                                           after_second_seq)
        # Move before second containing breakend and after first excluding breakend
        if first_strand == "-":
            after_first_seq = after_first
            before_second_seq = before_second[:-1]
        else:
            after_first_seq = after_first[1:]
            before_second_seq = before_second
        cipos_end = _getCommonPrefixLen(after_first_seq,
                                        before_second_seq[::-1])
        # Update records
        if cipos_start != 0 or cipos_end != 0:
            ci_width = cipos_end - cipos_start
            first.pos = first.pos + cipos_start
            first.ref = seq_handler.getSub(first.chrom, first.pos, first.pos)
            first.info["CIPOS"] = [0, ci_width]
            second.pos = second.pos - cipos_end  # because cipos_start for first is - cipos_end for second
            second.ref = seq_handler.getSub(second.chrom, second.pos,
                                            second.pos)
            # Each record gets its own list to prevent a later change on one record from leaking to its mate
            second.info["CIPOS"] = [0, ci_width]
            # The mate of the left most position of one breakend is the right most position of the other
            second_down_pos = second.pos + ci_width
            first_alt, _ = getAltFromCoord(
                getCoordStr(first, True), {
                    "chrom": second.chrom,
                    "pos": second_down_pos,
                    "strand": second_strand
                })
            # str.replace() returns a new string: the result must be stored back in the record
            first.alt[0] = first_alt.replace("N", first.ref)
            first_down_pos = first.pos + ci_width
            _, second_alt = getAltFromCoord(
                {
                    "chrom": first.chrom,
                    "pos": first_down_pos,
                    "strand": first_strand
                }, getCoordStr(second, False))
            second.alt[0] = second_alt.replace("N", second.ref)