def isInner(first, second, annotation_field, annCmpName, regCmpName):
    """
    Tell whether the two breakends of a fusion are located in a same gene.

    The answer is True when at least one gene name is shared by the annotations of the two breakends and either the two shards are on different strands, or the strand of the first shard is the strand of one of the shared genes.

    :param first: The breakend of the first shard in fusion.
    :type first: anacore.vcf.VCFRecord
    :param second: The breakend of the second shard in fusion.
    :type second: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region (not used by this function).
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends fall in the same gene (in at least one overlapped gene).
    :rtype: boolean
    """
    first_genes = {annCmpName(annot) for annot in first.info[annotation_field]}
    second_genes = {annCmpName(annot) for annot in second.info[annotation_field]}
    shared_genes = first_genes & second_genes
    if not shared_genes:  # No gene is overlapped by both breakends
        return False
    first_strand = getStrand(first, True)
    second_strand = getStrand(second, False)
    if first_strand != second_strand:
        return True
    # Same strand for the two shards: at least one shared gene must be on this strand
    all_annots = first.info[annotation_field] + second.info[annotation_field]
    shared_strands = {
        annot["STRAND"] for annot in all_annots
        if annCmpName(annot) in shared_genes
    }
    return first_strand in shared_strands
def isInner(first, second, annotation_field):
    """
    Tell whether the two breakends of a fusion are located in a same gene.

    Genes are compared by the SYMBOL of the annotations. The answer is True when at least one symbol is shared by the two breakends and either the two shards are on different strands, or the strand of the first shard is the strand of one of the shared genes.

    :param first: The breakend of the first shard in fusion.
    :type first: anacore.vcf.VCFRecord
    :param second: The breakend of the second shard in fusion.
    :type second: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: True if the two breakends fall in the same gene (in at least one overlapped gene).
    :rtype: boolean
    """
    first_symbols = {annot["SYMBOL"] for annot in first.info[annotation_field]}
    second_symbols = {annot["SYMBOL"] for annot in second.info[annotation_field]}
    shared_symbols = first_symbols & second_symbols
    if not shared_symbols:  # No gene is overlapped by both breakends
        return False
    first_strand = getStrand(first, True)
    second_strand = getStrand(second, False)
    if first_strand != second_strand:
        return True
    # Same strand for the two shards: at least one shared gene must be on this strand
    all_annots = first.info[annotation_field] + second.info[annotation_field]
    shared_strands = {
        annot["STRAND"] for annot in all_annots
        if annot["SYMBOL"] in shared_symbols
    }
    return first_strand in shared_strands
def getBreakendInfo(record, annot_field="ANN", assembly_id=None):
    """
    Return the coordinates and the annotations of a breakend as a dictionary.

    :param record: The breakend record. Its INFO must contain ANNOT_POS and the annotation field; CIPOS is optional.
    :type record: anacore.vcf.VCFRecord
    :param annot_field: Field used to store annotations.
    :type annot_field: str
    :param assembly_id: Identifier of the assembly, copied as is in the coordinates.
    :type assembly_id: str
    :return: A dictionary with two keys: "coordinates" (region, pos, cipos, annot_pos, ref, alt, assembly and strand) and "features_by_id" (a copy of each annotation without IN_FRAME, keyed by its index as string).
    :rtype: dict
    """
    info = record.info
    coordinates = {
        "region": record.chrom,
        "pos": record.pos,
        "cipos": info["CIPOS"] if "CIPOS" in info else None,
        "annot_pos": info["ANNOT_POS"],
        "ref": record.ref,
        "alt": record.alt[0],
        "assembly": assembly_id,
        "strand": getStrand(record)
    }
    features_by_id = {}
    for feature_idx, feature in enumerate(info[annot_field]):
        # Work on a copy: the annotation stored in the record keeps its IN_FRAME
        feature_copy = deepcopy(feature)
        del feature_copy["IN_FRAME"]
        features_by_id[str(feature_idx)] = feature_copy
    return {"coordinates": coordinates, "features_by_id": features_by_id}
def exonsPos(record, genes_by_chr):
    """
    Count, for each exon boundary located in the breakend interval, the number of transcripts exons having this boundary.

    Only the exons on the same strand as the breakend are taken into account. The start and the end of each exon are evaluated independently.

    :param record: Breakend record with CIPOS.
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: By positions of exons boundaries overlapped by the breakend, the number of alternative transcripts with this exon boundaries.
    :rtype: dict
    """
    bnd_strand = getStrand(record)
    support_by_pos = {}
    start, end = getBNDInterval(record)
    bnd_interval = Region(start, end, None, record.chrom, record.getName())
    if record.chrom not in genes_by_chr:
        return support_by_pos
    for gene in genes_by_chr[record.chrom].getOverlapped(bnd_interval):
        for transcript in gene.children.getOverlapped(bnd_interval):
            for sub in transcript.children.getOverlapped(bnd_interval):
                if bnd_strand != sub.strand or not isinstance(sub, Exon):
                    continue
                # Exon start then exon end: each boundary in the interval adds one support
                for boundary in (sub.start, sub.end):
                    if bnd_interval.start <= boundary <= bnd_interval.end:
                        support_by_pos[boundary] = support_by_pos.get(boundary, 0) + 1
    return support_by_pos
def isReadthrough(up, down, annotation_field, genes, rt_max_dist, annCmpName, regCmpName):
    """
    Tell whether the fusion described by the two breakends can be a readthrough.

    All the following conditions must be met: the breakends are on the same chromosome; the two shards are both on "+" or both on "-"; the span covering the two breakends intervals is not longer than rt_max_dist; each breakend is annotated by at least one gene on the shards strand that does not annotate the other breakend; no other gene of the span (on the same strand and not shared by the two breakends) is disjoint from both the start gene and the end gene.

    :param up: The breakend of the first shard in fusion.
    :type up: anacore.vcf.VCFRecord
    :param down: The breakend of the second shard in fusion.
    :type down: anacore.vcf.VCFRecord
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :param genes: The genes regions by chr.
    :type genes: AnnotGetter
    :param rt_max_dist: Maximum distance to evaluate if the fusion is a readthrough.
    :type rt_max_dist: int
    :param annCmpName: Callable used to return gene unique name from one VCF annotation.
    :type annCmpName: callable(annot)
    :param regCmpName: Callable used to return gene unique name from a gene region.
    :type regCmpName: callable(anacore.genomicRegion.Gene)
    :return: True if the two breakends can be a readthrough.
    :rtype: boolean
    """
    if up.chrom != down.chrom:
        return False
    up_strand = getStrand(up, True)
    down_strand = getStrand(down, False)
    if up_strand != down_strand or up_strand not in ("+", "-"):  # Readthrough are +/+ or -/-
        return False
    # Order the breakends by position on the reference
    left, right = (up, down) if up.pos <= down.pos else (down, up)
    left_start, left_end = getBNDInterval(left)
    right_start, right_end = getBNDInterval(right)
    span_start = min(left_start, right_start)
    span_end = max(left_end, right_end) + 1
    if span_end - span_start > rt_max_dist:
        return False
    # Genes specific to each breakend
    left_genes = {annCmpName(annot) for annot in left.info[annotation_field]}
    right_genes = {annCmpName(annot) for annot in right.info[annotation_field]}
    shared_genes = left_genes & right_genes
    left_only = left_genes - right_genes
    right_only = right_genes - left_genes
    if not left_only or not right_only:
        return False
    # Keep only the specific genes on the shards strand
    strand_by_gene = {
        annCmpName(annot): annot["STRAND"]
        for annot in left.info[annotation_field] + right.info[annotation_field]
    }
    left_only = {gene for gene in left_only if strand_by_gene[gene] == up_strand}
    right_only = {gene for gene in right_only if strand_by_gene[gene] == up_strand}
    if not left_only or not right_only:
        return False
    # Search a gene between the start gene and the end gene that contradicts the readthrough
    span = Region(span_start, span_end, up_strand, left.chrom)
    candidates = RegionList([
        gene for gene in genes.getChr(left.chrom).getOverlapped(span)
        if regCmpName(gene) not in shared_genes and gene.strand == up_strand
    ])
    gene_by_name = {regCmpName(gene): gene for gene in candidates}
    contradicted = False
    for start_name in left_only:
        start_gene = gene_by_name[start_name]
        for end_name in right_only:
            end_gene = gene_by_name[end_name]
            for between in candidates:
                between_name = regCmpName(between)
                if between_name == regCmpName(start_gene) or between_name == regCmpName(end_gene):
                    continue
                if not between.hasOverlap(start_gene) and not between.hasOverlap(end_gene):
                    contradicted = True
    return not contradicted
def selectedPos(first, first_exons_sup_by_pos, second, second_exons_sup_by_pos):
    """
    Return retained spot positions for the first and the second breakend when they contain CIPOS (exception for imprecise). Choice is based on placement on exons boundaries contained in CIPOS interval.

    Without exon boundary on the two breakends, the positions of the records are returned unchanged. With boundaries on only one breakend, its most supported boundary is retained and the other breakend is shifted accordingly. With boundaries on the two breakends, the shift common to both is retained (the most supported one if several are common); without common shift, the most supported boundary of the first breakend is retained.

    :param first: Breakend of the 5' shard of the fusion.
    :type first: anacore.vcf.VCFRecord
    :param first_exons_sup_by_pos: By positions of exons boundaries overlapped by the first breakend, the number of alternative transcripts with this exon boundaries.
    :type first_exons_sup_by_pos: dict
    :param second: Breakend of the 3' shard of the fusion.
    :type second: anacore.vcf.VCFRecord
    :param second_exons_sup_by_pos: By positions of exons boundaries overlapped by the second breakend, the number of alternative transcripts with this exon boundaries.
    :type second_exons_sup_by_pos: dict
    :return: Retained spot positions for the first and the second breakend when they contain CIPOS.
    :rtype: (int, int)
    """
    has_first_exons = len(first_exons_sup_by_pos) != 0
    has_second_exons = len(second_exons_sup_by_pos) != 0
    if not has_first_exons and not has_second_exons:  # No shard contain an exon at breakend pos
        return (first.pos, second.pos)

    def ciposEnd(record):
        # Upper bound of CIPOS, 0 when the record has no CIPOS
        return record.info["CIPOS"][1] if "CIPOS" in record.info else 0

    same_strand = getStrand(first, True) == getStrand(second, False)

    if not has_second_exons:  # Only the 5' shard contains at least one exon at breakend pos
        retained = getMostSupported(first_exons_sup_by_pos)
        shift = retained - first.pos
        if same_strand:
            return (retained, second.pos + shift)
        return (retained, second.pos + ciposEnd(second) - shift)

    if not has_first_exons:  # Only the 3' shard contains at least one exon at breakend pos
        retained = getMostSupported(second_exons_sup_by_pos)
        shift = retained - second.pos
        if same_strand:
            return (first.pos + shift, retained)
        return (first.pos + ciposEnd(first) - shift, retained)

    # The two shards contain at least one exon at breakend pos
    second_cipos = ciposEnd(second)

    def secondShift(pos):
        # Shift of a second breakend boundary expressed in the referential of the first breakend
        raw_shift = pos - second.pos
        return raw_shift if same_strand else abs(second_cipos - raw_shift)

    first_shifts = {pos - first.pos for pos in first_exons_sup_by_pos}
    second_shifts = {secondShift(pos) for pos in second_exons_sup_by_pos}
    shared_shifts = first_shifts & second_shifts
    if len(shared_shifts) == 1:  # Only one common offset between two breakends
        (shift,) = shared_shifts
    elif len(shared_shifts) > 1:  # Several common offsets: the one with the highest summed support is retained
        support_by_shift = {
            pos - first.pos: support
            for pos, support in first_exons_sup_by_pos.items()
            if pos - first.pos in shared_shifts
        }
        for pos, support in second_exons_sup_by_pos.items():
            curr_shift = secondShift(pos)
            if curr_shift in shared_shifts:
                support_by_shift[curr_shift] += support
        shift = getMostSupported(support_by_shift)
    else:  # No common offset: the most supported exon boundary of the first breakend is retained
        shift = getMostSupported(first_exons_sup_by_pos) - first.pos
    if same_strand:
        return (first.pos + shift, second.pos + shift)
    return (first.pos + shift, second.pos + second_cipos - shift)
def annotModelRetIntron(first, second, annotation_field):
    """
    Add GENE_SHARD and IN_FRAME in annotations in a context where introns may have been retained.

    GENE_SHARD determines if the part of the transcript implicated on the fusion RNA is the 5' of the original (up) or the 3' (down). It is set by annotGeneShard().
    IN_FRAME determines if the transcript in the 3' shard of the fusion transcript express a part of the original protein: 5' shard imports promoter and start of the first transcript in right strand, 3' shard imports the end of the second transcript in right strand and the phase of the second transcript is kept.
    For each annotation, IN_FRAME is written as "feature:value" pairs (one by annotation of the mate) joined by "&". The value is "1" (in frame), "0" (not in frame) or "." (cannot be determined).
    The following table details IN_FRAME values for all the analysed configurations:

        5' shard    3' shard  Inframe  Note
        5'UTR       5'UTR     1        The first does not start
        CDS         5'UTR     ?/1      1 if first BP is on intron or on end of exon and the second is on intron or start of exon and UTR of second as length compatible to phase.
        3'UTR       5'UTR     ?        If the transcription terminator is cut and the sequence continue to exon and splice to the next or if is readthrough
        5'UTR       CDS       ?        Can be use an other TSS and translation start.
        CDS         CDS       0/1      Check the phase (end of first shard and start of second shard)
        3'UTR       CDS       ?        If the transcription terminator is cut and the sequence continue to exon and splice to the next or if is readthrough
        *           3'UTR     0        The second is not expressed
        non-coding  5'UTR     1        1 if first and second BND are in intron or in splice site.

    The two records are modified in place; nothing is returned.

    :param first: Breakend of the 5' shard of the fusion.
    :type first: anacore.vcf.VCFRecord
    :param second: Breakend of the 3' shard of the fusion.
    :type second: anacore.vcf.VCFRecord
    :param annotation_field: Field used for store annotations.
    :type annotation_field: str
    """
    first_strand = getStrand(first, True)
    second_strand = getStrand(second, False)
    annotGeneShard(first, annotation_field)
    annotGeneShard(second, annotation_field)
    # IN_FRAME is a list during the pairwise evaluation, it is joined in string at the end
    for second_annot in second.info[annotation_field]:
        if "IN_FRAME" not in second_annot:
            second_annot["IN_FRAME"] = []
    for first_annot in first.info[annotation_field]:
        if "IN_FRAME" not in first_annot:
            first_annot["IN_FRAME"] = []
        for second_annot in second.info[annotation_field]:
            inframe = "0"
            if first_annot["GENE_SHARD"] == "up" and second_annot["GENE_SHARD"] == "down":
                if first_strand == first_annot["STRAND"] and second_strand == second_annot["STRAND"]:
                    if second_annot["Protein"] != "":  # The second RNA is coding
                        inframe = "."
                        if first_annot["Protein"] != "":  # First and second transcripts are coding
                            if not first_annot["RNA_ELT_TYPE"].endswith("utr") and not second_annot["RNA_ELT_TYPE"].endswith("utr"):  # first: CDS and second: CDS
                                is_intron_jct = "intron" in first_annot["RNA_ELT_TYPE"] and "intron" in second_annot["RNA_ELT_TYPE"]
                                is_exon_jct = None
                                if not is_intron_jct:
                                    is_exon_jct = "exon" in first_annot["RNA_ELT_TYPE"] and "exon" in second_annot["RNA_ELT_TYPE"]
                                if is_intron_jct or is_exon_jct:  # ################## TODO: check stop codon ?
                                    inframe = "0"
                                    # The phase is kept when the codon position of the second directly follows the one of the first
                                    if (first_annot["Codon_position"] == 3 and second_annot["Codon_position"] == 1) or (second_annot["Codon_position"] - first_annot["Codon_position"] == 1):
                                        inframe = "1"
                            else:  # At least one breakend falls in UTR
                                if second_annot["RNA_ELT_TYPE"].endswith("utr") and second_annot["RNA_ELT_POS"].endswith("3prim"):  # first: * and second: 3'UTR
                                    inframe = "0"
                                elif first_annot["RNA_ELT_TYPE"].endswith("utr") and second_annot["RNA_ELT_TYPE"].endswith("utr"):  # first: UTR and second: UTR
                                    if first_annot["RNA_ELT_POS"].endswith("5prim") and second_annot["RNA_ELT_POS"].endswith("5prim"):  # first: 5'UTR and second: 5'UTR
                                        inframe = "1"
                                elif not first_annot["RNA_ELT_TYPE"].endswith("utr") and second_annot["RNA_ELT_TYPE"].endswith("utr"):  # first: CDS and second: UTR
                                    if second_annot["RNA_ELT_POS"].endswith("5prim"):  # first: CDS and second: 5'UTR
                                        skip = False
                                        if "exon" in first_annot["RNA_ELT_TYPE"] and "intron" in second_annot["RNA_ELT_TYPE"]:
                                            # Exon to intron is evaluated only if the first is on the end of its exon
                                            skip = True
                                            if "spliceStart" in first_annot["RNA_ELT_TYPE"] or "transcriptEnd" in first_annot["RNA_ELT_TYPE"]:
                                                skip = False
                                        elif "intron" in first_annot["RNA_ELT_TYPE"] and "exon" in second_annot["RNA_ELT_TYPE"]:
                                            # Intron to exon is evaluated only if the second is on the start of its exon.
                                            # The test is on the second annotation: the first is on an intron here and
                                            # cannot carry a spliceEnd/transcriptStart tag.
                                            skip = True
                                            if "spliceEnd" in second_annot["RNA_ELT_TYPE"] or "transcriptStart" in second_annot["RNA_ELT_TYPE"]:
                                                skip = False
                                        if not skip:
                                            inframe = "0"
                                            first_and_utr_codon_pos = first_annot["Codon_position"] + (second_annot["CDS_DIST"] % 3)  # ################## TODO: check stop codon ?
                                            if first_and_utr_codon_pos == 3:
                                                inframe = "1"
                        else:  # First is not coding
                            if second_annot["RNA_ELT_TYPE"].endswith("utr") and second_annot["RNA_ELT_POS"].endswith("3prim"):  # first: * and second: 3'UTR
                                inframe = "0"
                            elif second_annot["RNA_ELT_POS"].endswith("5prim"):  # first: non-coding and second: 5'UTR
                                if first_annot["RNA_ELT_TYPE"].startswith("intron"):
                                    if second_annot["RNA_ELT_TYPE"].startswith("intron"):  # from non-coding first in intron to second 5'UTR in intron
                                        inframe = "1"
                                elif "spliceStart" in first_annot["RNA_ELT_TYPE"] or "transcriptEnd" in first_annot["RNA_ELT_TYPE"]:
                                    if "spliceEnd" in second_annot["RNA_ELT_TYPE"] or "transcriptStart" in second_annot["RNA_ELT_TYPE"]:  # from non-coding first on splice donor to second 5'UTR on splice acceptor
                                        inframe = "1"
            first_annot["IN_FRAME"].append("{}:{}".format(second_annot["Feature"], inframe))
            second_annot["IN_FRAME"].append("{}:{}".format(first_annot["Feature"], inframe))
    for first_annot in first.info[annotation_field]:
        first_annot["IN_FRAME"] = "&".join(first_annot["IN_FRAME"])
    for second_annot in second.info[annotation_field]:
        second_annot["IN_FRAME"] = "&".join(second_annot["IN_FRAME"])
def _annotUTR(curr_annot, curr_protein, bnd_pos):
    """
    Tag the annotation as UTR, with its side (5prim/3prim) and, for a 5' UTR, the distance to the CDS.

    :param curr_annot: The annotation to update. RNA_ELT_TYPE and RNA_ELT_POS must already be set.
    :type curr_annot: dict
    :param curr_protein: The protein of the annotated transcript.
    :type curr_protein: anacore.genomicRegion.Protein
    :param bnd_pos: Position of the breakend on the reference.
    :type bnd_pos: int
    """
    curr_annot["RNA_ELT_TYPE"] += "&utr"
    on_forward = curr_protein.strand == "+"
    if on_forward:
        is_5prim = curr_protein.start > bnd_pos
    else:
        is_5prim = curr_protein.end < bnd_pos
    curr_annot["RNA_ELT_POS"] += "&" + ("5prim" if is_5prim else "3prim")
    if is_5prim:
        if on_forward:
            curr_annot["CDS_DIST"] = getDistBeforeCDSForward(bnd_pos, curr_protein)
        else:
            curr_annot["CDS_DIST"] = getDistBeforeCDSReverse(bnd_pos, curr_protein)


def getGeneAnnot(record, genes_by_chr):
    """
    Return genomic items overlapped by the BND record.

    Each annotation contains: SYMBOL, Gene, Feature, Feature_type, STRAND, Protein, RNA_ELT_TYPE, RNA_ELT_POS, CDS_position, Protein_position, Codon_position and, for breakends located in 5' UTR, CDS_DIST.

    :param record: The BND record. Its INFO must contain ANNOT_POS.
    :type record: anacore.vcf.VCFRecord
    :param genes_by_chr: By chromosomes a tree where nodes are genes, transcripts, protein, exons and CDS.
    :type genes_by_chr: dict
    :return: The list of annotations (one annotation by overlapped transcript).
    :rtype: list
    """
    record_strand = getStrand(record)
    shard_before_bnd = shardIsBeforeBND(record)
    bnd_region = Region(record.info["ANNOT_POS"], None, None, record.chrom, record.getName())
    annotations = []
    if record.chrom not in genes_by_chr:
        return annotations
    overlapped_genes = genes_by_chr[record.chrom].getOverlapped(bnd_region)
    for curr_gene in overlapped_genes:
        overlapped_transcripts = curr_gene.children.getOverlapped(bnd_region)
        if len(overlapped_transcripts) == 0:
            log.warning(
                "The breakpoint {} is contained by gene {} but by 0 of these transcripts."
                .format(bnd_region, curr_gene))
            continue
        for curr_transcript in overlapped_transcripts:
            # NOTE(review): assumes `log` is a standard logging.Logger. The keyword is exc_info: the
            # previous exec_info spelling is rejected by Logger.error() with a TypeError.
            if len(curr_transcript.proteins) > 1:
                log.error(
                    "The management of several proteins for one transcript is not implemented. The transcript {} contains several proteins {}."
                    .format(curr_transcript, curr_transcript.proteins),
                    exc_info=True)
            if curr_transcript.strand is None:
                log.error(
                    "The transcript {} has no strand.".format(curr_transcript),
                    exc_info=True)
            curr_annot = {
                "SYMBOL": curr_gene.name,
                "Gene": curr_gene.annot["id"],
                "Feature": curr_transcript.annot["id"],
                "Feature_type": "Transcript",
                "STRAND": curr_transcript.strand,
                "Protein": "" if len(curr_transcript.proteins) == 0 else curr_transcript.proteins[0].annot["id"],
                "RNA_ELT_TYPE": None,
                "RNA_ELT_POS": None,
                "CDS_position": None,
                "Protein_position": None,
                "Codon_position": None
            }
            # Intron, exon and CDS position
            subregion, subregion_idx = curr_transcript.getSubFromRefPos(bnd_region.start)
            if isinstance(subregion, Intron):  # On intron
                curr_annot["RNA_ELT_TYPE"] = "intron"
                curr_annot["RNA_ELT_POS"] = "{}/{}".format(subregion_idx, len(curr_transcript.children) - 1)
                if len(curr_transcript.proteins) > 0 and curr_transcript.strand == record_strand:
                    curr_protein = curr_transcript.proteins[0]
                    # Get CDS on last implicated exon for first shard and first implicated exon on second shard
                    ref_pos = subregion.end + 1
                    if shard_before_bnd:
                        ref_pos = subregion.start - 1
                    cds_pos = curr_protein.getNtPosFromRefPos(ref_pos)
                    if cds_pos is None or cds_pos == 1 or cds_pos == curr_protein.length:
                        # The neighbouring exon position is out of the CDS or on its first/last nucleotide: processed as UTR
                        curr_annot["CDS_position"] = None
                        _annotUTR(curr_annot, curr_protein, bnd_region.start)
                    else:
                        curr_annot["CDS_position"] = cds_pos
                        curr_annot["Protein_position"], curr_annot["Codon_position"] = curr_protein.getPosOnRegion(ref_pos)
            else:  # On exon
                nb_exon = len(curr_transcript.children)
                curr_annot["RNA_ELT_TYPE"] = "exon"
                curr_annot["RNA_ELT_POS"] = "{}/{}".format(subregion_idx, nb_exon)
                if bnd_region.start == subregion.start:
                    if subregion_idx == 1 and subregion.strand == "+":  # Start of the first exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptStart"
                    elif subregion_idx == nb_exon and subregion.strand == "-":  # End of the last exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptEnd"
                    else:
                        curr_annot["RNA_ELT_TYPE"] += "&splice" + ("End" if subregion.strand == "+" else "Start")
                elif bnd_region.start == subregion.end:
                    if subregion_idx == 1 and subregion.strand == "-":  # Start of the first exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptStart"
                    elif subregion_idx == nb_exon and subregion.strand == "+":  # End of the last exon
                        curr_annot["RNA_ELT_TYPE"] += "&transcriptEnd"
                    else:
                        curr_annot["RNA_ELT_TYPE"] += "&splice" + ("End" if subregion.strand == "-" else "Start")
                if len(curr_transcript.proteins) > 0:
                    curr_protein = curr_transcript.proteins[0]
                    curr_annot["CDS_position"] = curr_protein.getNtPosFromRefPos(bnd_region.start)
                    if curr_annot["CDS_position"] is None:  # UTR
                        _annotUTR(curr_annot, curr_protein, bnd_region.start)
                    else:  # Protein position
                        curr_annot["Protein_position"], curr_annot["Codon_position"] = curr_protein.getPosOnRegion(bnd_region.start)
            # Add to annotations
            annotations.append(curr_annot)
    return annotations
def _nbLeadingMatches(first_seq, second_seq):
    """Return the number of consecutive identical nucleotides at the beginning of the two sequences."""
    nb_matches = 0
    for nt_first, nt_second in zip(first_seq, second_seq):
        if nt_first != nt_second:
            break
        nb_matches += 1
    return nb_matches


def _fillAltRef(alt, ref):
    """
    Return the breakend ALT where the "N" placeholder of the replacement bases is replaced by ref.

    Only the bases part of the ALT (before the first bracket or after the last one) is changed: the mate locus between brackets is left untouched, so a contig name containing "N" is not altered.
    """
    brackets_idx = [idx for idx, char in enumerate(alt) if char in "[]"]
    if len(brackets_idx) == 0:
        return alt.replace("N", ref)
    if brackets_idx[0] == 0:  # "]p]t" or "[p[t": the bases follow the mate locus
        cut = brackets_idx[-1] + 1
        return alt[:cut] + alt[cut:].replace("N", ref)
    cut = brackets_idx[0]  # "t[p[" or "t]p]": the bases precede the mate locus
    return alt[:cut].replace("N", ref) + alt[cut:]


def fastStandardize(first, second, seq_handler, padding=50):
    """
    Each breakend of the pair is placed at the left most position, and the uncertainty is represented with the CIPOS tag. The ALT string is then constructed assuming this choice.

    The two records are modified in place (pos, ref, alt[0] and INFO CIPOS) only if the breakends can be moved; nothing is returned.

    :param first: The breakend of the first shard in fusion (donor).
    :type first: anacore.vcf.VCFRecord
    :param second: The breakend of the second shard in fusion (acceptor).
    :type second: anacore.vcf.VCFRecord
    :param seq_handler: Indexed reader for the reference genome used in fusion calling.
    :type seq_handler: anacore.sequenceIO.IdxFastaIO
    :param padding: Number of nucleotids to inspect before and after the breakends: upstream and downstream movements are limited to this number of nucleotids.
    :type padding: int
    """
    first_strand = getStrand(first, True)
    second_strand = getStrand(second, False)
    before_first = seq_handler.getSub(first.chrom, max(first.pos - padding, 1), first.pos)
    before_second = seq_handler.getSub(second.chrom, max(second.pos - padding, 1), second.pos)
    after_first = seq_handler.getSub(first.chrom, first.pos, first.pos + padding)
    after_second = seq_handler.getSub(second.chrom, second.pos, second.pos + padding)
    if first_strand == second_strand:  # Same strand
        # Move to upstream
        before_first_seq = before_first
        before_second_seq = before_second[:-1]
        if first_strand == "-":
            before_first_seq = before_first[:-1]
            before_second_seq = before_second
        cipos_start = -_nbLeadingMatches(before_first_seq[::-1], before_second_seq[::-1])
        # Move to downstream
        after_first_seq = after_first[1:]
        after_second_seq = after_second
        if first_strand == "-":
            after_first_seq = after_first
            after_second_seq = after_second[1:]
        cipos_end = _nbLeadingMatches(after_first_seq, after_second_seq)
        # Update records
        if cipos_start != 0 or cipos_end != 0:
            first.pos = first.pos + cipos_start
            first.ref = seq_handler.getSub(first.chrom, first.pos, first.pos)
            first.info["CIPOS"] = [0, cipos_end - cipos_start]
            second.pos = second.pos + cipos_start
            second.ref = seq_handler.getSub(second.chrom, second.pos, second.pos)
            second.info["CIPOS"] = [0, cipos_end - cipos_start]
            first.alt[0], second.alt[0] = getAltFromCoord(
                getCoordStr(first, True),
                getCoordStr(second, False))
            # str.replace() returns a new string: the result must be stored
            first.alt[0] = _fillAltRef(first.alt[0], first.ref)
            second.alt[0] = _fillAltRef(second.alt[0], second.ref)
    else:  # Different strand
        before_second = getComplement(before_second)
        after_second = getComplement(after_second)
        # Move before first containing breakend and after second excluding breakend
        before_first_seq = before_first
        after_second_seq = after_second[1:]
        if first_strand == "-":
            before_first_seq = before_first[:-1]
            after_second_seq = after_second
        cipos_start = -_nbLeadingMatches(before_first_seq[::-1], after_second_seq)
        # Move before second containing breakend and after first excluding breakend
        after_first_seq = after_first[1:]
        before_second_seq = before_second
        if first_strand == "-":
            after_first_seq = after_first
            before_second_seq = before_second[:-1]
        cipos_end = _nbLeadingMatches(after_first_seq, before_second_seq[::-1])
        # Update records
        if cipos_start != 0 or cipos_end != 0:
            first.pos = first.pos + cipos_start
            first.ref = seq_handler.getSub(first.chrom, first.pos, first.pos)
            first.info["CIPOS"] = [0, cipos_end - cipos_start]
            second.pos = second.pos - cipos_end  # because cipos_start for first is - cipos_end for second
            second.ref = seq_handler.getSub(second.chrom, second.pos, second.pos)
            second.info["CIPOS"] = list(first.info["CIPOS"])  # Copy: the two records must not share the same list
            # The ALT of each breakend targets the most downstream position of its mate
            second_down_pos = second.pos + (cipos_end - cipos_start)
            first.alt[0], _ = getAltFromCoord(
                getCoordStr(first, True),
                {"chrom": second.chrom, "pos": second_down_pos, "strand": second_strand})
            first.alt[0] = _fillAltRef(first.alt[0], first.ref)
            first_down_pos = first.pos + (cipos_end - cipos_start)
            _, second.alt[0] = getAltFromCoord(
                {"chrom": first.chrom, "pos": first_down_pos, "strand": first_strand},
                getCoordStr(second, False))
            second.alt[0] = _fillAltRef(second.alt[0], second.ref)