def changeCosmicAnnotations(record, annot_field, cosmic_reader):
    """
    Replace non-allele-specific COSMIC annotations produced by VEP by allele-specific annotations.

    For each annotation of the record, every existing variation starting with
    "COS" is dropped and replaced by the sorted IDs of the databank alleles whose
    name (chromosome:position=reference/alternative, compared case-insensitively
    and without "chr" prefix) is the same as the annotated allele. The
    annotations are modified in place.

    .. warning::
        Alleles not annotated by VEP will remain unannotated despite data in databank.

    :param record: Annotated VCF record from VEP.
    :type record: anacore.vcf.VCFRecord
    :param annot_field: Field used to store annotations.
    :type annot_field: str
    :param cosmic_reader: File handler open on COSMIC databank with mode 'i'.
    :type cosmic_reader: anacore.vcf.VCFIO
    """
    # The "chr" prefix is detected case-insensitively so that the databank query
    # and the names compared below always agree on the chromosome.
    has_chr_prefix = record.chrom.upper().startswith("CHR")
    chrom_no_prefix = record.chrom[3:] if has_chr_prefix else record.chrom
    annot_name_prefix = "{}:{}={}/".format(
        chrom_no_prefix.upper(), record.pos, record.ref.upper()
    )
    # Get overlapping COSMIC variants
    db_overlapping = cosmic_reader.getSub(
        chrom_no_prefix,
        int(record.refStart()),
        int(record.refEnd() + 0.5)
    )
    # Index the raw ID field of overlapping databank alleles by upper-cased name.
    # This is done once per record instead of once per annotation.
    raw_ids_by_name = {}
    for db_record in db_overlapping:
        if len(db_record.alt) == 1:
            db_names = [db_record.getName()]
        else:
            db_names = [
                getAlleleRecord(cosmic_reader, db_record, alt_idx).getName()
                for alt_idx, _ in enumerate(db_record.alt)
            ]
        for db_name in db_names:
            raw_ids_by_name.setdefault(db_name.upper(), []).append(db_record.id)
    # Replace COSMIC annotations
    for annot in record.info[annot_field]:
        annot_name = annot_name_prefix + annot["Allele"].upper()
        # Remove old COSMIC annotations
        new_existing = []
        if annot["Existing_variation"] is not None:
            new_existing = [
                curr_exist
                for curr_exist in annot["Existing_variation"].split("&")
                if not curr_exist.startswith("COS")
            ]
        # Add new COSMIC annotations (the ID field can hold several IDs separated by ";")
        db_ids = set()
        for raw_ids in raw_ids_by_name.get(annot_name, ()):
            db_ids.update(raw_ids.split(";"))
        new_existing += sorted(db_ids)
        # Change existing variants
        annot["Existing_variation"] = "&".join(new_existing) if new_existing else None
def normAndMove(genome_path, in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized and left-moved version of each variant.

    The normalization consists of three steps:
      1- The variants with multiple alternative alleles are split in one
      record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and
      alternative and reference allele are reduced to the minimal string
      (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).
      3- The allele is replaced by the most upstream allele that can have the
      same alternative sequence (example: a deletion in homopolymer is moved
      to the first nucleotide of this homopolymer).

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    unstd_description = "The variant id (chromosome:position=reference/alternative) before standardization."
    sequences = getSeqByChr(genome_path)
    # The output is opened before the input, as with nested "with" statements
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header
        writer.copyHeader(reader)
        if trace_unstandard:
            writer.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD", type="String", number="1", description=unstd_description
            )
        writer.writeHeader()
        # Records
        for record in reader:
            chrom_seq = sequences[record.chrom]
            for allele_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, allele_idx)
                if trace_unstandard:
                    # The name is stored before the allele is moved
                    joined_alt = "/".join(allele.alt)
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref, joined_alt
                    )
                upstream_allele = allele.getMostUpstream(chrom_seq)
                writer.write(upstream_allele)
def normOnly(in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized version of each variant.

    The normalization consists of two steps:
      1- The variants with multiple alternative alleles are split in one
      record by alternative allele.
      2- In each allele the empty allele marker is replaced by a dot and
      alternative and reference allele are reduced to the minimal string
      (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA.).

    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    unstd_description = "The variant id (chromosome:position=reference/alternative) before standardization."
    # The output is opened before the input, as with nested "with" statements
    with VCFIO(out_variant_file, "w") as writer, VCFIO(in_variant_file) as reader:
        # Header
        writer.copyHeader(reader)
        if trace_unstandard:
            writer.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD", type="String", number="1", description=unstd_description
            )
        writer.writeHeader()
        # Records
        for record in reader:
            for allele_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, allele_idx)
                if trace_unstandard:
                    # The name is stored before the allele is normalized
                    joined_alt = "/".join(allele.alt)
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref, joined_alt
                    )
                allele.normalizeSingleAllele()
                writer.write(allele)
def groupBNDByFusions(bnd_by_id, annotation_field):
    """
    Return by chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).

    Records with several alternatives are split by mate and the IDs of the two
    breakends of a fusion are rewritten so that each one references the other
    through MATEID. When the same pair of breakends is met several times, the
    annotations of the new occurrence are appended to the ones already stored.

    :param bnd_by_id: Breakend by ID coming from one fusion caller.
    :type bnd_by_id: dict
    :param annotation_field: Field used to store annotations.
    :type annotation_field: str
    :return: By chromosome the region of the first breakend in each fusion. The annotation of regions contains the two breakends (tags: first and second).
    :rtype: dict
    :raises Exception: If none of the two breakends of a fusion has the tag RNA_FIRST.
    """
    caller_fusions = {}
    processed_fusions = set()
    fusion_by_name = {}
    for record in bnd_by_id.values():
        for alt_idx, _ in enumerate(record.alt):
            alt_first_bnd = record
            first_new_id = alt_first_bnd.id
            if len(record.alt) > 1:  # Record must be split for each mate
                first_new_id += "_" + str(alt_idx)
                # NOTE(review): getAlleleRecord is called with (record, index)
                # only; confirm this matches the signature of the helper in scope.
                alt_first_bnd = getAlleleRecord(record, alt_idx)
                alt_first_bnd.info["MATEID"] = [record.info["MATEID"][alt_idx]]
            mate_id = alt_first_bnd.info["MATEID"][0]
            mate_record = bnd_by_id[mate_id]
            alt_second_bnd = mate_record
            second_new_id = alt_second_bnd.id
            if len(mate_record.alt) > 1:  # Record must be split for each mate
                first_idx = mate_record.info["MATEID"].index(alt_first_bnd.id)
                # list.index() returns an int: it must be converted before concatenation
                second_new_id += "_" + str(first_idx)
                alt_second_bnd = getAlleleRecord(mate_record, first_idx)
                alt_second_bnd.info["MATEID"] = [
                    mate_record.info["MATEID"][first_idx]
                ]
            # The fusion ID is built from the IDs held before renaming
            fusion_id = " @@ ".join(
                sorted([alt_first_bnd.id, alt_second_bnd.id])
            )
            # Each breakend takes its new ID and references the new ID of its mate.
            # NOTE(review): unsplit records are renamed in place; confirm that a
            # mate renamed here can still be resolved through bnd_by_id later.
            alt_first_bnd.id = first_new_id
            alt_second_bnd.info["MATEID"] = [first_new_id]
            alt_second_bnd.id = second_new_id
            alt_first_bnd.info["MATEID"] = [second_new_id]
            if fusion_id not in processed_fusions:
                processed_fusions.add(fusion_id)
                if "RNA_FIRST" not in alt_first_bnd.info and "RNA_FIRST" not in alt_second_bnd.info:
                    raise Exception(
                        "Tag RNA_FIRST must be present in one of the breakend {} or {}."
                        .format(alt_first_bnd.id, mate_id)
                    )
                if "RNA_FIRST" in alt_second_bnd.info:
                    # The breakend tagged RNA_FIRST is always stored as "first"
                    alt_first_bnd, alt_second_bnd = alt_second_bnd, alt_first_bnd
                interval_first_bnd = getBNDInterval(alt_first_bnd)
                fusion_name = " @@ ".join(
                    sorted([alt_first_bnd.getName(), alt_second_bnd.getName()])
                )
                if fusion_name not in fusion_by_name:
                    region_first_bnd = Region(
                        interval_first_bnd[0],
                        interval_first_bnd[1],
                        reference=alt_first_bnd.chrom,
                        annot={
                            "first": alt_first_bnd,
                            "second": alt_second_bnd
                        }
                    )
                    if alt_first_bnd.chrom not in caller_fusions:
                        caller_fusions[alt_first_bnd.chrom] = RegionList()
                    caller_fusions[alt_first_bnd.chrom].append(region_first_bnd)
                    fusion_by_name[fusion_name] = region_first_bnd
                else:
                    # Caller contains several entries for the same pair of breakends (same fusion but several annotations)
                    known_fusion = fusion_by_name[fusion_name]
                    known_fusion.annot["first"].info[annotation_field] += alt_first_bnd.info[annotation_field]
                    known_fusion.annot["second"].info[annotation_field] += alt_second_bnd.info[annotation_field]
    return caller_fusions
primers_by_chr = getPrimersByChr(args.input_regions) chr_seq = None with VCFIO(args.input_variants) as FH_in: with VCFIO(args.output_variants, "w") as FH_out: # Header FH_out.copyHeader(FH_in) FH_out.filter["PRIM"] = HeaderFilterAttr( 'PRIM', 'The variant is located on an amplicon primer (amplicon desgin: ' + args.input_regions + ').') FH_out.writeHeader() # Records for record in FH_in: for alt_idx, alt in enumerate(record.alt): is_kept = True alt_record = getAlleleRecord(FH_in, record, alt_idx) alt_region = getVariantRegion(alt_record) overlapped_primers = primers_by_chr[ alt_record.chrom].getOverlapped(alt_region) if len( overlapped_primers ) > 0: # The variant overlaps a primer or variant is an insertion just before the downstream primer is_kept = False if len(overlapped_primers ) == 1: # Variants over 2 primers are removed if alt_record.isIndel(): if chr_seq is None or chr_seq.id != alt_record.chrom: chr_seq = getSeqRecord( args.input_sequences, alt_record.chrom) if canBeMovedToInterest( overlapped_primers[0], chr_seq.string,
def addVCFVariants(variants, vcf_path, vcf_idx, spl_name=None):
    """
    Add the variants of one VCF in a dict shared between several VCF.

    :param variants: By unique ID the variants. This dict is completed in place by each call of the function. Content example:
      {
        "chr1:10=A/T":{
          "chrom":"chr1",
          "pos":10,
          "ref":"A",
          "alt":"T",
          "freq":[0.2, 0.5]
        },
        "chr3:20=G/T":{
          "chrom":"chr3",
          "pos":20,
          "ref":"G",
          "alt":"T",
          "freq":[0, 0.4]
        }
      }
      The list of frequencies is extended by each call of the function with a different vcf_idx. A variant missing in a VCF has the frequency 0 at the index of this VCF.
    :type variants: dict
    :param vcf_path: Path to the VCF file to add.
    :type vcf_path: str
    :param vcf_idx: Index used to store the frequency of each variant of the VCF in frequencies list (start from 0).
    :type vcf_idx: int
    :param spl_name: The frequencies of the variants come from this sample. When it is None the first sample declared in the VCF is used.
    :type spl_name: str
    """
    expected_len = vcf_idx + 1
    with VCFIO(vcf_path) as reader:
        selected_spl = reader.samples[0] if spl_name is None else spl_name
        for record in reader:
            alt_freqs = record.getAltAF(selected_spl)
            # One entry by alternative allele
            for alt_pos, _ in enumerate(record.alt):
                allele = getAlleleRecord(reader, record, alt_pos)
                allele.normalizeSingleAllele()
                key = allele.getName()
                if key not in variants:
                    variants[key] = {
                        "chrom": allele.chrom,
                        "pos": allele.pos,
                        "ref": allele.ref,
                        "alt": allele.alt[0],
                        "freq": list()
                    }
                freqs = variants[key]["freq"]
                # Pad with 0 for the previous VCF where the variant is missing
                freqs.extend([0] * (expected_len - len(freqs)))
                # Store the frequency of the allele in the current VCF
                freqs[vcf_idx] = alt_freqs[alt_pos]
    # Pad with 0 the variants missing in the current VCF
    for entry in variants.values():
        freqs = entry["freq"]
        freqs.extend([0] * (expected_len - len(freqs)))
# Process nb_variants = 0 nb_filtered = 0 noise_by_variant = getNoise(args.input_noises) with VCFIO(args.input_variants) as FH_in: with VCFIO(args.output_variants, "w") as FH_out: # Header FH_out.copyHeader(FH_in) FH_out.filter[args.tag_name] = HeaderFilterAttr(args.tag_name, args.tag_description) FH_out.writeHeader() # Records for record in FH_in: for idx in range(len(record.alt)): nb_variants += 1 curr_allele = getAlleleRecord(FH_in, record, idx) # Compare signal to noise if curr_allele.getName() in noise_by_variant: nb_spl_over_noise = 0 for curr_spl in curr_allele.samples: if curr_allele.getAltAF(curr_spl)[0] > noise_by_variant[curr_allele.getName()]: nb_spl_over_noise += 1 if nb_spl_over_noise == 0: nb_filtered += 1 if curr_allele.filter is None or len(curr_allele.filter) == 0: curr_allele.filter = [args.tag_name] else: if "PASS" in curr_allele.filter: curr_allele.filter.remove("PASS") curr_allele.filter.append(args.tag_name) # Update empty filter
def stdizeVCF(FH_ref, FH_in, FH_out, trace_unstandard=False, log=None):
    """
    Split alternative alleles in multi-lines, remove unnecessary reference and alternative nucleotides, move indel to most upstream position and update alt allele in annotations.

    When the output is an annotated VCF, only the annotations whose allele is
    the alternative allele before standardization are kept. The other
    annotations (collocated variants) are removed and counted for the warning.

    :param FH_ref: File handle to the reference file (format: fasta with faidx).
    :type FH_ref: anacore.sequenceIO.IdxFastaIO
    :param FH_in: File handle to the variants file (format: VCF).
    :type FH_in: anacore.vcf.VCFIO
    :param FH_out: File handle to the standardized variants file (format: VCF).
    :type FH_out: anacore.vcf.VCFIO
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    :param log: Logger used.
    :type log: logging.Logger
    """
    nb_exact = 0
    nb_collocated = 0
    out_is_annotated = issubclass(FH_out.__class__, AnnotVCFIO)
    # Header
    FH_out.copyHeader(FH_in)
    if trace_unstandard:
        unstd_description = "The variant id (chromosome:position=reference/alternative) before standardization."
        FH_out.info["UNSTD"] = HeaderInfoAttr(
            "UNSTD", type="String", number="1", description=unstd_description
        )
    FH_out.writeHeader()
    # Records
    for record in FH_in:
        std_alleles = []
        for allele_idx, _ in enumerate(record.alt):
            allele = getAlleleRecord(FH_in, record, allele_idx)
            if trace_unstandard:
                allele.info["UNSTD"] = allele.getName()
            # Alternative allele before standardization
            previous_alt = allele.alt[0]
            # Standardize pos, ref and alt
            allele.fastStandardize(FH_ref, 1000)
            # Update annotations
            if out_is_annotated and FH_in.annot_field in allele.info:
                kept_annot = []
                for annot in allele.info[FH_in.annot_field]:
                    if previous_alt == annot["Allele"]:
                        nb_exact += 1
                        annot["Allele"] = allele.alt[0]
                        kept_annot.append(annot)
                    else:
                        nb_collocated += 1
                allele.info[FH_in.annot_field] = kept_annot
            std_alleles.append(allele)
        if len(std_alleles) == 1:
            FH_out.write(std_alleles[0])
        else:
            # Split alleles are written in the order of their position on the reference
            ordered_alleles = sorted(
                std_alleles,
                key=lambda elt: (elt.refStart(), elt.refEnd())
            )
            for allele in ordered_alleles:
                FH_out.write(allele)
    if log is not None and nb_collocated != 0:
        log.warning(
            "{}/{} annotations have been deleted because they concern collocated variant."
            .format(nb_collocated, nb_exact + nb_collocated)
        )
# Load the alleles of every input VCF in `variants` (by allele name) and keep
# in `aln_by_samples` the alignment file associated with each sample.
for vcf_idx, current_vcf in enumerate(args.input_variants):
    # The alignment file at the same index as the VCF is used, unless completion is deactivated
    current_aln = None
    if not args.deactivate_completion:
        current_aln = args.input_aln[vcf_idx]
    with VCFIO(current_vcf) as FH_vcf:
        # Manage samples
        for curr_spl in FH_vcf.samples:  # For each sample in VCF
            aln_by_samples[curr_spl] = current_aln
        # Manage records
        for record in FH_vcf:  # For each variant
            # Without selected region every record is processed
            if args.selected_region is None or record.chrom == args.selected_region:
                for curr_spl in FH_vcf.samples:  # For each sample in VCF
                    vcaller_AF = record.getAltAF(curr_spl)
                    vcaller_DP = record.getDP(curr_spl)
                    for alt_idx, curr_alt in enumerate(record.alt):  # For each alternative allele in variant
                        record_allele = getAlleleRecord(FH_vcf, record, alt_idx)
                        # Get allele frequency from the variant caller
                        vcaller_curr_AF = vcaller_AF[alt_idx]
                        if len(vcaller_AF) == len(record.alt) + 1:  # The AF contains reference AF
                            vcaller_curr_AF = vcaller_AF[alt_idx + 1]
                        # AF is rounded; AD is recomputed from AF and DP and truncated by int()
                        record_allele.samples[curr_spl]["AF"] = [round(vcaller_curr_AF, args.AF_precision)]
                        record_allele.samples[curr_spl]["AD"] = [int(vcaller_curr_AF * vcaller_DP)]
                        record_allele.samples[curr_spl]["DP"] = vcaller_DP
                        # Store allele: the first occurrence keeps the whole record,
                        # the next ones only add the data of the current sample
                        allele_id = record_allele.getName()
                        if allele_id not in variants:
                            variants[allele_id] = record_allele
                        else:
                            variants[allele_id].samples[curr_spl] = record_allele.samples[curr_spl]
# Completes and writes variants