def changeAndWrite(in_vcf, out_vcf, change_by_rec):
    """
    Write the variants of in_vcf after replacing the records to uniformise.

    A record whose name is not a key of change_by_rec is kept as is. Any other
    record is replaced by one copy per retained name listed for it: ref, first
    alt and pos of the copy are parsed from the retained name
    ("chrom:pos=ref/alt"). When several records compete for the same name, the
    first one is kept unless a later one has a strictly greater first value of
    getPopAltAD(). Records are written sorted by chrom, pos, refEnd() and
    first alt.

    :param in_vcf: Path to the variants file before haplotyping (format: VCF).
    :type in_vcf: str
    :param out_vcf: Path to the output variants file.
    :type out_vcf: str
    :param change_by_rec: Uniformised variants(s) by previous variant (example: {"chr1:1235448=A/G": {"chr1:1235448=ATAG/GTAC"}, "chr1:1235451=G/C": {"chr1:1235448=ATAG/GTAC"}}).
    :type change_by_rec: dict
    """
    haplotype_tags = ("PGT", "PID", "PS")
    with VCFIO(out_vcf, "w") as handle_out, VCFIO(in_vcf) as handle_in:
        # Manage header
        handle_out.copyHeader(handle_in)
        handle_out.writeHeader()
        # Split/Merge variants
        kept_by_name = {}
        for record in handle_in:
            name = record.getName()
            if name not in change_by_rec:
                # Untouched variant: keep the best supported record by name
                if name not in kept_by_name or record.getPopAltAD()[0] > kept_by_name[name].getPopAltAD()[0]:
                    kept_by_name[name] = record
                continue
            # Must be uniformised
            if all(tag in record.format for tag in haplotype_tags):
                # Remove mutect haplotype information
                for spl_info in record.samples.values():
                    for tag in haplotype_tags:
                        del spl_info[tag]
                record.format = [
                    elt for elt in record.format if elt not in haplotype_tags
                ]
            # Change variant
            for target_name in change_by_rec[name]:
                if target_name not in kept_by_name or record.getPopAltAD()[0] > kept_by_name[target_name].getPopAltAD()[0]:
                    replacement = deepcopy(record)
                    kept_by_name[target_name] = replacement
                    name_fields = target_name.split("=")
                    replacement.ref, replacement.alt[0] = name_fields[1].split("/")
                    replacement.pos = int(name_fields[0].split(":")[1])
        # Write records
        ordered_records = sorted(
            kept_by_name.values(),
            key=lambda x: (x.chrom, x.pos, x.refEnd(), x.alt[0])
        )
        for record in ordered_records:
            handle_out.write(record)
def getVariantsByName(callers, haplotyped_variants_files):
    """
    Index, by variant name then by caller, the merge status of each record.

    For every record of every file, "merged" tells whether the INFO field
    contains MCO_VAR and "sub" is the set built from MCO_VAR (empty set when
    the tag is absent).

    :param callers: List of variants callers corresponding to haplotyped_variants_files.
    :type callers: list
    :param haplotyped_variants_files: Paths to the variants files after merging by haplotype (format: VCF).
    :type haplotyped_variants_files: list
    :return: Variants detection and haplotype by variant name (example: {"chr1:1235448=ATAG/GTAC": {"mutect2": {"merged": True, "sub": {"chr1:1235448=A/G", "chr1:1235451=G/C"}}, "freebayes": {"merged": False, "sub": {"chr1:1235448=ATAG/GTAC"}}}}).
    :rtype: dict
    """
    variants_by_name = {}
    for caller, vcf_path in zip(callers, haplotyped_variants_files):
        with VCFIO(vcf_path) as reader:
            for record in reader:
                by_caller = variants_by_name.setdefault(record.getName(), {})
                is_merged = "MCO_VAR" in record.info
                sub_variants = set(record.info["MCO_VAR"]) if is_merged else set()
                by_caller[caller] = {
                    "merged": is_merged,
                    "sub": sub_variants
                }
    return variants_by_name
def normAndMove(genome_path, in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized and most upstream version of each variant.

    Each input record is split into one record per alternative allele
    (getAlleleRecord) and the record returned by getMostUpstream() on the
    sequence of its chromosome is written. With trace_unstandard, the
    identifier of the allele before standardization is stored in INFO/UNSTD.

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    genome_by_chr = getSeqByChr(genome_path)
    with VCFIO(out_variant_file, "w") as FH_out, VCFIO(in_variant_file) as FH_in:
        # Header
        FH_out.copyHeader(FH_in)
        if trace_unstandard:
            FH_out.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description="The variant id (chromosome:position=reference/alternative) before standardization."
            )
        FH_out.writeHeader()
        # Records
        for record in FH_in:
            chrom_seq = genome_by_chr[record.chrom]
            for alt_idx in range(len(record.alt)):
                allele = getAlleleRecord(FH_in, record, alt_idx)
                if trace_unstandard:
                    joined_alt = "/".join(allele.alt)
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref, joined_alt
                    )
                upstream_allele = allele.getMostUpstream(chrom_seq)
                FH_out.write(upstream_allele)
def testTag(self):
    """Run the command and compare, in order, "<id>:<first filter>" of each output record with the expected tags."""
    # Execute command
    subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)
    # Validate results
    # One expected entry per alternative allele of each input variant
    expected = [
        variant.id + ":" + variant.info["expected"]
        for variant in self.variants
        for _ in variant.alt
    ]
    with VCFIO(self.tmp_output) as FH_results:
        observed = [
            result.id + ":" + result.filter[0] for result in FH_results
        ]
    self.assertEqual(expected, observed)
def testResults(self):
    """Run the command and check that the output holds exactly the variants annotated with target_2."""
    # Execute command
    subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)
    # Validate results
    expected = []
    for variant in self.variants:
        if variant.info["target"] == "target_2":
            expected.append(variant.id)
    with VCFIO(self.tmp_output) as FH_results:
        observed = [result.id for result in FH_results]
    # Order is not significant: compare sorted identifiers
    self.assertEqual(sorted(expected), sorted(observed))
def normOnly(in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized version of each variant.

    Each input record is split into one record per alternative allele
    (getAlleleRecord), then normalizeSingleAllele() is applied before writing.
    With trace_unstandard, the identifier of the allele before normalization
    is stored in INFO/UNSTD.

    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    """
    with VCFIO(out_variant_file, "w") as FH_out, VCFIO(in_variant_file) as FH_in:
        # Header
        FH_out.copyHeader(FH_in)
        if trace_unstandard:
            FH_out.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description="The variant id (chromosome:position=reference/alternative) before standardization."
            )
        FH_out.writeHeader()
        # Records
        for record in FH_in:
            for alt_idx in range(len(record.alt)):
                allele = getAlleleRecord(FH_in, record, alt_idx)
                if trace_unstandard:
                    joined_alt = "/".join(allele.alt)
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref, joined_alt
                    )
                allele.normalizeSingleAllele()
                FH_out.write(allele)
def testAnnotVCFIO(self):
    """Run the command with --annotations-field ANN and check the standardized names, then the annotations kept per allele."""

    def allele_key(variant, alt):
        # Same layout as "<record id> <UNSTD>" built from the output records
        return "{} {}:{}={}/{}".format(variant.id, variant.chrom, variant.pos, variant.ref, alt)

    # Execute command
    subprocess.check_call(self.cmd + ["--annotations-field", "ANN"], stderr=subprocess.DEVNULL)

    # Validate results
    expected = {
        allele_key(variant, alt): variant.info["expected"][alt_idx]
        for variant in self.variants
        for alt_idx, alt in enumerate(variant.alt)
    }
    with VCFIO(self.tmp_output) as FH_results:
        observed = {
            result.id + " " + result.info["UNSTD"]: result.getName()
            for result in FH_results
        }
    self.assertEqual(expected, observed)

    # Validate annotations
    expected = {}
    for variant in self.variants:
        for alt_idx, alt in enumerate(variant.alt):
            # Third annotation field is the index of the annotated allele
            allele_annots = [
                ann for ann in variant.info["expectedANN"]
                if ann.split("|")[2] == str(alt_idx)
            ]
            expected[allele_key(variant, alt)] = sorted(allele_annots)
    observed = {}
    with VCFIO(self.tmp_output) as FH_results:
        for result in FH_results:
            result_key = result.id + " " + result.info["UNSTD"]
            if "ANN" in result.info:
                observed[result_key] = sorted(result.info["ANN"])
            else:
                observed[result_key] = []
    self.assertEqual(expected, observed)
def testRemove(self):
    """Run the command with --mode remove and check that only the variants expected as PASS are written, in order."""
    # Execute command
    subprocess.check_call(self.cmd + ["--mode", "remove"], stderr=subprocess.DEVNULL)
    # Validate results
    # One expected entry per alternative allele of each PASS variant
    expected = [
        variant.id
        for variant in self.variants
        for _ in variant.alt
        if variant.info["expected"] == "PASS"
    ]
    with VCFIO(self.tmp_output) as FH_results:
        observed = [result.id for result in FH_results]
    self.assertEqual(expected, observed)
def testVCFIO(self):
    """Run the command and check, per traced input allele, the name of the standardized record."""
    # Execute command
    subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)
    # Validate results
    expected = {}
    for variant in self.variants:
        for alt_idx, alt in enumerate(variant.alt):
            # Same layout as "<record id> <UNSTD>" built from the output records
            allele_key = "{} {}:{}={}/{}".format(variant.id, variant.chrom, variant.pos, variant.ref, alt)
            expected[allele_key] = variant.info["expected"][alt_idx]
    with VCFIO(self.tmp_output) as FH_results:
        observed = {
            result.id + " " + result.info["UNSTD"]: result.getName()
            for result in FH_results
        }
    self.assertEqual(expected, observed)
def testResults(self):
    """Write the regions BED, run the command and check that the output holds exactly the variants flagged ZOI == "yes"."""
    # Create BED
    amplicons_fields = (
        ("artificial_chr1", 5, 25, "ampl1", None, "+", 11, 20),
        ("artificial_chr2", 1, 11, "ampl2", None, "+", 3, 9),
    )
    with BEDIO(self.tmp_regions, "w", 8) as FH_reg:
        for fields in amplicons_fields:
            FH_reg.write(BEDRecord(*fields))

    # Execute command
    subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

    # Validate results
    expected = []
    for variant in self.variants:
        if variant.info["ZOI"] == "yes":
            expected.append(variant.id)
    with VCFIO(self.tmp_output) as FH_results:
        observed = [result.id for result in FH_results]
    # Order is not significant: compare sorted identifiers
    self.assertEqual(sorted(expected), sorted(observed))
def loadBNDByID(in_vcf):
    """
    Return breakend by ID from a VCF file.

    Only records with INFO/SVTYPE equal to "BND" are kept. The file is rejected
    when the header declares SR or PR with the number attribute ".".

    :param in_vcf: Path to the VCF containing BND coming from one fusion caller (format: VCF).
    :type in_vcf: str
    :return: Breakend by ID.
    :rtype: dict
    :raises Exception: If SR or PR is declared in header with number ".".
    """
    # Checked in this order: the first offending tag raises
    rejected_declarations = (
        ("SR", 'The number attribute for SR must be "A" or "R" or "1".'),
        ("PR", 'The number attribute for PR must be "A" or "R" or "1".'),
    )
    with VCFIO(in_vcf) as reader:
        for tag, message in rejected_declarations:
            if tag in reader.info and reader.info[tag].number == ".":
                raise Exception(message)
        bnd_by_id = {
            record.id: record
            for record in reader
            if record.info["SVTYPE"] == "BND"
        }
    return bnd_by_id
def variantsRegionFromVCF(vcf_path, min_count=1, symbol="GENE", hgvsc="CDS", hgvsp="AA", count="CNT"):
    """
    Return the region object corresponding to the known variants in a VCF.

    A record is skipped when its symbol contains "_ENST" or when its count is
    present and lower than min_count. Each kept record becomes a Region from
    record.pos to record.pos + len(record.ref), named by the record id and
    annotated with id, gene, HGVSp, HGVSc and count (empty string or None when
    the tag is missing).

    :param vcf_path: Path to the variants file (format: VCF).
    :type vcf_path: str
    :param min_count: Minimum number of samples where the variant is known in the databases to use its information.
    :type min_count: int
    :param symbol: Tag used in VCF.info to store the symbol of the gene.
    :type symbol: str
    :param hgvsc: Tag used in VCF.info to store the HGVSc.
    :type hgvsc: str
    :param hgvsp: Tag used in VCF.info to store the HGVSp.
    :type hgvsp: str
    :param count: Tag used in VCF.info to store the number of database's samples with this variant.
    :type count: str
    :return: List of variants regions.
    :rtype: anacore.region.RegionList
    """
    selected_regions = []
    with VCFIO(vcf_path) as FH_in:
        for record in FH_in:
            info = record.info
            symbol_accepted = symbol not in info or "_ENST" not in info[symbol]
            if not symbol_accepted:
                continue
            count_accepted = count not in info or int(info[count]) >= min_count
            if not count_accepted:
                continue
            annotations = {
                "id": record.id,
                "gene": (info[symbol] if symbol in info else ""),
                "HGVSp": (info[hgvsp] if hgvsp in info else ""),
                "HGVSc": (info[hgvsc] if hgvsc in info else ""),
                "count": (int(info[count]) if count in info else None)
            }
            selected_regions.append(
                Region(
                    record.pos,
                    record.pos + len(record.ref),
                    None,
                    record.chrom,
                    record.id,
                    annotations
                )
            )
    return RegionList(selected_regions)
# Logger logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s') log = logging.getLogger() log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) log.info("Version: " + str(__version__)) # Get identified variants from VCF variants = dict() aln_by_samples = dict() for vcf_idx, current_vcf in enumerate(args.input_variants): current_aln = None if not args.deactivate_completion: current_aln = args.input_aln[vcf_idx] with VCFIO(current_vcf) as FH_vcf: # Manage samples for curr_spl in FH_vcf.samples: # For each sample in VCF aln_by_samples[curr_spl] = current_aln # Manage records for record in FH_vcf: # For each variant if args.selected_region is None or record.chrom == args.selected_region: for curr_spl in FH_vcf.samples: # For each sample in VCF vcaller_AF = record.getAltAF(curr_spl) vcaller_DP = record.getDP(curr_spl) for alt_idx, curr_alt in enumerate(record.alt): # For each alternative allele in in variant record_allele = getAlleleRecord(FH_vcf, record, alt_idx) # Get allele frequency from the variant caller vcaller_curr_AF = vcaller_AF[alt_idx] if len(vcaller_AF) == len(record.alt) + 1: # The AF cointains reference AF vcaller_curr_AF = vcaller_AF[alt_idx + 1]
def addVCFVariants(variants, vcf_path, vcf_idx, spl_name=None):
    """
    Add variant from VCF in dict.

    Each alternative allele is normalized (normalizeSingleAllele) and its
    frequency is stored at index vcf_idx of the "freq" list of the variant.
    Every "freq" list shorter than vcf_idx + 1, including those of variants
    absent from this VCF, is padded with 0.

    :param variants: By uniq ID the variants. The content of this variable is set by the call of this function. Content example:
        {
            "chr1:10=A/T": {"chrom": "chr1", "pos": 10, "ref": "A", "alt": "T", "freq": [0.2, 0.5]},
            "chr1:10=A/G": {"chrom": "chr1", "pos": 10, "ref": "A", "alt": "G", "freq": [0.01, 0]},
            "chr3:20=G/T": {"chrom": "chr3", "pos": 20, "ref": "G", "alt": "T", "freq": [0, 0.4]}
        }
        The list of frequencies is appended by each call of the function with a different vcf_idx.
    :type variants: dict
    :param vcf_path: Path to the VCF file to add.
    :type vcf_path: str
    :param vcf_idx: Index used to store the frequency of each variant of the VCF in frequencies list (start from 0).
    :type vcf_idx: int
    :param spl_name: The frequency of the variants came from this sample. When omitted, the first sample of the VCF is used.
    :type spl_name: str
    """

    def pad_frequencies(frequencies):
        # Fill with 0 up to and including the slot of the current VCF
        frequencies.extend([0] * (vcf_idx + 1 - len(frequencies)))

    with VCFIO(vcf_path) as FH_vcf:
        if spl_name is None:
            spl_name = FH_vcf.samples[0]
        for record in FH_vcf:
            alleles_AF = record.getAltAF(spl_name)
            # For each alternative allele
            for alt_idx in range(len(record.alt)):
                allele = getAlleleRecord(FH_vcf, record, alt_idx)
                allele.normalizeSingleAllele()
                allele_name = allele.getName()
                if allele_name not in variants:
                    variants[allele_name] = {
                        "chrom": allele.chrom,
                        "pos": allele.pos,
                        "ref": allele.ref,
                        "alt": allele.alt[0],
                        "freq": list()
                    }
                frequencies = variants[allele_name]["freq"]
                # Complete variants missing in previous VCF
                pad_frequencies(frequencies)
                # Add allele frequency
                frequencies[vcf_idx] = alleles_AF[alt_idx]
    # Complete variants missing in current VCF
    for variant in variants.values():
        pad_frequencies(variant["freq"])
def testTagMultipleValues(self):
    """Rewrite the input VCF with strand counts stored as lists, run the command and compare the filter tags in order."""
    # Write test data
    header_attributes = {
        "expected": HeaderInfoAttr("expected", "Expected filter tag.", type="String", number="1"),
        "SAR": HeaderInfoAttr(
            "SAR",
            "Number of reads supporting the alternative allele in reverse strand.",
            type="Integer",
            number="A"),
        "SAF": HeaderInfoAttr(
            "SAF",
            "Number of reads supporting the alternative allele in forward strand.",
            type="Integer",
            number="A"),
        "SRR": HeaderInfoAttr(
            "SRR",
            "Number of reads supporting the reference allele in reverse strand.",
            type="Integer",
            number="A"),
        "SRF": HeaderInfoAttr(
            "SRF",
            "Number of reads supporting the reference allele in forward strand.",
            type="Integer",
            number="A"),
    }
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = header_attributes
        FH_var.writeHeader()
        self.variants = [
            # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
            VCFRecord(
                "artificial_chr1", 10, "sub_01", "G", ["T"], None, None,
                {"SAR": [5], "SAF": [5], "SRR": [5], "SRF": [5], "expected": "PASS"}
            ),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
            VCFRecord(
                "artificial_chr1", 40, "sub_04", "G", ["T"], None, None,
                {"SAR": [9], "SAF": [1], "SRR": [95], "SRF": [95], "expected": "strandRatioBias"}
            )
        ]
        for variant in self.variants:
            FH_var.write(variant)

    # Execute command
    subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

    # Validate results
    # One expected entry per alternative allele of each input variant
    expected = [
        variant.id + ":" + variant.info["expected"]
        for variant in self.variants
        for _ in variant.alt
    ]
    with VCFIO(self.tmp_output) as FH_results:
        observed = [
            result.id + ":" + result.filter[0] for result in FH_results
        ]
    self.assertEqual(expected, observed)
def setUp(self):
    """
    Create the fixture of the standardizeVCF.py tests.

    Defines the temporary paths and the command line, then writes the
    reference (fasta and faidx) and the input VCF. Each variant stores in INFO
    the expected standardized name of each alternative allele ("expected"),
    the input annotations ("ANN") and the annotations expected after
    standardization ("expectedANN").
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())  # Shared prefix of all the temporary files of this test

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "standardizeVCF.py",
        "--trace-unstandard",
        "--input-reference", self.tmp_sequences,
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Create fasta
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        # artificial_chr1 is 45 nt long and artificial_chr2 is 11 nt long.
        # The variants below use 1-based positions on these sequences.
        FH_seq.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
        FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))

    # Create faidx
    with open(self.tmp_faidx, "w") as FH_fai:
        # NOTE(review): a .fai index normally holds one tab-separated line per
        # sequence (name, length, offset, line bases, line width) -- confirm
        # that the separators of this literal are the intended ones.
        FH_fai.write("""artificial_chr1 45 17 45 46 artificial_chr2 11 80 11 12""")

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
            "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
            "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
        }
        FH_var.writeHeader()
        # In "ANN", the annotations with an empty allele index match no
        # alternative allele and are absent from "expectedANN".
        self.variants = [
            # Substit single nt
            VCFRecord("artificial_chr1", 14, "sub_01", "G", ["T"], None, None, {
                "expected": ["artificial_chr1:14=G/T"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 19, "sub_02", "T", ["A", "C"], None, None, {
                "expected": ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                "expectedANN": ["A|ann_1|0", "A|ann_2|0"]
            }),
            # Substit multi nt
            VCFRecord("artificial_chr1", 7, "sub_03", "CATGTATG", ["GTACCCGC"], None, None, {
                "expected": ["artificial_chr1:7=CATGTATG/GTACCCGC"],
                "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
                "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"], None, None, {
                "expected": ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
                "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
                "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]
            }),
            # Insertion single nt
            VCFRecord("artificial_chr1", 14, "ins_01", "G", ["GA"], None, None, {
                "expected": ["artificial_chr1:14=G/GA"],
                "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
                "expectedANN": ["GA|ann_1|0", "GA|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_02", "-", ["A"], None, None, {
                "expected": ["artificial_chr1:19=T/TA"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                "expectedANN": ["TA|ann_1|0", "TA|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 14, "ins_03", "G", ["GA", "GC"], None, None, {
                "expected": ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
                "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
                "expectedANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_04", "-", ["A", "C"], None, None, {
                "expected": ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
                "expectedANN": ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]
            }),
            # Insertion multi nt
            VCFRecord("artificial_chr1", 14, "ins_05", "G", ["GATGC"], None, None, {
                "expected": ["artificial_chr1:14=G/GATGC"],
                "ANN": ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["GATGC|ann_1|0", "GATGC|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_06", "-", ["AAATC"], None, None, {
                "expected": ["artificial_chr1:19=T/TAAATC"],
                "ANN": ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]
            }),
            # Movable insertion multi nt
            VCFRecord("artificial_chr1", 14, "ins_07", "G", ["GTG"], None, None, {
                "expected": ["artificial_chr1:12=A/ATG"],
                "ANN": ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["ATG|ann_1|0", "ATG|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 27, "ins_08", "A", ["AAAA"], None, None, {
                "expected": ["artificial_chr1:25=C/CAAA"],
                "ANN": ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
                "expectedANN": ["CAAA|ann_1|0", "CAAA|ann_2|0"]
            }),
            # Deletion single nt
            VCFRecord("artificial_chr1", 14, "del_01", "G", [""], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 14, "del_02", "G", ["-"], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 13, "del_03", "TG", ["T"], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 13, "del_04", "TG", ["T", "-"], None, None, {
                "expected": ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]
            }),
            # Movable deletion multi nt
            VCFRecord("artificial_chr1", 11, "del_05", "TATG", ["T", "TA", "-"], None, None, {
                "expected": ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]
            }),
        ]
        for idx, curr_var in enumerate(self.variants):
            FH_var.write(curr_var)
'-i', '--input-variants', required=True, help='The path to the variants file (format: VCF).') group_output = parser.add_argument_group('Outputs') # Outputs group_output.add_argument( '-o', '--output-variants', required=True, help= 'The path to the outputted file containing the constitutive variants (format: TSV).' ) args = parser.parse_args() # Process with VCFIO(args.input_variants) as FH_in: with open(args.output_variants, "w") as FH_out: # Header FH_out.write("## PARAMETERS: {}\n".format(" ".join(sys.argv))) FH_out.write("## VERSION: {}\n".format(__version__)) FH_out.write("\t".join([ "#Chromosome", "Position", "Reference_allele", "Alternative_allele", "Noise_rate", "Nb_input_spl", "Nb_usable_spl", "Nb_support_spl", "Nb_constit_spl", "Constit_spl", "Constit_AF" ]) + "\n") # Records for record in FH_in: for idx in range(len(record.alt)): curr_allele = getAlleleRecord(FH_in, record, idx) nb_spl = len(FH_in.samples)
def setUp(self):
    """
    Create the fixture of the filterVCFBySOR.py tests.

    Defines the temporary paths and the command line, then writes the input
    VCF. Each variant stores in INFO the strand counts of the alternative
    allele (SAR, SAF) and of the reference allele (SRR, SRF) and the filter
    tag expected in output ("expected").
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())  # Shared prefix of all the temporary files of this test

    # Temporary files
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFBySOR.py",
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "expected": HeaderInfoAttr("expected", "Expected filter tag.", type="String", number="1"),
            "SAR": HeaderInfoAttr(
                "SAR",
                "Number of reads supporting the alternative allele in reverse strand.",
                type="Integer",
                number="1"),
            "SAF": HeaderInfoAttr(
                "SAF",
                "Number of reads supporting the alternative allele in forward strand.",
                type="Integer",
                number="1"),
            "SRR": HeaderInfoAttr(
                "SRR",
                "Number of reads supporting the reference allele in reverse strand.",
                type="Integer",
                number="1"),
            "SRF": HeaderInfoAttr(
                "SRF",
                "Number of reads supporting the reference allele in forward strand.",
                type="Integer",
                number="1"),
        }
        FH_var.writeHeader()
        # The comment above each record gives: alt fraction, ref fraction,
        # depth class, strand bias of alt, strand bias of ref.
        self.variants = [
            # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
            VCFRecord("artificial_chr1", 10, "sub_01", "G", ["T"], None, None, {
                "SAR": 5,
                "SAF": 5,
                "SRR": 5,
                "SRF": 5,
                "expected": "PASS"
            }),
            # 0.05 alt, 0.95 ref, good DP, alt no bias, ref no bias
            VCFRecord("artificial_chr1", 20, "sub_02", "G", ["T"], None, None, {
                "SAR": 5,
                "SAF": 5,
                "SRR": 95,
                "SRF": 95,
                "expected": "PASS"
            }),
            # 0.05 alt, 0.95 ref, good DP, alt no bias, ref strand bias
            VCFRecord("artificial_chr1", 30, "sub_03", "G", ["T"], None, None, {
                "SAR": 5,
                "SAF": 5,
                "SRR": 150,
                "SRF": 30,
                "expected": "PASS"
            }),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
            VCFRecord(
                "artificial_chr1", 40, "sub_04", "G", ["T"], None, None, {
                    "SAR": 9,
                    "SAF": 1,
                    "SRR": 95,
                    "SRF": 95,
                    "expected": "strandRatioBias"
                }),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref strand bias => no bias
            VCFRecord("artificial_chr1", 50, "sub_05", "G", ["T"], None, None, {
                "SAR": 9,
                "SAF": 1,
                "SRR": 150,
                "SRF": 30,
                "expected": "PASS"
            }),
            # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
            VCFRecord(
                "artificial_chr1", 60, "sub_06", "G", ["T"], None, None, {
                    "SAR": 9,
                    "SAF": 1,
                    "SRR": 5,
                    "SRF": 5,
                    "expected": "strandRatioBias"
                }),
            # 0.29 alt, 0.71 ref, good DP, alt no bias, ref no bias
            VCFRecord(
                "artificial_chr1", 70, "sub_07", "G", ["T"], None, None, {
                    "SAR": 400,
                    "SAF": 600,
                    "SRR": 1400,
                    "SRF": 1000,
                    "expected": "PASS"
                }),
            # 0.71 alt, 0.29 ref, good DP, alt no bias, ref no bias
            VCFRecord(
                "artificial_chr1", 80, "sub_08", "G", ["T"], None, None, {
                    "SAR": 1400,
                    "SAF": 1000,
                    "SRR": 400,
                    "SRF": 600,
                    "expected": "PASS"
                }),
            # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 0 DP
            VCFRecord(
                "artificial_chr1", 90, "sub_09", "G", ["T"], None, None, {
                    "SAR": 1400,
                    "SAF": 1000,
                    "SRR": 0,
                    "SRF": 0,
                    "expected": "PASS"
                }),
            # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 2 DP
            VCFRecord(
                "artificial_chr1", 100, "sub_10", "G", ["T"], None, None, {
                    "SAR": 1400,
                    "SAF": 1000,
                    "SRR": 0,
                    "SRF": 2,
                    "expected": "PASS"
                }),
            # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 0 DP
            VCFRecord("artificial_chr1", 110, "sub_11", "G", ["T"], None, None, {
                "SAR": 90,
                "SAF": 30,
                "SRR": 0,
                "SRF": 0,
                "expected": "PASS"
            }),
            # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 2 DP
            VCFRecord("artificial_chr1", 120, "sub_12", "G", ["T"], None, None, {
                "SAR": 90,
                "SAF": 30,
                "SRR": 0,
                "SRF": 2,
                "expected": "PASS"
            }),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 0 DP
            VCFRecord(
                "artificial_chr1", 130, "sub_13", "G", ["T"], None, None, {
                    "SAR": 90,
                    "SAF": 10,
                    "SRR": 0,
                    "SRF": 0,
                    "expected": "strandRatioBias"
                }),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 2 DP
            VCFRecord(
                "artificial_chr1", 140, "sub_14", "G", ["T"], None, None, {
                    "SAR": 90,
                    "SAF": 10,
                    "SRR": 0,
                    "SRF": 2,
                    "expected": "strandRatioBias"
                }),
            # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 1 DP
            VCFRecord(
                "artificial_chr1", 150, "sub_15", "G", ["T"], None, None, {
                    "SAR": 90,
                    "SAF": 10,
                    "SRR": 1,
                    "SRF": 0,
                    "expected": "PASS"  # Debatable case: 2.89
                }),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
            VCFRecord(
                "artificial_chr1", 160, "sub_16", "G", ["T"], None, None, {
                    "SAR": 15,
                    "SAF": 2,
                    "SRR": 200,
                    "SRF": 200,
                    "expected": "strandRatioBias"
                }),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
            VCFRecord(
                "artificial_chr1", 170, "sub_17", "G", ["T"], None, None, {
                    "SAR": 13,  # 12 => PASS
                    "SAF": 2,
                    "SRR": 200,
                    "SRF": 200,
                    "expected": "strandRatioBias"
                }),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias => no bias
            VCFRecord("artificial_chr1", 180, "sub_18", "G", ["T"], None, None, {
                "SAR": 13,
                "SAF": 2,
                "SRR": 350,
                "SRF": 50,
                "expected": "PASS"
            }),
            # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias rev => bias
            VCFRecord(
                "artificial_chr1", 190, "sub_19", "G", ["T"], None, None, {
                    "SAR": 13,
                    "SAF": 2,
                    "SRR": 50,
                    "SRF": 350,
                    "expected": "strandRatioBias"
                }),
            # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
            VCFRecord(
                "artificial_chr1", 200, "sub_20", "G", ["T"], None, None, {
                    "SAR": 14,
                    "SAF": 2,
                    "SRR": 8,
                    "SRF": 8,
                    "expected": "strandRatioBias"
                }),
        ]
        for idx, curr_var in enumerate(self.variants):
            FH_var.write(curr_var)
) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Process cosmic_reader = None with AnnotVCFIO(args.output_variants, "w", annot_field=args.annotations_field) as FH_out: with AnnotVCFIO(args.input_variants, annot_field=args.annotations_field) as FH_in: # Header FH_out.copyHeader(FH_in) if args.input_cosmic: cosmic_reader = VCFIO(args.input_cosmic, "i") cosmic_version = getDatabankVersion(cosmic_reader) FH_out.extra_header.append( "##COSMIC={}".format(cosmic_version)) FH_out.writeHeader() # Records for record in FH_in: # To upper record.ref = record.ref.upper() record.alt = [alt.upper() for alt in record.alt] for annot in record.info[FH_in.annot_field]: annot["Allele"] = annot["Allele"].upper() # Change alternative representation for alt_idx, alt in enumerate(record.alt): alt_record = getAlleleRecord(FH_in, record, alt_idx) vep_alt = getVEPAlt(alt_record.ref, alt_record.alt)[0]
group_input = parser.add_argument_group('Inputs') # Inputs group_input.add_argument( '-i', '--input-variants', required=True, help='The path to the variants file (format: VCF).') group_output = parser.add_argument_group('Outputs') # Outputs group_output.add_argument( '-o', '--output-variants', required=True, help='The path to the outputted variants file (format: VCF).') args = parser.parse_args() # Process with VCFIO(args.output_variants, "w") as FH_out: with VCFIO(args.input_variants) as FH_in: # Header FH_out.copyHeader(FH_in) FH_out.writeHeader() # Records records_by_chr = dict() for record in FH_in: if record.chrom not in records_by_chr: records_by_chr[record.chrom] = list() records_by_chr[record.chrom].append(record) for chrom in sorted(records_by_chr): sorted_records = sorted(records_by_chr[chrom], key=lambda x: (x.chrom, x.pos, x.refEnd(), x.alt[0])) for record in sorted_records:
def setUp(self):
    """
    Create the temporary FASTA, FASTA index, BED and VCF used by the tests.

    Each record of ``self.variants`` stores in its INFO ``target`` field the
    ID of the overlapped target, or None when the inline comment places the
    variant outside the targets.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFTargets.py",
        "--mode", "remove",
        "--input-variants", self.tmp_variants,
        "--input-targets", self.tmp_regions,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        # Positions (1-based) and targets written in the BED below:
        #          1         2         3         4
        # 123456789012345678901234567890123456789012345
        # CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC
        # |----|        |------------|         |------|
        # "ATGT" is repeated on 8-19 and "AGT" on 28-33: InDels on these
        # repeats can be moved to the first or the last nt of target_2.
        FH_seq.write(
            Sequence(
                "artificial_chr1",
                "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"
            )
        )
        FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))

    # Create faidx
    # A faidx line is TAB-delimited (name, length, offset, line bases, line
    # width) with one line per sequence: the separators are written as
    # explicit escapes so they cannot be lost by a whitespace normalisation.
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "artificial_chr1\t45\t17\t45\t46\n"
            "artificial_chr2\t11\t80\t11\t12"
        )

    # Create targets
    with BEDIO(self.tmp_regions, "w", write_nb_col=4) as FH_bed:
        FH_bed.write(BEDRecord("artificial_chr1", 1, 6, "target_1"))
        FH_bed.write(BEDRecord("artificial_chr1", 15, 28, "target_2"))
        FH_bed.write(BEDRecord("artificial_chr1", 38, 45, "target_3"))

    def variant(pos, name, ref, alt, target):
        # Every fixture is a mono-allelic record on artificial_chr1 without
        # QUAL and FILTER; only INFO/target changes between records.
        return VCFRecord(
            "artificial_chr1", pos, name, ref, [alt], None, None,
            {"target": target}
        )

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "target": HeaderInfoAttr(
                "target",
                "The ID of the overlapped target.",
                type="String",
                number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            # Substit single nt
            variant(14, "alt_00", "G", "T", None),  # Before target ; first nt before target
            variant(15, "alt_01", "G", "T", "target_2"),  # On target ; first nt of target
            variant(21, "alt_02", "C", "G", "target_2"),  # On target
            variant(28, "alt_03", "A", "G", "target_2"),  # On target ; last nt
            variant(29, "alt_04", "G", "C", None),  # After target ; first nt after target
            # Substit multi nt
            variant(7, "alt_05", "CATGTATG", "GTACCCGC", None),  # Before target ; first nt before target
            variant(11, "alt_06", "TATGTATG", "GTACCCGC", "target_2"),  # Overlap target start
            variant(13, "alt_07", "TGTATGTGCTCACAAAGTA", "CCCGCCCCTACATTGCAGT", "target_2"),  # Include target
            variant(15, "alt_08", "TATGTGCTCACAAA", "CGCCCCTACATTGC", "target_2"),  # Exact target
            variant(21, "alt_09", "CTCACAA", "GTACCCG", "target_2"),  # Included by target
            variant(24, "alt_10", "ACAAAGTA", "GTACCCG", "target_2"),  # Overlap target end
            variant(29, "alt_11", "GTAGTAGAT", "GTACCCGA", None),  # After target ; first nt after target
            # Ins single nt
            variant(14, "alt_12", "G", "GA", None),  # Before target ; first nt before target
            variant(15, "alt_12.2", "-", "A", None),  # Before target ; first nt before target
            variant(15, "alt_13", "A", "TG", "target_2"),  # On target ; first nt of target
            variant(21, "alt_14", "C", "CG", "target_2"),  # On target
            variant(27, "alt_15", "A", "AT", "target_2"),  # On target ; last nt
            variant(28, "alt_15.2", "-", "T", "target_2"),  # On target ; last nt
            variant(28, "alt_16", "A", "AT", None),  # After target ; first nt after target
            # Movable ins single nt
            variant(14, "alt_17", "G", "GT", "target_2"),  # Movable to first nt of target
            variant(28, "alt_18", "A", "AA", "target_2"),  # Movable to last nt of target
            # Del single nt
            variant(14, "alt_19", "G", "", None),  # Before target ; first nt before target
            variant(15, "alt_20", "T", "", "target_2"),  # On target ; first nt of target
            variant(21, "alt_21", "C", "", "target_2"),  # On target
            variant(28, "alt_22", "A", "", "target_2"),  # On target ; last nt
            variant(29, "alt_23", "G", "", None),  # After target ; first nt after target
            # Del multi nt
            variant(11, "alt_24", "TATG", "T", None),  # Before target ; first nt before target
            variant(13, "alt_25", "TGTA", "T", "target_2"),  # On target ; first nt of target
            variant(20, "alt_26", "GCTC", "G", "target_2"),  # On target
            variant(27, "alt_27", "AAGT", "A", "target_2"),  # On target ; last nt
            variant(28, "alt_28", "AGT", "A", None),  # After target ; first nt after target
            # Movable del multi nt
            variant(7, "alt_29", "CATGT", "C", "target_2"),  # On repeat and movable to first nt of target
            variant(12, "alt_30", "ATG", "A", "target_2"),  # Movable to first nt of target
            variant(28, "alt_31", "AGTA", "A", "target_2"),  # Movable to last nt of target
            variant(30, "alt_32", "TAGT", "T", "target_2"),  # On repeat and movable to last nt of target
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
'chr19': '19', 'chr20': '20', 'chr21': '21', 'chr22': '22', 'chrX': 'X', 'chrY': 'Y', 'chrM': 'MT' } if args.input_names: with SVIO(args.input_names, "r", separator="\t", has_title=False) as reader: for record in reader: new_names[record[0]] = record[1] # Process with VCFIO(args.output_variants, "w") as writer: with VCFIO(args.input_variants, "r") as reader: # Header writer.copyHeader(reader) for idx, curr_header in enumerate(writer.extra_header): if curr_header.startswith("##contig"): content = uGetHeaderAttr(curr_header) old_id = content.id if content.id in new_names: new_id = new_names[old_id] writer.extra_header[idx] = curr_header.replace( "ID={},".format(old_id), "ID={},".format(new_id)) writer.writeHeader() # Variants for record in reader: if record.chrom in new_names:
group_input = parser.add_argument_group('Inputs') # Inputs group_input.add_argument('-i', '--input-variants', help='Path to the variants file (format: VCF).') group_output = parser.add_argument_group('Outputs') # Outputs group_input.add_argument('-o', '--output-variants', help='Path to the file outputted file (format: VCF).') args = parser.parse_args() # Logger logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s') log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Process nb_variants = 0 nb_filtered = 0 with VCFIO(args.input_variants) as handle_in: with VCFIO(args.output_variants, "w") as handle_out: # Header handle_out.copyHeader(handle_in) handle_out.info[args.SOR_tag] = HeaderInfoAttr(args.SOR_tag, "Strand bias estimated by the symmetric odds ratio test.", type="Float") handle_out.filter[args.bias_tag] = HeaderFilterAttr(args.bias_tag, "Strand ratio bias (estimated by the symmetric odds ratio test): substit SOR > {}, InDel SOR > {}.".format(args.substit_max_SOR, args.indel_max_SOR)) handle_out.writeHeader() # Records for record in handle_in: if len(record.alt) > 1: raise Exception("The multi-allelic variants cannot be processed: {}.".format(record.getName())) nb_variants += 1 is_filtered = False # Compute SOR record.info[args.SOR_tag] = strandOddRatio( record.info[args.ref_fwd_tag] if handle_in.info[args.ref_fwd_tag].number == "1" else record.info[args.ref_fwd_tag][0],
def setUp(self):
    """
    Create the temporary FASTA, FASTA index and VCF used by the tests.

    Each record of ``self.variants`` stores in its INFO ``is_filtered`` field
    the value described by the VCF header: 1 if the variant is adjacent to an
    homopolymer, 0 otherwise. The command is run with
    ``--homopolym-length 4``.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFHomopolym.py",
        "--mode", "remove",
        "--homopolym-length", "4",
        "--input-variants", self.tmp_variants,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta: one sequence by category of variant (see the VCF below)
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(
            Sequence(
                "artificial_chr1",
                "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
            )
        )
        FH_seq.write(
            Sequence(
                "artificial_chr2",
                "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"
            )
        )
        FH_seq.write(
            Sequence(
                "artificial_chr3",
                "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
            )
        )
        FH_seq.write(
            Sequence(
                "artificial_chr4",
                "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"
            )
        )
        FH_seq.write(
            Sequence(
                "artificial_chr5",
                "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
            )
        )
        FH_seq.write(
            Sequence(
                "artificial_chr6",
                "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"
            )
        )

    # Windows listed for each variant of the VCF below (4 nt on each side):
    #
    # Variant  before_start  before_end  before_seq  after_start  after_end  after_seq
    # alt_00   10            13          TCCA        15           18         CAAT
    # alt_01   20            23          AAAA        25           28         TTCC
    # alt_02   30            33          ACAG        35           38         AAAA
    # alt_03   40            43          AGTA        45           48         AAAG
    # alt_04   10            13          TCCA        16           19         AATA
    # alt_05   20            23          AAAA        26           29         TCCT
    # alt_06   30            33          ACAG        36           39         AAAA
    # alt_07   40            43          GTAG        46           49         AAAG
    # alt_08   11            14          CCAG        15           18         CAAT
    # alt_09   20            23          AAAA        24           27         TTCC
    # alt_10   31            34          AGGT        35           38         AAAA
    # alt_11   40            43          GTAG        44           47         AAAG
    # alt_12   11            14          CCAG        15           18         CAAT
    # alt_13   20            23          AAAA        24           27         GTTC
    # alt_14   31            34          CAGG        35           38         AAAA
    # alt_15   41            44          GTAG        45           48         AAAG
    # alt_16   50            53          GAAA        57           60         GTCA
    # alt_17   60            63          AAAA        67           70         TATT
    # alt_18   70            73          TCTC        77           80         AAAA
    # alt_19   80            83          ACAG        87           90         AAAG
    # alt_20   11            14          CCAG        16           19         AATA
    # alt_21   20            23          AAAA        25           28         TTCC
    # alt_22   31            34          CAGG        36           39         AAAA
    # alt_23   40            43          AGTA        45           48         AAAG
    # alt_24   11            14          CCAG        17           20         ATAA
    # alt_25   19            22          AAAA        26           29         TCCT
    # alt_26   29            32          TACA        35           38         AAAA
    # alt_27   38            41          AAAG        45           48         AAAG
    # alt_28   50            53          ACAA        61           64         CTTG
    # alt_29   66            69          AAAA        76           79         CACA
    # alt_30   76            79          CACA        86           89         AAAA
    # alt_31   88            91          AACA        99           102        AAAT

    # Create faidx
    # A faidx line is TAB-delimited (name, length, offset, line bases, line
    # width) with one line per sequence: the separators are written as
    # explicit escapes so they cannot be lost by a whitespace normalisation.
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "artificial_chr1\t89\t17\t89\t90\n"
            "artificial_chr2\t89\t124\t89\t90\n"
            "artificial_chr3\t88\t231\t88\t89\n"
            "artificial_chr4\t95\t337\t95\t96\n"
            "artificial_chr5\t89\t450\t89\t90\n"
            "artificial_chr6\t102\t557\t102\t103"
        )

    def variant(chrom, pos, name, ref, alt, is_filtered):
        # Every fixture is a mono-allelic record without QUAL and FILTER;
        # the expected status is stored in INFO/is_filtered.
        return VCFRecord(
            chrom, pos, name, ref, [alt], None, None,
            {"is_filtered": is_filtered}
        )

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "is_filtered": HeaderInfoAttr(
                "is_filtered",
                "1 if the variant is adjacent to an homopolymer.",
                type="Integer",
                number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            # Substit single nt
            variant("artificial_chr1", 14, "alt_00", "G", "T", 0),  # Without adjacent homopolymers
            variant("artificial_chr1", 24, "alt_01", "G", "T", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr1", 34, "alt_02", "G", "T", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr1", 44, "alt_03", "G", "T", 0),  # Adjacent too short homopolymers
            # Substit multi nt
            variant("artificial_chr2", 14, "alt_04", "GC", "TA", 0),  # Without adjacent homopolymers
            variant("artificial_chr2", 24, "alt_05", "GC", "TA", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr2", 34, "alt_06", "GC", "TA", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr2", 44, "alt_07", "GC", "TA", 0),  # Adjacent too short homopolymers
            # Ins single nt
            variant("artificial_chr3", 14, "alt_08", "G", "GT", 0),  # Without adjacent homopolymers
            variant("artificial_chr3", 23, "alt_09", "A", "AT", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr3", 34, "alt_10", "T", "TA", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr3", 43, "alt_11", "G", "GT", 0),  # Adjacent too short homopolymers
            # Ins multi nt
            variant("artificial_chr4", 14, "alt_12", "G", "GTA", 0),  # Without adjacent homopolymers
            variant("artificial_chr4", 23, "alt_13", "A", "ATA", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr4", 34, "alt_14", "G", "GTA", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr4", 44, "alt_15", "G", "GTC", 0),  # Adjacent too short homopolymer
            variant("artificial_chr4", 54, "alt_16", "CCT", "ATCCAGA", 0),  # Without adjacent homopolymers
            variant("artificial_chr4", 64, "alt_17", "GGA", "CTCCAGT", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr4", 74, "alt_18", "GAC", "ATCCAGT", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr4", 84, "alt_19", "CAG", "ATCCAGT", 0),  # Adjacent too short homopolymer
            # Del single nt
            variant("artificial_chr5", 14, "alt_20", "GT", "G", 0),  # Without adjacent homopolymers
            variant("artificial_chr5", 23, "alt_21", "AG", "A", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr5", 34, "alt_22", "GA", "G", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr5", 43, "alt_23", "AG", "A", 0),  # Adjacent too short homopolymers
            # Del multi nt
            variant("artificial_chr6", 14, "alt_24", "GCA", "G", 0),  # Without adjacent homopolymers
            variant("artificial_chr6", 23, "alt_25", "AGT", "C", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr6", 32, "alt_26", "AGG", "A", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr6", 42, "alt_27", "TAG", "C", 0),  # Adjacent too short homopolymer
            variant("artificial_chr6", 54, "alt_28", "CCTGTCT", "GAA", 0),  # Without adjacent homopolymers
            variant("artificial_chr6", 70, "alt_29", "TCTCGA", "CCC", 1),  # Adjacent homopolymers upstream
            variant("artificial_chr6", 80, "alt_30", "GCAGGT", "CCC", 1),  # Adjacent homopolymers downstream
            variant("artificial_chr6", 92, "alt_31", "ATGCAGT", "CCC", 0),  # Adjacent too short homopolymer
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
'%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s' ) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(args.logging_level) log.info("Command: " + " ".join(sys.argv)) # Get merged records fusions = getMergedRecords(args.inputs_variants, args.calling_sources, args.annotation_field, args.shared_filters) # Log differences in SR and PR logSupportVariance(fusions, log) # Write breakends = list(itertools.chain.from_iterable(fusions)) with VCFIO(args.output_variants, "w") as writer: # Header new_header = getNewHeaderAttr(args) writer.samples = new_header["samples"] writer.info = new_header["info"] writer.format = new_header["format"] writer.filter = new_header["filter"] writer.writeHeader() # Records for record in sorted( breakends, key=lambda record: (record.chrom, record.refStart(), record.refEnd())): if len(record.filter) == 0: record.filter = ["PASS"] writer.write(record)
format= '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s' ) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Get merged records variants = getMergedRecords(args.inputs_variants, args.calling_sources, args.annotations_field, args.shared_filters) # Log differences in AF and AD logACVariance(variants, log) # Write with VCFIO(args.output_variants, "w") as FH_out: # Header new_header = getNewHeaderAttr(args) FH_out.samples = new_header["samples"] FH_out.info = new_header["info"] FH_out.format = new_header["format"] FH_out.filter = new_header["filter"] FH_out.writeHeader() # Records for record in sorted( variants, key=lambda record: (record.chrom, record.refStart(), record.refEnd())): if record.filter is not None and len(record.filter) == 0: record.filter = ["PASS"] FH_out.write(record)
def getNewHeaderAttr(args):
    """
    Build the header elements of the merged VCF from the headers of the inputs.

    FILTER tags listed in args.shared_filters and the INFO tags MATEID,
    RNA_FIRST, SVTYPE and IMPRECISE keep their name. Every other FILTER, INFO
    and FORMAT tag is renamed "s<input index>_<tag>" and its source is set to
    the corresponding caller. A "s<input index>_VCQUAL" INFO is declared for
    each input.

    :param args: The script's parameters (inputs_variants, calling_sources and shared_filters are read).
    :type args: NameSpace
    :return: VCFHeader elements (filter, info, format, samples).
    :rtype: dict
    :raises Exception: If the samples are not the same in all the inputs.
    """
    unchanged_info = {"MATEID", "RNA_FIRST", "SVTYPE", "IMPRECISE"}
    # Code of each caller, displayed in the description of INFO/SRC
    source_codes = {
        name: "s" + str(idx)
        for idx, name in enumerate(args.calling_sources)
    }
    merged_filter = {}
    merged_info = {
        "CIPOS": HeaderInfoAttr(
            "CIPOS",
            type="Integer",
            number="2",
            description="Confidence interval around POS"
        ),
        "IDSRC": HeaderInfoAttr(
            "IDSRC",
            type="String",
            number=".",
            description="ID of breakend by source"
        ),
        "REFSRC": HeaderInfoAttr(
            "REFSRC",
            type="String",
            number="1",
            description="Selected support data (SR, PR) come from this source"
        ),
        "SRC": HeaderInfoAttr(
            "SRC",
            type="String",
            number=".",
            description="Fusions callers where the breakend is identified. Possible values: {}".format(source_codes)
        )
    }
    merged_format = {
        "SR": HeaderFormatAttr(
            "SR",
            type="Integer",
            number="1",
            description="Count of reads mapping on the fusion junction"
        ),
        "PR": HeaderFormatAttr(
            "PR",
            type="Integer",
            number="1",
            description="Count of pairs of reads supporting the fusion"
        ),
        "SRSRC": HeaderFormatAttr(
            "SRSRC",
            type="Integer",
            number=".",
            description="Count of reads mapping on the fusion junction by source"
        ),
        "PRSRC": HeaderFormatAttr(
            "PRSRC",
            type="Integer",
            number=".",
            description="Count of pairs of reads supporting the fusion by source"
        )
    }
    samples = None
    for src_idx, vcf_path in enumerate(args.inputs_variants):
        with VCFIO(vcf_path) as reader:
            # Samples: all the inputs must describe the same samples
            if samples is None:
                samples = reader.samples
            elif reader.samples != samples:
                raise Exception(
                    "The samples in VCF are not the same: {} in {} and {} in {}.".format(
                        samples,
                        args.inputs_variants[0],
                        reader.samples,
                        vcf_path
                    )
                )
            # FILTER
            for tag, attr in reader.filter.items():
                if tag in args.shared_filters:
                    # Shared filters do not depend on the caller: kept as is
                    merged_filter[tag] = attr
                    continue
                renamed = "s{}_{}".format(src_idx, tag)
                attr.id = renamed
                attr.source = args.calling_sources[src_idx]
                merged_filter[renamed] = attr
            # INFO
            for tag, attr in reader.info.items():
                if tag not in unchanged_info:
                    renamed = "s{}_{}".format(src_idx, tag)
                    attr.id = renamed
                    attr.source = args.calling_sources[src_idx]
                    merged_info[renamed] = attr
                elif tag not in merged_info or len(merged_info[tag].description) < len(attr.description):
                    # Manage merge between callers with 0 variants (and 0
                    # annotations) and callers with variants: the longest
                    # description is kept
                    merged_info[tag] = attr
            # Declaration of the backup of the caller's QUAL
            qual_tag = "s{}_VCQUAL".format(src_idx)
            merged_info[qual_tag] = HeaderInfoAttr(
                qual_tag,
                type="Float",
                number="1",
                description="The variant quality",
                source=args.calling_sources[src_idx]
            )
            # FORMAT
            for tag, attr in reader.format.items():
                renamed = "s{}_{}".format(src_idx, tag)
                attr.id = renamed
                attr.source = args.calling_sources[src_idx]
                merged_format[renamed] = attr
    return {
        "filter": merged_filter,
        "info": merged_info,
        "format": merged_format,
        "samples": samples
    }
def getMergedRecords(inputs_variants, calling_sources, annotations_field, shared_filters):
    """
    Merge VCFRecords coming from several variant callers.

    Records are grouped by name (record.getName()). The first caller
    reporting a variant provides the stored record; the following callers
    append their renamed FILTER, INFO and FORMAT data to it. AD and DP come
    from the first caller reporting the variant and the values of each caller
    are kept in ADSRC and DPSRC.

    :param inputs_variants: Paths to the variants files.
    :type inputs_variants: list
    :param calling_sources: Names of the variants callers (in same order as inputs_variants).
    :type calling_sources: list
    :param annotations_field: Field used to store annotations. This INFO field is not renamed.
    :type annotations_field: str
    :param shared_filters: Filters tags applying to the variant and independent of caller like filters on annotations. These filters are not renamed to add caller ID as prefix.
    :type shared_filters: set
    :return: Merged VCF records.
    :rtype: list
    """
    variant_by_name = {}
    for idx_in, curr_in in enumerate(inputs_variants):
        curr_caller = calling_sources[idx_in]
        with VCFIO(curr_in) as FH_in:
            log.info("Process {}".format(curr_caller))
            for record in FH_in:
                variant_name = record.getName()
                # Extract AD and DP before the renaming of the FORMAT fields
                support_by_spl = {
                    spl: {
                        "AD": record.getAltAD(spl)[0],
                        "DP": record.getDP(spl)
                    }
                    for spl in FH_in.samples
                }
                # Rename filters: PASS is dropped, shared filters are kept
                # as is and the others are prefixed by the caller index
                if record.filter is not None:
                    record.filter = [
                        tag if tag in shared_filters else "s{}_{}".format(idx_in, tag)
                        for tag in record.filter
                        if tag != "PASS"
                    ]
                # Rename INFO (except the annotations field)
                record.info = {
                    (key if key == annotations_field else "s{}_{}".format(idx_in, key)): val
                    for key, val in record.info.items()
                }
                # Backup quality
                if record.qual is not None:
                    record.info["s{}_VCQUAL".format(idx_in)] = record.qual
                # Rename FORMAT
                record.format = [
                    "s{}_{}".format(idx_in, curr_format)
                    for curr_format in record.format
                ]
                for spl_name, spl_info in record.samples.items():
                    # Existing keys are re-assigned: the size of the dict
                    # does not change during the iteration
                    record.samples[spl_name] = {
                        "s{}_{}".format(idx_in, key): val
                        for key, val in spl_info.items()
                    }
                # Add to storage
                if variant_name not in variant_by_name:
                    variant_by_name[variant_name] = record
                    # Data source
                    record.info["SRC"] = [curr_caller]
                    # Quality
                    if idx_in != 0:
                        # For consistency, the quality of the variant comes
                        # only from the first caller of the variant
                        record.qual = None
                    # AD and DP by sample (from the first caller finding the
                    # variant: callers are in user order)
                    record.format.insert(0, "ADSRC")
                    record.format.insert(0, "DPSRC")
                    record.format.insert(0, "AD")
                    record.format.insert(0, "DP")
                    for spl_name, spl_data in record.samples.items():
                        spl_data["AD"] = [support_by_spl[spl_name]["AD"]]
                        spl_data["DP"] = support_by_spl[spl_name]["DP"]
                        spl_data["ADSRC"] = [support_by_spl[spl_name]["AD"]]
                        spl_data["DPSRC"] = [support_by_spl[spl_name]["DP"]]
                else:
                    prev_variant = variant_by_name[variant_name]
                    prev_variant.info["SRC"].append(curr_caller)
                    # IDs: sorted union of the IDs of all the callers
                    if record.id is not None:
                        merged_ids = set(record.id.split(";"))
                        if prev_variant.id is not None:
                            # The first caller may have no ID
                            merged_ids.update(prev_variant.id.split(";"))
                        prev_variant.id = ";".join(sorted(merged_ids))
                    # FILTERS
                    if record.filter is not None:
                        if prev_variant.filter is None:
                            prev_variant.filter = record.filter
                        else:
                            # Set union (a boolean "or" would drop the
                            # filters of the current caller as soon as the
                            # previous callers have one); sorted to get a
                            # reproducible output
                            prev_variant.filter = sorted(
                                set(prev_variant.filter) | set(record.filter)
                            )
                    # FORMAT
                    prev_variant.format.extend(record.format)
                    # INFO
                    prev_variant.info.update(record.info)
                    for spl_name, spl_data in prev_variant.samples.items():
                        spl_data.update(record.samples[spl_name])
                        spl_data["ADSRC"].append(support_by_spl[spl_name]["AD"])
                        spl_data["DPSRC"].append(support_by_spl[spl_name]["DP"])
    return list(variant_by_name.values())
def setUp(self):
    """
    Build the fixtures: path templates, test cases and input VCF files.

    Four path templates ("{}" is the caller name) are stored on the instance
    for the initial, haplotyped, expected and out files. For every caller
    found in the "initial" part of the test cases, the initial, haplotyped
    and expected VCF are written in the temporary folder. The out files are
    not written here.

    NOTE(review): in the chr7, chr8 and chr9 cases the haplotyped record of
    caller2 at position 18 is G/C whereas the initial and expected records
    are A/G; kept as is, confirm this is intended.
    """
    tmp_dir = tempfile.gettempdir()
    run_id = str(uuid.uuid1())
    self.tmp_initial_pathes = os.path.join(tmp_dir, run_id + "_{}_initial.vcf")
    self.tmp_haplotyped_pathes = os.path.join(tmp_dir, run_id + "_{}_haplotyped.vcf")
    self.tmp_expected_pathes = os.path.join(tmp_dir, run_id + "_{}_expected.vcf")
    self.tmp_out_pathes = os.path.join(tmp_dir, run_id + "_{}_out.vcf")

    # test cases (the leading comment of each case lists the variants seen by caller1, caller2, caller3, caller4)
    self.test_cases = [
        {  # *a-b, a-b, a b, /
            "initial": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [
                    VCFRecord("chr1", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr1", 18, None, "A", ["G"], info={"AD": 104})
                ]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr1:14=G/C", "chr1:18=A/G"], "AD": 100})]
            },
            "expected": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104})]
            }
        },
        {  # *a b, a b, a-b, /
            "initial": {
                "caller1": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                "caller2": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ]
            }
        },
        {  # *a-b c, a-b c, a b c, /
            "initial": {
                "caller1": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr3", 14, None, "G", ["C"], info={"AD": 104}),
                    VCFRecord("chr3", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                ]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                "caller2": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                "caller3": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=G/C", "chr3:18=A/G", "chr3:20=A/G"], "AD": 98})]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                    VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                ]
            }
        },
        {  # *a-b c, a-b c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr4", 14, None, "G", ["C"], info={"AD": 98}),
                    VCFRecord("chr4", 18, None, "A", ["G"], info={"AD": 104}),
                    VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                "caller2": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                "caller3": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=G/C", "chr4:18=A/G", "chr4:20=A/G"], "AD": 98})],
                "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                    VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a-b c, a' a-b c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr5", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"], "AD": 100})
                ],
                "caller3": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=G/C", "chr5:18=A/G", "chr5:20=A/G"], "AD": 100})],
                "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a b c, a' a-b c, a-b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 105}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                ],
                "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=G/C", "chr6:18=A/G", "chr6:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 100})
                ],
                "caller3": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 101})],
                "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 105}),
                    VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                ],
                "caller4": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a b c, a-b b' c, a-b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=G/C", "chr7:18=A/G", "chr7:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"], "AD": 100}),
                    VCFRecord("chr7", 18, None, "G", ["C"], info={"AD": 3})
                ],
                "caller3": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"]})],
                "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr7", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller4": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a-b c, a-b b' c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr8", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"], "AD": 100}),
                    VCFRecord("chr8", 18, None, "G", ["C"], info={"AD": 3})
                ],
                "caller3": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=G/C", "chr8:18=A/G", "chr8:20=A/G"], "AD": 100})],
                "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a' a-b c, a-b b' c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr9", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"]})
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"], "AD": 100}),
                    VCFRecord("chr9", 18, None, "G", ["C"], info={"AD": 3})
                ],
                "caller3": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=G/C", "chr9:18=A/G", "chr9:20=A/G"], "AD": 100})],
                "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"])
                ]
            }
        }
    ]

    # Get callers (names found in the "initial" part of any test case)
    self.callers = sorted(
        {caller_name for case in self.test_cases for caller_name in case["initial"]}
    )

    # Write files: for each caller the initial, haplotyped then expected VCF
    file_kinds = (
        ("initial", self.tmp_initial_pathes),
        ("haplotyped", self.tmp_haplotyped_pathes),
        ("expected", self.tmp_expected_pathes)
    )
    for caller_name in self.callers:
        for kind, path_template in file_kinds:
            with VCFIO(path_template.format(caller_name), "w") as handle_out:
                header_info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                }
                if kind == "haplotyped":
                    # Only the haplotyped files declare the MCO_VAR field
                    header_info["MCO_VAR"] = HeaderInfoAttr("MCO_VAR", "Name of the variants merged because their occur on same reads.", type="String", number=".")
                handle_out.info = header_info
                handle_out.extra_header = ["##source={}".format(caller_name)]
                handle_out.writeHeader()
                for case in self.test_cases:
                    for variant in case[kind].get(caller_name, []):
                        handle_out.write(variant)
def getNewHeaderAttr(args):
    """
    Return renamed and new VCFHeader elements for the merged VCF.

    The header of each input is read: FILTER tags not listed in
    args.shared_filters, INFO tags other than args.annotations_field and all
    FORMAT tags are stored under the name "s<input index>_<tag>" with their
    source set to the corresponding caller. An "s<input index>_VCQUAL" INFO
    is declared for each input.

    :param args: The script's parameters.
    :type args: NameSpace
    :return: VCFHeader elements (filter, info, format, samples).
    :rtype: dict
    :raises Exception: If the samples differ between the inputs.
    """
    # Caller name -> prefix, displayed in the description of SRC
    prefix_by_caller = {}
    for idx, name in enumerate(args.calling_sources):
        prefix_by_caller[name] = "s" + str(idx)
    src_description = "Variant callers where the variant is identified. Possible values: {}".format(prefix_by_caller)

    final_filter = {}
    final_info = {
        "SRC": HeaderInfoAttr("SRC", type="String", number=".", description=src_description)
    }
    final_format = {
        "AD": HeaderFormatAttr("AD", type="Integer", number="A", description="Allele Depth"),
        "DP": HeaderFormatAttr("DP", type="Integer", number="1", description="Total Depth"),
        "ADSRC": HeaderFormatAttr("ADSRC", type="Integer", number=".", description="Allele Depth by source"),
        "DPSRC": HeaderFormatAttr("DPSRC", type="Integer", number=".", description="Total Depth by source")
    }
    final_samples = None
    for idx_in, curr_in in enumerate(args.inputs_variants):
        with VCFIO(curr_in) as FH_vcf:
            # Samples: the first input is the reference, the others must match
            if final_samples is None:
                final_samples = FH_vcf.samples
            elif FH_vcf.samples != final_samples:
                raise Exception(
                    "The samples in VCF are not the same: {} in {} and {} in {}.".format(
                        final_samples,
                        args.inputs_variants[0],
                        FH_vcf.samples,
                        curr_in
                    )
                )
            # FILTER
            for tag, data in FH_vcf.filter.items():
                if tag in args.shared_filters:
                    # Filters not based on caller keep their name
                    final_filter[tag] = data
                    continue
                renamed_tag = "s{}_{}".format(idx_in, tag)
                data.id = renamed_tag
                data.source = args.calling_sources[idx_in]
                final_filter[renamed_tag] = data
            # INFO
            for tag, data in FH_vcf.info.items():
                if tag != args.annotations_field:
                    renamed_tag = "s{}_{}".format(idx_in, tag)
                    data.id = renamed_tag
                    data.source = args.calling_sources[idx_in]
                    final_info[renamed_tag] = data
                    continue
                # Manage merge between callers with 0 variants (and 0 annotations) and callers with variants
                if tag not in final_info or len(final_info[tag].description) < len(data.description):
                    final_info[tag] = data
            qual_tag = "s{}_VCQUAL".format(idx_in)
            final_info[qual_tag] = HeaderInfoAttr(
                qual_tag,
                type="Float",
                number="1",
                description="The variant quality",
                source=args.calling_sources[idx_in]
            )
            # FORMAT
            for tag, data in FH_vcf.format.items():
                renamed_tag = "s{}_{}".format(idx_in, tag)
                data.id = renamed_tag
                data.source = args.calling_sources[idx_in]
                final_format[renamed_tag] = data
    merged_header = {}
    merged_header["filter"] = final_filter
    merged_header["info"] = final_info
    merged_header["format"] = final_format
    merged_header["samples"] = final_samples
    return merged_header