def testResultsFilterAnnAndVar(self):
    """Run the command with both variant and annotation filters and check kept IDs.

    The expected list holds, in fixture order, the id of every variant
    flagged ``is_filtered == 0`` followed by the ids of its annotations that
    are also flagged ``is_filtered == 0``. The observed list is built the same
    way from the records and annotations found in the output file.
    """
    cmd = self.cmd + [
        "--input-filters-variants", self.tmp_var_filters,
        "--input-filters-annotations", self.tmp_annot_filters
    ]
    # Execute command
    subprocess.check_call(cmd, stderr=subprocess.DEVNULL)
    # Validate results
    expected = []
    for variant in self.variants:
        if variant.info["is_filtered"] != 0:
            # Annotations of a filtered variant are never expected
            continue
        expected.append(variant.id)
        if "ANN" in variant.info:
            expected.extend(
                annot["id"]
                for annot in variant.info["ANN"]
                if annot["is_filtered"] == 0
            )
    observed = []
    with AnnotVCFIO(self.tmp_output) as FH_results:
        for variant in FH_results:
            observed.append(variant.id)
            if "ANN" in variant.info:
                observed.extend(annot["id"] for annot in variant.info["ANN"])
    self.assertEqual(expected, observed)
def testResultsAnnotRemove(self):
    """Check the annotations kept by filterVCFByAnnot.py in ``remove`` mode.

    Expected entries are ``<record id>:<annotation index>:PASS`` for each
    annotation flagged ``PASS`` in a record whose expected filter is
    ``["PASS"]``. Observed entries are built from every annotation of every
    record of the output file, one entry per ``&``-separated FILTER value.
    """
    cmd = [
        "filterVCFByAnnot.py",
        "--mode", "remove",
        "--input-selected-RNA", self.tmp_selected_rna,
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]
    # Execute command
    subprocess.check_call(cmd, stderr=subprocess.DEVNULL)
    # Validate results
    expected = []
    for record in self.variants:
        if record.info["expected_filter"] != ["PASS"]:
            continue
        # The observed index is the position among the annotations present in
        # the output, so the expected index must be the rank among the kept
        # (PASS) annotations and not the position in the input list. Both are
        # equal only when every PASS annotation precedes the removed ones.
        nb_kept = sum(
            1 for annot in record.info["ANN"]
            if annot["expected_filter"] == "PASS"
        )
        expected.extend(
            "{}:{}:PASS".format(record.id, annot_idx)
            for annot_idx in range(nb_kept)
        )
    observed = []
    with AnnotVCFIO(self.tmp_output) as FH_results:
        for record in FH_results:
            for annot_idx, annot in enumerate(record.info["ANN"]):
                for curr_filter in sorted(annot["FILTER"].split("&")):
                    observed.append("{}:{}:{}".format(
                        record.id, annot_idx, curr_filter))
    self.assertEqual(expected, observed)
def testResultsRecordTag(self):
    """Check the record filters written by filterVCFByAnnot.py in ``tag`` mode.

    Each fixture variant contributes one ``<id>:<filter>`` entry per expected
    filter (sorted); the same entries are rebuilt from the ``filter`` attribute
    of the records read back from the output file.
    """
    cmd = [
        "filterVCFByAnnot.py",
        "--mode", "tag",
        "--input-selected-RNA", self.tmp_selected_rna,
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]
    # Execute command
    subprocess.check_call(cmd, stderr=subprocess.DEVNULL)
    # Validate results
    expected = [
        "{}:{}".format(variant.id, tag)
        for variant in self.variants
        for tag in sorted(variant.info["expected_filter"])
    ]
    with AnnotVCFIO(self.tmp_output) as FH_results:
        observed = [
            "{}:{}".format(variant.id, tag)
            for variant in FH_results
            for tag in sorted(variant.filter)
        ]
    self.assertEqual(expected, observed)
def testResultsRecordRemove(self):
    """Check which records filterVCFByAnnot.py keeps in ``remove`` mode.

    Only the fixture variants whose expected filter is ``["PASS"]`` are
    expected in the output file, in fixture order.
    """
    cmd = [
        "filterVCFByAnnot.py",
        "--mode", "remove",
        "--input-selected-RNA", self.tmp_selected_rna,
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]
    # Execute command
    subprocess.check_call(cmd, stderr=subprocess.DEVNULL)
    # Validate results
    expected = [
        record.id
        for record in self.variants
        if record.info["expected_filter"] == ["PASS"]
    ]
    # Every record present in the output is compared: selecting only the
    # records expected to pass would hide a record that should have been
    # removed but was still written.
    with AnnotVCFIO(self.tmp_output) as FH_results:
        observed = [record.id for record in FH_results]
    self.assertEqual(expected, observed)
group_output.add_argument(
    '-o',
    '--output-fusions',
    required=True,
    help='Path to the output file (format: VCF).'
)
args = parser.parse_args()

# Logger
logging.basicConfig(
    format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
)
log = logging.getLogger()
log.setLevel(logging.INFO)
log.info("Command: " + " ".join(sys.argv))
log.info("Version: " + str(__version__))

# Process: the output is opened first, then the fusions reader
with AnnotVCFIO(args.output_fusions, "w", args.annotation_field) as writer, \
        FusionFileReader.factory(args.input_fusions, "r", args.annotation_field, args.sample_name) as reader:
    # Header: the concrete reader class declares its own VCF header on the writer
    type(reader).setVCFHeader(writer, args.annotation_field)
    writer.samples = [args.sample_name]
    writer.writeHeader()
    # Records: the reader yields two records per iteration, written in order
    for bnd_a, bnd_b in reader:
        writer.write(bnd_a)
        writer.write(bnd_b)
log.info("End of job")
required=True, help='The path to the file outputted file (format: JSON).') args = parser.parse_args() # Logger logging.basicConfig( format= '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s' ) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Convert VCF in python dict json_data = list() with AnnotVCFIO(args.input_variants, "r", args.annotation_field) as FH_vcf: # Get sources IDs for VCF coming from merged sources id_by_src = None if args.merged_sources: SRC_id_desc = FH_vcf.info["SRC"].description.split( "Possible values: ")[1].replace("'", '"') id_by_src = json.loads(SRC_id_desc) # Records for record in FH_vcf: for idx_alt, alt in enumerate(record.alt): allele_record = getAlleleRecord(FH_vcf, record, idx_alt) allele_record.normalizeSingleAllele() curr_json = dict() # Coord information curr_json["coord"] = { "region":
log.info("Command: " + " ".join(sys.argv)) log.info("Version: " + str(__version__)) # Get pathways pathways_by_gene = {} with open(args.input_pathways) as FH_pathways: for line in FH_pathways: fields = [elt.strip() for elt in line.split("\t")] for gene in fields[3:]: if gene not in pathways_by_gene: pathways_by_gene[gene] = set() pathways_by_gene[gene].add(fields[1]) # Write output with AnnotVCFIO(args.output_variants, "w", annot_field=args.annotation_field) as FH_out: with AnnotVCFIO(args.input_variants, annot_field=args.annotation_field) as FH_in: # Header FH_out.copyHeader(FH_in) FH_out.ANN_titles.append("Pathways") FH_out.writeHeader() # Records for record in FH_in: for annot in record.info[FH_in.annot_field]: if annot[args.gene_field] is not None and annot[ args.gene_field] != "": pathways = set() for gene in annot[args.gene_field].split(","): if gene in pathways_by_gene:
def setUp(self):
    """Create the temporary inputs used by the filterVCFByAnnot.py tests.

    Writes a selected-RNA table and an annotated VCF in the system temporary
    folder and stores the written records in ``self.variants``. Each record
    and each annotation carries an ``expected_filter`` value that the tests
    compare with the output of the script.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_selected_rna = os.path.join(tmp_folder, unique_id + "_rna.tsv")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Create RNA ref
    with open(self.tmp_selected_rna, "w") as FH_rna:
        FH_rna.write("#Gene\tTranscript\n")
        FH_rna.write("Gene_1\tENST_selected1\n")
        FH_rna.write("Gene_1\tENST_selected2\n")

    # Create VCF
    with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.ANN_titles = [
            "Allele", "Consequence", "Feature", "EUR_AF", "gnomAD_AF",
            "expected_filter"
        ]
        FH_var.info = {
            # The format listed in the description is kept in line with
            # ANN_titles (EUR_AF included).
            "ANN": HeaderInfoAttr(
                "ANN",
                "Consequence annotations from Ensembl VEP. Format: "
                "Allele|Consequence|Feature|EUR_AF|gnomAD_AF|expected_filter.",
                type="String",
                number="."),
            "expected_filter": HeaderInfoAttr(
                "expected_filter",
                "The expected filters.",
                type="String",
                number=".")
        }
        FH_var.writeHeader()
        self.variants = [
            VCFRecord(
                "artificial_chr1", 14, "alt_00", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "PASS"
                    }],
                    "expected_filter": ["PASS"]
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_01", "G", ["T"], None, None,
                {"expected_filter": ["CSQ"]}),
            VCFRecord(
                "artificial_chr1", 14, "alt_02", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "synonymous_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.CSQ"
                    }],
                    "expected_filter": ["CSQ"]
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_03", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.01,
                        "expected_filter": "ANN.popAF"
                    }],
                    "expected_filter": ["popAF"]
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_04", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "other",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.RNA"
                    }],
                    "expected_filter": ["CSQ"]
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_05", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "G",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["CSQ"]
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_06", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "PASS"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["PASS"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_07", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.01,
                        "expected_filter": "ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["popAF"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_08", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "synonymous_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.CSQ"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["CSQ"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_09", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "other",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.RNA"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["CSQ"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_10", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "synonymous_variant",
                        "Feature": "other",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.01,
                        "expected_filter": "ANN.CSQ&ANN.RNA&ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["CSQ", "popAF"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_11", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "synonymous_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.01,
                        "expected_filter": "ANN.CSQ&ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["CSQ", "popAF"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_12", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "synonymous_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.01,
                        "expected_filter": "ANN.CSQ&ANN.popAF"
                    }, {
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "other",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.01,
                        "expected_filter": "ANN.RNA&ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.001&0.001",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC"
                    }],
                    "expected_filter": ["CSQ", "popAF"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_13", "G", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "T",
                        "Consequence": "synonymous_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.CSQ&ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.05&0.05",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }],
                    "expected_filter": ["CSQ", "popAF"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_14", "G", ["GT"], None, None, {
                    "ANN": [{
                        "Allele": "GT",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.05&0.05",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }, {
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.05&0.05",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }],
                    "expected_filter": ["popAF"],
                }),
            # NOTE(review): the id "alt_15" is used by this record and by the
            # next one; failure messages cannot tell them apart - confirm
            # whether the ids should be renumbered.
            VCFRecord(
                "artificial_chr1", 15, "alt_15", "-", ["T"], None, None, {
                    "ANN": [{
                        "Allele": "GT",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }, {
                        "Allele": "T",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.05&0.05",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }],
                    "expected_filter": ["popAF"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_15", "G", ["-"], None, None, {
                    "ANN": [{
                        "Allele": "-",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.popAF"
                    }, {
                        "Allele": "G",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.05&0.05",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }],
                    "expected_filter": ["popAF"],
                }),
            VCFRecord(
                "artificial_chr1", 14, "alt_16", "GG", ["G"], None, None, {
                    "ANN": [{
                        "Allele": "-",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }, {
                        "Allele": "G",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.01&0.01",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.popAF"
                    }, {
                        "Allele": "C",
                        "Consequence": "missense_variant",
                        "Feature": "ENST_selected1",
                        "EUR_AF": "0.05&0.05",
                        "gnomAD_AF": 0.001,
                        "expected_filter": "ANN.COLLOC&ANN.popAF"
                    }],
                    "expected_filter": ["popAF"],
                })
        ]
        # Records are written in list order; tests rely on that order
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """Create the temporary inputs used by the filterAnnotVCF.py tests.

    Writes a variants filter file, an annotations filter file and an
    annotated VCF in the system temporary folder, prepares the base command
    line in ``self.cmd`` and stores the written records in ``self.variants``.
    Each record and each annotation carries an ``is_filtered`` flag that the
    tests compare with the output of the script (0 for the elements expected
    in the output).
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_var_filters = os.path.join(tmp_folder, unique_id + "_varFilters.json")
    self.tmp_annot_filters = os.path.join(tmp_folder, unique_id + "_annFilters.json")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Command
    self.cmd = [
        "filterAnnotVCF.py",
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Create filters
    with open(self.tmp_var_filters, "w") as FH_filter:
        # Two filters combined with "or": one on the record filter, one on the chromosome
        FH_filter.write(
            '{ "class": "FiltersCombiner", "operator": "or", "filters": [ '
            '{ "class": "Filter", "getter": "filter", "action": "select", '
            '"aggregator": "ratio:1", "operator": "!=", "values": "CSQ" }, '
            '{ "class": "Filter", "getter": "chrom", "action": "select", '
            '"aggregator": "nb:1", "operator": "==", "values": "artificial_chr2" } '
            '] }'
        )
    with open(self.tmp_annot_filters, "w") as FH_filter:
        FH_filter.write(
            '{ "class": "Filter", "getter": "FILTER", "action": "select", '
            '"aggregator": "ratio:1", "operator": "==", "values": "PASS" }'
        )

    # Create VCF
    with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.ANN_titles = ["Allele", "id", "is_filtered", "FILTER"]
        FH_var.info = {
            "ANN": HeaderInfoAttr(
                "ANN",
                "Consequence annotations from Ensembl VEP. Format: Allele|id|is_filtered|FILTER.",
                type="String",
                number="."),
            "is_filtered": HeaderInfoAttr(
                "is_filtered",
                "The expected result.",
                type="Integer",
                number="1")
        }
        FH_var.writeHeader()
        self.variants = [
            VCFRecord(
                "artificial_chr1", 10, "alt_00", "G", ["T"], None, ["PASS"],
                {"is_filtered": 0}),
            VCFRecord(
                "artificial_chr1", 10, "alt_01", "G", ["T"], None, ["CSQ"],
                {"is_filtered": 1}),
            VCFRecord(
                "artificial_chr2", 10, "alt_02", "G", ["T"], None, ["CSQ"], {
                    "is_filtered": 0,  # Protected
                }),
            VCFRecord(
                "artificial_chr1", 10, "alt_03", "G", ["T"], None, ["PASS"], {
                    "ANN": [{
                        "Allele": "T",
                        "id": "ann_00",
                        "FILTER": "PASS",
                        "is_filtered": 0
                    }],
                    "is_filtered": 0
                }),
            VCFRecord(
                "artificial_chr1", 10, "alt_04", "G", ["T"], None, ["PASS"], {
                    "ANN": [{
                        "Allele": "C",
                        "id": "ann_01",
                        "FILTER": "ANN.COLLOC",
                        "is_filtered": 1
                    }],
                    "is_filtered": 0
                }),
            VCFRecord(
                "artificial_chr1", 10, "alt_05", "G", ["T"], None, ["CSQ"], {
                    "ANN": [{
                        "Allele": "C",
                        "id": "ann_02",
                        "FILTER": "ANN.COLLOC",
                        "is_filtered": 1
                    }],
                    "is_filtered": 1
                }),
            VCFRecord(
                "artificial_chr1", 10, "alt_06", "G", ["T"], None, ["CSQ"], {
                    "ANN": [{
                        "Allele": "T",
                        "id": "ann_03",
                        "FILTER": "PASS",
                        "is_filtered": 0
                    }],
                    "is_filtered": 1
                }),
            VCFRecord(
                "artificial_chr1", 10, "alt_07", "G", ["T"], None, ["PASS"], {
                    "ANN": [
                        {
                            "Allele": "T",
                            "id": "ann_04",
                            "FILTER": "PASS",
                            "is_filtered": 0
                        },
                        {
                            "Allele": "C",
                            "id": "ann_05",
                            "FILTER": "ANN.COLLOC",
                            "is_filtered": 1
                        },
                    ],
                    "is_filtered": 0
                }),
            VCFRecord(
                "artificial_chr1", 10, "alt_08", "G", ["T"], None, ["PASS"], {
                    "ANN": [
                        {
                            "Allele": "T",
                            "id": "ann_06",
                            "FILTER": "ANN.popAF",
                            "is_filtered": 1
                        },
                        {
                            "Allele": "C",
                            "id": "ann_07",
                            "FILTER": "ANN.COLLOC&ANN.popAF",
                            "is_filtered": 1
                        },
                    ],
                    "is_filtered": 0
                }),
            VCFRecord(
                "artificial_chr2", 10, "alt_09", "G", ["T"], None, ["CSQ"], {
                    "ANN": [
                        {
                            "Allele": "T",
                            "id": "ann_08",
                            "FILTER": "ANN.popAF",
                            "is_filtered": 1
                        },
                        {
                            "Allele": "C",
                            "id": "ann_09",
                            "FILTER": "ANN.COLLOC&ANN.popAF",
                            "is_filtered": 1
                        },
                    ],
                    "is_filtered": 0  # Protected
                }),
            VCFRecord(
                "artificial_chr2", 10, "alt_10", "G", ["T"], None, ["CSQ"], {
                    "ANN": [
                        {
                            "Allele": "T",
                            "id": "ann_10",
                            "FILTER": "PASS",
                            "is_filtered": 0
                        },
                        {
                            "Allele": "C",
                            "id": "ann_11",
                            "FILTER": "ANN.COLLOC&ANN.popAF",
                            "is_filtered": 1
                        },
                    ],
                    "is_filtered": 0  # Protected
                })
        ]
        # Records are written in list order; tests rely on that order
        for curr_var in self.variants:
            FH_var.write(curr_var)
with MAFIO(args.input_variants) as FH_in: for record in FH_in: nb_records += 1 samples.add(record["Tumor_Sample_Barcode"]) variants_id = getName(record) if variants_id not in occur_by_id: occur_by_id[variants_id] = {"nb_expec": 0, "data": {}} occur_by_id[variants_id]["nb_expec"] += 1 samples = sorted(samples) log.info("{} samples, {} variants and {} records.".format( len(samples), len(occur_by_id), nb_records)) # Convert log.info("Convert to VCF.") with MAFIO(args.input_variants) as FH_in: with AnnotVCFIO(args.output_variants, "w") as FH_out: # Header FH_out.samples = samples FH_out.ANN_titles = [ "Allele", "Consequence", "SYMBOL", "Feature_type", "Feature", "HGVSc", "HGVSp", "RefSeq" ] FH_out.info = { "SC": HeaderInfoAttr("SC", "Mutated sample count", type="Integer", number="1"), "AD": HeaderInfoAttr("AD", "Allele depth in tumor",