def test_gene_info_with_variant_that_is_ref_v38(self) -> None:
    """
    GeneInfo raises ValueError when a haplotype variant allele equals the v38 reference allele.

    The haplotype "*3" carries allele "G" for the rs id. With a v38 reference
    allele of "C" construction is expected to succeed; with a v38 reference
    allele of "G" it is expected to fail.
    """
    rs_id = "rs294924"

    def make_rs_id_info(reference_allele_v38: str) -> RsIdInfo:
        # Only the v38 reference allele differs between the two scenarios.
        return RsIdInfo(
            rs_id,
            "A",
            reference_allele_v38,
            GeneCoordinate("X", 499593),
            GeneCoordinate("chrX", 399483),
        )

    gene_name = "FAKE"
    wild_type_haplotype_name = "*1"
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    annotations = {rs_id: Annotation("399483A>C", "399483C>A")}
    star_three = Haplotype("*3", "No Function", frozenset([Variant("rs294924", "G")]))
    haplotype_set = frozenset([star_three])
    infos_with_other_ref = frozenset([make_rs_id_info("C")])
    infos_with_variant_as_ref = frozenset([make_rs_id_info("G")])

    # Sanity check: the same haplotypes are fine when "G" is not the v38 reference.
    GeneInfo(gene_name, wild_type_haplotype_name, haplotype_set, infos_with_other_ref, no_drugs, annotations)

    with self.assertRaises(ValueError):
        GeneInfo(
            gene_name,
            wild_type_haplotype_name,
            haplotype_set,
            infos_with_variant_as_ref,
            no_drugs,
            annotations,
        )
def test_genotype_reporter_non_empty_input_v37(self) -> None:
    """
    Check the calls TSV text produced by GenotypeReporter for a mixed set of full calls (V37).

    The expected text pins down, among other things, that a missing v38
    coordinate or reference allele is written as "UNKNOWN", that multiple
    rs ids are joined with ";", and that rows appear in the order of
    chromosomes 1, 2, 15, X.
    """
    full_calls = [
        FullCall(
            GeneCoordinate("1", 1), "C", None, None, ("C", "CAG"), "DPYD", ("rs664",),
            "1A>C;1A>G", FullCallFilter.PASS, "1A>C;1A>G?", FullCallFilter.UNKNOWN,
        ),
        FullCall(
            GeneCoordinate("1", 5), "A", GeneCoordinate("chr1", 25), "A", ("G", "C"), "DPYD", (".",),
            "25A>C;25A>G", FullCallFilter.PASS, "25A>C;25A>G", FullCallFilter.PASS,
        ),
        FullCall(
            GeneCoordinate("1", 15), "C", None, None, ("C", "CAG"), "DPYD", ("rs536",),
            "35A>C;35A>G", FullCallFilter.PASS, "35A>C;35A>G?", FullCallFilter.UNKNOWN,
        ),
        FullCall(
            GeneCoordinate("X", 15), "TT", GeneCoordinate("chrX", 40), "AA", ("TT", "TT"), "GENE", ("rs23",),
            "REF_CALL", FullCallFilter.NO_CALL, "627AA>TT", FullCallFilter.INFERRED_PASS,
        ),
        FullCall(
            GeneCoordinate("2", 154663), "T", GeneCoordinate("chr2", 40565464), "T", ("T", "T"), "BRAF",
            ("rs154", "rs8839"),
            "REF_CALL", FullCallFilter.NO_CALL, "REF_CALL", FullCallFilter.NO_CALL,
        ),
        FullCall(
            GeneCoordinate("15", 24113), "A", GeneCoordinate("chr15", 684633), "T", ("T", "T"), ".",
            ("rs462", "rs9820", "rs536"),
            "29482A>T", FullCallFilter.PASS, "REF_CALL", FullCallFilter.PASS,
        ),
    ]
    analysis = PgxAnalysis(FullCallData(frozenset(full_calls)), {})

    actual_text = GenotypeReporter.get_calls_tsv_text(analysis, "Panel_v0.2", "V1", ReferenceAssembly.V37)

    header_line = "\t".join([
        "gene",
        "chromosome_v37",
        "position_v37",
        "chromosome_v38",
        "position_v38",
        "ref_v37",
        "ref_v38",
        "allele1",
        "allele2",
        "rsid",
        "variant_annotation_v37",
        "filter_v37",
        "variant_annotation_v38",
        "filter_v38",
        "panel_version",
        "repo_version",
    ]) + "\n"
    data_lines = (
        "DPYD\t1\t1\tUNKNOWN\tUNKNOWN\tC\tUNKNOWN\tC\tCAG\trs664\t1A>C;1A>G\tPASS\t1A>C;1A>G?\tUNKNOWN\tPanel_v0.2\tV1\n",
        "DPYD\t1\t5\tchr1\t25\tA\tA\tC\tG\t.\t25A>C;25A>G\tPASS\t25A>C;25A>G\tPASS\tPanel_v0.2\tV1\n",
        "DPYD\t1\t15\tUNKNOWN\tUNKNOWN\tC\tUNKNOWN\tC\tCAG\trs536\t35A>C;35A>G\tPASS\t35A>C;35A>G?\tUNKNOWN\tPanel_v0.2\tV1\n",
        "BRAF\t2\t154663\tchr2\t40565464\tT\tT\tT\tT\trs154;rs8839\tREF_CALL\tNO_CALL\tREF_CALL\tNO_CALL\tPanel_v0.2\tV1\n",
        ".\t15\t24113\tchr15\t684633\tA\tT\tT\tT\trs462;rs9820;rs536\t29482A>T\tPASS\tREF_CALL\tPASS\tPanel_v0.2\tV1\n",
        "GENE\tX\t15\tchrX\t40\tTT\tAA\tTT\tTT\trs23\tREF_CALL\tNO_CALL\t627AA>TT\tINFERRED_PASS\tPanel_v0.2\tV1\n",
    )
    expected_text = header_line + "".join(data_lines)

    self.assertEqual(expected_text, actual_text)
def test_gene_info_with_overlapping_rs_id_infos(self) -> None:
    """
    GeneInfo raises ValueError when two of its rs id infos overlap.

    The second rs id info starts one position before the first and has a
    two-base reference allele, so it reaches the position of the first.
    """
    gene_name = "FAKE"
    wild_type_haplotype_name = "*1"
    no_haplotypes: FrozenSet[Haplotype] = frozenset()
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    no_annotations: Dict[str, Annotation] = {}

    single_base_info = RsIdInfo(
        "rs294924",
        "A",
        "A",
        GeneCoordinate("X", 499593),
        GeneCoordinate("chrX", 399483),
    )
    two_base_info = RsIdInfo(
        "rs294927",
        "AA",
        "AA",
        GeneCoordinate("X", 499592),
        GeneCoordinate("chrX", 399482),
    )

    # A single rs id info cannot overlap with anything, so this must succeed.
    GeneInfo(
        gene_name,
        wild_type_haplotype_name,
        no_haplotypes,
        frozenset([single_base_info]),
        no_drugs,
        no_annotations,
    )

    with self.assertRaises(ValueError):
        GeneInfo(
            gene_name,
            wild_type_haplotype_name,
            no_haplotypes,
            frozenset([single_base_info, two_base_info]),
            no_drugs,
            no_annotations,
        )
def test_get_covered_coordinates_multiple(self) -> None:
    """A four-base allele starting at X:17 covers positions 17 through 20 on X."""
    covered = get_covered_coordinates(GeneCoordinate("X", 17), "AGTA")

    expected_covered = {GeneCoordinate("X", position) for position in (17, 18, 19, 20)}
    self.assertEqual(expected_covered, covered)
def from_json(cls, data: Json, chromosome_v37: str, chromosome_v38: str) -> "RsIdInfo":
    """
    Build an RsIdInfo from a JSON-like mapping plus the chromosome names.

    Reads the keys 'rsid', 'referenceAlleleV37', 'referenceAlleleV38',
    'positionV37' and 'positionV38' from ``data``. The rs id and alleles are
    converted with ``str()`` and the positions with ``int()``; the chromosome
    of each start coordinate comes from the corresponding argument.
    """
    # Arguments are evaluated left to right, so keys are read in the order listed above.
    return RsIdInfo(
        str(data['rsid']),
        str(data['referenceAlleleV37']),
        str(data['referenceAlleleV38']),
        GeneCoordinate(chromosome_v37, int(data['positionV37'])),
        GeneCoordinate(chromosome_v38, int(data['positionV38'])),
    )
def get_covered_coordinates(start_coordinate: GeneCoordinate, allele: str) -> Set[GeneCoordinate]:
    """
    Return the set of coordinates spanned by ``allele`` when it starts at ``start_coordinate``.

    One coordinate is produced per character of ``allele``: same chromosome,
    positions increasing by one from the start position. An empty allele
    gives an empty set.
    """
    covered: Set[GeneCoordinate] = set()
    for offset in range(len(allele)):
        covered.add(GeneCoordinate(start_coordinate.chromosome, start_coordinate.position + offset))
    return covered
def __get_call_from_variants(cls, call_index: int, sample_r_id: str, variants: Dict[str, Any]) -> SimpleCall:
    """
    Assemble the SimpleCall at ``call_index`` from ``variants``.

    Each field is extracted by a dedicated class helper. When both called
    alleles equal the reference allele the annotation is
    REF_CALL_ANNOTATION_STRING; otherwise it is read from ``variants``.
    The resulting call always gets the filter SimpleCallFilter.PASS.
    """
    coordinate = GeneCoordinate(
        cls.__get_chromosome_from_variants(call_index, variants),
        cls.__get_position_from_variants(call_index, variants),
    )
    ref_allele = cls.__get_reference_allele_from_variants(call_index, variants)
    called_alleles = cls.__get_called_alleles_from_variants(call_index, sample_r_id, variants)
    gene_name = cls.__get_gene_from_variants(call_index, variants)
    rs_id_values = cls.__get_rs_ids_from_variants(call_index, variants)

    # The annotation is only looked up in the variants for calls that are not homozygous reference.
    is_reference_call = called_alleles == (ref_allele, ref_allele)
    annotation = (
        REF_CALL_ANNOTATION_STRING
        if is_reference_call
        else cls.__get_variant_annotation_from_variants(call_index, variants)
    )

    return SimpleCall(
        coordinate,
        ref_allele,
        called_alleles,
        gene_name,
        rs_id_values,
        annotation,
        SimpleCallFilter.PASS,
    )
def test_gene_info_with_rs_id_infos_for_different_chromosome(self) -> None:
    """
    GeneInfo raises ValueError when its rs id infos disagree on the chromosome.

    Each rs id info on its own is accepted, whatever chromosomes it uses.
    Combining infos whose v37 and/or v38 chromosomes differ is expected to fail.
    """
    def make_rs_id_info(v37_chromosome: str, v38_chromosome: str) -> RsIdInfo:
        # Everything except the chromosomes is identical across the infos.
        return RsIdInfo(
            "rs294924",
            "A",
            "A",
            GeneCoordinate(v37_chromosome, 499593),
            GeneCoordinate(v38_chromosome, 399483),
        )

    gene_name = "FAKE"
    wild_type_haplotype_name = "*1"
    no_haplotypes: FrozenSet[Haplotype] = frozenset()
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    no_annotations: Dict[str, Annotation] = {}

    base_info = make_rs_id_info("X", "chrX")
    other_v37_info = make_rs_id_info("1", "chrX")
    other_v38_info = make_rs_id_info("X", "1")
    other_both_info = make_rs_id_info("1", "1")

    for lone_info in (base_info, other_v37_info, other_v38_info, other_both_info):
        GeneInfo(
            gene_name,
            wild_type_haplotype_name,
            no_haplotypes,
            frozenset([lone_info]),
            no_drugs,
            no_annotations,
        )

    conflicting_groups = (
        [base_info, other_v37_info],
        [base_info, other_v38_info],
        [base_info, other_v37_info, other_v38_info, other_both_info],
    )
    for group in conflicting_groups:
        with self.assertRaises(ValueError):
            GeneInfo(
                gene_name,
                wild_type_haplotype_name,
                no_haplotypes,
                frozenset(group),
                no_drugs,
                no_annotations,
            )
def test_gene_info_with_overlapping_haplotype_names(self) -> None:
    """
    GeneInfo raises ValueError when two haplotypes of the gene share a name.

    Haplotypes named "*2" and "*4" are accepted; two different haplotypes
    both named "*2" are expected to be rejected.
    """
    gene_name = "FAKE"
    wild_type_haplotype_name = "*1"
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    no_annotations: Dict[str, Annotation] = {}

    first_variant = Variant("rs94982", "A")
    second_variant = Variant("rs394934", "T")
    rs_id_info_set = frozenset([
        RsIdInfo(
            first_variant.rs_id, "C", "C", GeneCoordinate("X", 4994545), GeneCoordinate("chrX", 2993823)
        ),
        RsIdInfo(
            second_variant.rs_id, "G", "G", GeneCoordinate("X", 3993842), GeneCoordinate("chrX", 2949923)
        ),
    ])

    star_two = Haplotype("*2", "No Function", frozenset([first_variant]))
    both_variants = frozenset([first_variant, second_variant])
    uniquely_named = frozenset([star_two, Haplotype("*4", "Partial Function", both_variants)])
    name_reused = frozenset([star_two, Haplotype("*2", "Partial Function", both_variants)])

    GeneInfo(gene_name, wild_type_haplotype_name, uniquely_named, rs_id_info_set, no_drugs, no_annotations)

    with self.assertRaises(ValueError):
        GeneInfo(gene_name, wild_type_haplotype_name, name_reused, rs_id_info_set, no_drugs, no_annotations)
def test_gene_info_with_ref_seq_difference_without_annotation(self) -> None:
    """
    GeneInfo raises ValueError when a reference sequence difference has no annotation.

    The rs id info used here has reference allele "A" in v37 and "C" in v38,
    while the annotation mapping passed in is empty.
    """
    gene_name = "FAKE"
    wild_type_haplotype_name = "*1"
    no_haplotypes: FrozenSet[Haplotype] = frozenset()
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    no_annotations: Dict[str, Annotation] = {}

    no_rs_id_infos: FrozenSet[RsIdInfo] = frozenset()
    differing_ref_info = RsIdInfo(
        "rs294924",
        "A",
        "C",
        GeneCoordinate("X", 499593),
        GeneCoordinate("chrX", 399483),
    )

    # Without any rs id infos there is nothing that needs an annotation.
    GeneInfo(gene_name, wild_type_haplotype_name, no_haplotypes, no_rs_id_infos, no_drugs, no_annotations)

    with self.assertRaises(ValueError):
        GeneInfo(
            gene_name,
            wild_type_haplotype_name,
            no_haplotypes,
            frozenset([differing_ref_info]),
            no_drugs,
            no_annotations,
        )
def __get_relevant_coordinates(cls, chromosome: str, position: int, ref_allele: str) -> Tuple[GeneCoordinate, ...]:
    """
    Return one coordinate per character of ``ref_allele``, in increasing position order.

    The first coordinate is at ``position`` on ``chromosome``; each following
    one is a single position further. An empty ``ref_allele`` gives an empty tuple.
    """
    coordinates = [GeneCoordinate(chromosome, position + offset) for offset in range(len(ref_allele))]
    return tuple(coordinates)
def __get_example_panel(cls) -> Panel:
    """
    Build a fixed example Panel named "Panel", version "0.2".

    The panel holds three genes, DPYD, FAKE and FAKE2, each with "*1" as the
    reference haplotype name.
    """
    # DPYD: three haplotypes, four rs id infos, one v37/v38 reference difference.
    two_a_variant = Variant("rs3918290", "T")
    two_b_variant = Variant("rs1801159", "C")
    three_variant = Variant("rs72549303", "TG")
    dpyd_gene_info = GeneInfo(
        "DPYD",
        "*1",
        frozenset({
            Haplotype("*2A", "No Function", frozenset({two_a_variant})),
            Haplotype("*2B", "No Function", frozenset({two_a_variant, two_b_variant})),
            Haplotype("*3", "Normal Function", frozenset({three_variant})),
        }),
        frozenset({
            RsIdInfo("rs3918290", "C", "C", GeneCoordinate("1", 97915614), GeneCoordinate("chr1", 97450058)),
            RsIdInfo(
                "rs72549309", "GATGA", "GATGA", GeneCoordinate("1", 98205966), GeneCoordinate("chr1", 97740410)
            ),
            RsIdInfo("rs1801159", "T", "T", GeneCoordinate("1", 97981395), GeneCoordinate("chr1", 97515839)),
            RsIdInfo("rs72549303", "TG", "TC", GeneCoordinate("1", 97915621), GeneCoordinate("chr1", 97450065)),
        }),
        frozenset({
            DrugInfo(
                "5-Fluorouracil",
                "https://www.pharmgkb.org/chemical/PA128406956/guidelineAnnotation/PA166104939",
            ),
            DrugInfo(
                "Capecitabine",
                "https://www.pharmgkb.org/chemical/PA448771/guidelineAnnotation/PA166104963",
            ),
        }),
        {"rs72549303": Annotation("6744CA>GA", "6744GA>CA")},
    )

    # FAKE: one haplotype, identical reference alleles in both assemblies.
    fake_annotations: Dict[str, Annotation] = {}
    fake_gene_info = GeneInfo(
        "FAKE",
        "*1",
        frozenset({Haplotype("*4A", "Reduced Function", frozenset({Variant("rs1212125", "C")}))}),
        frozenset({
            RsIdInfo("rs1212125", "T", "T", GeneCoordinate("5", 97915617), GeneCoordinate("chr5", 97450060)),
        }),
        frozenset({DrugInfo("Aspirin", "https://www.pharmgkb.org/some_other_url")}),
        fake_annotations,
    )

    # FAKE2: like FAKE, but the reference allele differs between v37 and v38.
    fake2_gene_info = GeneInfo(
        "FAKE2",
        "*1",
        frozenset({Haplotype("*4A", "Reduced Function", frozenset({Variant("rs1212127", "C")}))}),
        frozenset({
            RsIdInfo("rs1212127", "C", "T", GeneCoordinate("16", 97915617), GeneCoordinate("chr16", 97450060)),
        }),
        frozenset({DrugInfo("Aspirin", "https://www.pharmgkb.org/some_other_url")}),
        {"rs1212127": Annotation("1324C>T", "1324T>C")},
    )

    return Panel("Panel", "0.2", frozenset({dpyd_gene_info, fake_gene_info, fake2_gene_info}))
def test_get_covered_coordinates_single(self) -> None:
    """A single-base allele covers only its start coordinate."""
    start = GeneCoordinate("X", 17)

    covered = get_covered_coordinates(start, "A")

    self.assertEqual({start}, covered)
def test_get_covered_coordinates_empty(self) -> None:
    """An empty allele covers no coordinates at all."""
    covered = get_covered_coordinates(GeneCoordinate("X", 17), "")

    nothing_covered: Set[GeneCoordinate] = set()
    self.assertEqual(nothing_covered, covered)
def test_panel_with_overlapping_rs_id_infos_for_different_genes(self) -> None:
    """
    Panel raises ValueError when rs id infos of different genes overlap without being identical.

    Two genes that share exactly the same rs id info are accepted. A gene
    whose two-base rs id info starts one position before the other gene's
    two-base rs id info is expected to be rejected.
    """
    panel_name = "FakePanel"
    panel_version = "1.0"
    wild_type_haplotype_name = "*1"
    no_haplotypes: FrozenSet[Haplotype] = frozenset()
    no_drugs: FrozenSet[DrugInfo] = frozenset()
    no_annotations: Dict[str, Annotation] = {}

    shared_info = RsIdInfo(
        "rs294924", "AT", "AT", GeneCoordinate("X", 499593), GeneCoordinate("chrX", 399483)
    )
    unrelated_info = RsIdInfo(
        "rs3949923", "C", "C", GeneCoordinate("X", 293993), GeneCoordinate("chrX", 1388323)
    )
    shifted_info = RsIdInfo(
        "rs12993", "GG", "GG", GeneCoordinate("X", 499592), GeneCoordinate("chrX", 399482)
    )

    def make_gene_info(gene_name: str, *infos: RsIdInfo) -> GeneInfo:
        # All gene infos in this test differ only in gene name and rs id infos.
        return GeneInfo(
            gene_name,
            wild_type_haplotype_name,
            no_haplotypes,
            frozenset(infos),
            no_drugs,
            no_annotations,
        )

    fake_gene_info = make_gene_info("FAKE", shared_info)
    other_gene_info_identical = make_gene_info("OTHER", shared_info, unrelated_info)
    other_gene_info_shifted = make_gene_info("OTHER", shifted_info)

    Panel(panel_name, panel_version, frozenset([fake_gene_info, other_gene_info_identical]))

    with self.assertRaises(ValueError):
        Panel(panel_name, panel_version, frozenset([fake_gene_info, other_gene_info_shifted]))
def test_load_panel(self) -> None:
    """
    Loading the test resource "test_panel.json" yields the expected Panel.

    The expected panel is named "fake_panel", has version "0.3" and contains
    the genes DPYD, FAKE and FAKE2.
    """
    loaded_panel = load_panel(str(get_test_resource("test_panel.json")))

    # DPYD: three haplotypes, five rs id infos, two v37/v38 reference differences.
    two_a_variant = Variant("rs3918290", "T")
    two_b_variant = Variant("rs1801159", "C")
    three_variant = Variant("rs72549303", "TG")
    expected_dpyd = GeneInfo(
        "DPYD",
        "*1",
        frozenset({
            Haplotype("*2A", "No Function", frozenset({two_a_variant})),
            Haplotype("*2B", "No Function", frozenset({two_a_variant, two_b_variant})),
            Haplotype("*3", "Normal Function", frozenset({three_variant})),
        }),
        frozenset({
            RsIdInfo("rs3918290", "C", "C", GeneCoordinate("1", 97915614), GeneCoordinate("chr1", 97450058)),
            RsIdInfo(
                "rs72549309", "GATGA", "GATGA", GeneCoordinate("1", 98205966), GeneCoordinate("chr1", 97740410)
            ),
            RsIdInfo("rs1801159", "T", "T", GeneCoordinate("1", 97981395), GeneCoordinate("chr1", 97515839)),
            RsIdInfo("rs72549303", "TG", "TC", GeneCoordinate("1", 97915621), GeneCoordinate("chr1", 97450065)),
            RsIdInfo("rs1801265", "G", "A", GeneCoordinate("1", 98348885), GeneCoordinate("chr1", 97883329)),
        }),
        frozenset({
            DrugInfo("5-Fluorouracil", "https://www.source_url.org/5-Fluorouracil"),
            DrugInfo("Capecitabine", "https://www.source_url.org/Capecitabine"),
        }),
        {
            "rs72549303": Annotation("6744CA>GA", "6744GA>CA"),
            "rs1801265": Annotation("85C>T", "85T>C"),
        },
    )

    # FAKE: one haplotype, no reference differences.
    fake_annotations: Dict[str, Annotation] = {}
    expected_fake = GeneInfo(
        "FAKE",
        "*1",
        frozenset({Haplotype("*4A", "Reduced Function", frozenset({Variant("rs1212125", "C")}))}),
        frozenset({
            RsIdInfo("rs1212125", "T", "T", GeneCoordinate("5", 97915617), GeneCoordinate("chr5", 97450060)),
        }),
        frozenset({DrugInfo("Aspirin", "https://www.source_url.org/Aspirin")}),
        fake_annotations,
    )

    # FAKE2: one haplotype, reference allele differs between v37 and v38.
    expected_fake2 = GeneInfo(
        "FAKE2",
        "*1",
        frozenset({Haplotype("*4A", "Reduced Function", frozenset({Variant("rs1212127", "C")}))}),
        frozenset({
            RsIdInfo("rs1212127", "C", "T", GeneCoordinate("16", 97915617), GeneCoordinate("chr16", 97450060)),
        }),
        frozenset({DrugInfo("Aspirin", "https://www.source_url.org/Aspirin")}),
        {"rs1212127": Annotation("1324C>T", "1324T>C")},
    )

    expected_panel = Panel("fake_panel", "0.3", frozenset({expected_dpyd, expected_fake, expected_fake2}))
    self.assertEqual(expected_panel, loaded_panel)