def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
    """Create an ML method object from its specification.

    Args:
        ml_method_id: user-chosen identifier of the method; used in error messages and assigned
            to the created object's ``name`` attribute.
        ml_specification: either the ML method class name as a string (no explicit parameters),
            or a dict with exactly one ML method class name as key (mapping to its parameters),
            optionally accompanied by ``model_selection_cv`` / ``model_selection_n_folds``.

    Returns:
        A tuple ``(method, ml_specification)`` where ``ml_specification`` is the specification
        merged with the loaded defaults and with the method's entry replaced by the parameters
        returned from ``MLParser.create_method_instance``.

    Raises:
        AssertionError: if the number of ML method keys in the specification is not exactly one.
    """
    general_keys = ["model_selection_cv", "model_selection_n_folds"]
    valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MLMethod, "", "ml_methods/")

    # A bare string is shorthand for "this method, no explicitly specified parameters".
    if isinstance(ml_specification, str):
        ml_specification = {ml_specification: {}}

    # User-provided values take precedence over the general ML method defaults.
    ml_specification = {**DefaultParamsLoader.load("ml_methods/", "MLMethod"), **ml_specification}

    ParameterValidator.assert_all_in_valid_list(list(ml_specification.keys()), general_keys + valid_class_values,
                                                "MLParser", ml_method_id)

    non_default_keys = [key for key in ml_specification if key not in general_keys]

    # Count only the method keys: this does not depend on how many general keys the defaults supplied.
    assert len(non_default_keys) == 1, \
        f"MLParser: ML method {ml_method_id} was not correctly specified. Expected exactly 1 key " \
        f"(ML method name), got {len(non_default_keys)} instead: " \
        f"{str(non_default_keys)[1:-1]}."

    ml_method_class_name = non_default_keys[0]
    ml_method_class = ReflectionHandler.get_class_by_name(ml_method_class_name, "ml_methods/")

    # Method-specific defaults are overridden by whatever the user specified for the method.
    ml_specification[ml_method_class_name] = {
        **DefaultParamsLoader.load("ml_methods/", ml_method_class_name, log_if_missing=False),
        **ml_specification[ml_method_class_name]
    }

    method, params = MLParser.create_method_instance(ml_specification, ml_method_class, ml_method_id)
    ml_specification[ml_method_class_name] = params
    method.name = ml_method_id

    return method, ml_specification
def test_import_receptor_dataset(self):
    """Import the dummy 10x Genomics files as paired TRA/TRB receptors and check both chains."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
    PathBuilder.build(tmp_dir)
    self.create_dumy_dataset(tmp_dir, add_metadata=False)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "tenx_genomics")
    import_params.update({
        "is_repertoire": False,
        "paired": True,
        "result_path": tmp_dir,
        "path": tmp_dir,
        "sequence_file_size": 1,
        "receptor_chains": "TRA_TRB",
    })

    receptor_dataset = TenxGenomicsImport.import_dataset(import_params, "tenx_dataset_receptor")

    self.assertEqual(2, receptor_dataset.get_example_count())
    self.assertEqual(2, len(receptor_dataset.get_filenames()))

    expected_alpha = ["ALSGTGGYKVV", "AIVGNTGKLI"]
    expected_beta = ["ASSLYGGPEVF", "ASSFATNSDYT"]
    for receptor in receptor_dataset.get_data(1):
        self.assertTrue(receptor.alpha.amino_acid_sequence in expected_alpha)
        self.assertTrue(receptor.beta.amino_acid_sequence in expected_beta)

    shutil.rmtree(tmp_dir)
def test_load_repertoire_with_stop_codon(self):
    """Import IGoR repertoires with ``import_with_stop_codon`` enabled and check the sequences kept."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"
    PathBuilder.build(tmp_dir)
    self.write_dummy_files(tmp_dir, True)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "igor")
    import_params.update({
        "is_repertoire": True,
        "result_path": tmp_dir,
        "path": tmp_dir,
        "import_with_stop_codon": True,
        "metadata_file": tmp_dir + "metadata.csv",
    })

    dataset_stop_codons = IGoRImport.import_dataset(import_params, "igor_dataset_stop")
    repertoires = dataset_stop_codons.repertoires

    self.assertEqual(2, dataset_stop_codons.get_example_count())
    self.assertEqual(len(repertoires[0].sequences), 2)
    self.assertEqual(len(dataset_stop_codons.repertoires[1].sequences), 2)

    first_sequence = dataset_stop_codons.repertoires[0].sequences[0]
    self.assertEqual(first_sequence.amino_acid_sequence, "ARVNRHIVVVTAIMTG*NWFDP")

    shutil.rmtree(tmp_dir)
def test_load_sequence_dataset(self):
    """Import the IGoR dummy files as a sequence dataset and check the nucleotide sequences."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"
    PathBuilder.build(tmp_dir)
    self.write_dummy_files(tmp_dir, False)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "igor")
    import_params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": tmp_dir,
        "path": tmp_dir,
        "import_with_stop_codon": True,
    })

    sequence_dataset = IGoRImport.import_dataset(import_params, "igor_seq_dataset")
    imported = list(sequence_dataset.get_data())

    self.assertEqual(4, sequence_dataset.get_example_count())

    expected_nucleotides = [
        "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
        "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
        "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
        "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC",
    ]
    # Index explicitly (rather than zip) so a too-short import still fails on the missing element.
    for position, expected in enumerate(expected_nucleotides):
        self.assertEqual(expected, imported[position].nucleotide_sequence)

    shutil.rmtree(tmp_dir)
def test_import_repertoire_dataset(self):
    """Import the dummy ImmunoSEQ sample file as a repertoire dataset and check its content."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/immunoseq/"
    self.create_dummy_dataset(tmp_dir, True)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "ImmunoSEQSample")
    import_params.update({
        "is_repertoire": True,
        "result_path": tmp_dir,
        "metadata_file": tmp_dir + "metadata.csv",
        "path": tmp_dir,
    })

    repertoire_dataset = ImmunoSEQSampleImport.import_dataset(import_params, "immunoseq_dataset")

    self.assertEqual(1, repertoire_dataset.get_example_count())

    expected_counts = [38, 48, 37, 53, 28, 16, 72, 14, 26, 13, 8, 16, 8, 28, 7, 1, 9, 1]
    for rep in repertoire_dataset.get_data():
        self.assertEqual("1234a", rep.metadata["subject_id"])
        self.assertEqual(18, len(rep.sequences))
        self.assertEqual("ATSDQLNRWGTGELF", rep.sequences[0].get_sequence())
        self.assertEqual("TRBV25-1", rep.sequences[2].metadata.v_gene)
        self.assertListEqual(expected_counts, list(rep.get_counts()))
        self.assertListEqual([Chain.BETA] * 18, list(rep.get_chains()))

    shutil.rmtree(tmp_dir)
def test_load_repertoire(self):
    """Import the IGoR dummy files as repertoires with default filtering and check their content."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/io_igor_load/"
    PathBuilder.build(tmp_dir)
    self.write_dummy_files(tmp_dir, True)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "igor")
    import_params.update({
        "is_repertoire": True,
        "result_path": tmp_dir,
        "path": tmp_dir,
        "metadata_file": tmp_dir + "metadata.csv",
    })

    repertoire_dataset = IGoRImport.import_dataset(import_params, "igor_repertoire_dataset")

    self.assertEqual(2, repertoire_dataset.get_example_count())
    self.assertEqual(len(repertoire_dataset.repertoires[0].sequences), 1)
    self.assertEqual(len(repertoire_dataset.repertoires[1].sequences), 1)

    first_sequence = repertoire_dataset.repertoires[0].sequences[0]
    self.assertEqual(first_sequence.amino_acid_sequence, "ARDRWSTPVLRYFDWWTPPYYYYMDV")

    self.assertListEqual(list(repertoire_dataset.repertoires[0].get_counts()), [1])
    self.assertEqual(repertoire_dataset.repertoires[0].get_chains(), None)

    shutil.rmtree(tmp_dir)
def test_import_repertoire_dataset(self):
    """Import the dummy 10x Genomics files as repertoires and check sequences, chains and counts."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/io_10xGenomics/"
    PathBuilder.build(tmp_dir)
    self.create_dumy_dataset(tmp_dir, add_metadata=True)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "tenx_genomics")
    import_params.update({
        "is_repertoire": True,
        "result_path": tmp_dir,
        "path": tmp_dir,
        "metadata_file": tmp_dir + "metadata.csv",
    })

    repertoire_dataset = TenxGenomicsImport.import_dataset(import_params, "tenx_dataset_repertoire")

    self.assertEqual(2, repertoire_dataset.get_example_count())
    self.assertEqual(len(repertoire_dataset.repertoires[0].sequences), 2)
    self.assertEqual(len(repertoire_dataset.repertoires[1].sequences), 4)

    first_sequence = repertoire_dataset.repertoires[0].sequences[0]
    self.assertEqual(first_sequence.amino_acid_sequence, "ALSGTGGYKVV")

    self.assertListEqual([Chain.ALPHA, Chain.BETA], list(repertoire_dataset.repertoires[0].get_chains()))
    self.assertListEqual([2, 4], list(repertoire_dataset.repertoires[0].get_counts()))

    shutil.rmtree(tmp_dir)
def _prepare_params(dataset_specs: dict, result_path: str, dataset_name: str):
    """Merge the default import parameters for the dataset format with the user-specified ones.

    The defaults are loaded for ``dataset_specs["format"]``; entries under ``dataset_specs["params"]``
    (if present) override them. When no ``result_path`` is set (missing or None), it defaults to
    ``{result_path}datasets/{dataset_name}/``. The merged parameters are stored back into
    ``dataset_specs["params"]`` and also returned.
    """
    merged_params = DefaultParamsLoader.load(ImportParser.keyword, dataset_specs["format"])

    if "params" in dataset_specs:
        merged_params = {**merged_params, **dataset_specs["params"]}

    # Covers both an absent key and an explicit None value.
    if merged_params.get("result_path") is None:
        merged_params["result_path"] = f"{result_path}datasets/{dataset_name}/"

    dataset_specs["params"] = merged_params
    return merged_params
def test_load_repertoire_dataset(self):
    """Import five identical VDJdb-formatted files as repertoires and check chains, labels and counts."""
    # Fixture: a VDJdb export with a header row and four entries (two TRB, two TRA).
    # NOTE(review): the column separators inside this literal appear as single spaces here while the
    # import below uses "\t" as separator - confirm the literal still contains tabs in the real file.
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, 
"vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0 3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 """
    path = EnvironmentSettings.root_path + "test/tmp/iovdjdb2/"
    PathBuilder.build(path)
    number_of_repertoires = 5
    # Write the same content to receptors_1.tsv ... receptors_5.tsv.
    for i in range(number_of_repertoires):
        with open(path + "receptors_{}.tsv".format(i + 1), "w") as file:
            # writelines() is handed a str, so it iterates it character by character;
            # the bytes written are the same as a single write() call.
            file.writelines(file_content)
    # One metadata row per file, with an alternating 0/1 label.
    metadata = {
        "filename": [
            "receptors_{}.tsv".format(i + 1)
            for i in range(number_of_repertoires)
        ],
        "label1": [i % 2 for i in range(number_of_repertoires)]
    }
    pd.DataFrame(metadata).to_csv(path + "metadata.csv")
    # Only the column mapping is taken from the defaults; all other import parameters are given explicitly.
    default_params = DefaultParamsLoader.load(
        EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
    dataset = VDJdbImport.import_dataset(
        {
            "is_repertoire": True,
            "result_path": path,
            "metadata_file": path + "metadata.csv",
            "path": path,
            "import_empty_nt_sequences": True,
            "import_empty_aa_sequences": False,
            "import_illegal_characters": False,
            "column_mapping": default_params["column_mapping"],
            "separator": "\t",
            "region_type": "IMGT_CDR3"
        }, "vdjdb_rep_dataset")
    self.assertEqual(number_of_repertoires, dataset.get_example_count())
    self.assertEqual(number_of_repertoires, len(dataset.get_data()))
    # Every repertoire is expected to hold the four fixture entries in file order, without counts.
    for repertoire in dataset.get_data(2):
        self.assertTrue(repertoire.metadata["label1"] in {0, 1})
        self.assertEqual(4, len(repertoire.sequences))
        self.assertListEqual(
            [Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.ALPHA],
            list(repertoire.get_chains()))
        self.assertEqual(None, repertoire.get_counts())
    shutil.rmtree(path)
def test(self):
    """Exercise DeepRC with a dummy training function: index splitting, fitting, storing and loading."""
    root_dir = EnvironmentSettings.tmp_test_path + "deeprc_classifier/"
    encoded_dir = root_dir + "encoded_data/"
    stored_model_dir = root_dir + "result/"
    for directory in (encoded_dir, stored_model_dir):
        PathBuilder.build(directory)

    encoded_data = self.make_encoded_data(encoded_dir)
    _status_labels = {"status": encoded_data.labels["status"]}  # not used further in this test

    deeprc_params = DefaultParamsLoader.load("ml_methods/", "DeepRC")
    classifier = DeepRC(**deeprc_params)

    # Prepare 'dummy training' for classifier, to test other functionalities
    classifier.result_path = root_dir
    classifier.pytorch_device = torch.device("cpu")
    classifier.training_function = self.dummy_training_function

    # The train/validation split must partition indices 0..9 without loss or overlap.
    train_indices, val_indices = classifier.get_train_val_indices(10)
    self.assertEqual(len(train_indices) + len(val_indices), 10)
    self.assertEqual(set(list(train_indices) + list(val_indices)), set(range(10)))

    # test if 'fit' function saves models
    classifier.fit(encoded_data, "status")
    self.assertListEqual(classifier.get_classes_for_label("status"), ["A", "B"])
    self.assertIsInstance(classifier.models, dict)
    self.assertListEqual(list(classifier.models.keys()), ["status"])
    for fitted_model in classifier.models.values():
        self.assertIsInstance(fitted_model, DeepRCInternal)

    # Test storing and loading of models
    self.assertFalse(classifier.check_if_exists(stored_model_dir))
    classifier.store(stored_model_dir, feature_names=None)
    self.assertTrue(classifier.check_if_exists(stored_model_dir))

    reloaded_classifier = DeepRC(**deeprc_params)
    reloaded_classifier.load(stored_model_dir)
    self.assertIsInstance(reloaded_classifier.models, dict)
    self.assertListEqual(list(reloaded_classifier.models.keys()), ["status"])
    for loaded_model in reloaded_classifier.models.values():
        self.assertIsInstance(loaded_model, DeepRCInternal)

    shutil.rmtree(root_dir)
def test_repertoire_import(self):
    """Import the dummy ImmunoSEQ rearrangement files as repertoires and check the imported content."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/adaptive/"
    self.build_dummy_dataset(tmp_dir, True)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/",
                                             "ImmunoSEQRearrangement")
    import_params.update({
        "is_repertoire": True,
        "result_path": tmp_dir,
        'import_empty_nt_sequences': False,
        'import_empty_aa_sequences': True,
        "metadata_file": tmp_dir + "metadata.csv",
        "path": tmp_dir,
        "import_productive": True,
        "import_with_stop_codon": True,
        "import_out_of_frame": True,
    })

    dataset_name = "adaptive_dataset_reps"
    imported = ImmunoSEQRearrangementImport.import_dataset(import_params, dataset_name)

    self.assertEqual(imported.repertoires[0].sequences[1].metadata.frame_type, SequenceFrameType.IN)
    self.assertListEqual(list(imported.repertoires[0].get_counts()),
                         [10, 1772, 1763, None, 566, 506, 398, 394, 363, 363])
    self.assertListEqual(list(imported.repertoires[0].get_chains()), [Chain.BETA] * 10)

    self.assertEqual(2, imported.get_example_count())

    for position, rep in enumerate(imported.get_data()):
        if position != 0:
            self.assertEqual("1234a", rep.metadata["subject_id"])
            self.assertEqual(11, len(rep.sequences))
            self.assertEqual(2, rep.sequences[-1].metadata.count)
        else:
            self.assertEqual("1234", rep.metadata["subject_id"])
            self.assertEqual(10, len(rep.sequences))
            self.assertEqual(10, rep.sequences[0].metadata.count)
            self.assertEqual("TRBV29", rep.sequences[0].metadata.v_subgroup)

    # The import is also expected to leave a dataset file named after the dataset in the result path.
    exported_file = f"{tmp_dir}{dataset_name}.{ImportHelper.DATASET_FORMAT}"
    self.assertTrue(os.path.isfile(exported_file))

    shutil.rmtree(tmp_dir)
def _parse_split_config(self, instruction_key, instruction: dict, split_key: str, symbol_table: SymbolTable,
                        settings_count: int) -> SplitConfig:
    """Build the SplitConfig for one split section of an instruction.

    The section ``instruction[split_key]`` is replaced in place by the loaded SplitConfig defaults
    overridden with the user-specified values, and the SplitConfig is then created from it.

    Args:
        instruction_key: key of the instruction; used in error messages and passed to the report parsing.
        instruction: the instruction specification; ``instruction[split_key]`` is updated in place.
        split_key: key of the split section inside ``instruction``.
        symbol_table: passed through to ``_prepare_report_config``.
        settings_count: number of hyperparameter settings specified for evaluation.

    Returns:
        The SplitConfig built from the merged split section.

    Raises:
        ValueError: if a random split uses a training percentage of 1 while more than one
            setting was specified.
        KeyError: if any key lookup in the body fails; the original KeyError is attached as the cause.
            NOTE(review): this also wraps a failed ``SplitType[...]`` lookup, i.e. an unrecognized
            split strategy name is reported as a parameter that "was not defined".
    """
    try:
        default_params = DefaultParamsLoader.load("instructions/", SplitConfig.__name__)
        # Reports are prepared before the defaults are merged into the split section.
        report_config_input = self._prepare_report_config(instruction_key, instruction, split_key, symbol_table)
        instruction[split_key] = {**default_params, **instruction[split_key]}
        split_specs = instruction[split_key]

        split_strategy = SplitType[split_specs["split_strategy"].upper()]
        is_random_split = split_strategy == SplitType.RANDOM
        # The training percentage is only read for random splits; -1 is used for every other strategy.
        training_percentage = float(split_specs["training_percentage"]) if is_random_split else -1

        if is_random_split and training_percentage == 1 and settings_count > 1:
            raise ValueError(
                f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis."
            )

        manual_config = ManualSplitConfig(**split_specs["manual_config"]) \
            if "manual_config" in split_specs else None
        leave_one_out_config = LeaveOneOutConfig(**split_specs["leave_one_out_config"]) \
            if "leave_one_out_config" in split_specs else None

        return SplitConfig(split_strategy=split_strategy,
                           split_count=int(split_specs["split_count"]),
                           training_percentage=training_percentage,
                           reports=ReportConfig(**report_config_input),
                           manual_config=manual_config,
                           leave_one_out_config=leave_one_out_config)

    except KeyError as key_error:
        # Chain explicitly so the traceback shows the original lookup as the direct cause.
        raise KeyError(
            f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}."
        ) from key_error
def test_load_repertoire_dataset(self):
    """Import the dummy MiXCR files as repertoires and check sequences, genes, chains and counts."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/mixcr/"
    PathBuilder.build(tmp_dir)
    self.create_dummy_dataset(tmp_dir, add_metadata=True)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "mixcr")
    import_params.update({
        "is_repertoire": True,
        "result_path": tmp_dir,
        "path": tmp_dir,
        "metadata_file": tmp_dir + "metadata.csv",
    })

    repertoire_dataset = MiXCRImport.import_dataset(import_params, "mixcr_repertoire_dataset")

    self.assertEqual(2, repertoire_dataset.get_example_count())

    for position, repertoire in enumerate(repertoire_dataset.get_data()):
        only_alpha = all(sequence.metadata.chain == Chain.ALPHA for sequence in repertoire.sequences)
        self.assertTrue(only_alpha)

        if position == 0:
            self.assertEqual(9, len(repertoire.sequences))
            self.assertEqual("ALVTDSWGKLQ", repertoire.sequences[0].amino_acid_sequence)
            self.assertEqual("ALRITQGGSEKLV", repertoire.sequences[1].amino_acid_sequence)
            self.assertEqual("TRAV6", repertoire.sequences[0].metadata.v_gene)
            self.assertEqual("TRAV16", repertoire.sequences[1].metadata.v_gene)
            self.assertListEqual([Chain.ALPHA] * 9, list(repertoire.get_chains()))
            self.assertListEqual([956023, 90101, 69706, 56658, 55692, 43466, 42172, 41647, 19133],
                                 list(repertoire.get_counts()))
        elif position == 1:
            self.assertEqual(5, len(repertoire.sequences))
            self.assertEqual("GCTGTGCTGGAAACCAGTGGCTCTAGGTTGACC", repertoire.sequences[0].nucleotide_sequence)

    shutil.rmtree(tmp_dir)
def test_encode(self):
    """Encode a paired TRA/TRB VDJdb receptor dataset with TCRdistEncoder and check the result is square."""
    # Fixture: a VDJdb export with a header row and eight entries; complex ids 3050, 15760, 3051 and
    # 15761 each appear once as TRB and once as TRA.
    # NOTE(review): the column separators inside this literal appear as single spaces here -
    # confirm the literal still contains the separator the vdjdb defaults expect.
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": 
"NoFixNeeded", "vId": "TRBV5-5*01"} 0 3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 3051 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI 
AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15761 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0 3051 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV 
https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15761 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 """
    # The return value of PathBuilder.build is used as the working directory path.
    path = PathBuilder.build(EnvironmentSettings.root_path + "test/tmp/trcdist_encoder/")
    with open(path + "receptors.tsv", "w") as file:
        # writelines() is handed a str, so it iterates it character by character;
        # the bytes written are the same as a single write() call.
        file.writelines(file_content)
    # Start from the vdjdb defaults and switch to a paired TRA/TRB receptor import.
    params = DefaultParamsLoader.load(
        EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
    params["is_repertoire"] = False
    params["paired"] = True
    params["result_path"] = path
    params["path"] = path
    params["sequence_file_size"] = 1
    params["receptor_chains"] = "TRA_TRB"
    params['organism'] = 'human'
    dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")
    encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
    encoded_dataset = encoder.encode(
        dataset,
        EncoderParams(f"{path}result/",
                      LabelConfiguration([Label("epitope")])))
    # The encoded examples must form a square matrix with one row and one column per example.
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] ==
                    encoded_dataset.encoded_data.examples.shape[1]
                    and encoded_dataset.encoded_data.examples.shape[0] ==
                    dataset.get_example_count())
    shutil.rmtree(path)
def test_load(self):
    """Write a small YAML params file and check DefaultParamsLoader.load returns exactly its content.

    The file is written as ``mixcr_params.yaml`` while the loader is asked for ``"MiXCR"``.
    """
    expected = {"a": 1, "b": True}

    tmp_dir = EnvironmentSettings.tmp_test_path + "defaultparamsloader/"
    PathBuilder.build(tmp_dir)

    with open(tmp_dir + "mixcr_params.yaml", "w") as yaml_file:
        yaml.dump(expected, yaml_file)

    loaded = DefaultParamsLoader.load(tmp_dir, "MiXCR")

    loaded_keys = loaded.keys()
    self.assertTrue(all(key in loaded_keys for key in expected.keys()))
    self.assertEqual(1, loaded["a"])
    self.assertEqual(True, loaded["b"])
    self.assertEqual(2, len(loaded.keys()))

    shutil.rmtree(tmp_dir)
def prepare_reference(reference_params: dict, location: str, paired: bool):
    """Import reference sequences/receptors described by ``reference_params``.

    Args:
        reference_params: dict with the keys ``format`` (import format name) and ``params``
            (import parameters, which must include ``path`` pointing to an existing file).
            If ``paired`` is absent from ``params`` it is added to that dict.
        location: name of the calling component, used as a prefix in error messages.
        paired: whether paired data is expected; a ``paired`` value in ``params`` must agree with it.

    Returns:
        The items returned by ``IRISSequenceImport.import_items`` for the ``IRIS`` format, or by
        ``ImportHelper.import_items`` with the ``<format>Import`` class for any other format.

    Raises:
        AssertionError: if ``path`` is missing or is not an existing file, or if ``paired`` in
            ``params`` differs from the ``paired`` argument.
    """
    ParameterValidator.assert_keys(list(reference_params.keys()), ["format", "params"], location, "reference")

    # Tolerate a missing or null "params" entry so the problem is reported by the assertion below.
    seq_import_params = reference_params.get("params") or {}

    # Look the path up once; a missing key fails the same descriptive assertion as a
    # non-existent file instead of raising a bare KeyError.
    reference_path = seq_import_params.get("path")
    assert reference_path is not None and os.path.isfile(reference_path), \
        f"{location}: the file {reference_path} does not exist. " \
        f"Specify the correct path under reference."

    if "paired" in seq_import_params:
        assert seq_import_params["paired"] == paired, f"{location}: paired must be {paired} for SequenceImport"
    else:
        seq_import_params["paired"] = paired

    format_str = reference_params["format"]

    if format_str == "IRIS":  # todo refactor this when refactoring IRISSequenceImport
        receptors = IRISSequenceImport.import_items(**seq_import_params)
    else:
        import_class = ReflectionHandler.get_class_by_name("{}Import".format(format_str))
        params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/",
                                          DefaultParamsLoader.convert_to_snake_case(format_str))
        # User-specified import parameters override the format defaults.
        params.update(seq_import_params)
        params["paired"] = paired

        processed_params = DatasetImportParams.build_object(**params)
        receptors = ImportHelper.import_items(import_class, reference_path, processed_params)

    return receptors
def _get_implanting_strategy(key: str, signal: dict) -> SignalImplantingStrategy:
    """Create the signal implanting strategy named by ``signal["implanting"]``.

    Valid strategy names are the discovered class names containing "Implanting" with their last
    10 characters removed. The strategy defaults are merged under the signal specification, the
    required keys are validated, and the optional ``implanting_computation`` is converted to the
    ImplantingComputation enum. The strategy class is instantiated with a GappedMotifImplanting
    object, the sequence position weights and the implanting computation (or None).
    """
    suffix_length = len("Implanting")
    discovered = ReflectionHandler.discover_classes_by_partial_name("Implanting", "simulation/signal_implanting_strategy/")
    valid_strategies = [class_name[:-suffix_length] for class_name in discovered]
    ParameterValidator.assert_in_valid_list(signal["implanting"], valid_strategies, "SignalParser", key)

    # The user-supplied "implanting" value survives the merge below, so the class name is stable.
    strategy_class_name = f"{signal['implanting']}Implanting"
    defaults = DefaultParamsLoader.load("signal_implanting_strategy/", strategy_class_name)
    signal = {**defaults, **signal}

    ParameterValidator.assert_keys_present(list(signal.keys()), ["motifs", "implanting", "sequence_position_weights"],
                                           SignalParser.__name__, key)

    implanting_comp = None
    if 'implanting_computation' in signal:
        requested_computation = signal['implanting_computation'].lower()
        valid_computations = [el.name.lower() for el in ImplantingComputation]
        ParameterValidator.assert_in_valid_list(requested_computation, valid_computations, SignalParser.__name__,
                                                'implanting_computation')
        implanting_comp = ImplantingComputation[requested_computation.upper()]

    strategy_class = ReflectionHandler.get_class_by_name(strategy_class_name)
    return strategy_class(GappedMotifImplanting(), signal["sequence_position_weights"], implanting_comp)
def test_load_sequence_dataset(self):
    """Import the dummy MiXCR files as a sequence dataset and check the first two sequences."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/mixcr/"
    PathBuilder.build(tmp_dir)
    self.create_dummy_dataset(tmp_dir, add_metadata=False)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "mixcr")
    import_params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": tmp_dir,
        "path": tmp_dir,
    })

    sequence_dataset = MiXCRImport.import_dataset(import_params, "mixcr_repertoire_dataset")
    imported = list(sequence_dataset.get_data())

    self.assertEqual("AVLETSGSRLT", imported[0].amino_acid_sequence)
    self.assertEqual("AVNDAGNMLT", imported[1].amino_acid_sequence)
    self.assertEqual("TRAV21", imported[0].metadata.v_gene)
    self.assertEqual("TRAV12-2", imported[1].metadata.v_gene)

    shutil.rmtree(tmp_dir)
def test_sequence_import(self):
    """Import the dummy ImmunoSEQ rearrangement files as a sequence dataset and check the first sequence."""
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/adaptive/"
    self.build_dummy_dataset(tmp_dir, False)

    import_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/",
                                             "ImmunoSEQRearrangement")
    import_params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": tmp_dir,
        "path": tmp_dir,
        "import_productive": True,
        "import_with_stop_codon": True,
        "import_out_of_frame": True,
        "import_empty_nt_sequences": False,
        "import_empty_aa_sequences": True,
    })

    dataset_name = "adaptive_dataset_seqs"
    sequence_dataset = ImmunoSEQRearrangementImport.import_dataset(import_params, dataset_name)

    self.assertEqual(21, sequence_dataset.get_example_count())

    imported = list(sequence_dataset.get_data())
    first = imported[0]

    self.assertEqual("ASSLPGTNTGELF", first.amino_acid_sequence)
    self.assertEqual("IN", first.metadata.frame_type.name)
    self.assertEqual('TRBV7-9', first.metadata.v_gene)
    self.assertEqual('TRBJ2-2', first.metadata.j_gene)
    self.assertEqual('GCCAGCAGCTTACCGGGGACGAACACCGGGGAGCTGTTT', first.nucleotide_sequence)

    # The import is also expected to leave a dataset file named after the dataset in the result path.
    exported_file = f"{tmp_dir}{dataset_name}.{ImportHelper.DATASET_FORMAT}"
    self.assertTrue(os.path.isfile(exported_file))

    shutil.rmtree(tmp_dir)
def test_import_sequence_dataset(self):
    """Import the dummy ImmunoSEQ sample data as sequences and check the first four entries.

    The amino acid sequences of the first four imported sequences are compared
    with fixed expected values (expected value first, as in the other import
    tests). The scratch directory is removed even when the import or an
    assertion fails.
    """
    path = EnvironmentSettings.root_path + "test/tmp/immunoseq/"
    # Kept outside the try block: presumably this helper creates `path`, and the
    # cleanup below requires the directory to exist.
    self.create_dummy_dataset(path, False)

    try:
        params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "ImmunoSEQSample")
        params["is_repertoire"] = False
        params["paired"] = False
        params["result_path"] = path
        params["path"] = path

        dataset = ImmunoSEQSampleImport.import_dataset(params, "immunoseq_dataset")

        seqs = list(dataset.get_data())

        self.assertEqual("ATSDQLNRWGTGELF", seqs[0].amino_acid_sequence)
        self.assertEqual("ASKDGDTGELF", seqs[1].amino_acid_sequence)
        self.assertEqual("ASSGEGQGVFGGTEAF", seqs[2].amino_acid_sequence)
        self.assertEqual("ASSEEVGGNQPQH", seqs[3].amino_acid_sequence)
    finally:
        # Always clean up the temporary directory, also on test failure.
        shutil.rmtree(path)
def get_all_params(specs, class_path, short_class_name, key: str = None):
    """Combine default and specified parameters of a class into one dictionary.

    Args:
        specs: specification passed on to ``ObjectParser.get_params``.
        class_path: location passed on to ``DefaultParamsLoader.load``.
        short_class_name: class name used for both the defaults lookup and
            the lookup of the specified parameters.
        key: value stored under ``"name"`` in the result (``None`` if omitted).

    Returns:
        A new dict in which specified parameters override the defaults and
        ``"name"`` is always set to ``key``.
    """
    defaults = DefaultParamsLoader.load(class_path, short_class_name)
    overrides = ObjectParser.get_params(specs, short_class_name)

    # Later entries win: defaults < specified parameters < the "name" entry.
    return {**defaults, **overrides, "name": key}
def test_load_galaxy_bordercases(self):
    """Import a VDJdb export with incomplete records as a paired receptor dataset.

    This test is here because in the Galaxy interface, when importing data from VDJdb:
      - receptors might be incomplete (one of two genes only)
      - V and J genes might be missing

    The fixture is written to ``receptors.tsv`` in a scratch directory, imported
    with ``VDJdbImport`` and the resulting receptors are checked. The scratch
    directory is removed even when the import or an assertion fails.
    """
    # NOTE(review): the import below is configured with separator "\t"; confirm that
    # the column separators and record breaks inside this literal are real tab and
    # newline characters in the file on disk.
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": 
"CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0 4000 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, 
"oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15760 TRA CALRLNNQGGKLIF HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 """

    path = EnvironmentSettings.root_path + "test/tmp/iovdjdb/"
    PathBuilder.build(path)

    try:
        with open(path + "receptors.tsv", "w") as file:
            # write(), not writelines(): the fixture is one string, and writelines()
            # would iterate over it character by character.
            file.write(file_content)

        default_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "vdjdb")

        import_params = {
            "is_repertoire": False,
            "result_path": path,
            "paired": True,
            "path": path,
            "sequence_file_size": 1,
            "region_type": "IMGT_CDR3",
            "separator": "\t",
            "receptor_chains": "TRA_TRB",
            "column_mapping": default_params["column_mapping"],
            "import_empty_nt_sequences": True,
            "import_empty_aa_sequences": False,
            "import_illegal_characters": False,
            "metadata_column_mapping": default_params["metadata_column_mapping"],
        }
        dataset = VDJdbImport.import_dataset(import_params, "vdjdb_rec_dataset")

        self.assertEqual(2, dataset.get_example_count())
        self.assertEqual(2, len(dataset.get_filenames()))

        for receptor in dataset.get_data(2):
            # assertIn reports the offending value on failure, unlike assertTrue(x in ...).
            self.assertIn(receptor.alpha.amino_acid_sequence, ["AAIYESRGSTLGRLY", "ALRLNNQGGKLI"])
            self.assertIn(receptor.alpha.get_attribute("v_gene"), ["TRAV13-1", None])
            self.assertIn(receptor.alpha.get_attribute("j_gene"), [None])
            self.assertIn(receptor.beta.get_attribute("v_gene"), ["TRBV5-4", None])
            self.assertIn(receptor.beta.get_attribute("j_gene"), ["TRBJ2-1", "TRBJ2-6"])
            self.assertIn(receptor.metadata["epitope_species"], ["EBV", "CMV"])
            self.assertIn(receptor.metadata["epitope"], ["AVFDRKSDAK", "KLGGALQAK"])
            self.assertIn(receptor.metadata["epitope_gene"], ["EBNA4", "IE1"])
    finally:
        # Always clean up the temporary directory, also on test failure.
        shutil.rmtree(path)