def test(self):
    """Exercise the DeepRC wrapper (index split, fit, store, load) with a dummy training function.

    The body only runs when both the immuneML DeepRC wrapper and the ``deeprc``
    package can be imported; otherwise a warning is logged and the test returns
    without making any assertion.
    """
    try:
        from immuneML.ml_methods.DeepRC import DeepRC
        from deeprc.deeprc_binary.architectures import DeepRC as DeepRCInternal
    except Exception:
        logging.warning("DeepRC is not installed, skipping test. To install DeepRC, install the requirements from requirements_DeepRC.txt.")
        return

    logging.warning("DeepRC test is temporarily excluded")

    base_dir = EnvironmentSettings.tmp_test_path / "deeprc_classifier"
    encoded_dir = base_dir / "encoded_data"
    model_dir = base_dir / "result"
    PathBuilder.build(encoded_dir)
    PathBuilder.build(model_dir)

    encoded_data = self.make_encoded_data(encoded_dir)
    status_labels = encoded_data.labels["status"]

    params = DefaultParamsLoader.load("ml_methods/", "DeepRC")
    classifier = DeepRC(**params)

    # Replace the real training with a dummy so the other functionality can be tested.
    classifier.result_path = base_dir
    classifier.pytorch_device = torch.device("cpu")
    classifier.training_function = self.dummy_training_function

    # The train/validation split must cover all 10 indices.
    train_indices, val_indices = classifier._get_train_val_indices(10, status_labels)
    self.assertEqual(len(train_indices) + len(val_indices), 10)
    self.assertEqual({*train_indices, *val_indices}, set(range(10)))

    # Fitting must populate the classes and the internal model.
    classifier.fit(encoded_data, "status")
    self.assertListEqual(classifier.get_classes(), ["A", "B"])
    self.assertIsInstance(classifier.model, DeepRCInternal)

    # Storing creates the model files; a fresh instance can load them again.
    self.assertFalse(classifier.check_if_exists(model_dir))
    classifier.store(model_dir, feature_names=None)
    self.assertTrue(classifier.check_if_exists(model_dir))

    reloaded = DeepRC(**params)
    reloaded.load(model_dir)
    self.assertIsInstance(reloaded.model, DeepRCInternal)

    shutil.rmtree(base_dir)

    # Package info must be retrievable from a freshly built classifier.
    params = DefaultParamsLoader.load("ml_methods/", "DeepRC")
    classifier = DeepRC(**params)
    classifier.get_package_info()
def test_load_repertoire(self):
    """Import the dummy IGoR files as a repertoire dataset and check the imported content."""
    path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "igor")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        "path": path,
        "metadata_file": path / "metadata.csv",
    })

    dataset = IGoRImport.import_dataset(params, "igor_repertoire_dataset")

    self.assertEqual(2, dataset.get_example_count())

    repertoires = dataset.repertoires
    self.assertEqual(len(repertoires[0].sequences), 1)
    self.assertEqual(len(repertoires[1].sequences), 1)

    first_sequence = repertoires[0].sequences[0]
    self.assertEqual(first_sequence.amino_acid_sequence, "ARDRWSTPVLRYFDWWTPPYYYYMDV")

    self.assertListEqual(list(repertoires[0].get_counts()), [1])
    self.assertEqual(repertoires[0].get_chains(), None)

    shutil.rmtree(path)
def test_import_repertoire_dataset(self):
    """Import the dummy 10x Genomics files (with metadata) as a repertoire dataset and check the content."""
    path = EnvironmentSettings.root_path / "test/tmp/io_10xGenomics/"
    PathBuilder.build(path)
    self.create_dumy_dataset(path, add_metadata=True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "tenx_genomics")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        "path": path,
        "metadata_file": path / "metadata.csv",
    })

    dataset = TenxGenomicsImport.import_dataset(params, "tenx_dataset_repertoire")

    self.assertEqual(2, dataset.get_example_count())

    repertoires = dataset.repertoires
    self.assertEqual(len(repertoires[0].sequences), 2)
    self.assertEqual(len(repertoires[1].sequences), 4)

    self.assertEqual(repertoires[0].sequences[0].amino_acid_sequence, "ALSGTGGYKVV")

    first_repertoire = repertoires[0]
    self.assertListEqual([Chain.ALPHA, Chain.BETA], list(first_repertoire.get_chains()))
    self.assertListEqual([2, 4], list(first_repertoire.get_counts()))

    shutil.rmtree(path)
def test_import_receptor_dataset(self):
    """Import the dummy 10x Genomics files (no metadata) as paired TRA/TRB receptors and check the content."""
    path = EnvironmentSettings.root_path / "test/tmp/io_10xGenomics/"
    PathBuilder.build(path)
    self.create_dumy_dataset(path, add_metadata=False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "tenx_genomics")
    params.update({
        "is_repertoire": False,
        "paired": True,
        "result_path": path,
        "path": path,
        "sequence_file_size": 1,
        "receptor_chains": "TRA_TRB",
    })

    dataset = TenxGenomicsImport.import_dataset(params, "tenx_dataset_receptor")

    self.assertEqual(2, dataset.get_example_count())
    self.assertEqual(2, len(dataset.get_filenames()))

    expected_alpha = ["ALSGTGGYKVV", "AIVGNTGKLI"]
    expected_beta = ["ASSLYGGPEVF", "ASSFATNSDYT"]
    for receptor in dataset.get_data(1):
        self.assertTrue(receptor.alpha.amino_acid_sequence in expected_alpha)
        self.assertTrue(receptor.beta.amino_acid_sequence in expected_beta)

    shutil.rmtree(path)
def test_load_repertoire_with_stop_codon(self):
    """Import the dummy IGoR repertoires with ``import_with_stop_codon`` enabled and check the content."""
    path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "igor")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        "path": path,
        "import_with_stop_codon": True,
        "metadata_file": path / "metadata.csv",
    })

    dataset_stop_codons = IGoRImport.import_dataset(params, "igor_dataset_stop")

    self.assertEqual(2, dataset_stop_codons.get_example_count())

    repertoires = dataset_stop_codons.repertoires
    self.assertEqual(len(repertoires[0].sequences), 2)
    self.assertEqual(len(repertoires[1].sequences), 2)

    # The expected first sequence contains a stop codon, written as '*'.
    self.assertEqual(repertoires[0].sequences[0].amino_acid_sequence, "ARVNRHIVVVTAIMTG*NWFDP")

    shutil.rmtree(path)
def test_load_sequence_dataset(self):
    """Import the dummy IGoR files as an unpaired sequence dataset and compare the nucleotide sequences."""
    path = EnvironmentSettings.root_path / "test/tmp/io_igor_load/"
    PathBuilder.build(path)
    self.write_dummy_files(path, False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "igor")
    params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": path,
        "path": path,
        "import_with_stop_codon": True,
    })

    dataset = IGoRImport.import_dataset(params, "igor_seq_dataset")
    seqs = list(dataset.get_data())

    self.assertEqual(4, dataset.get_example_count())

    # Both sides are sorted, so the comparison does not depend on import order.
    expected_nucleotide_sequences = sorted([
        "GCGAGACGTGTCTAGGGAGGATATTGTAGTAGTACCAGCTGCTATGACGGGCGGTCCGGTAGTACTACTTTGACTAC",
        "GCGAGAGGCTTCCATGGAACTACAGTAACTACGTTTGTAGGCTGTAGTACTACATGGACGTC",
        "GCGAGAGTTAATCGGCATATTGTGGTGGTGACTGCTATTATGACCGGGTAAAACTGGTTCGACCCC",
        "GCGAGAGATAGGTGGTCAACCCCAGTATTACGATATTTTGACTGGTGGACCCCGCCCTACTACTACTACATGGACGTC",
    ])
    imported_nucleotide_sequences = sorted([seq.nucleotide_sequence for seq in seqs])
    self.assertListEqual(expected_nucleotide_sequences, imported_nucleotide_sequences)

    shutil.rmtree(path)
# Writes a small inline immunoSEQ-style fixture (rep1.tsv: one header row plus four rows
# for sample LivMet_45) and a metadata.csv listing that single file, imports them as a
# repertoire dataset through ImmunoSEQRearrangementImport using the default
# "ImmunoSEQRearrangement" parameters, and asserts that exactly one example is imported.
# The temporary directory is removed at the end of the test.
# NOTE(review): the fixture text appears here with its column/row delimiters collapsed to
# single spaces; the delimiters are significant to the importer - confirm against the
# original file before reformatting this block.
def test_alternative_repertoire_import(self): path = EnvironmentSettings.root_path / "test/tmp/immunoseq_alternative/" rep1text = """sample_name productive_frequency templates amino_acid rearrangement v_resolved d_resolved j_resolved LivMet_45 0.014838454958215437 451 CASSLLGLGSEQYF CTGCTGTCGGCTGCTCCCTCCCAGACATCTGTGTACTTCTGTGCCAGCAGTTTACTCGGGTTAGGGAGCGAGCAGTACTTCGGGCCG TCRBV06 TCRBD02-01*02 TCRBJ02-07*01 LivMet_45 0.0106928999144568 325 CASSPGQGEGYEQYF CACGCCCTGCAGCCAGAAGACTCAGCCCTGTATCTCTGCGCCAGCAGCCCGGGACAGGGGGAGGGCTACGAGCAGTACTTCGGGCCG TCRBV04-01*01 TCRBD01-01*01 TCRBJ02-07*01 LivMet_45 0.0074356780943607296 226 CASSAGETQYF ACTCTGACGATCCAGCGCACAGAGCAGCGGGACTCGGCCATGTATCGCTGTGCCAGCAGCGCAGGCGAGACCCAGTACTTCGGGCCA TCRBV07-06*01 TCRBD01-01*01 TCRBJ02-05*01 LivMet_45 0.0072053694808185825 219 CASSGTGEKGEQYF ATCCGGTCCACAAAGCTGGAGGACTCAGCCATGTACTTCTGTGCCAGCAGTGGGACAGGGGAGAAGGGCGAGCAGTACTTCGGGCCG TCRBV02-01*01 TCRBD01-01*01 TCRBJ02-07*01 """ PathBuilder.build(path) with open(path / "rep1.tsv", "w") as file: file.writelines(rep1text) with open(path / "metadata.csv", "w") as file: file.writelines("""filename,chain,subject_id rep1.tsv,TRB,1234a""") params = DefaultParamsLoader.load( EnvironmentSettings.default_params_path / "datasets/", "ImmunoSEQRearrangement") params["is_repertoire"] = True params["result_path"] = path params["metadata_file"] = path / "metadata.csv" params["path"] = path dataset = ImmunoSEQRearrangementImport.import_dataset( params, "alternative") self.assertEqual(1, dataset.get_example_count()) shutil.rmtree(path)
def _parse_split_config(self, instruction_key, instruction: dict, split_key: str, symbol_table: SymbolTable, settings_count: int) -> SplitConfig:
    """Build the SplitConfig described under ``instruction[split_key]``.

    Default parameters are loaded from "instructions/" for SplitConfig and
    merged with the user-specified values (user values win). The merged dict is
    written back to ``instruction[split_key]``, so the caller's instruction is
    modified in place.

    Args:
        instruction_key: key of the instruction, used in the report config and in error messages.
        instruction: the instruction specification; ``instruction[split_key]`` is replaced by the merged parameters.
        split_key: key of the split section inside ``instruction``.
        symbol_table: forwarded to ``_prepare_report_config``.
        settings_count: number of hyperparameter settings to be evaluated.

    Returns:
        The SplitConfig built from the merged parameters.

    Raises:
        ValueError: if a random split uses a training percentage of 1 while
            more than one setting has to be evaluated.
        KeyError: if a required parameter is missing under ``split_key``. The
            original KeyError is attached as ``__cause__``.
    """
    try:
        default_params = DefaultParamsLoader.load("instructions/", SplitConfig.__name__)
        report_config_input = self._prepare_report_config(instruction_key, instruction, split_key, symbol_table)

        instruction[split_key] = {**default_params, **instruction[split_key]}
        split_params = instruction[split_key]

        # NOTE(review): this lookup presumably raises KeyError for an unknown strategy name as well
        # (if SplitType is an Enum); that case is reported through the same handler below - confirm.
        split_strategy = SplitType[split_params["split_strategy"].upper()]

        # The training percentage only applies to random splits; -1 is used for every other strategy.
        training_percentage = float(split_params["training_percentage"]) if split_strategy == SplitType.RANDOM else -1

        if split_strategy == SplitType.RANDOM and training_percentage == 1 and settings_count > 1:
            raise ValueError(f"{TrainMLModelParser.__name__}: all data under {instruction_key}/{split_key} was specified to be used for "
                             f"training, but {settings_count} settings were specified for evaluation. Please define a test/validation set by "
                             f"reducing the training percentage (e.g., to 0.7) or use only one hyperparameter setting to run the analysis.")

        manual_config = ManualSplitConfig(**split_params["manual_config"]) if "manual_config" in split_params else None
        leave_one_out_config = LeaveOneOutConfig(**split_params["leave_one_out_config"]) \
            if "leave_one_out_config" in split_params else None

        return SplitConfig(split_strategy=split_strategy,
                           split_count=int(split_params["split_count"]),
                           training_percentage=training_percentage,
                           reports=ReportConfig(**report_config_input),
                           manual_config=manual_config,
                           leave_one_out_config=leave_one_out_config)
    except KeyError as key_error:
        # Chain explicitly so the traceback shows the original lookup failure as the direct cause.
        raise KeyError(f"{TrainMLModelParser.__name__}: parameter {key_error.args[0]} was not defined under {split_key}.") from key_error
def prepare_reference(reference_params: dict, location: str, paired: bool):
    """Validate the reference specification and import the reference items it points to.

    ``reference_params`` must provide "format" and "params". The "params" dict
    is completed in place: "is_repertoire" defaults to False and "paired" to the
    ``paired`` argument; values that are present must match these.

    Args:
        reference_params: reference specification with the keys "format" and "params".
        location: name of the calling component, used as a prefix in error messages.
        paired: value the "paired" import parameter must have.

    Returns:
        Whatever ``ImportHelper.import_items`` returns for the reference file.
    """
    ParameterValidator.assert_keys(list(reference_params.keys()), ["format", "params"], location, "reference")

    seq_import_params = reference_params.get("params", {})
    reference_file = seq_import_params["path"]

    assert os.path.isfile(reference_file), (f"{location}: the file {reference_file} does not exist. "
                                            f"Specify the correct path under reference.")

    # Fill in missing values with the only accepted ones; present values are checked against them.
    is_repertoire = seq_import_params.setdefault("is_repertoire", False)
    assert is_repertoire == False, f"{location}: is_repertoire must be False for SequenceImport"

    paired_value = seq_import_params.setdefault("paired", paired)
    assert paired_value == paired, f"{location}: paired must be {paired} for SequenceImport"

    format_str = reference_params["format"]
    import_class = ReflectionHandler.get_class_by_name(f"{format_str}Import")

    default_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets",
                                              DefaultParamsLoader.convert_to_snake_case(format_str))
    processed_params = DatasetImportParams.build_object(**{**default_params, **seq_import_params})

    return ImportHelper.import_items(import_class, reference_file, processed_params)
def _parse_ml_method(ml_method_id: str, ml_specification) -> tuple:
    """Parse the specification of a single ML method and instantiate it.

    The specification is either the method class name as a string, or a dict
    with exactly one method class name (mapped to its parameters) plus the
    optional "model_selection_cv" and "model_selection_n_folds" keys. Defaults
    for "MLMethod" and for the chosen method class are loaded from "ml_methods/"
    and overridden by the user-specified values.

    Args:
        ml_method_id: user-defined identifier; it becomes the ``name`` of the created method.
        ml_specification: method class name, or a dict as described above.

    Returns:
        A tuple ``(method, ml_specification)`` holding the created method and the
        completed specification, in which the method entry is replaced by the
        parameters returned from ``MLParser.create_method_instance``.

    Raises:
        AssertionError: if the specification does not name exactly one ML method.
    """
    model_selection_keys = ["model_selection_cv", "model_selection_n_folds"]
    valid_class_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MLMethod, "", "ml_methods/")

    # A bare class name is shorthand for that class with no user-specified parameters.
    if isinstance(ml_specification, str):
        ml_specification = {ml_specification: {}}

    ml_specification = {**DefaultParamsLoader.load("ml_methods/", "MLMethod"), **ml_specification}

    ParameterValidator.assert_all_in_valid_list(list(ml_specification.keys()), model_selection_keys + valid_class_values,
                                                "MLParser", ml_method_id)

    non_default_keys = [key for key in ml_specification if key not in model_selection_keys]

    # Count the method keys directly instead of assuming that exactly two model selection keys are present.
    assert len(non_default_keys) == 1, f"MLParser: ML method {ml_method_id} was not correctly specified. Expected exactly 1 key " \
                                       f"(ML method name), got {len(non_default_keys)} instead: " \
                                       f"{str(non_default_keys)[1:-1]}."

    ml_method_class_name = non_default_keys[0]
    ml_method_class = ReflectionHandler.get_class_by_name(ml_method_class_name, "ml_methods/")

    ml_specification[ml_method_class_name] = {
        **DefaultParamsLoader.load("ml_methods/", ml_method_class_name, log_if_missing=False),
        **ml_specification[ml_method_class_name],
    }

    method, params = MLParser.create_method_instance(ml_specification, ml_method_class, ml_method_id)
    ml_specification[ml_method_class_name] = params
    method.name = ml_method_id

    return method, ml_specification
def parse_instruction(key: str, instruction: dict, symbol_table: SymbolTable, path) -> tuple:
    """Parse one instruction with the parser matching its "type" and register it in the symbol table.

    Args:
        key: user-defined name of the instruction.
        instruction: instruction specification; it must contain the key "type".
        symbol_table: symbol table the parsed instruction object is added to.
        path: forwarded unchanged to the instruction parser.

    Returns:
        A tuple ``(instruction, symbol_table)`` with the specification completed
        by default parameters and the updated symbol table.
    """
    ParameterValidator.assert_keys_present(list(instruction.keys()), ["type"], InstructionParser.__name__, key)

    # Valid instruction types are the discovered parser class names without the "Parser" suffix.
    parser_class_names = ReflectionHandler.discover_classes_by_partial_name("Parser", "dsl/instruction_parsers/")
    valid_instructions = [class_name[:-len("Parser")] for class_name in parser_class_names]

    instruction_type = instruction["type"]
    ParameterValidator.assert_in_valid_list(instruction_type, valid_instructions, "InstructionParser", "type")

    default_params = DefaultParamsLoader.load("instructions/", instruction_type)
    instruction = {**default_params, **instruction}

    parser_class = ReflectionHandler.get_class_by_name(f"{instruction_type}Parser", "instruction_parsers/")
    instruction_object = parser_class().parse(key, instruction, symbol_table, path)

    symbol_table.add(key, SymbolType.INSTRUCTION, instruction_object)
    return instruction, symbol_table
def test_repertoire_import(self):
    """Import the dummy immunoSEQ rearrangement files as repertoires and check sequences, counts and metadata."""
    path = EnvironmentSettings.root_path / "test/tmp/adaptive/"
    self.build_dummy_dataset(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "ImmunoSEQRearrangement")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        'import_empty_nt_sequences': False,
        'import_empty_aa_sequences': True,
        "metadata_file": path / "metadata.csv",
        "path": path,
        "import_productive": True,
        "import_with_stop_codon": True,
        "import_out_of_frame": True,
    })

    dataset_name = "adaptive_dataset_reps"
    dataset = ImmunoSEQRearrangementImport.import_dataset(params, dataset_name)

    first_repertoire = dataset.repertoires[0]
    self.assertEqual(first_repertoire.sequences[1].metadata.frame_type, SequenceFrameType.IN)

    expected_counts = [10, 1772, 1763, None, 566, 506, 398, 394, 363, 363]
    self.assertListEqual(list(dataset.repertoires[0].get_counts()), expected_counts)
    self.assertListEqual(list(dataset.repertoires[0].get_chains()), [Chain.BETA] * 10)

    self.assertEqual(2, dataset.get_example_count())

    for index, rep in enumerate(dataset.get_data()):
        if index > 0:
            self.assertEqual("1234a", rep.metadata["subject_id"])
            self.assertEqual(11, len(rep.sequences))
            self.assertEqual(2, rep.sequences[-1].metadata.count)
        else:
            self.assertEqual("1234", rep.metadata["subject_id"])
            self.assertEqual(10, len(rep.sequences))
            self.assertEqual(10, rep.sequences[0].metadata.count)
            self.assertEqual("TRBV29", rep.sequences[0].metadata.v_subgroup)

    # The import must also have written the dataset file into the result path.
    dataset_file = path / f"{dataset_name}.{ImportHelper.DATASET_FORMAT}"
    self.assertTrue(dataset_file.is_file())

    shutil.rmtree(path)
def test_load(self):
    """Write a two-entry YAML params file and check that DefaultParamsLoader.load("MiXCR") reads it back."""
    params = {"a": 1, "b": True}
    path = EnvironmentSettings.tmp_test_path / "defaultparamsloader/"
    PathBuilder.build(path)

    # The file is named in snake case while the loader is asked for "MiXCR".
    params_file = path / "mixcr_params.yaml"
    with open(params_file, "w") as file:
        yaml.dump(params, file)

    loaded = DefaultParamsLoader.load(path, "MiXCR")

    self.assertTrue(all(key in loaded for key in params))
    self.assertEqual(1, loaded["a"])
    self.assertEqual(True, loaded["b"])
    self.assertEqual(2, len(loaded.keys()))

    shutil.rmtree(path)
def test_load_repertoire_dataset(self):
    """Import the dummy MiXCR files (with metadata) as a repertoire dataset and check both repertoires."""
    path = EnvironmentSettings.root_path / "test/tmp/mixcr/"
    PathBuilder.build(path)
    self.create_dummy_dataset(path, add_metadata=True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "mixcr")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        "path": path,
        "metadata_file": path / "metadata.csv",
    })

    dataset = MiXCRImport.import_dataset(params, "mixcr_repertoire_dataset")

    self.assertEqual(2, dataset.get_example_count())

    for index, repertoire in enumerate(dataset.get_data()):
        chains_are_alpha = all(sequence.metadata.chain == Chain.ALPHA for sequence in repertoire.sequences)
        self.assertTrue(chains_are_alpha)

        # Two alternatives are accepted for the first sequence; the original comments attribute this to OSX/windows.
        if index == 0:
            self.assertEqual(9, len(repertoire.sequences))
            self.assertTrue(repertoire.sequences[0].amino_acid_sequence in ["ALVTDSWGKLQ", "AVLETSGSRLT"])
            self.assertTrue(repertoire.sequences[0].metadata.v_gene in ["TRAV6", "TRAV21"])
            self.assertListEqual([Chain.ALPHA] * 9, list(repertoire.get_chains()))

            expected_counts = sorted([956023, 90101, 69706, 56658, 55692, 43466, 42172, 41647, 19133])
            self.assertListEqual(expected_counts, sorted(list(repertoire.get_counts())))
        elif index == 1:
            self.assertEqual(5, len(repertoire.sequences))
            self.assertTrue(repertoire.sequences[0].nucleotide_sequence in [
                "GCTGTGCTGGAAACCAGTGGCTCTAGGTTGACC",
                "GCTCTAGTAACTGACAGCTGGGGGAAATTGCAG",
            ])

    shutil.rmtree(path)
def _prepare_params(dataset_specs: dict, result_path: Path, dataset_name: str):
    """Combine default and user-specified import parameters and convert path entries to Path objects.

    Defaults for ``dataset_specs["format"]`` are overridden by
    ``dataset_specs["params"]`` when present. A missing or None "result_path"
    becomes ``result_path / "datasets" / dataset_name``. The final parameters
    are also stored back under ``dataset_specs["params"]``.

    Args:
        dataset_specs: dataset specification with "format" and optionally "params".
        result_path: base path used when no result path is configured.
        dataset_name: name of the dataset, used as the last component of the default result path.

    Returns:
        The dict of prepared parameters.
    """
    params = DefaultParamsLoader.load(ImportParser.keyword, dataset_specs["format"])

    if "params" in dataset_specs:
        params = {**params, **dataset_specs["params"]}

    configured_result_path = params.get("result_path")
    if configured_result_path is None:
        params["result_path"] = Path(result_path) / "datasets" / dataset_name
    else:
        params["result_path"] = Path(configured_result_path)

    for path_key in ("path", "metadata_file"):
        if path_key in params:
            params[path_key] = Path(params[path_key])

    dataset_specs["params"] = params
    return params
def test_sequence_import(self):
    """Import the dummy immunoSEQ rearrangement files as unpaired sequences and check the first sequence."""
    path = EnvironmentSettings.root_path / "test/tmp/adaptive/"
    self.build_dummy_dataset(path, False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "ImmunoSEQRearrangement")
    params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": path,
        "path": path,
        "import_productive": True,
        "import_with_stop_codon": True,
        "import_out_of_frame": True,
        "import_empty_nt_sequences": False,
        "import_empty_aa_sequences": True,
    })

    dataset_name = "adaptive_dataset_seqs"
    dataset = ImmunoSEQRearrangementImport.import_dataset(params, dataset_name)

    self.assertEqual(21, dataset.get_example_count())

    seqs = list(dataset.get_data())
    first_sequence = seqs[0]

    # Two alternatives are accepted for each field; the original comments attribute this to OSX/windows.
    self.assertTrue(first_sequence.amino_acid_sequence in ["ASSLPGTNTGELF", "SVEESYEQY"])
    self.assertTrue(first_sequence.nucleotide_sequence in [
        "GCCAGCAGCTTACCGGGGACGAACACCGGGGAGCTGTTT",
        'AGCGTTGAAGAATCCTACGAGCAGTAC',
    ])
    self.assertEqual("IN", first_sequence.metadata.frame_type.name)
    self.assertTrue(first_sequence.metadata.v_gene in ['TRBV7-9', 'TRBV29-1'])
    self.assertTrue(first_sequence.metadata.j_gene in ['TRBJ2-2', 'TRBJ2-7'])

    # The import must also have written the dataset file into the result path.
    dataset_file = path / f"{dataset_name}.{ImportHelper.DATASET_FORMAT}"
    self.assertTrue(dataset_file.is_file())

    shutil.rmtree(path)
# Writes an inline VDJdb-format fixture (TRA and TRB entries for complex ids 3050, 15760,
# 3051 and 15761) to receptors.tsv, imports it as a paired TRA_TRB receptor dataset through
# VDJdbImport with organism 'human', encodes the dataset with TCRdistEncoder (cores=2) for
# the label "epitope", and asserts that the encoded examples form a square matrix with one
# row per dataset example. The temporary directory is removed at the end of the test.
# NOTE(review): the fixture text appears here with its column/row delimiters collapsed to
# single spaces and with line breaks inside the literal; the whitespace is significant to
# the importer - confirm against the original file before reformatting this block.
def test_encode(self): file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": 
"NoFixNeeded", "vId": "TRBV5-5*01"} 0 3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 3051 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI 
AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15761 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0 3051 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV 
https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15761 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 """ path = PathBuilder.build(EnvironmentSettings.root_path / "test/tmp/trcdist_encoder/") with open(path / "receptors.tsv", "w") as file: file.writelines(file_content) params = DefaultParamsLoader.load( 
EnvironmentSettings.default_params_path / "datasets/", "vdjdb") params["is_repertoire"] = False params["paired"] = True params["result_path"] = path params["path"] = path params["sequence_file_size"] = 1 params["receptor_chains"] = "TRA_TRB" params['organism'] = 'human' dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset") encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2}) encoded_dataset = encoder.encode( dataset, EncoderParams(path / "result/", LabelConfiguration([Label("epitope")]))) self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == encoded_dataset.encoded_data.examples.shape[1] and encoded_dataset.encoded_data.examples.shape[0] == dataset.get_example_count()) shutil.rmtree(path)
def test_import_sequence_dataset(self):
    """Import the dummy immunoSEQ sample files as unpaired sequences and check the first four sequences."""
    path = EnvironmentSettings.root_path / "test/tmp/immunoseq/"
    self.create_dummy_dataset(path, False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "ImmunoSEQSample")
    params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": path,
        "path": path,
    })

    dataset = ImmunoSEQSampleImport.import_dataset(params, "immunoseq_dataset")
    seqs = list(dataset.get_data())

    # The sequences are expected in exactly this order.
    self.assertEqual(seqs[0].amino_acid_sequence, "ATSDQLNRWGTGELF")
    self.assertEqual(seqs[1].amino_acid_sequence, "ASKDGDTGELF")
    self.assertEqual(seqs[2].amino_acid_sequence, "ASSGEGQGVFGGTEAF")
    self.assertEqual(seqs[3].amino_acid_sequence, "ASSEEVGGNQPQH")

    shutil.rmtree(path)
def test_load_sequence_dataset(self):
    """Import the dummy MiXCR files (no metadata) as unpaired sequences and check the first sequence."""
    path = EnvironmentSettings.root_path / "test/tmp/mixcr/"
    PathBuilder.build(path)
    self.create_dummy_dataset(path, add_metadata=False)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "mixcr")
    params.update({
        "is_repertoire": False,
        "paired": False,
        "result_path": path,
        "path": path,
    })

    dataset = MiXCRImport.import_dataset(params, "mixcr_repertoire_dataset")
    first_sequence = list(dataset.get_data())[0]

    # Two alternatives are accepted for each field; the original comments attribute this to OSX/windows.
    self.assertTrue(first_sequence.amino_acid_sequence in ["AVLETSGSRLT", "ALVTDSWGKLQ"])
    self.assertTrue(first_sequence.metadata.v_gene in ["TRAV21", "TRAV6"])

    shutil.rmtree(path)
def test_import_repertoire_dataset(self):
    """Import the dummy immunoSEQ sample files as a repertoire dataset and check the single repertoire."""
    path = EnvironmentSettings.root_path / "test/tmp/immunoseq/"
    self.create_dummy_dataset(path, True)

    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets/", "ImmunoSEQSample")
    params.update({
        "is_repertoire": True,
        "result_path": path,
        "metadata_file": path / "metadata.csv",
        "path": path,
    })

    dataset = ImmunoSEQSampleImport.import_dataset(params, "immunoseq_dataset")

    self.assertEqual(1, dataset.get_example_count())

    expected_counts = [38, 48, 37, 53, 28, 16, 72, 14, 26, 13, 8, 16, 8, 28, 7, 1, 9, 1]
    for rep in dataset.get_data():
        self.assertEqual("1234a", rep.metadata["subject_id"])
        self.assertEqual(18, len(rep.sequences))
        self.assertEqual("ATSDQLNRWGTGELF", rep.sequences[0].get_sequence())
        self.assertEqual("TRBV25-1", rep.sequences[2].metadata.v_gene)
        self.assertListEqual(expected_counts, list(rep.get_counts()))
        self.assertListEqual([Chain.BETA] * 18, list(rep.get_chains()))

    shutil.rmtree(path)
def _get_implanting_strategy(key: str, signal: dict) -> SignalImplantingStrategy:
    """Create the signal implanting strategy named by ``signal["implanting"]``.

    Defaults for the chosen strategy are loaded and overridden by the values in
    ``signal``. The optional "implanting_computation" entry is matched
    case-insensitively against the names of ImplantingComputation.

    Args:
        key: name of the signal, used in validation messages.
        signal: signal specification; "motifs", "implanting" and
            "sequence_position_weights" must be present once defaults are applied.

    Returns:
        The strategy instance, built with a GappedMotifImplanting object, the
        sequence position weights and the implanting computation (or None).
    """
    # Valid strategy names are the discovered class names without the "Implanting" suffix.
    implanting_class_names = ReflectionHandler.discover_classes_by_partial_name("Implanting", "simulation/signal_implanting_strategy/")
    valid_strategies = [class_name[:-len("Implanting")] for class_name in implanting_class_names]
    ParameterValidator.assert_in_valid_list(signal["implanting"], valid_strategies, "SignalParser", key)

    strategy_class_name = f"{signal['implanting']}Implanting"
    defaults = DefaultParamsLoader.load("signal_implanting_strategy/", strategy_class_name)
    signal = {**defaults, **signal}

    ParameterValidator.assert_keys_present(list(signal.keys()), ["motifs", "implanting", "sequence_position_weights"],
                                           SignalParser.__name__, key)

    implanting_comp = None
    if 'implanting_computation' in signal:
        computation_name = signal['implanting_computation'].lower()
        valid_computation_names = [el.name.lower() for el in ImplantingComputation]
        ParameterValidator.assert_in_valid_list(computation_name, valid_computation_names, SignalParser.__name__,
                                                'implanting_computation')
        implanting_comp = ImplantingComputation[computation_name.upper()]

    strategy_class = ReflectionHandler.get_class_by_name(strategy_class_name)
    return strategy_class(GappedMotifImplanting(), signal["sequence_position_weights"], implanting_comp)
def get_all_params(specs, class_path, short_class_name, key: str = None):
    """Return the default parameters of a class overridden by the specified ones, plus a "name" entry.

    Args:
        specs: specification passed to ``ObjectParser.get_params``.
        class_path: location passed to ``DefaultParamsLoader.load``.
        short_class_name: class name used for both the defaults and the specified parameters.
        key: value stored under "name"; it replaces any "name" coming from the defaults or the specification.

    Returns:
        A new dict with the combined parameters.
    """
    defaults = DefaultParamsLoader.load(class_path, short_class_name)
    overrides = ObjectParser.get_params(specs, short_class_name)

    params = {**defaults, **overrides}
    params["name"] = key
    return params