def test_encode(self):
    """Check SequenceAbundanceEncoder output at p-value thresholds 0.4 and 0.05."""
    tmp_dir = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
    PathBuilder.build(tmp_dir)

    sequences_per_repertoire = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    repertoires, metadata = RepertoireBuilder.build(
        sequences_per_repertoire,
        labels={"l1": [True, True, False, False]},
        path=tmp_dir,
    )
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceAbundanceEncoder.build_object(
        dataset,
        comparison_attributes=["sequence_aas"],
        p_value_threshold=0.4,
        sequence_batch_size=4,
        repertoire_batch_size=8,
    )
    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    # First pass: encode with the threshold the encoder was built with (0.4).
    first_pass = encoder.encode(dataset, EncoderParams(result_path=tmp_dir, label_config=label_config))
    expected_first = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_first, first_pass.encoded_data.examples))

    # Second pass: lower the threshold on the same encoder object and encode again.
    encoder.p_value_threshold = 0.05
    second_pass = encoder.encode(dataset, EncoderParams(result_path=tmp_dir, label_config=label_config))
    expected_second = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_second, second_pass.encoded_data.examples))

    shutil.rmtree(tmp_dir)
def import_hp_setting(config_dir: str) -> Tuple[HPSetting, Label]:
    """Rebuild an HPSetting and its single label from files stored under `config_dir`.

    `config_dir` is concatenated directly with the file name `ml_config.yaml`, so it
    must already end with a path separator.

    :param config_dir: directory holding `ml_config.yaml` and the stored ML method
    :return: tuple of (HPSetting assembled from the configuration, Label built from
        the only entry of `labels_with_values`)
    :raises AssertionError: if the configuration does not list exactly one label
    """
    ml_config = MLMethodConfiguration()
    ml_config.load(f'{config_dir}ml_config.yaml')

    # Resolve the ML method class by name, instantiate it, and load its stored state.
    method_class = ReflectionHandler.get_class_by_name(ml_config.ml_method, 'ml_methods/')
    method = method_class()
    method.load(config_dir)

    imported_encoder = MLImport.import_encoder(ml_config, config_dir)
    imported_preprocessing = MLImport.import_preprocessing_sequence(ml_config, config_dir)

    label_names = list(ml_config.labels_with_values.keys())
    assert len(label_names) == 1, "MLImport: Multiple labels set in a single ml_config file."
    only_label_name = label_names[0]
    label = Label(only_label_name, ml_config.labels_with_values[only_label_name])

    hp_setting = HPSetting(
        encoder=imported_encoder,
        encoder_params=ml_config.encoding_parameters,
        encoder_name=ml_config.encoding_name,
        ml_method=method,
        ml_method_name=ml_config.ml_method_name,
        ml_params={},
        preproc_sequence=imported_preprocessing,
        preproc_sequence_name=ml_config.preprocessing_sequence_name,
    )
    return hp_setting, label
def test_sequence_flattened(self):
    """Check flattened one-hot encoding (no positional info) of the flatten test dataset."""
    work_dir = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"
    PathBuilder.build(work_dir)

    dataset = self.construct_test_flatten_dataset(work_dir)

    encoder = OneHotEncoder.build_object(
        dataset,
        use_positional_info=False,
        distance_to_seq_middle=None,
        flatten=True,
    )

    label_config = LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")])
    encoder_params = EncoderParams(
        result_path=work_dir,
        label_config=label_config,
        pool_size=1,
        learn_model=True,
        model={},
        filename="dataset.pkl",
    )
    encoded_data = encoder.encode(dataset, encoder_params)

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    # 20-element one-hot vectors with the hot position at index 0 and index 16.
    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0] * 3

    expected_first = onehot_a * 3 + onehot_t * 3
    expected_second = (onehot_a + onehot_t) * 3
    self.assertListEqual(list(encoded_data.encoded_data.examples[0]), expected_first)
    self.assertListEqual(list(encoded_data.encoded_data.examples[1]), expected_second)

    # Feature names are "<position>_<character>" for six positions.
    expected_names = []
    for pos in range(6):
        for char in EnvironmentSettings.get_sequence_alphabet():
            expected_names.append(f"{pos}_{char}")
    self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_names)

    shutil.rmtree(work_dir)
def _create_state_object(self, path):
    """Run a TrainMLModelInstruction on a small synthetic dataset and return its state.

    :param path: directory used both for building the repertoires and as the result path
    :return: whatever `TrainMLModelInstruction.run` returns
    """
    label_values = {
        "l1": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
               1, 2, 1, 2],
        "l2": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
               0, 1, 0, 1],
    }
    # One identical three-sequence repertoire per label value.
    sequences = [["AAA", "CCC", "DDD"] for _ in label_values["l1"]]

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)
    dataset = RepertoireDataset(
        repertoires=repertoires,
        metadata_file=metadata,
        params={"l1": [1, 2], "l2": [0, 1]},
    )

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    word2vec_encoder = Word2VecEncoder.build_object(dataset, **enc_params)
    ml_settings = {"model_selection_cv": False, "model_selection_n_folds": -1}
    hp_settings = [HPSetting(word2vec_encoder, enc_params, LogisticRegression(), ml_settings, [])]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    instruction = TrainMLModelInstruction(
        dataset,
        GridSearch(hp_settings),
        hp_settings,
        SplitConfig(SplitType.RANDOM, 1, 0.5),
        SplitConfig(SplitType.RANDOM, 1, 0.5),
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        label_config,
        path,
    )
    return instruction.run(result_path=path)
def test_run(self):
    """Fit a model on a random dataset, run MLApplicationInstruction and check predictions.csv."""
    base_dir = EnvironmentSettings.tmp_test_path + "mlapplicationtest/"
    PathBuilder.build(base_dir)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, base_dir + 'dataset/')

    ml_method = LogisticRegression()
    encoder = KmerFreqRepertoireEncoder(
        NormalizationType.RELATIVE_FREQUENCY,
        ReadsType.UNIQUE,
        SequenceEncodingType.CONTINUOUS_KMER,
        3,
        scale_to_zero_mean=True,
        scale_to_unit_variance=True,
    )
    label_config = LabelConfiguration([Label("l1", [1, 2])])

    encoding_params = EncoderParams(
        result_path=base_dir,
        label_config=label_config,
        filename="tmp_enc_dataset.pickle",
        pool_size=4,
    )
    enc_dataset = encoder.encode(dataset, encoding_params)
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    encoder_settings = {
        "normalization_type": "relative_frequency",
        "reads": "unique",
        "sequence_encoding": "continuous_kmer",
        "k": 3,
        "scale_to_zero_mean": True,
        "scale_to_unit_variance": True,
    }
    hp_setting = HPSetting(encoder, encoder_settings, ml_method, {}, [], 'enc1', 'ml1')

    # Copy the pickles produced during encoding into the instruction's result folder.
    instruction_dir = base_dir + 'result/instr1/'
    PathBuilder.build(instruction_dir)
    for pickle_name in ('dict_vectorizer.pickle', 'scaler.pickle'):
        shutil.copy(base_dir + pickle_name, instruction_dir + pickle_name)

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(base_dir + 'result/')

    predictions_path = base_dir + "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))

    predictions = pd.read_csv(predictions_path)
    self.assertEqual(50, predictions.shape[0])

    shutil.rmtree(base_dir)
def test_encode(self):
    """Check DistanceEncoder (Jaccard) output shape, selected entries and labels."""
    tmp_dir = EnvironmentSettings.tmp_test_path + "distance_encoder/"
    PathBuilder.build(tmp_dir)

    dataset = self.create_dataset(tmp_dir)

    encoder = DistanceEncoder.build_object(
        dataset,
        distance_metric=DistanceMetricType.JACCARD.name,
        attributes_to_match=["sequence_aas"],
        sequence_batch_size=20,
    )
    encoder.set_context({"dataset": dataset})

    label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
    encoder_params = EncoderParams(
        result_path=tmp_dir,
        label_config=label_config,
        pool_size=4,
        filename="dataset.pkl",
    )
    encoded = encoder.encode(dataset, encoder_params)

    # The encoded examples are expected to form an 8 x 8 table.
    self.assertEqual(8, encoded.encoded_data.examples.shape[0])
    self.assertEqual(8, encoded.encoded_data.examples.shape[1])

    for row, column in ((0, 0), (1, 1), (0, 4)):
        self.assertEqual(1, encoded.encoded_data.examples.iloc[row, column])

    expected_l1 = [1, 0, 1, 0, 1, 0, 1, 0]
    expected_l2 = [2, 3, 2, 3, 2, 3, 3, 3]
    self.assertTrue(np.array_equal(expected_l1, encoded.encoded_data.labels["l1"]))
    self.assertTrue(np.array_equal(expected_l2, encoded.encoded_data.labels["l2"]))

    shutil.rmtree(tmp_dir)
def test_encode(self):
    """Check the files and ids produced by DeepRCEncoder for a sub-dataset."""
    path = EnvironmentSettings.tmp_test_path + "deeprc_encoder/"
    encoded_data_dir = path + "encoded_data/"
    PathBuilder.build(path)
    PathBuilder.build(encoded_data_dir)

    main_dataset, sub_dataset = self.create_datasets(path)

    # The encoder is built for the sub-dataset but receives the main dataset as context.
    encoder = DeepRCEncoder.build_object(sub_dataset, **{})
    encoder.set_context({"dataset": main_dataset})

    label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
    encoded = encoder.encode(
        sub_dataset,
        EncoderParams(result_path=encoded_data_dir, label_config=label_config, pool_size=4),
    )

    self.assertListEqual(encoded.encoded_data.example_ids, sub_dataset.get_repertoire_ids())
    self.assertTrue(os.path.isfile(encoded.encoded_data.info["metadata_filepath"]))

    metadata_content = pd.read_csv(encoded.encoded_data.info["metadata_filepath"], sep="\t")
    self.assertListEqual(list(metadata_content["ID"]), sub_dataset.get_repertoire_ids())

    # A .tsv file is expected for every repertoire of the main dataset.
    for repertoire in main_dataset.repertoires:
        rep_path = f"{path}/encoded_data/encoding/{repertoire.identifier}.tsv"
        self.assertTrue(os.path.isfile(rep_path))

        repertoire_tsv = pd.read_csv(rep_path, sep="\t")
        stored_sequences = list(repertoire_tsv["amino_acid"])
        self.assertListEqual(stored_sequences, list(repertoire.get_sequence_aas()))

    shutil.rmtree(path)
def test_generate(self):
    """Import a paired receptor dataset, encode it with TCRdistEncoder and generate the motif report."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "tcrdist_motif_discovery/")
    dataset_path = self._create_dataset(path)

    columns_to_load = [
        "subject", "epitope", "count",
        "v_a_gene", "j_a_gene", "cdr3_a_aa",
        "v_b_gene", "j_b_gene", "cdr3_b_aa",
        "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq",
    ]
    column_mapping = {
        "cdr3_a_aa": "alpha_amino_acid_sequence",
        "cdr3_b_aa": "beta_amino_acid_sequence",
        "cdr3_a_nucseq": "alpha_nucleotide_sequence",
        "cdr3_b_nucseq": "beta_nucleotide_sequence",
        "v_a_gene": "alpha_v_gene",
        "v_b_gene": "beta_v_gene",
        "j_a_gene": "alpha_j_gene",
        "j_b_gene": "beta_j_gene",
        "clone_id": "identifier",
    }
    import_params = {
        "path": dataset_path,
        "result_path": path + "dataset/",
        "separator": ",",
        "columns_to_load": columns_to_load,
        "column_mapping": column_mapping,
        "receptor_chains": "TRA_TRB",
        "region_type": "IMGT_CDR3",
        "sequence_file_size": 50000,
        "organism": "mouse",
    }
    dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

    encoder_params = EncoderParams(f"{path}result/", LabelConfiguration([Label("epitope")]))
    dataset = TCRdistEncoder(8).encode(dataset, encoder_params)

    report = TCRdistMotifDiscovery(dataset, path + "report/", "report name", 8)
    report.generate_report()

    shutil.rmtree(path)
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    encodes the repertoire dataset using KmerFrequencyEncoder and exports the design matrix
    to "<result_path>/csv_exported/"

    :param path_to_dataset_directory: path to directory containing all repertoire files with .tsv extension in MiXCR format
    :param result_path: where to store the results; a trailing "/" is optional
    :param metadata_path: csv file with columns "filename", "subject_id", "disease" which is filled by default if value of
        argument is None, otherwise any metadata csv file passed to the function, must include filename and subject_id columns,
        and an arbitrary disease column
    :return: encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_genes",
            "allJHitsWithScore": "j_genes"
        },
    }, "mixcr_dataset")

    # label that can be used for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.params.keys())[0]

    encoder = KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    })
    encoder_params = EncoderParams(result_path=result_path,
                                   label_config=LabelConfiguration([Label(label_name, dataset.params[label_name])]))

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder, encoder_params, False))

    # The previous check compared result_path[:-1] (everything but the last character) with '/',
    # so a path that already ended in '/' received a second slash. endswith() tests the last
    # character and also copes with an empty string.
    export_root = result_path if result_path.endswith('/') else result_path + '/'
    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset, result_path=f"{export_root}csv_exported/")
    dataset_exporter.generate_report()

    return encoded_dataset
def test_encode(self):
    """Import a paired receptor dataset via VDJdbImport and check the TCRdistEncoder output shape.

    The fixture below is written to "receptors.tsv" and contains TRA and TRB rows for the
    complex ids 3050, 15760, 3051 and 15761.
    """
    # NOTE(review): the whitespace inside this fixture is reproduced exactly as it appears in
    # this view; confirm the column separators against the original file before editing it.
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": 
"NoFixNeeded", "vId": "TRBV5-5*01"} 0 3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 3051 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI 
AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15761 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0 3051 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV 
https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15761 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 """
    path = PathBuilder.build(EnvironmentSettings.root_path + "test/tmp/trcdist_encoder/")

    # Write the fixture where the importer will look for it (params["path"] below).
    with open(path + "receptors.tsv", "w") as file:
        file.writelines(file_content)

    # Start from the stored "vdjdb" defaults and override them for a paired TRA/TRB receptor import.
    params = DefaultParamsLoader.load(
        EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
    params["is_repertoire"] = False
    params["paired"] = True
    params["result_path"] = path
    params["path"] = path
    params["sequence_file_size"] = 1
    params["receptor_chains"] = "TRA_TRB"
    params['organism'] = 'human'

    dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")

    encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
    encoded_dataset = encoder.encode(
        dataset,
        EncoderParams(f"{path}result/", LabelConfiguration([Label("epitope")])))

    # The encoded examples must be square, with one row and one column per dataset example.
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == encoded_dataset.encoded_data.examples.shape[1]
                    and encoded_dataset.encoded_data.examples.shape[0] == dataset.get_example_count())

    shutil.rmtree(path)
def test_encode(self):
    """Check SequenceCountEncoder output for a four-repertoire dataset at p-value threshold 0.4."""
    tmp_dir = EnvironmentSettings.tmp_test_path + "count_encoder/"
    PathBuilder.build(tmp_dir)

    sequences_per_repertoire = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    repertoires, metadata = RepertoireBuilder.build(
        sequences_per_repertoire,
        labels={"l1": [True, True, False, False]},
        path=tmp_dir,
    )
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceCountEncoder.build_object(
        dataset,
        comparison_attributes=["sequence_aas"],
        p_value_threshold=0.4,
        sequence_batch_size=4,
    )
    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=tmp_dir, label_config=label_config))

    test = encoded_dataset.encoded_data.examples
    # Expected value per repertoire, in the order the repertoires were built.
    for index, expected_value in enumerate((1, 1, 0, 0)):
        self.assertTrue(test[index] == expected_value)

    self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

    shutil.rmtree(tmp_dir)
def add_label(self, label: str, values: list = None, auxiliary_labels: list = None, positive_class=None):
    """Register a Label under `label`, replacing any non-empty entry already stored for it.

    :param label: name of the label; used as the key in `self._labels`
    :param values: values of the label; copied into a list, or stored as None when falsy
    :param auxiliary_labels: passed through to the Label constructor unchanged
    :param positive_class: when given, checked against `values` with ParameterValidator
        before the label is stored
    """
    vals = list(values) if values else None

    # Warn when a non-empty entry for this label is about to be overwritten.
    if label in self._labels:
        existing = self._labels[label]
        if existing is not None and len(existing) > 0:
            message = "Label " + label + " has already been set. Overriding existing values..."
            warnings.warn(message, Warning)

    if positive_class is not None:
        ParameterValidator.assert_in_valid_list(positive_class, values, Label.__name__, 'positive_class')

    self._labels[label] = Label(label, vals, auxiliary_labels, positive_class)
def test_generate(self):
    """Fit a ReceptorCNN on random receptors and check the files written by KernelSequenceLogo."""
    path = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}kernel_sequence_logo/")

    dataset = RandomDatasetGenerator.generate_receptor_dataset(
        receptor_count=500,
        chain_1_length_probabilities={4: 1},
        chain_2_length_probabilities={4: 1},
        labels={"CMV": {True: 0.5, False: 0.5}},
        path=path + "dataset/",
    )

    encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
    encoder_params = EncoderParams(path + "result/", LabelConfiguration([Label("CMV", [True, False])]))
    enc_dataset = encoder.encode(dataset, encoder_params)

    cnn = ReceptorCNN(
        kernel_count=2,
        kernel_size=[3],
        positional_channels=3,
        sequence_type="amino_acid",
        device="cpu",
        number_of_threads=4,
        random_seed=1,
        learning_rate=0.01,
        iteration_count=10,
        l1_weight_decay=0.1,
        evaluate_at=5,
        batch_size=100,
        training_percentage=0.8,
        l2_weight_decay=0.0,
    )
    cnn.fit(enc_dataset.encoded_data, "CMV")

    report = KernelSequenceLogo(method=cnn, result_path=path + "logos/")
    report.generate_report()

    # Files expected in the logos folder, checked in this order.
    expected_files = (
        "alpha_kernel_3_1.png",
        "alpha_kernel_3_2.png",
        "beta_kernel_3_1.png",
        "beta_kernel_3_2.png",
        "alpha_kernel_3_1.csv",
        "alpha_kernel_3_2.csv",
        "beta_kernel_3_1.csv",
        "beta_kernel_3_2.csv",
        "fully_connected_layer_weights.csv",
        "fully_connected_layer_weights.html",
    )
    for file_name in expected_files:
        self.assertTrue(os.path.isfile(f"{path}logos/{file_name}"))

    shutil.rmtree(path)
def test_fit(self):
    """Fit, predict, store and reload an AtchleyKmerMILClassifier on a small random dataset."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "kmermil/")

    repertoire_count = 10
    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        repertoire_count=repertoire_count,
        sequence_count_probabilities={2: 1},
        sequence_length_probabilities={4: 1},
        labels={"l1": {True: 0.5, False: 0.5}},
        path=path + "dataset/",
    )

    encoder = AtchleyKmerEncoder(2, 1, 1, 'relative_abundance', False)
    encoder_params = EncoderParams(path + "result/", LabelConfiguration([Label("l1", [True, False])]))
    enc_dataset = encoder.encode(dataset, encoder_params)

    # The fitted and the reloaded classifier are constructed with the same arguments.
    classifier_kwargs = {
        "iteration_count": 10,
        "threshold": -0.0001,
        "evaluate_at": 2,
        "use_early_stopping": False,
        "random_seed": 1,
        "learning_rate": 0.01,
        "zero_abundance_weight_init": True,
        "number_of_threads": 8,
    }

    cls = AtchleyKmerMILClassifier(**classifier_kwargs)
    cls.fit(enc_dataset.encoded_data, "l1")

    predictions = cls.predict(enc_dataset.encoded_data, "l1")
    self.assertEqual(repertoire_count, len(predictions["l1"]))
    bool_prediction_count = sum(1 for pred in predictions["l1"] if isinstance(pred, bool))
    self.assertEqual(repertoire_count, bool_prediction_count)

    predictions_proba = cls.predict_proba(enc_dataset.encoded_data, "l1")
    self.assertEqual(repertoire_count, np.rint(np.sum(predictions_proba["l1"])))
    self.assertEqual(repertoire_count, predictions_proba["l1"].shape[0])

    cls.store(path + "model_storage/", feature_names=enc_dataset.encoded_data.feature_names)

    cls2 = AtchleyKmerMILClassifier(**classifier_kwargs)
    cls2.load(path + "model_storage/")

    # vars() returns each object's own attribute dict, so these deletions also remove
    # the "logistic_regression" attribute from the classifiers themselves.
    cls2_vars = vars(cls2)
    del cls2_vars["logistic_regression"]
    cls_vars = vars(cls)
    del cls_vars["logistic_regression"]

    for item, value in cls_vars.items():
        if isinstance(value, np.ndarray):
            continue
        self.assertEqual(value, cls2_vars[item])

    model = cls.get_model("l1")
    self.assertEqual(vars(cls), model)

    shutil.rmtree(path)
def test_find_label_associated_sequence_p_values(self):
    """Check the p-values SequenceFilterHelper computes for hand-built comparison batches."""
    path = EnvironmentSettings.tmp_test_path + "comparison_data_find_label_assocseqpvalues/"
    PathBuilder.build(path)

    label_values = [True, True, False, False]
    subject_ids = ["rep_0", "rep_1", "rep_2", "rep_3"]
    repertoires = []
    for val, subject_id in zip(label_values, subject_ids):
        repertoire_metadata = {"l1": val, "subject_id": subject_id}
        repertoires.append(Repertoire.build_from_sequence_objects([ReceptorSequence()], path, repertoire_metadata))

    col_name_index = {repertoire.identifier: index for index, repertoire in enumerate(repertoires)}

    comparison_data = ComparisonData(
        repertoire_ids=[repertoire.identifier for repertoire in repertoires],
        comparison_attributes=["sequence_aas"],
        sequence_batch_size=4,
        path=path,
    )

    # (matrix, items) per batch; the position in this list is used as the batch identifier.
    batch_contents = [
        (np.array([[1., 0., 0., 0.], [1., 1., 0., 0.]]), [('GGG',), ('III',)]),
        (np.array([[1., 1., 0., 1.], [1., 1., 1., 1.]]), [('LLL',), ('MMM',)]),
        (np.array([[0., 1., 0., 0.], [0., 1., 0., 1.]]), [('DDD',), ('EEE',)]),
        (np.array([[0., 1., 1., 1.], [0., 0., 1., 1.]]), [('FFF',), ('CCC',)]),
        (np.array([[0., 0., 0., 1.]]), [('AAA',)]),
    ]
    comparison_data.batches = [
        ComparisonDataBatch(
            matrix=matrix,
            items=items,
            repertoire_index_mapping=col_name_index,
            path=path,
            identifier=batch_index,
        )
        for batch_index, (matrix, items) in enumerate(batch_contents)
    ]

    label = Label('l1', [True, False], positive_class=True)
    p_values = SequenceFilterHelper.find_label_associated_sequence_p_values(comparison_data, repertoires, label)

    print(p_values)

    expected_p_values = [
        SequenceFilterHelper.INVALID_P_VALUE,
        0.1666666666666667,
        0.5000000000000001,
        1.,
        SequenceFilterHelper.INVALID_P_VALUE,
        0.8333333333333331,
        1.,
        1.,
        2,
    ]
    self.assertTrue(np.allclose(expected_p_values, p_values, equal_nan=True))

    shutil.rmtree(path)
def test_encode(self):
    """Check the shape and one entry of the AtchleyKmerEncoder output on a random dataset."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "atchley_kmer_encoding/")

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        3, {1: 1}, {4: 1}, {"l1": {True: 0.4, False: 0.6}}, path + "dataset/")

    encoder = AtchleyKmerEncoder.build_object(
        dataset,
        k=2,
        skip_first_n_aa=1,
        skip_last_n_aa=1,
        abundance="RELATIVE_ABUNDANCE",
        normalize_all_features=False,
    )

    encoder_params = EncoderParams(path + "result/", LabelConfiguration(labels=[Label("l1")]))
    encoded_dataset = encoder.encode(dataset, encoder_params)

    self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape)
    self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

    shutil.rmtree(path)
def test_run(self):
    """Run TrainMLModelInstruction with two HP settings and check the resulting state."""
    path = EnvironmentSettings.tmp_test_path + "hpoptimproc/"
    PathBuilder.build(path)

    label_values = {
        "l1": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
               1, 2, 1, 2],
        "l2": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
               0, 1, 0, 1],
    }
    # One identical three-sequence repertoire per label value.
    sequences = [["AAA", "CCC", "DDD"] for _ in label_values["l1"]]

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=label_values)
    dataset = RepertoireDataset(
        repertoires=repertoires,
        metadata_file=metadata,
        params={"l1": [1, 2], "l2": [0, 1]},
    )

    enc1 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    enc2 = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 6}

    first_setting = HPSetting(
        Word2VecEncoder.build_object(dataset, **enc1),
        enc1,
        LogisticRegression(),
        {"model_selection_cv": False, "model_selection_n_folds": -1},
        [],
    )
    second_setting = HPSetting(
        Word2VecEncoder.build_object(dataset, **enc2),
        enc2,
        SVM(),
        {"model_selection_cv": False, "model_selection_n_folds": -1},
        [ClonesPerRepertoireFilter(lower_limit=-1, upper_limit=1000)],
    )
    hp_settings = [first_setting, second_setting]

    report = SequenceLengthDistribution()
    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    def random_split_with_report():
        # Each call builds a separate SplitConfig that shares the same report object.
        return SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig(data_splits={"seqlen": report}))

    process = TrainMLModelInstruction(
        dataset,
        GridSearch(hp_settings),
        hp_settings,
        random_split_with_report(),
        random_split_with_report(),
        {Metric.BALANCED_ACCURACY},
        Metric.BALANCED_ACCURACY,
        label_config,
        path,
    )

    state = process.run(result_path=path)

    self.assertTrue(isinstance(state, TrainMLModelState))
    self.assertEqual(1, len(state.assessment_states))
    for label_name in ("l1", "l2"):
        self.assertTrue(label_name in state.assessment_states[0].label_states)

    shutil.rmtree(path)
def test_generate(self):
    """Check CVFeaturePerformance warns on an unusable state and reports on a populated one."""
    path = EnvironmentSettings.tmp_test_path + "cv_feature_performance/"

    def make_setting(p_value_threshold, classifier_second_arg, **names):
        # `names` carries the optional encoder_name / ml_method_name keyword arguments.
        return HPSetting(
            encoder_params={"p_value_threshold": p_value_threshold},
            encoder=SequenceAbundanceEncoder([], 0, 0, 0),
            preproc_sequence=[],
            ml_method=ProbabilisticBinaryClassifier(10, classifier_second_arg),
            ml_params={},
            **names,
        )

    state = TrainMLModelState(
        assessment=SplitConfig(split_count=5, split_strategy=SplitType.K_FOLD),
        selection=SplitConfig(split_count=10, split_strategy=SplitType.K_FOLD),
        optimization_metric=Metric.ACCURACY,
        label_configuration=LabelConfiguration(labels=[Label(name="CMV", values=[True, False])]),
        hp_settings=[
            make_setting(0.001, 0.1, encoder_name="e1", ml_method_name="ml1"),
            make_setting(0.01, 0.1, encoder_name="e2", ml_method_name="ml1"),
            make_setting(0.01, 0.01),
        ],
        dataset=None,
        hp_strategy=None,
        metrics=None,
    )

    report = CVFeaturePerformance("p_value_threshold", state, path, is_feature_axis_categorical=True,
                                  name="report1")

    # With all three settings and no assessment states, a RuntimeWarning is expected.
    with self.assertWarns(RuntimeWarning):
        report.generate_report()

    # Keep only the first two settings and fill in random accuracies for every split.
    state.hp_settings = state.hp_settings[:2]
    state.assessment_states = [
        HPAssessmentState(split_index, None, None, None, state.label_configuration)
        for split_index in range(state.assessment.split_count)
    ]

    for assessment_state in state.assessment_states:
        assessment_state.label_states["CMV"] = HPLabelState("CMV", [])

        assessment_items = {}
        for setting in state.hp_settings:
            assessment_items[setting.get_key()] = HPItem(
                performance={'accuracy': random.uniform(0.5, 1)}, hp_setting=setting)
        assessment_state.label_states["CMV"].assessment_items = assessment_items

        assessment_state.label_states["CMV"].selection_state = HPSelectionState(
            [], [], "", GridSearch(state.hp_settings))

        selection_items = {}
        for setting in state.hp_settings:
            selection_items[str(setting)] = [
                HPItem(performance={'accuracy': random.uniform(0.5, 1)}, hp_setting=setting)
                for _ in range(state.selection.split_count)
            ]
        assessment_state.label_states["CMV"].selection_state.hp_items = selection_items

    report.state = state

    report_result = report.generate_report()

    self.assertTrue(isinstance(report_result, ReportResult))
    self.assertEqual(2, len(report_result.output_tables))
    self.assertEqual(1, len(report_result.output_figures))
    self.assertTrue(os.path.isfile(report_result.output_figures[0].path))
    for table_index in (0, 1):
        self.assertTrue(os.path.isfile(report_result.output_tables[table_index].path))

    shutil.rmtree(path)
def test_fit(self):
    """Fit, predict, store and reload a ReceptorCNN on a random receptor dataset."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "cnn/")

    receptor_count = 500
    dataset = RandomDatasetGenerator.generate_receptor_dataset(
        receptor_count=receptor_count,
        chain_1_length_probabilities={4: 1},
        chain_2_length_probabilities={4: 1},
        labels={"CMV": {True: 0.5, False: 0.5}},
        path=path + "dataset/",
    )

    encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
    encoder_params = EncoderParams(path + "result/", LabelConfiguration([Label("CMV", [True, False])]))
    enc_dataset = encoder.encode(dataset, encoder_params)

    cnn = ReceptorCNN(
        kernel_count=2,
        kernel_size=[3],
        positional_channels=3,
        sequence_type="amino_acid",
        device="cpu",
        number_of_threads=4,
        random_seed=1,
        learning_rate=0.01,
        iteration_count=10,
        l1_weight_decay=0.1,
        evaluate_at=5,
        batch_size=100,
        training_percentage=0.8,
        l2_weight_decay=0.0,
    )
    cnn.fit(encoded_data=enc_dataset.encoded_data, label_name="CMV")

    predictions = cnn.predict(enc_dataset.encoded_data, "CMV")
    self.assertEqual(receptor_count, len(predictions["CMV"]))
    bool_prediction_count = sum(1 for pred in predictions["CMV"] if isinstance(pred, bool))
    self.assertEqual(receptor_count, bool_prediction_count)

    predictions_proba = cnn.predict_proba(enc_dataset.encoded_data, "CMV")
    self.assertEqual(receptor_count, np.rint(np.sum(predictions_proba["CMV"])))
    self.assertEqual(receptor_count, predictions_proba["CMV"].shape[0])

    cnn.store(path + "model_storage/")

    cnn2 = ReceptorCNN(sequence_type="amino_acid")
    cnn2.load(path + "model_storage/")

    # vars() returns each object's own attribute dict, so these deletions also remove
    # the "CNN" attribute from the models themselves.
    cnn2_vars = vars(cnn2)
    del cnn2_vars["CNN"]
    cnn_vars = vars(cnn)
    del cnn_vars["CNN"]

    for item, value in cnn_vars.items():
        if isinstance(value, np.ndarray):
            continue
        self.assertEqual(value, cnn2_vars[item])

    model = cnn.get_model(["CMV"])
    self.assertEqual(vars(cnn), model)

    shutil.rmtree(path)