def test_encode(self):
    # Encodes a four-repertoire dataset with SequenceAbundanceEncoder and compares
    # the encoded examples against fixed arrays for two p-value thresholds.
    tmp_dir = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
    PathBuilder.build(tmp_dir)

    sequences_per_repertoire = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    repertoires, metadata = RepertoireBuilder.build(
        sequences_per_repertoire,
        labels={"l1": [True, True, False, False]},
        path=tmp_dir,
    )
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder_kwargs = {
        "comparison_attributes": ["sequence_aas"],
        "p_value_threshold": 0.4,
        "sequence_batch_size": 4,
        "repertoire_batch_size": 8,
    }
    encoder = SequenceAbundanceEncoder.build_object(dataset, **encoder_kwargs)
    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    # First pass: threshold the encoder was built with (0.4).
    first_pass = encoder.encode(dataset, EncoderParams(result_path=tmp_dir, label_config=label_config))
    expected_first = np.array([[1, 4], [1, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_first, first_pass.encoded_data.examples))

    # Second pass: same encoder object, threshold lowered to 0.05.
    encoder.p_value_threshold = 0.05
    second_pass = encoder.encode(dataset, EncoderParams(result_path=tmp_dir, label_config=label_config))
    expected_second = np.array([[0, 4], [0, 6], [0, 3], [0, 6]])
    self.assertTrue(np.array_equal(expected_second, second_pass.encoded_data.examples))

    shutil.rmtree(tmp_dir)
def test_encode_sequence(self):
    # Checks the 3-mers produced for "CASSVFRTY" and that a 2-letter sequence
    # encoded with k=3 yields None.
    def make_params():
        return EncoderParams(model={"k": 3}, label_config=LabelConfiguration(), result_path="", pool_size=4)

    long_sequence = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
    kmers = KmerSequenceEncoder.encode_sequence(long_sequence, make_params())

    for expected_kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
        self.assertTrue(expected_kmer in kmers)
    self.assertEqual(7, len(kmers))

    short_sequence = ReceptorSequence(amino_acid_sequence="AC")
    short_result = KmerSequenceEncoder.encode_sequence(short_sequence, make_params())
    self.assertEqual(short_result, None)
def test_encode_sequence(self):
    # IdentitySequenceEncoder is expected to return ["AAA"] for the sequence "AAA"
    # with each of the frame types "OUT", "STOP" and "IN".
    def encode_with_frame(frame_type):
        sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                    metadata=SequenceMetadata(frame_type=frame_type))
        encoder = IdentitySequenceEncoder()
        params = EncoderParams(model={}, label_config=LabelConfiguration(), result_path="")
        return encoder.encode_sequence(sequence, params)

    self.assertEqual(encode_with_frame("OUT"), ["AAA"])
    self.assertEqual(encode_with_frame("STOP"), ["AAA"])
    self.assertEqual(["AAA"], encode_with_frame("IN"))
def encode_dataset(dataset, hp_setting: HPSetting, path: str, learn_model: bool, context: dict, number_of_processes: int,
                   label_configuration: LabelConfiguration, encode_labels: bool = True, store_encoded_data: bool = False):
    """
    Encodes the dataset with the encoder from the given hyperparameter setting by running DataEncoder.

    :param dataset: dataset to be encoded
    :param hp_setting: provides the encoder (hp_setting.encoder) and the encoder model parameters (hp_setting.encoder_params)
    :param path: result path; it is built with PathBuilder before encoding and passed to EncoderParams
    :param learn_model: forwarded to EncoderParams; also selects the filename
                        ("train_dataset.pkl" when true, "test_dataset.pkl" otherwise)
    :param context: not referenced in this function
    :param number_of_processes: forwarded to EncoderParams as pool_size
    :param label_configuration: forwarded to EncoderParams as label_config
    :param encode_labels: forwarded to EncoderParams
    :param store_encoded_data: forwarded to DataEncoderParams
    :return: the value returned by DataEncoder.run
    """
    PathBuilder.build(path)

    encoder = hp_setting.encoder

    if learn_model:
        filename = "train_dataset.pkl"
    else:
        filename = "test_dataset.pkl"

    encoder_params = EncoderParams(model=hp_setting.encoder_params,
                                   result_path=path,
                                   pool_size=number_of_processes,
                                   label_config=label_configuration,
                                   learn_model=learn_model,
                                   filename=filename,
                                   encode_labels=encode_labels)

    run_params = DataEncoderParams(dataset=dataset,
                                   encoder=encoder,
                                   encoder_params=encoder_params,
                                   store_encoded_data=store_encoded_data)

    return DataEncoder.run(run_params)
def test_generate(self):
    # Fits a ReceptorCNN on a one-hot encoded random receptor dataset, generates the
    # KernelSequenceLogo report and checks that the expected output files exist.
    root = PathBuilder.build(f"{EnvironmentSettings.tmp_test_path}kernel_sequence_logo/")

    dataset = RandomDatasetGenerator.generate_receptor_dataset(
        receptor_count=500,
        chain_1_length_probabilities={4: 1},
        chain_2_length_probabilities={4: 1},
        labels={"CMV": {True: 0.5, False: 0.5}},
        path=root + "dataset/",
    )

    encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
    encoder_params = EncoderParams(root + "result/", LabelConfiguration([Label("CMV", [True, False])]))
    enc_dataset = encoder.encode(dataset, encoder_params)

    cnn = ReceptorCNN(
        kernel_count=2,
        kernel_size=[3],
        positional_channels=3,
        sequence_type="amino_acid",
        device="cpu",
        number_of_threads=4,
        random_seed=1,
        learning_rate=0.01,
        iteration_count=10,
        l1_weight_decay=0.1,
        evaluate_at=5,
        batch_size=100,
        training_percentage=0.8,
        l2_weight_decay=0.0,
    )
    cnn.fit(enc_dataset.encoded_data, "CMV")

    report = KernelSequenceLogo(method=cnn, result_path=root + "logos/")
    report.generate_report()

    expected_files = [
        "alpha_kernel_3_1.png",
        "alpha_kernel_3_2.png",
        "beta_kernel_3_1.png",
        "beta_kernel_3_2.png",
        "alpha_kernel_3_1.csv",
        "alpha_kernel_3_2.csv",
        "beta_kernel_3_1.csv",
        "beta_kernel_3_2.csv",
        "fully_connected_layer_weights.csv",
        "fully_connected_layer_weights.html",
    ]
    for file_name in expected_files:
        self.assertTrue(os.path.isfile(f"{root}logos/{file_name}"))

    shutil.rmtree(root)
def test_encode_no_v(self):
    # Encodes the dummy dataset with MatchedRegexEncoder (match_v_genes=False,
    # sum_counts=True) and checks examples, feature names and example ids.
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/regex_matches_encoder/"
    dataset, label_config, motif_filepath, _ = self.create_dummy_data(tmp_dir)

    encoder_kwargs = {
        "motif_filepath": motif_filepath,
        "match_v_genes": False,
        "sum_counts": True,
    }
    encoder = MatchedRegexEncoder.build_object(dataset, **encoder_kwargs)

    encoder_params = EncoderParams(result_path=tmp_dir, label_config=label_config, filename="dataset.csv")
    encoded = encoder.encode(dataset, encoder_params)

    expected_outcome = [[20, 10, 0, 0], [0, 0, 10, 0], [0, 0, 0, 5]]
    for index, expected_row in enumerate(expected_outcome):
        actual_row = list(encoded.encoded_data.examples[index])
        self.assertListEqual(actual_row, expected_row)

    self.assertListEqual(["1_IGL", "1_IGH", "2_IGH", "3_IGL"], encoded.encoded_data.feature_names)
    self.assertListEqual(["subject_1", "subject_2", "subject_3"], encoded.encoded_data.example_ids)

    shutil.rmtree(tmp_dir)
def test_sequence_flattened(self):
    # One-hot encodes the flatten test dataset with flatten=True and compares the
    # flattened vectors and the generated feature names.
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"
    PathBuilder.build(tmp_dir)

    dataset = self.construct_test_flatten_dataset(tmp_dir)

    encoder_kwargs = {"use_positional_info": False, "distance_to_seq_middle": None, "flatten": True}
    encoder = OneHotEncoder.build_object(dataset, **encoder_kwargs)

    label_config = LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")])
    encoder_params = EncoderParams(result_path=tmp_dir,
                                   label_config=label_config,
                                   pool_size=1,
                                   learn_model=True,
                                   model={},
                                   filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, encoder_params)

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    # 20-element one-hot vectors: index 0 is set for "a", index 16 for "t".
    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0] * 3

    expected_first = onehot_a * 3 + onehot_t * 3
    expected_second = (onehot_a + onehot_t) * 3

    self.assertListEqual(list(encoded_data.encoded_data.examples[0]), expected_first)
    self.assertListEqual(list(encoded_data.encoded_data.examples[1]), expected_second)

    expected_feature_names = [f"{pos}_{char}"
                              for pos in range(6)
                              for char in EnvironmentSettings.get_sequence_alphabet()]
    self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_feature_names)

    shutil.rmtree(tmp_dir)
def test(self):
    # One-hot encodes the test dataset without flattening and checks the
    # per-position vectors, example ids and labels.
    tmp_dir = EnvironmentSettings.tmp_test_path + "onehot_sequence/"
    PathBuilder.build(tmp_dir)

    dataset, lc = self._construct_test_dataset(tmp_dir)

    encoder_kwargs = {"use_positional_info": False, "distance_to_seq_middle": None, "flatten": False}
    encoder = OneHotEncoder.build_object(dataset, **encoder_kwargs)

    encoder_params = EncoderParams(result_path=f"{tmp_dir}encoded/",
                                   label_config=lc,
                                   learn_model=True,
                                   model={},
                                   filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, encoder_params)

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    onehot_a = [1] + [0] * 19
    onehot_t = [0] * 16 + [1] + [0] * 3
    onehot_empty = [0] * 20

    expected_examples = [
        [onehot_a] * 4,
        [onehot_a, onehot_t, onehot_a, onehot_empty],
        [onehot_a, onehot_t, onehot_t, onehot_empty],
    ]
    for index, expected in enumerate(expected_examples):
        actual = [list(item) for item in encoded_data.encoded_data.examples[index]]
        self.assertListEqual(actual, expected)

    expected_ids = [receptor.identifier for receptor in dataset.get_data()]
    self.assertListEqual(encoded_data.encoded_data.example_ids, expected_ids)

    expected_labels = {
        label_name: [receptor_seq.get_attribute(label_name) for receptor_seq in dataset.get_data()]
        for label_name in ("l1", "l2")
    }
    self.assertDictEqual(encoded_data.encoded_data.labels, expected_labels)

    shutil.rmtree(tmp_dir)
def get_encoded_repertoire(self, repertoire, params: EncoderParams):
    """
    Returns the encoding of a single repertoire, obtained through CacheHandler.memo_by_params.

    The cache key combines the encoder attributes (vars(self)), the labels by name, the repertoire
    identifier and the SHA-256 hex digest of the array returned by repertoire.get_sequence_aas().
    The callable handed to the cache handler calls self.encode_repertoire(repertoire, params)
    (presumably invoked only when the key is not cached yet; confirm in CacheHandler).

    :param repertoire: repertoire to encode
    :param params: encoder parameters; params.model is overwritten with vars(self)
    :return: the value returned by CacheHandler.memo_by_params
    """
    params.model = vars(self)

    model_entry = ("encoding_model", params.model)
    labels_entry = ("labels", params.label_config.get_labels_by_name())
    identifier_entry = ("repertoire_id", repertoire.identifier)
    sequence_digest = hashlib.sha256(np.ascontiguousarray(repertoire.get_sequence_aas())).hexdigest()
    data_entry = ("repertoire_data", sequence_digest)

    cache_key = (model_entry, labels_entry, identifier_entry, data_entry)

    def compute_encoding():
        return self.encode_repertoire(repertoire, params)

    return CacheHandler.memo_by_params(cache_key, compute_encoding, CacheObjectType.ENCODING_STEP)
def test_run(self):
    # Trains a logistic regression on k-mer frequency encoded random repertoires,
    # then runs MLApplicationInstruction and checks the predictions file.
    root = EnvironmentSettings.tmp_test_path + "mlapplicationtest/"
    PathBuilder.build(root)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        50, {5: 1}, {5: 1}, {"l1": {1: 0.5, 2: 0.5}}, root + 'dataset/')

    ml_method = LogisticRegression()
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY,
                                        ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER,
                                        3,
                                        scale_to_zero_mean=True,
                                        scale_to_unit_variance=True)
    label_config = LabelConfiguration([Label("l1", [1, 2])])

    encoder_params = EncoderParams(result_path=root,
                                   label_config=label_config,
                                   filename="tmp_enc_dataset.pickle",
                                   pool_size=4)
    enc_dataset = encoder.encode(dataset, encoder_params)
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    encoder_settings = {
        "normalization_type": "relative_frequency",
        "reads": "unique",
        "sequence_encoding": "continuous_kmer",
        "k": 3,
        "scale_to_zero_mean": True,
        "scale_to_unit_variance": True,
    }
    hp_setting = HPSetting(encoder, encoder_settings, ml_method, {}, [], 'enc1', 'ml1')

    # Copy the two pickle files from the encoding result path into the instruction folder.
    instruction_dir = root + 'result/instr1/'
    PathBuilder.build(instruction_dir)
    for pickle_name in ('dict_vectorizer.pickle', 'scaler.pickle'):
        shutil.copy(root + pickle_name, instruction_dir + pickle_name)

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(root + 'result/')

    predictions_path = root + "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))

    predictions = pd.read_csv(predictions_path)
    self.assertEqual(50, predictions.shape[0])

    shutil.rmtree(root)
def _encode_sequence(self, sequence: ReceptorSequence, params: EncoderParams, sequence_encoder, counts):
    """
    Adds the features of one sequence to the running feature counts.

    :param sequence: sequence to encode; sequence.metadata.count is read when self.reads is ReadsType.ALL
    :param params: encoder parameters; params.model is overwritten with vars(self)
    :param sequence_encoder: object whose encode_sequence(sequence, params) returns the features or None
    :param counts: mapping from feature to count, updated in place
    :return: the same counts object
    """
    params.model = vars(self)
    features = sequence_encoder.encode_sequence(sequence, params)

    # Nothing to add when the sequence encoder produced no features.
    if features is None:
        return counts

    for feature in features:
        if self.reads == ReadsType.UNIQUE:
            # every sequence contributes once per feature occurrence
            counts[feature] += 1
            continue
        if self.reads == ReadsType.ALL:
            # weight the feature by the count stored in the sequence metadata
            counts[feature] += sequence.metadata.count

    return counts
def test_encode(self):
    # Encodes three random repertoires with AtchleyKmerEncoder and checks the shape
    # of the encoded examples and one entry of the first example.
    root = PathBuilder.build(EnvironmentSettings.tmp_test_path + "atchley_kmer_encoding/")

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        3, {1: 1}, {4: 1}, {"l1": {True: 0.4, False: 0.6}}, root + "dataset/")

    encoder_kwargs = {
        "k": 2,
        "skip_first_n_aa": 1,
        "skip_last_n_aa": 1,
        "abundance": "RELATIVE_ABUNDANCE",
        "normalize_all_features": False,
    }
    encoder = AtchleyKmerEncoder.build_object(dataset, **encoder_kwargs)

    encoder_params = EncoderParams(root + "result/", LabelConfiguration(labels=[Label("l1")]))
    encoded_dataset = encoder.encode(dataset, encoder_params)

    examples = encoded_dataset.encoded_data.examples
    self.assertEqual((3, 11, 3), examples.shape)
    self.assertEqual(0., examples[0, -1, 0])

    shutil.rmtree(root)
def test_encode_sequence(self):
    # Exercises GappedKmerSequenceEncoder on "ABCDEFG": twice with k_left=3/max_gap=1
    # and once with k_left=2, k_right=3 and a gap of exactly 1.
    def make_params(**model):
        # a fresh model dict is created on every call, as in a literal EncoderParams(model={...})
        return EncoderParams(model=model, label_config=LabelConfiguration(), result_path="")

    sequence = ReceptorSequence("ABCDEFG", None, None)

    def check_k_left_3():
        kmers = GappedKmerSequenceEncoder.encode_sequence(sequence, make_params(k_left=3, max_gap=1))
        self.assertEqual({'ABC.EFG', 'ABCDEF', 'BCDEFG'}, set(kmers))

        feature_names = GappedKmerSequenceEncoder.get_feature_names(make_params(k_left=3, max_gap=1))
        self.assertEqual({'sequence'}, set(feature_names))

        # k_left=10 on the 7-letter sequence is expected to give None
        self.assertEqual(
            GappedKmerSequenceEncoder.encode_sequence(sequence, make_params(k_left=10, max_gap=1)),
            None)

    check_k_left_3()

    sequence.amino_acid_sequence = "ABCDEFG"
    check_k_left_3()

    sequence.amino_acid_sequence = "ABCDEFG"
    kmers = GappedKmerSequenceEncoder.encode_sequence(
        sequence, make_params(k_left=2, max_gap=1, min_gap=1, k_right=3))
    self.assertEqual({'AB.DEF', 'BC.EFG'}, set(kmers))

    feature_names = GappedKmerSequenceEncoder.get_feature_names(
        make_params(k_left=2, max_gap=1, min_gap=1, k_right=3))
    self.assertEqual({'sequence'}, set(feature_names))
def test_encode_sequence(self):
    # Checks the "<kmer>///<position>" features IMGTKmerSequenceEncoder produces for a
    # long and a short sequence with k=3, and the None result for k=25.
    def make_params(k):
        return EncoderParams(model={"k": k}, label_config=LabelConfiguration(), result_path="")

    long_sequence = ReceptorSequence("CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ", None, None)
    long_kmers = IMGTKmerSequenceEncoder.encode_sequence(long_sequence, make_params(3))

    expected_long = {
        'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109', 'RER///110',
        'ERA///111', 'RAT///111.001', 'ATY///111.002', 'TYE///111.003', 'YEQ///111.004',
        'EQC///111.005', 'QCA///111.006', 'CAS///111.007', 'ASS///111.008', 'SSP///111.009',
        'SPR///111.01', 'PRE///111.011', 'RER///111.012', 'ERA///111.013', 'RAT///112.013',
        'ATY///112.012', 'TYE///112.011', 'YEQ///112.01', 'EQC///112.009', 'QCA///112.008',
        'CAS///112.007', 'ASS///112.006', 'SSP///112.005', 'SPR///112.004', 'PRE///112.003',
        'RER///112.002', 'ERA///112.001', 'RAT///112', 'ATY///113', 'TYE///114', 'YEQ///115',
    }
    self.assertEqual(expected_long, set(long_kmers))
    self.assertEqual(len(long_kmers), len(long_sequence.get_sequence()) - 3 + 1)

    short_sequence = ReceptorSequence("AHCDE", None, None)
    short_kmers = IMGTKmerSequenceEncoder.encode_sequence(short_sequence, make_params(3))

    self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'}, set(short_kmers))
    self.assertEqual(len(short_kmers), len(short_sequence.get_sequence()) - 3 + 1)

    # k=25 on the 5-letter sequence is expected to give None
    oversized_result = IMGTKmerSequenceEncoder.encode_sequence(short_sequence, make_params(25))
    self.assertEqual(oversized_result, None)
def test_encode_sequence(self):
    # Checks the gapped k-mers with positions that IMGTGappedKmerEncoder produces
    # (k_left=1, max_gap=1) for two sequences.
    cases = [
        ("AHCDE",
         {'AH///105', 'HC///106', 'CD///107', 'DE///116',
          'A.C///105', 'H.D///106', 'C.E///107'}),
        ("CASSPRERATYEQCAY",
         {'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109', 'RE///110',
          'ER///111', 'RA///111.001', 'AT///112.002', 'TY///112.001', 'YE///112',
          'EQ///113', 'QC///114', 'CA///115', 'AY///116',
          'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108', 'P.E///109', 'R.R///110',
          'E.A///111', 'R.T///111.001', 'A.Y///112.002', 'T.E///112.001', 'Y.Q///112',
          'E.C///113', 'Q.A///114', 'C.Y///115'}),
    ]

    for amino_acids, expected_kmers in cases:
        sequence = ReceptorSequence(amino_acids, None, None)
        params = EncoderParams(model={"k_left": 1, "max_gap": 1},
                               label_config=LabelConfiguration(),
                               result_path="")
        kmers = IMGTGappedKmerEncoder.encode_sequence(sequence, params)
        self.assertEqual(expected_kmers, set(kmers))
def test_generate(self):
    # Imports a paired-chain receptor dataset, encodes it with TCRdistEncoder and
    # runs the TCRdistMotifDiscovery report on the result.
    root = PathBuilder.build(EnvironmentSettings.tmp_test_path + "tcrdist_motif_discovery/")
    dataset_path = self._create_dataset(root)

    columns_to_load = [
        "subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
        "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id", "cdr3_a_nucseq", "cdr3_b_nucseq",
    ]
    column_mapping = {
        "cdr3_a_aa": "alpha_amino_acid_sequence",
        "cdr3_b_aa": "beta_amino_acid_sequence",
        "cdr3_a_nucseq": "alpha_nucleotide_sequence",
        "cdr3_b_nucseq": "beta_nucleotide_sequence",
        "v_a_gene": "alpha_v_gene",
        "v_b_gene": "beta_v_gene",
        "j_a_gene": "alpha_j_gene",
        "j_b_gene": "beta_j_gene",
        "clone_id": "identifier",
    }
    import_params = {
        "path": dataset_path,
        "result_path": root + "dataset/",
        "separator": ",",
        "columns_to_load": columns_to_load,
        "column_mapping": column_mapping,
        "receptor_chains": "TRA_TRB",
        "region_type": "IMGT_CDR3",
        "sequence_file_size": 50000,
        "organism": "mouse",
    }
    imported_dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

    encoder = TCRdistEncoder(8)
    encoder_params = EncoderParams(f"{root}result/", LabelConfiguration([Label("epitope")]))
    encoded_dataset = encoder.encode(imported_dataset, encoder_params)

    report = TCRdistMotifDiscovery(encoded_dataset, root + "report/", "report name", 8)
    report.generate_report()

    shutil.rmtree(root)
def test__encode_new_dataset(self):
    # Encodes the dummy dataset with MatchedSequencesEncoder (max_edit_distance=0) and
    # checks examples, labels, feature names and the feature annotation columns.
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/matched_receptors_encoder/"
    dataset, label_config, reference_sequences, _ = self.create_dummy_data(tmp_dir)

    encoder_kwargs = {"reference": reference_sequences, "max_edit_distance": 0}
    encoder = MatchedSequencesEncoder.build_object(dataset, **encoder_kwargs)

    encoder_params = EncoderParams(result_path=tmp_dir, label_config=label_config, filename="dataset.csv")
    encoded = encoder.encode(dataset, encoder_params)

    expected_outcome = [[10, 0], [0, 10], [0, 5]]
    for index, expected_row in enumerate(expected_outcome):
        actual_row = list(encoded.encoded_data.examples[index])
        self.assertListEqual(actual_row, expected_row)

    expected_labels = {
        "label": ["yes", "yes", "no"],
        "subject_id": ["subject_1", "subject_2", "subject_3"],
    }
    self.assertDictEqual(encoded.encoded_data.labels, expected_labels)

    self.assertListEqual(encoded.encoded_data.feature_names, ["100_TRB", "200_TRB"])

    # attribute name on feature_annotations -> expected values, checked in this order
    expected_annotations = [
        ("sequence_id", ["100_TRB", "200_TRB"]),
        ("chain", ["beta", "beta"]),
        ("sequence", ["AAAA", "SSSS"]),
        ("v_gene", ["TRBV1", "TRBV1"]),
        ("j_gene", ["TRBJ1", "TRBJ1"]),
    ]
    for attribute, expected_values in expected_annotations:
        actual_values = list(getattr(encoded.encoded_data.feature_annotations, attribute))
        self.assertListEqual(actual_values, expected_values)

    shutil.rmtree(tmp_dir)
def test_run(self):
    # Runs DataEncoder with a Word2VecEncoder on two single-sequence repertoires and
    # checks the type of the result and the number of encoded examples.
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
    PathBuilder.build(tmp_dir)

    repertoire_specs = [
        ("AAA", "1", {"l1": 1, "l2": 2}),
        ("ATA", "2", {"l1": 0, "l2": 3}),
    ]
    repertoires = []
    for amino_acids, identifier, metadata in repertoire_specs:
        repertoire = Repertoire.build_from_sequence_objects(
            [ReceptorSequence(amino_acids, identifier=identifier)],
            metadata=metadata,
            path=tmp_dir)
        repertoires.append(repertoire)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=repertoires)

    encoder_kwargs = {
        "k": 3,
        "model_type": ModelType.SEQUENCE.name,
        "vector_size": 6,
    }
    encoder = Word2VecEncoder.build_object(dataset, **encoder_kwargs)

    encoder_params = EncoderParams(model={},
                                   pool_size=2,
                                   label_config=lc,
                                   result_path=tmp_dir,
                                   filename="dataset.csv")
    run_params = DataEncoderParams(dataset=dataset,
                                   encoder=encoder,
                                   encoder_params=encoder_params,
                                   store_encoded_data=False)
    res = DataEncoder.run(run_params)

    self.assertTrue(isinstance(res, RepertoireDataset))
    self.assertTrue(res.encoded_data.examples.shape[0] == 2)

    shutil.rmtree(tmp_dir)
def test_repertoire_flattened(self):
    # One-hot encodes a repertoire dataset with flatten=True and checks the flattened
    # vectors and the "<seq>_<pos>_<char>" feature names (3 sequences x 4 positions).
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/onehot_recep_flat/"
    PathBuilder.build(tmp_dir)

    dataset, lc = self._construct_test_repertoiredataset(tmp_dir, positional=False)

    encoder_kwargs = {
        "use_positional_info": False,
        "distance_to_seq_middle": None,
        "flatten": True,
    }
    encoder = OneHotEncoder.build_object(dataset, **encoder_kwargs)

    encoder_params = EncoderParams(result_path=tmp_dir,
                                   label_config=lc,
                                   pool_size=1,
                                   learn_model=True,
                                   model={},
                                   filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, encoder_params)

    self.assertTrue(isinstance(encoded_data, RepertoireDataset))

    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0] * 3
    onehot_empty = [0] * 20

    # Expected vectors are written as groups of four 20-element blocks.
    block_aaaa = onehot_a * 4
    block_ata_empty = onehot_a + onehot_t + onehot_a + onehot_empty
    block_taa_empty = onehot_t + onehot_a + onehot_a + onehot_empty
    block_all_empty = onehot_empty * 4

    expected_first = block_aaaa + block_ata_empty + block_ata_empty
    expected_second = block_ata_empty + block_taa_empty + block_all_empty

    self.assertListEqual(list(encoded_data.encoded_data.examples[0]), expected_first)
    self.assertListEqual(list(encoded_data.encoded_data.examples[1]), expected_second)

    expected_feature_names = [
        f"{seq}_{pos}_{char}"
        for seq in range(3)
        for pos in range(4)
        for char in EnvironmentSettings.get_sequence_alphabet()
    ]
    self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_feature_names)

    shutil.rmtree(tmp_dir)
def test_encode(self):
    # Encodes two small repertoires with a sequence-level Word2VecEncoder
    # (vector_size=16) and checks the shape of the result and the labels.
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/w2v/"
    PathBuilder.build(tmp_dir)

    first_sequence = ReceptorSequence("CASSVFA", identifier="1")
    second_sequence = ReceptorSequence("CASSCCC", identifier="2")

    first_metadata = {"T1D": "T1D", "subject_id": "1"}
    first_repertoire = Repertoire.build_from_sequence_objects(
        [first_sequence, second_sequence], tmp_dir, first_metadata)

    second_metadata = {"T1D": "CTL", "subject_id": "2"}
    second_repertoire = Repertoire.build_from_sequence_objects(
        [first_sequence], tmp_dir, second_metadata)

    dataset = RepertoireDataset(repertoires=[first_repertoire, second_repertoire])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("T1D", ["T1D", "CTL"])

    config_params = EncoderParams(model={},
                                  learn_model=True,
                                  result_path=tmp_dir,
                                  label_config=label_configuration,
                                  filename="dataset.pkl")

    encoder_kwargs = {"k": 3, "model_type": "sequence", "vector_size": 16}
    encoder = Word2VecEncoder.build_object(dataset, **encoder_kwargs)

    encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

    self.assertIsNotNone(encoded_dataset.encoded_data)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == 2)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[1] == 16)

    self.assertTrue(len(encoded_dataset.encoded_data.labels["T1D"]) == 2)
    self.assertTrue(encoded_dataset.encoded_data.labels["T1D"][0] == "T1D")
    self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

    shutil.rmtree(tmp_dir)
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Encodes the repertoire dataset using KmerFrequencyEncoder and exports the result
    with DesignMatrixExporter to the "csv_exported/" folder under result_path.

    :param path_to_dataset_directory: path to directory containing all repertoire files with .tsv extension in MiXCR format
    :param result_path: where to store the results; a trailing "/" is optional
    :param metadata_path: csv file with columns "filename", "subject_id", "disease"; if None, it is created by
                          generate_random_metadata; any metadata csv file passed to the function must include
                          filename and subject_id columns, and an arbitrary disease column
    :return: encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_genes",
            "allJHitsWithScore": "j_genes"
        },
    }, "mixcr_dataset")

    # label that can be used for ML prediction - by default: "disease" with values True/False
    label_name = list(dataset.params.keys())[0]

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    }), EncoderParams(result_path=result_path,
                      label_config=LabelConfiguration([Label(label_name, dataset.params[label_name])])), False))

    # Append a separator only when result_path does not already end with one.
    # The previous check compared result_path[:-1] (everything but the last character)
    # with '/', so it effectively never detected an existing trailing slash and could
    # drop the separator for a two-character path such as "/a".
    export_root = result_path if result_path.endswith('/') else result_path + '/'

    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=f"{export_root}csv_exported/")
    dataset_exporter.generate_report()

    return encoded_dataset
def test(self):
    # One-hot encodes the test dataset (distance_to_seq_middle=6, no flattening) and
    # checks the vectors stored at examples[i, j] for i, j in {0, 1}.
    tmp_dir = EnvironmentSettings.tmp_test_path + "onehot_sequence_1/"
    PathBuilder.build(tmp_dir)

    dataset, lc = self._construct_test_dataset(tmp_dir)

    encoder_kwargs = {
        "use_positional_info": False,
        "distance_to_seq_middle": 6,
        "flatten": False,
    }
    encoder = OneHotEncoder.build_object(dataset, **encoder_kwargs)

    encoder_params = EncoderParams(result_path=f"{tmp_dir}encoded/",
                                   label_config=lc,
                                   learn_model=True,
                                   model={},
                                   filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, encoder_params)

    self.assertTrue(isinstance(encoded_data, ReceptorDataset))

    onehot_a = [1] + [0] * 19
    onehot_t = [0] * 16 + [1] + [0] * 3
    onehot_empty = [0] * 20

    expected_by_index = [
        ((0, 0), [onehot_a] * 4),
        ((0, 1), [onehot_a, onehot_t, onehot_a, onehot_empty]),
        ((1, 0), [onehot_a, onehot_t, onehot_a, onehot_empty]),
        ((1, 1), [onehot_a, onehot_t, onehot_t, onehot_empty]),
    ]
    for index, expected in expected_by_index:
        actual = [list(item) for item in encoded_data.encoded_data.examples[index]]
        self.assertListEqual(actual, expected)

    shutil.rmtree(tmp_dir)
def test_encode(self):
    # Writes a small receptor fixture to disk, imports it with VDJdbImport as paired
    # TRA/TRB receptors, encodes it with TCRdistEncoder and checks that the encoded
    # examples form a square matrix with one row per example in the dataset.
    #
    # NOTE(review): the fixture below looks like a VDJdb export (header row followed by
    # eight chain records for complex ids 3050, 15760, 3051 and 15761); confirm that
    # the column and row separators in the literal are the ones the importer expects.
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score 3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": 
"NoFixNeeded", "vId": "TRBV5-5*01"} 0 3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 3051 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI 
AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0 15761 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0 3051 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV 
https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0 15761 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0 """
    path = PathBuilder.build(EnvironmentSettings.root_path + "test/tmp/trcdist_encoder/")
    # file_content is a single str, so writelines() iterates over its characters;
    # the text written to receptors.tsv is the literal above, unchanged.
    with open(path + "receptors.tsv", "w") as file:
        file.writelines(file_content)
    # Start from the default "vdjdb" import parameters and override the entries below.
    params = DefaultParamsLoader.load(
        EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
    params["is_repertoire"] = False
    params["paired"] = True
    params["result_path"] = path
    # the import reads from the same directory the fixture was written to
    params["path"] = path
    params["sequence_file_size"] = 1
    params["receptor_chains"] = "TRA_TRB"
    params['organism'] = 'human'
    dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")
    encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
    encoded_dataset = encoder.encode(
        dataset,
        EncoderParams(f"{path}result/", LabelConfiguration([Label("epitope")])))
    # The encoded examples must be square, with as many rows as the dataset has examples.
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == encoded_dataset.encoded_data.examples.shape[1]
                    and encoded_dataset.encoded_data.examples.shape[0] == dataset.get_example_count())
    shutil.rmtree(path)
def test_encode(self):
    """Encode four small repertoires with SequenceCountEncoder and check the result.

    The first two repertoires carry l1=True (the positive class) and are
    expected to encode to 1; the last two carry l1=False and are expected
    to encode to 0. "III" must be listed among the feature names.
    """
    tmp_dir = EnvironmentSettings.tmp_test_path + "count_encoder/"
    PathBuilder.build(tmp_dir)

    sequences_per_repertoire = [
        ["GGG", "III", "LLL", "MMM"],
        ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
        ["CCC", "FFF", "MMM"],
        ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"],
    ]
    repertoires, metadata = RepertoireBuilder.build(
        sequences_per_repertoire,
        labels={"l1": [True, True, False, False]},
        path=tmp_dir,
    )
    dataset = RepertoireDataset(
        repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder_settings = {
        "comparison_attributes": ["sequence_aas"],
        "p_value_threshold": 0.4,
        "sequence_batch_size": 4,
    }
    encoder = SequenceCountEncoder.build_object(dataset, **encoder_settings)

    label_config = LabelConfiguration(
        [Label("l1", [True, False], positive_class=True)])
    encoded_dataset = encoder.encode(
        dataset, EncoderParams(result_path=tmp_dir, label_config=label_config))

    # One expected value per repertoire, in dataset order.
    examples = encoded_dataset.encoded_data.examples
    for row, expected in enumerate((1, 1, 0, 0)):
        self.assertTrue(examples[row] == expected)

    self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

    shutil.rmtree(tmp_dir)
def test_encode(self):
    """Encode a dataset with a Jaccard DistanceEncoder and check the distance matrix.

    The encoded examples must form an 8 x 8 structure indexable via
    ``.iloc``; selected cells and both label vectors are compared against
    fixed expectations.
    """
    tmp_dir = EnvironmentSettings.tmp_test_path + "distance_encoder/"
    PathBuilder.build(tmp_dir)

    dataset = self.create_dataset(tmp_dir)

    encoder_settings = {
        "distance_metric": DistanceMetricType.JACCARD.name,
        "attributes_to_match": ["sequence_aas"],
        "sequence_batch_size": 20,
    }
    encoder = DistanceEncoder.build_object(dataset, **encoder_settings)
    encoder.set_context({"dataset": dataset})

    label_config = LabelConfiguration(
        [Label("l1", [0, 1]), Label("l2", [2, 3])])
    params = EncoderParams(result_path=tmp_dir,
                           label_config=label_config,
                           pool_size=4,
                           filename="dataset.pkl")
    encoded = encoder.encode(dataset, params)

    distances = encoded.encoded_data.examples
    self.assertEqual(8, distances.shape[0])
    self.assertEqual(8, distances.shape[1])

    # Cells expected to hold the value 1.
    for row, col in ((0, 0), (1, 1), (0, 4)):
        self.assertEqual(1, distances.iloc[row, col])

    labels = encoded.encoded_data.labels
    self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], labels["l1"]))
    self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], labels["l2"]))

    shutil.rmtree(tmp_dir)
def test_encode(self):
    """Encode a sub-dataset with DeepRCEncoder and verify the files it writes.

    The encoder is built for the sub-dataset while the main dataset is
    supplied as context. The test checks the example ids, the metadata
    file, and that a per-repertoire .tsv exists for every repertoire of
    the main dataset with matching amino acid sequences.
    """
    path = EnvironmentSettings.tmp_test_path + "deeprc_encoder/"
    PathBuilder.build(path)
    PathBuilder.build(path + "encoded_data/")

    main_dataset, sub_dataset = self.create_datasets(path)

    encoder = DeepRCEncoder.build_object(sub_dataset)
    encoder.set_context({"dataset": main_dataset})

    label_config = LabelConfiguration(
        [Label("l1", [0, 1]), Label("l2", [2, 3])])
    params = EncoderParams(result_path=path + "encoded_data/",
                           label_config=label_config,
                           pool_size=4)
    encoded = encoder.encode(sub_dataset, params)

    self.assertListEqual(encoded.encoded_data.example_ids,
                         sub_dataset.get_repertoire_ids())

    metadata_file = encoded.encoded_data.info["metadata_filepath"]
    self.assertTrue(os.path.isfile(metadata_file))

    metadata_table = pd.read_csv(
        encoded.encoded_data.info["metadata_filepath"], sep="\t")
    self.assertListEqual(list(metadata_table["ID"]),
                         sub_dataset.get_repertoire_ids())

    # Files are checked for the main dataset, not only the encoded subset.
    for repertoire in main_dataset.repertoires:
        rep_path = f"{path}/encoded_data/encoding/{repertoire.identifier}.tsv"
        self.assertTrue(os.path.isfile(rep_path))

        written = pd.read_csv(rep_path, sep="\t")
        self.assertListEqual(list(written["amino_acid"]),
                             list(repertoire.get_sequence_aas()))

    shutil.rmtree(path)
def test_fit(self):
    """Fit, predict with, store and reload an AtchleyKmerMILClassifier.

    A random 10-repertoire dataset is encoded with AtchleyKmerEncoder; the
    test then checks the prediction counts and types, that the predicted
    probabilities sum (rounded) to the repertoire count, and that a stored
    model reloads with the same non-array attributes.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "kmermil/")
    repertoire_count = 10

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(
        repertoire_count=repertoire_count,
        sequence_count_probabilities={2: 1},
        sequence_length_probabilities={4: 1},
        labels={"l1": {True: 0.5, False: 0.5}},
        path=path + "dataset/",
    )

    encoder = AtchleyKmerEncoder(2, 1, 1, 'relative_abundance', False)
    encoder_params = EncoderParams(
        path + "result/", LabelConfiguration([Label("l1", [True, False])]))
    enc_dataset = encoder.encode(dataset, encoder_params)
    encoded_data = enc_dataset.encoded_data

    # Both the fitted and the reloaded classifier use identical settings.
    classifier_settings = dict(iteration_count=10,
                               threshold=-0.0001,
                               evaluate_at=2,
                               use_early_stopping=False,
                               random_seed=1,
                               learning_rate=0.01,
                               zero_abundance_weight_init=True,
                               number_of_threads=8)

    cls = AtchleyKmerMILClassifier(**classifier_settings)
    cls.fit(encoded_data, "l1")

    predictions = cls.predict(encoded_data, "l1")
    self.assertEqual(repertoire_count, len(predictions["l1"]))
    bool_prediction_count = sum(
        1 for pred in predictions["l1"] if isinstance(pred, bool))
    self.assertEqual(repertoire_count, bool_prediction_count)

    predictions_proba = cls.predict_proba(encoded_data, "l1")
    self.assertEqual(repertoire_count,
                     np.rint(np.sum(predictions_proba["l1"])))
    self.assertEqual(repertoire_count, predictions_proba["l1"].shape[0])

    cls.store(path + "model_storage/",
              feature_names=encoded_data.feature_names)

    cls2 = AtchleyKmerMILClassifier(**classifier_settings)
    cls2.load(path + "model_storage/")

    # vars() exposes each instance's attribute dict; the
    # "logistic_regression" entry is removed from both before comparing.
    reloaded_attrs = vars(cls2)
    reloaded_attrs.pop("logistic_regression")
    fitted_attrs = vars(cls)
    fitted_attrs.pop("logistic_regression")

    for attr_name, fitted_value in fitted_attrs.items():
        if isinstance(fitted_value, np.ndarray):
            continue  # array attributes are not compared
        self.assertEqual(fitted_value, reloaded_attrs[attr_name])

    model = cls.get_model("l1")
    self.assertEqual(vars(cls), model)

    shutil.rmtree(path)
def test_fit(self):
    """Fit, predict with, store and reload a ReceptorCNN.

    A random 500-receptor dataset with a balanced "CMV" label is one-hot
    encoded; the test then checks the prediction counts and types, that
    the predicted probabilities sum (rounded) to the receptor count, and
    that a stored model reloads with the same non-array attributes.
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "cnn/")

    dataset = RandomDatasetGenerator.generate_receptor_dataset(
        receptor_count=500,
        chain_1_length_probabilities={4: 1},
        chain_2_length_probabilities={4: 1},
        labels={"CMV": {True: 0.5, False: 0.5}},
        path=path + "dataset/",
    )

    encoder = OneHotReceptorEncoder(True, 1, False, "enc1")
    encoder_params = EncoderParams(
        path + "result/", LabelConfiguration([Label("CMV", [True, False])]))
    enc_dataset = encoder.encode(dataset, encoder_params)
    encoded_data = enc_dataset.encoded_data

    cnn = ReceptorCNN(kernel_count=2,
                      kernel_size=[3],
                      positional_channels=3,
                      sequence_type="amino_acid",
                      device="cpu",
                      number_of_threads=4,
                      random_seed=1,
                      learning_rate=0.01,
                      iteration_count=10,
                      l1_weight_decay=0.1,
                      evaluate_at=5,
                      batch_size=100,
                      training_percentage=0.8,
                      l2_weight_decay=0.0)
    cnn.fit(encoded_data=encoded_data, label_name="CMV")

    predictions = cnn.predict(encoded_data, "CMV")
    self.assertEqual(500, len(predictions["CMV"]))
    bool_prediction_count = sum(
        1 for pred in predictions["CMV"] if isinstance(pred, bool))
    self.assertEqual(500, bool_prediction_count)

    predictions_proba = cnn.predict_proba(encoded_data, "CMV")
    self.assertEqual(500, np.rint(np.sum(predictions_proba["CMV"])))
    self.assertEqual(500, predictions_proba["CMV"].shape[0])

    cnn.store(path + "model_storage/")

    cnn2 = ReceptorCNN(sequence_type="amino_acid")
    cnn2.load(path + "model_storage/")

    # vars() exposes each instance's attribute dict; the "CNN" entry is
    # removed from both before the attribute-by-attribute comparison.
    reloaded_attrs = vars(cnn2)
    reloaded_attrs.pop("CNN")
    fitted_attrs = vars(cnn)
    fitted_attrs.pop("CNN")

    for attr_name, fitted_value in fitted_attrs.items():
        if isinstance(fitted_value, np.ndarray):
            continue  # array attributes are not compared
        self.assertEqual(fitted_value, reloaded_attrs[attr_name])

    model = cnn.get_model(["CMV"])
    self.assertEqual(vars(cnn), model)

    shutil.rmtree(path)
def test_encode(self):
    """Encode two small repertoires with three KmerFrequencyEncoder configurations.

    The configurations differ in sequence encoding (identity vs. continuous
    k-mer) and normalization (relative frequency vs. binary). The temporary
    directory is removed before the assertions, which only inspect the
    in-memory results.
    """
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/kmerfreqenc/"
    PathBuilder.build(tmp_dir)

    rep1 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", identifier="1"),
         ReceptorSequence("ATA", identifier="2"),
         ReceptorSequence("ATA", identifier='3')],
        metadata={"l1": 1, "l2": 2, "subject_id": "1"},
        path=tmp_dir)
    rep2 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("ATA", identifier="1"),
         ReceptorSequence("TAA", identifier="2"),
         ReceptorSequence("AAC", identifier="3")],
        metadata={"l1": 0, "l2": 3, "subject_id": "2"},
        path=tmp_dir)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    # Settings shared by all three encoders; each adds its own
    # normalization type and sequence encoding on top.
    shared_settings = {"reads": ReadsType.UNIQUE.name, "k": 3}

    identity_encoder = KmerFrequencyEncoder.build_object(dataset, **{
        **shared_settings,
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "sequence_encoding": SequenceEncodingType.IDENTITY.name,
    })
    d1 = identity_encoder.encode(dataset, EncoderParams(
        result_path=tmp_dir + "1/",
        label_config=lc,
        learn_model=True,
        model={},
        filename="dataset.pkl",
    ))

    kmer_encoder = KmerFrequencyEncoder.build_object(dataset, **{
        **shared_settings,
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
    })
    d2 = kmer_encoder.encode(dataset, EncoderParams(
        result_path=tmp_dir + "2/",
        label_config=lc,
        pool_size=2,
        learn_model=True,
        model={},
        filename="dataset.csv",
    ))

    binary_encoder = KmerFrequencyEncoder.build_object(dataset, **{
        **shared_settings,
        "normalization_type": NormalizationType.BINARY.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
    })
    d3 = binary_encoder.encode(dataset, EncoderParams(
        result_path=tmp_dir + "3/",
        label_config=lc,
        learn_model=True,
        model={},
        filename="dataset.pkl",
    ))

    shutil.rmtree(tmp_dir)

    self.assertTrue(isinstance(d1, RepertoireDataset))
    self.assertTrue(isinstance(d2, RepertoireDataset))
    self.assertEqual(0.67, np.round(d2.encoded_data.examples[0, 2], 2))
    self.assertEqual(0.0, np.round(d3.encoded_data.examples[0, 1], 2))
    self.assertTrue(isinstance(kmer_encoder, KmerFrequencyEncoder))
def test_encode(self):
    """Encode two repertoires with EvennessProfileEncoder and check profile values.

    The first repertoire mixes sequence counts of 10, 100 and 1 (1000
    sequences each); the second has 1000 sequences that all have count 10.
    Only the 51-dimension encoding is asserted on; the 11-dimension
    encoding with pool_size=2 is run but its result is not inspected.
    """
    tmp_dir = EnvironmentSettings.root_path + "test/tmp/evennessenc/"
    PathBuilder.build(tmp_dir)

    # 1000 sequences per count value, in the order 10, 100, 1.
    mixed_count_sequences = [
        ReceptorSequence("AAA", metadata=SequenceMetadata(count=count))
        for count in (10, 100, 1)
        for _ in range(1000)
    ]
    rep1 = Repertoire.build_from_sequence_objects(
        sequence_objects=mixed_count_sequences,
        metadata={"l1": "test_1", "l2": 2},
        path=tmp_dir)

    uniform_count_sequences = [
        ReceptorSequence("AAA", metadata=SequenceMetadata(count=10))
        for _ in range(1000)
    ]
    rep2 = Repertoire.build_from_sequence_objects(
        sequence_objects=uniform_count_sequences,
        metadata={"l1": "test_2", "l2": 3},
        path=tmp_dir)

    lc = LabelConfiguration()
    lc.add_label("l1", ["test_1", "test_2"])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    fine_encoder = EvennessProfileEncoder.build_object(
        dataset, **{"min_alpha": 0, "max_alpha": 10, "dimension": 51})
    d1 = fine_encoder.encode(
        dataset, EncoderParams(result_path=tmp_dir + "1/", label_config=lc))

    coarse_encoder = EvennessProfileEncoder.build_object(
        dataset, **{"min_alpha": 0, "max_alpha": 10, "dimension": 11})
    # Run for its side effects only; the returned dataset is not checked.
    coarse_encoder.encode(
        dataset,
        EncoderParams(result_path=tmp_dir, label_config=lc, pool_size=2))

    profile = d1.encoded_data.examples
    self.assertAlmostEqual(profile[0, 0], 1)
    self.assertAlmostEqual(profile[0, 1], 0.786444)
    self.assertAlmostEqual(profile[1, 0], 1)
    self.assertAlmostEqual(profile[1, 1], 1)

    shutil.rmtree(tmp_dir)