def create_dataset(self, path, dataset_size: int = 50):
    """Pickle `dataset_size` dummy receptor sequences and wrap them in a SequenceDataset.

    Even-indexed sequences get amino acids "AAACCC" with label l1=1; odd-indexed get
    "ACACAC" with l1=2, so the dataset is perfectly separable on l1.

    :param path: directory in which to store the pickled sequence file
    :param dataset_size: number of sequences to generate
    :return: a SequenceDataset with params {"l1": [1, 2]} and identifier "d1"
    """
    sequences = []
    for i in range(dataset_size):
        # alternate sequence content and label value between the two classes
        is_even = i % 2 == 0
        sequences.append(ReceptorSequence(amino_acid_sequence="AAACCC" if is_even else "ACACAC",
                                          identifier=str(i),
                                          metadata=SequenceMetadata(custom_params={"l1": 1 if is_even else 2})))

    PathBuilder.build(path)
    filename = "{}sequences.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # NOTE: the original also built a LabelConfiguration here that was never used; it was removed.
    dataset = SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
    return dataset
def test_encode_sequence(self):
    """IdentitySequenceEncoder returns the raw sequence string regardless of frame type."""
    for frame_type in ["OUT", "STOP", "IN"]:
        sequence = ReceptorSequence(amino_acid_sequence="AAA",
                                    metadata=SequenceMetadata(frame_type=frame_type))
        encoder = IdentitySequenceEncoder()
        encoded = encoder.encode_sequence(sequence,
                                          EncoderParams(model={},
                                                        label_config=LabelConfiguration(),
                                                        result_path=""))
        self.assertEqual(["AAA"], encoded)
def _create_label_config(self, instruction: dict, dataset: Dataset, instruction_key: str) -> LabelConfiguration:
    """Build a LabelConfiguration for the instruction's labels.

    Label values are recovered from dataset.params when available, otherwise from the
    dataset metadata; if neither source exists, an empty list is used and a warning issued.

    :param instruction: parsed instruction specification containing a "labels" entry
    :param dataset: dataset from which label values are recovered
    :param instruction_key: name of the instruction (used in error/warning messages)
    :return: populated LabelConfiguration
    """
    labels = instruction["labels"]
    self._check_label_format(labels, instruction_key)

    label_config = LabelConfiguration()
    for label in labels:
        # a label is either a bare name (str) or a one-key dict carrying extra settings
        if isinstance(label, str):
            label_name = label
            positive_class = None
        else:
            label_name = list(label.keys())[0]
            positive_class = label[label_name]['positive_class']

        if dataset.params is not None and label_name in dataset.params:
            label_values = dataset.params[label_name]
        elif hasattr(dataset, "get_metadata"):
            label_values = list(set(dataset.get_metadata([label_name])[label_name]))
        else:
            label_values = []
            warnings.warn(
                f"{TrainMLModelParser.__name__}: for instruction {instruction_key}, label values could not be recovered for label "
                f"{label}, using empty list instead. This could cause problems with some encodings. "
                f"If that might be the case, check if the dataset {dataset.name} has been properly loaded.")

        label_config.add_label(label_name, label_values, positive_class=positive_class)

    return label_config
def test(self):
    """End-to-end run: k-mer frequency encoding + logistic regression inside TrainMLModel."""
    path = EnvironmentSettings.tmp_test_path + "integration_sequence_classification/"
    dataset = self.create_dataset(path)

    os.environ["cache_type"] = "test"

    encoder_params = {"normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
                      "reads": ReadsType.UNIQUE.name,
                      "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
                      "k": 3}

    hp_setting = HPSetting(encoder=KmerFrequencyEncoder.build_object(dataset, **encoder_params),
                           encoder_params=encoder_params,
                           ml_method=LogisticRegression(),
                           ml_params={"model_selection_cv": False, "model_selection_n_folds": -1},
                           preproc_sequence=[])

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])

    instruction = TrainMLModelInstruction(dataset, GridSearch([hp_setting]), [hp_setting],
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          SplitConfig(SplitType.RANDOM, 1, 0.5, reports=ReportConfig()),
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)
    instruction.run(result_path=path)

    shutil.rmtree(path)
def test_encode_sequence(self):
    """KmerSequenceEncoder produces all overlapping 3-mers; too-short sequences yield None."""
    sequence = ReceptorSequence(amino_acid_sequence="CASSVFRTY")
    result = KmerSequenceEncoder.encode_sequence(
        sequence, EncoderParams(model={"k": 3}, label_config=LabelConfiguration(),
                                result_path="", pool_size=4))

    for kmer in ("CAS", "ASS", "SSV", "SVF", "VFR", "FRT", "RTY"):
        self.assertTrue(kmer in result)
    self.assertEqual(7, len(result))

    # a sequence shorter than k cannot be encoded
    too_short = KmerSequenceEncoder.encode_sequence(
        ReceptorSequence(amino_acid_sequence="AC"),
        EncoderParams(model={"k": 3}, label_config=LabelConfiguration(),
                      result_path="", pool_size=4))
    self.assertEqual(too_short, None)
def __init__(self, split_index: int, train_val_dataset, test_dataset, path: str,
             label_configuration: LabelConfiguration):
    """Hold the state of one assessment split: its datasets, reports and per-label HP states.

    :param split_index: index of this assessment split
    :param train_val_dataset: dataset used for training and selection
    :param test_dataset: held-out dataset for this split
    :param path: directory where this split's results are stored
    :param label_configuration: labels for which HPLabelState objects are created
    """
    self.split_index = split_index
    self.train_val_dataset = train_val_dataset
    self.test_dataset = test_dataset
    self.path = path
    self.train_val_data_reports = []
    self.test_data_reports = []

    # computed: one state object per label, carrying that label's auxiliary labels
    self.label_states = {}
    for label in label_configuration.get_labels_by_name():
        self.label_states[label] = HPLabelState(label, label_configuration.get_auxiliary_labels(label))
def _construct_test_repertoiredataset(self, path, positional):
    """Build a two-repertoire dataset plus a matching LabelConfiguration.

    :param path: directory where repertoire files are stored
    :param positional: if True, use long homogeneous sequences (for positional encodings);
        otherwise use short mixed sequences
    :return: (RepertoireDataset, LabelConfiguration) tuple
    """
    receptors1 = ReceptorSequenceList()
    receptors2 = ReceptorSequenceList()

    if positional:
        seqs1 = [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                 ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]
        seqs2 = [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]
    else:
        seqs1 = [ReceptorSequence("AAAA", identifier="1"),
                 ReceptorSequence("ATA", identifier="2"),
                 ReceptorSequence("ATA", identifier="3")]
        seqs2 = [ReceptorSequence("ATA", identifier="1"),
                 ReceptorSequence("TAA", identifier="2")]

    # plain loops instead of the original list comprehensions used only for their side effect
    for seq in seqs1:
        receptors1.append(seq)
    for seq in seqs2:
        receptors2.append(seq)

    rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                  metadata={"l1": 1, "l2": 2, "subject_id": "1"},
                                                  path=path)
    rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                  metadata={"l1": 0, "l2": 3, "subject_id": "2"},
                                                  path=path)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])
    return dataset, lc
def construct_test_flatten_dataset(self, path):
    """Pickle two labeled dummy sequences and return them as a SequenceDataset.

    :param path: directory in which to store the pickled sequence file
    :return: SequenceDataset with params {"l1": [1, 2]} and identifier "d1"
    """
    sequences = [ReceptorSequence(amino_acid_sequence="AAATTT", identifier="1",
                                  metadata=SequenceMetadata(custom_params={"l1": 1})),
                 ReceptorSequence(amino_acid_sequence="ATATAT", identifier="2",
                                  metadata=SequenceMetadata(custom_params={"l1": 2}))]

    PathBuilder.build(path)
    filename = "{}sequences.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(sequences, file)

    # NOTE: the original also built a LabelConfiguration here that was never used; it was removed.
    return SequenceDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
def test_run(self):
    # Integration test: run an ExploratoryAnalysisInstruction with a MatchedSequences
    # encoder and a DesignMatrixExporter report, then verify the design matrix CSV exists.
    path = EnvironmentSettings.tmp_test_path + "explanalysisprocintegration/"
    PathBuilder.build(path)
    os.environ["cache_type"] = "test"

    dataset = self.create_dataset(path)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [0, 1])
    label_config.add_label("l2", [2, 3])

    # minimal VDJdb-formatted reference file: header plus one TRA record
    # NOTE(review): fields reconstructed as tab-separated (VDJdb export format) — confirm against original file
    file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
100a	TRA	AAAC	TRAV12	TRAJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV
"""

    with open(path + "refs.tsv", "w") as file:
        file.writelines(file_content)

    refs = {"params": {"path": path + "refs.tsv", "region_type": "FULL_SEQUENCE"}, "format": "VDJdb"}

    # single analysis unit matching repertoire sequences against the reference file
    units = {"named_analysis_4": ExploratoryAnalysisUnit(dataset=dataset, report=DesignMatrixExporter(),
                                                         label_config=label_config,
                                                         encoder=MatchedSequencesRepertoireEncoder.build_object(
                                                             dataset, **{"max_edit_distance": 1,
                                                                         "reference": refs}))}

    process = ExploratoryAnalysisInstruction(units, name="exp")
    process.run(path + "results/")

    self.assertTrue(os.path.isfile(path + "results/exp/analysis_named_analysis_4/report/design_matrix.csv"))

    shutil.rmtree(path)
def test_sequence_flattened(self):
    """Flattened one-hot sequence encoding: check example values and feature names."""
    path = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"
    PathBuilder.build(path)

    dataset = self.construct_test_flatten_dataset(path)

    encoder = OneHotEncoder.build_object(dataset, **{"use_positional_info": False,
                                                     "distance_to_seq_middle": None,
                                                     "flatten": True})
    params = EncoderParams(result_path=path,
                           label_config=LabelConfiguration([Label(name="l1", values=[1, 0],
                                                                  positive_class="1")]),
                           pool_size=1, learn_model=True, model={}, filename="dataset.pkl")
    encoded_data = encoder.encode(dataset, params)

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    # one-hot vectors over the 20-letter amino acid alphabet: A is index 0, T is index 16
    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0] * 3

    self.assertListEqual(list(encoded_data.encoded_data.examples[0]),
                         onehot_a + onehot_a + onehot_a + onehot_t + onehot_t + onehot_t)
    self.assertListEqual(list(encoded_data.encoded_data.examples[1]),
                         onehot_a + onehot_t + onehot_a + onehot_t + onehot_a + onehot_t)

    expected_names = [f"{pos}_{char}" for pos in range(6)
                      for char in EnvironmentSettings.get_sequence_alphabet()]
    self.assertListEqual(list(encoded_data.encoded_data.feature_names), expected_names)

    shutil.rmtree(path)
def test_encode(self):
    """SequenceAbundance encoding: expected example matrices for two p-value thresholds."""
    path = EnvironmentSettings.tmp_test_path + "abundance_encoder/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build(
        [["GGG", "III", "LLL", "MMM"],
         ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
         ["CCC", "FFF", "MMM"],
         ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]],
        labels={"l1": [True, True, False, False]}, path=path)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceAbundanceEncoder.build_object(dataset, **{"comparison_attributes": ["sequence_aas"],
                                                                "p_value_threshold": 0.4,
                                                                "sequence_batch_size": 4,
                                                                "repertoire_batch_size": 8})

    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])

    encoded = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    self.assertTrue(np.array_equal(np.array([[1, 4], [1, 6], [0, 3], [0, 6]]),
                                   encoded.encoded_data.examples))

    # with a stricter threshold, fewer sequences count as label-associated
    encoder.p_value_threshold = 0.05
    encoded = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))
    self.assertTrue(np.array_equal(np.array([[0, 4], [0, 6], [0, 3], [0, 6]]),
                                   encoded.encoded_data.examples))

    shutil.rmtree(path)
def _prepare_optional_params(self, analysis: dict, symbol_table: SymbolTable) -> dict:
    """Collect the optional settings (encoder, labels, preprocessing, process count) of one analysis.

    :param analysis: parsed analysis specification
    :param symbol_table: symbol table holding datasets, encodings and preprocessing sequences
    :return: dict of optional parameters for the analysis unit
    :raises KeyError: if exactly one of "encoding"/"labels" is specified (they must come together)
    """
    params = {}
    dataset = symbol_table.get(analysis["dataset"])

    has_encoding = "encoding" in analysis
    has_labels = "labels" in analysis

    if has_encoding and has_labels:
        encoder_config = symbol_table.get_config(analysis["encoding"])["encoder_params"]
        params["encoder"] = symbol_table.get(analysis["encoding"]).build_object(dataset, **encoder_config)

        params["label_config"] = LabelConfiguration()
        for label in analysis["labels"]:
            params["label_config"].add_label(label, self._get_label_values(label, dataset))
    elif has_encoding or has_labels:
        # one without the other is an invalid specification
        raise KeyError("ExploratoryAnalysisParser: keys for analyses are not properly defined. "
                       "If encoding is defined, labels have to be defined as well and vice versa.")

    if "preprocessing_sequence" in analysis:
        params["preprocessing_sequence"] = symbol_table.get(analysis["preprocessing_sequence"])

    if "number_of_processes" in analysis:
        params["number_of_processes"] = analysis["number_of_processes"]

    return params
def test_run(self):
    """DataEncoder.run with a Word2Vec encoder returns an encoded RepertoireDataset."""
    path = EnvironmentSettings.root_path + "test/tmp/dataencoder/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                                  metadata={"l1": 1, "l2": 2}, path=path)
    rep2 = Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                                  metadata={"l1": 0, "l2": 3}, path=path)

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])
    label_config.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    encoder = Word2VecEncoder.build_object(dataset, **{"k": 3,
                                                       "model_type": ModelType.SEQUENCE.name,
                                                       "vector_size": 6})

    encoder_params = EncoderParams(model={}, pool_size=2, label_config=label_config,
                                   result_path=path, filename="dataset.csv")
    res = DataEncoder.run(DataEncoderParams(dataset=dataset, encoder=encoder,
                                            encoder_params=encoder_params,
                                            store_encoded_data=False))

    self.assertTrue(isinstance(res, RepertoireDataset))
    self.assertTrue(res.encoded_data.examples.shape[0] == 2)

    shutil.rmtree(path)
def test_encode(self):
    """Word2Vec repertoire encoding: check shape, label propagation and encoder subclass."""
    test_path = EnvironmentSettings.root_path + "test/tmp/w2v/"
    PathBuilder.build(test_path)

    sequence_a = ReceptorSequence("CASSVFA", identifier="1")
    sequence_b = ReceptorSequence("CASSCCC", identifier="2")

    rep1 = Repertoire.build_from_sequence_objects([sequence_a, sequence_b], test_path,
                                                  {"T1D": "T1D", "subject_id": "1"})
    rep2 = Repertoire.build_from_sequence_objects([sequence_a], test_path,
                                                  {"T1D": "CTL", "subject_id": "2"})

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    label_configuration = LabelConfiguration()
    label_configuration.add_label("T1D", ["T1D", "CTL"])

    config_params = EncoderParams(model={}, learn_model=True, result_path=test_path,
                                  label_config=label_configuration, filename="dataset.pkl")

    encoder = Word2VecEncoder.build_object(dataset, **{"k": 3, "model_type": "sequence",
                                                       "vector_size": 16})
    encoded_dataset = encoder.encode(dataset=dataset, params=config_params)

    self.assertIsNotNone(encoded_dataset.encoded_data)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == 2)
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[1] == 16)
    self.assertTrue(len(encoded_dataset.encoded_data.labels["T1D"]) == 2)
    self.assertTrue(encoded_dataset.encoded_data.labels["T1D"][0] == "T1D")
    # build_object should have selected the repertoire-level Word2Vec encoder
    self.assertTrue(isinstance(encoder, W2VRepertoireEncoder))

    shutil.rmtree(test_path)
def test_run(self):
    """Train a model manually, then apply it via MLApplicationInstruction and check predictions."""
    path = EnvironmentSettings.tmp_test_path + "mlapplicationtest/"
    PathBuilder.build(path)

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(50, {5: 1}, {5: 1},
                                                                 {"l1": {1: 0.5, 2: 0.5}},
                                                                 path + 'dataset/')

    ml_method = LogisticRegression()
    encoder = KmerFreqRepertoireEncoder(NormalizationType.RELATIVE_FREQUENCY, ReadsType.UNIQUE,
                                        SequenceEncodingType.CONTINUOUS_KMER, 3,
                                        scale_to_zero_mean=True, scale_to_unit_variance=True)
    label_config = LabelConfiguration([Label("l1", [1, 2])])

    enc_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config,
                                                        filename="tmp_enc_dataset.pickle", pool_size=4))
    ml_method.fit(enc_dataset.encoded_data, 'l1')

    encoder_params = {"normalization_type": "relative_frequency", "reads": "unique",
                      "sequence_encoding": "continuous_kmer", "k": 3,
                      "scale_to_zero_mean": True, "scale_to_unit_variance": True}
    hp_setting = HPSetting(encoder, encoder_params, ml_method, {}, [], 'enc1', 'ml1')

    # the instruction expects the fitted vectorizer/scaler artefacts inside its result folder
    PathBuilder.build(path + 'result/instr1/')
    for artefact in ('dict_vectorizer.pickle', 'scaler.pickle'):
        shutil.copy(path + artefact, path + 'result/instr1/' + artefact)

    ml_app = MLApplicationInstruction(dataset, label_config, hp_setting, 4, "instr1", False)
    ml_app.run(path + 'result/')

    predictions_path = path + "result/instr1/predictions.csv"
    self.assertTrue(os.path.isfile(predictions_path))

    df = pd.read_csv(predictions_path)
    self.assertEqual(50, df.shape[0])

    shutil.rmtree(path)
def test_encode(self):
    """Atchley k-mer encoding yields a (examples, kmers, features) matrix with zero padding."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "atchley_kmer_encoding/")

    dataset = RandomDatasetGenerator.generate_repertoire_dataset(3, {1: 1}, {4: 1},
                                                                 {"l1": {True: 0.4, False: 0.6}},
                                                                 path + "dataset/")

    encoder_config = {"k": 2, "skip_first_n_aa": 1, "skip_last_n_aa": 1,
                      "abundance": "RELATIVE_ABUNDANCE", "normalize_all_features": False}
    encoder = AtchleyKmerEncoder.build_object(dataset, **encoder_config)

    encoded_dataset = encoder.encode(dataset, EncoderParams(path + "result/",
                                                            LabelConfiguration(labels=[Label("l1")])))

    self.assertEqual((3, 11, 3), encoded_dataset.encoded_data.examples.shape)
    # padded entries carry zero abundance
    self.assertEqual(0., encoded_dataset.encoded_data.examples[0, -1, 0])

    shutil.rmtree(path)
def run_setting(state: TrainMLModelState, hp_setting, train_dataset, val_dataset, split_index: int,
                current_path: str, label: str, assessment_index: int):
    # Train and evaluate one hyperparameter setting on a single selection split,
    # record the resulting HPItem in the corresponding selection state, and return
    # the setting's performance on the optimization metric (None if unavailable).
    hp_item = MLProcess(train_dataset=train_dataset, test_dataset=val_dataset,
                        encoding_reports=state.selection.reports.encoding_reports.values(),
                        # restrict the process to the single label being optimized
                        label_config=LabelConfiguration([state.label_configuration.get_label_object(label)]),
                        report_context=state.context, number_of_processes=state.number_of_processes,
                        metrics=state.metrics, optimization_metric=state.optimization_metric,
                        ml_reports=state.selection.reports.model_reports.values(), label=label,
                        path=current_path, hp_setting=hp_setting,
                        store_encoded_data=state.store_encoded_data)\
        .run(split_index)

    # store the item so that later model selection can compare all settings for this label
    state.assessment_states[assessment_index].label_states[label].selection_state.hp_items[hp_setting.get_key()].append(hp_item)

    return hp_item.performance[state.optimization_metric.name.lower()] if hp_item.performance is not None else None
def test_run(self):
    """MLMethodAssessment.run computes metrics and writes score/prediction CSV files."""
    base_path = EnvironmentSettings.root_path + "test/tmp/mlmethodassessment/"
    PathBuilder.build(base_path)

    # 12 repertoires alternating ["AA"] / ["CC"]
    repertoire_content = [["AA"] if i % 2 == 0 else ["CC"] for i in range(12)]
    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(repertoire_content, base_path)[0])

    dataset.encoded_data = EncodedData(
        examples=np.array([[1, 1], [1, 1], [3, 3]] * 4),
        labels={"l1": [1, 1, 3] * 4, "l2": [1, 2, 3] * 4}
    )

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 3])

    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label_name='l1')

    res = MLMethodAssessment.run(MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=base_path + "predictions.csv",
        label="l1",
        ml_score_path=base_path + "ml_score.csv",
        split_index=1,
        path=base_path
    ))

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    self.assertTrue(os.path.isfile(base_path + "ml_score.csv"))
    self.assertTrue(pd.read_csv(base_path + "ml_score.csv").shape[0] == 1)
    self.assertEqual(12, pd.read_csv(base_path + "predictions.csv").shape[0])

    shutil.rmtree(base_path)
def test_run(self):
    """Run a TrainMLModel instruction through SemanticModel on a 32-repertoire dummy dataset."""
    path = EnvironmentSettings.root_path + "test/tmp/smmodel/"
    PathBuilder.build(path)

    # 32 repertoires alternating ["AAA", "CCC"] / ["TTTT"], labeled 1 / 2 accordingly
    sequences = [["AAA", "CCC"] if i % 2 == 0 else ["TTTT"] for i in range(32)]
    default_labels = [1 if i % 2 == 0 else 2 for i in range(32)]
    repertoires, metadata = RepertoireBuilder.build(sequences, path, {"default": default_labels})

    dataset = RepertoireDataset(repertoires=repertoires, params={"default": [1, 2]},
                                metadata_file=metadata)

    label_config = LabelConfiguration()
    label_config.add_label("default", [1, 2])

    enc_params = {"vector_size": 8, "model_type": ModelType.SEQUENCE.name, "k": 3}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    split_config_assessment = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())
    split_config_selection = SplitConfig(SplitType.RANDOM, 1, 0.5, ReportConfig())

    instruction = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                          split_config_assessment, split_config_selection,
                                          {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                          label_config, path)

    SemanticModel([instruction], path).run()

    shutil.rmtree(path)
def test_encode_sequence(self):
    """Gapped k-mer encoding (k_left=1, max_gap=1): ungapped and single-gap IMGT-positioned k-mers."""
    def encode(amino_acids):
        # helper: encode one sequence with the shared gapped-kmer parameters
        return IMGTGappedKmerEncoder.encode_sequence(
            ReceptorSequence(amino_acids, None, None),
            EncoderParams(model={"k_left": 1, "max_gap": 1},
                          label_config=LabelConfiguration(), result_path=""))

    kmers = encode("AHCDE")
    self.assertEqual({'AH///105', 'HC///106', 'CD///107', 'DE///116',
                      'A.C///105', 'H.D///106', 'C.E///107'},
                     set(kmers))

    kmers = encode("CASSPRERATYEQCAY")
    self.assertEqual({'CA///105', 'AS///106', 'SS///107', 'SP///108', 'PR///109',
                      'RE///110', 'ER///111', 'RA///111.001', 'AT///112.002',
                      'TY///112.001', 'YE///112', 'EQ///113', 'QC///114', 'CA///115',
                      'AY///116', 'C.S///105', 'A.S///106', 'S.P///107', 'S.R///108',
                      'P.E///109', 'R.R///110', 'E.A///111', 'R.T///111.001',
                      'A.Y///112.002', 'T.E///112.001', 'Y.Q///112', 'E.C///113',
                      'Q.A///114', 'C.Y///115'},
                     set(kmers))
def test_encode_sequence(self):
    """IMGT k-mer encoding annotates k-mers with IMGT positions; k longer than the sequence gives None."""
    def encode(amino_acids, k):
        # helper: encode one sequence for a given k
        return IMGTKmerSequenceEncoder.encode_sequence(
            ReceptorSequence(amino_acids, None, None),
            EncoderParams(model={"k": k}, label_config=LabelConfiguration(), result_path=""))

    long_seq = "CASSPRERATYEQCASSPRERATYEQCASSPRERATYEQ"
    result = encode(long_seq, 3)
    self.assertEqual({'CAS///105', 'ASS///106', 'SSP///107', 'SPR///108', 'PRE///109',
                      'RER///110', 'ERA///111', 'RAT///111.001', 'ATY///111.002',
                      'TYE///111.003', 'YEQ///111.004', 'EQC///111.005', 'QCA///111.006',
                      'CAS///111.007', 'ASS///111.008', 'SSP///111.009', 'SPR///111.01',
                      'PRE///111.011', 'RER///111.012', 'ERA///111.013', 'RAT///112.013',
                      'ATY///112.012', 'TYE///112.011', 'YEQ///112.01', 'EQC///112.009',
                      'QCA///112.008', 'CAS///112.007', 'ASS///112.006', 'SSP///112.005',
                      'SPR///112.004', 'PRE///112.003', 'RER///112.002', 'ERA///112.001',
                      'RAT///112', 'ATY///113', 'TYE///114', 'YEQ///115'},
                     set(result))
    self.assertEqual(len(result), len(long_seq) - 3 + 1)

    short_seq = "AHCDE"
    result = encode(short_seq, 3)
    self.assertEqual({'AHC///105', 'HCD///106', 'CDE///107'}, set(result))
    self.assertEqual(len(result), len(short_seq) - 3 + 1)

    # k larger than the sequence length cannot produce any k-mers
    self.assertEqual(encode(short_seq, 25), None)
def test_generate(self):
    """Import a paired-chain dataset, TCRdist-encode it and generate the motif discovery report."""
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path + "tcrdist_motif_discovery/")
    dataset_path = self._create_dataset(path)

    import_params = {
        "path": dataset_path,
        "result_path": path + "dataset/",
        "separator": ",",
        "columns_to_load": ["subject", "epitope", "count", "v_a_gene", "j_a_gene", "cdr3_a_aa",
                            "v_b_gene", "j_b_gene", "cdr3_b_aa", "clone_id", "cdr3_a_nucseq",
                            "cdr3_b_nucseq"],
        "column_mapping": {
            "cdr3_a_aa": "alpha_amino_acid_sequence",
            "cdr3_b_aa": "beta_amino_acid_sequence",
            "cdr3_a_nucseq": "alpha_nucleotide_sequence",
            "cdr3_b_nucseq": "beta_nucleotide_sequence",
            "v_a_gene": "alpha_v_gene",
            "v_b_gene": "beta_v_gene",
            "j_a_gene": "alpha_j_gene",
            "j_b_gene": "beta_j_gene",
            "clone_id": "identifier"
        },
        "receptor_chains": "TRA_TRB",
        "region_type": "IMGT_CDR3",
        "sequence_file_size": 50000,
        "organism": "mouse"
    }
    dataset = SingleLineReceptorImport.import_dataset(import_params, 'd1')

    dataset = TCRdistEncoder(8).encode(dataset,
                                       EncoderParams(f"{path}result/",
                                                     LabelConfiguration([Label("epitope")])))

    report = TCRdistMotifDiscovery(dataset, path + "report/", "report name", 8)
    report.generate_report()

    shutil.rmtree(path)
def _construct_test_dataset(self, path, dataset_size: int = 50):
    """Pickle two dummy paired-chain receptors (labels l1=1, l1=2) and build a ReceptorDataset.

    Returns a (ReceptorDataset, LabelConfiguration) tuple.
    """
    receptor_one = TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="AAAA"),
                                beta=ReceptorSequence(amino_acid_sequence="ATA"),
                                metadata={"l1": 1}, identifier="1")
    receptor_two = TCABReceptor(alpha=ReceptorSequence(amino_acid_sequence="ATA"),
                                beta=ReceptorSequence(amino_acid_sequence="ATT"),
                                metadata={"l1": 2}, identifier="2")
    receptors = [receptor_one, receptor_two]

    PathBuilder.build(path)
    filename = "{}receptors.pkl".format(path)
    with open(filename, "wb") as file:
        pickle.dump(receptors, file)

    lc = LabelConfiguration()
    lc.add_label("l1", [1, 2])

    dataset = ReceptorDataset(params={"l1": [1, 2]}, filenames=[filename], identifier="d1")
    return dataset, lc
def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: str) -> MLApplicationInstruction:
    """Validate an MLApplication instruction specification and build the instruction object.

    :param key: name of the instruction in the specification
    :param instruction: raw instruction specification
    :param symbol_table: symbol table holding previously parsed datasets
    :param path: working path used when loading the trained-model configuration
    :return: a fully constructed MLApplicationInstruction
    """
    location = MLApplicationParser.__name__

    expected_keys = ['type', 'dataset', 'label', 'pool_size', 'config_path', 'store_encoded_data']
    ParameterValidator.assert_keys(instruction.keys(), expected_keys, location, key)
    ParameterValidator.assert_in_valid_list(instruction['dataset'],
                                            symbol_table.get_keys_by_type(SymbolType.DATASET),
                                            location, f"{key}: dataset")
    ParameterValidator.assert_type_and_value(instruction['pool_size'], int, location,
                                             f"{key}: pool_size", min_inclusive=1)
    ParameterValidator.assert_type_and_value(instruction['label'], str, location, f'{key}: label')
    ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')
    ParameterValidator.assert_type_and_value(instruction['store_encoded_data'], bool, location,
                                             f'{key}: store_encoded_data')

    hp_setting, label = self._parse_hp_setting(instruction, path, key)

    return MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']),
                                    name=key,
                                    pool_size=instruction['pool_size'],
                                    label_configuration=LabelConfiguration([label]),
                                    hp_setting=hp_setting,
                                    store_encoded_data=instruction['store_encoded_data'])
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    encodes the repertoire dataset using KmerFrequencyEncoder

    :param path_to_dataset_directory: path to directory containing all repertoire files with .tsv extension in MiXCR format
    :param result_path: where to store the results
    :param metadata_path: csv file with columns "filename", "subject_id", "disease" which is filled by default if value of
        argument is None, otherwise any metadata csv file passed to the function, must include filename and subject_id
        columns, and an arbitrary disease column
    :return: encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    if metadata_path is None:
        metadata_path = generate_random_metadata(path_to_dataset_directory, result_path)

    loader = MiXCRImport()
    dataset = loader.import_dataset({
        "is_repertoire": True,
        "path": path_to_dataset_directory,
        "metadata_file": metadata_path,
        "region_type": "IMGT_CDR3",  # import_dataset in only cdr3
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": result_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_genes",
            "allJHitsWithScore": "j_genes"
        },
    }, "mixcr_dataset")

    label_name = list(dataset.params.keys())[0]  # label that can be used for ML prediction - by default: "disease" with values True/False

    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, KmerFrequencyEncoder.build_object(dataset, **{
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    }), EncoderParams(result_path=result_path,
                      label_config=LabelConfiguration([Label(label_name, dataset.params[label_name])])), False))

    # BUG FIX: the original tested `result_path[:-1] == '/'` (everything except the last character),
    # which almost never matches and produced doubled slashes; test the LAST character instead.
    csv_result_path = result_path if result_path[-1] == '/' else result_path + '/'
    dataset_exporter = DesignMatrixExporter(dataset=encoded_dataset,
                                            result_path=f"{csv_result_path}csv_exported/")
    dataset_exporter.generate_report()

    return encoded_dataset
def _create_state_object(self, path):
    """Run a small TrainMLModel instruction on a 34-repertoire dummy dataset and return its state."""
    repertoire_count = 34
    # every repertoire holds the same three sequences; labels follow fixed repeating patterns
    sequences = [["AAA", "CCC", "DDD"] for _ in range(repertoire_count)]
    labels = {"l1": [1, 2] * 17,
              "l2": [0, 0, 1, 1] * 4 + [0, 1] * 9}

    repertoires, metadata = RepertoireBuilder.build(sequences=sequences, path=path, labels=labels)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                params={"l1": [1, 2], "l2": [0, 1]})

    enc_params = {"k": 3, "model_type": ModelType.SEQUENCE.name, "vector_size": 4}
    hp_settings = [HPSetting(Word2VecEncoder.build_object(dataset, **enc_params), enc_params,
                             LogisticRegression(),
                             {"model_selection_cv": False, "model_selection_n_folds": -1}, [])]

    label_config = LabelConfiguration([Label("l1", [1, 2]), Label("l2", [0, 1])])

    process = TrainMLModelInstruction(dataset, GridSearch(hp_settings), hp_settings,
                                      SplitConfig(SplitType.RANDOM, 1, 0.5),
                                      SplitConfig(SplitType.RANDOM, 1, 0.5),
                                      {Metric.BALANCED_ACCURACY}, Metric.BALANCED_ACCURACY,
                                      label_config, path)
    state = process.run(result_path=path)

    return state
def create_dummy_data(self, path):
    # Build a three-repertoire dataset with subject_id/label metadata, plus a VDJdb reference
    # file written to disk; returns (dataset, label_config, reference_sequences spec, labels).
    # Setting up dummy data
    labels = {"subject_id": ["subject_1", "subject_2", "subject_3"],
              "label": ["yes", "yes", "no"]}

    # shared per-sequence metadata for every dummy sequence
    metadata = {"v_gene": "TRBV1", "j_gene": "TRBJ1", "chain": Chain.BETA.value}

    repertoires, metadata = RepertoireBuilder.build(sequences=[["AAAA"], ["SSSS"], ["SSSS", "CCCC"]],
                                                    path=path, labels=labels,
                                                    seq_metadata=[[{**metadata, "count": 10}],
                                                                  [{**metadata, "count": 10}],
                                                                  [{**metadata, "count": 5},
                                                                   {**metadata, "count": 5}]],
                                                    subject_ids=labels["subject_id"])

    dataset = RepertoireDataset(repertoires=repertoires)

    label_config = LabelConfiguration()
    label_config.add_label("subject_id", labels["subject_id"])
    label_config.add_label("label", labels["label"])

    # VDJdb reference: header plus two TRB records matching the AAAA and SSSS sequences
    # NOTE(review): fields reconstructed as tab-separated (VDJdb export format) — confirm against original file
    file_content = """complex.id	Gene	CDR3	V	J	Species	MHC A	MHC B	MHC class	Epitope	Epitope gene	Epitope species	Reference	Method	Meta	CDR3fix	Score
100	TRB	AAAA	TRBV1	TRBJ1	HomoSapiens	HLA-A*11:01	B2M	MHCI	AVFDRKSDAK	EBNA4	EBV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""}	{"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"}	0
200	TRB	SSSS	TRBV1	TRBJ1	HomoSapiens	HLA-A*03:01	B2M	MHCI	KLGGALQAK	IE1	CMV	https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/#	{"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""}	{"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""}	{"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"}	0"""

    with open(path + "refs.tsv", "w") as file:
        file.writelines(file_content)

    reference_sequences = {"params": {"path": path + "refs.tsv", "region_type": "FULL_SEQUENCE"},
                           "format": "VDJdb"}

    return dataset, label_config, reference_sequences, labels
def test_encode(self):
    """Integration test: import a paired-chain VDJdb dataset and check that
    TCRdistEncoder produces a square (examples x examples) distance matrix."""
    # VDJdb export with 4 paired receptors (TRA + TRB rows sharing a complex.id);
    # complex ids 3051/15761 duplicate the CDR3s of 3050/15760.
    # NOTE(review): columns appear whitespace-separated here; the real VDJdb
    # export format is tab-separated — confirm separators survived copy/paste
    file_content = """complex.id Gene CDR3 V J Species MHC A MHC B MHC class Epitope Epitope gene Epitope species Reference Method Meta CDR3fix Score
3050 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0
15760 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0
3050 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0
15760 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0
3051 TRB CASSPPRVYSNGAGLAGVGWRNEQFF TRBV5-4*01 TRBJ2-1*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CASSPPRVYSNGAGLAGVGWRNEQFF", "cdr3_old": "CASSPPRVYSNGAGLAGVGWRNEQFF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-1*01", "jStart": 21, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-4*01"} 0
15761 TRB CASSWTWDAATLWGQGALGGANVLTF TRBV5-5*01 TRBJ2-6*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CASSWTWDAATLWGQGALGGANVLTF", "cdr3_old": "CASSWTWDAATLWGQGALGGANVLTF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRBJ2-6*01", "jStart": 19, "vCanonical": true, "vEnd": 4, "vFixType": "NoFixNeeded", "vId": "TRBV5-5*01"} 0
3051 TRA CAAIYESRGSTLGRLYF TRAV13-1*01 TRAJ18*01 HomoSapiens HLA-A*11:01 B2M MHCI AVFDRKSDAK EBNA4 EBV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/11684", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "1", "tissue": ""} {"cdr3": "CAAIYESRGSTLGRLYF", "cdr3_old": "CAAIYESRGSTLGRLYF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ18*01", "jStart": 7, "oldVEnd": -1, "oldVFixType": "FailedBadSegment", "oldVId": null, "vCanonical": true, "vEnd": 3, "vFixType": "ChangeSegment", "vId": "TRAV13-1*01"} 0
15761 TRA CALRLNNQGGKLIF TRAV9-2*01 TRAJ23*01 HomoSapiens HLA-A*03:01 B2M MHCI KLGGALQAK IE1 CMV https://www.10xgenomics.com/resources/application-notes/a-new-way-of-exploring-immunity-linking-highly-multiplexed-antigen-recognition-to-immune-repertoire-and-phenotype/# {"frequency": "1/25584", "identification": "dextramer-sort", "sequencing": "rna-seq", "singlecell": "yes", "verification": ""} {"cell.subset": "", "clone.id": "", "donor.MHC": "", "donor.MHC.method": "", "epitope.id": "", "replica.id": "", "samples.found": 1, "structure.id": "", "studies.found": 1, "study.id": "", "subject.cohort": "", "subject.id": "3", "tissue": ""} {"cdr3": "CALRLNNQGGKLIF", "cdr3_old": "CALRLNNQGGKLIF", "fixNeeded": false, "good": true, "jCanonical": true, "jFixType": "NoFixNeeded", "jId": "TRAJ23*01", "jStart": 6, "vCanonical": true, "vEnd": 3, "vFixType": "NoFixNeeded", "vId": "TRAV9-2*01"} 0
"""
    # temporary working directory; removed again at the end of the test
    path = PathBuilder.build(EnvironmentSettings.root_path + "test/tmp/trcdist_encoder/")

    with open(path + "receptors.tsv", "w") as file:
        file.writelines(file_content)

    # start from the default VDJdb import params, then switch to a paired
    # (receptor) dataset with one receptor per intermediate file
    params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path + "datasets/", "vdjdb")
    params["is_repertoire"] = False
    params["paired"] = True
    params["result_path"] = path
    params["path"] = path
    params["sequence_file_size"] = 1
    params["receptor_chains"] = "TRA_TRB"
    params['organism'] = 'human'

    dataset = VDJdbImport.import_dataset(params, "vdjdb_dataset")

    encoder = TCRdistEncoder.build_object(dataset, **{"cores": 2})
    encoded_dataset = encoder.encode(dataset, EncoderParams(f"{path}result/", LabelConfiguration([Label("epitope")])))

    # the encoding is a pairwise distance matrix, so it must be square with
    # one row/column per receptor in the imported dataset
    self.assertTrue(encoded_dataset.encoded_data.examples.shape[0] == encoded_dataset.encoded_data.examples.shape[1]
                    and encoded_dataset.encoded_data.examples.shape[0] == dataset.get_example_count())

    shutil.rmtree(path)
def test_encode(self):
    """Check SequenceCountEncoder output per repertoire and its feature names."""
    path = EnvironmentSettings.tmp_test_path + "count_encoder/"
    PathBuilder.build(path)

    repertoire_sequences = [["GGG", "III", "LLL", "MMM"],
                            ["DDD", "EEE", "FFF", "III", "LLL", "MMM"],
                            ["CCC", "FFF", "MMM"],
                            ["AAA", "CCC", "EEE", "FFF", "LLL", "MMM"]]
    repertoires, metadata = RepertoireBuilder.build(repertoire_sequences,
                                                    labels={"l1": [True, True, False, False]},
                                                    path=path)
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata, identifier="1")

    encoder = SequenceCountEncoder.build_object(dataset,
                                                comparison_attributes=["sequence_aas"],
                                                p_value_threshold=0.4,
                                                sequence_batch_size=4)

    label_config = LabelConfiguration([Label("l1", [True, False], positive_class=True)])
    encoded_dataset = encoder.encode(dataset, EncoderParams(result_path=path, label_config=label_config))

    examples = encoded_dataset.encoded_data.examples
    # the two positive-class repertoires get count 1, the negatives count 0
    for example_index, expected_count in enumerate([1, 1, 0, 0]):
        self.assertTrue(examples[example_index] == expected_count)

    self.assertTrue("III" in encoded_dataset.encoded_data.feature_names)

    shutil.rmtree(path)
def test_encode(self):
    """Check DistanceEncoder output shape, selected matrix entries, and label arrays."""
    path = EnvironmentSettings.tmp_test_path + "distance_encoder/"
    PathBuilder.build(path)

    dataset = self.create_dataset(path)

    encoder = DistanceEncoder.build_object(dataset,
                                           distance_metric=DistanceMetricType.JACCARD.name,
                                           attributes_to_match=["sequence_aas"],
                                           sequence_batch_size=20)
    encoder.set_context({"dataset": dataset})

    label_config = LabelConfiguration([Label("l1", [0, 1]), Label("l2", [2, 3])])
    encoded = encoder.encode(dataset, EncoderParams(result_path=path,
                                                    label_config=label_config,
                                                    pool_size=4,
                                                    filename="dataset.pkl"))

    distance_matrix = encoded.encoded_data.examples
    self.assertEqual(8, distance_matrix.shape[0])
    self.assertEqual(8, distance_matrix.shape[1])

    # spot-check individual entries of the encoded matrix
    for row, column in [(0, 0), (1, 1), (0, 4)]:
        self.assertEqual(1, distance_matrix.iloc[row, column])

    self.assertTrue(np.array_equal([1, 0, 1, 0, 1, 0, 1, 0], encoded.encoded_data.labels["l1"]))
    self.assertTrue(np.array_equal([2, 3, 2, 3, 2, 3, 3, 3], encoded.encoded_data.labels["l2"]))

    shutil.rmtree(path)