def encode_dataset(dataset, hp_setting: HPSetting, path: Path, learn_model: bool, context: dict, number_of_processes: int,
                   label_configuration: LabelConfiguration, encode_labels: bool = True, store_encoded_data: bool = False):
    """
    Encodes the dataset with the encoder defined in hp_setting and stores the results under path.

    The filename of the stored dataset reflects whether the encoder is fitted here (train) or
    applied with a previously learned model (test).
    """
    PathBuilder.build(path)

    # build the encoder configuration separately for readability
    encoder_params = EncoderParams(model=hp_setting.encoder_params,
                                   result_path=path,
                                   pool_size=number_of_processes,
                                   label_config=label_configuration,
                                   learn_model=learn_model,
                                   filename="train_dataset.pkl" if learn_model else "test_dataset.pkl",
                                   encode_labels=encode_labels)

    return DataEncoder.run(DataEncoderParams(dataset=dataset,
                                             encoder=hp_setting.encoder,
                                             encoder_params=encoder_params,
                                             store_encoded_data=store_encoded_data))
def encode_dataset_by_kmer_freq(path_to_dataset_directory: str, result_path: str, metadata_path: str = None):
    """
    Encodes the repertoire dataset using KmerFrequencyEncoder.

    Arguments:
        path_to_dataset_directory (str): path to directory containing all repertoire files with .tsv extension in MiXCR format

        result_path (str): where to store the results

        metadata_path(str): csv file with columns "filename", "subject_id", "disease" which is filled by default if value of
            argument is None, otherwise any metadata csv file passed to the function, must include filename and subject_id
            columns, and an arbitrary disease column

    Returns:
         encoded dataset with encoded data in encoded_dataset.encoded_data.examples
    """
    dataset_dir = Path(path_to_dataset_directory)
    out_path = Path(result_path)

    # generate placeholder metadata when none is supplied
    metadata_file = generate_random_metadata(dataset_dir, out_path) if metadata_path is None else Path(metadata_path)

    import_params = {
        "is_repertoire": True,
        "path": dataset_dir,
        "metadata_file": metadata_file,
        "region_type": "IMGT_CDR3",  # import only the CDR3 region
        "number_of_processes": 4,  # number of parallel processes for loading the data
        "result_path": out_path,
        "separator": "\t",
        "columns_to_load": ["cloneCount", "allVHitsWithScore", "allJHitsWithScore", "aaSeqCDR3", "nSeqCDR3"],
        "column_mapping": {
            "cloneCount": "counts",
            "allVHitsWithScore": "v_alleles",
            "allJHitsWithScore": "j_alleles"
        },
    }
    dataset = MiXCRImport().import_dataset(import_params, "mixcr_dataset")

    # label that can be used for ML prediction - by default: "disease" with values True/False
    label_name = next(iter(dataset.labels.keys()))

    encoder_settings = {
        "normalization_type": "relative_frequency",  # encode repertoire by the relative frequency of k-mers in repertoire
        "reads": "unique",  # count each sequence only once, do not use clonal count
        "k": 2,  # k-mer length
        "sequence_type": "amino_acid",
        "sequence_encoding": "continuous_kmer"  # split each sequence in repertoire to overlapping k-mers
    }
    encoder = KmerFrequencyEncoder.build_object(dataset, **encoder_settings)

    label_config = LabelConfiguration([Label(label_name, dataset.labels[label_name])])
    encoded_dataset = DataEncoder.run(DataEncoderParams(dataset, encoder,
                                                        EncoderParams(result_path=out_path, label_config=label_config)))

    # export the design matrix alongside the encoded dataset
    exporter = DesignMatrixExporter(dataset=encoded_dataset, result_path=out_path / "csv_exported", file_format='csv')
    exporter.generate_report()

    return encoded_dataset
def encode(self, unit: ExploratoryAnalysisUnit, result_path: Path) -> Dataset:
    """
    Encodes the unit's dataset with its configured encoder and returns the encoded dataset;
    if no encoder is configured, the original dataset is returned unchanged.
    """
    if unit.encoder is None:
        return unit.dataset

    params = EncoderParams(result_path=result_path,
                           label_config=unit.label_config,
                           filename="encoded_dataset.pkl",
                           pool_size=unit.number_of_processes,
                           learn_model=True,
                           # labels are encoded only when a label configuration is available
                           encode_labels=unit.label_config is not None)

    return DataEncoder.run(DataEncoderParams(dataset=unit.dataset,
                                             encoder=unit.encoder,
                                             encoder_params=params,
                                             store_encoded_data=True))
def test_run(self):
    """Checks that DataEncoder.run returns a RepertoireDataset with one encoded example per repertoire."""
    tmp_path = EnvironmentSettings.root_path / "test/tmp/dataencoder/"
    PathBuilder.build(tmp_path)

    # two minimal single-sequence repertoires with two labels each
    repertoires = [
        Repertoire.build_from_sequence_objects([ReceptorSequence("AAA", identifier="1")],
                                               metadata={"l1": 1, "l2": 2}, path=tmp_path),
        Repertoire.build_from_sequence_objects([ReceptorSequence("ATA", identifier="2")],
                                               metadata={"l1": 0, "l2": 3}, path=tmp_path)
    ]

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 2])
    label_config.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=repertoires)

    encoder = Word2VecEncoder.build_object(dataset, **{"k": 3,
                                                       "model_type": ModelType.SEQUENCE.name,
                                                       "vector_size": 6})

    result = DataEncoder.run(DataEncoderParams(dataset=dataset,
                                               encoder=encoder,
                                               encoder_params=EncoderParams(model={},
                                                                            pool_size=2,
                                                                            label_config=label_config,
                                                                            result_path=tmp_path,
                                                                            filename="dataset.csv"),
                                               store_encoded_data=False))

    self.assertTrue(isinstance(result, RepertoireDataset))
    self.assertTrue(result.encoded_data.examples.shape[0] == 2)

    shutil.rmtree(tmp_path)