def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
    """Encode a repertoire dataset with relevant/total sequence abundance features for one label.

    Returns a new RepertoireDataset carrying the EncodedData; the input dataset is not modified.
    """
    label_names = params.label_config.get_labels_by_name()
    assert len(label_names) == 1, \
        "SequenceAbundanceEncoder: this encoding works only for single label."

    label = label_names[0]
    abundance_matrix = self._calculate_sequence_abundance(dataset, self.comparison_data, label, params)

    # labels are attached only when the caller asked for them to be encoded
    metadata = dataset.get_metadata([label]) if params.encode_labels else None
    encoded_data = EncodedData(abundance_matrix, metadata, dataset.get_repertoire_ids(),
                               [SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                                SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE],
                               encoding=SequenceAbundanceEncoder.__name__,
                               info={'relevant_sequence_path': self.relevant_sequence_csv_path})

    return RepertoireDataset(labels=dataset.labels, encoded_data=encoded_data,
                             repertoires=dataset.repertoires)
def process_dataset(self, dataset: RepertoireDataset, result_path: Path = None):
    """Filter out repertoires whose clone count falls outside [lower_limit, upper_limit].

    A limit of -1 disables the corresponding bound. Returns a cloned dataset containing
    only the kept repertoires and a rebuilt metadata file.
    """
    self.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
    self.result_path = result_path if result_path is not None else self.result_path

    processed_dataset = dataset.clone()
    kept_repertoires, kept_indices = [], []

    for idx, repertoire in enumerate(dataset.get_data()):
        clone_count = len(repertoire.sequences)
        too_small = self.lower_limit != -1 and clone_count < self.lower_limit
        too_large = self.upper_limit != -1 and clone_count > self.upper_limit
        if not too_small and not too_large:
            kept_repertoires.append(dataset.repertoires[idx])
            kept_indices.append(idx)

    processed_dataset.repertoires = kept_repertoires
    processed_dataset.metadata_file = self._build_new_metadata(dataset, kept_indices)

    self.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")
    return processed_dataset
def _encode_new_dataset(self, dataset, params: EncoderParams):
    """Encode the dataset by matching repertoires against reference receptors.

    Produces one count feature per reference chain (two chains per receptor), named
    "<receptor_id>.<chain>", and returns a new RepertoireDataset with the EncodedData attached.
    """
    feature_annotations = self._get_feature_info()
    encoded_repertoires, labels, example_ids = self._encode_repertoires(dataset, params)

    # feature names: one per reference chain, derived from the annotation table
    feature_names = ["{receptor_id}.{chain}".format(receptor_id=row["receptor_id"], chain=row["chain"])
                     for _, row in feature_annotations.iterrows()]

    encoded_dataset = RepertoireDataset(repertoires=dataset.repertoires, labels=dataset.labels,
                                        metadata_file=dataset.metadata_file)
    encoded_dataset.add_encoded_data(EncodedData(
        examples=encoded_repertoires,             # np.ndarray with counts
        example_ids=example_ids,                  # repertoire identifiers
        feature_names=feature_names,
        feature_annotations=feature_annotations,  # sequence and VDJ gene usage per reference receptor
        labels=labels,
        encoding=MatchedReceptorsEncoder.__name__))

    return encoded_dataset
def test_process(self):
    """SubjectRepertoireCollector should merge repertoires that share a subject_id."""
    path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector"
    PathBuilder.build(path)

    specs = [("AAA", "1", "patient1"), ("AAC", "2", "patient1"), ("AAC", "3", "patient3")]
    reps = [Repertoire.build_from_sequence_objects([ReceptorSequence(seq, identifier=seq_id)],
                                                   path=path, metadata={"subject_id": subject})
            for seq, seq_id, subject in specs]

    dataset = RepertoireDataset(repertoires=reps)

    dataset2 = SubjectRepertoireCollector.process(dataset, {"result_path": path / "result"})

    # two subjects -> two repertoires in the result; the input dataset stays untouched
    self.assertEqual(2, len(dataset2.get_data()))
    self.assertEqual(3, len(dataset.get_data()))

    expected_sizes = [2, 1]
    for rep_index, rep in enumerate(dataset2.get_data()):
        self.assertEqual(expected_sizes[rep_index], len(rep.sequences))

    shutil.rmtree(path)
def create_dummy_dataset(self, path):
    """Build and pickle-export a two-repertoire dataset; returns the exported file name."""
    label_values = {"label1": ["val1", "val2"], "label2": ["val1", "val2"]}
    repertoires, metadata = RepertoireBuilder.build([["AA"], ["CC"]], path, labels=label_values)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    dataset.name = "my_dataset"

    PickleExporter.export(dataset, path)

    return f"{dataset.name}.iml_dataset"
def test_run(self):
    """End-to-end check of MLMethodAssessment.run.

    Trains a logistic regression on a trivially separable encoded dataset and verifies
    that the returned metric dict, the ml_score csv, and the per-example predictions csv
    are all produced with the expected shapes/values.

    Fix: the Label used for fitting/assessment declared values=[1, 2], which neither
    matches the data (l1 takes values 1 and 3) nor the label_config registered two
    lines above; it now declares values=[1, 3].
    """
    path = EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"
    PathBuilder.build(path)

    dataset = RepertoireDataset(repertoires=RepertoireBuilder.build(
        [["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"], ["AA"], ["CC"],
         ["AA"], ["CC"], ["AA"], ["CC"]], path)[0])

    # examples perfectly predict l1 (rows of 3s <-> label 3), so log loss should be tiny
    dataset.encoded_data = EncodedData(
        examples=np.array([[1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3],
                           [1, 1], [1, 1], [3, 3], [1, 1], [1, 1], [3, 3]]),
        labels={
            "l1": [1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3],
            "l2": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
        })

    label_config = LabelConfiguration()
    label_config.add_label("l1", [1, 3])

    label = Label(name='l1', values=[1, 3])  # values match the data and label_config

    method1 = LogisticRegression()
    method1.fit(dataset.encoded_data, label=label)

    res = MLMethodAssessment.run(MLMethodAssessmentParams(
        dataset=dataset,
        method=method1,
        metrics={Metric.ACCURACY, Metric.BALANCED_ACCURACY, Metric.F1_MACRO},
        optimization_metric=Metric.LOG_LOSS,
        predictions_path=EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/predictions.csv",
        label=label,
        ml_score_path=EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/ml_score.csv",
        split_index=1,
        path=EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/"))

    self.assertTrue(isinstance(res, dict))
    self.assertTrue(res[Metric.LOG_LOSS.name.lower()] <= 0.1)

    # one summary row for the single assessed method
    self.assertTrue(os.path.isfile(EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/ml_score.csv"))
    df = pd.read_csv(EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/ml_score.csv")
    self.assertTrue(df.shape[0] == 1)

    # one prediction row per example
    df = pd.read_csv(EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/predictions.csv")
    self.assertEqual(12, df.shape[0])

    shutil.rmtree(EnvironmentSettings.root_path / "test/tmp/mlmethodassessment/")
def process_dataset(self, dataset: RepertoireDataset):
    """Extract matching items from every repertoire into the comparison data, then merge temp batches."""
    matching_fn = self.build_matching_fn()
    total = dataset.get_example_count()

    for position, repertoire in enumerate(dataset.get_data(), start=1):
        self.process_repertoire(repertoire, str(repertoire.identifier), matching_fn)
        logging.info("Repertoire {} ({}/{}) processed.".format(repertoire.identifier, position, total))
        logging.info(
            f"Currently, there are {self.item_count} items in the comparison data matrix."
        )

    # consolidate per-repertoire temporary batches into the final matrix
    self.merge_tmp_batches_to_matrix()
def create_datasets(self, path: Path):
    """Create a four-repertoire labelled dataset and a two-repertoire subset of it."""
    labels = {"l1": [1, 0, 1, 0], "l2": [2, 3, 2, 3]}
    repertoires, metadata = RepertoireBuilder.build(
        [["A", "B"], ["B", "C"], ["D"], ["E", "F"]], path, labels)

    main_dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    sub_dataset = main_dataset.make_subset([0, 1], path=path, dataset_type="subset")

    return main_dataset, sub_dataset
def export_updated_metadata(dataset: RepertoireDataset, result_path: Path, repertoire_folder: str):
    """Write an updated metadata.csv under result_path, pointing at exported .tsv repertoire files.

    The original metadata file is read (skipping comment lines), its 'filename' column
    replaced with paths into `repertoire_folder`, and its 'identifier' column preserved
    or backfilled from the dataset's example ids.
    """
    df = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
    # reuse existing identifiers when the column exists; otherwise fall back to dataset example ids
    identifiers = df["identifier"].values.tolist() if "identifier" in df.columns else dataset.get_example_ids()
    # NOTE(review): assumes metadata rows are in the same order as dataset.get_data() — confirm
    df["filename"] = [str(Path(repertoire_folder) / f"{repertoire.data_filename.stem}.tsv")
                      for repertoire in dataset.get_data()]
    df['identifier'] = identifiers
    df.to_csv(result_path / "metadata.csv", index=False)
def test_get_metadata_fields(self):
    """get_metadata_fields should expose label columns and subject_id from the metadata file."""
    path = EnvironmentSettings.tmp_test_path / "repertoire_dataset/"
    PathBuilder.build(path)

    repertoires, metadata = RepertoireBuilder.build([["AA"], ["BB"]], path,
                                                    {"l1": [1, 2], "hla": ["A", "B"]},
                                                    subject_ids=["d1", "d2"])
    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)

    for expected_field in ("l1", "hla", "subject_id"):
        self.assertTrue(expected_field in dataset.get_metadata_fields())

    shutil.rmtree(path)
def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:
    """Return label values from dataset metadata, row-ordered to match dataset.get_repertoire_ids()."""
    columns = ["repertoire_identifier"] + params.label_config.get_labels_by_name()
    metadata_df = dataset.get_metadata(columns, return_df=True)

    # reorder rows so label values line up with the repertoire id order used by the encoding
    row_order = pd.Index(metadata_df['repertoire_identifier']).get_indexer(dataset.get_repertoire_ids())
    label_dict = metadata_df.iloc[row_order].to_dict("list")

    # the identifier column was only needed for ordering
    del label_dict["repertoire_identifier"]
    return label_dict
def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
    """Match each repertoire against the reference receptors.

    Returns:
        - an int matrix of shape (n_repertoires, 2 * n_reference_receptors) with match counts
          (rows = repertoires, columns = reference chains, two per receptor),
        - a dict of label-name -> values per repertoire (or None if labels are not encoded),
        - the list of repertoire ids.
    """
    n_examples = dataset.get_example_count()
    match_matrix = np.zeros((n_examples, len(self.reference_receptors) * 2), dtype=int)

    label_names = params.label_config.get_labels_by_name()
    labels = {name: [] for name in label_names} if params.encode_labels else None

    for row, repertoire in enumerate(dataset.get_data()):
        match_matrix[row] = self._match_repertoire_to_receptors(repertoire)
        if labels is not None:
            for name in label_names:
                labels[name].append(repertoire.metadata[name])

    return match_matrix, labels, dataset.get_repertoire_ids()
def test_generate(self):
    """FeatureComparison should produce a ReportResult with the expected figure and table outputs."""
    path = EnvironmentSettings.root_path / "test/tmp/featuredistribution/"
    PathBuilder.build(path)

    dataset = self._create_dummy_encoded_data(path)

    report = FeatureComparison.build_object(**{"dataset": dataset, "result_path": path,
                                               "comparison_label": "patient"})

    self.assertTrue(report.check_prerequisites())

    result = report.generate_report()
    self.assertIsInstance(result, ReportResult)

    self.assertEqual(result.output_figures[0].path, path / "feature_comparison.html")
    self.assertEqual(result.output_tables[0].path, path / "feature_values.csv")

    content = pd.read_csv(path / "feature_values.csv")
    self.assertListEqual(list(content.columns), ["patient", "example_id", "sequence", "feature", "value"])

    # report should succeed to build but check_prerequisites should be false when data is not encoded
    # NOTE(review): this second check uses FeatureDistribution rather than FeatureComparison —
    # possibly copy-pasted from a sibling test; confirm which report class was intended here
    report = FeatureDistribution.build_object(**{"dataset": RepertoireDataset(), "result_path": path})
    self.assertFalse(report.check_prerequisites())

    shutil.rmtree(path)
def _create_dummy_encoded_data(self, path):
    """Create a RepertoireDataset with random sparse k-mer features and a boolean 'patient' metadata column."""
    n_subjects = 50
    n_features = 30

    def random_string(length):
        return ''.join(random.choices(string.ascii_uppercase, k=length))

    kmers = [random_string(3) for _ in range(n_features)]
    values = np.random.normal(50, 10, n_subjects * n_features).reshape((n_subjects, n_features))

    encoded = EncodedData(examples=sparse.csr_matrix(values),
                          example_ids=[random_string(4) for _ in range(n_subjects)],
                          labels={},
                          feature_names=kmers,
                          feature_annotations=pd.DataFrame({"sequence": kmers}),
                          encoding="random")

    # alternating True/False 'patient' column used as the comparison label in reports
    metadata_filepath = path / "metadata.csv"
    metadata = pd.DataFrame({"patient": np.array([i % 2 == 0 for i in range(n_subjects)])})
    metadata.to_csv(metadata_filepath, index=False)

    return RepertoireDataset(encoded_data=encoded, metadata_file=metadata_filepath)
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    """Keep only repertoires whose clone count is within the configured limits.

    params may contain "lower_limit" and/or "upper_limit" (plus "result_path" for the
    rebuilt metadata file).

    Fix: the original combined the two checks with `or`, so when BOTH limits were
    configured a repertoire only had to satisfy one of them to be kept (e.g. a
    repertoire below lower_limit was still kept because it was also below upper_limit).
    The limits are now applied independently and combined conjunctively, matching the
    instance-method implementation of this filter.
    """
    Preprocessor.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
    processed_dataset = dataset.clone()
    repertoires = []
    indices = []

    for index, repertoire in enumerate(dataset.get_data()):
        clone_count = len(repertoire.sequences)
        # skip repertoires violating either configured bound
        if "lower_limit" in params.keys() and clone_count < params["lower_limit"]:
            continue
        if "upper_limit" in params.keys() and clone_count > params["upper_limit"]:
            continue
        repertoires.append(dataset.repertoires[index])
        indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices,
                                                                                   params["result_path"])

    Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")
    return processed_dataset
def _construct_test_repertoiredataset(self, path, positional):
    """Build a two-repertoire test dataset plus a LabelConfiguration.

    When `positional` is True the repertoires hold long homogeneous sequences
    (for positional encoding tests); otherwise short overlapping ones.
    """
    receptors1 = ReceptorSequenceList()
    receptors2 = ReceptorSequenceList()

    if positional:
        [receptors1.append(seq) for seq in [ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1"),
                                            ReceptorSequence("AAAAAAAAAAAAAAAAA", identifier="1")]]
        [receptors2.append(seq) for seq in [ReceptorSequence("TTTTTTTTTTTTT", identifier="1")]]
    else:
        [receptors1.append(seq) for seq in [ReceptorSequence("AAAA", identifier="1"),
                                            ReceptorSequence("ATA", identifier="2"),
                                            ReceptorSequence("ATA", identifier='3')]]
        [receptors2.append(seq) for seq in [ReceptorSequence("ATA", identifier="1"),
                                            ReceptorSequence("TAA", identifier="2")]]

    rep1 = Repertoire.build_from_sequence_objects(receptors1,
                                                  metadata={"l1": 1, "l2": 2, "subject_id": "1"}, path=path)
    rep2 = Repertoire.build_from_sequence_objects(receptors2,
                                                  metadata={"l1": 0, "l2": 3, "subject_id": "2"}, path=path)

    lc = LabelConfiguration()
    # NOTE(review): the declared label values do not fully match the repertoire metadata
    # (l1 takes values 1/0 and l2 takes 2/3 above) — confirm whether this is intentional
    lc.add_label("l1", [1, 2])
    lc.add_label("l2", [0, 3])

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    return dataset, lc
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    """Merge repertoires that share a subject_id into single repertoires.

    Duplicates are paired as they are encountered: when a subject is seen a second time,
    the two repertoires are merged and stored; any repertoire still unpaired at the end
    is stored as-is.
    NOTE(review): with three or more repertoires per subject this merges them two at a
    time, so such a subject can yield more than one output repertoire — confirm intent.
    """
    SubjectRepertoireCollector.check_dataset_type(dataset, [RepertoireDataset], "SubjectRepertoireCollector")

    rep_map = {}           # subject_id -> first currently-unpaired repertoire for that subject
    repertoires = []
    indices_to_keep = []   # metadata row indices kept for the rebuilt metadata file
    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])

    for index, repertoire in enumerate(processed_dataset.get_data()):
        if repertoire.metadata["subject_id"] in rep_map.keys():
            # second repertoire for this subject: concatenate sequences, store merged repertoire
            sequences = np.append(repertoire.sequences, rep_map[repertoire.metadata["subject_id"]].sequences)
            del rep_map[repertoire.metadata["subject_id"]]
            repertoires.append(SubjectRepertoireCollector.store_repertoire(
                params["result_path"], repertoire, sequences))
        else:
            rep_map[repertoire.metadata["subject_id"]] = repertoire
            indices_to_keep.append(index)

    # store repertoires that were never paired with a duplicate
    for key in rep_map.keys():
        repertoires.append(SubjectRepertoireCollector.store_repertoire(params["result_path"], rep_map[key],
                                                                       rep_map[key].sequences))

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = SubjectRepertoireCollector.build_new_metadata(dataset, indices_to_keep,
                                                                                    params["result_path"])

    return processed_dataset
def create_dataset(self):
    """Build and pickle-export a 30-repertoire dataset with CD/CMV labels.

    Each repertoire has four sequences with per-sequence chain and duplicate-count
    metadata. Returns the path to the exported .iml_dataset file.
    """
    path = Path(os.path.relpath(EnvironmentSettings.root_path / "test/tmp/immunemlapp/initial_dataset"))
    PathBuilder.build(path)

    repertoire_count = 30
    repertoires, metadata = RepertoireBuilder.build(
        [["AA", "AAAA", "AAAA", "AAA"] for i in range(repertoire_count)], path,
        # label values alternate across repertoires
        {"CD": ['yes' if i % 2 == 0 else 'no' for i in range(repertoire_count)],
         "CMV": [True if i % 2 == 1 else False for i in range(repertoire_count)]},
        # per-sequence metadata: alternating chain, random duplicate count
        [[{"chain": "A" if i % 2 == 0 else "B", "count": random.randint(2, 5)}
          for i in range(4)] for j in range(repertoire_count)])

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata,
                                labels={"CD": [True, False], "CMV": [True, False]}, name="d1")
    PickleExporter.export(dataset, path)

    return path / "d1.iml_dataset"
def test_repertoire_export(self):
    """AIRR export of a repertoire dataset: one .tsv per repertoire with AIRR-standard columns."""
    path = EnvironmentSettings.tmp_test_path / "airr_exporter_repertoire/"
    PathBuilder.build(path)

    repertoire, metadata_path = self.create_dummy_repertoire(path)
    dataset = RepertoireDataset(repertoires=[repertoire], metadata_file=metadata_path)

    path_exported = path / "exported"
    AIRRExporter.export(dataset, path_exported)

    # the exported file is named after the repertoire identifier
    resulting_data = pd.read_csv(path_exported / f"repertoires/{repertoire.identifier}.tsv", sep="\t")

    # expected values mirror the fixture built by create_dummy_repertoire
    self.assertListEqual(list(resulting_data["sequence_id"]), ["receptor_1", "receptor_2"])
    self.assertListEqual(list(resulting_data["cdr3"]), ["GCTGCTGCT", "GGTGGTGGT"])
    self.assertListEqual(list(resulting_data["cdr3_aa"]), ["AAA", "GGG"])
    self.assertListEqual(list(resulting_data["v_call"]), ["TRBV1", "TRAV2*01"])
    self.assertListEqual(list(resulting_data["j_call"]), ["TRBJ1", "TRAJ2"])
    self.assertListEqual(list(resulting_data["d_call"]), ["TRBD1", "TRAD2"])
    self.assertListEqual(list(resulting_data["locus"]), ["TRB", "TRA"])
    self.assertListEqual(list(resulting_data["duplicate_count"]), [5, 15])
    self.assertListEqual(list(resulting_data["custom_test"]), ["cust1", "cust2"])
    # AIRR encodes booleans as 'T'/'F'
    self.assertListEqual(list(resulting_data["productive"]), ['T', 'F'])
    self.assertListEqual(list(resulting_data["stop_codon"]), ['F', 'F'])

    shutil.rmtree(path)
def test_get_normalized_sequence_lengths(self):
    """SequenceLengthDistribution report should produce an output figure file."""
    path = EnvironmentSettings.root_path / "test/tmp/datareports/"
    PathBuilder.build(path)

    def make_repertoire(sequences, first_id):
        seq_objects = [ReceptorSequence(amino_acid_sequence=seq, identifier=str(first_id + offset))
                       for offset, seq in enumerate(sequences)]
        return Repertoire.build_from_sequence_objects(sequence_objects=seq_objects, path=path, metadata={})

    rep1 = make_repertoire(["AAA", "AAAA", "AAAAA", "AAA"], 1)
    rep2 = make_repertoire(["AAA", "AAAA", "AAAA", "AAA"], 5)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    sld = SequenceLengthDistribution(dataset, 1, path)

    result = sld.generate_report()
    self.assertTrue(os.path.isfile(result.output_figures[0].path))

    shutil.rmtree(path)
def _create_dummy_data(self, path, dataset_type):
    """Create a small receptor or repertoire dataset for tests.

    `dataset_type` selects between "receptor" (receptors extracted from the repertoire,
    wrapped in a ReceptorDataset) and "repertoire" (single-repertoire dataset).
    """
    PathBuilder.build(path)
    dataset = None

    # one repertoire of five sequences across two cells; cell ids group sequences into receptors
    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1" for i in range(5)],
        j_genes=["J1-1" for i in range(5)],
        chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
        custom_lists={"custom_1": [f"CUST-{i}" for i in range(5)],
                      "custom_2": [f"CUST-A" for i in range(3)] + [f"CUST-B" for i in range(2)]},
        cell_ids=["1", "1", "1", "2", '2'],
        path=path)

    if dataset_type == "receptor":
        dataset = ReceptorDataset.build_from_objects(test_repertoire.receptors, 100, path,
                                                     name="receptor_dataset")
        dataset.identifier = 'receptor_dataset'
    elif dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        dataset = RepertoireDataset(repertoires=[test_repertoire])

    # NOTE(review): returns None for any other dataset_type — callers presumably pass
    # only the two handled values; confirm
    return dataset
def _create_report(self, path):
    """Assemble a ConfounderAnalysis report wired to encoded train/test datasets and a dummy LR model."""
    report = ConfounderAnalysis.build_object(metadata_labels=["age", "HLA"], name='test')

    report.ml_details_path = path / "ml_details.yaml"
    report.label = Label("disease")
    report.result_path = path

    # 3-mer relative-frequency encoder shared between train and test encoding
    encoder = KmerFrequencyEncoder.build_object(RepertoireDataset(), **{
        "normalization_type": NormalizationType.RELATIVE_FREQUENCY.name,
        "reads": ReadsType.UNIQUE.name,
        "sequence_encoding": SequenceEncodingType.CONTINUOUS_KMER.name,
        "k": 3,
        'sequence_type': SequenceType.AMINO_ACID.name})

    report.train_dataset = self._encode_dataset(encoder, self._make_dataset(path / "train", size=100), path)
    # test data is encoded with the model learned on the training data
    report.test_dataset = self._encode_dataset(encoder, self._make_dataset(path / "test", size=40), path,
                                               learn_model=False)
    report.method = self._create_dummy_lr_model(path, report.train_dataset.encoded_data, Label("disease"))

    return report
def _merge_repertoires(self, dataset: RepertoireDataset):
    """Merge repertoires that share a subject_id into single repertoires.

    NOTE(review): duplicates are merged pairwise as encountered, so a subject with three
    or more repertoires can yield more than one output repertoire — confirm intent.
    """
    rep_map = {}           # subject_id -> first currently-unpaired repertoire for that subject
    repertoires, indices_to_keep = [], []
    processed_dataset = dataset.clone()

    for index, repertoire in enumerate(processed_dataset.get_data()):
        if repertoire.metadata["subject_id"] in rep_map.keys():
            # second repertoire for this subject: concatenate sequences, store the merged result
            sequences = np.append(repertoire.sequences, rep_map[repertoire.metadata["subject_id"]].sequences)
            del rep_map[repertoire.metadata["subject_id"]]
            repertoires.append(self._store_repertoire(repertoire, sequences))
        else:
            rep_map[repertoire.metadata["subject_id"]] = repertoire
            indices_to_keep.append(index)

    # store repertoires that were never paired with a duplicate
    for key in rep_map.keys():
        repertoires.append(self._store_repertoire(rep_map[key], rep_map[key].sequences))

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = self._build_new_metadata(dataset, indices_to_keep)

    return processed_dataset
def test_create_model(self):
    """KmerPairModelCreator should train a Word2Vec model over k-mer pairs from the dataset."""
    test_path = EnvironmentSettings.root_path / "test/tmp/w2v_test_tmp/"
    PathBuilder.build(test_path)

    sequence1 = ReceptorSequence("CASSVFA")
    sequence2 = ReceptorSequence("CASSCCC")

    metadata1 = {"T1D": "T1D", "subject_id": "1"}
    rep1 = Repertoire.build_from_sequence_objects([sequence1, sequence2], test_path, metadata1)

    metadata2 = {"T1D": "CTL", "subject_id": "2"}
    rep2 = Repertoire.build_from_sequence_objects([sequence1], test_path, metadata2)

    dataset = RepertoireDataset(repertoires=[rep1, rep2])

    model_creator = KmerPairModelCreator()
    model = model_creator.create_model(dataset=dataset, k=2, vector_size=16, batch_size=1,
                                       model_path=test_path / "model.model")

    self.assertTrue(isinstance(model, Word2Vec))
    # NOTE(review): model.wv.vocab is the pre-4.0 gensim API (gensim 4 renamed it to
    # key_to_index) — this test pins the project to gensim < 4
    self.assertTrue("CA" in model.wv.vocab)
    # presumably 20 amino acids -> 20^2 = 400 possible 2-mers in the vocabulary; verify
    self.assertEqual(400, len(model.wv.vocab))

    shutil.rmtree(test_path)
def make_random_dataset(self, path):
    """Create and pickle-export a dataset of 40 repertoires, each with 100 random 20-mers."""
    alphabet = EnvironmentSettings.get_sequence_alphabet()

    sequences = [["".join(rn.choice(alphabet) for _ in range(20)) for _ in range(100)]
                 for _ in range(40)]

    # alternate subject ids 0/1 across repertoires
    subject_ids = [i % 2 for i in range(len(sequences))]
    repertoires, metadata = RepertoireBuilder.build(sequences, path, subject_ids=subject_ids)

    dataset = RepertoireDataset(repertoires=repertoires, metadata_file=metadata)
    PickleExporter.export(dataset, path)
def _create_dummy_data(self, path, dataset_type):
    """Create a small receptor or repertoire dataset for tests (receptor variant is pickled to disk)."""
    PathBuilder.build(path)
    dataset = None

    # one repertoire of five sequences across two cells (cell_ids given as ints here)
    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1" for i in range(5)],
        j_genes=["J1-1" for i in range(5)],
        chains=[Chain.ALPHA, Chain.BETA, Chain.BETA, Chain.ALPHA, Chain.BETA],
        custom_lists={"custom_1": [f"CUST-{i}" for i in range(5)],
                      "custom_2": [f"CUST-A" for i in range(3)] + [f"CUST-B" for i in range(2)]},
        cell_ids=[1, 1, 1, 2, 2],
        path=path)

    if dataset_type == "receptor":
        # receptors are serialized with pickle into a single file and wrapped in a
        # file-backed ReceptorDataset (test-local data, so pickle is acceptable here)
        receptordataset_filename = path / "receptors.pkl"
        with open(receptordataset_filename, "wb") as file:
            pickle.dump(test_repertoire.receptors, file)

        dataset = ReceptorDataset(filenames=[receptordataset_filename], identifier="receptor_dataset")
    elif dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        dataset = RepertoireDataset(repertoires=[test_repertoire])

    # NOTE(review): returns None for any other dataset_type — confirm callers only pass
    # the two handled values
    return dataset
def test_exporter(self):
    """DesignMatrixExporter should write the design matrix in every supported file format."""
    dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                         labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                         example_ids=[0, 1, 2],
                                                         feature_names=["f1", "f2", "f3", "f4"],
                                                         encoding="test_encoding"))
    path = EnvironmentSettings.tmp_test_path / "designmatrrixexporterreport/"
    report = DesignMatrixExporter(dataset=dataset, result_path=path, name="design_matrix",
                                  file_format='csv')

    # full report run covers the default csv format
    report.generate_report()
    self.assertTrue(os.path.isfile(path / "design_matrix.csv"))

    # the remaining formats are exercised through _export_matrix directly
    report.file_format = 'csv.zip'
    report._export_matrix()
    self.assertTrue(os.path.isfile(path / "design_matrix.csv.zip"))

    report.file_format = 'npy'
    report._export_matrix()
    self.assertTrue(os.path.isfile(path / "design_matrix.npy"))

    report.file_format = 'npy.zip'
    report._export_matrix()
    self.assertTrue(os.path.isfile(path / "design_matrix.npy.zip"))

    report.file_format = 'hdf5'
    report._export_matrix()
    self.assertTrue(os.path.isfile(path / "design_matrix.hdf5"))

    report.file_format = 'hdf5.zip'
    report._export_matrix()
    self.assertTrue(os.path.isfile(path / "design_matrix.hdf5.zip"))

    shutil.rmtree(path)

    # unsupported formats must be rejected at build time
    with self.assertRaises(AssertionError):
        DesignMatrixExporter.build_object(**{'file_format': "random"})
def test_generate(self):
    """DesignMatrixExporter should write matrix, labels, and encoding details with correct contents."""
    dataset = RepertoireDataset(encoded_data=EncodedData(examples=csr_matrix(np.arange(12).reshape(3, 4)),
                                                         labels={"l1": [1, 0, 1], "l2": [0, 0, 1]},
                                                         example_ids=[0, 1, 2],
                                                         feature_names=["f1", "f2", "f3", "f4"],
                                                         encoding="test_encoding"))
    path = EnvironmentSettings.tmp_test_path / "designmatrrixexporterreport/"
    report = DesignMatrixExporter(dataset, path, name='report', file_format='csv')

    report.generate_report()
    self.assertTrue(os.path.isfile(path / "design_matrix.csv"))
    self.assertTrue(os.path.isfile(path / "labels.csv"))
    self.assertTrue(os.path.isfile(path / "encoding_details.yaml"))

    # exported matrix matches the original dense values
    matrix = pd.read_csv(path / "design_matrix.csv", sep=",").values
    self.assertTrue(np.array_equal(matrix, np.arange(12).reshape(3, 4)))

    # labels.csv: one column per label, one row per example
    labels = pd.read_csv(path / "labels.csv", sep=",").values
    self.assertTrue(np.array_equal(labels, np.array([[1, 0], [0, 0], [1, 1]])))

    with open(path / "encoding_details.yaml", "r") as file:
        loaded = yaml.safe_load(file)

    self.assertTrue("feature_names" in loaded)
    self.assertTrue("encoding" in loaded)
    self.assertTrue("example_ids" in loaded)

    self.assertTrue(np.array_equal(loaded["example_ids"], np.array([0, 1, 2])))
    self.assertTrue(np.array_equal(loaded["feature_names"], np.array(["f1", "f2", "f3", "f4"])))
    self.assertEqual("test_encoding", loaded["encoding"])

    shutil.rmtree(path)
def test_process(self):
    """ChainRepertoireFilter should keep only repertoires of the requested chain and update metadata.

    Fix: the metadata write previously omitted index=False, so pandas wrote a stray
    unnamed index column into metadata.csv — inconsistent with how metadata files are
    written elsewhere in these tests.
    """
    path = EnvironmentSettings.root_path / "test/tmp/chain_filter/"
    PathBuilder.build(path)

    rep1 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
        path=path, metadata={})
    rep2 = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
        path=path, metadata={})

    metadata = pd.DataFrame({"CD": [1, 0]})
    metadata.to_csv(path / "metadata.csv", index=False)  # keep the pandas index out of the file

    dataset = RepertoireDataset(repertoires=[rep1, rep2], metadata_file=path / "metadata.csv")

    dataset2 = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA",
                                                       "result_path": path / "results"})

    # only the alpha-chain repertoire remains; the input dataset is unchanged
    self.assertEqual(1, len(dataset2.get_data()))
    self.assertEqual(2, len(dataset.get_data()))

    # metadata is filtered to match the kept repertoire
    metadata_dict = dataset2.get_metadata(["CD"])
    self.assertEqual(1, len(metadata_dict["CD"]))
    self.assertEqual(1, metadata_dict["CD"][0])

    for rep in dataset2.get_data():
        self.assertEqual("AAA", rep.sequences[0].get_sequence())

    # filtering to a chain with no repertoires must raise (resulting dataset would be empty)
    self.assertRaises(AssertionError, ChainRepertoireFilter.process, dataset,
                      {"keep_chain": "GAMMA", "result_path": path / "results"})

    shutil.rmtree(path)
def create_comparison_data(self, dataset: RepertoireDataset) -> ComparisonData:
    """Build a ComparisonData object over the dataset's repertoires and populate it from the dataset."""
    repertoire_ids = dataset.get_repertoire_ids()
    comp_data = ComparisonData(repertoire_ids, self.matching_columns, self.sequence_batch_size, self.path)
    comp_data.process_dataset(dataset)
    return comp_data