def _create_dummy_data(self, path, dataset_type):
    """Create a five-sequence test repertoire under ``path`` and wrap it in a dataset.

    Args:
        path: directory that is created via ``PathBuilder.build`` and handed to
            ``Repertoire.build``; for the receptor case the pickle file is
            written here as well.
        dataset_type: ``"receptor"`` or ``"repertoire"``.

    Returns:
        A ``ReceptorDataset`` backed by ``receptors.pkl`` when ``dataset_type``
        is ``"receptor"``, a ``RepertoireDataset`` holding the single test
        repertoire when it is ``"repertoire"``, and ``None`` for any other value.
    """
    PathBuilder.build(path)

    n_sequences = 5
    alpha, beta = Chain.ALPHA, Chain.BETA

    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1"] * n_sequences,
        j_genes=["J1-1"] * n_sequences,
        chains=[alpha, beta, beta, alpha, beta],
        custom_lists={
            "custom_1": [f"CUST-{index}" for index in range(n_sequences)],
            "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2,
        },
        cell_ids=[1, 1, 1, 2, 2],
        path=path)

    if dataset_type == "receptor":
        # The receptor dataset is file based: dump the receptors of the
        # repertoire to a pickle file and point the dataset at it.
        receptordataset_filename = path / "receptors.pkl"
        with open(receptordataset_filename, "wb") as handle:
            pickle.dump(test_repertoire.receptors, handle)
        return ReceptorDataset(filenames=[receptordataset_filename],
                               identifier="receptor_dataset")

    if dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        return RepertoireDataset(repertoires=[test_repertoire])

    # Unknown dataset types yield no dataset.
    return None
def process_repertoire(repertoire: Repertoire, params: dict) -> Repertoire:
    """Group the rows of ``repertoire`` and build a new, collapsed repertoire.

    The repertoire data is loaded into a DataFrame, grouped by the fields
    returned by ``DuplicateSequenceFilter._prepare_group_by_field`` and
    aggregated with the mapping returned by
    ``DuplicateSequenceFilter._prepare_agg_dict``.

    Args:
        repertoire: the repertoire whose data is loaded and collapsed.
        params: passed to the two ``_prepare_*`` helpers; ``params["result_path"]``
            is used as the path of the new repertoire.

    Returns:
        A new ``Repertoire`` built from the aggregated rows, carrying a deep
        copy of the original metadata and a filename base of
        ``<original data file stem>_filtered``.
    """
    data = pd.DataFrame(repertoire.load_data())

    groupby_fields = DuplicateSequenceFilter._prepare_group_by_field(params, data.columns)
    # Every column that is not a standard repertoire field is a custom list.
    custom_lists = list(set(data.columns) - set(Repertoire.FIELDS))
    agg_dict = DuplicateSequenceFilter._prepare_agg_dict(params, data.columns, custom_lists)

    # Chain objects can not be aggregated, convert to strings
    data["chains"] = (
        [entry.value if isinstance(entry, Chain) else entry for entry in data["chains"]]
        if "chains" in data.columns
        else None
    )

    no_duplicates = data.groupby(groupby_fields).agg(agg_dict).reset_index()
    present_columns = no_duplicates.columns

    def values_or_none(column):
        # Columns missing from the aggregated frame are passed on as None.
        if column in present_columns:
            return list(no_duplicates[column])
        return None

    processed_repertoire = Repertoire.build(
        sequence_aas=values_or_none("sequence_aas"),
        sequences=values_or_none("sequences"),
        v_genes=values_or_none("v_genes"),
        j_genes=values_or_none("j_genes"),
        # Turn the string values back into Chain objects.
        chains=[Chain(key) for key in no_duplicates["chains"]] if "chains" in present_columns else None,
        counts=values_or_none("counts"),
        region_types=values_or_none("region_types"),
        custom_lists={name: list(no_duplicates[name]) for name in custom_lists},
        sequence_identifiers=list(no_duplicates["sequence_identifiers"]),
        metadata=copy.deepcopy(repertoire.metadata),
        path=params["result_path"],
        filename_base=f"{repertoire.data_filename.stem}_filtered")

    return processed_repertoire
def _create_dummy_data(self, path, dataset_type):
    """Create a five-sequence test repertoire under ``path`` and wrap it in a dataset.

    Args:
        path: directory that is created via ``PathBuilder.build`` and handed to
            ``Repertoire.build`` (and to ``ReceptorDataset.build_from_objects``
            for the receptor case).
        dataset_type: ``"receptor"`` or ``"repertoire"``.

    Returns:
        A ``ReceptorDataset`` built from the repertoire's receptors when
        ``dataset_type`` is ``"receptor"``, a ``RepertoireDataset`` holding the
        single test repertoire when it is ``"repertoire"``, and ``None`` for any
        other value.
    """
    PathBuilder.build(path)

    n_sequences = 5
    alpha, beta = Chain.ALPHA, Chain.BETA

    test_repertoire = Repertoire.build(
        sequence_aas=["DUPDUP", "AILUDGYF", "DFJKHJ", "DIUYUAG", "CTGTCGH"],
        v_genes=["V1-1"] * n_sequences,
        j_genes=["J1-1"] * n_sequences,
        chains=[alpha, beta, beta, alpha, beta],
        custom_lists={
            "custom_1": [f"CUST-{index}" for index in range(n_sequences)],
            "custom_2": ["CUST-A"] * 3 + ["CUST-B"] * 2,
        },
        cell_ids=["1", "1", "1", "2", "2"],
        path=path)

    if dataset_type == "receptor":
        receptor_dataset = ReceptorDataset.build_from_objects(
            test_repertoire.receptors, 100, path, name="receptor_dataset")
        # The identifier is set explicitly after building.
        receptor_dataset.identifier = "receptor_dataset"
        return receptor_dataset

    if dataset_type == "repertoire":
        test_repertoire.identifier = "repertoire_dataset"
        return RepertoireDataset(repertoires=[test_repertoire])

    # Unknown dataset types yield no dataset.
    return None
def test_implant_in_repertoire(self):
    """Implant a full-sequence signal into a three-sequence repertoire.

    With an implanting rate of 0.33 the test expects the repertoire size to
    stay the same, exactly one sequence to become "AAAA" and the other two to
    remain "CCCC".
    """
    path = PathBuilder.build(EnvironmentSettings.tmp_test_path / "full_seq_implanting/")

    motif = Motif("motif1", GappedKmerInstantiation(max_gap=0), "AAAA")
    signal = Signal("sig1", [motif], FullSequenceImplanting())

    repertoire = Repertoire.build(["CCCC", "CCCC", "CCCC"], path=path)
    new_repertoire = signal.implant_to_repertoire(repertoire, 0.33, path)

    def occurrences(amino_acids):
        # Number of sequences in the implanted repertoire equal to `amino_acids`.
        return sum(1 for seq in new_repertoire.sequences
                   if seq.amino_acid_sequence == amino_acids)

    self.assertEqual(len(repertoire.sequences), len(new_repertoire.sequences))
    self.assertEqual(1, occurrences("AAAA"))
    self.assertEqual(2, occurrences("CCCC"))

    shutil.rmtree(path)
def load_repertoire_as_object(import_class, metadata_row, params: DatasetImportParams):
    """Load the file named in ``metadata_row`` and build a ``Repertoire`` from it.

    Args:
        import_class: provides ``preprocess_dataframe`` and, optionally, an
            ``alternative_load_func`` attribute that is forwarded to
            ``ImportHelper.load_sequence_dataframe``.
        metadata_row: row with a ``'filename'`` entry; its ``to_dict()`` becomes
            the repertoire metadata.
        params: import parameters; ``params.path`` locates the input file and
            ``params.result_path / "repertoires/"`` is the repertoire path.

    Returns:
        The repertoire built from the preprocessed dataframe.

    Raises:
        RuntimeError: if anything fails while importing the file; the original
            exception is chained as the cause.
    """
    try:
        alternative_load_func = getattr(import_class, "alternative_load_func", None)
        filename = params.path / f"{metadata_row['filename']}"

        dataframe = ImportHelper.load_sequence_dataframe(filename, params, alternative_load_func)
        dataframe = import_class.preprocess_dataframe(dataframe, params)

        def column_values(name):
            return dataframe[name].values.tolist()

        # Standard repertoire fields are passed as individual keyword arguments ...
        sequence_lists = {}
        for field in Repertoire.FIELDS:
            if field in dataframe.columns:
                sequence_lists[field] = column_values(field)

        # ... and every remaining column goes into the custom lists.
        custom_fields = set(dataframe.columns) - set(Repertoire.FIELDS)
        sequence_lists["custom_lists"] = {field: column_values(field) for field in custom_fields}

        repertoire_inputs = {
            "metadata": metadata_row.to_dict(),
            "path": params.result_path / "repertoires/",
            "filename_base": filename.stem,
            **sequence_lists,
        }

        return Repertoire.build(**repertoire_inputs)
    except Exception as exception:
        raise RuntimeError(f"{ImportHelper.__name__}: error when importing file {metadata_row['filename']}.") from exception
def test_process(self):
    """Run DuplicateSequenceFilter on a seven-sequence repertoire in two modes.

    First the sequences are collapsed by amino acid sequence with summed
    counts, then by nucleotide sequence with minimum counts; the resulting
    attributes are compared against the expected values.
    """
    path = EnvironmentSettings.root_path / "test/tmp/duplicatesequencefilter/"
    PathBuilder.build(path)

    n_sequences = 7
    repertoire = Repertoire.build(
        sequence_aas=["AAA", "AAA", "CCC", "AAA", "CCC", "CCC", "CCC"],
        sequences=["ntAAA", "ntBBB", "ntCCC", "ntAAA", "ntCCC", "ntCCC", "ntDDD"],
        v_genes=["v1"] * n_sequences,
        j_genes=["j1"] * n_sequences,
        chains=[Chain.ALPHA] * 6 + [Chain.BETA],
        counts=[10, 20, 30, 5, 20, None, 40],
        region_types=["IMGT_CDR3"] * n_sequences,
        custom_lists={
            "custom1": ["yes"] * 3 + ["no"] * 4,
            "custom2": ["yes"] * 3 + ["no"] * 4,
        },
        sequence_identifiers=list(range(1, n_sequences + 1)),
        path=path)
    dataset = RepertoireDataset(repertoires=[repertoire])

    # collapse by amino acids & use sum counts
    dupfilter = DuplicateSequenceFilter(
        filter_sequence_type=SequenceType.AMINO_ACID,
        count_agg=CountAggregationFunction.SUM,
        batch_size=1)

    processed_dataset = dupfilter.process_dataset(dataset=dataset, result_path=path)
    reduced_repertoire = processed_dataset.repertoires[0]

    requested = ["sequence_identifiers", "sequence_aas", "sequences", "counts", "chains"]
    attr = reduced_repertoire.get_attributes(requested)

    self.assertEqual(3, len(reduced_repertoire.get_sequence_identifiers()))
    self.assertListEqual(["AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
    self.assertListEqual(["ntAAA", "ntCCC", "ntDDD"], list(attr["sequences"]))
    self.assertListEqual([35, 50, 40], list(attr["counts"]))
    self.assertListEqual([1, 3, 7], list(attr["sequence_identifiers"]))
    self.assertListEqual(["ALPHA", "ALPHA", "BETA"], list(attr["chains"]))

    # collapse by nucleotides & use min counts
    dupfilter = DuplicateSequenceFilter(
        filter_sequence_type=SequenceType.NUCLEOTIDE,
        count_agg=CountAggregationFunction.MIN,
        batch_size=4)

    processed_dataset = dupfilter.process_dataset(dataset=dataset, result_path=path)
    reduced_repertoire = processed_dataset.repertoires[0]

    requested = ["sequence_identifiers", "sequence_aas", "sequences", "counts"]
    attr = reduced_repertoire.get_attributes(requested)

    self.assertEqual(4, len(reduced_repertoire.get_sequence_identifiers()))
    self.assertListEqual([1, 2, 3, 7], list(attr["sequence_identifiers"]))
    self.assertListEqual(["AAA", "AAA", "CCC", "CCC"], list(attr["sequence_aas"]))
    self.assertListEqual(["ntAAA", "ntBBB", "ntCCC", "ntDDD"], list(attr["sequences"]))
    self.assertListEqual([5, 20, 20, 40], list(attr["counts"]))

    shutil.rmtree(path)