def process_dataset(self, dataset: RepertoireDataset, result_path: Path = None):
    self.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
    self.result_path = result_path if result_path is not None else self.result_path
    processed_dataset = dataset.clone()
    repertoires, indices = [], []

    for index, repertoire in enumerate(dataset.get_data()):
        # skip repertoires outside the configured clone count limits (-1 disables a limit)
        if self.lower_limit != -1 and len(repertoire.sequences) < self.lower_limit:
            continue
        if self.upper_limit != -1 and len(repertoire.sequences) > self.upper_limit:
            continue
        repertoires.append(dataset.repertoires[index])
        indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = self._build_new_metadata(dataset, indices)

    self.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

    return processed_dataset
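# A minimal usage sketch for the filter above. The constructor keyword arguments
# are assumptions for illustration (the class presumably stores lower_limit and
# upper_limit at construction time); the limit values and paths are arbitrary.
clone_filter = ClonesPerRepertoireFilter(lower_limit=10, upper_limit=1000)
filtered_dataset = clone_filter.process_dataset(dataset, result_path=Path("filtered"))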
def test_process(self): path = EnvironmentSettings.root_path / "test/tmp/subject_rep_collector" PathBuilder.build(path) reps = [ Repertoire.build_from_sequence_objects( [ReceptorSequence("AAA", identifier="1")], path=path, metadata={"subject_id": "patient1"}), Repertoire.build_from_sequence_objects( [ReceptorSequence("AAC", identifier="2")], path=path, metadata={"subject_id": "patient1"}), Repertoire.build_from_sequence_objects( [ReceptorSequence("AAC", identifier="3")], path=path, metadata={"subject_id": "patient3"}) ] dataset = RepertoireDataset(repertoires=reps) dataset2 = SubjectRepertoireCollector.process( dataset, {"result_path": path / "result"}) self.assertEqual(2, len(dataset2.get_data())) self.assertEqual(3, len(dataset.get_data())) values = [2, 1] for index, rep in enumerate(dataset2.get_data()): self.assertEqual(values[index], len(rep.sequences)) shutil.rmtree(path)
def process_dataset(self, dataset: RepertoireDataset):
    extract_fn = self.build_matching_fn()
    repertoire_count = dataset.get_example_count()

    for index, repertoire in enumerate(dataset.get_data()):
        self.process_repertoire(repertoire, str(repertoire.identifier), extract_fn)
        logging.info("Repertoire {} ({}/{}) processed.".format(repertoire.identifier, index + 1, repertoire_count))
        logging.info(f"Currently, there are {self.item_count} items in the comparison data matrix.")

    self.merge_tmp_batches_to_matrix()
def match(self, dataset: RepertoireDataset, reference_sequences: list, max_distance: int,
          summary_type: SequenceMatchingSummaryType) -> dict:
    matched = {"repertoires": []}

    for index, repertoire in enumerate(dataset.get_data()):
        matched["repertoires"].append(self.match_repertoire(repertoire, index, reference_sequences,
                                                            max_distance, summary_type))

    return matched
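# Hedged example of calling match() above; the reference sequence, distance, and
# summary type are placeholders, and SequenceMatchingSummaryType.COUNT is an
# assumed enum member, not confirmed by the code above.
result = matcher.match(dataset,
                       reference_sequences=[ReceptorSequence("CASSLGTDTQYF")],
                       max_distance=1,
                       summary_type=SequenceMatchingSummaryType.COUNT)
per_repertoire_summaries = result["repertoires"]  # one summary dict per repertoire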
def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: Path):
    # creates an empty model; `size` is the gensim 3.x name for the embedding dimension
    model = Word2Vec(size=vector_size, min_count=1, window=5)

    # build the vocabulary from all possible k-mers over the sequence alphabet
    all_kmers = KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
    all_kmers = [[kmer] for kmer in all_kmers]
    model.build_vocab(all_kmers)

    for repertoire in dataset.get_data(batch_size=batch_size):
        sentences = KmerHelper.create_sentences_from_repertoire(repertoire=repertoire, k=k)
        model.train(sentences=sentences, total_words=len(all_kmers), epochs=15)

    model.save(str(model_path))

    return model
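# Sketch of training and querying the k-mer embedding model above, assuming the
# gensim 3.x API used in create_model(); parameter values are illustrative and
# `encoder` is a placeholder for the object that owns the method.
model = encoder.create_model(dataset, k=3, vector_size=16, batch_size=2,
                             model_path=Path("kmer_w2v.model"))
kmer_vector = model.wv["AAC"]  # 16-dimensional embedding of one 3-mer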
def export_updated_metadata(dataset: RepertoireDataset, result_path: Path, repertoire_folder: str):
    df = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
    identifiers = df["identifier"].values.tolist() if "identifier" in df.columns else dataset.get_example_ids()
    df["filename"] = [str(Path(repertoire_folder) / f"{repertoire.data_filename.stem}.tsv")
                      for repertoire in dataset.get_data()]
    df["identifier"] = identifiers
    df.to_csv(result_path / "metadata.csv", index=False)
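# Illustrative call to the exporter above; the folder names are placeholders. It
# rewrites the filename column to point at the exported .tsv repertoire files and
# writes the updated table to <result_path>/metadata.csv.
export_updated_metadata(dataset, result_path=Path("exported"), repertoire_folder="repertoires")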
def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
    # Rows = repertoires, Columns = reference chains (two per receptor, one for each chain)
    encoded_repertoires = np.zeros((dataset.get_example_count(), len(self.reference_receptors) * 2), dtype=int)
    labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

    for i, repertoire in enumerate(dataset.get_data()):
        encoded_repertoires[i] = self._match_repertoire_to_receptors(repertoire)

        if labels is not None:
            for label_name in params.label_config.get_labels_by_name():
                labels[label_name].append(repertoire.metadata[label_name])

    return encoded_repertoires, labels, dataset.get_repertoire_ids()
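# Shape sketch for the encoding above (`encoder` and `params` are placeholders):
# with N repertoires and R reference receptors, the matrix is N x 2R, one column
# per receptor chain.
matrix, labels, ids = encoder._encode_repertoires(dataset, params)
assert matrix.shape == (dataset.get_example_count(), len(encoder.reference_receptors) * 2)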
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    Preprocessor.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
    processed_dataset = dataset.clone()
    repertoires = []
    indices = []

    for index, repertoire in enumerate(dataset.get_data()):
        # keep a repertoire only when it satisfies every limit that is configured
        if "lower_limit" in params and len(repertoire.sequences) < params["lower_limit"]:
            continue
        if "upper_limit" in params and len(repertoire.sequences) > params["upper_limit"]:
            continue
        repertoires.append(dataset.repertoires[index])
        indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices,
                                                                                   params["result_path"])

    Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

    return processed_dataset
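# Hedged example of the static variant above; limits and result path are
# arbitrary illustration values.
filtered = ClonesPerRepertoireFilter.process(dataset, {"lower_limit": 10,
                                                       "upper_limit": 1000,
                                                       "result_path": Path("filtered")})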
def test_process(self): path = EnvironmentSettings.root_path / "test/tmp/chain_filter/" PathBuilder.build(path) rep1 = Repertoire.build_from_sequence_objects([ ReceptorSequence( "AAA", metadata=SequenceMetadata(chain="A"), identifier="1") ], path=path, metadata={}) rep2 = Repertoire.build_from_sequence_objects([ ReceptorSequence( "AAC", metadata=SequenceMetadata(chain="B"), identifier="2") ], path=path, metadata={}) metadata = pd.DataFrame({"CD": [1, 0]}) metadata.to_csv(path / "metadata.csv") dataset = RepertoireDataset(repertoires=[rep1, rep2], metadata_file=path / "metadata.csv") dataset2 = ChainRepertoireFilter.process( dataset, { "keep_chain": "ALPHA", "result_path": path / "results" }) self.assertEqual(1, len(dataset2.get_data())) self.assertEqual(2, len(dataset.get_data())) metadata_dict = dataset2.get_metadata(["CD"]) self.assertEqual(1, len(metadata_dict["CD"])) self.assertEqual(1, metadata_dict["CD"][0]) for rep in dataset2.get_data(): self.assertEqual("AAA", rep.sequences[0].get_sequence()) self.assertRaises(AssertionError, ChainRepertoireFilter.process, dataset, { "keep_chain": "GAMMA", "result_path": path / "results" }) shutil.rmtree(path)
def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
    # Rows = repertoires, Columns = regex matches (one chain per column)
    encoded_repertoires = np.zeros((dataset.get_example_count(), self.feature_count), dtype=int)
    labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None
    n_repertoires = dataset.get_example_count()

    for i, repertoire in enumerate(dataset.get_data()):
        logging.info(f"Encoding repertoire {i + 1}/{n_repertoires}")
        encoded_repertoires[i] = self._match_repertoire_to_regexes(repertoire)

        if labels is not None:
            for label_name in params.label_config.get_labels_by_name():
                labels[label_name].append(repertoire.metadata[label_name])

    return encoded_repertoires, labels
def _encode_repertoires(self, dataset: RepertoireDataset, params):
    # Rows = repertoires, Columns = reference sequences
    encoded_repertoires = np.zeros((dataset.get_example_count(), len(self.reference_sequences)), dtype=int)
    labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

    for i, repertoire in enumerate(dataset.get_data()):
        encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)

        # guard against labels being None when encode_labels is disabled
        if labels is not None:
            for label_name in params.label_config.get_labels_by_name():
                labels[label_name].append(repertoire.metadata[label_name])

    return encoded_repertoires, labels
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])
    repertoires = []
    indices = []
    keep_chain = Chain.get_chain(params["keep_chain"])

    for index, repertoire in enumerate(dataset.get_data()):
        if all(sequence.metadata.chain == keep_chain for sequence in repertoire.sequences):
            repertoires.append(repertoire)
            indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(processed_dataset, indices,
                                                                               params["result_path"])

    Filter.check_dataset_not_empty(processed_dataset, "ChainRepertoireFilter")

    return processed_dataset
def process_dataset(self, dataset: RepertoireDataset, result_path: Path):
    self.check_dataset_type(dataset, [RepertoireDataset], "ChainRepertoireFilter")
    processed_dataset = dataset.clone()
    self.result_path = result_path if result_path is not None else self.result_path
    repertoires = []
    indices = []

    for index, repertoire in enumerate(dataset.get_data()):
        # keep only repertoires whose sequences all come from the configured chain
        if all(sequence.metadata.chain == self.keep_chain for sequence in repertoire.sequences):
            repertoires.append(repertoire)
            indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = self._build_new_metadata(processed_dataset, indices)

    self.check_dataset_not_empty(processed_dataset, "ChainRepertoireFilter")

    return processed_dataset
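# A minimal sketch of the instance-based variant above, assuming keep_chain is a
# Chain enum member supplied at construction time; the constructor signature is
# an assumption for illustration.
chain_filter = ChainRepertoireFilter(keep_chain=Chain.ALPHA)
alpha_only_dataset = chain_filter.process_dataset(dataset, result_path=Path("alpha_only"))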