def annotate_features(dataset: RepertoireDataset, criteria: dict, name: str = "annotation"):
    """
    Add a boolean column to an encoded dataset's feature_annotations.

    Each feature gets True/False depending on whether it matches the given
    criteria (as evaluated by CriteriaMatcher). The input dataset is not
    modified; a deep copy carrying the new annotation column is returned.

    :param dataset: an already-encoded RepertoireDataset
    :param criteria: criteria specification understood by CriteriaMatcher
    :param name: name of the new boolean annotation column
    :return: a new RepertoireDataset whose feature_annotations include `name`
    """
    annotated = copy.deepcopy(dataset)

    annotations = annotated.encoded_data.feature_annotations
    annotations[name] = CriteriaMatcher().match(criteria=criteria, data=annotations)

    new_encoding = EncodedData(examples=annotated.encoded_data.examples,
                               labels=annotated.encoded_data.labels,
                               example_ids=annotated.encoded_data.example_ids,
                               feature_names=annotated.encoded_data.feature_names,
                               feature_annotations=annotations)

    return RepertoireDataset(params=annotated.params,
                             encoded_data=new_encoding,
                             repertoires=annotated.get_data(),
                             identifier=annotated.identifier,
                             metadata_file=annotated.metadata_file)
def match(self, dataset: RepertoireDataset, reference_sequences: list, max_distance: int, summary_type: SequenceMatchingSummaryType) -> dict:
    """
    Match every repertoire in the dataset against the reference sequences.

    :param dataset: dataset whose repertoires will be matched
    :param reference_sequences: sequences to match each repertoire against
    :param max_distance: maximum allowed distance for a match
    :param summary_type: how per-repertoire matches are summarized
    :return: dict with key "repertoires" holding one match summary per
             repertoire (as produced by match_repertoire), in dataset order
    """
    return {
        "repertoires": [
            self.match_repertoire(repertoire, index, reference_sequences, max_distance, summary_type)
            for index, repertoire in enumerate(dataset.get_data())
        ]
    }
def process_dataset(self, dataset: RepertoireDataset):
    """
    Run the matching function over every repertoire in the dataset, then
    merge the temporary per-repertoire batches into the comparison matrix.

    Progress is logged after each repertoire, together with the current
    size of the comparison data matrix.
    """
    matching_fn = self.build_matching_fn()
    total = dataset.get_example_count()

    for position, repertoire in enumerate(dataset.get_data(), start=1):
        self.process_repertoire(repertoire, str(repertoire.identifier), matching_fn)
        logging.info("Repertoire {} ({}/{}) processed.".format(repertoire.identifier, position, total))
        logging.info(f"Currently, there are {self.item_count} items in the comparison data matrix.")

    self.merge_tmp_batches_to_matrix()
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    """
    Keep only repertoires whose clone (sequence) count satisfies the limits
    given in params.

    :param dataset: dataset to filter
    :param params: dict that may contain "lower_limit" (minimum number of
                   sequences, inclusive) and/or "upper_limit" (maximum number
                   of sequences, inclusive), plus "result_path" for the new
                   metadata file
    :return: a cloned dataset containing only the repertoires within limits
    :raises AssertionError: if no repertoire passes the filter
    """
    processed_dataset = dataset.clone()
    repertoires = []
    indices = []

    for index, repertoire in enumerate(dataset.get_data()):
        count = len(repertoire.sequences)
        # A repertoire is kept only if it satisfies EVERY limit that was
        # specified. The previous or-joined condition kept a repertoire
        # when it satisfied either limit, which made the filter a no-op
        # whenever both limits were set (any count is >= lower or <= upper).
        # Behavior with a single limit, or with no limits (nothing kept,
        # so check_dataset_not_empty raises), is unchanged.
        meets_lower = "lower_limit" not in params or count >= params["lower_limit"]
        meets_upper = "upper_limit" not in params or count <= params["upper_limit"]
        if ("lower_limit" in params or "upper_limit" in params) and meets_lower and meets_upper:
            repertoires.append(dataset.repertoires[index])
            indices.append(index)

    processed_dataset.repertoires = repertoires
    processed_dataset.metadata_file = ClonesPerRepertoireFilter.build_new_metadata(dataset, indices, params["result_path"])
    Filter.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")
    return processed_dataset
def _encode_repertoires(self, dataset: RepertoireDataset, params):
    """
    Build the repertoire x reference-sequence match matrix.

    Rows = repertoires, columns = reference sequences. When
    params.encode_labels is set, per-repertoire label values are collected
    as well; otherwise labels is None.

    :param dataset: dataset whose repertoires are encoded
    :param params: encoder params carrying label_config and encode_labels
    :return: (matrix of match counts, labels dict or None)
    """
    encoded_repertoires = np.zeros((dataset.get_example_count(), len(self.reference_sequences)), dtype=int)
    labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None

    for i, repertoire in enumerate(dataset.get_data()):
        encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)
        # Guard for encode_labels=False: labels is None then, and the
        # previous unconditional append crashed with a TypeError. This also
        # matches the regex-based encoder's behavior elsewhere in the file.
        if labels is not None:
            for label in params.label_config.get_labels_by_name():
                labels[label].append(repertoire.metadata[label])

    return encoded_repertoires, labels
def test_process(self):
    """ChainRepertoireFilter keeps only repertoires whose every sequence is on the requested chain."""
    path = EnvironmentSettings.root_path + "test/tmp/chain_filter/"
    PathBuilder.build(path)

    alpha_repertoire = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAA", metadata=SequenceMetadata(chain="A"), identifier="1")],
        path=path, metadata={})
    beta_repertoire = Repertoire.build_from_sequence_objects(
        [ReceptorSequence("AAC", metadata=SequenceMetadata(chain="B"), identifier="2")],
        path=path, metadata={})

    pd.DataFrame({"CD": [1, 0]}).to_csv(path + "metadata.csv")

    dataset = RepertoireDataset(repertoires=[alpha_repertoire, beta_repertoire],
                                metadata_file=path + "metadata.csv")

    filtered = ChainRepertoireFilter.process(dataset, {"keep_chain": "ALPHA",
                                                       "result_path": path + "results/"})

    # only the alpha repertoire survives, and the input dataset is untouched
    self.assertEqual(1, len(filtered.get_data()))
    self.assertEqual(2, len(dataset.get_data()))

    # metadata follows the surviving repertoire
    metadata_dict = filtered.get_metadata(["CD"])
    self.assertEqual(1, len(metadata_dict["CD"]))
    self.assertEqual(1, metadata_dict["CD"][0])

    for repertoire in filtered.get_data():
        self.assertEqual("AAA", repertoire.sequences[0].get_sequence())

    # filtering everything out must raise
    self.assertRaises(AssertionError, ChainRepertoireFilter.process, dataset,
                      {"keep_chain": "GAMMA", "result_path": path + "results/"})

    shutil.rmtree(path)
def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: str):
    """
    Train a Word2Vec model on k-mer "sentences" built from each repertoire.

    The vocabulary is the complete set of k-mers over the sequence alphabet;
    the model is trained incrementally per repertoire, saved to model_path,
    and returned.

    :param dataset: dataset providing the repertoires to train on
    :param k: k-mer length
    :param vector_size: dimensionality of the embedding vectors
    :param batch_size: batch size passed to dataset.get_data
    :param model_path: where the trained model is saved
    :return: the trained Word2Vec model
    """
    model = Word2Vec(size=vector_size, min_count=1, window=5)  # creates an empty model

    vocabulary = [[kmer] for kmer in KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())]
    model.build_vocab(vocabulary)

    for repertoire in dataset.get_data(batch_size=batch_size):
        sentences = KmerHelper.create_sentences_from_repertoire(repertoire=repertoire, k=k)
        model.train(sentences=sentences, total_words=len(vocabulary), epochs=15)

    model.save(model_path)
    return model
def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
    """
    Keep only repertoires in which every sequence is on the requested chain.

    :param dataset: dataset to filter
    :param params: dict with "keep_chain" (chain name understood by
                   Chain.get_chain) and "result_path" for the new metadata file
    :return: a cloned dataset containing only matching repertoires
    :raises AssertionError: if no repertoire passes the filter
    """
    processed_dataset = dataset.clone()
    PathBuilder.build(params["result_path"])

    kept_repertoires, kept_indices = [], []
    for index, repertoire in enumerate(dataset.get_data()):
        if all(sequence.metadata.chain == Chain.get_chain(params["keep_chain"])
               for sequence in repertoire.sequences):
            kept_repertoires.append(repertoire)
            kept_indices.append(index)

    processed_dataset.repertoires = kept_repertoires
    processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(processed_dataset, kept_indices, params["result_path"])

    Filter.check_dataset_not_empty(processed_dataset, "ChainRepertoireFilter")
    return processed_dataset
def group_features(dataset: RepertoireDataset, group_columns, group_summarization_type: GroupSummarizationType):
    """
    Group an encoded dataset's features by shared values of group_columns.

    Features with identical values (or value combinations) in group_columns
    are merged into one column, combining values per
    group_summarization_type. For NONZERO the matrix is binarized first so
    the merge reflects presence rather than magnitude. A new dataset built
    from a deep copy is returned; the input is not modified.
    """
    dataset = copy.deepcopy(dataset)

    if group_summarization_type == GroupSummarizationType.NONZERO:
        # work on presence/absence: overwrite the stored values with 1
        # (assumes examples is a sparse matrix exposing .data — TODO confirm)
        dataset.encoded_data.examples.data[:] = 1

    annotations = dataset.encoded_data.feature_annotations
    concatenated = DataSummarizer.concatenate_columns(annotations, group_columns)
    group_mask = DataSummarizer.create_group_mask(concatenated.values, group_summarization_type)

    # examples · mask collapses feature columns into one column per group
    grouped_examples = dataset.encoded_data.examples.dot(group_mask["mask"])
    grouped_annotations = DataSummarizer.split_values(group_mask["groups"], group_columns)

    encoded = EncodedData(examples=grouped_examples,
                          labels=dataset.encoded_data.labels,
                          example_ids=dataset.encoded_data.example_ids,
                          feature_names=group_mask["groups"],
                          feature_annotations=grouped_annotations)

    return RepertoireDataset(params=dataset.params,
                             encoded_data=encoded,
                             repertoires=dataset.get_data(),
                             identifier=dataset.identifier,
                             metadata_file=dataset.metadata_file)
def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
    """
    Build the repertoire x regex-match matrix.

    Rows = repertoires, columns = regex matches (one chain per column).
    When params.encode_labels is set, per-repertoire label values are
    collected as well; otherwise labels is None.

    :param dataset: dataset whose repertoires are encoded
    :param params: encoder params carrying label_config and encode_labels
    :return: (matrix of regex match counts, labels dict or None)
    """
    encoded_repertoires = np.zeros((dataset.get_example_count(), self.feature_count), dtype=int)
    labels = {label: [] for label in params.label_config.get_labels_by_name()} if params.encode_labels else None
    n_repertoires = dataset.get_example_count()

    for i, repertoire in enumerate(dataset.get_data()):
        # Progress via logging (consistent with the rest of this file)
        # instead of print with a hand-rolled timestamp; the logging
        # framework supplies timestamps and lazy %-args skip formatting
        # when the level is disabled.
        logging.info("Encoding repertoire %d/%d", i + 1, n_repertoires)
        encoded_repertoires[i] = self._match_repertoire_to_regexes(repertoire)
        if labels is not None:
            for label in params.label_config.get_labels_by_name():
                labels[label].append(repertoire.metadata[label])

    return encoded_repertoires, labels
def group_repertoires(dataset: RepertoireDataset, group_columns, group_summarization_type: GroupSummarizationType):
    """
    Group an encoded dataset's repertoires (rows) by shared label values.

    Repertoires with identical values (or value combinations) for
    group_columns are merged into one row, combining feature values per
    group_summarization_type, and a metadata file matching the grouped
    dataset is built. A new dataset built from a deep copy is returned;
    the input is not modified.
    """
    dataset = copy.deepcopy(dataset)

    label_table = pd.DataFrame(dataset.encoded_data.labels)
    concatenated = DataSummarizer.concatenate_columns(label_table, group_columns)
    group_mask = DataSummarizer.create_group_mask(concatenated.values, group_summarization_type)
    groups, mask = group_mask["groups"], group_mask["mask"]

    # maskᵀ · examples collapses repertoire rows into one row per group
    grouped_examples = mask.T.dot(dataset.encoded_data.examples)
    grouped_labels = DataSummarizer.split_values(groups, group_columns)
    metadata_file = DataSummarizer.build_metadata_from_labels(dataset.metadata_file, grouped_labels)

    encoded = EncodedData(examples=grouped_examples,
                          labels=grouped_labels.to_dict("list"),
                          example_ids=groups,
                          feature_names=dataset.encoded_data.feature_names,
                          feature_annotations=dataset.encoded_data.feature_annotations)

    return RepertoireDataset(params=dataset.params,
                             encoded_data=encoded,
                             repertoires=dataset.get_data(),
                             identifier=dataset.identifier,
                             metadata_file=metadata_file)