def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
    """Encode ``dataset`` as sequence abundances for exactly one label.

    The example values are produced by ``self._calculate_sequence_abundance``
    from ``self.comparison_data`` and wrapped in an ``EncodedData`` object whose
    two feature names are ``RELEVANT_SEQUENCE_ABUNDANCE`` and
    ``TOTAL_SEQUENCE_ABUNDANCE``.

    :param dataset: repertoire dataset to encode
    :param params: encoder parameters; ``label_config`` must yield exactly one
        label name, and ``encode_labels`` decides whether the label metadata is
        attached to the encoded data (``None`` otherwise)
    :return: a new ``RepertoireDataset`` that reuses the labels and repertoires
        of ``dataset`` and carries the encoded data
    :raises AssertionError: if the label configuration does not contain exactly
        one label
    """
    label_names = params.label_config.get_labels_by_name()
    assert len(label_names) == 1, \
        "SequenceAbundanceEncoder: this encoding works only for single label."
    label_name = label_names[0]

    abundances = self._calculate_sequence_abundance(dataset, self.comparison_data, label_name, params)

    if params.encode_labels:
        label_values = dataset.get_metadata([label_name])
    else:
        label_values = None

    repertoire_ids = dataset.get_repertoire_ids()
    feature_names = [SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                     SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE]

    encoded = EncodedData(abundances, label_values, repertoire_ids, feature_names,
                          encoding=SequenceAbundanceEncoder.__name__,
                          info={'relevant_sequence_path': self.relevant_sequence_csv_path})

    return RepertoireDataset(labels=dataset.labels,
                             encoded_data=encoded,
                             repertoires=dataset.repertoires)
def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:
    """Collect the label values of ``dataset`` in repertoire-id order.

    The metadata is fetched as a DataFrame together with the
    ``repertoire_identifier`` column, its rows are reordered to follow
    ``dataset.get_repertoire_ids()``, and the identifier column is dropped
    again before returning.

    :param dataset: repertoire dataset whose metadata holds the labels
    :param params: encoder parameters; ``label_config`` supplies the label names
    :return: dict mapping each label name to the list of its values, one entry
        per repertoire id, in the order of ``dataset.get_repertoire_ids()``
    :raises ValueError: if a repertoire id of the dataset does not occur in the
        ``repertoire_identifier`` column of the metadata
    """
    label_names = params.label_config.get_labels_by_name()
    metadata = dataset.get_metadata(["repertoire_identifier", *label_names], return_df=True)

    repertoire_ids = dataset.get_repertoire_ids()
    positions = pd.Index(metadata["repertoire_identifier"]).get_indexer(repertoire_ids)

    # get_indexer reports an id it cannot find as -1; passing that on to iloc
    # would silently select the LAST metadata row and attach wrong labels.
    unmatched = [rep_id for rep_id, position in zip(repertoire_ids, positions) if position < 0]
    if unmatched:
        raise ValueError(f"build_labels: no metadata row found for repertoire identifier(s): {unmatched}")

    labels = metadata.iloc[positions].to_dict("list")
    del labels["repertoire_identifier"]
    return labels
def _encode_sequence_count(self, dataset: RepertoireDataset, comparison_data: ComparisonData, label: str,
                           params: EncoderParams) -> EncodedData:
    """Build the encoded data holding counts of the label-relevant sequences.

    ``SequenceFilterHelper.get_relevant_sequences`` selects the indices of the
    relevant sequences; the paths it returns are stored on the encoder only if
    the corresponding attribute has not been set yet. The count matrix is then
    built by ``self._build_count_matrix`` for those indices, and the feature
    names are the matching entries of ``comparison_data.get_item_names()``.

    :param dataset: repertoire dataset to encode
    :param comparison_data: comparison data the counts and item names come from
    :param label: name of the label used to select relevant sequences
    :param params: encoder parameters; ``encode_labels`` decides whether the
        label metadata is attached (``None`` otherwise)
    :return: ``EncodedData`` with the count matrix, optional labels, repertoire
        ids and feature names
    """
    relevant_indices, new_indices_path, new_sequences_path = SequenceFilterHelper.get_relevant_sequences(
        dataset, params, comparison_data, label, self.p_value_threshold,
        self.comparison_attributes, self.relevant_indices_path)

    # keep paths that were already configured; only fill in missing ones
    if self.relevant_indices_path is None:
        self.relevant_indices_path = new_indices_path
    if self.relevant_sequence_csv_path is None:
        self.relevant_sequence_csv_path = new_sequences_path

    counts = self._build_count_matrix(comparison_data, dataset.get_repertoire_ids(), relevant_indices)
    feature_names = comparison_data.get_item_names()[relevant_indices]

    if params.encode_labels:
        label_values = dataset.get_metadata([label])
    else:
        label_values = None

    return EncodedData(counts, label_values, dataset.get_repertoire_ids(), feature_names,
                       encoding=SequenceCountEncoder.__name__,
                       info={'relevant_sequence_path': self.relevant_sequence_csv_path})
def _encode_examples(self, dataset: RepertoireDataset, params: EncoderParams) -> Tuple[list, set, dict]:
    """Process every repertoire of ``dataset`` in a worker pool.

    Each repertoire is passed to ``self._process_repertoire_cached`` together
    with its position and the total example count. The keys of all returned
    examples are merged into one set.

    :param dataset: repertoire dataset to encode
    :param params: encoder parameters; ``pool_size`` sets the number of worker
        processes, ``encode_labels`` decides whether label metadata is fetched
    :return: tuple ``(examples, keys, labels)`` where ``examples`` is the list
        returned by the pool (one entry per repertoire, in input order),
        ``keys`` is the union of all example keys and ``labels`` is the label
        metadata, or ``None`` when ``params.encode_labels`` is falsy
    """
    example_count = dataset.get_example_count()
    arguments = [(repertoire, index, example_count) for index, repertoire in enumerate(dataset.repertoires)]

    with Pool(params.pool_size) as pool:
        # Reuse the count fetched above; integer floor division avoids the
        # float round trip of math.floor(a / b).
        chunksize = example_count // params.pool_size + 1
        examples = pool.starmap(self._process_repertoire_cached, arguments, chunksize=chunksize)

    keys = set()
    for example in examples:
        # set.update consumes the keys view directly, no intermediate list needed
        keys.update(example.keys())

    if params.encode_labels:
        labels = dataset.get_metadata(params.label_config.get_labels_by_name())
    else:
        labels = None

    return examples, keys, labels
def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
    """Encode ``dataset`` as sequence abundances for the first configured label.

    The example values are produced by ``self._calculate_sequence_abundance``
    from ``self.sequence_presence_matrix`` and ``self.matrix_repertoire_ids``,
    then wrapped in an ``EncodedData`` object whose two feature names are
    ``RELEVANT_SEQUENCE_ABUNDANCE`` and ``TOTAL_SEQUENCE_ABUNDANCE``.

    :param dataset: repertoire dataset to encode
    :param params: encoder parameters; only the first label name of
        ``label_config`` is used, and ``encode_labels`` decides whether the
        label metadata is attached (``None`` otherwise)
    :return: a new ``RepertoireDataset`` that reuses the labels and repertoires
        of ``dataset`` and carries the encoded data
    """
    label_name = params.label_config.get_labels_by_name()[0]

    abundances = self._calculate_sequence_abundance(dataset,
                                                    self.sequence_presence_matrix,
                                                    self.matrix_repertoire_ids,
                                                    label_name,
                                                    params)

    label_values = None
    if params.encode_labels:
        label_values = dataset.get_metadata([label_name])

    repertoire_ids = dataset.get_repertoire_ids()
    feature_names = [CompAIRRSequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
                     CompAIRRSequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE]

    encoded = EncodedData(abundances, label_values, repertoire_ids, feature_names,
                          encoding=CompAIRRSequenceAbundanceEncoder.__name__,
                          info={'relevant_sequence_path': self.relevant_sequence_csv_path})

    return RepertoireDataset(labels=dataset.labels,
                             encoded_data=encoded,
                             repertoires=dataset.repertoires)
def get_matching_indices(dataset: RepertoireDataset, criteria):
    """Return the positions of the metadata entries that satisfy ``criteria``.

    The full metadata of ``dataset`` (``get_metadata(None)``) is loaded into a
    DataFrame and evaluated by ``CriteriaMatcher().match``.

    :param dataset: repertoire dataset whose metadata is matched
    :param criteria: criteria specification passed through to ``CriteriaMatcher``
    :return: numpy array with the first-axis indices at which the matcher's
        result is truthy
    """
    metadata_frame = pd.DataFrame(dataset.get_metadata(None))
    is_match = CriteriaMatcher().match(criteria, metadata_frame)
    return np.where(is_match)[0]
def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:
    """Fetch the metadata of ``dataset`` for the configured labels.

    :param dataset: repertoire dataset whose metadata holds the labels
    :param params: encoder parameters; ``label_config`` supplies the label names
    :return: whatever ``dataset.get_metadata`` yields for those label names when
        asked not to return a DataFrame (``return_df=False``)
    """
    label_names = params.label_config.get_labels_by_name()
    label_values = dataset.get_metadata(label_names, return_df=False)
    return label_values