def _encode_sequence_count(
    self,
    dataset: RepertoireDataset,
    comparison_data: ComparisonData,
    label: str,
    params: EncoderParams,
) -> EncodedData:
    """Encode `dataset` as a count matrix over the selected sequences.

    The sequence indices are obtained from
    `SequenceFilterHelper.get_relevant_sequences` for the given `label`,
    using this encoder's `p_value_threshold`, `comparison_attributes` and
    (possibly still unset) `relevant_indices_path`.

    Side effects: `self.relevant_indices_path` and
    `self.relevant_sequence_csv_path` are filled in from the helper's
    return values, but only when they are currently `None`.

    Returns:
        An `EncodedData` holding the count matrix, the metadata for `label`
        (only when `params.encode_labels` is truthy, otherwise `None`), the
        repertoire ids, the item names at the selected indices, and an
        `info` dict pointing to the relevant-sequence CSV path.
    """
    selected_indices, found_indices_path, found_csv_path = SequenceFilterHelper.get_relevant_sequences(
        dataset=dataset,
        params=params,
        comparison_data=comparison_data,
        label=label,
        p_value_threshold=self.p_value_threshold,
        comparison_attributes=self.comparison_attributes,
        sequence_indices_path=self.relevant_indices_path,
    )

    # Keep paths that were already set; only fill in the missing ones.
    if self.relevant_indices_path is None:
        self.relevant_indices_path = found_indices_path
    if self.relevant_sequence_csv_path is None:
        self.relevant_sequence_csv_path = found_csv_path

    counts = self._build_count_matrix(
        comparison_data, dataset.get_repertoire_ids(), selected_indices
    )
    selected_names = comparison_data.get_item_names()[selected_indices]

    if params.encode_labels:
        label_values = dataset.get_metadata([label])
    else:
        label_values = None

    return EncodedData(
        counts,
        label_values,
        dataset.get_repertoire_ids(),
        selected_names,
        encoding=SequenceCountEncoder.__name__,
        info={'relevant_sequence_path': self.relevant_sequence_csv_path},
    )
def create_comparison_data(self, dataset: RepertoireDataset) -> ComparisonData:
    """Create a `ComparisonData` object for `dataset` and populate it.

    The object is constructed from the dataset's repertoire ids together
    with this instance's `matching_columns`, `sequence_batch_size` and
    `path`, and then `process_dataset(dataset)` is invoked on it.

    Returns:
        The processed `ComparisonData` instance.
    """
    repertoire_ids = dataset.get_repertoire_ids()
    result = ComparisonData(
        repertoire_ids,
        self.matching_columns,
        self.sequence_batch_size,
        self.path,
    )
    result.process_dataset(dataset)
    return result
def build_comparison_data(
    dataset: RepertoireDataset,
    params: EncoderParams,
    comparison_attributes,
    sequence_batch_size,
):
    """Construct and process a `ComparisonData` object for `dataset`.

    Args:
        dataset: dataset supplying the repertoire ids and the data to process.
        params: encoder parameters; only `params.result_path` is read here.
        comparison_attributes: forwarded unchanged to `ComparisonData`.
        sequence_batch_size: forwarded unchanged to `ComparisonData`.

    Returns:
        The `ComparisonData` instance after `process_dataset(dataset)` ran.
    """
    repertoire_ids = dataset.get_repertoire_ids()
    comparison_data = ComparisonData(
        repertoire_ids,
        comparison_attributes,
        sequence_batch_size,
        params.result_path,
    )
    comparison_data.process_dataset(dataset)
    return comparison_data
def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:
    """Collect label values per label name, ordered like the dataset's repertoire ids.

    The metadata is fetched as a data frame containing the
    "repertoire_identifier" column plus every label returned by
    `params.label_config.get_labels_by_name()`. Rows are then rearranged so
    that they follow the order of `dataset.get_repertoire_ids()`.

    Returns:
        A dict mapping each label name to the list of its values; the
        "repertoire_identifier" column is removed before returning.
    """
    columns = ["repertoire_identifier", *params.label_config.get_labels_by_name()]
    metadata = dataset.get_metadata(columns, return_df=True)

    # Positions of the dataset's repertoire ids within the identifier column.
    # NOTE(review): get_indexer yields -1 for ids that are absent from the
    # metadata, which iloc would read as "last row" - confirm ids always match.
    identifier_index = pd.Index(metadata['repertoire_identifier'])
    row_positions = identifier_index.get_indexer(dataset.get_repertoire_ids())

    labels_by_name = metadata.iloc[row_positions].to_dict("list")
    labels_by_name.pop("repertoire_identifier")
    return labels_by_name
def _calculate_sequence_abundance(
    self,
    dataset: RepertoireDataset,
    comparison_data: ComparisonData,
    label: str,
    params: EncoderParams,
):
    """Build the abundance matrix for `dataset` from the selected sequences.

    The sequence indices come from
    `SequenceFilterHelper.get_relevant_sequences`, called with this
    encoder's `p_value_threshold`, `comparison_attributes` and current
    `relevant_indices_path`.

    Side effects: `self.relevant_indices_path` and
    `self.relevant_sequence_csv_path` are set from the helper's return
    values, but only when they are currently `None`.

    Returns:
        Whatever `self._build_abundance_matrix` produces for the comparison
        data, the dataset's repertoire ids and the selected indices.
    """
    selected_indices, found_indices_path, found_csv_path = SequenceFilterHelper.get_relevant_sequences(
        dataset=dataset,
        params=params,
        comparison_data=comparison_data,
        label=label,
        p_value_threshold=self.p_value_threshold,
        comparison_attributes=self.comparison_attributes,
        sequence_indices_path=self.relevant_indices_path,
    )

    # Previously stored paths take precedence over freshly returned ones.
    if self.relevant_indices_path is None:
        self.relevant_indices_path = found_indices_path
    if self.relevant_sequence_csv_path is None:
        self.relevant_sequence_csv_path = found_csv_path

    return self._build_abundance_matrix(
        comparison_data, dataset.get_repertoire_ids(), selected_indices
    )
def build_distance_matrix(
    self,
    dataset: RepertoireDataset,
    params: EncoderParams,
    train_repertoire_ids: list,
):
    """Compute pairwise repertoire distances and slice them for `dataset`.

    A new `PairwiseRepertoireComparison` is stored on `self.comparison`.
    The comparison runs on `self.context["dataset"]` when the context is
    set and has a "dataset" entry; otherwise it runs on `dataset` itself.

    Returns:
        The part of the distance matrix whose rows are the repertoire ids
        of `dataset` and whose columns are `train_repertoire_ids`.
    """
    self.comparison = PairwiseRepertoireComparison(
        self.attributes_to_match,
        self.attributes_to_match,
        params.result_path,
        sequence_batch_size=self.sequence_batch_size,
    )

    # Prefer the dataset stored in the context, if there is one.
    if self.context is not None and "dataset" in self.context:
        dataset_to_compare = self.context["dataset"]
    else:
        dataset_to_compare = dataset

    all_distances = self.comparison.compare(
        dataset_to_compare, self.distance_fn, self.distance_metric.value
    )

    # Rows always follow the ids of the dataset passed in, not the context one.
    row_ids = dataset.get_repertoire_ids()
    return all_distances.loc[row_ids, train_repertoire_ids]
def compare_repertoires(self, dataset: RepertoireDataset, comparison_fn):
    """Apply `comparison_fn` to every pair of repertoires in `dataset`.

    The comparison data is obtained via `self.memo_by_params(dataset)` and
    kept on `self.comparison_data`. `comparison_fn` is called only for
    pairs with column index >= row index (diagonal included); each value is
    written to both mirrored cells, so the result is symmetric.

    Args:
        dataset: dataset providing the example count and repertoire ids.
        comparison_fn: callable taking two repertoire vectors and returning
            the value to store for that pair.

    Returns:
        A `pandas.DataFrame` indexed and labelled by repertoire id.
    """
    self.comparison_data = self.memo_by_params(dataset)

    size = dataset.get_example_count()
    scores = np.zeros([size, size])
    ids = dataset.get_repertoire_ids()

    for row in range(size):
        row_vector = self.comparison_data.get_repertoire_vector(ids[row])
        # Only the upper triangle is computed; the lower one is mirrored.
        for col in range(row, size):
            col_vector = self.comparison_data.get_repertoire_vector(ids[col])
            score = comparison_fn(row_vector, col_vector)
            scores[row, col] = scores[col, row] = score

    return pd.DataFrame(scores, columns=ids, index=ids)
def _encode_data(self, dataset: RepertoireDataset, params: EncoderParams):
    """Encode `dataset` by sequence abundance for its single configured label.

    Exactly one label must be configured in `params.label_config`; this is
    checked with an assertion. The examples are produced by
    `self._calculate_sequence_abundance` using `self.comparison_data`.

    Returns:
        A new `RepertoireDataset` with the params and repertoires of
        `dataset` and the freshly built `EncodedData` attached. Label
        metadata is included only when `params.encode_labels` is truthy.
    """
    label_names = params.label_config.get_labels_by_name()

    assert len(label_names) == 1, \
        "SequenceAbundanceEncoder: this encoding works only for single label."

    label = label_names[0]
    abundances = self._calculate_sequence_abundance(
        dataset, self.comparison_data, label, params
    )

    if params.encode_labels:
        label_values = dataset.get_metadata([label])
    else:
        label_values = None

    feature_names = [
        SequenceAbundanceEncoder.RELEVANT_SEQUENCE_ABUNDANCE,
        SequenceAbundanceEncoder.TOTAL_SEQUENCE_ABUNDANCE,
    ]

    encoded = EncodedData(
        abundances,
        label_values,
        dataset.get_repertoire_ids(),
        feature_names,
        encoding=SequenceAbundanceEncoder.__name__,
        info={'relevant_sequence_path': self.relevant_sequence_csv_path},
    )

    return RepertoireDataset(
        params=dataset.params,
        encoded_data=encoded,
        repertoires=dataset.repertoires,
    )