def _encode_new_dataset(self, dataset, params: EncoderParams):
    """Create a RepertoireDataset holding the matched-receptors encoding.

    The new dataset reuses the repertoires, labels and metadata file of
    ``dataset``; the encoding is attached through ``add_encoded_data``.

    Args:
        dataset: source dataset exposing ``repertoires``, ``labels`` and
            ``metadata_file``.
        params: encoder parameters, passed unchanged to
            ``_encode_repertoires``.

    Returns:
        The newly built RepertoireDataset with the encoded data added.
    """
    result = RepertoireDataset(
        repertoires=dataset.repertoires,
        labels=dataset.labels,
        metadata_file=dataset.metadata_file,
    )

    # PD dataframe with sequence and VDJ gene usage per reference receptor.
    annotations = self._get_feature_info()

    # count_matrix: np.ndarray with counts;
    # repertoire_ids: list of repertoire identifiers.
    count_matrix, label_values, repertoire_ids = self._encode_repertoires(
        dataset, params
    )

    # One feature name per annotation row, built from the reference receptor
    # identifier and its chain.
    name_template = "{receptor_id}.{chain}"
    receptor_names = [
        name_template.format(receptor_id=row["receptor_id"], chain=row["chain"])
        for _, row in annotations.iterrows()
    ]

    encoded = EncodedData(
        examples=count_matrix,
        example_ids=repertoire_ids,
        feature_names=receptor_names,
        feature_annotations=annotations,
        labels=label_values,
        encoding=MatchedReceptorsEncoder.__name__,
    )
    result.add_encoded_data(encoded)
    return result
def _encode_new_dataset(self, dataset, params: EncoderParams):
    """Create a RepertoireDataset holding the matched-regex encoding.

    ``_load_regex_df`` is called first, before anything else is built.
    The new dataset reuses the repertoires, labels and metadata file of
    ``dataset``; the encoding is attached through ``add_encoded_data``.

    Args:
        dataset: source dataset exposing ``repertoires``, ``labels``,
            ``metadata_file`` and ``get_metadata``.
        params: encoder parameters, passed unchanged to
            ``_encode_repertoires``.

    Returns:
        The newly built RepertoireDataset with the encoded data added.
    """
    self._load_regex_df()

    result = RepertoireDataset(
        repertoires=dataset.repertoires,
        labels=dataset.labels,
        metadata_file=dataset.metadata_file,
    )

    annotations = self._get_feature_info()
    examples, label_values = self._encode_repertoires(dataset, params)

    # Only "subject_id" is requested, so the first (presumably only) value of
    # the returned mapping is used as the example ids.
    subject_metadata = dataset.get_metadata(["subject_id"])
    subject_ids = list(subject_metadata.values())[0]

    chain_ids = list(annotations["chain_id"])

    encoded = EncodedData(
        examples=examples,
        example_ids=subject_ids,
        feature_names=chain_ids,
        feature_annotations=annotations,
        labels=label_values,
        encoding=MatchedRegexEncoder.__name__,
    )
    result.add_encoded_data(encoded)
    return result
def _encode_new_dataset(self, dataset, params: EncoderParams):
    """Create a RepertoireDataset holding the matched-sequences encoding.

    The new dataset reuses the repertoires, labels and metadata file of
    ``dataset``; the encoding is attached through ``add_encoded_data``.

    Args:
        dataset: source dataset exposing ``repertoires``, ``labels``,
            ``metadata_file`` and ``get_data``.
        params: encoder parameters, passed unchanged to
            ``_encode_repertoires``.

    Returns:
        The newly built RepertoireDataset with the encoded data added.
    """
    result = RepertoireDataset(
        repertoires=dataset.repertoires,
        labels=dataset.labels,
        metadata_file=dataset.metadata_file,
    )

    # Repertoires are encoded before the feature info is fetched; keep this
    # order in case the two helpers share state.
    examples, label_values = self._encode_repertoires(dataset, params)
    annotations = self._get_feature_info()

    sequence_ids = list(annotations["sequence_id"])
    # Example ids are the identifiers of the items yielded by get_data().
    repertoire_ids = [rep.identifier for rep in dataset.get_data()]

    encoded = EncodedData(
        examples=examples,
        labels=label_values,
        feature_names=sequence_ids,
        feature_annotations=annotations,
        example_ids=repertoire_ids,
        encoding=MatchedSequencesEncoder.__name__,
    )
    result.add_encoded_data(encoded)
    return result