def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Return a new RepertoireDataset carrying the matched-receptors encoding.

        The returned dataset reuses the repertoires, labels and metadata file
        of ``dataset``; the encoding itself is attached through
        ``add_encoded_data``.

        Args:
            dataset: source dataset; must expose ``repertoires``, ``labels``
                and ``metadata_file``.
            params: encoder parameters, forwarded unchanged to
                ``self._encode_repertoires``.

        Returns:
            The newly created RepertoireDataset with encoded data attached.
        """
        result = RepertoireDataset(repertoires=dataset.repertoires,
                                   labels=dataset.labels,
                                   metadata_file=dataset.metadata_file)

        # One row per reference receptor chain: a pandas dataframe with
        # sequence and VDJ gene usage; read here through the "receptor_id"
        # and "chain" columns.
        annotations = self._get_feature_info()

        # counts: np.ndarray with counts; repertoire_ids: list of repertoire
        # identifiers (per the encoder's own conventions).
        counts, labels, repertoire_ids = self._encode_repertoires(dataset, params)

        # Feature names are "<receptor_id>.<chain>", one per annotation row.
        name_template = "{receptor_id}.{chain}"
        feature_names = []
        for _, annotation in annotations.iterrows():
            feature_names.append(
                name_template.format(receptor_id=annotation["receptor_id"],
                                     chain=annotation["chain"]))

        encoded = EncodedData(examples=counts,
                              example_ids=repertoire_ids,
                              feature_names=feature_names,
                              feature_annotations=annotations,
                              labels=labels,
                              encoding=MatchedReceptorsEncoder.__name__)
        result.add_encoded_data(encoded)

        return result
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Return a new RepertoireDataset carrying the matched-regex encoding.

        The returned dataset reuses the repertoires, labels and metadata file
        of ``dataset``; the encoding itself is attached through
        ``add_encoded_data``.

        Args:
            dataset: source dataset; must expose ``repertoires``, ``labels``,
                ``metadata_file`` and ``get_metadata``.
            params: encoder parameters, forwarded unchanged to
                ``self._encode_repertoires``.

        Returns:
            The newly created RepertoireDataset with encoded data attached.
        """
        # Runs before anything else; presumably prepares the regex table the
        # helpers below rely on -- confirm against _load_regex_df.
        self._load_regex_df()

        result = RepertoireDataset(
            repertoires=dataset.repertoires,
            labels=dataset.labels,
            metadata_file=dataset.metadata_file,
        )

        annotations = self._get_feature_info()
        examples, labels = self._encode_repertoires(dataset, params)

        # get_metadata is queried for a single field, so the first (only
        # expected) value of the returned mapping holds the subject ids that
        # serve as example identifiers.
        subject_metadata = dataset.get_metadata(["subject_id"])
        example_ids = list(subject_metadata.values())[0]

        # Features are named after the "chain_id" column of the annotations.
        feature_names = list(annotations["chain_id"])

        encoded = EncodedData(
            examples=examples,
            example_ids=example_ids,
            feature_names=feature_names,
            feature_annotations=annotations,
            labels=labels,
            encoding=MatchedRegexEncoder.__name__,
        )
        result.add_encoded_data(encoded)

        return result
예제 #3
0
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Return a new RepertoireDataset carrying the matched-sequences encoding.

        The returned dataset reuses the repertoires, labels and metadata file
        of ``dataset``; the encoding itself is attached through
        ``add_encoded_data``.

        Args:
            dataset: source dataset; must expose ``repertoires``, ``labels``,
                ``metadata_file`` and ``get_data``.
            params: encoder parameters, forwarded unchanged to
                ``self._encode_repertoires``.

        Returns:
            The newly created RepertoireDataset with encoded data attached.
        """
        result = RepertoireDataset(repertoires=dataset.repertoires,
                                   labels=dataset.labels,
                                   metadata_file=dataset.metadata_file)

        examples, labels = self._encode_repertoires(dataset, params)
        annotations = self._get_feature_info()

        # Features are named after the "sequence_id" column of the annotations.
        feature_names = list(annotations["sequence_id"])

        # Example ids are the identifiers of the repertoires, in the order
        # produced by dataset.get_data().
        example_ids = []
        for repertoire in dataset.get_data():
            example_ids.append(repertoire.identifier)

        encoded = EncodedData(examples=examples,
                              labels=labels,
                              feature_names=feature_names,
                              feature_annotations=annotations,
                              example_ids=example_ids,
                              encoding=MatchedSequencesEncoder.__name__)
        result.add_encoded_data(encoded)

        return result