Exemplo n.º 1
0
    def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
        receptor_objs = [receptor for receptor in dataset.get_data()]
        sequences = [[
            getattr(obj, chain).get_sequence(self.sequence_type)
            for chain in obj.get_chains()
        ] for obj in receptor_objs]
        first_chain_seqs, second_chain_seqs = zip(*sequences)

        if any(seq is None for seq in first_chain_seqs) or any(
                seq is None for seq in second_chain_seqs):
            raise ValueError(
                f"{OneHotEncoder.__name__}: receptor dataset {dataset.name} (id: {dataset.identifier}) contains empty sequences for the "
                f"specified sequence type {self.sequence_type.name.lower()}. Please check that the dataset is imported correctly."
            )

        max_seq_len = max(max([len(seq) for seq in first_chain_seqs]),
                          max([len(seq) for seq in second_chain_seqs]))

        example_ids = dataset.get_example_ids()
        labels = self._get_labels(receptor_objs,
                                  params) if params.encode_labels else None

        examples_first_chain = self._encode_sequence_list(
            first_chain_seqs,
            pad_n_sequences=len(receptor_objs),
            pad_sequence_len=max_seq_len)
        examples_second_chain = self._encode_sequence_list(
            second_chain_seqs,
            pad_n_sequences=len(receptor_objs),
            pad_sequence_len=max_seq_len)

        examples = np.stack((examples_first_chain, examples_second_chain),
                            axis=1)

        feature_names = self._get_feature_names(max_seq_len,
                                                receptor_objs[0].get_chains())

        if self.flatten:
            examples = examples.reshape(
                (len(receptor_objs),
                 2 * max_seq_len * len(self.onehot_dimensions)))
            feature_names = [
                item for sublist in feature_names for subsublist in sublist
                for item in subsublist
            ]

        encoded_data = EncodedData(
            examples=examples,
            labels=labels,
            example_ids=example_ids,
            feature_names=feature_names,
            encoding=OneHotEncoder.__name__,
            info={
                "chain_names":
                receptor_objs[0].get_chains() if all(
                    receptor_obj.get_chains() == receptor_objs[0].get_chains()
                    for receptor_obj in receptor_objs) else None
            })

        return encoded_data
Exemplo n.º 2
0
    def _encode_data(self, dataset: ReceptorDataset, params: EncoderParams):
        receptor_objs = [receptor for receptor in dataset.get_data()]
        sequences = [[
            getattr(obj, chain).get_sequence() for chain in obj.get_chains()
        ] for obj in receptor_objs]
        first_chain_seqs, second_chain_seqs = zip(*sequences)

        max_seq_len = max(max([len(seq) for seq in first_chain_seqs]),
                          max([len(seq) for seq in second_chain_seqs]))

        example_ids = dataset.get_example_ids()
        labels = self._get_labels(receptor_objs,
                                  params) if params.encode_labels else None

        examples_first_chain = self._encode_sequence_list(
            first_chain_seqs,
            pad_n_sequences=len(receptor_objs),
            pad_sequence_len=max_seq_len)
        examples_second_chain = self._encode_sequence_list(
            second_chain_seqs,
            pad_n_sequences=len(receptor_objs),
            pad_sequence_len=max_seq_len)

        examples = np.stack((examples_first_chain, examples_second_chain),
                            axis=1)

        feature_names = self._get_feature_names(max_seq_len,
                                                receptor_objs[0].get_chains())

        if self.flatten:
            examples = examples.reshape(
                (len(receptor_objs),
                 2 * max_seq_len * len(self.onehot_dimensions)))
            feature_names = [
                item for sublist in feature_names for subsublist in sublist
                for item in subsublist
            ]

        encoded_data = EncodedData(
            examples=examples,
            labels=labels,
            example_ids=example_ids,
            feature_names=feature_names,
            encoding=OneHotEncoder.__name__,
            info={
                "chain_names":
                receptor_objs[0].get_chains() if all(
                    receptor_obj.get_chains() == receptor_objs[0].get_chains()
                    for receptor_obj in receptor_objs) else None
            })

        return encoded_data