Example #1
    def prepare_encoder(self, priming_data):
        random.seed(len(priming_data))

        if self._prepared:
            raise Exception('You can only call "prepare_encoder" once for a given encoder.')

        self.onehot_encoder.prepare_encoder(priming_data)

        input_len = self.onehot_encoder._lang.n_words
        self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

        if self.use_autoencoder:
            logging.info('Preparing a categorical autoencoder, this might take a while')

            embeddings_layer_len = self.max_encoded_length

            self.net = DefaultNet(dynamic_parameters={}, shape=[
                                  input_len, embeddings_layer_len, input_len], selfaware=False)

            criterion = torch.nn.CrossEntropyLoss()
            optimizer = Ranger(self.net.parameters())

            gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion,
                      device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode,
                      output_encoder=self._encoder_targets)

            # guard against a zero batch size when priming_data has fewer than 50 rows
            batch_size = max(1, min(200, int(len(priming_data) / 50)))

            priming_data_str = [str(x) for x in priming_data]
            train_data_loader = DataLoader(list(zip(priming_data_str,priming_data_str)), batch_size=batch_size, shuffle=True)

            test_data_loader = None

            best_model, error, training_time = gym.fit(train_data_loader,
                                                       test_data_loader,
                                                       desired_error=self.desired_error,
                                                       max_time=self.max_training_time,
                                                       callback=self._train_callback,
                                                       eval_every_x_epochs=1,
                                                       max_unimproving_models=5)

            self.net = best_model.to(self.net.device)

            modules = [module for module in self.net.modules() if type(
                module) != torch.nn.Sequential and type(module) != DefaultNet]
            self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
            self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
            logging.info('Categorical autoencoder ready')

        self._prepared = True
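
The last lines of prepare_encoder above split the trained net into separate encoder and decoder halves by slicing its leaf modules. The sketch below reproduces that trick with plain torch only, so it runs without any lightwood dependencies; the layer sizes and the SELU activation are illustrative assumptions, not what DefaultNet actually builds.

import torch

input_len, embeddings_layer_len = 500, 100

# stand-in for the trained autoencoder: one-hot -> embedding -> logits
net = torch.nn.Sequential(
    torch.nn.Linear(input_len, embeddings_layer_len),
    torch.nn.SELU(),
    torch.nn.Linear(embeddings_layer_len, input_len),
)

# keep only the leaf modules, skipping the Sequential container itself
modules = [m for m in net.modules() if not isinstance(m, torch.nn.Sequential)]
encoder = torch.nn.Sequential(*modules[0:2]).eval()   # Linear + activation
decoder = torch.nn.Sequential(*modules[2:3]).eval()   # final Linear

with torch.no_grad():
    one_hot = torch.zeros(1, input_len)
    one_hot[0, 42] = 1.0
    embedding = encoder(one_hot)           # shape (1, 100)
    reconstruction = decoder(embedding)    # shape (1, 500)
print(embedding.shape, reconstruction.shape)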
Example #2
class DistilBertEncoder:
    def __init__(self, is_target=False, aim=ENCODER_AIM.BALANCE):
        self.name = 'Text Transformer Encoder'
        self._tokenizer = None
        self._model = None
        self._pad_id = None
        self._pytorch_wrapper = torch.FloatTensor
        self._max_len = None
        self._max_ele = None
        self._prepared = False
        self._model_type = None
        self.desired_error = 0.01
        self.max_training_time = CONFIG.MAX_ENCODER_TRAINING_TIME
        self._head = None
        # Possible: speed, balance, accuracy
        self.aim = aim

        if self.aim == ENCODER_AIM.SPEED:
            # Uses more memory, takes very long to train and outputs weird debugging
            # statements to the command line; consider waiting until it gets better or
            # try to investigate why this happens (changing the pretrained model
            # doesn't seem to help).
            self._classifier_model_class = AlbertForSequenceClassification
            self._embeddings_model_class = AlbertModel
            self._tokenizer_class = AlbertTokenizer
            self._pretrained_model_name = 'albert-base-v2'
            self._model_max_len = 768
        if self.aim == ENCODER_AIM.BALANCE:
            self._classifier_model_class = DistilBertForSequenceClassification
            self._embeddings_model_class = DistilBertModel
            self._tokenizer_class = DistilBertTokenizer
            self._pretrained_model_name = 'distilbert-base-uncased'
            self._model_max_len = 768
        if self.aim == ENCODER_AIM.ACCURACY:
            self._classifier_model_class = DistilBertForSequenceClassification
            self._embeddings_model_class = DistilBertModel
            self._tokenizer_class = DistilBertTokenizer
            self._pretrained_model_name = 'distilbert-base-uncased'
            self._model_max_len = 768

        device_str = "cuda" if CONFIG.USE_CUDA else "cpu"
        if CONFIG.USE_DEVICE is not None:
            device_str = CONFIG.USE_DEVICE
        self.device = torch.device(device_str)

    def _train_callback(self, error, real_buff, predicted_buff):
        logging.info(f'{self.name} reached a loss of {error} while training!')

    @staticmethod
    def categorical_train_function(model, data, gym, test=False):
        input, real = data
        input = input.to(gym.device)
        labels = torch.tensor([torch.argmax(x) for x in real]).to(gym.device)

        outputs = gym.model(input, labels=labels)
        loss, logits = outputs[:2]

        if not test:
            loss.backward()
            gym.optimizer.step()
            gym.scheduler.step()
            gym.optimizer.zero_grad()
        return loss

    @staticmethod
    def numerical_train_function(model, data, gym, backbone, test=False):
        input, real = data

        input = input.to(gym.device)
        real = real.to(gym.device)

        embeddings = backbone(input)[0][:, 0, :]
        outputs = gym.model(embeddings)

        loss = gym.loss_criterion(outputs, real)

        if not test:
            loss.backward()
            gym.optimizer.step()
            gym.scheduler.step()
            gym.optimizer.zero_grad()

        return loss

    def prepare_encoder(self, priming_data, training_data=None):
        if self._prepared:
            raise Exception(
                'You can only call "prepare_encoder" once for a given encoder.'
            )

        priming_data = [x if x is not None else '' for x in priming_data]

        self._max_len = min(max([len(x) for x in priming_data]),
                            self._model_max_len)
        self._tokenizer = self._tokenizer_class.from_pretrained(
            self._pretrained_model_name)
        self._pad_id = self._tokenizer.convert_tokens_to_ids(
            [self._tokenizer.pad_token])[0]
        # @TODO: Support multiple targets if they are all categorical, or train for the categorical target if it's a mix (maybe?)
        # @TODO: Attach a language modeling head and/or use GPT2 and/or provide outputs better suited to a LM head (which will be the mixer) if the output is text

        if (training_data is not None and 'targets' in training_data
                and len(training_data['targets']) == 1
                and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
                and CONFIG.TRAIN_TO_PREDICT_TARGET):
            self._model_type = 'classifier'
            self._model = self._classifier_model_class.from_pretrained(
                self._pretrained_model_name,
                num_labels=len(
                    set(training_data['targets'][0]['unencoded_output'])) +
                1).to(self.device)
            batch_size = 10

            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in self._model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.000001
            }, {
                'params': [
                    p for n, p in self._model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=10,
                num_training_steps=len(priming_data) * 15 / 20)

            gym = Gym(model=self._model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_criterion=None,
                      device=self.device,
                      name=self.name)

            input = [
                self._tokenizer.encode(x[:self._max_len],
                                       add_special_tokens=True)
                for x in priming_data
            ]
            tokenized_max_len = max([len(x) for x in input])
            input = torch.tensor([
                x + [self._pad_id] * (tokenized_max_len - len(x))
                for x in input
            ])

            real = training_data['targets'][0]['encoded_output']

            merged_data = list(zip(input, real))

            train_data_loader = DataLoader(
                merged_data[:int(len(merged_data) * 9 / 10)],
                batch_size=batch_size,
                shuffle=True)
            test_data_loader = DataLoader(
                merged_data[int(len(merged_data) * 9 / 10):],
                batch_size=batch_size,
                shuffle=True)

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.categorical_train_function,
                                          test=False),
                custom_test_func=partial(self.categorical_train_function,
                                         test=True))

            self._model = best_model.to(self.device)

        # Guard against training_data being None or missing 'targets' before inspecting it.
        elif training_data is not None and 'targets' in training_data and all([
                x['output_type'] == COLUMN_DATA_TYPES.NUMERIC
                or x['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
                for x in training_data['targets']
        ]) and CONFIG.TRAIN_TO_PREDICT_TARGET:
            self.desired_error = 0.01
            self._model_type = 'generic_target_predictor'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)
            batch_size = 10

            self._head = DefaultNet(ds=None,
                                    dynamic_parameters={},
                                    shape=funnel(
                                        768,
                                        sum([
                                            len(x['encoded_output'][0])
                                            for x in training_data['targets']
                                        ]),
                                        depth=5),
                                    selfaware=False)

            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in self._head.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.000001
            }, {
                'params': [
                    p for n, p in self._head.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                          lr=5e-5,
                                          eps=1e-8)
            #optimizer = Ranger(self._head.parameters(),lr=5e-5)

            # num_training_steps is kind of an estimation
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=10,
                num_training_steps=len(priming_data) * 15 / 20)

            criterion = torch.nn.MSELoss()

            gym = Gym(model=self._head,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_criterion=criterion,
                      device=self.device,
                      name=self.name)

            input = [
                self._tokenizer.encode(x[:self._max_len],
                                       add_special_tokens=True)
                for x in priming_data
            ]
            tokenized_max_len = max([len(x) for x in input])
            input = torch.tensor([
                x + [self._pad_id] * (tokenized_max_len - len(x))
                for x in input
            ])

            # Build one target row per sample by concatenating the encoded outputs of
            # every target column (a comprehension avoids the shared-list pitfall of [[]] * n).
            real = [[] for _ in range(len(training_data['targets'][0]['encoded_output']))]
            for i in range(len(real)):
                for target in training_data['targets']:
                    real[i] = real[i] + target['encoded_output'][i]
            real = torch.tensor(real)

            merged_data = list(zip(input, real))

            train_data_loader = DataLoader(
                merged_data[:int(len(merged_data) * 9 / 10)],
                batch_size=batch_size,
                shuffle=True)
            test_data_loader = DataLoader(
                merged_data[int(len(merged_data) * 9 / 10):],
                batch_size=batch_size,
                shuffle=True)

            self._model.eval()

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.numerical_train_function,
                                          backbone=self._model,
                                          test=False),
                custom_test_func=partial(self.numerical_train_function,
                                         backbone=self._model,
                                         test=True))

            self._head = best_model.to(self.device)

        else:
            self._model_type = 'embeddings_generator'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)

        self._prepared = True

    def encode(self, column_data):
        encoded_representation = []
        self._model.eval()
        with torch.no_grad():
            for text in column_data:
                if text is None:
                    text = ''
                input = torch.tensor(
                    self._tokenizer.encode(text[:self._max_len],
                                           add_special_tokens=True)).to(
                                               self.device).unsqueeze(0)

                if self._model_type == 'generic_target_predictor':
                    embeddings = self._model(input)
                    output = self._head(embeddings[0][:, 0, :])
                    encoded_representation.append(output.tolist()[0])

                elif self._model_type == 'classifier':
                    output = self._model(input)
                    logits = output[0]
                    predicted_targets = logits[0].tolist()
                    encoded_representation.append(predicted_targets)

                else:
                    output = self._model(input)
                    embeddings = output[0][:, 0, :].cpu().numpy()[0]
                    encoded_representation.append(embeddings)

        return self._pytorch_wrapper(encoded_representation)

    def decode(self, encoded_values_tensor, max_length=100):
        # When text is an output... a bit trickier to handle this case, thinking on it
        pass
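
prepare_encoder above relies on a common fine-tuning idiom from the transformers ecosystem: bias and LayerNorm weights are excluded from weight decay while every other parameter gets a tiny decay, and the grouped parameters are handed to AdamW (the warmup scheduler in the original comes from transformers.get_linear_schedule_with_warmup). Below is a self-contained sketch of that grouping; the toy module, its sizes and the hyperparameters are illustrative assumptions, not values from this encoder.

import torch

class TinyBlock(torch.nn.Module):
    """Toy module whose parameter names mimic the transformers naming convention."""
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(16, 16)
        self.LayerNorm = torch.nn.LayerNorm(16)

    def forward(self, x):
        return self.LayerNorm(self.dense(x))

model = TinyBlock()

# parameters whose names contain these substrings get no weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.000001},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)

for group in optimizer.param_groups:
    print(len(group['params']), 'params with weight_decay =', group['weight_decay'])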
Example #4
class CategoricalAutoEncoder:

    def __init__(self, is_target=False):
        self._pytorch_wrapper = torch.FloatTensor
        self._prepared = False
        self.name = 'Categorical Autoencoder'
        self.net = None
        self.encoder = None
        self.decoder = None
        self.onehot_encoder = OneHotEncoder()
        self.desired_error = 0.01
        self.use_autoencoder = None
        if is_target:
            self.max_encoded_length = None
        else:
            self.max_encoded_length = 100
        self.max_training_time = CONFIG.MAX_ENCODER_TRAINING_TIME

    def _train_callback(self, error, real_buff, predicted_buff):
        logging.info(f'{self.name} reached a loss of {error} while training!')

    def _encoder_targets(self, data):
        oh_encoded_categories = self.onehot_encoder.encode(data)
        target = oh_encoded_categories.cpu().numpy()
        target_indexes = np.where(target>0)[1]
        targets_c = torch.LongTensor(target_indexes)
        labels = targets_c.to(self.net.device)
        return labels

    def prepare_encoder(self, priming_data):
        random.seed(len(priming_data))

        if self._prepared:
            raise Exception('You can only call "prepare_encoder" once for a given encoder.')

        self.onehot_encoder.prepare_encoder(priming_data)

        input_len = self.onehot_encoder._lang.n_words
        self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

        if self.use_autoencoder:
            logging.info('Preparing a categorical autoencoder, this might take a while')

            embeddings_layer_len = self.max_encoded_length

            self.net = DefaultNet(ds=None, dynamic_parameters={},
                                  shape=[input_len, embeddings_layer_len, input_len],
                                  selfaware=False)

            criterion = torch.nn.CrossEntropyLoss()
            optimizer = Ranger(self.net.parameters())

            gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion,
                      device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode,
                      output_encoder=self._encoder_targets)

            # guard against a zero batch size when priming_data has fewer than 50 rows
            batch_size = max(1, min(200, int(len(priming_data) / 50)))

            train_data_loader = DataLoader(list(zip(priming_data, priming_data)), batch_size=batch_size, shuffle=True)
            test_data_loader = None

            best_model, error, training_time = gym.fit(train_data_loader,
                                                       test_data_loader,
                                                       desired_error=self.desired_error,
                                                       max_time=self.max_training_time,
                                                       callback=self._train_callback,
                                                       eval_every_x_epochs=1,
                                                       max_unimproving_models=5)

            self.net = best_model.to(self.net.device)

            modules = [module for module in self.net.modules() if type(module) != torch.nn.Sequential and type(module) != DefaultNet]
            self.encoder = torch.nn.Sequential(*modules[0:2])
            self.decoder = torch.nn.Sequential(*modules[2:3])
            logging.info('Categorical autoencoder ready')

        self._prepared = True

    def encode(self, column_data):
        oh_encoded_tensor = self.onehot_encoder.encode(column_data)
        if not self.use_autoencoder:
            return oh_encoded_tensor
        else:
            oh_encoded_tensor = oh_encoded_tensor.to(self.net.device)
            embeddings = self.encoder(oh_encoded_tensor)
            return embeddings


    def decode(self, encoded_data):
        if not self.use_autoencoder:
            return self.onehot_encoder.decode(encoded_data)
        else:
            oh_encoded_tensor = self.decoder(encoded_data)
            oh_encoded_tensor = oh_encoded_tensor.to('cpu')
            decoded_categories = self.onehot_encoder.decode(oh_encoded_tensor)
            return decoded_categories
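
_encoder_targets above turns the one-hot rows produced by the one-hot encoder into class indices, which is the label format torch.nn.CrossEntropyLoss expects. A minimal, runnable sketch of that conversion follows; the four-category batch and the random logits are made up for illustration.

import numpy as np
import torch

oh_encoded_categories = torch.tensor([
    [0., 1., 0., 0.],   # category 1
    [0., 0., 0., 1.],   # category 3
    [1., 0., 0., 0.],   # category 0
])

target = oh_encoded_categories.cpu().numpy()
target_indexes = np.where(target > 0)[1]   # column index of the hot entry in each row
labels = torch.LongTensor(target_indexes)  # tensor([1, 3, 0])

logits = torch.randn(3, 4)                 # stand-in for the autoencoder's raw output
loss = torch.nn.CrossEntropyLoss()(logits, labels)
print(labels, loss.item())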
Example #5
class CategoricalAutoEncoder(BaseEncoder):
    def __init__(self, is_target=False, max_encoded_length=100):
        super().__init__(is_target)
        self._prepared = False
        self.name = 'Categorical Autoencoder'
        self.net = None
        self.encoder = None
        self.decoder = None
        self.predict_proba = None  # whether to return the belief distribution as well
        self.onehot_encoder = OneHotEncoder(is_target=self.is_target)
        self.desired_error = 0.01
        self.use_autoencoder = None
        if self.is_target:
            self.max_encoded_length = None
        else:
            self.max_encoded_length = max_encoded_length
        self.max_training_time = 7200

    def _train_callback(self, error, real_buff, predicted_buff):
        log.info(f'{self.name} reached a loss of {error} while training!')

    def _encoder_targets(self, data):
        oh_encoded_categories = self.onehot_encoder.encode(data)
        target = oh_encoded_categories.cpu().numpy()
        target_indexes = np.where(target > 0)[1]
        targets_c = torch.LongTensor(target_indexes)
        labels = targets_c.to(self.net.device)
        return labels

    def to(self, device, available_devices):
        if self.use_autoencoder:
            self.net = self.net.to(device, available_devices)
        return self

    def prepare(self, priming_data):
        random.seed(len(priming_data))

        if self._prepared:
            raise Exception(
                'You can only call "prepare" once for a given encoder.')

        self.onehot_encoder.prepare(priming_data)

        input_len = self.onehot_encoder._lang.n_words
        self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

        if self.use_autoencoder:
            log.info(
                'Preparing a categorical autoencoder, this might take a while')

            embeddings_layer_len = self.max_encoded_length

            self.net = DefaultNet(
                dynamic_parameters={},
                shape=[input_len, embeddings_layer_len, input_len])

            criterion = torch.nn.CrossEntropyLoss()
            optimizer = Ranger(self.net.parameters())

            gym = Gym(model=self.net,
                      optimizer=optimizer,
                      scheduler=None,
                      loss_criterion=criterion,
                      device=self.net.device,
                      name=self.name,
                      input_encoder=self.onehot_encoder.encode,
                      output_encoder=self._encoder_targets)

            # guard against a zero batch size when priming_data has fewer than 50 rows
            batch_size = max(1, min(200, int(len(priming_data) / 50)))

            priming_data_str = [str(x) for x in priming_data]
            train_data_loader = DataLoader(list(
                zip(priming_data_str, priming_data_str)),
                                           batch_size=batch_size,
                                           shuffle=True)

            test_data_loader = None

            best_model, error, training_time = gym.fit(
                train_data_loader,
                test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=5)

            self.net = best_model.to(self.net.device)

            modules = [
                module for module in self.net.modules()
                if type(module) != torch.nn.Sequential
                and type(module) != DefaultNet
            ]
            self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
            self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
            log.info('Categorical autoencoder ready')

        self._prepared = True

    def encode(self, column_data):
        if not column_data:
            column_data = ['']
        oh_encoded_tensor = self.onehot_encoder.encode(column_data)
        if not self.use_autoencoder:
            return oh_encoded_tensor
        else:
            with torch.no_grad():
                oh_encoded_tensor = oh_encoded_tensor.to(self.net.device)
                embeddings = self.encoder(oh_encoded_tensor)
                return embeddings

    def decode(self, encoded_data):
        self.onehot_encoder.predict_proba = self.predict_proba
        if not self.use_autoencoder:
            return self.onehot_encoder.decode(encoded_data)
        else:
            with torch.no_grad():
                oh_encoded_tensor = self.decoder(encoded_data)
                oh_encoded_tensor = oh_encoded_tensor.to('cpu')
                return self.onehot_encoder.decode(oh_encoded_tensor)
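
Every example above delegates training to Gym.fit, whose implementation is not shown here. As a rough, hypothetical illustration of the kind of loop its arguments (desired_error, max_time, eval_every_x_epochs, max_unimproving_models) suggest, here is a generic early-stopping sketch in plain torch: keep the best model seen so far and stop when the error target, the patience limit, or the time budget is hit. This is a stand-in for illustration only, not lightwood's actual Gym code, and it evaluates on the training batches rather than a held-out loader.

import copy
import time
import torch

def fit_with_early_stopping(model, optimizer, criterion, train_batches,
                            desired_error=0.01, max_time=7200,
                            max_unimproving_models=5):
    started = time.time()
    best_error = float('inf')
    best_model = copy.deepcopy(model)
    unimproving = 0
    while True:
        epoch_error = 0.0
        for inputs, targets in train_batches:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            epoch_error += loss.item()
        epoch_error /= max(1, len(train_batches))

        # keep the best model seen so far; count epochs without improvement
        if epoch_error < best_error:
            best_error, best_model, unimproving = epoch_error, copy.deepcopy(model), 0
        else:
            unimproving += 1

        if (best_error <= desired_error
                or unimproving >= max_unimproving_models
                or time.time() - started > max_time):
            return best_model, best_error, time.time() - started

# tiny demo: fit y = 2x with a single repeated batch
model = torch.nn.Linear(1, 1)
xs = torch.randn(64, 1)
best_model, error, training_time = fit_with_early_stopping(
    model, torch.optim.Adam(model.parameters(), lr=0.05),
    torch.nn.MSELoss(), [(xs, 2 * xs)])
print(round(error, 4))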