Example #1
    def prepare_encoder(self, priming_data):
        random.seed(len(priming_data))

        if self._prepared:
            raise Exception('You can only call "prepare_encoder" once for a given encoder.')

        self.onehot_encoder.prepare_encoder(priming_data)

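        # Fall back to a bottleneck autoencoder only when the one-hot width
        # would exceed the configured maximum encoded length.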
        input_len = self.onehot_encoder._lang.n_words
        self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

        if self.use_autoencoder:
            logging.info('Preparing a categorical autoencoder, this might take a while')

            embeddings_layer_len = self.max_encoded_length

            self.net = DefaultNet(dynamic_parameters={},
                                  shape=[input_len, embeddings_layer_len, input_len],
                                  selfaware=False)

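            # Train the network to reconstruct the input category from its
            # one-hot encoding; the narrow middle layer becomes the embedding.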
            criterion = torch.nn.CrossEntropyLoss()
            optimizer = Ranger(self.net.parameters())

            gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion,
                      device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode,
                      output_encoder=self._encoder_targets)

            batch_size = max(1, min(200, len(priming_data) // 50))  # guard against a zero batch size on small datasets

            priming_data_str = [str(x) for x in priming_data]
            train_data_loader = DataLoader(list(zip(priming_data_str, priming_data_str)), batch_size=batch_size, shuffle=True)

            test_data_loader = None

            best_model, error, training_time = gym.fit(train_data_loader,
                                                       test_data_loader,
                                                       desired_error=self.desired_error,
                                                       max_time=self.max_training_time,
                                                       callback=self._train_callback,
                                                       eval_every_x_epochs=1,
                                                       max_unimproving_models=5)

            self.net = best_model.to(self.net.device)

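            # Split the trained network at the bottleneck: the first two leaf
            # modules map one-hot input to the embedding, the next maps it back.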
            modules = [module for module in self.net.modules()
                       if type(module) not in (torch.nn.Sequential, DefaultNet)]
            self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
            self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
            logging.info('Categorical autoencoder ready')

        self._prepared = True
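A minimal usage sketch for the encoder above, assuming Lightwood's CategoricalAutoEncoder class (where a method of this shape lives) and its encode interface; the import path and call pattern are assumptions, not shown in the snippet:

# Hypothetical usage; the import path and the encode() call are assumptions
# based on Lightwood's encoder API, not confirmed by the snippet above.
from lightwood.encoders.categorical.autoencoder import CategoricalAutoEncoder

priming_data = ['cat_%d' % i for i in range(10000)]   # many distinct categories
encoder = CategoricalAutoEncoder()
encoder.prepare_encoder(priming_data)   # assuming max_encoded_length is set, the one-hot width exceeds it, so the autoencoder is trained
embeddings = encoder.encode(priming_data[:2])   # compact embeddings instead of 10000-wide one-hot rows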
Example #2
    def prepare_encoder(self, priming_data, training_data=None):
        if self._prepared:
            raise Exception(
                'You can only call "prepare_encoder" once for a given encoder.'
            )

        priming_data = [x if x is not None else '' for x in priming_data]

        self._max_len = min(max([len(x) for x in priming_data]),
                            self._model_max_len)
        self._tokenizer = self._tokenizer_class.from_pretrained(
            self._pretrained_model_name)
        self._pad_id = self._tokenizer.convert_tokens_to_ids(
            [self._tokenizer.pad_token])[0]
        # @TODO: Support multiple targets if they are all categorical, or train for the categorical target if it's a mix (maybe?)
        # @TODO: Attach a language modeling head and/or use GPT2 and/or provide outputs better suited to a LM head (which will be the mixer) if the output is text

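        # Branch 1: a single categorical target -> fine-tune the whole
        # transformer as a sequence classifier for that target.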
        if (training_data is not None and 'targets' in training_data
                and len(training_data['targets']) == 1
                and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
                and CONFIG.TRAIN_TO_PREDICT_TARGET):
            self._model_type = 'classifier'
            self._model = self._classifier_model_class.from_pretrained(
                self._pretrained_model_name,
                num_labels=len(set(training_data['targets'][0]['unencoded_output'])) + 1
            ).to(self.device)
            batch_size = 10

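            # Standard transformer fine-tuning recipe: apply weight decay to
            # everything except biases and LayerNorm weights.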
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [p for n, p in self._model.named_parameters()
                               if not any(nd in n for nd in no_decay)],
                    'weight_decay': 0.000001
                },
                {
                    'params': [p for n, p in self._model.named_parameters()
                               if any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0
                }
            ]

            optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=10,
                num_training_steps=int(len(priming_data) * 15 / 20))  # rough estimate of total steps

            gym = Gym(model=self._model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_criterion=None,
                      device=self.device,
                      name=self.name)

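            # Tokenize each (truncated) text and right-pad every sequence to
            # the longest tokenized length.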
            inputs = [
                self._tokenizer.encode(x[:self._max_len], add_special_tokens=True)
                for x in priming_data
            ]
            tokenized_max_len = max(len(x) for x in inputs)
            inputs = torch.tensor([
                x + [self._pad_id] * (tokenized_max_len - len(x))
                for x in inputs
            ])

            real = training_data['targets'][0]['encoded_output']

            merged_data = list(zip(inputs, real))

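            # Hold out the last 10% of the data for evaluation.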
            train_size = int(len(merged_data) * 9 / 10)
            train_data_loader = DataLoader(merged_data[:train_size],
                                           batch_size=batch_size,
                                           shuffle=True)
            test_data_loader = DataLoader(merged_data[train_size:],
                                          batch_size=batch_size,
                                          shuffle=True)

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.categorical_train_function,
                                          test=False),
                custom_test_func=partial(self.categorical_train_function,
                                         test=True))

            self._model = best_model.to(self.device)

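        # Branch 2: every target is numeric or categorical -> keep the
        # transformer frozen and train a small head on top of its embeddings.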
        elif (training_data is not None and 'targets' in training_data
                and all(x['output_type'] in (COLUMN_DATA_TYPES.NUMERIC, COLUMN_DATA_TYPES.CATEGORICAL)
                        for x in training_data['targets'])
                and CONFIG.TRAIN_TO_PREDICT_TARGET):
            self.desired_error = 0.01
            self._model_type = 'generic_target_predictor'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)
            batch_size = 10

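            # The head funnels from the transformer's 768-dim output down to
            # the concatenated width of all encoded targets.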
            self._head = DefaultNet(
                ds=None,
                dynamic_parameters={},
                shape=funnel(
                    768,
                    sum(len(x['encoded_output'][0]) for x in training_data['targets']),
                    depth=5),
                selfaware=False)

            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [p for n, p in self._head.named_parameters()
                               if not any(nd in n for nd in no_decay)],
                    'weight_decay': 0.000001
                },
                {
                    'params': [p for n, p in self._head.named_parameters()
                               if any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0
                }
            ]

            optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                          lr=5e-5,
                                          eps=1e-8)
            # optimizer = Ranger(self._head.parameters(), lr=5e-5)

            # num_training_steps is a rough estimate
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=10,
                num_training_steps=int(len(priming_data) * 15 / 20))

            criterion = torch.nn.MSELoss()

            gym = Gym(model=self._head,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_criterion=criterion,
                      device=self.device,
                      name=self.name)

            inputs = [
                self._tokenizer.encode(x[:self._max_len], add_special_tokens=True)
                for x in priming_data
            ]
            tokenized_max_len = max(len(x) for x in inputs)
            inputs = torch.tensor([
                x + [self._pad_id] * (tokenized_max_len - len(x))
                for x in inputs
            ])

            # Concatenate every target's encoded output row by row; use a list
            # comprehension so each row starts as its own (non-shared) list.
            real = [[] for _ in range(len(training_data['targets'][0]['encoded_output']))]
            for i in range(len(real)):
                for target in training_data['targets']:
                    real[i] = real[i] + target['encoded_output'][i]
            real = torch.tensor(real)

            merged_data = list(zip(inputs, real))

            train_size = int(len(merged_data) * 9 / 10)
            train_data_loader = DataLoader(merged_data[:train_size],
                                           batch_size=batch_size,
                                           shuffle=True)
            test_data_loader = DataLoader(merged_data[train_size:],
                                          batch_size=batch_size,
                                          shuffle=True)

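            # The backbone stays in eval mode; only the head receives gradients.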
            self._model.eval()

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.numerical_train_function,
                                          backbone=self._model,
                                          test=False),
                custom_test_func=partial(self.numerical_train_function,
                                         backbone=self._model,
                                         test=True))

            self._head = best_model.to(self.device)

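        # Branch 3: no suitable target to train against -> use the pretrained
        # transformer as-is to generate embeddings.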
        else:
            self._model_type = 'embeddings_generator'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)

        self._prepared = True
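A matching sketch for this text encoder; the class name DistilBertEncoder and the import path are assumptions based on Lightwood's text encoders, not shown in the snippet:

# Hypothetical usage; class name and import path are assumptions, not
# confirmed by the snippet above.
from lightwood.encoders.text.distilbert import DistilBertEncoder

encoder = DistilBertEncoder()
# With no training_data the method skips both fine-tuning branches and
# falls through to the plain embeddings-generator branch.
encoder.prepare_encoder(['first sentence', 'second sentence'])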