Example #1
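    # Prepare this categorical encoder: when the one-hot vocabulary is larger than
    # max_encoded_length, a small autoencoder is trained to compress it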
    def prepare_encoder(self, priming_data):
        random.seed(len(priming_data))

        if self._prepared:
            raise Exception('You can only call "prepare_encoder" once for a given encoder.')

        self.onehot_encoder.prepare_encoder(priming_data)

        input_len = self.onehot_encoder._lang.n_words
        self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

        if self.use_autoencoder:
            logging.info('Preparing a categorical autoencoder, this might take a while')

            embeddings_layer_len = self.max_encoded_length

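            # Bottleneck shape: one-hot input -> embedding of size max_encoded_length -> reconstructed input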
            self.net = DefaultNet(dynamic_parameters={},
                                  shape=[input_len, embeddings_layer_len, input_len],
                                  selfaware=False)

            criterion = torch.nn.CrossEntropyLoss()
            optimizer = Ranger(self.net.parameters())

            gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion,
                      device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode,
                      output_encoder=self._encoder_targets)

            # Keep batch_size between 1 and 200 so the DataLoader never receives a batch size of 0
            batch_size = max(1, min(200, int(len(priming_data) / 50)))

            priming_data_str = [str(x) for x in priming_data]
            train_data_loader = DataLoader(list(zip(priming_data_str, priming_data_str)), batch_size=batch_size, shuffle=True)

            test_data_loader = None

            best_model, error, training_time = gym.fit(train_data_loader,
                                                       test_data_loader,
                                                       desired_error=self.desired_error,
                                                       max_time=self.max_training_time,
                                                       callback=self._train_callback,
                                                       eval_every_x_epochs=1,
                                                       max_unimproving_models=5)

            self.net = best_model.to(self.net.device)

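            # Split the trained network: the first two modules map the one-hot input to the embedding
            # (encoder), the remaining module maps the embedding back to the input space (decoder)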
            modules = [module for module in self.net.modules()
                       if type(module) not in (torch.nn.Sequential, DefaultNet)]
            self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
            self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
            logging.info('Categorical autoencoder ready')

        self._prepared = True
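
A minimal, self-contained sketch of how the two halves produced above could be used, assuming one-hot inputs; the layer types (Linear/SELU), the sizes and the variable names below are illustrative stand-ins, not the layers DefaultNet actually builds.

import torch

input_len = 3000        # size of the one-hot vocabulary (assumed)
embedding_len = 100     # plays the role of max_encoded_length (assumed)

# Stand-ins for the encoder/decoder halves sliced out of the trained network
encoder = torch.nn.Sequential(torch.nn.Linear(input_len, embedding_len), torch.nn.SELU())
decoder = torch.nn.Sequential(torch.nn.Linear(embedding_len, input_len))

one_hot = torch.zeros(1, input_len)
one_hot[0, 42] = 1.0                       # a single category as a one-hot row

embedding = encoder(one_hot)               # fixed-length vector the encoder would emit
logits = decoder(embedding)                # only needed when decoding back to a category
recovered = torch.argmax(logits, dim=1)    # meaningful only once the autoencoder is trained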
Example #2
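    # Prepare this text encoder: fine-tune the pretrained transformer as a classifier for a single
    # categorical target, train a small regression head when all targets are numeric/categorical,
    # otherwise just load the pretrained embeddings model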
    def prepare_encoder(self, priming_data, training_data=None):
        if self._prepared:
            raise Exception(
                'You can only call "prepare_encoder" once for a given encoder.'
            )

        priming_data = [x if x is not None else '' for x in priming_data]

        self._max_len = min(max([len(x) for x in priming_data]),
                            self._model_max_len)
        self._tokenizer = self._tokenizer_class.from_pretrained(
            self._pretrained_model_name)
        self._pad_id = self._tokenizer.convert_tokens_to_ids(
            [self._tokenizer.pad_token])[0]
        # @TODO: Support multiple targets if they are all categorical, or train for the categorical target if it's a mix (maybe?)
        # @TODO: Attach a language modeling head and/or use GPT2 and/or provide outputs better suited to an LM head (which will be the mixer) if the output is text

        if (training_data is not None and 'targets' in training_data
                and len(training_data['targets']) == 1
                and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
                and CONFIG.TRAIN_TO_PREDICT_TARGET):
            self._model_type = 'classifier'
            self._model = self._classifier_model_class.from_pretrained(
                self._pretrained_model_name,
                num_labels=len(
                    set(training_data['targets'][0]['unencoded_output'])) +
                1).to(self.device)
            batch_size = 10

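            # Common transformer fine-tuning recipe: no weight decay on biases and LayerNorm weights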
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [p for n, p in self._model.named_parameters()
                               if not any(nd in n for nd in no_decay)],
                    'weight_decay': 0.000001
                },
                {
                    'params': [p for n, p in self._model.named_parameters()
                               if any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0
                }
            ]

            optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=10,
                num_training_steps=len(priming_data) * 15 / 20)

            gym = Gym(model=self._model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_criterion=None,
                      device=self.device,
                      name=self.name)

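            # Tokenize each row (truncated to _max_len) and right-pad with the pad token id so that
            # every row has the same tokenized length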
            input = [
                self._tokenizer.encode(x[:self._max_len],
                                       add_special_tokens=True)
                for x in priming_data
            ]
            tokenized_max_len = max([len(x) for x in input])
            input = torch.tensor([
                x + [self._pad_id] * (tokenized_max_len - len(x))
                for x in input
            ])

            real = training_data['targets'][0]['encoded_output']

            merged_data = list(zip(input, real))

            train_data_loader = DataLoader(
                merged_data[:int(len(merged_data) * 9 / 10)],
                batch_size=batch_size,
                shuffle=True)
            test_data_loader = DataLoader(
                merged_data[int(len(merged_data) * 9 / 10):],
                batch_size=batch_size,
                shuffle=True)

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.categorical_train_function,
                                          test=False),
                custom_test_func=partial(self.categorical_train_function,
                                         test=True))

            self._model = best_model.to(self.device)

        elif (training_data is not None and 'targets' in training_data
                and all(x['output_type'] in (COLUMN_DATA_TYPES.NUMERIC, COLUMN_DATA_TYPES.CATEGORICAL)
                        for x in training_data['targets'])
                and CONFIG.TRAIN_TO_PREDICT_TARGET):
            self.desired_error = 0.01
            self._model_type = 'generic_target_predictor'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)
            batch_size = 10

            self._head = DefaultNet(ds=None,
                                    dynamic_parameters={},
                                    shape=funnel(
                                        768,
                                        sum([
                                            len(x['encoded_output'][0])
                                            for x in training_data['targets']
                                        ]),
                                        depth=5),
                                    selfaware=False)

            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {
                    'params': [p for n, p in self._head.named_parameters()
                               if not any(nd in n for nd in no_decay)],
                    'weight_decay': 0.000001
                },
                {
                    'params': [p for n, p in self._head.named_parameters()
                               if any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0
                }
            ]

            optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                          lr=5e-5,
                                          eps=1e-8)
            #optimizer = Ranger(self._head.parameters(),lr=5e-5)

            # num_training_steps is only a rough estimate
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=10,
                num_training_steps=len(priming_data) * 15 / 20)

            criterion = torch.nn.MSELoss()

            gym = Gym(model=self._head,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      loss_criterion=criterion,
                      device=self.device,
                      name=self.name)

            input = [
                self._tokenizer.encode(x[:self._max_len],
                                       add_special_tokens=True)
                for x in priming_data
            ]
            tokenized_max_len = max([len(x) for x in input])
            input = torch.tensor([
                x + [self._pad_id] * (tokenized_max_len - len(x))
                for x in input
            ])

            # Build one flat target vector per row by concatenating every target's encoded output
            real = [[] for _ in range(len(training_data['targets'][0]['encoded_output']))]
            for i in range(len(real)):
                for target in training_data['targets']:
                    real[i] = real[i] + target['encoded_output'][i]
            real = torch.tensor(real)

            merged_data = list(zip(input, real))

            train_data_loader = DataLoader(
                merged_data[:int(len(merged_data) * 9 / 10)],
                batch_size=batch_size,
                shuffle=True)
            test_data_loader = DataLoader(
                merged_data[int(len(merged_data) * 9 / 10):],
                batch_size=batch_size,
                shuffle=True)

            self._model.eval()

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.numerical_train_function,
                                          backbone=self._model,
                                          test=False),
                custom_test_func=partial(self.numerical_train_function,
                                         backbone=self._model,
                                         test=True))

            self._head = best_model.to(self.device)

        else:
            self._model_type = 'embeddings_generator'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)

        self._prepared = True
Example #3
    def fit(self, train_ds, test_ds, max_time, eval_every_x_epochs, callback):
        """
        :param ds:
        :return:
        """

        self.fit_data_source(train_ds)
        if self.is_categorical_output:
            # The WeightedRandomSampler samples "randomly" but can give some rows a higher chance of being drawn.
            # Each row's sampling weight is looked up in the output_weights map (otherwise used to bias the loss
            # function) based on that row's target value; see the sketch after this listing.
            if train_ds.output_weights is not None and train_ds.output_weights is not False and CONFIG.OVERSAMPLE:
                weights = []
                for row in train_ds:
                    _, out = row
                    # @Note: This assumes one-hot encoding for the encoded_value
                    weights.append(train_ds.output_weights[torch.argmax(out).item()])

                self._nonpersistent['sampler'] = torch.utils.data.WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)

        self.net = self.nn_class(train_ds, self.dynamic_parameters)
        self.net = self.net.train()

        if self.batch_size < self.net.available_devices:
            self.batch_size = self.net.available_devices

        self.awareness_criterion = torch.nn.MSELoss()

        if self.criterion is None:
            if self.is_categorical_output:
                if train_ds.output_weights is not None and train_ds.output_weights is not False and not CONFIG.OVERSAMPLE:
                    output_weights = torch.Tensor(train_ds.output_weights).to(self.net.device)
                else:
                    output_weights = None
                self.criterion = torch.nn.CrossEntropyLoss(weight=output_weights)
            else:
                self.criterion = torch.nn.MSELoss()

        self.optimizer_class = Ranger
        if self.optimizer_args is None:
            self.optimizer_args = {}

        if 'beta1' in self.dynamic_parameters:
            self.optimizer_args['betas'] = (self.dynamic_parameters['beta1'], 0.999)

        for optimizer_arg_name in ['lr', 'k', 'N_sma_threshold']:
            if optimizer_arg_name in self.dynamic_parameters:
                self.optimizer_args[optimizer_arg_name] = self.dynamic_parameters[optimizer_arg_name]

        self.optimizer = self.optimizer_class(self.net.parameters(), **self.optimizer_args)
        total_epochs = self.epochs


        if self._nonpersistent['sampler'] is None:
            train_data_loader = DataLoader(train_ds, batch_size=self.batch_size, shuffle=True)
        else:
            train_data_loader = DataLoader(train_ds, batch_size=self.batch_size, sampler=self._nonpersistent['sampler'])

        test_data_loader = DataLoader(test_ds, batch_size=self.batch_size, shuffle=True, num_workers=0)

        gym = Gym(model=self.net, optimizer=self.optimizer, scheduler=None, loss_criterion=self.criterion,
                  device=self.net.device, name=self.name)

        best_model, error, training_time = gym.fit(train_data_loader=train_data_loader,
                                                   test_data_loader=test_data_loader,
                                                   desired_error=0,
                                                   max_time=max_time,
                                                   callback=callback,
                                                   eval_every_x_epochs=eval_every_x_epochs,
                                                   max_unimproving_models=10,
                                                   custom_train_func=self._train_loop,
                                                   custom_test_func=self._test_loop)

        '''
        total_iterations = 0
        for epoch in range(total_epochs):  # loop over the dataset multiple times
            running_loss = 0.0
            error = 0
            for i, data in enumerate(data_loader, 0):
                total_iterations += 1
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = data

                labels = labels.to(self.net.device)
                inputs = inputs.to(self.net.device)

                # zero the parameter gradients
                self.optimizer.zero_grad()

                # forward + backward + optimize
                # outputs = self.net(inputs)
                if CONFIG.SELFAWARE:
                    outputs, awareness = self.net(inputs)
                else:
                    outputs = self.net(inputs)

                if self.is_categorical_output:
                    target = labels.cpu().numpy()
                    target_indexes = np.where(target>0)[1]
                    targets_c = torch.LongTensor(target_indexes)
                    cat_labels = targets_c.to(self.net.device)
                    loss = self.criterion(outputs, cat_labels)
                else:
                    loss = self.criterion(outputs, labels)

                if CONFIG.SELFAWARE:
                    real_loss = torch.abs(labels - outputs) # error relative to the target
                    real_loss = torch.Tensor(real_loss.tolist()) # disconnect from the graph (check whether this is necessary)
                    real_loss = real_loss.to(self.net.device)

                    awareness_loss = self.awareness_criterion(awareness, real_loss)

                    #print(awareness_loss.item())
                    #print(loss.item())

                    total_loss = self.loss_combination_operator(awareness_loss, loss)
                    running_loss += total_loss.item()

                    # Make sure the LR doesn't get too low
                    if self.optimizer.lr > 5 * pow(10,-6):
                        if np.isnan(running_loss) or np.isinf(running_loss) or running_loss > pow(10,4):
                            self.optimizer_args['lr'] = self.optimizer.lr/2
                            gc.collect()
                            if 'cuda' in str(self.net.device):
                                torch.cuda.empty_cache()

                            self.loss_combination_operator = operator.add
                            self.net = self.nn_class(ds, self.dynamic_parameters)
                            self.optimizer.zero_grad()
                            self.optimizer = self.optimizer_class(self.net.parameters(), **self.optimizer_args)

                            break
                else:
                    total_loss = loss

                total_loss.backward()
                self.optimizer.step()
                # now that we have run backward in both losses, optimize() (review: we may need to optimize for each step)

                error = running_loss / (i + 1)


                if error < 1:
                    if self.loss_combination_operator == operator.add:
                        self.loss_combination_operator = operator.mul
                '''
        yield error
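
As referenced in the oversampling comment inside fit above, here is a minimal, self-contained sketch of the row-weighting idea; the class weights, tensor shapes and variable names are assumptions for illustration only.

import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

output_weights = [0.2, 0.3, 0.5]                      # per-class weights (assumed)
targets = torch.eye(3)[torch.tensor([0, 0, 1, 2])]    # four rows with one-hot encoded targets
inputs = torch.randn(4, 8)

# Each row's sampling weight is the weight of its target class (argmax of the one-hot row)
weights = [output_weights[torch.argmax(t).item()] for t in targets]
sampler = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)

loader = DataLoader(TensorDataset(inputs, targets), batch_size=2, sampler=sampler)
for batch_inputs, batch_targets in loader:
    pass  # heavily weighted classes are drawn more often than their raw frequency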