def prepare_encoder(self, priming_data):
    """
    Fit the underlying one-hot encoder and, when the category vocabulary is
    larger than `max_encoded_length`, train an autoencoder whose bottleneck
    layer is later used to produce compact categorical embeddings.

    :param priming_data: iterable of category values to prime the encoder with
    :raises Exception: if called more than once on the same encoder
    """
    # Seed from the dataset size so repeated runs on the same data behave the same
    random.seed(len(priming_data))

    if self._prepared:
        raise Exception('You can only call "prepare_encoder" once for a given encoder.')

    self.onehot_encoder.prepare_encoder(priming_data)

    input_len = self.onehot_encoder._lang.n_words
    # Only bother with an autoencoder when the one-hot width exceeds the cap
    self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length
    if self.use_autoencoder:
        logging.info('Preparing a categorical autoencoder, this might take a while')

        embeddings_layer_len = self.max_encoded_length

        # Symmetric autoencoder: input -> bottleneck -> input
        self.net = DefaultNet(dynamic_parameters={},
                              shape=[input_len, embeddings_layer_len, input_len],
                              selfaware=False)

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = Ranger(self.net.parameters())

        gym = Gym(model=self.net, optimizer=optimizer, scheduler=None,
                  loss_criterion=criterion, device=self.net.device, name=self.name,
                  input_encoder=self.onehot_encoder.encode,
                  output_encoder=self._encoder_targets)

        # Bugfix: clamp to >= 1 -- for priming data with fewer than 50 rows the
        # old `min(200, int(len(priming_data) / 50))` produced batch_size == 0,
        # which DataLoader rejects.
        batch_size = max(1, min(200, len(priming_data) // 50))

        priming_data_str = [str(x) for x in priming_data]
        # Autoencoder target == input, hence the (x, x) pairs
        train_data_loader = DataLoader(list(zip(priming_data_str, priming_data_str)),
                                       batch_size=batch_size, shuffle=True)
        test_data_loader = None

        best_model, error, training_time = gym.fit(train_data_loader,
                                                   test_data_loader,
                                                   desired_error=self.desired_error,
                                                   max_time=self.max_training_time,
                                                   callback=self._train_callback,
                                                   eval_every_x_epochs=1,
                                                   max_unimproving_models=5)

        self.net = best_model.to(self.net.device)

        # Flatten the trained net into its leaf modules, then split into the
        # encoder (first two layers, up to the bottleneck) and the decoder.
        modules = [module for module in self.net.modules()
                   if type(module) != torch.nn.Sequential and type(module) != DefaultNet]
        self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
        self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
        logging.info('Categorical autoencoder ready')

    self._prepared = True
def prepare_encoder(self, priming_data, training_data=None):
    """
    Prepare the text encoder: tokenize the priming data and, when suitable
    training targets are available, fine-tune the underlying transformer.

    Depending on the targets, `self._model_type` is set to one of:
      * 'classifier' -- exactly one categorical target: fine-tune a sequence
        classification head end-to-end.
      * 'generic_target_predictor' -- all targets numeric/categorical: keep
        the transformer frozen and train a small feed-forward head on top of
        its embeddings.
      * 'embeddings_generator' -- no usable targets: use the raw pretrained
        embeddings as-is.

    :param priming_data: list of input texts (None entries are treated as '')
    :param training_data: optional dict; when present, `training_data['targets']`
        is a list of targets carrying 'output_type', 'encoded_output' and
        'unencoded_output' entries
    :raises Exception: if called more than once on the same encoder
    """
    if self._prepared:
        raise Exception(
            'You can only call "prepare_encoder" once for a given encoder.'
        )

    priming_data = [x if x is not None else '' for x in priming_data]
    # Cap the (character) length we feed the tokenizer at the model's limit
    self._max_len = min(max(len(x) for x in priming_data), self._model_max_len)
    self._tokenizer = self._tokenizer_class.from_pretrained(
        self._pretrained_model_name)
    self._pad_id = self._tokenizer.convert_tokens_to_ids(
        [self._tokenizer.pad_token])[0]

    def grouped_params(model):
        # Standard transformer fine-tuning setup: biases and LayerNorm
        # weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        return [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.000001
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

    def tokenize_and_pad(texts):
        # Encode each text (truncated to _max_len chars) and right-pad every
        # sequence to the longest tokenized length in this batch.
        ids = [self._tokenizer.encode(x[:self._max_len], add_special_tokens=True)
               for x in texts]
        tokenized_max_len = max(len(x) for x in ids)
        return torch.tensor([x + [self._pad_id] * (tokenized_max_len - len(x))
                             for x in ids])

    # @TODO: Support multiple targets if they are all categorical or train for the categorical target if it's a mix (maybe ?)
    # @TODO: Attach a language modeling head and/or use GPT2 and/or provide outputs better suited to a LM head (which will be the mixer) if the output if text
    has_targets = training_data is not None and 'targets' in training_data

    if (has_targets and len(training_data['targets']) == 1
            and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
            and CONFIG.TRAIN_TO_PREDICT_TARGET):
        self._model_type = 'classifier'
        # +1 label to leave room for an unseen/unknown category
        self._model = self._classifier_model_class.from_pretrained(
            self._pretrained_model_name,
            num_labels=len(set(training_data['targets'][0]['unencoded_output'])) + 1
        ).to(self.device)
        batch_size = 10

        optimizer = AdamW(grouped_params(self._model), lr=5e-5, eps=1e-8)
        # num_training_steps is only a rough estimate of how long training runs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=10,
            num_training_steps=len(priming_data) * 15 / 20)

        gym = Gym(model=self._model, optimizer=optimizer, scheduler=scheduler,
                  loss_criterion=None, device=self.device, name=self.name)

        tokens = tokenize_and_pad(priming_data)
        real = training_data['targets'][0]['encoded_output']

        merged_data = list(zip(tokens, real))
        split = int(len(merged_data) * 9 / 10)  # 90/10 train/test split
        train_data_loader = DataLoader(merged_data[:split],
                                       batch_size=batch_size, shuffle=True)
        test_data_loader = DataLoader(merged_data[split:],
                                      batch_size=batch_size, shuffle=True)

        best_model, error, training_time = gym.fit(
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            desired_error=self.desired_error,
            max_time=self.max_training_time,
            callback=self._train_callback,
            eval_every_x_epochs=1,
            max_unimproving_models=10,
            custom_train_func=partial(self.categorical_train_function, test=False),
            custom_test_func=partial(self.categorical_train_function, test=True))

        self._model = best_model.to(self.device)

    # Bugfix: the original `elif` dereferenced training_data['targets'] without
    # checking training_data for None, crashing whenever prepare_encoder was
    # called without training_data and the first branch did not match.
    elif (has_targets
          and all(x['output_type'] == COLUMN_DATA_TYPES.NUMERIC
                  or x['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
                  for x in training_data['targets'])
          and CONFIG.TRAIN_TO_PREDICT_TARGET):
        self.desired_error = 0.01
        self._model_type = 'generic_target_predictor'
        self._model = self._embeddings_model_class.from_pretrained(
            self._pretrained_model_name).to(self.device)
        batch_size = 10

        # Funnel from the 768-wide transformer embedding down to the width of
        # all target encodings concatenated together.
        self._head = DefaultNet(
            ds=None,
            dynamic_parameters={},
            shape=funnel(768,
                         sum(len(x['encoded_output'][0])
                             for x in training_data['targets']),
                         depth=5),
            selfaware=False)

        optimizer = torch.optim.AdamW(grouped_params(self._head), lr=5e-5, eps=1e-8)
        # num_training_steps is kind of an estimation
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=10,
            num_training_steps=len(priming_data) * 15 / 20)

        criterion = torch.nn.MSELoss()
        gym = Gym(model=self._head, optimizer=optimizer, scheduler=scheduler,
                  loss_criterion=criterion, device=self.device, name=self.name)

        tokens = tokenize_and_pad(priming_data)
        # One flat vector per sample: each target's encoded row concatenated
        n_rows = len(training_data['targets'][0]['encoded_output'])
        real = torch.tensor([
            [v for target in training_data['targets']
             for v in target['encoded_output'][i]]
            for i in range(n_rows)
        ])

        merged_data = list(zip(tokens, real))
        split = int(len(merged_data) * 9 / 10)  # 90/10 train/test split
        train_data_loader = DataLoader(merged_data[:split],
                                       batch_size=batch_size, shuffle=True)
        test_data_loader = DataLoader(merged_data[split:],
                                      batch_size=batch_size, shuffle=True)

        # The transformer backbone stays frozen; only the head is trained.
        self._model.eval()
        best_model, error, training_time = gym.fit(
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            desired_error=self.desired_error,
            max_time=self.max_training_time,
            callback=self._train_callback,
            eval_every_x_epochs=1,
            max_unimproving_models=10,
            custom_train_func=partial(self.numerical_train_function,
                                      backbone=self._model, test=False),
            custom_test_func=partial(self.numerical_train_function,
                                     backbone=self._model, test=True))

        self._head = best_model.to(self.device)

    else:
        self._model_type = 'embeddings_generator'
        self._model = self._embeddings_model_class.from_pretrained(
            self._pretrained_model_name).to(self.device)

    self._prepared = True
def fit(self, train_ds, test_ds, max_time, eval_every_x_epochs, callback):
    """
    Build the network for `train_ds` and train it via the Gym, evaluating on
    `test_ds`. This is a generator: it yields the final test error once
    training finishes.

    :param train_ds: training data source (also fitted/transformed here)
    :param test_ds: held-out data source used for evaluation
    :param max_time: wall-clock training budget handed to the Gym
    :param eval_every_x_epochs: evaluation cadence handed to the Gym
    :param callback: progress callback invoked by the Gym
    :yields: the best test-set error reached during training
    """
    self.fit_data_source(train_ds)

    if self.is_categorical_output:
        # The WeightedRandomSampler samples "randomly" but can assign higher
        # weight to certain rows; each row gets the weight associated with its
        # target value in the output_weights map (the same map is otherwise
        # used to bias the loss function).
        if train_ds.output_weights is not None and train_ds.output_weights is not False and CONFIG.OVERSAMPLE:
            weights = []
            for row in train_ds:
                _, out = row
                # @Note: This assumes one-hot encoding for the encoded_value
                weights.append(train_ds.output_weights[torch.argmax(out).item()])
            self._nonpersistent['sampler'] = torch.utils.data.WeightedRandomSampler(
                weights=weights, num_samples=len(weights), replacement=True)

    self.net = self.nn_class(train_ds, self.dynamic_parameters)
    self.net = self.net.train()

    # Keep at least one sample per available device
    if self.batch_size < self.net.available_devices:
        self.batch_size = self.net.available_devices

    self.awareness_criterion = torch.nn.MSELoss()

    if self.criterion is None:
        if self.is_categorical_output:
            # When not oversampling, the class imbalance is compensated by
            # weighting the loss instead of the sampler.
            if train_ds.output_weights is not None and train_ds.output_weights is not False and not CONFIG.OVERSAMPLE:
                output_weights = torch.Tensor(train_ds.output_weights).to(self.net.device)
            else:
                output_weights = None
            self.criterion = torch.nn.CrossEntropyLoss(weight=output_weights)
        else:
            self.criterion = torch.nn.MSELoss()

    # Assemble optimizer arguments from the dynamic (searchable) parameters
    self.optimizer_class = Ranger
    if self.optimizer_args is None:
        self.optimizer_args = {}
    if 'beta1' in self.dynamic_parameters:
        self.optimizer_args['betas'] = (self.dynamic_parameters['beta1'], 0.999)
    for optimizer_arg_name in ['lr', 'k', 'N_sma_threshold']:
        if optimizer_arg_name in self.dynamic_parameters:
            self.optimizer_args[optimizer_arg_name] = self.dynamic_parameters[optimizer_arg_name]
    self.optimizer = self.optimizer_class(self.net.parameters(), **self.optimizer_args)

    if self._nonpersistent['sampler'] is None:
        train_data_loader = DataLoader(train_ds, batch_size=self.batch_size, shuffle=True)
    else:
        # shuffle and sampler are mutually exclusive in DataLoader
        train_data_loader = DataLoader(train_ds, batch_size=self.batch_size,
                                       sampler=self._nonpersistent['sampler'])
    test_data_loader = DataLoader(test_ds, batch_size=self.batch_size, shuffle=True, num_workers=0)

    # NOTE(review): the Gym is handed self._model / self.device / self.name
    # even though the freshly built network is self.net (device
    # self.net.device) -- confirm these attributes alias the same objects,
    # otherwise the net constructed above is never the one trained.
    gym = Gym(model=self._model, optimizer=self.optimizer, scheduler=None,
              loss_criterion=self.criterion, device=self.device, name=self.name)

    best_model, error, training_time = gym.fit(
        train_data_loader=train_data_loader,
        test_data_loader=test_data_loader,
        desired_error=0,
        max_time=max_time,
        callback=callback,
        eval_every_x_epochs=eval_every_x_epochs,
        max_unimproving_models=10,
        custom_train_func=self._train_loop,
        custom_test_func=self._test_loop)

    # (A large commented-out manual training loop that previously lived here
    # was removed: the Gym now owns the train/test loops via _train_loop and
    # _test_loop. The unused `total_epochs = self.epochs` was dropped too.)
    yield error