def prepare_encoder(self, priming_data):
    random.seed(len(priming_data))

    if self._prepared:
        raise Exception('You can only call "prepare_encoder" once for a given encoder.')

    self.onehot_encoder.prepare_encoder(priming_data)

    input_len = self.onehot_encoder._lang.n_words
    # Only train an autoencoder when the one-hot representation is wider than the allowed embedding size
    self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

    if self.use_autoencoder:
        logging.info('Preparing a categorical autoencoder, this might take a while')

        embeddings_layer_len = self.max_encoded_length

        self.net = DefaultNet(dynamic_parameters={},
                              shape=[input_len, embeddings_layer_len, input_len],
                              selfaware=False)

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = Ranger(self.net.parameters())

        gym = Gym(model=self.net, optimizer=optimizer, scheduler=None,
                  loss_criterion=criterion, device=self.net.device, name=self.name,
                  input_encoder=self.onehot_encoder.encode,
                  output_encoder=self._encoder_targets)

        batch_size = min(200, int(len(priming_data) / 50))

        priming_data_str = [str(x) for x in priming_data]
        train_data_loader = DataLoader(list(zip(priming_data_str, priming_data_str)),
                                       batch_size=batch_size, shuffle=True)
        test_data_loader = None

        best_model, error, training_time = gym.fit(train_data_loader,
                                                   test_data_loader,
                                                   desired_error=self.desired_error,
                                                   max_time=self.max_training_time,
                                                   callback=self._train_callback,
                                                   eval_every_x_epochs=1,
                                                   max_unimproving_models=5)

        self.net = best_model.to(self.net.device)

        # Split the trained network into its encoder (input -> embedding) and decoder (embedding -> logits) halves
        modules = [module for module in self.net.modules()
                   if type(module) != torch.nn.Sequential and type(module) != DefaultNet]
        self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
        self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
        logging.info('Categorical autoencoder ready')

    self._prepared = True
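# The block below is a standalone, simplified sketch (not part of the original file)
# of the pattern used in `prepare_encoder` above: train a network shaped
# [input_len, bottleneck, input_len], then cut it into an encoder half (one-hot ->
# dense embedding) and a decoder half (embedding -> per-category logits). The SELU
# activation and all sizes here are illustrative assumptions, not the actual
# DefaultNet internals.
import torch

def split_autoencoder(input_len, bottleneck):
    net = torch.nn.Sequential(
        torch.nn.Linear(input_len, bottleneck),
        torch.nn.SELU(),
        torch.nn.Linear(bottleneck, input_len),
    )
    # Flatten out the container module, then cut at the bottleneck, mirroring the
    # modules[0:2] / modules[2:3] split above.
    modules = [m for m in net.modules() if not isinstance(m, torch.nn.Sequential)]
    encoder = torch.nn.Sequential(*modules[0:2]).eval()
    decoder = torch.nn.Sequential(*modules[2:3]).eval()
    return encoder, decoder

# Usage: encode a one-hot row into a dense embedding, then decode it back to logits.
encoder, decoder = split_autoencoder(input_len=50, bottleneck=16)
one_hot = torch.nn.functional.one_hot(torch.tensor([3]), num_classes=50).float()
with torch.no_grad():
    embedding = encoder(one_hot)   # shape: [1, 16] dense category embedding
    logits = decoder(embedding)    # shape: [1, 50] reconstruction logits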
def prepare_encoder(self, priming_data, training_data=None):
    if self._prepared:
        raise Exception('You can only call "prepare_encoder" once for a given encoder.')

    priming_data = [x if x is not None else '' for x in priming_data]

    self._max_len = min(max([len(x) for x in priming_data]), self._model_max_len)
    self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name)
    self._pad_id = self._tokenizer.convert_tokens_to_ids([self._tokenizer.pad_token])[0]
    # @TODO: Support multiple targets if they are all categorical, or train for the categorical target if it's a mix (maybe ?)
    # @TODO: Attach a language modeling head and/or use GPT2 and/or provide outputs better suited to a LM head (which will be the mixer) if the output is text

    if training_data is not None and 'targets' in training_data and len(training_data['targets']) == 1 and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL and CONFIG.TRAIN_TO_PREDICT_TARGET:
        # Single categorical target: fine-tune the pretrained model as a sequence classifier
        self._model_type = 'classifier'
        self._model = self._classifier_model_class.from_pretrained(
            self._pretrained_model_name,
            num_labels=len(set(training_data['targets'][0]['unencoded_output'])) + 1
        ).to(self.device)

        batch_size = 10

        # Exclude biases and LayerNorm weights from weight decay
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in self._model.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.000001
            },
            {
                'params': [p for n, p in self._model.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10,
                                                    num_training_steps=len(priming_data) * 15 / 20)

        gym = Gym(model=self._model, optimizer=optimizer, scheduler=scheduler,
                  loss_criterion=None, device=self.device, name=self.name)

        # Tokenize, truncate to the maximum length and pad every row to the same length
        input = [self._tokenizer.encode(x[:self._max_len], add_special_tokens=True) for x in priming_data]
        tokenized_max_len = max([len(x) for x in input])
        input = torch.tensor([x + [self._pad_id] * (tokenized_max_len - len(x)) for x in input])

        real = training_data['targets'][0]['encoded_output']

        merged_data = list(zip(input, real))

        # 90/10 train/test split
        train_data_loader = DataLoader(merged_data[:int(len(merged_data) * 9 / 10)],
                                       batch_size=batch_size, shuffle=True)
        test_data_loader = DataLoader(merged_data[int(len(merged_data) * 9 / 10):],
                                      batch_size=batch_size, shuffle=True)

        best_model, error, training_time = gym.fit(
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            desired_error=self.desired_error,
            max_time=self.max_training_time,
            callback=self._train_callback,
            eval_every_x_epochs=1,
            max_unimproving_models=10,
            custom_train_func=partial(self.categorical_train_function, test=False),
            custom_test_func=partial(self.categorical_train_function, test=True)
        )

        self._model = best_model.to(self.device)

    elif training_data is not None and 'targets' in training_data and all(
            [x['output_type'] == COLUMN_DATA_TYPES.NUMERIC or x['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
             for x in training_data['targets']]) and CONFIG.TRAIN_TO_PREDICT_TARGET:
        # Numeric and/or categorical targets: train a prediction head on top of the (frozen) pretrained embeddings
        self.desired_error = 0.01
        self._model_type = 'generic_target_predictor'
        self._model = self._embeddings_model_class.from_pretrained(self._pretrained_model_name).to(self.device)
        batch_size = 10

        self._head = DefaultNet(ds=None, dynamic_parameters={},
                                shape=funnel(768,
                                             sum([len(x['encoded_output'][0]) for x in training_data['targets']]),
                                             depth=5),
                                selfaware=False)

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in self._head.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.000001
            },
            {
                'params': [p for n, p in self._head.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]

        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
        # optimizer = Ranger(self._head.parameters(), lr=5e-5)

        # num_training_steps is kind of an estimation
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10,
                                                    num_training_steps=len(priming_data) * 15 / 20)

        criterion = torch.nn.MSELoss()

        gym = Gym(model=self._head, optimizer=optimizer, scheduler=scheduler,
                  loss_criterion=criterion, device=self.device, name=self.name)

        input = [self._tokenizer.encode(x[:self._max_len], add_special_tokens=True) for x in priming_data]
        tokenized_max_len = max([len(x) for x in input])
        input = torch.tensor([x + [self._pad_id] * (tokenized_max_len - len(x)) for x in input])

        # Concatenate the encoded outputs of every target into one flat vector per row
        real = [[]] * len(training_data['targets'][0]['encoded_output'])
        for i in range(len(real)):
            for target in training_data['targets']:
                real[i] = real[i] + target['encoded_output'][i]
        real = torch.tensor(real)

        merged_data = list(zip(input, real))

        train_data_loader = DataLoader(merged_data[:int(len(merged_data) * 9 / 10)],
                                       batch_size=batch_size, shuffle=True)
        test_data_loader = DataLoader(merged_data[int(len(merged_data) * 9 / 10):],
                                      batch_size=batch_size, shuffle=True)

        # The backbone only provides embeddings here (its parameters are not in the optimizer), so keep it in eval mode
        self._model.eval()

        best_model, error, training_time = gym.fit(
            train_data_loader=train_data_loader,
            test_data_loader=test_data_loader,
            desired_error=self.desired_error,
            max_time=self.max_training_time,
            callback=self._train_callback,
            eval_every_x_epochs=1,
            max_unimproving_models=10,
            custom_train_func=partial(self.numerical_train_function, backbone=self._model, test=False),
            custom_test_func=partial(self.numerical_train_function, backbone=self._model, test=True)
        )

        self._head = best_model.to(self.device)

    else:
        # No suitable target: just expose the pretrained model as an embeddings generator
        self._model_type = 'embeddings_generator'
        self._model = self._embeddings_model_class.from_pretrained(self._pretrained_model_name).to(self.device)

    self._prepared = True
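# Standalone sketch (illustrative, not from the original file) of the optimizer setup
# shared by both fine-tuning branches above: parameters whose names contain 'bias' or
# 'LayerNorm.weight' are excluded from weight decay, everything else gets a small decay,
# and a linear warmup schedule wraps AdamW. `TinyHead`, the layer sizes and the step
# counts are made-up stand-ins for the transformer / DefaultNet head used above.
import torch
from transformers import get_linear_schedule_with_warmup

class TinyHead(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(768, 32)
        self.LayerNorm = torch.nn.LayerNorm(32)

    def forward(self, x):
        return self.LayerNorm(self.dense(x))

head = TinyHead()
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in head.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.000001},
    {'params': [p for n, p in head.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)

# One illustrative optimization step: backward pass, parameter update, then LR schedule update.
loss = head(torch.randn(4, 768)).pow(2).mean()
loss.backward()
optimizer.step()
scheduler.step()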