def load_recommender(vector_dim, hidden, activation, dropout, weights_path):
    """Builds an AutoEncoder, loads pretrained weights, and returns it on the
    GPU in eval mode, ready for inference."""
    rencoder_api = model.AutoEncoder(
        layer_sizes=[vector_dim] + [int(l) for l in hidden.split(',')],
        nl_type=activation,
        is_constrained=False,
        dp_drop_prob=dropout,
        last_layer_activations=False)
    load_model_weights(rencoder_api, weights_path)
    rencoder_api.eval()
    rencoder_api = rencoder_api.cuda()
    return rencoder_api
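
# A minimal usage sketch for load_recommender. The vector size, hidden-layer
# string, activation, and checkpoint path below are illustrative assumptions,
# not values taken from this project's config.
def _demo_load_recommender():
    n_items = 17768  # illustrative item-vector size
    rencoder = load_recommender(vector_dim=n_items,
                                hidden='512,512,1024',
                                activation='selu',
                                dropout=0.0,
                                weights_path='model_save/model.last')
    # Score a single user: a dense vector of known ratings (zeros = unrated)
    user_vector = torch.zeros(1, n_items).cuda()
    with torch.no_grad():
        predicted_ratings = rencoder(user_vector)
    return predicted_ratings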
def teacher_preds(model_path, teacher_id):
    """Runs one teacher over the student's training data and memmaps its
    predictions as integer star ratings."""
    # Compute the number of samples and deduce the number of batches
    data_length = len(stud_train_data_layer.data.keys())
    vector_length = stud_train_data_layer._vector_dim
    print("shape: (%s, %s)" % (data_length, vector_length))

    # Temporary memmap that holds the raw float predictions
    preds = np.memmap('/data/Netflix/memmaps/preds_%s.dat.tmp' % teacher_id,
                      dtype=np.float32, mode='w+',
                      shape=(data_length, vector_length))

    # The number of features varies depending on what items were in this
    # teacher's training data
    rencoder = model.AutoEncoder(
        layer_sizes=[vector_length] +
        [int(l) for l in config['hidden_layers'].split(',')],
        nl_type=config['non_linearity_type'],
        is_constrained=config['constrained'],
        dp_drop_prob=config['drop_prob'],
        last_layer_activations=config['skip_last_layer_nl'])
    rencoder.load_state_dict(torch.load(model_path))
    rencoder.eval()
    if use_gpu:
        rencoder.cuda()

    # Run the student's training data through the teacher, batch by batch
    for i, mb in enumerate(
            stud_train_data_layer.iterate_one_epoch(do_shuffle=False)):
        inputs = Variable(mb.cuda().to_dense() if use_gpu else mb.to_dense())
        start = i * config['batch_size']
        end = (i + 1) * config['batch_size']
        # Store this minibatch's raw predictions in the memmap
        preds[start:end, :] = rencoder(inputs).cpu().detach().numpy()

    # Round the raw predictions to integer star ratings in [1, 5]
    final_preds = np.memmap('/data/Netflix/memmaps/preds_%s.dat' % teacher_id,
                            dtype=np.int8, mode='w+',
                            shape=(data_length, vector_length))
    final_preds[:, :] = np.rint(preds.clip(min=1, max=5))
    del preds
    gc.collect()
    os.remove('/data/Netflix/memmaps/preds_%s.dat.tmp' % teacher_id)
    return final_preds
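
# ensemble_preds is defined elsewhere in this project. Judging by the memmap
# shapes in the commented-out code in train_student below, a minimal sketch
# would stack each teacher's teacher_preds output along a new leading axis.
# The checkpoint naming convention used here mirrors train_teacher's
# "model_%s_%s.last" files but is still an assumption.
def _ensemble_preds_sketch(nb_teachers):
    data_length = len(stud_train_data_layer.data.keys())
    vector_length = stud_train_data_layer._vector_dim
    results = np.memmap('/data/Netflix/memmaps/results.dat', dtype=np.int8,
                        mode='w+',
                        shape=(nb_teachers, data_length, vector_length))
    for teacher_id in range(nb_teachers):
        model_path = config['logdir'] + '/model_%s_%s.last' % (nb_teachers,
                                                               teacher_id)
        results[teacher_id, :, :] = teacher_preds(model_path, teacher_id)
    return results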
def eval_student(nb_teachers):
    """Loads the trained student checkpoint and reports its evaluation loss."""
    model_path = student_dir + '/model_' + str(nb_teachers) + '_student.last'
    rencoder = model.AutoEncoder(
        layer_sizes=[dr.stud_train_data_layer._vector_dim] +
        [int(l) for l in dr.config['hidden_layers'].split(',')],
        nl_type=dr.config['non_linearity_type'],
        is_constrained=dr.config['constrained'],
        dp_drop_prob=dr.config['drop_prob'],
        last_layer_activations=dr.config['skip_last_layer_nl'])
    rencoder.load_state_dict(torch.load(model_path))
    rencoder.eval()
    if dr.use_gpu:
        rencoder.cuda()
    eval_loss = dr.do_eval(rencoder, dr.stud_eval_data_layer)
    print("EVALUATION LOSS: %s" % eval_loss)
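
# Typical call order (a sketch, assuming student_dir and train_dir point to
# the same location): the checkpoint read by eval_student is the one written
# at the end of train_student below, and train_student also initialises the
# data layers that eval_student relies on.
def _run_student_pipeline(nb_teachers):
    train_student(nb_teachers)  # calls dr.load_maps() / load_train_data_layer()
    eval_student(nb_teachers)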
def train_teacher(nb_teachers, teacher_id):
    '''Trains a single teacher on its own disjoint chunk of the training data.

    Very similar to code from DeepRecommender/run.py.
    '''
    nf_data_dir = dr.config['path_to_train_data']
    nf_eval_data_dir = dr.config['path_to_eval_data']

    # Partition the training files so each teacher sees a disjoint chunk
    all_files = [
        path.join(nf_data_dir, f) for f in listdir(nf_data_dir)
        if path.isfile(path.join(nf_data_dir, f)) and f.endswith('.txt')
    ]
    chunk_size = floor(len(all_files) / nb_teachers)
    start = teacher_id * chunk_size
    chunk = all_files[start:start + chunk_size]
    params['src_files'] = chunk

    print("Loading Training Data")
    data_layer = new_input_layer.UserItemRecDataProviderNew(
        params=params, user_id_map=userIdMap, item_id_map=itemIdMap)
    print("Data loaded")
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading eval data")
    eval_params = copy.deepcopy(params)
    del eval_params['src_files']
    # must set eval batch size to 1 to make sure no examples are missed
    eval_params['data_dir'] = nf_eval_data_dir
    eval_data_layer = input_layer.UserItemRecDataProvider(
        params=eval_params, user_id_map=userIdMap, item_id_map=itemIdMap)
    eval_data_layer.src_data = src_data_layer.data

    rencoder = model.AutoEncoder(
        layer_sizes=[data_layer.vector_dim] +
        [int(l) for l in dr.config['hidden_layers'].split(',')],
        nl_type=dr.config['non_linearity_type'],
        is_constrained=dr.config['constrained'],
        dp_drop_prob=dr.config['drop_prob'],
        last_layer_activations=dr.config['skip_last_layer_nl'])

    # Resume from a checkpoint if one exists for this teacher
    os.makedirs(dr.config['logdir'], exist_ok=True)
    model_checkpoint = dr.config['logdir'] + "/model_%s_%s" % (nb_teachers,
                                                               teacher_id)
    path_to_model = Path(model_checkpoint)
    if path_to_model.is_file():
        print("Loading model from: {}".format(model_checkpoint))
        rencoder.load_state_dict(torch.load(model_checkpoint))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')

    gpu_ids = [int(g) for g in dr.config['gpu_ids'].split(',')]
    print('Using GPUs: {}'.format(gpu_ids))
    if len(gpu_ids) > 1:
        rencoder = nn.DataParallel(rencoder, device_ids=gpu_ids)
    if use_gpu:
        rencoder = rencoder.cuda()

    if dr.config['optimizer'] == "adam":
        optimizer = optim.Adam(rencoder.parameters(),
                               lr=dr.config['lr'],
                               weight_decay=dr.config['weight_decay'])
    elif dr.config['optimizer'] == "adagrad":
        optimizer = optim.Adagrad(rencoder.parameters(),
                                  lr=dr.config['lr'],
                                  weight_decay=dr.config['weight_decay'])
    elif dr.config['optimizer'] == "momentum":
        optimizer = optim.SGD(rencoder.parameters(),
                              lr=dr.config['lr'],
                              momentum=0.9,
                              weight_decay=dr.config['weight_decay'])
        scheduler = MultiStepLR(optimizer,
                                milestones=[24, 36, 48, 66, 72],
                                gamma=0.5)
    elif dr.config['optimizer'] == "rmsprop":
        optimizer = optim.RMSprop(rencoder.parameters(),
                                  lr=dr.config['lr'],
                                  momentum=0.9,
                                  weight_decay=dr.config['weight_decay'])
    else:
        raise ValueError('Unknown optimizer kind')

    t_loss = 0.0
    t_loss_denom = 0.0
    global_step = 0

    if dr.config['noise_prob'] > 0.0:
        dp = nn.Dropout(p=dr.config['noise_prob'])

    for epoch in range(dr.config['num_epochs']):
        print('Doing epoch {} of {}'.format(epoch, dr.config['num_epochs']))
        e_start_time = time.time()
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        if dr.config['optimizer'] == "momentum":
            scheduler.step()
        for i, mb in enumerate(data_layer.iterate_one_epoch()):
            inputs = Variable(
                mb.cuda().to_dense() if use_gpu else mb.to_dense())
            optimizer.zero_grad()
            outputs = rencoder(inputs)
            loss, num_ratings = model.MSEloss(outputs, inputs)
            loss = loss / num_ratings
            loss.backward()
            optimizer.step()
            global_step += 1
            t_loss += loss.item()
            t_loss_denom += 1

            if i % dr.config['summary_frequency'] == 0:
                print('[%d, %5d] RMSE: %.7f' %
                      (epoch, i, sqrt(t_loss / t_loss_denom)))
                logger.scalar_summary("Training_RMSE",
                                      sqrt(t_loss / t_loss_denom),
                                      global_step)
                t_loss = 0
                t_loss_denom = 0.0
                log_var_and_grad_summaries(logger, rencoder.encode_w,
                                           global_step, "Encode_W")
                log_var_and_grad_summaries(logger, rencoder.encode_b,
                                           global_step, "Encode_b")
                if not rencoder.is_constrained:
                    log_var_and_grad_summaries(logger, rencoder.decode_w,
                                               global_step, "Decode_W")
                log_var_and_grad_summaries(logger, rencoder.decode_b,
                                           global_step, "Decode_b")

            total_epoch_loss += loss.item()
            denom += 1

            #if dr.config['aug_step'] > 0 and i % dr.config['aug_step'] == 0 and i > 0:
            if dr.config['aug_step'] > 0:
                # Magic data augmentation trick happens here: re-feed the
                # model's own dense output as a new training example
                for t in range(dr.config['aug_step']):
                    inputs = Variable(outputs.data)
                    if dr.config['noise_prob'] > 0.0:
                        inputs = dp(inputs)
                    optimizer.zero_grad()
                    outputs = rencoder(inputs)
                    loss, num_ratings = model.MSEloss(outputs, inputs)
                    loss = loss / num_ratings
                    loss.backward()
                    optimizer.step()

        e_end_time = time.time()
        print('Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}'
              .format(epoch, e_end_time - e_start_time,
                      sqrt(total_epoch_loss / denom)))
        logger.scalar_summary("Training_RMSE_per_epoch",
                              sqrt(total_epoch_loss / denom), epoch)
        logger.scalar_summary("Epoch_time", e_end_time - e_start_time, epoch)

        # Only evaluate after the final epoch
        if epoch == dr.config['num_epochs'] - 1:
            eval_loss = do_eval(rencoder, eval_data_layer)
            print('Epoch {} EVALUATION LOSS: {}'.format(epoch, eval_loss))
            logger.scalar_summary("EVALUATION_RMSE", eval_loss, epoch)

    print("Saving model to {}".format(model_checkpoint + ".last"))
    torch.save(rencoder.state_dict(), model_checkpoint + ".last")
    return True
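
# All teachers must be trained before their predictions can be aggregated
# for the student. A minimal sequential driver (a sketch; in practice each
# teacher can be trained as a separate job, since the data chunks are
# disjoint):
def _train_all_teachers(nb_teachers):
    for teacher_id in range(nb_teachers):
        print("Training teacher %s of %s" % (teacher_id + 1, nb_teachers))
        assert train_teacher(nb_teachers, teacher_id)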
def train_student(nb_teachers):
    """
    Trains a student using predictions made by an ensemble of teachers.
    The student and teacher models are trained using the same neural
    network architecture.

    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :return: True if student training went well
    """
    assert input.create_dir_if_needed(train_dir)

    dr.load_maps()
    dr.load_train_data_layer()

    predictions = ensemble_preds(nb_teachers)
    #print("%s, %s, %s" % (nb_teachers, len(dr.stud_train_data_layer.data.keys()),
    #                      dr.stud_train_data_layer._vector_dim))
    #predictions = np.memmap('/data/Netflix/memmaps/results.dat', dtype=np.int8,
    #                        shape=(nb_teachers,
    #                               len(dr.stud_train_data_layer.data.keys()),
    #                               dr.stud_train_data_layer._vector_dim), mode='r')

    labels = nagg.noisy_max(predictions, lap_scale)
    #labels = np.memmap('/data/Netflix/memmaps/results_.dat', dtype=np.float32,
    #                   shape=(len(dr.stud_train_data_layer.data.keys()),
    #                          dr.stud_train_data_layer._vector_dim), mode='r')

    # IN THE ABOVE: it is recommended to run one step at a time: have the
    # predictions save to their memmap file, then load that file in the next
    # run to compute the labels, then load the labels from file again to
    # carry on with the rest of training. This works around memory bugs that
    # occur when going directly from one step to the next.

    # Prepare checkpoint filename and path
    model_path = train_dir + '/model_' + str(nb_teachers) + '_student.last'

    rencoder = model.AutoEncoder(
        layer_sizes=[dr.stud_train_data_layer._vector_dim] +
        [int(l) for l in dr.config['hidden_layers'].split(',')],
        nl_type=dr.config['non_linearity_type'],
        is_constrained=dr.config['constrained'],
        dp_drop_prob=dr.config['drop_prob'],
        last_layer_activations=dr.config['skip_last_layer_nl'])

    gpu_ids = [int(g) for g in dr.config['gpu_ids'].split(',')]
    print('Using GPUs: {}'.format(gpu_ids))
    if len(gpu_ids) > 1:
        rencoder = nn.DataParallel(rencoder, device_ids=gpu_ids)
    if dr.use_gpu:
        rencoder = rencoder.cuda()

    if dr.config['optimizer'] == "adam":
        optimizer = optim.Adam(rencoder.parameters(),
                               lr=dr.config['lr'],
                               weight_decay=dr.config['weight_decay'])
    elif dr.config['optimizer'] == "adagrad":
        optimizer = optim.Adagrad(rencoder.parameters(),
                                  lr=dr.config['lr'],
                                  weight_decay=dr.config['weight_decay'])
    elif dr.config['optimizer'] == "momentum":
        optimizer = optim.SGD(rencoder.parameters(),
                              lr=dr.config['lr'],
                              momentum=0.9,
                              weight_decay=dr.config['weight_decay'])
        scheduler = MultiStepLR(optimizer,
                                milestones=[24, 36, 48, 66, 72],
                                gamma=0.5)
    elif dr.config['optimizer'] == "rmsprop":
        optimizer = optim.RMSprop(rencoder.parameters(),
                                  lr=dr.config['lr'],
                                  momentum=0.9,
                                  weight_decay=dr.config['weight_decay'])
    else:
        raise ValueError('Unknown optimizer kind')

    t_loss = 0.0
    t_loss_denom = 0.0
    global_step = 0

    if dr.config['noise_prob'] > 0.0:
        dp = nn.Dropout(p=dr.config['noise_prob'])

    # Start student training
    for epoch in range(dr.config['num_epochs']):
        print('Doing epoch {} of {}'.format(epoch, dr.config['num_epochs']))
        e_start_time = time.time()
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        if dr.config['optimizer'] == "momentum":
            scheduler.step()
        num_batches = int(len(labels) / dr.config['batch_size'])
        for i, (mb, new_labels) in enumerate(
                iterate_one_epoch(dr.stud_train_data_layer, labels)):
            if i % 100 == 0:
                print("batch %s out of %s" % (i, num_batches))
            inputs = Variable(
                mb.cuda().to_dense() if dr.use_gpu else mb.to_dense())
            consensus = Variable(
                new_labels.cuda() if dr.use_gpu else new_labels)
            optimizer.zero_grad()
            outputs = rencoder(inputs)
            # Fit the student to the noisy teacher consensus rather than
            # to the raw inputs
            loss, num_ratings = model.MSEloss(outputs, consensus)
            loss = loss / num_ratings
            loss.backward()
            optimizer.step()
            global_step += 1
            t_loss += loss.item()
            t_loss_denom += 1
            total_epoch_loss += loss.item()
            denom += 1

            #if dr.config['aug_step'] > 0 and i % dr.config['aug_step'] == 0 and i > 0:
            if dr.config['aug_step'] > 0:
                # Magic data augmentation trick happens here: re-feed the
                # model's own dense output as a new training example
                for t in range(dr.config['aug_step']):
                    inputs = Variable(outputs.data)
                    if dr.config['noise_prob'] > 0.0:
                        inputs = dp(inputs)
                    optimizer.zero_grad()
                    outputs = rencoder(inputs)
                    loss, num_ratings = model.MSEloss(outputs, inputs)
                    loss = loss / num_ratings
                    loss.backward()
                    optimizer.step()

        e_end_time = time.time()
        print('Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}'
              .format(epoch, e_end_time - e_start_time,
                      sqrt(total_epoch_loss / denom)))

    torch.save(rencoder.state_dict(), model_path)
    print("STUDENT TRAINED")
    return True
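
# nagg.noisy_max is imported from elsewhere. For reference, here is a sketch
# of the standard PATE noisy-max aggregation it presumably implements: count
# teacher votes per rating value, add Laplace noise, and take the argmax.
# Shapes and the vote encoding are assumptions: predictions is
# (nb_teachers, nb_users, nb_items) with integer ratings in {0, ..., 5},
# where 0 marks an unrated item.
def _noisy_max_sketch(predictions, lap_scale, nb_ratings=6):
    nb_teachers, nb_users, nb_items = predictions.shape
    labels = np.zeros((nb_users, nb_items), dtype=np.int8)
    for u in range(nb_users):
        # votes[r, j] = number of teachers that predicted rating r for item j
        votes = np.zeros((nb_ratings, nb_items))
        for t in range(nb_teachers):
            votes[predictions[t, u, :], np.arange(nb_items)] += 1
        # Laplace noise on the vote counts is what makes the released
        # argmax label differentially private
        votes += np.random.laplace(loc=0.0, scale=float(lap_scale),
                                   size=votes.shape)
        labels[u, :] = np.argmax(votes, axis=0)
    return labels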