def fetch_recs_for_users(user,
                         user_dict,
                         pdt_id,
                         item_feat_df,
                         item_id_type,
                         result_filepath,
                         ground_truth_purchase_dict=None):
    """
    For all items in a dict (of recs, or already_bought, or ground_truth), fetch information.

    One line is appended to ``result_filepath`` per item found in ``user_dict[user]``.
    The line is built from the three values returned by ``get_item_by_id``; if that lookup
    (or the string concatenation) fails, the line is 'No name'.

    Parameters
    ----------
    user :
        Key used to index ``user_dict`` (and ``ground_truth_purchase_dict`` if given).
    user_dict :
        Mapping user -> iterable of item ids. Indexing errors (e.g. missing user, or
        ``user_dict`` being None) are NOT caught here and propagate to the caller.
    pdt_id, item_feat_df, item_id_type :
        Passed through unchanged to ``get_item_by_id``.
    result_filepath :
        File to which each line is appended through ``save_txt``.
    ground_truth_purchase_dict :
        Optional mapping user -> iterable of purchased item ids. When given, items that
        appear in the user's purchases get a ' ----- BOUGHT n TIME(S)' suffix.
    """
    # A user that is absent from the purchase ground truth simply has no purchases.
    # Previously the resulting KeyError was swallowed by a bare `except`, and every item
    # of that user was reported as 'No name' even though its description was available.
    purchased_items = ()
    if ground_truth_purchase_dict is not None:
        purchased_items = ground_truth_purchase_dict.get(user, ())

    for iid in user_dict[user]:
        try:
            info1, info2, info3 = get_item_by_id(iid, pdt_id, item_feat_df, item_id_type)
            sentence = info1 + ', ' + info2 + info3
        except Exception:
            # Best effort: an item without usable information should not abort the report.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
            sentence = 'No name'
        else:
            count_purchases = sum(1 for item in purchased_items if item == iid)
            if count_purchases:
                sentence += f' ----- BOUGHT {count_purchases} TIME(S)'
        save_txt(sentence, result_filepath, mode='a')
def train_full_model(fixed_params_path,
                     visualization,
                     check_embedding,
                     remove,
                     edge_batch_size,
                     **params):
    """
    Given the best hyperparameter combination, function to train the model on all available data.

    Files needed to run
    -------------------
    All the files in the TrainDataPaths:
        It includes all the interactions between user, sport and items, as well as features for user,
        sport and items.
    Fixed_params and params found in hyperparametrization:
        Those params will indicate how to train the model. Usually, they are found when running the
        hyperparametrization loop.

    Parameters
    ----------
    fixed_params_path :
        Path read with ``read_data``; its content becomes the attributes of ``fixed_params``.
    visualization :
        If truthy, ``plot_train_loss`` is called after training.
    check_embedding :
        If truthy, recommendations are generated, explored and saved to 'outputs/'.
    remove, edge_batch_size :
        Stored on ``fixed_params`` (overriding whatever the file contained).
    **params :
        Hyperparameters (e.g. 'out_dim', 'hidden_dim', 'n_layers', 'lr', ...).
        See click options below for details.

    Saves to files
    --------------
    trained_model with its fixed parameters and hyperparameters:
        The trained model with all parameters are saved to the folder 'models'.
    graph and ID mapping:
        When doing inference, it might be useful to import an already built graph (and the mapping
        that allows to associate node ID with personal information such as CUSTOMER IDENTIFIER or
        ITEM IDENTIFIER). Thus, the graph and ID mapping are saved to folder 'models'.
    """
    # Load parameters
    fixed_params = read_data(fixed_params_path)

    class objectview(object):
        def __init__(self, d):
            self.__dict__ = d

    fixed_params = objectview(fixed_params)
    fixed_params.remove = remove
    # Training on (almost) all data: only tiny subtrain/validation splits are kept.
    fixed_params.subtrain_size = 0.01
    fixed_params.valid_size = 0.01
    fixed_params.edge_batch_size = edge_batch_size

    # Create full train set
    train_data_paths = TrainDataPaths()
    presplit_item_feat = read_data(train_data_paths.item_feat_path)
    full_interaction_data = read_data(train_data_paths.full_interaction_path)
    train_df, test_df = presplit_data(presplit_item_feat,
                                      full_interaction_data,
                                      num_min=3,
                                      remove_unk=True,
                                      sort=True,
                                      test_size_days=1,
                                      item_id_type='ITEM IDENTIFIER',
                                      ctm_id_type='CUSTOMER IDENTIFIER',
                                      )
    # The "paths" are replaced by the in-memory dataframes produced by the split.
    train_data_paths.train_path = train_df
    train_data_paths.test_path = test_df
    data = DataLoader(train_data_paths, fixed_params)

    # Initialize graph & features
    valid_graph = create_graph(data.graph_schema)
    valid_graph = assign_graph_features(valid_graph,
                                        fixed_params,
                                        data,
                                        **params,
                                        )

    dim_dict = {'user': valid_graph.nodes['user'].data['features'].shape[1],
                'item': valid_graph.nodes['item'].data['features'].shape[1],
                'out': params['out_dim'],
                'hidden': params['hidden_dim']}

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # Initialize model
    model = ConvModel(valid_graph,
                      params['n_layers'],
                      dim_dict,
                      params['norm'],
                      params['dropout'],
                      params['aggregator_type'],
                      params['pred'],
                      params['aggregator_hetero'],
                      params['embedding_layer'],
                      )
    if cuda:
        model = model.to(device)

    # Initialize dataloaders
    # get training and test ids
    (
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        ground_truth_subtrain,
        ground_truth_valid,
        all_eids_dict
    ) = train_valid_split(
        valid_graph,
        data.ground_truth_test,
        fixed_params.etype,
        fixed_params.subtrain_size,
        fixed_params.valid_size,
        fixed_params.reverse_etype,
        fixed_params.train_on_clicks,
        fixed_params.remove_train_eids,
        params['clicks_sample'],
        params['purchases_sample'],
    )
    (
        edgeloader_train,
        edgeloader_valid,
        nodeloader_subtrain,
        nodeloader_valid,
        nodeloader_test
    ) = generate_dataloaders(valid_graph,
                             train_graph,
                             train_eids_dict,
                             valid_eids_dict,
                             subtrain_uids,
                             valid_uids,
                             test_uids,
                             all_iids,
                             fixed_params,
                             num_workers,
                             all_sids,
                             embedding_layer=params['embedding_layer'],
                             n_layers=params['n_layers'],
                             neg_sample_size=params['neg_sample_size'],
                             )

    train_eids_len = 0
    valid_eids_len = 0
    for etype in train_eids_dict.keys():
        train_eids_len += len(train_eids_dict[etype])
        valid_eids_len += len(valid_eids_dict[etype])
    num_batches_train = math.ceil(train_eids_len / fixed_params.edge_batch_size)
    num_batches_subtrain = math.ceil(
        (len(subtrain_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_val_loss = math.ceil(valid_eids_len / fixed_params.edge_batch_size)
    num_batches_val_metrics = math.ceil(
        (len(valid_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size
    )

    # Run model
    # NOTE(review): this merges the fixed parameters INTO `params` (in place), so every
    # later `params[...]` lookup and the params file saved below also contain the fixed
    # parameters, and fixed values win on key collisions. Behavior kept as-is because
    # downstream consumers of the saved file are not visible here - confirm before changing.
    params.update(vars(fixed_params))
    hp_sentence = f'{str(params)[1: -1]} \n'

    save_txt(f'\n \n START - Hyperparameters \n{hp_sentence}', train_data_paths.result_filepath, "a")

    trained_model, viz, best_metrics = train_model(
        model,
        fixed_params.num_epochs,
        num_batches_train,
        num_batches_val_loss,
        edgeloader_train,
        edgeloader_valid,
        max_margin_loss,
        params['delta'],
        params['neg_sample_size'],
        params['use_recency'],
        cuda,
        device,
        fixed_params.optimizer,
        params['lr'],
        get_metrics=True,
        train_graph=train_graph,
        valid_graph=valid_graph,
        nodeloader_valid=nodeloader_valid,
        nodeloader_subtrain=nodeloader_subtrain,
        k=fixed_params.k,
        out_dim=params['out_dim'],
        num_batches_val_metrics=num_batches_val_metrics,
        num_batches_subtrain=num_batches_subtrain,
        bought_eids=train_eids_dict[('user', 'buys', 'item')],
        ground_truth_subtrain=ground_truth_subtrain,
        ground_truth_valid=ground_truth_valid,
        remove_already_bought=True,
        result_filepath=train_data_paths.result_filepath,
        start_epoch=fixed_params.start_epoch,
        patience=fixed_params.patience,
        pred=params['pred'],
        use_popularity=params['use_popularity'],
        weight_popularity=params['weight_popularity'],
        remove_false_negative=fixed_params.remove_false_negative,
        embedding_layer=params['embedding_layer'],
    )

    # Get viz & metrics
    if visualization:
        plot_train_loss(hp_sentence, viz)

    # Report performance on validation set
    # `train_model` only fills `best_metrics` on epochs where metrics are computed; if none
    # ran, the dict is empty. Guard so that a missing report does not crash the run before
    # the trained model is saved.
    if best_metrics:
        sentence = ("BEST VALIDATION Precision "
                    "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                    .format(best_metrics['precision'] * 100,
                            best_metrics['recall'] * 100,
                            best_metrics['coverage'] * 100))
    else:
        sentence = "BEST VALIDATION metrics unavailable (no metric epoch was run)"

    log.info(sentence)
    save_txt(sentence, train_data_paths.result_filepath, mode='a')

    # Report performance on test set
    log.debug('Test metrics start ...')
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(valid_graph,
                                    params['out_dim'],
                                    trained_model,
                                    nodeloader_test,
                                    num_batches_test,
                                    cuda,
                                    device,
                                    params['embedding_layer'],
                                    )

        # Metrics are reported twice: on purchases only, then on the full test ground truth.
        # `recall` from the LAST iteration is the one used in the model filename below.
        for ground_truth in [data.ground_truth_purchase_test, data.ground_truth_test]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                params['pred'],
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                        .format(precision * 100,
                                recall * 100,
                                coverage * 100))
            log.info(sentence)
            save_txt(sentence, train_data_paths.result_filepath, mode='a')

    if check_embedding:
        trained_model.eval()
        with torch.no_grad():
            log.debug('ANALYSIS OF RECOMMENDATIONS')
            if 'sport' in train_graph.ntypes:
                result_sport = explore_sports(embeddings,
                                              data.sport_feat_df,
                                              data.spt_id,
                                              fixed_params.num_choices)

                save_txt(result_sport, train_data_paths.result_filepath, mode='a')

            already_bought_dict = create_already_bought(valid_graph,
                                                        all_eids_dict[('user', 'buys', 'item')],
                                                        )
            already_clicked_dict = None
            if fixed_params.discern_clicks:
                already_clicked_dict = create_already_bought(valid_graph,
                                                             all_eids_dict[('user', 'clicks', 'item')],
                                                             etype='clicks',
                                                             )

            users, items = data.ground_truth_test
            ground_truth_dict = create_ground_truth(users, items)
            user_ids = np.unique(users).tolist()
            recs = get_recs(valid_graph,
                            embeddings,
                            trained_model,
                            params['out_dim'],
                            fixed_params.k,
                            user_ids,
                            already_bought_dict,
                            remove_already_bought=True,
                            pred=params['pred'],
                            use_popularity=params['use_popularity'],
                            weight_popularity=params['weight_popularity'])

            users, items = data.ground_truth_purchase_test
            ground_truth_purchase_dict = create_ground_truth(users, items)
            explore_recs(recs,
                         already_bought_dict,
                         already_clicked_dict,
                         ground_truth_dict,
                         ground_truth_purchase_dict,
                         data.item_feat_df,
                         fixed_params.num_choices,
                         data.pdt_id,
                         fixed_params.item_id_type,
                         train_data_paths.result_filepath)

            if fixed_params.item_id_type == 'SPECIFIC ITEM IDENTIFIER':
                coverage_metrics = check_coverage(data.user_item_train,
                                                  data.item_feat_df,
                                                  data.pdt_id,
                                                  recs)

                # Fixed: the recommendations line previously printed "Female x.x" without a
                # percent sign and "Eco x.x%%" (str.format does not collapse '%%').
                sentence = (
                    "COVERAGE \n|| All transactions : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}% "
                    "\n|| Recommendations : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}%"
                    .format(
                        coverage_metrics['generic_mean_whole'] * 100,
                        coverage_metrics['junior_mean_whole'] * 100,
                        coverage_metrics['male_mean_whole'] * 100,
                        coverage_metrics['female_mean_whole'] * 100,
                        coverage_metrics['eco_mean_whole'] * 100,
                        coverage_metrics['generic_mean_recs'] * 100,
                        coverage_metrics['junior_mean_recs'] * 100,
                        coverage_metrics['male_mean_recs'] * 100,
                        coverage_metrics['female_mean_recs'] * 100,
                        coverage_metrics['eco_mean_recs'] * 100,
                    )
                )
                log.info(sentence)
                save_txt(sentence, train_data_paths.result_filepath, mode='a')

        # Fixed: 'already_clicked' used to be saved with the *bought* dict.
        # It is None when `fixed_params.discern_clicks` is falsy.
        save_outputs(
            {
                'embeddings': embeddings,
                'already_bought': already_bought_dict,
                'already_clicked': already_clicked_dict,
                'ground_truth': ground_truth_dict,
                'recs': recs,
            },
            'outputs/'
        )

    # Save model
    # str(datetime) minus its last 10 chars (seconds + microseconds), without the space.
    date = str(datetime.datetime.now())[:-10].replace(' ', '')
    torch.save(trained_model.state_dict(), f'models/FULL_Recall_{recall * 100:.2f}_{date}.pth')
    # Save all necessary params
    save_outputs(
        {
            f'{date}_params': params,
            f'{date}_fixed_params': vars(fixed_params),
        },
        'models/'
    )
    print("Saved model & parameters to disk.")

    # Save graph & ID mapping
    save_graphs(f'models/{date}_graph.bin', [valid_graph])
    save_outputs(
        {
            f'{date}_ctm_id': data.ctm_id,
            f'{date}_pdt_id': data.pdt_id,
        },
        'models/'
    )
    print("Saved graph & ID mapping to disk.")
def inference_fn(trained_model,
                 remove,
                 fixed_params,
                 overwrite_fixed_params=False,
                 days_of_purchases=710,
                 days_of_clicks=710,
                 lifespan_of_items=710,
                 **params):
    """
    Function to run inference inside the hyperparameter loop and calculate metrics.

    Parameters
    ----------
    trained_model:
        Model trained during training of hyperparameter loop. If a string is given, it is treated as
        the path of a saved state dict, and a fresh ConvModel is built and loaded from it.
    remove:
        Percentage of data removed. See src.utils_data for more details.
    fixed_params:
        All parameters used during training of hyperparameter loop. See src.utils_data for more details.
        If a string is given, it is treated as a path and read with ``read_data``.
    overwrite_fixed_params:
        If true, training parameters will be overwritten by the parameters below. Can be useful if need
        to test the model on different parameters, e.g. that includes older clicks or purchases.
    days_of_purchases, days_of_clicks, lifespan_of_items:
        All parameters that can overwrite the training parameters. Only useful if
        overwrite_fixed_params is True.
    params:
        All other parameters used during training. If it contains the key 'params', that value is
        treated as a path and the whole dict is replaced by what ``read_data`` returns for it.

    Returns
    -------
    recall:
        Recall on the test set (computed on ``data.ground_truth_test``, the last ground truth
        evaluated). Relevant to compare with recall computed on hyperparametrization test set
        (since parameters like 'remove' and all overwritable parameters are different)

    Saves to file
    -------------
    Metrics computed on the test set.
    """
    # Import parameters
    if isinstance(fixed_params, str):
        loaded_fixed_params = read_data(fixed_params)

        class objectview(object):
            def __init__(self, d):
                self.__dict__ = d

        fixed_params = objectview(loaded_fixed_params)
    if 'params' in params:
        params = read_data(params['params'])

    # Initialize data
    data_paths = DataPaths()
    fixed_params.remove = remove
    if overwrite_fixed_params:
        fixed_params.days_of_purchases = days_of_purchases
        fixed_params.days_of_clicks = days_of_clicks
        fixed_params.lifespan_of_items = lifespan_of_items
    data = DataLoader(data_paths, fixed_params)

    # Get graph
    valid_graph = assign_graph_features(
        create_graph(data.graph_schema),
        fixed_params,
        data,
        **params,
    )

    # Feature widths per node type, plus the model's hidden/output widths
    dim_dict = {
        ntype: valid_graph.nodes[ntype].data['features'].shape[1]
        for ntype in ('user', 'item')
    }
    dim_dict['out'] = params['out_dim']
    dim_dict['hidden'] = params['hidden_dim']

    has_sport = 'sport' in valid_graph.ntypes
    all_sids = None
    if has_sport:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # get training and test ids
    # (names prefixed with "_" are returned by the helpers but not needed for inference)
    (train_graph,
     train_eids_dict,
     valid_eids_dict,
     subtrain_uids,
     valid_uids,
     test_uids,
     all_iids,
     _ground_truth_subtrain,
     _ground_truth_valid,
     all_eids_dict) = train_valid_split(
        valid_graph,
        data.ground_truth_test,
        fixed_params.etype,
        fixed_params.subtrain_size,
        fixed_params.valid_size,
        fixed_params.reverse_etype,
        fixed_params.train_on_clicks,
        fixed_params.remove_train_eids,
        params['clicks_sample'],
        params['purchases_sample'],
    )
    (_edgeloader_train,
     _edgeloader_valid,
     _nodeloader_subtrain,
     _nodeloader_valid,
     nodeloader_test) = generate_dataloaders(
        valid_graph,
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        fixed_params,
        num_workers,
        all_sids,
        embedding_layer=params['embedding_layer'],
        n_layers=params['n_layers'],
        neg_sample_size=params['neg_sample_size'],
    )

    n_test_nodes = len(test_uids) + len(all_iids)
    num_batches_test = math.ceil(n_test_nodes / fixed_params.node_batch_size)

    # Import model
    if isinstance(trained_model, str):
        state_dict_path = trained_model
        trained_model = ConvModel(
            valid_graph,
            params['n_layers'],
            dim_dict,
            params['norm'],
            params['dropout'],
            params['aggregator_type'],
            fixed_params.pred,
            params['aggregator_hetero'],
            params['embedding_layer'],
        )
        trained_model.load_state_dict(torch.load(state_dict_path, map_location=device))
    if cuda:
        trained_model = trained_model.to(device)

    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            valid_graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )

        bought_eids = all_eids_dict[('user', 'buys', 'item')]
        # Purchases-only ground truth first, then the full test ground truth
        ground_truths = (data.ground_truth_purchase_test, data.ground_truth_test)
        for ground_truth in ground_truths:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                bought_eids,
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                fixed_params.pred,
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%".format(
                            precision * 100, recall * 100, coverage * 100))

            print(sentence)
            save_txt(sentence, data_paths.result_filepath, mode='a')

    return recall
def train_model(model,
                num_epochs,
                num_batches_train,
                num_batches_val_loss,
                edgeloader_train,
                edgeloader_valid,
                loss_fn,
                delta,
                neg_sample_size,
                use_recency=False,
                cuda=False,
                device=None,
                optimizer=torch.optim.Adam,
                lr=0.001,
                get_metrics=False,
                train_graph=None,
                valid_graph=None,
                nodeloader_valid=None,
                nodeloader_subtrain=None,
                k=None,
                out_dim=None,
                num_batches_val_metrics=None,
                num_batches_subtrain=None,
                bought_eids=None,
                ground_truth_subtrain=None,
                ground_truth_valid=None,
                remove_already_bought=True,
                result_filepath=None,
                start_epoch=0,
                patience=5,
                pred=None,
                use_popularity=False,
                weight_popularity=1,
                remove_false_negative=False,
                embedding_layer=True,
                ):
    """
    Main function to train a GNN, using max margin loss on positive and negative examples.

    Process:
        - A full training epoch
            - Batch by batch. 1 batch is composed of multiple computational blocks, required to compute
              embeddings for all the nodes related to the edges in the batch.
            - Input the initial features. Compute the embeddings & the positive and negative scores
            - Also compute other considerations for the loss function: negative mask, recency scores
            - Loss is returned, then backward, then step.
            - Epoch 0 is report-only: no backward/step is done and only a subset of batches is used.
            - Metrics are computed on the subtraining set (using nodeloader)
        - Validation set
            - Loss is computed (in model.eval() mode) for validation edge for early stopping purposes
            - Also, metrics are computed on the validation set (using nodeloader)
        - Logging & early stopping
            - Everything is logged, best metrics are saved.
            - Using the patience parameter, early stopping is applied when val_loss stops going down.

    Notes
    -----
    Metrics (precision / recall / coverage) are only computed when ``get_metrics`` is true and
    ``epoch % 10 == 1``; on other epochs only the losses are logged.

    Returns
    -------
    model :
        The same model object, trained in place. Loss/metric histories are attached to it as
        ``train_*_list`` / ``val_*_list`` attributes.
    viz : dict
        The same histories, keyed by list name.
    best_metrics : dict
        'recall', 'precision' and 'coverage' of the metric epoch with the highest validation
        recall. Empty if no metric epoch ran.
    """
    model.train_loss_list = []
    model.train_precision_list = []
    model.train_recall_list = []
    model.train_coverage_list = []
    model.val_loss_list = []
    model.val_precision_list = []
    model.val_recall_list = []
    model.val_coverage_list = []
    best_metrics = {}  # For visualization
    max_metric = -0.1
    patience_counter = 0  # For early stopping
    # Start from +inf so the first validation loss always becomes the reference. The former
    # hard-coded 1.1 meant that a run whose loss stayed above 1.1 never reset the patience
    # counter and was stopped after `patience` epochs even while still improving.
    min_loss = float('inf')

    opt = optimizer(model.parameters(), lr=lr)

    # TRAINING
    print('Starting training.')
    for epoch in range(start_epoch, num_epochs):
        start_time = time.time()
        print('TRAINING LOSS')
        model.train()  # Because if not, after eval, dropout would be still be inactive
        i = 0
        total_loss = 0
        for _, pos_g, neg_g, blocks in edgeloader_train:
            opt.zero_grad()

            # Negative mask (built before the graphs are moved to the device)
            negative_mask = {}
            if remove_false_negative:
                negative_mask = _build_negative_mask(pos_g, neg_g, valid_graph, cuda, device)

            if cuda:
                blocks = [b.to(device) for b in blocks]
                pos_g = pos_g.to(device)
                neg_g = neg_g.to(device)

            i += 1
            if i % 10 == 0:
                print("Edge batch {} out of {}".format(i, num_batches_train))
            input_features = blocks[0].srcdata['features']
            # recency (TO BE CLEANED)
            recency_scores = None
            if use_recency:
                recency_scores = pos_g.edata['recency']

            _, pos_score, neg_score = model(blocks,
                                            input_features,
                                            pos_g, neg_g,
                                            embedding_layer,
                                            )
            loss = loss_fn(pos_score,
                           neg_score,
                           delta,
                           neg_sample_size,
                           use_recency=use_recency,
                           recency_scores=recency_scores,
                           remove_false_negative=remove_false_negative,
                           negative_mask=negative_mask,
                           cuda=cuda,
                           device=device,
                           )

            if epoch > 0:  # For the epoch 0, no training (just report loss)
                loss.backward()
                opt.step()
            total_loss += loss.item()

            if epoch == 0 and i > 10:
                break  # For the epoch 0, report loss on only subset

        train_avg_loss = total_loss / i
        model.train_loss_list.append(train_avg_loss)

        print('VALIDATION LOSS')
        model.eval()
        with torch.no_grad():
            total_loss = 0
            i = 0

            for _, pos_g, neg_g, blocks in edgeloader_valid:
                i += 1
                if i % 10 == 0:
                    print("Edge batch {} out of {}".format(i, num_batches_val_loss))

                # Negative mask
                negative_mask = {}
                if remove_false_negative:
                    negative_mask = _build_negative_mask(pos_g, neg_g, valid_graph, cuda, device)

                if cuda:
                    blocks = [b.to(device) for b in blocks]
                    pos_g = pos_g.to(device)
                    neg_g = neg_g.to(device)

                input_features = blocks[0].srcdata['features']
                _, pos_score, neg_score = model(blocks,
                                                input_features,
                                                pos_g, neg_g,
                                                embedding_layer,
                                                )
                # recency (TO BE CLEANED)
                recency_scores = None
                if use_recency:
                    recency_scores = pos_g.edata['recency']

                val_loss = loss_fn(pos_score,
                                   neg_score,
                                   delta,
                                   neg_sample_size,
                                   use_recency=use_recency,
                                   recency_scores=recency_scores,
                                   remove_false_negative=remove_false_negative,
                                   negative_mask=negative_mask,
                                   cuda=cuda,
                                   device=device,
                                   )
                total_loss += val_loss.item()
            val_avg_loss = total_loss / i
            model.val_loss_list.append(val_avg_loss)

        ############
        # METRICS PER EPOCH
        if get_metrics and epoch % 10 == 1:
            model.eval()
            with torch.no_grad():
                # training metrics
                print('TRAINING METRICS')
                y = get_embeddings(train_graph,
                                   out_dim,
                                   model,
                                   nodeloader_subtrain,
                                   num_batches_subtrain,
                                   cuda,
                                   device,
                                   embedding_layer,
                                   )

                train_precision, train_recall, train_coverage = get_metrics_at_k(y,
                                                                                 train_graph,
                                                                                 model,
                                                                                 out_dim,
                                                                                 ground_truth_subtrain,
                                                                                 bought_eids,
                                                                                 k,
                                                                                 False,  # Remove already bought
                                                                                 cuda,
                                                                                 device,
                                                                                 pred,
                                                                                 use_popularity,
                                                                                 weight_popularity)

                # validation metrics
                print('VALIDATION METRICS')
                y = get_embeddings(valid_graph,
                                   out_dim,
                                   model,
                                   nodeloader_valid,
                                   num_batches_val_metrics,
                                   cuda,
                                   device,
                                   embedding_layer,
                                   )

                val_precision, val_recall, val_coverage = get_metrics_at_k(y,
                                                                           valid_graph,
                                                                           model,
                                                                           out_dim,
                                                                           ground_truth_valid,
                                                                           bought_eids,
                                                                           k,
                                                                           remove_already_bought,
                                                                           cuda,
                                                                           device,
                                                                           pred,
                                                                           use_popularity,
                                                                           weight_popularity
                                                                           )
                sentence = '''Epoch {:05d} || TRAINING Loss {:.5f} | Precision {:.3f}% | Recall {:.3f}% | Coverage {:.2f}% || VALIDATION Loss {:.5f} | Precision {:.3f}% | Recall {:.3f}% | Coverage {:.2f}% '''.format(
                    epoch, train_avg_loss, train_precision * 100, train_recall * 100, train_coverage * 100,
                    val_avg_loss, val_precision * 100, val_recall * 100, val_coverage * 100)
                print(sentence)
                save_txt(sentence, result_filepath, mode='a')

                model.train_precision_list.append(train_precision * 100)
                model.train_recall_list.append(train_recall * 100)
                model.train_coverage_list.append(train_coverage * 10)
                model.val_precision_list.append(val_precision * 100)
                model.val_recall_list.append(val_recall * 100)
                model.val_coverage_list.append(val_coverage * 10)  # just *10 for viz purposes

                # Visualization of best metric
                if val_recall > max_metric:
                    max_metric = val_recall
                    best_metrics = {'recall': val_recall, 'precision': val_precision, 'coverage': val_coverage}

        else:
            sentence = "Epoch {:05d} | Training Loss {:.5f} | Validation Loss {:.5f} | ".format(
                epoch, train_avg_loss, val_avg_loss)
            print(sentence)
            save_txt(sentence, result_filepath, mode='a')

        # Early stopping on the average validation loss
        if val_avg_loss < min_loss:
            min_loss = val_avg_loss
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter == patience:
            break

        elapsed = time.time() - start_time
        result_to_save = f'Epoch took {timedelta(seconds=elapsed)} \n'
        print(result_to_save)
        save_txt(result_to_save, result_filepath, mode='a')

    viz = {'train_loss_list': model.train_loss_list,
           'train_precision_list': model.train_precision_list,
           'train_recall_list': model.train_recall_list,
           'train_coverage_list': model.train_coverage_list,
           'val_loss_list': model.val_loss_list,
           'val_precision_list': model.val_precision_list,
           'val_recall_list': model.val_recall_list,
           'val_coverage_list': model.val_coverage_list}

    print('Training completed.')
    return model, viz, best_metrics  # model will already be to 'cuda' device?


def _build_negative_mask(pos_g, neg_g, valid_graph, cuda, device):
    """Return, per edge type of pos_g, a float tensor flagging sampled negatives that exist in valid_graph."""
    negative_mask = {}
    # Map the batch-local node indices of the negative graph back to their original node ids
    nids = neg_g.ndata[dgl.NID]
    for etype in pos_g.canonical_etypes:
        neg_src, neg_dst = neg_g.edges(etype=etype)
        neg_src = nids[etype[0]][neg_src]
        neg_dst = nids[etype[2]][neg_dst]
        negative_mask_tensor = valid_graph.has_edges_between(neg_src, neg_dst, etype=etype)
        negative_mask[etype] = negative_mask_tensor.type(torch.float)
        if cuda:
            negative_mask[etype] = negative_mask[etype].to(device)
    return negative_mask
def explore_recs(recs: dict,
                 already_bought_dict: dict,
                 already_clicked_dict,
                 ground_truth_dict: dict,
                 ground_truth_purchase_dict: dict,
                 item_feat_df: pd.DataFrame,
                 num_choices: int,
                 pdt_id: pd.DataFrame,
                 item_id_type: str,
                 result_filepath: str):
    """
    For a random sample of users, fetch information about what items were clicked/bought, recommended and ground truth.

    Two groups of users are reported, in this order:
        1. up to ``num_choices`` users drawn at random from ``recs``;
        2. up to 2 users drawn at random among those that have exactly one entry in
           ``already_bought_dict`` and are present in ``recs``.

    For each user, four sections are appended to ``result_filepath``: items bought, items
    clicked, items recommended and ground truth. ``already_clicked_dict`` may be None, in
    which case 'No click data' is written for the clicked section.

    If fewer users are available than requested, all available users are reported instead
    of raising.
    """

    def _report_user(user):
        """Append the bought / clicked / recommended / ground-truth sections for one user."""
        save_txt('\nCustomer bought', result_filepath, mode='a')
        try:
            fetch_recs_for_users(user,
                                 already_bought_dict,
                                 pdt_id,
                                 item_feat_df,
                                 item_id_type,
                                 result_filepath)
        except Exception:
            # Best effort: e.g. the user has no purchase history
            save_txt('Nothing', result_filepath, mode='a')

        save_txt('\nCustomer clicked on', result_filepath, mode='a')
        try:
            fetch_recs_for_users(user,
                                 already_clicked_dict,
                                 pdt_id,
                                 item_feat_df,
                                 item_id_type,
                                 result_filepath)
        except Exception:
            # Best effort: e.g. clicks are not tracked (dict is None) or user has none
            save_txt('No click data', result_filepath, mode='a')

        save_txt('\nGot recommended', result_filepath, mode='a')
        fetch_recs_for_users(user,
                             recs,
                             pdt_id,
                             item_feat_df,
                             item_id_type,
                             result_filepath)

        save_txt('\nGround truth', result_filepath, mode='a')
        fetch_recs_for_users(user,
                             ground_truth_dict,
                             pdt_id,
                             item_feat_df,
                             item_id_type,
                             result_filepath,
                             ground_truth_purchase_dict)

    # random.sample needs a sequence: passing a dict keys view raises TypeError on
    # Python >= 3.11. The sample size is clamped so a small `recs` does not raise ValueError.
    candidates = list(recs)
    choices = random.sample(candidates, min(num_choices, len(candidates)))
    for user in choices:
        _report_user(user)

    # user with 1 item
    single_item_users = [uid for uid, v in already_bought_dict.items()
                         if len(v) == 1 and uid in recs]
    choices = random.sample(single_item_users, min(2, len(single_item_users)))
    for user in choices:
        _report_user(user)
def train(data, fixed_params, data_paths, visualization, check_embedding, **params):
    """
    Function to find the best hyperparameter combination.

    Trains a model for one combination of hyperparameters, logs validation and test metrics,
    optionally explores the recommendations, saves the model if it is good enough, optionally
    runs inference, and returns the test recall.

    Files needed to run
    -------------------
    All the files in the src.utils_data.DataPaths:
        It includes all the interactions between user, sport and items, as well as features for
        user, sport and items.
    If starting hyperparametrization from a checkpoint:
        The checkpoint file, generated by skopt during a previous hyperparametrization. The most
        recent file of the root folder will be fetched.
        # Note: the checkpoint is not read in this function.

    Parameters
    ----------
    data :
        Object of class DataLoader, containing multiple arguments such as user_item_train
        dataframe, graph schema, etc.
    fixed_params :
        All parameters that are fixed, i.e. not part of the hyperparametrization. Its attributes
        are merged into params before training (see below).
    data_paths :
        All data paths (mainly csv).  # Note: currently, only result_filepath is used here.
    visualization :
        If truthy, plot_train_loss is called once training is done.
    check_embedding :
        If truthy, recommendations are explored, written to result_filepath and saved to
        'outputs/'.
    **params :
        Mainly params that come from the hyperparametrization loop, controlled by skopt.
        'embed_dim' and 'popularity_importance' are categorical choices that are translated here
        into out_dim/hidden_dim and use/weight/days_popularity.

    Returns
    -------
    recall :
        Recall on the test set for the current combination of hyperparameters. It is the recall
        of the last ground truth evaluated, i.e. data.ground_truth_test.

    Saves to files
    --------------
    logging of all experiments:
        All training logs are saved to result_filepath, including losses, metrics and examples
        of recommendations. Plots of the evolution of losses and metrics are saved to the
        folder 'plots'.
    best models:
        All models, fixed_params and params that yielded recall higher than 8% on specific item
        identifier or 20% on generic item identifier are saved to the folder 'models'.
    """
    # Establish hyperparameters
    # Dimensions
    out_dim = {
        'Very Small': 32,
        'Small': 96,
        'Medium': 128,
        'Large': 192,
        'Very Large': 256,
    }
    hidden_dim = {
        'Very Small': 64,
        'Small': 192,
        'Medium': 256,
        'Large': 384,
        'Very Large': 512,
    }
    params['out_dim'] = out_dim[params['embed_dim']]
    params['hidden_dim'] = hidden_dim[params['embed_dim']]

    # Popularity
    use_popularity = {'No': False, 'Small': True, 'Medium': True, 'Large': True}
    weight_popularity = {'No': 0, 'Small': .01, 'Medium': .05, 'Large': .1}
    days_popularity = {'No': 0, 'Small': 7, 'Medium': 7, 'Large': 7}
    params['use_popularity'] = use_popularity[params['popularity_importance']]
    params['weight_popularity'] = weight_popularity[params['popularity_importance']]
    params['days_popularity'] = days_popularity[params['popularity_importance']]

    if fixed_params.duplicates == 'count_occurrence':
        params['aggregator_type'] += '_edge'

    # Make sure graph data is consistent with message passing parameters
    if fixed_params.duplicates == 'count_occurrence':
        assert params['aggregator_type'].endswith('edge')
    else:
        assert not params['aggregator_type'].endswith('edge')

    # Initialize graph & features
    valid_graph = create_graph(data.graph_schema)
    valid_graph = assign_graph_features(
        valid_graph,
        fixed_params,
        data,
        **params,
    )

    dim_dict = {
        'user': valid_graph.nodes['user'].data['features'].shape[1],
        'item': valid_graph.nodes['item'].data['features'].shape[1],
        'out': params['out_dim'],
        'hidden': params['hidden_dim'],
    }

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # get training and test ids
    (
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        ground_truth_subtrain,
        ground_truth_valid,
        all_eids_dict,
    ) = train_valid_split(
        valid_graph,
        data.ground_truth_test,
        fixed_params.etype,
        fixed_params.subtrain_size,
        fixed_params.valid_size,
        fixed_params.reverse_etype,
        fixed_params.train_on_clicks,
        fixed_params.remove_train_eids,
        params['clicks_sample'],
        params['purchases_sample'],
    )

    (
        edgeloader_train,
        edgeloader_valid,
        nodeloader_subtrain,
        nodeloader_valid,
        nodeloader_test,
    ) = generate_dataloaders(
        valid_graph,
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        fixed_params,
        num_workers,
        all_sids,
        embedding_layer=params['embedding_layer'],
        n_layers=params['n_layers'],
        neg_sample_size=params['neg_sample_size'],
    )

    # Number of batches per loader, needed by the training and evaluation loops
    train_eids_len = 0
    valid_eids_len = 0
    for etype in train_eids_dict.keys():
        train_eids_len += len(train_eids_dict[etype])
        valid_eids_len += len(valid_eids_dict[etype])
    num_batches_train = math.ceil(train_eids_len / fixed_params.edge_batch_size)
    num_batches_subtrain = math.ceil(
        (len(subtrain_uids) + len(all_iids)) / fixed_params.node_batch_size)
    num_batches_val_loss = math.ceil(valid_eids_len / fixed_params.edge_batch_size)
    num_batches_val_metrics = math.ceil(
        (len(valid_uids) + len(all_iids)) / fixed_params.node_batch_size)
    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size)

    # NOTE(review): this override happens after generate_dataloaders already received
    # params['n_layers'] -- confirm the dataloaders are not expected to see the value 3.
    if fixed_params.neighbor_sampler == 'partial':
        params['n_layers'] = 3

    # Initialize model
    model = ConvModel(
        valid_graph,
        params['n_layers'],
        dim_dict,
        params['norm'],
        params['dropout'],
        params['aggregator_type'],
        fixed_params.pred,
        params['aggregator_hetero'],
        params['embedding_layer'],
    )
    if cuda:
        model = model.to(device)

    # The fixed params are merged INTO params on purpose (not into a copy): the code below
    # reads params['pred'] and forwards **params to the inference function.
    params.update(vars(fixed_params))
    params.update({'cuda': cuda})
    hp_sentence = f'{str(params)[1: -1]} \n'

    save_txt(f'\n \n START - Hyperparameters \n{hp_sentence}', data_paths.result_filepath, "a")

    start_time = time.time()

    # Train model
    trained_model, viz, best_metrics = train_model(
        model,
        fixed_params.num_epochs,
        num_batches_train,
        num_batches_val_loss,
        edgeloader_train,
        edgeloader_valid,
        max_margin_loss,
        params['delta'],
        params['neg_sample_size'],
        params['use_recency'],
        cuda,
        device,
        fixed_params.optimizer,
        params['lr'],
        get_metrics=True,
        train_graph=train_graph,
        valid_graph=valid_graph,
        nodeloader_valid=nodeloader_valid,
        nodeloader_subtrain=nodeloader_subtrain,
        k=fixed_params.k,
        out_dim=params['out_dim'],
        num_batches_val_metrics=num_batches_val_metrics,
        num_batches_subtrain=num_batches_subtrain,
        bought_eids=train_eids_dict[('user', 'buys', 'item')],
        ground_truth_subtrain=ground_truth_subtrain,
        ground_truth_valid=ground_truth_valid,
        remove_already_bought=True,
        result_filepath=data_paths.result_filepath,
        start_epoch=fixed_params.start_epoch,
        patience=fixed_params.patience,
        pred=params['pred'],
        use_popularity=params['use_popularity'],
        weight_popularity=params['weight_popularity'],
        remove_false_negative=fixed_params.remove_false_negative,
        embedding_layer=params['embedding_layer'],
    )
    elapsed = time.time() - start_time
    result_to_save = f'\n {timedelta(seconds=elapsed)} \n END'
    save_txt(result_to_save, data_paths.result_filepath, mode='a')

    if visualization:
        plot_train_loss(hp_sentence, viz)

    # Report performance on validation set
    sentence = ("BEST VALIDATION Precision "
                "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%".format(
                    best_metrics['precision'] * 100,
                    best_metrics['recall'] * 100,
                    best_metrics['coverage'] * 100))

    log.info(sentence)
    save_txt(sentence, data_paths.result_filepath, mode='a')

    # Report performance on test set
    log.debug('Test metrics start ...')
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            valid_graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )

        # Purchases only first, then purchases & clicks: the metrics of the LAST ground truth
        # are the ones kept after the loop (and `recall` is what the function returns).
        for ground_truth in [data.ground_truth_purchase_test, data.ground_truth_test]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                fixed_params.pred,
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%".format(
                            precision * 100, recall * 100, coverage * 100))
            log.info(sentence)
            save_txt(sentence, data_paths.result_filepath, mode='a')

    if check_embedding:
        trained_model.eval()
        with torch.no_grad():
            log.debug('ANALYSIS OF RECOMMENDATIONS')
            if 'sport' in train_graph.ntypes:
                result_sport = explore_sports(embeddings,
                                              data.sport_feat_df,
                                              data.spt_id,
                                              fixed_params.num_choices)

                save_txt(result_sport, data_paths.result_filepath, mode='a')

            already_bought_dict = create_already_bought(
                valid_graph,
                all_eids_dict[('user', 'buys', 'item')],
            )
            already_clicked_dict = None
            if fixed_params.discern_clicks:
                already_clicked_dict = create_already_bought(
                    valid_graph,
                    all_eids_dict[('user', 'clicks', 'item')],
                    etype='clicks',
                )

            users, items = data.ground_truth_test
            ground_truth_dict = create_ground_truth(users, items)
            user_ids = np.unique(users).tolist()
            recs = get_recs(valid_graph,
                            embeddings,
                            trained_model,
                            params['out_dim'],
                            fixed_params.k,
                            user_ids,
                            already_bought_dict,
                            remove_already_bought=True,
                            pred=fixed_params.pred,
                            use_popularity=params['use_popularity'],
                            weight_popularity=params['weight_popularity'])

            users, items = data.ground_truth_purchase_test
            ground_truth_purchase_dict = create_ground_truth(users, items)
            explore_recs(recs,
                         already_bought_dict,
                         already_clicked_dict,
                         ground_truth_dict,
                         ground_truth_purchase_dict,
                         data.item_feat_df,
                         fixed_params.num_choices,
                         data.pdt_id,
                         fixed_params.item_id_type,
                         data_paths.result_filepath)

            if fixed_params.item_id_type == 'SPECIFIC ITEM_IDENTIFIER':
                coverage_metrics = check_coverage(data.user_item_train,
                                                  data.item_feat_df,
                                                  data.pdt_id,
                                                  recs)

                # Every value is a percentage; with str.format a single '%' is a literal
                # percent sign (no '%%' escaping needed).
                sentence = (
                    "COVERAGE \n|| All transactions : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}% "
                    "\n|| Recommendations : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}%"
                    .format(
                        coverage_metrics['generic_mean_whole'] * 100,
                        coverage_metrics['junior_mean_whole'] * 100,
                        coverage_metrics['male_mean_whole'] * 100,
                        coverage_metrics['female_mean_whole'] * 100,
                        coverage_metrics['eco_mean_whole'] * 100,
                        coverage_metrics['generic_mean_recs'] * 100,
                        coverage_metrics['junior_mean_recs'] * 100,
                        coverage_metrics['male_mean_recs'] * 100,
                        coverage_metrics['female_mean_recs'] * 100,
                        coverage_metrics['eco_mean_recs'] * 100,
                    ))
                log.info(sentence)
                save_txt(sentence, data_paths.result_filepath, mode='a')

        save_outputs(
            {
                'embeddings': embeddings,
                'already_bought': already_bought_dict,
                # None when fixed_params.discern_clicks is falsy
                'already_clicked': already_clicked_dict,
                'ground_truth': ground_truth_dict,
                'recs': recs,
            },
            'outputs/')

    # 'remove' (merged in from fixed_params, when present) would clash with the explicit
    # remove= keyword of the inference call below.
    params.pop('remove', None)

    # Save model if the recall is greater than 8% (specific item id) or 20% (general item id)
    if (recall > 0.08 and fixed_params.item_id_type == 'SPECIFIC ITEM_IDENTIFIER') \
            or (recall > 0.2 and fixed_params.item_id_type == 'GENERAL ITEM_IDENTIFIER'):
        date = str(datetime.datetime.now())[:-10].replace(' ', '')
        torch.save(trained_model.state_dict(),
                   f'models/HP_Recall_{recall * 100:.2f}_{date}.pth')
        # Save all necessary params
        save_outputs(
            {
                f'{date}_params': params,
                f'{date}_fixed_params': vars(fixed_params),
            },
            'models/')

    # Inference on different users
    if fixed_params.run_inference > 0:
        with torch.no_grad():
            print('On normal params')
            inference_recall = inference_hp.inference_fn(
                trained_model,
                remove=fixed_params.remove_on_inference,
                fixed_params=fixed_params,
                overwrite_fixed_params=False,
                **params)
            if fixed_params.run_inference > 1:
                print('For all users')
                # These three are passed explicitly below, so they must not be in **params.
                for key in ('days_of_purchases', 'days_of_clicks', 'lifespan_of_items'):
                    params.pop(key, None)
                inference_hp.inference_fn(
                    trained_model,
                    remove=fixed_params.remove_on_inference,
                    fixed_params=fixed_params,
                    overwrite_fixed_params=True,
                    days_of_purchases=710,
                    days_of_clicks=710,
                    lifespan_of_items=710,
                    **params)

    recap = f"BEST RECALL on 1) Validation set : {best_metrics['recall'] * 100:.2f}%" \
            f'\n2) Test set : {recall * 100:.2f}%'
    if fixed_params.run_inference == 1:
        recap += f'\n3) On random users of {fixed_params.remove_on_inference} removed : {inference_recall * 100:.2f}'
    num_epochs_run = len(viz['train_loss_list'])
    # max(..., 1) avoids a ZeroDivisionError if no epoch was recorded
    recap += f"\nLoop took {timedelta(seconds=elapsed)} for {num_epochs_run} epochs, an average of " \
             f"{timedelta(seconds=elapsed / max(num_epochs_run, 1))} per epoch"
    print(recap)
    save_txt(recap, data_paths.result_filepath, mode='a')

    return recall  # This is the 'test set' recall, on both purchases & clicks