Example #1
def fetch_recs_for_users(user,
                         user_dict,
                         pdt_id,
                         item_feat_df,
                         item_id_type,
                         result_filepath,
                         ground_truth_purchase_dict=None):
    """
    For all items in a dict (of recs, or already_bought, or ground_truth), fetch information.

    """
    for iid in user_dict[user]:
        try:
            info1, info2, info3 = get_item_by_id(iid, pdt_id, item_feat_df, item_id_type)
            sentence = info1 + ', ' + info2 + info3
            if ground_truth_purchase_dict is not None:
                if iid in ground_truth_purchase_dict[user]:
                    count_purchases = len([item for item in ground_truth_purchase_dict[user] if item == iid])
                    sentence += f' ----- BOUGHT {count_purchases} TIME(S)'
        except Exception:  # item info could not be fetched; log a placeholder
            sentence = 'No name'
        save_txt(sentence, result_filepath, mode='a')
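
A minimal usage sketch (all data names below are hypothetical stand-ins; get_item_by_id and save_txt come from the surrounding project):

# Hypothetical call: fetch info for one user's recommendations.
recs = {0: [12, 45, 7]}                          # user node ID -> recommended item node IDs
fetch_recs_for_users(user=0,
                     user_dict=recs,
                     pdt_id=pdt_id,              # item ID mapping DataFrame, assumed in scope
                     item_feat_df=item_feat_df,  # item features DataFrame, assumed in scope
                     item_id_type='ITEM IDENTIFIER',
                     result_filepath='outputs/recs.txt')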
Example #2
def train_full_model(fixed_params_path,
                     visualization,
                     check_embedding,
                     remove,
                     edge_batch_size,
                     **params):
    """
    Given the best hyperparameter combination, function to train the model on all available data.

    Files needed to run
    -------------------
    All the files in the TrainDataPaths:
        It includes all the interactions between users, sports and items, as well as features for users, sports and items.
    Fixed_params and params found in hyperparametrization:
        Those params indicate how to train the model; they usually come from running the hyperparametrization
        loop.

    Parameters
    ----------
    See click options below for details.


    Saves to files
    --------------
    trained_model with its fixed parameters and hyperparameters:
        The trained model with all its parameters is saved to the folder 'models'.
    graph and ID mapping:
        When doing inference, it might be useful to import an already built graph (and the mapping that associates
        node IDs with personal information such as CUSTOMER IDENTIFIER or ITEM IDENTIFIER). Thus, the graph and
        ID mapping are saved to the folder 'models'.
    """
    # Load parameters
    fixed_params = read_data(fixed_params_path)
    class objectview(object):
        def __init__(self, d):
            self.__dict__ = d
    fixed_params = objectview(fixed_params)
    fixed_params.remove = remove
    fixed_params.subtrain_size = 0.01
    fixed_params.valid_size = 0.01
    fixed_params.edge_batch_size = edge_batch_size

    # Create full train set
    train_data_paths = TrainDataPaths()
    presplit_item_feat = read_data(train_data_paths.item_feat_path)
    full_interaction_data = read_data(train_data_paths.full_interaction_path)
    train_df, test_df = presplit_data(presplit_item_feat,
                                      full_interaction_data,
                                      num_min=3,
                                      remove_unk=True,
                                      sort=True,
                                      test_size_days=1,
                                      item_id_type='ITEM IDENTIFIER',
                                      ctm_id_type='CUSTOMER IDENTIFIER', )
    train_data_paths.train_path = train_df
    train_data_paths.test_path = test_df
    data = DataLoader(train_data_paths, fixed_params)

    # Initialize graph & features
    valid_graph = create_graph(
        data.graph_schema,
    )
    valid_graph = assign_graph_features(valid_graph,
                                        fixed_params,
                                        data,
                                        **params,
                                        )

    dim_dict = {'user': valid_graph.nodes['user'].data['features'].shape[1],
                'item': valid_graph.nodes['item'].data['features'].shape[1],
                'out': params['out_dim'],
                'hidden': params['hidden_dim']}

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # Initialize model
    model = ConvModel(valid_graph,
                      params['n_layers'],
                      dim_dict,
                      params['norm'],
                      params['dropout'],
                      params['aggregator_type'],
                      params['pred'],
                      params['aggregator_hetero'],
                      params['embedding_layer'],
                      )
    if cuda:
        model = model.to(device)

    # Initialize dataloaders
    # get training and test ids
    (
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        ground_truth_subtrain,
        ground_truth_valid,
        all_eids_dict
    ) = train_valid_split(
        valid_graph,
        data.ground_truth_test,
        fixed_params.etype,
        fixed_params.subtrain_size,
        fixed_params.valid_size,
        fixed_params.reverse_etype,
        fixed_params.train_on_clicks,
        fixed_params.remove_train_eids,
        params['clicks_sample'],
        params['purchases_sample'],
    )

    (
        edgeloader_train,
        edgeloader_valid,
        nodeloader_subtrain,
        nodeloader_valid,
        nodeloader_test
    ) = generate_dataloaders(valid_graph,
                             train_graph,
                             train_eids_dict,
                             valid_eids_dict,
                             subtrain_uids,
                             valid_uids,
                             test_uids,
                             all_iids,
                             fixed_params,
                             num_workers,
                             all_sids,
                             embedding_layer=params['embedding_layer'],
                             n_layers=params['n_layers'],
                             neg_sample_size=params['neg_sample_size'],
                             )

    train_eids_len = 0
    valid_eids_len = 0
    for etype in train_eids_dict.keys():
        train_eids_len += len(train_eids_dict[etype])
        valid_eids_len += len(valid_eids_dict[etype])
    num_batches_train = math.ceil(train_eids_len / fixed_params.edge_batch_size)
    num_batches_subtrain = math.ceil(
        (len(subtrain_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_val_loss = math.ceil(valid_eids_len / fixed_params.edge_batch_size)
    num_batches_val_metrics = math.ceil(
        (len(valid_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size
    )

    # Run model
    hp_sentence = params  # note: aliases params, so the update below mutates params too
    hp_sentence.update(vars(fixed_params))
    hp_sentence = f'{str(hp_sentence)[1: -1]} \n'
    save_txt(f'\n \n START - Hyperparameters \n{hp_sentence}', train_data_paths.result_filepath, "a")
    trained_model, viz, best_metrics = train_model(
        model,
        fixed_params.num_epochs,
        num_batches_train,
        num_batches_val_loss,
        edgeloader_train,
        edgeloader_valid,
        max_margin_loss,
        params['delta'],
        params['neg_sample_size'],
        params['use_recency'],
        cuda,
        device,
        fixed_params.optimizer,
        params['lr'],
        get_metrics=True,
        train_graph=train_graph,
        valid_graph=valid_graph,
        nodeloader_valid=nodeloader_valid,
        nodeloader_subtrain=nodeloader_subtrain,
        k=fixed_params.k,
        out_dim=params['out_dim'],
        num_batches_val_metrics=num_batches_val_metrics,
        num_batches_subtrain=num_batches_subtrain,
        bought_eids=train_eids_dict[('user', 'buys', 'item')],
        ground_truth_subtrain=ground_truth_subtrain,
        ground_truth_valid=ground_truth_valid,
        remove_already_bought=True,
        result_filepath=train_data_paths.result_filepath,
        start_epoch=fixed_params.start_epoch,
        patience=fixed_params.patience,
        pred=params['pred'],
        use_popularity=params['use_popularity'],
        weight_popularity=params['weight_popularity'],
        remove_false_negative=fixed_params.remove_false_negative,
        embedding_layer=params['embedding_layer'],
    )

    # Get viz & metrics
    if visualization:
        plot_train_loss(hp_sentence, viz)

    # Report performance on validation set
    sentence = ("BEST VALIDATION Precision "
                "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                .format(best_metrics['precision'] * 100,
                        best_metrics['recall'] * 100,
                        best_metrics['coverage'] * 100))

    log.info(sentence)
    save_txt(sentence, train_data_paths.result_filepath, mode='a')

    # Report performance on test set
    log.debug('Test metrics start ...')
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(valid_graph,
                                    params['out_dim'],
                                    trained_model,
                                    nodeloader_test,
                                    num_batches_test,
                                    cuda,
                                    device,
                                    params['embedding_layer'],
                                    )

        for ground_truth in [data.ground_truth_purchase_test, data.ground_truth_test]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                params['pred'],
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                        .format(precision * 100,
                                recall * 100,
                                coverage * 100))
            log.info(sentence)
            save_txt(sentence, train_data_paths.result_filepath, mode='a')

    if check_embedding:
        trained_model.eval()
        with torch.no_grad():
            log.debug('ANALYSIS OF RECOMMENDATIONS')
            if 'sport' in train_graph.ntypes:
                result_sport = explore_sports(embeddings,
                                              data.sport_feat_df,
                                              data.spt_id,
                                              fixed_params.num_choices)

                save_txt(result_sport, train_data_paths.result_filepath, mode='a')

            already_bought_dict = create_already_bought(valid_graph,
                                                        all_eids_dict[('user', 'buys', 'item')],
                                                        )
            already_clicked_dict = None
            if fixed_params.discern_clicks:
                already_clicked_dict = create_already_bought(valid_graph,
                                                             all_eids_dict[('user', 'clicks', 'item')],
                                                             etype='clicks',
                                                             )

            users, items = data.ground_truth_test
            ground_truth_dict = create_ground_truth(users, items)
            user_ids = np.unique(users).tolist()
            recs = get_recs(valid_graph,
                            embeddings,
                            trained_model,
                            params['out_dim'],
                            fixed_params.k,
                            user_ids,
                            already_bought_dict,
                            remove_already_bought=True,
                            pred=params['pred'],
                            use_popularity=params['use_popularity'],
                            weight_popularity=params['weight_popularity'])

            users, items = data.ground_truth_purchase_test
            ground_truth_purchase_dict = create_ground_truth(users, items)
            explore_recs(recs,
                         already_bought_dict,
                         already_clicked_dict,
                         ground_truth_dict,
                         ground_truth_purchase_dict,
                         data.item_feat_df,
                         fixed_params.num_choices,
                         data.pdt_id,
                         fixed_params.item_id_type,
                         train_data_paths.result_filepath)

            if fixed_params.item_id_type == 'SPECIFIC ITEM IDENTIFIER':
                coverage_metrics = check_coverage(data.user_item_train,
                                                  data.item_feat_df,
                                                  data.pdt_id,
                                                  recs)

                sentence = (
                    "COVERAGE \n|| All transactions : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}% "
                    "\n|| Recommendations : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f} | Eco {:.1f}%%"
                        .format(
                        coverage_metrics['generic_mean_whole'] * 100,
                        coverage_metrics['junior_mean_whole'] * 100,
                        coverage_metrics['male_mean_whole'] * 100,
                        coverage_metrics['female_mean_whole'] * 100,
                        coverage_metrics['eco_mean_whole'] * 100,
                        coverage_metrics['generic_mean_recs'] * 100,
                        coverage_metrics['junior_mean_recs'] * 100,
                        coverage_metrics['male_mean_recs'] * 100,
                        coverage_metrics['female_mean_recs'] * 100,
                        coverage_metrics['eco_mean_recs'] * 100,
                    )
                )
                log.info(sentence)
                save_txt(sentence, train_data_paths.result_filepath, mode='a')

        save_outputs(
            {
                'embeddings': embeddings,
                'already_bought': already_bought_dict,
                'already_clicked': already_clicked_dict,
                'ground_truth': ground_truth_dict,
                'recs': recs,
            },
            'outputs/'
        )

    # Save model
    date = str(datetime.datetime.now())[:-10].replace(' ', '')
    # recall here comes from the last ground truth evaluated above (data.ground_truth_test)
    torch.save(trained_model.state_dict(), f'models/FULL_Recall_{recall * 100:.2f}_{date}.pth')
    # Save all necessary params
    save_outputs(
        {
            f'{date}_params': params,
            f'{date}_fixed_params': vars(fixed_params),
        },
        'models/'
    )
    print("Saved model & parameters to disk.")

    # Save graph & ID mapping
    save_graphs(f'models/{date}_graph.bin', [valid_graph])
    save_outputs(
        {
            f'{date}_ctm_id': data.ctm_id,
            f'{date}_pdt_id': data.pdt_id,
        },
        'models/'
    )
    print("Saved graph & ID mapping to disk.")
Example #3
def inference_fn(trained_model,
                 remove,
                 fixed_params,
                 overwrite_fixed_params=False,
                 days_of_purchases=710,
                 days_of_clicks=710,
                 lifespan_of_items=710,
                 **params):
    """
    Function to run inference inside the hyperparameter loop and calculate metrics.

    Parameters
    ----------
    trained_model:
        Model trained during the hyperparameter loop.
    remove:
        Percentage of data removed. See src.utils_data for more details.
    fixed_params:
        All parameters used during training in the hyperparameter loop. See src.utils_data for more details.
    overwrite_fixed_params:
        If True, training parameters will be overwritten by the parameters below. Can be useful to test the model
        on different parameters, e.g. ones that include older clicks or purchases.
    days_of_purchases, days_of_clicks, lifespan_of_items:
        Parameters that can overwrite the training parameters. Only used if overwrite_fixed_params is True.
    params:
        All other parameters used during training.

    Returns
    -------
    recall:
        Recall on the test set. Relevant to compare with the recall computed on the hyperparametrization test set
        (since parameters like 'remove' and all overwritable parameters differ).

    Saves to file
    -------------
    Metrics computed on the test set.
    """
    # Import parameters
    if isinstance(fixed_params, str):
        path = fixed_params
        fixed_params = read_data(path)

        class objectview(object):
            def __init__(self, d):
                self.__dict__ = d

        fixed_params = objectview(fixed_params)

    if 'params' in params.keys():
        # params were passed as a file path; load them from disk
        path = params['params']
        params = read_data(path)

    # Initialize data
    data_paths = DataPaths()
    fixed_params.remove = remove
    if overwrite_fixed_params:
        fixed_params.days_of_purchases = days_of_purchases
        fixed_params.days_of_clicks = days_of_clicks
        fixed_params.lifespan_of_items = lifespan_of_items
    data = DataLoader(data_paths, fixed_params)

    # Get graph
    valid_graph = create_graph(data.graph_schema)
    valid_graph = assign_graph_features(
        valid_graph,
        fixed_params,
        data,
        **params,
    )

    dim_dict = {
        'user': valid_graph.nodes['user'].data['features'].shape[1],
        'item': valid_graph.nodes['item'].data['features'].shape[1],
        'out': params['out_dim'],
        'hidden': params['hidden_dim']
    }

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # get training and test ids
    (train_graph, train_eids_dict, valid_eids_dict, subtrain_uids, valid_uids,
     test_uids, all_iids, ground_truth_subtrain, ground_truth_valid,
     all_eids_dict) = train_valid_split(
         valid_graph,
         data.ground_truth_test,
         fixed_params.etype,
         fixed_params.subtrain_size,
         fixed_params.valid_size,
         fixed_params.reverse_etype,
         fixed_params.train_on_clicks,
         fixed_params.remove_train_eids,
         params['clicks_sample'],
         params['purchases_sample'],
     )
    (edgeloader_train, edgeloader_valid, nodeloader_subtrain, nodeloader_valid,
     nodeloader_test) = generate_dataloaders(
         valid_graph,
         train_graph,
         train_eids_dict,
         valid_eids_dict,
         subtrain_uids,
         valid_uids,
         test_uids,
         all_iids,
         fixed_params,
         num_workers,
         all_sids,
         embedding_layer=params['embedding_layer'],
         n_layers=params['n_layers'],
         neg_sample_size=params['neg_sample_size'],
     )

    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size)

    # Import model
    if isinstance(trained_model, str):
        path = trained_model
        trained_model = ConvModel(
            valid_graph,
            params['n_layers'],
            dim_dict,
            params['norm'],
            params['dropout'],
            params['aggregator_type'],
            fixed_params.pred,
            params['aggregator_hetero'],
            params['embedding_layer'],
        )
        trained_model.load_state_dict(torch.load(path, map_location=device))
    if cuda:
        trained_model = trained_model.to(device)

    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            valid_graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )

        for ground_truth in [
                data.ground_truth_purchase_test, data.ground_truth_test
        ]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                fixed_params.pred,
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%".format(
                            precision * 100, recall * 100, coverage * 100))

            print(sentence)
            save_txt(sentence, data_paths.result_filepath, mode='a')

    return recall
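
A hedged usage sketch: as the isinstance checks above show, both trained_model and fixed_params (and params, via the 'params' key) can be passed as file paths. The paths below are hypothetical.

test_recall = inference_fn(trained_model='models/FULL_Recall_3.52_2021-05-01.pth',  # hypothetical path
                           remove=0.99,
                           fixed_params='models/2021-05-01_fixed_params.pkl',       # hypothetical path
                           overwrite_fixed_params=True,
                           days_of_purchases=365,
                           days_of_clicks=30,
                           lifespan_of_items=365,
                           params='models/2021-05-01_params.pkl')                   # hypothetical path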
Example #4
def train_model(model,
                num_epochs,
                num_batches_train,
                num_batches_val_loss,
                edgeloader_train,
                edgeloader_valid,
                loss_fn,
                delta,
                neg_sample_size,
                use_recency=False,
                cuda=False,
                device=None,
                optimizer=torch.optim.Adam,
                lr=0.001,
                get_metrics=False,
                train_graph=None,
                valid_graph=None,
                nodeloader_valid=None,
                nodeloader_subtrain=None,
                k=None,
                out_dim=None,
                num_batches_val_metrics=None,
                num_batches_subtrain=None,
                bought_eids=None,
                ground_truth_subtrain=None,
                ground_truth_valid=None,
                remove_already_bought=True,
                result_filepath=None,
                start_epoch=0,
                patience=5,
                pred=None,
                use_popularity=False,
                weight_popularity=1,
                remove_false_negative=False,
                embedding_layer=True,
                ):
    """
    Main function to train a GNN, using max margin loss on positive and negative examples.

    Process:
        - A full training epoch
            - Batch by batch. 1 batch is composed of multiple computational blocks, required to compute embeddings
              for all the nodes related to the edges in the batch.
            - Input the initial features. Compute the embeddings & the positive and negative scores
            - Also compute other considerations for the loss function: negative mask, recency scores
            - Loss is returned, then backward, then step.
            - Metrics are computed on the subtraining set (using nodeloader)
        - Validation set
            - Loss is computed (in model.eval() mode) on validation edges for early stopping purposes
            - Also, metrics are computed on the validation set (using nodeloader)
        - Logging & early stopping
            - Everything is logged, best metrics are saved.
            - Using the patience parameter, early stopping is applied when val_loss stops going down.
    """
    model.train_loss_list = []
    model.train_precision_list = []
    model.train_recall_list = []
    model.train_coverage_list = []
    model.val_loss_list = []
    model.val_precision_list = []
    model.val_recall_list = []
    model.val_coverage_list = []
    best_metrics = {}  # For visualization
    max_metric = -0.1  # Sentinel below any achievable recall
    patience_counter = 0  # For early stopping
    min_loss = 1.1  # Sentinel; assumes the first validation loss lands below it

    opt = optimizer(model.parameters(),
                    lr=lr)

    # TRAINING
    print('Starting training.')
    for epoch in range(start_epoch, num_epochs):
        start_time = time.time()
        print('TRAINING LOSS')
        model.train()  # Re-enable training mode: after eval(), dropout would otherwise still be inactive
        i = 0
        total_loss = 0
        for _, pos_g, neg_g, blocks in edgeloader_train:
            opt.zero_grad()

            # Negative mask
            negative_mask = {}
            if remove_false_negative:
                nids = neg_g.ndata[dgl.NID]
                for etype in pos_g.canonical_etypes:
                    neg_src, neg_dst = neg_g.edges(etype=etype)
                    neg_src = nids[etype[0]][neg_src]
                    neg_dst = nids[etype[2]][neg_dst]
                    negative_mask_tensor = valid_graph.has_edges_between(neg_src, neg_dst, etype=etype)
                    negative_mask[etype] = negative_mask_tensor.type(torch.float)
                    if cuda:
                        negative_mask[etype] = negative_mask[etype].to(device)
            if cuda:
                blocks = [b.to(device) for b in blocks]
                pos_g = pos_g.to(device)
                neg_g = neg_g.to(device)

            i += 1
            if i % 10 == 0:
                print("Edge batch {} out of {}".format(i, num_batches_train))
            input_features = blocks[0].srcdata['features']
            # recency (TO BE CLEANED)
            recency_scores = None
            if use_recency:
                recency_scores = pos_g.edata['recency']

            _, pos_score, neg_score = model(blocks,
                                            input_features,
                                            pos_g,
                                            neg_g,
                                            embedding_layer,
                                            )
            loss = loss_fn(pos_score,
                           neg_score,
                           delta,
                           neg_sample_size,
                           use_recency=use_recency,
                           recency_scores=recency_scores,
                           remove_false_negative=remove_false_negative,
                           negative_mask=negative_mask,
                           cuda=cuda,
                           device=device,
                           )

            if epoch > 0:  # Epoch 0 is a dry run: report the loss without a training step
                loss.backward()
                opt.step()
            total_loss += loss.item()

            if epoch == 0 and i > 10:
                break  # For epoch 0, report the loss on a subset of batches only

        train_avg_loss = total_loss / i
        model.train_loss_list.append(train_avg_loss)

        print('VALIDATION LOSS')
        model.eval()
        with torch.no_grad():
            total_loss = 0
            i = 0
            for _, pos_g, neg_g, blocks in edgeloader_valid:
                i += 1
                if i % 10 == 0:
                    print("Edge batch {} out of {}".format(i, num_batches_val_loss))

                # Negative mask
                negative_mask = {}
                if remove_false_negative:
                    nids = neg_g.ndata[dgl.NID]
                    for etype in pos_g.canonical_etypes:
                        neg_src, neg_dst = neg_g.edges(etype=etype)
                        neg_src = nids[etype[0]][neg_src]
                        neg_dst = nids[etype[2]][neg_dst]
                        negative_mask_tensor = valid_graph.has_edges_between(neg_src, neg_dst, etype=etype)
                        negative_mask[etype] = negative_mask_tensor.type(torch.float)
                        if cuda:
                            negative_mask[etype] = negative_mask[etype].to(device)

                if cuda:
                    blocks = [b.to(device) for b in blocks]
                    pos_g = pos_g.to(device)
                    neg_g = neg_g.to(device)

                input_features = blocks[0].srcdata['features']
                _, pos_score, neg_score = model(blocks,
                                                input_features,
                                                pos_g,
                                                neg_g,
                                                embedding_layer,
                                                )
                # recency (TO BE CLEANED)
                recency_scores = None
                if use_recency:
                    recency_scores = pos_g.edata['recency']

                val_loss = loss_fn(pos_score,
                                   neg_score,
                                   delta,
                                   neg_sample_size,
                                   use_recency=use_recency,
                                   recency_scores=recency_scores,
                                   remove_false_negative=remove_false_negative,
                                   negative_mask=negative_mask,
                                   cuda=cuda,
                                   device=device,
                                   )
                total_loss += val_loss.item()
            val_avg_loss = total_loss / i
            model.val_loss_list.append(val_avg_loss)

        ############
        # METRICS PER EPOCH
        if get_metrics and epoch % 10 == 1:  # compute ranking metrics every 10 epochs
            model.eval()
            with torch.no_grad():
                # training metrics
                print('TRAINING METRICS')
                y = get_embeddings(train_graph,
                                   out_dim,
                                   model,
                                   nodeloader_subtrain,
                                   num_batches_subtrain,
                                   cuda,
                                   device,
                                   embedding_layer,
                                   )

                train_precision, train_recall, train_coverage = get_metrics_at_k(y,
                                                                                 train_graph,
                                                                                 model,
                                                                                 out_dim,
                                                                                 ground_truth_subtrain,
                                                                                 bought_eids,
                                                                                 k,
                                                                                 False,  # Remove already bought
                                                                                 cuda,
                                                                                 device,
                                                                                 pred,
                                                                                 use_popularity,
                                                                                 weight_popularity)

                # validation metrics
                print('VALIDATION METRICS')
                y = get_embeddings(valid_graph,
                                   out_dim,
                                   model,
                                   nodeloader_valid,
                                   num_batches_val_metrics,
                                   cuda,
                                   device,
                                   embedding_layer,
                                   )

                val_precision, val_recall, val_coverage = get_metrics_at_k(y,
                                                                           valid_graph,
                                                                           model,
                                                                           out_dim,
                                                                           ground_truth_valid,
                                                                           bought_eids,
                                                                           k,
                                                                           remove_already_bought,
                                                                           cuda,
                                                                           device,
                                                                           pred,
                                                                           use_popularity,
                                                                           weight_popularity
                                                                           )
                sentence = '''Epoch {:05d} || TRAINING Loss {:.5f} | Precision {:.3f}% | Recall {:.3f}% | Coverage {:.2f}% 
                || VALIDATION Loss {:.5f} | Precision {:.3f}% | Recall {:.3f}% | Coverage {:.2f}% '''.format(
                    epoch, train_avg_loss, train_precision * 100, train_recall * 100, train_coverage * 100,
                    val_avg_loss, val_precision * 100, val_recall * 100, val_coverage * 100)
                print(sentence)
                save_txt(sentence, result_filepath, mode='a')

                model.train_precision_list.append(train_precision * 100)
                model.train_recall_list.append(train_recall * 100)
                model.train_coverage_list.append(train_coverage * 10)  # *10 for viz purposes
                model.val_precision_list.append(val_precision * 100)
                model.val_recall_list.append(val_recall * 100)
                model.val_coverage_list.append(val_coverage * 10)  # just *10 for viz purposes

                # Visualization of best metric
                if val_recall > max_metric:
                    max_metric = val_recall
                    best_metrics = {'recall': val_recall, 'precision': val_precision, 'coverage': val_coverage}

        else:
            sentence = "Epoch {:05d} | Training Loss {:.5f} | Validation Loss {:.5f} | ".format(
                epoch, train_avg_loss, val_avg_loss)
            print(sentence)
            save_txt(sentence, result_filepath, mode='a')

        if val_avg_loss < min_loss:
            min_loss = val_avg_loss
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter == patience:
            break

        elapsed = time.time() - start_time
        result_to_save = f'Epoch took {timedelta(seconds=elapsed)} \n'
        print(result_to_save)
        save_txt(result_to_save, result_filepath, mode='a')

    viz = {'train_loss_list': model.train_loss_list,
           'train_precision_list': model.train_precision_list,
           'train_recall_list': model.train_recall_list,
           'train_coverage_list': model.train_coverage_list,
           'val_loss_list': model.val_loss_list,
           'val_precision_list': model.val_precision_list,
           'val_recall_list': model.val_recall_list,
           'val_coverage_list': model.val_coverage_list}

    print('Training completed.')
    return model, viz, best_metrics  # note: the returned model may still be on the 'cuda' device
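
The loss_fn used in practice is max_margin_loss. A minimal sketch of the idea for a single edge type, assuming pos_score and neg_score are flat tensors with neg_sample_size negatives per positive edge (the project version also handles per-etype dicts, recency weighting and the negative mask):

import torch

def max_margin_loss_sketch(pos_score, neg_score, delta, neg_sample_size):
    # Line up each positive score against its block of negative scores.
    neg_score = neg_score.view(-1, neg_sample_size)
    pos_score = pos_score.view(-1, 1)
    # Hinge: penalize negatives scored within delta of their positive.
    return torch.clamp(neg_score - pos_score + delta, min=0).mean()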
Example #5
def explore_recs(recs: dict,
                 already_bought_dict: dict,
                 already_clicked_dict,
                 ground_truth_dict: dict,
                 ground_truth_purchase_dict: dict,
                 item_feat_df: pd.DataFrame,
                 num_choices: int,
                 pdt_id: pd.DataFrame,
                 item_id_type: str,
                 result_filepath: str):
    """
    For a random sample of users, fetch information about what items were clicked/bought, recommended and ground truth.

    Users with only 1 previous click or purchase are explored at the end.
    """
    choices = random.sample(list(recs.keys()), num_choices)  # sample() needs a sequence, not a dict view

    for user in choices:
        save_txt('\nCustomer bought', result_filepath, mode='a')
        try:
            fetch_recs_for_users(user,
                                 already_bought_dict,
                                 pdt_id,
                                 item_feat_df,
                                 item_id_type,
                                 result_filepath)
        except Exception:
            save_txt('Nothing', result_filepath, mode='a')

        save_txt('\nCustomer clicked on', result_filepath, mode='a')
        try:
            fetch_recs_for_users(user,
                                 already_clicked_dict,
                                 pdt_id,
                                 item_feat_df,
                                 item_id_type,
                                 result_filepath)
        except Exception:
            save_txt('No click data', result_filepath, mode='a')

        save_txt('\nGot recommended', result_filepath, mode='a')
        fetch_recs_for_users(user,
                             recs,
                             pdt_id,
                             item_feat_df,
                             item_id_type,
                             result_filepath)

        save_txt('\nGround truth', result_filepath, mode='a')
        fetch_recs_for_users(user,
                             ground_truth_dict,
                             pdt_id,
                             item_feat_df,
                             item_id_type,
                             result_filepath,
                             ground_truth_purchase_dict)

    # user with 1 item
    choices = random.sample([uid for uid, v in already_bought_dict.items() if len(v) == 1 and uid in recs.keys()], 2)
    for user in choices:
        save_txt('\nCustomer bought', result_filepath, mode='a')
        try:
            fetch_recs_for_users(user,
                                 already_bought_dict,
                                 pdt_id,
                                 item_feat_df,
                                 item_id_type,
                                 result_filepath)
        except Exception:
            save_txt('Nothing', result_filepath, mode='a')

        save_txt('\nCustomer clicked on', result_filepath, mode='a')
        try:
            fetch_recs_for_users(user,
                                 already_clicked_dict,
                                 pdt_id,
                                 item_feat_df,
                                 item_id_type,
                                 result_filepath)
        except Exception:
            save_txt('No click data', result_filepath, mode='a')

        save_txt('\nGot recommended', result_filepath, mode='a')
        fetch_recs_for_users(user,
                             recs,
                             pdt_id,
                             item_feat_df,
                             item_id_type,
                             result_filepath)

        save_txt('\nGround truth', result_filepath, mode='a')
        fetch_recs_for_users(user,
                             ground_truth_dict,
                             pdt_id,
                             item_feat_df,
                             item_id_type,
                             result_filepath,
                             ground_truth_purchase_dict)
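
For context, a plausible minimal sketch of the create_ground_truth helper that feeds explore_recs, assuming users and items are parallel arrays of node IDs (the project's actual implementation may differ):

from collections import defaultdict

def create_ground_truth_sketch(users, items):
    # Map each user node ID to the list of item node IDs they interacted with.
    ground_truth = defaultdict(list)
    for uid, iid in zip(users, items):
        ground_truth[uid].append(iid)
    return dict(ground_truth)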
Example #6
def train(data, fixed_params, data_paths, visualization, check_embedding,
          **params):
    """
    Function to find the best hyperparameter combination.

    Files needed to run
    -------------------
    All the files in the src.utils_data.DataPaths:
        It includes all the interactions between users, sports and items, as well as features for users, sports and items.
    If starting hyperparametrization from a checkpoint:
        The checkpoint file, generated by skopt during a previous hyperparametrization. The most recent file in
        the root folder will be fetched.

    Parameters
    ----------
    data :
        Object of class DataLoader, containing multiple arguments such as user_item_train dataframe, graph schema, etc.
    fixed_params :
        All parameters that are fixed, i.e. not part of the hyperparametrization.
    data_paths :
        All data paths (mainly csv).  # Note: currently, only paths.result_filepath is used here.
    visualization :
        Visualize results or not.  # Note: currently not used, visualization is always on or controlled by fixed_params.
    check_embedding :
        Visualize recommendations or not.  # Note: currently not used, controlled by fixed_params.
    **params :
        Mainly params that come from the hyperparametrization loop, controlled by skopt.

    Returns
    -------
    recall :
        Recall on the test set for the current combination of hyperparameters.

    Saves to files
    --------------
    logging of all experiments:
        All training logs are saved to result_filepath, including losses, metrics and examples of recommendations.
        Plots of the evolution of losses and metrics are saved to the folder 'plots'.
    best models:
        All models, fixed_params and params that yielded a recall higher than 8% on the specific item identifier or
        20% on the generic item identifier are saved to the folder 'models'.
    """
    # Establish hyperparameters
    # Dimensions
    out_dim = {
        'Very Small': 32,
        'Small': 96,
        'Medium': 128,
        'Large': 192,
        'Very Large': 256
    }
    hidden_dim = {
        'Very Small': 64,
        'Small': 192,
        'Medium': 256,
        'Large': 384,
        'Very Large': 512
    }
    params['out_dim'] = out_dim[params['embed_dim']]
    params['hidden_dim'] = hidden_dim[params['embed_dim']]

    # Popularity
    use_popularity = {
        'No': False,
        'Small': True,
        'Medium': True,
        'Large': True
    }
    weight_popularity = {'No': 0, 'Small': .01, 'Medium': .05, 'Large': .1}
    days_popularity = {'No': 0, 'Small': 7, 'Medium': 7, 'Large': 7}
    params['use_popularity'] = use_popularity[params['popularity_importance']]
    params['weight_popularity'] = weight_popularity[
        params['popularity_importance']]
    params['days_popularity'] = days_popularity[
        params['popularity_importance']]

    if fixed_params.duplicates == 'count_occurrence':
        params['aggregator_type'] += '_edge'

    # Make sure graph data is consistent with message passing parameters
    if fixed_params.duplicates == 'count_occurrence':
        assert params['aggregator_type'].endswith('edge')
    else:
        assert not params['aggregator_type'].endswith('edge')

    valid_graph = create_graph(data.graph_schema)
    valid_graph = assign_graph_features(
        valid_graph,
        fixed_params,
        data,
        **params,
    )

    dim_dict = {
        'user': valid_graph.nodes['user'].data['features'].shape[1],
        'item': valid_graph.nodes['item'].data['features'].shape[1],
        'out': params['out_dim'],
        'hidden': params['hidden_dim']
    }

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # get training and test ids
    (train_graph, train_eids_dict, valid_eids_dict, subtrain_uids, valid_uids,
     test_uids, all_iids, ground_truth_subtrain, ground_truth_valid,
     all_eids_dict) = train_valid_split(
         valid_graph,
         data.ground_truth_test,
         fixed_params.etype,
         fixed_params.subtrain_size,
         fixed_params.valid_size,
         fixed_params.reverse_etype,
         fixed_params.train_on_clicks,
         fixed_params.remove_train_eids,
         params['clicks_sample'],
         params['purchases_sample'],
     )

    (edgeloader_train, edgeloader_valid, nodeloader_subtrain, nodeloader_valid,
     nodeloader_test) = generate_dataloaders(
         valid_graph,
         train_graph,
         train_eids_dict,
         valid_eids_dict,
         subtrain_uids,
         valid_uids,
         test_uids,
         all_iids,
         fixed_params,
         num_workers,
         all_sids,
         embedding_layer=params['embedding_layer'],
         n_layers=params['n_layers'],
         neg_sample_size=params['neg_sample_size'],
     )

    train_eids_len = 0
    valid_eids_len = 0
    for etype in train_eids_dict.keys():
        train_eids_len += len(train_eids_dict[etype])
        valid_eids_len += len(valid_eids_dict[etype])
    num_batches_train = math.ceil(train_eids_len /
                                  fixed_params.edge_batch_size)
    num_batches_subtrain = math.ceil(
        (len(subtrain_uids) + len(all_iids)) / fixed_params.node_batch_size)
    num_batches_val_loss = math.ceil(valid_eids_len /
                                     fixed_params.edge_batch_size)
    num_batches_val_metrics = math.ceil(
        (len(valid_uids) + len(all_iids)) / fixed_params.node_batch_size)
    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size)

    if fixed_params.neighbor_sampler == 'partial':
        params['n_layers'] = 3

    model = ConvModel(
        valid_graph,
        params['n_layers'],
        dim_dict,
        params['norm'],
        params['dropout'],
        params['aggregator_type'],
        fixed_params.pred,
        params['aggregator_hetero'],
        params['embedding_layer'],
    )
    if cuda:
        model = model.to(device)

    hp_sentence = params  # note: aliases params, so the updates below mutate params too
    hp_sentence.update(vars(fixed_params))
    hp_sentence.update({
        'cuda': cuda,
    })
    hp_sentence = f'{str(hp_sentence)[1: -1]} \n'

    save_txt(f'\n \n START - Hyperparameters \n{hp_sentence}',
             data_paths.result_filepath, "a")

    start_time = time.time()

    # Train model
    trained_model, viz, best_metrics = train_model(
        model,
        fixed_params.num_epochs,
        num_batches_train,
        num_batches_val_loss,
        edgeloader_train,
        edgeloader_valid,
        max_margin_loss,
        params['delta'],
        params['neg_sample_size'],
        params['use_recency'],
        cuda,
        device,
        fixed_params.optimizer,
        params['lr'],
        get_metrics=True,
        train_graph=train_graph,
        valid_graph=valid_graph,
        nodeloader_valid=nodeloader_valid,
        nodeloader_subtrain=nodeloader_subtrain,
        k=fixed_params.k,
        out_dim=params['out_dim'],
        num_batches_val_metrics=num_batches_val_metrics,
        num_batches_subtrain=num_batches_subtrain,
        bought_eids=train_eids_dict[('user', 'buys', 'item')],
        ground_truth_subtrain=ground_truth_subtrain,
        ground_truth_valid=ground_truth_valid,
        remove_already_bought=True,
        result_filepath=data_paths.result_filepath,
        start_epoch=fixed_params.start_epoch,
        patience=fixed_params.patience,
        pred=params['pred'],
        use_popularity=params['use_popularity'],
        weight_popularity=params['weight_popularity'],
        remove_false_negative=fixed_params.remove_false_negative,
        embedding_layer=params['embedding_layer'],
    )
    elapsed = time.time() - start_time
    result_to_save = f'\n {timedelta(seconds=elapsed)} \n END'
    save_txt(result_to_save, data_paths.result_filepath, mode='a')

    if visualization:
        plot_train_loss(hp_sentence, viz)

    # Report performance on validation set
    sentence = ("BEST VALIDATION Precision "
                "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%".format(
                    best_metrics['precision'] * 100,
                    best_metrics['recall'] * 100,
                    best_metrics['coverage'] * 100))

    log.info(sentence)
    save_txt(sentence, data_paths.result_filepath, mode='a')

    # Report performance on test set
    log.debug('Test metrics start ...')
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            valid_graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )

        for ground_truth in [
                data.ground_truth_purchase_test, data.ground_truth_test
        ]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                fixed_params.pred,
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%".format(
                            precision * 100, recall * 100, coverage * 100))
            log.info(sentence)
            save_txt(sentence, data_paths.result_filepath, mode='a')

    if check_embedding:
        trained_model.eval()
        with torch.no_grad():
            log.debug('ANALYSIS OF RECOMMENDATIONS')
            if 'sport' in train_graph.ntypes:
                result_sport = explore_sports(embeddings, data.sport_feat_df,
                                              data.spt_id,
                                              fixed_params.num_choices)

                save_txt(result_sport, data_paths.result_filepath, mode='a')

            already_bought_dict = create_already_bought(
                valid_graph,
                all_eids_dict[('user', 'buys', 'item')],
            )
            already_clicked_dict = None
            if fixed_params.discern_clicks:
                already_clicked_dict = create_already_bought(
                    valid_graph,
                    all_eids_dict[('user', 'clicks', 'item')],
                    etype='clicks',
                )

            users, items = data.ground_truth_test
            ground_truth_dict = create_ground_truth(users, items)
            user_ids = np.unique(users).tolist()
            recs = get_recs(valid_graph,
                            embeddings,
                            trained_model,
                            params['out_dim'],
                            fixed_params.k,
                            user_ids,
                            already_bought_dict,
                            remove_already_bought=True,
                            pred=fixed_params.pred,
                            use_popularity=params['use_popularity'],
                            weight_popularity=params['weight_popularity'])

            users, items = data.ground_truth_purchase_test
            ground_truth_purchase_dict = create_ground_truth(users, items)
            explore_recs(recs, already_bought_dict, already_clicked_dict,
                         ground_truth_dict, ground_truth_purchase_dict,
                         data.item_feat_df, fixed_params.num_choices,
                         data.pdt_id, fixed_params.item_id_type,
                         data_paths.result_filepath)

            if fixed_params.item_id_type == 'SPECIFIC ITEM_IDENTIFIER':
                coverage_metrics = check_coverage(data.user_item_train,
                                                  data.item_feat_df,
                                                  data.pdt_id, recs)

                sentence = (
                    "COVERAGE \n|| All transactions : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}% "
                    "\n|| Recommendations : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f} | Eco {:.1f}%%"
                    .format(
                        coverage_metrics['generic_mean_whole'] * 100,
                        coverage_metrics['junior_mean_whole'] * 100,
                        coverage_metrics['male_mean_whole'] * 100,
                        coverage_metrics['female_mean_whole'] * 100,
                        coverage_metrics['eco_mean_whole'] * 100,
                        coverage_metrics['generic_mean_recs'] * 100,
                        coverage_metrics['junior_mean_recs'] * 100,
                        coverage_metrics['male_mean_recs'] * 100,
                        coverage_metrics['female_mean_recs'] * 100,
                        coverage_metrics['eco_mean_recs'] * 100,
                    ))
                log.info(sentence)
                save_txt(sentence, data_paths.result_filepath, mode='a')

        save_outputs(
            {
                'embeddings': embeddings,
                'already_bought': already_bought_dict,
                'already_clicked': already_clicked_dict,
                'ground_truth': ground_truth_dict,
                'recs': recs,
            }, 'outputs/')

        del params['remove']
        # Save model if the recall is greater than 8%
        if (recall > 0.08 and fixed_params.item_id_type == 'SPECIFIC ITEM_IDENTIFIER') \
                or (recall > 0.2 and fixed_params.item_id_type == 'GENERAL ITEM_IDENTIFIER'):
            date = str(datetime.datetime.now())[:-10].replace(' ', '')
            torch.save(trained_model.state_dict(),
                       f'models/HP_Recall_{recall * 100:.2f}_{date}.pth')
            # Save all necessary params
            save_outputs(
                {
                    f'{date}_params': params,
                    f'{date}_fixed_params': vars(fixed_params),
                }, 'models/')

        # Inference on different users
        if fixed_params.run_inference > 0:
            with torch.no_grad():
                print('On normal params')
                inference_recall = inference_hp.inference_fn(
                    trained_model,
                    remove=fixed_params.remove_on_inference,
                    fixed_params=fixed_params,
                    overwrite_fixed_params=False,
                    **params)
                if fixed_params.run_inference > 1:
                    print('For all users')
                    del params['days_of_purchases'], params['days_of_clicks'], params['lifespan_of_items']
                    all_users_inference_recall = inference_hp.inference_fn(
                        trained_model,
                        remove=fixed_params.remove_on_inference,
                        fixed_params=fixed_params,
                        overwrite_fixed_params=True,
                        days_of_purchases=710,
                        days_of_clicks=710,
                        lifespan_of_items=710,
                        **params)

    recap = f"BEST RECALL on 1) Validation set : {best_metrics['recall'] * 100:.2f}%" \
            f'\n2) Test set : {recall * 100:.2f}%'
    if fixed_params.run_inference == 1:
        recap += f'\n3) On random users of {fixed_params.remove_on_inference} removed : {inference_recall * 100:.2f}%'
    recap += f"\nLoop took {timedelta(seconds=elapsed)} for {len(viz['train_loss_list'])} epochs, an average of " \
             f"{timedelta(seconds=elapsed / len(viz['train_loss_list']))} per epoch"
    print(recap)
    save_txt(recap, data_paths.result_filepath, mode='a')

    return recall  # This is the 'test set' recall, on both purchases & clicks
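
The docstring above mentions skopt checkpoints. A hedged sketch of how such a hyperparameter loop is commonly wired up with scikit-optimize; the search space is illustrative, not the project's actual space, and data, fixed_params and data_paths are assumed to be in scope:

from skopt import gp_minimize
from skopt.callbacks import CheckpointSaver
from skopt.space import Categorical, Real

# Illustrative search space; the real loop covers many more parameters.
space = [Real(1e-4, 1e-2, prior='log-uniform', name='lr'),
         Categorical(['Small', 'Medium', 'Large'], name='embed_dim')]

def objective(values):
    lr, embed_dim = values
    recall = train(data, fixed_params, data_paths,
                   visualization=False, check_embedding=False,
                   lr=lr, embed_dim=embed_dim)  # a real call passes the full param set
    return -recall  # skopt minimizes, so negate the recall

checkpoint_saver = CheckpointSaver('./checkpoint.pkl')  # lets a later run resume
result = gp_minimize(objective, space, n_calls=50, callback=[checkpoint_saver])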