예제 #1
0
def main():
    """Run inference with an AutoEncoder and write predictions to a file.

    Reads the module-level ``args`` and ``use_gpu``. The training data is
    loaded first because its user/item id maps are reused for the evaluation
    data and its ``data`` is assigned as the evaluation layer's ``src_data``.
    Model weights are restored from ``args.save_path`` when that file exists.

    For every example yielded by the evaluation iterator, one tab-separated
    line is written to ``args.predictions_path`` per non-zero target entry:
    user key, item key, predicted value, target value.
    """
    params = {
        'batch_size': 1,
        'data_dir': args.path_to_train_data,
        'major': 'users',
        'itemIdInd': 1,
        'userIdInd': 0,
        'extension': '.csv',
        'delimiter': ',',
        'header': 1,
    }
    print("Loading training data")
    data_layer = input_layer.UserItemRecDataProvider(params=params)
    print("Data loaded")
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading eval data")
    eval_params = copy.deepcopy(params)
    # must set eval batch size to 1 to make sure no examples are missed
    eval_params['batch_size'] = 1
    eval_params['data_dir'] = args.path_to_eval_data
    eval_data_layer = input_layer.UserItemRecDataProvider(params=eval_params,
                                                          user_id_map=data_layer.userIdMap,
                                                          item_id_map=data_layer.itemIdMap)

    hidden_sizes = [int(l) for l in args.hidden_layers.split(',')]
    rencoder = model.AutoEncoder(layer_sizes=[data_layer.vector_dim] + hidden_sizes,
                                 nl_type=args.non_linearity_type,
                                 is_constrained=args.constrained,
                                 dp_drop_prob=args.drop_prob,
                                 last_layer_activations=not args.skip_last_layer_nl)

    path_to_model = Path(args.save_path)
    if path_to_model.is_file():
        print("Loading model from: {}".format(path_to_model))
        # Always deserialize onto CPU so a checkpoint saved from a GPU run can
        # still be loaded when use_gpu is False; the model is moved to the GPU
        # below when requested.
        state = torch.load(args.save_path,
                           map_location=lambda storage, loc: storage)
        rencoder.load_state_dict(state)
    else:
        # Do not fail (previous behavior is kept), but make it visible that
        # the predictions will come from freshly initialized weights.
        print("WARNING: no model file at {}; using untrained weights".format(path_to_model))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')
    rencoder.eval()
    if use_gpu:
        rencoder = rencoder.cuda()

    # Invert the id maps so internal indices can be written back as the
    # keys they were mapped from.
    inv_userIdMap = {v: k for k, v in data_layer.userIdMap.items()}
    inv_itemIdMap = {v: k for k, v in data_layer.itemIdMap.items()}

    eval_data_layer.src_data = data_layer.data
    with open(args.predictions_path, 'w') as outf:
        batches = eval_data_layer.iterate_one_epoch_eval(for_inf=True)
        for i, ((out, src), majorInd) in enumerate(batches):
            inputs = Variable(src.cuda().to_dense() if use_gpu else src.to_dense())
            # Only row 0 of each batch is used (batch size is set to 1 above).
            targets_np = out.to_dense().numpy()[0, :]
            outputs = rencoder(inputs).cpu().data.numpy()[0, :]
            major_key = inv_userIdMap[majorInd]
            # Emit predictions only where the eval target has a non-zero entry.
            for ind in targets_np.nonzero()[0].tolist():
                outf.write("{}\t{}\t{}\t{}\n".format(major_key, inv_itemIdMap[ind], outputs[ind], targets_np[ind]))
            if i % 10000 == 0:
                print("Done: {}".format(i))
예제 #2
0
def load_recommender(vector_dim, hidden, activation, dropout, weights_path):
    """Build an AutoEncoder, load its weights and return it in eval mode.

    Args:
        vector_dim: size of the first layer.
        hidden: comma-separated string of integer sizes for the layers that
            follow ``vector_dim``.
        activation: passed to the model as ``nl_type``.
        dropout: passed to the model as ``dp_drop_prob``.
        weights_path: forwarded to ``load_model_weights``.

    Returns:
        The model, moved to the GPU when the module-level ``USE_GPU`` is
        truthy.
    """
    hidden_sizes = [int(width) for width in hidden.split(',')]
    layer_sizes = [vector_dim] + hidden_sizes

    recommender = model.AutoEncoder(
        layer_sizes=layer_sizes,
        nl_type=activation,
        is_constrained=False,
        dp_drop_prob=dropout,
        last_layer_activations=True,
    )
    load_model_weights(recommender, weights_path)
    recommender.eval()

    if not USE_GPU:
        return recommender
    return recommender.cuda()
예제 #3
0
def main():
    """Train the AutoEncoder, periodically evaluating and checkpointing it.

    Reads the module-level ``args``. Steps:

    1. Load the training data, then the evaluation data using the training
       user/item id maps.
    2. Build the model and restore weights from ``<logdir>/model`` when that
       file exists.
    3. Move the model to the GPU (wrapped in ``nn.DataParallel`` when more
       than one GPU id is given) and create the requested optimizer.
    4. For each epoch, train on every mini-batch, optionally re-feed the
       model outputs as inputs ``args.aug_step`` times, and log summaries.
       Every third epoch, and on the last one, evaluate and save a
       checkpoint to ``<logdir>/model.epoch_<n>``.
    5. Save the final weights to ``<logdir>/model.last``.

    Raises:
        ValueError: if ``args.optimizer`` is not one of ``adam``,
            ``adagrad``, ``momentum`` or ``rmsprop``.
    """
    logger = Logger(args.logdir)
    params = dict()
    params['batch_size'] = args.batch_size
    params['data_dir'] = args.path_to_train_data
    params['major'] = 'users'
    params['itemIdInd'] = 1
    params['userIdInd'] = 0
    print("Loading training data")
    data_layer = input_layer.UserItemRecDataProvider(params=params)
    print("Data loaded")
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading eval data")
    eval_params = copy.deepcopy(params)
    # NOTE(review): the eval batch size is inherited from the training params
    # (args.batch_size). A previous comment here said it must be 1 so that no
    # examples are missed - confirm whether the eval iterator depends on it.
    eval_params['data_dir'] = args.path_to_eval_data
    eval_data_layer = input_layer.UserItemRecDataProvider(
        params=eval_params,
        user_id_map=data_layer.userIdMap,  # the mappings are provided
        item_id_map=data_layer.itemIdMap)

    eval_data_layer.src_data = data_layer.data
    rencoder = model.AutoEncoder(
        layer_sizes=[data_layer.vector_dim] +
        [int(l) for l in args.hidden_layers.split(',')],
        nl_type=args.non_linearity_type,
        is_constrained=args.constrained,
        dp_drop_prob=args.drop_prob,
        last_layer_activations=not args.skip_last_layer_nl)

    model_checkpoint = args.logdir + "/model"
    path_to_model = Path(model_checkpoint)
    if path_to_model.is_file():
        print("Loading model from: {}".format(model_checkpoint))
        rencoder.load_state_dict(torch.load(model_checkpoint))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')

    gpu_ids = [int(g) for g in args.gpu_ids.split(',')]
    print('Using GPUs: {}'.format(gpu_ids))
    # Keep a handle on the bare model. nn.DataParallel does not forward custom
    # attributes (encode_w, is_constrained, ...) and its state_dict keys carry
    # a "module." prefix that the plain-model load above would not accept.
    core_model = rencoder
    if len(gpu_ids) > 1:
        rencoder = nn.DataParallel(rencoder, device_ids=gpu_ids)
    rencoder = rencoder.cuda()

    if args.optimizer == "adam":
        optimizer = optim.Adam(rencoder.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
    elif args.optimizer == "adagrad":
        optimizer = optim.Adagrad(rencoder.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer == "momentum":
        optimizer = optim.SGD(rencoder.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=args.weight_decay)
        # The learning-rate schedule is only used with the momentum optimizer.
        scheduler = MultiStepLR(optimizer,
                                milestones=[24, 36, 48, 66, 72],
                                gamma=0.5)
    elif args.optimizer == "rmsprop":
        optimizer = optim.RMSprop(rencoder.parameters(),
                                  lr=args.lr,
                                  momentum=0.9,
                                  weight_decay=args.weight_decay)
    else:
        raise ValueError('Unknown optimizer kind')

    # Running loss since the last summary was printed.
    t_loss = 0.0
    t_loss_denom = 0.0
    global_step = 0

    if args.noise_prob > 0.0:
        # Dropout used as input noise during the re-feeding steps below.
        dp = nn.Dropout(p=args.noise_prob)

    for epoch in range(args.num_epochs):
        print('Doing epoch {} of {}'.format(epoch, args.num_epochs))
        e_start_time = time.time()
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        if args.optimizer == "momentum":
            # NOTE(review): stepped at the start of the epoch, before any
            # optimizer.step(); newer PyTorch releases expect the opposite
            # order - confirm against the targeted version.
            scheduler.step()
        for i, mb in enumerate(data_layer.iterate_one_epoch()):
            inputs = Variable(mb.cuda().to_dense())
            optimizer.zero_grad()
            outputs = rencoder(inputs)
            loss, num_ratings = model.MSEloss(outputs, inputs)
            loss = loss / num_ratings
            loss.backward()
            optimizer.step()
            global_step += 1
            batch_loss = loss.data[0]
            t_loss += batch_loss
            t_loss_denom += 1

            if i % args.summary_frequency == 0:
                running_rmse = sqrt(t_loss / t_loss_denom)
                print('[%d, %5d] RMSE: %.7f' % (epoch, i, running_rmse))
                logger.scalar_summary("Training_RMSE", running_rmse,
                                      global_step)
                t_loss = 0
                t_loss_denom = 0.0
                # Read parameters from the bare model so this also works when
                # rencoder is an nn.DataParallel wrapper.
                log_var_and_grad_summaries(logger, core_model.encode_w,
                                           global_step, "Encode_W")
                log_var_and_grad_summaries(logger, core_model.encode_b,
                                           global_step, "Encode_b")
                if not core_model.is_constrained:
                    log_var_and_grad_summaries(logger, core_model.decode_w,
                                               global_step, "Decode_W")
                log_var_and_grad_summaries(logger, core_model.decode_b,
                                           global_step, "Decode_b")

            total_epoch_loss += batch_loss
            denom += 1

            if args.aug_step > 0:
                # Data augmentation: feed the (detached) outputs back in as
                # new inputs and take extra optimizer steps on them. These
                # steps are not counted in the reported losses.
                for t in range(args.aug_step):
                    inputs = Variable(outputs.data)
                    if args.noise_prob > 0.0:
                        inputs = dp(inputs)
                    optimizer.zero_grad()
                    outputs = rencoder(inputs)
                    loss, num_ratings = model.MSEloss(outputs, inputs)
                    loss = loss / num_ratings
                    loss.backward()
                    optimizer.step()

        e_end_time = time.time()
        epoch_seconds = e_end_time - e_start_time
        epoch_rmse = sqrt(total_epoch_loss / denom)
        print(
            'Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}'
            .format(epoch, epoch_seconds, epoch_rmse))
        logger.scalar_summary("Training_RMSE_per_epoch", epoch_rmse, epoch)
        logger.scalar_summary("Epoch_time", epoch_seconds, epoch)
        if epoch % 3 == 0 or epoch == args.num_epochs - 1:
            eval_loss = do_eval(rencoder, eval_data_layer)
            print('Epoch {} EVALUATION LOSS: {}'.format(epoch, eval_loss))
            logger.scalar_summary("EVALUATION_RMSE", eval_loss, epoch)
            epoch_checkpoint = model_checkpoint + ".epoch_" + str(epoch)
            print("Saving model to {}".format(epoch_checkpoint))
            # Save the bare model's weights so the keys have no "module."
            # prefix regardless of how many GPUs were used.
            torch.save(core_model.state_dict(), epoch_checkpoint)

    print("Saving model to {}".format(model_checkpoint + ".last"))
    torch.save(core_model.state_dict(), model_checkpoint + ".last")