Example #1
def plrec(matrix_train, embeded_matrix=np.empty((0)), iteration=4, lam=80, rank=200, seed=1, **unused):
    """
    Generalized projected LRec, with optional item-attribute embedding
    :param matrix_train: user-item matrix with shape m*n
    :param embeded_matrix: item-attribute matrix with length n (each row represents one item)
    :param iteration: number of power iterations in randomized SVD
    :param lam: penalty (regularization) parameter
    :param rank: latent dimension size
    :param seed: seed for the randomized SVD
    :return: prediction in sparse matrix
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Randomized SVD")
    start_time = time.time()
    P, sigma, Qt = randomized_svd(matrix_input,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)

    RQ = matrix_input.dot(sparse.csc_matrix(Qt.T*np.sqrt(sigma)))

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    progress.subsection("Closed-Form Linear Optimization")
    start_time = time.time()
    pre_inv = RQ.T.dot(RQ) + lam * sparse.identity(rank, dtype=np.float32)
    inverse = inv(pre_inv)
    Y = inverse.dot(RQ.T).dot(matrix_input)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    return np.array(RQ.todense()), np.array(Y.todense()), None
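A minimal usage sketch under stated assumptions: WorkSplitter and inhour come from the repo's utils, randomized_svd from sklearn.utils.extmath, and inv from scipy.sparse.linalg; the random matrix below is hypothetical and only illustrates shapes.

import numpy as np
from scipy import sparse

# Hypothetical 1000-user x 500-item implicit-feedback matrix
R = sparse.random(1000, 500, density=0.01, format='csr', random_state=1)

RQ, Y, _ = plrec(R, rank=50, lam=80)
print(RQ.shape)  # (1000, 50): projected user factors
print(Y.shape)   # (50, 500): item weights from the closed-form ridge step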
Example #2
def plrec(matrix_train, iteration=4, lamb=80, rank=200, seed=1, **unused):
    """
    Function used to achieve generalized projected lrec w/o item-attribute embedding
    :param matrix_train: user-item matrix with shape m*n
    :param iteration: number of power iterations in randomized svd
    :param lamb: parameter of penalty
    :param rank: latent dimension size
    :param seed: the seed of the pseudo random number generator to use when shuffling the data
    :return: prediction in sparse matrix
    """
    progress = WorkSplitter()

    progress.subsection("Randomized SVD")
    start_time = time.time()
    P, sigma, Qt = randomized_svd(matrix_train,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)

    RQ = matrix_train.dot(sparse.csc_matrix(Qt.T*np.sqrt(sigma)))

    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    progress.subsection("Closed-Form Linear Optimization")
    start_time = time.time()
    pre_inv = RQ.T.dot(RQ) + lamb * sparse.identity(rank, dtype=np.float32)
    inverse = sparse.linalg.inv(pre_inv.tocsc())
    Y = inverse.dot(RQ.T).dot(matrix_train)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    return np.array(RQ.todense()), np.array(Y.todense()), None
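The "Closed-Form Linear Optimization" step is plain ridge regression: Y = (RQ^T RQ + lamb*I)^(-1) RQ^T R. A dense toy check of that identity, using np.linalg.solve instead of an explicit inverse:

import numpy as np

rng = np.random.default_rng(1)
RQ = rng.standard_normal((100, 20))  # projected user factors (m x rank)
R = rng.standard_normal((100, 50))   # user-item matrix (m x n)
lamb = 80.0

# Same closed form as above, computed via a linear solve rather than inv()
Y = np.linalg.solve(RQ.T @ RQ + lamb * np.eye(20), RQ.T @ R)
print(Y.shape)  # (20, 50)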
Example #3
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Train File Name: {}".format(args.train))
    if args.validation:
        print("Valid File Name: {}".format(args.valid))
    print("Algorithm: {}".format(args.model))
    print("Lambda Diversity: {}".format(args.lambda_diversity))
    print("Lambda Serendipity: {}".format(args.lambda_serendipity))
    print("Nearest Neighbor Number: {}".format(args.k))
    print("Evaluation Ranking Topk: {}".format(args.topk))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    R_train = load_numpy(path=args.path, name=args.train)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))
    print("Train U-I Dimensions: {}".format(R_train.shape))

    progress.section("Train")
    model = models[args.model]()
    model.train(R_train)

    progress.section("Predict")
    prediction_score = model.predict(
        R_train,
        k=args.k,
        lambda_diversity=args.lambda_diversity,
        lambda_serendipity=args.lambda_serendipity)

    prediction = predict(prediction_score=prediction_score,
                         topK=args.topk,
                         matrix_Train=R_train)

    if args.validation:
        progress.section("Create Metrics")
        start_time = time.time()

        metric_names = [
            'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
        ]

        R_valid = load_numpy(path=args.path, name=args.valid)
        result = evaluate(prediction, R_valid, metric_names, [args.topk])
        print("-")
        for metric in result.keys():
            print("{}:{}".format(metric, result[metric]))
        print("Elapsed: {}".format(inhour(time.time() - start_time)))
Example #4
def main(args):
    # Progress bar
    progress = WorkSplitter()

    progress.section("Load Data")
    if args.emb_type == 'bert':
        emb_size = 768
    elif args.emb_type == 'xlmr':
        emb_size = 1024
    else:
        raise ValueError("emb_type must be 'bert' or 'xlmr'")

    # Load Data
    start_time = time.time()
    print("WARNING: Embedding size is set to", emb_size)
    data = Data(args, args.path, args.train, args.valid, emb_size, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")

    if args.network_architecture == 'embedding_net':
        model = EmbeddingNet(data.n_token, data.n_feature, emb_size,
                             [1024, 2000, 1000, 500, 100],
                             corruption=args.corruption)
    elif args.network_architecture == 'embedding_highway_net':
        model = EmbeddingHighWayNet(data.n_token, data.n_feature, emb_size, [1024, 2000, 1000, 500, 100])
    else:
        raise NotImplementedError('either use embedding_net or embedding_highway_net')
    model.cuda()
    model.load_state_dict(torch.load(args.checkpoint))
    print(model)
    lb_loader = data.instance_a_lb_loader(args.batch)

    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token, feature, tweet_lb, user_lb, embedding = (
                batch[0].float().cuda(), batch[1].float().cuda(),
                batch[2], batch[3], batch[4].float().cuda())
            pred = torch.sigmoid(model(token, feature, embedding)).detach().cpu().numpy()
            
            if "Valid" in args.valid:
                lbs['tweet_lb'] += tweet_lb
            else:
                lbs['tweet_lb'] += tweet_lb[0]
            lbs['user_lb'] += user_lb[0]
            preds.append(pred)

        final_csv = pd.DataFrame(lbs)
        preds = np.float64(np.vstack(preds))
        if not os.path.exists(args.spath):
            os.makedirs(args.spath)

        print("Generating CSVs...")
        for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
            final_csv[engage] = preds[:,i]
            final_csv[['tweet_lb','user_lb',engage]].to_csv(os.path.join(args.spath, engage+'.csv'),index=False, header=False)
Example #5
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name,
                                           'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    start_time = time.time()

    results = critiquing(
        matrix_Train=R_train,
        matrix_Test=R_test,
        keyphrase_freq=R_train_keyphrase,
        item_keyphrase_freq=R_train_item_keyphrase,
        num_users_sampled=args.num_users_sampled,
        num_items_sampled=args.num_items_sampled,
        max_iteration_threshold=args.max_iteration_threshold,
        dataset_name=args.dataset_name,
        model=models[args.model],
        parameters_row=parameters_row,
        critiquing_model_name=args.critiquing_model_name,
        keyphrase_selection_method=args.keyphrase_selection_method,
        topk=args.topk,
        lamb=args.lamb)

    print("Final Time Elapsed: {}".format(inhour(time.time() - start_time)))

    save_dataframe_csv(results, table_path, args.save_path)
Example #6
def acf(matrix_train,
        embeded_matrix=np.empty((0)),
        epoch=300,
        iteration=100,
        lamb=80,
        rank=100,
        key_dim=3,
        batch_size=32,
        optimizer="Adam",
        learning_rate=0.001,
        seed=1,
        root=1,
        fb=False,
        **unused):

    print(epoch, lamb, rank)
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))
    progress.subsection("Create PMI matrix")
    pmi_matrix = get_pmi_matrix(matrix_input, root)
    progress.subsection("Randomized SVD")
    start_time = time.time()
    if fb:
        P, sigma, Qt = pca(pmi_matrix, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(pmi_matrix,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)
    Q = Qt.T * np.sqrt(sigma)
    m, n = matrix_input.shape
    model = ACF(m,
                n,
                rank,
                key_dim,
                lamb=lamb,
                batch_size=batch_size,
                learning_rate=learning_rate,
                optimizer=Optimizer[optimizer],
                item_embeddings=Q)
    model.train_model(matrix_input, epoch)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    RQ = model.get_RQ()
    Y = model.get_Y().T
    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, None
Example #7
def eval(matrix_valid, topk, prediction):
    import time
    from utils.progress import inhour
    start_time = time.time()

    metric_names = [
        'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
    ]

    result = evaluate(prediction, matrix_valid, metric_names, [topk])

    print("-")
    for metric in result.keys():
        print("{0}:{1}".format(metric, result[metric]))
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    return result
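A usage sketch, assuming `prediction` is the ranked top-k item matrix produced by a `predict` call and `R_valid` is the held-out sparse validation matrix:

result = eval(matrix_valid=R_valid, topk=50, prediction=prediction)
# `result` maps each metric name ('R-Precision', 'NDCG', ...) to its value(s)
print(result)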
Example #8
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    data = Data(args.path, args.train, args.valid, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")

    model = FeatureNet(data.n_token, data.n_feature, [1024, 2000, 1000, 500, 100])
    model.cuda()
    model.load_state_dict(torch.load(args.checkpoint))
    print(model)
    lb_loader = data.instance_a_lb_loader(args.batch)

    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model = model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token, feature, tweet_lb, user_lb = (
                batch[0].float().cuda(), batch[1].float().cuda(),
                batch[2], batch[3])
            pred = torch.sigmoid(model(token, feature)).detach().cpu().numpy()
            lbs['tweet_lb'] += tweet_lb[0]
            lbs['user_lb'] += user_lb[0]
            preds.append(pred)

        final_csv = pd.DataFrame(lbs)
        preds = np.float64(np.vstack(preds))
        if not os.path.exists(args.spath):
            os.makedirs(args.spath)

        print("Generating CSVs...")
        for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
            final_csv[engage] = preds[:,i]
            final_csv[['tweet_lb','user_lb',engage]].to_csv(os.path.join(args.spath, engage+'.csv'),index=False, header=False)
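The loop above writes one headerless CSV per engagement type, with columns tweet_lb, user_lb, score. A quick sketch for inspecting one of them; the directory name here stands in for whatever was passed as --spath:

import os
import pandas as pd

csv_path = os.path.join('submissions', 'like.csv')  # hypothetical spath
df = pd.read_csv(csv_path, header=None, names=['tweet_lb', 'user_lb', 'like'])
print(df.head())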
Example #9
def pmi_svd(matrix_train,
            embeded_matrix=np.empty((0)),
            iteration=4,
            rank=200,
            fb=False,
            seed=1,
            root=1.1,
            **unused):
    """
    PureSVD algorithm

    :param matrix_train: rating matrix
    :param embeded_matrix: item or user embedding matrix(side info)
    :param iteration: number of random SVD iterations
    :param rank: SVD top K eigenvalue ranks
    :param fb: facebook package or sklearn package. boolean
    :param seed: Random initialization seed
    :param unused: args that not applicable for this algorithm
    :return:
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Create PMI matrix")
    pmi_matrix = get_pmi_matrix_gpu(matrix_input, root)

    progress.subsection("Randomized SVD")
    start_time = time.time()
    if fb:
        P, sigma, Qt = pca(pmi_matrix, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(pmi_matrix,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    return P, Qt, None
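get_pmi_matrix_gpu is defined elsewhere in the repo. As a rough CPU sketch of the idea, a positive PMI matrix with the item counts smoothed by `root` could look like the following; this is an assumption about the transform, not the repo's exact implementation:

import numpy as np
from scipy import sparse

def pmi_matrix_sketch(matrix, root):
    # PMI(u, i) = log(x_ui * total / (row_u * col_i)), column counts raised to `root`
    X = matrix.toarray().astype(np.float64)
    row = X.sum(axis=1, keepdims=True)
    col = X.sum(axis=0, keepdims=True) ** root
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(X * col.sum() / (row * col))
    pmi[~np.isfinite(pmi)] = 0.0
    return np.maximum(pmi, 0.0)  # keep the positive part (PPMI)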
Example #10
def chain_item_item(matrix_train,
                    embeded_matrix=np.empty((0)),
                    iteration=7,
                    rank=200,
                    fb=True,
                    seed=1,
                    chain=1,
                    **unused):

    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Randomized SVD")
    start_time = time.time()
    if fb:
        P, sigma, Qt = pca(matrix_input, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(matrix_input,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)

    RQ = matrix_input.dot(sparse.csc_matrix(Qt).T).toarray()
    PS = P * sigma
    SPPS = PS.T.dot(PS)

    HRQ = RQ.dot(SPPS)

    if chain > 1:
        QTQ = Qt.dot(Qt.T)

    for i in range(1, chain):
        HRQ = HRQ.dot(QTQ).dot(SPPS)

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    return HRQ, Qt, None
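Why the chain works: P from the SVD has (approximately) orthonormal columns, so SPPS = (P*sigma)^T (P*sigma) is essentially diag(sigma**2), and every chain step re-weights the latent dimensions by the squared singular values. A small numeric check of that claim:

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.default_rng(1)
R = rng.random((200, 80))
P, sigma, Qt = randomized_svd(R, n_components=10, random_state=1)

PS = P * sigma
SPPS = PS.T.dot(PS)
print(np.abs(SPPS - np.diag(sigma ** 2)).max())  # ~1e-13: columns of P are orthonormal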
Example #11
def LP1SumToOneOptimize(initial_prediction_u, keyphrase_freq, affected_items,
                        unaffected_items, num_keyphrases, query, test_user,
                        item_latent, reg):

    critiqued_vector = np.zeros(keyphrase_freq[0].shape)

    for q in query:
        critiqued_vector[q] = -keyphrase_freq[test_user][q]

    num_critiques = len(query)

    W2 = reg.coef_
    W = item_latent.dot(W2)

    num_affected_items = len(affected_items)
    num_unaffected_items = len(unaffected_items)

    start_time = time.time()

    # Model
    m = Model("LP1SumToOneOptimize")

    # Assignment variables
    lambs = []
    for k in range(1 + num_critiques):
        lambs.append(
            m.addVar(lb=0, ub=1, vtype=GRB.CONTINUOUS, name="lamb%d" % k))

    m.addConstr((sum(lambs[k] for k in range(1 + num_critiques)) == 1),
                name="sum_to_one")

    m.setObjective(
        quicksum(lambs[0] * initial_prediction_u[affected_item] *
                 num_unaffected_items +
                 quicksum(lambs[k + 1] * critiqued_vector[query[k]] *
                          W[affected_item][query[k]] * num_unaffected_items
                          for k in range(num_critiques))
                 for affected_item in affected_items) -
        quicksum(lambs[0] * initial_prediction_u[unaffected_item] *
                 num_affected_items +
                 quicksum(lambs[k + 1] * critiqued_vector[query[k]] *
                          W[unaffected_item][query[k]] * num_affected_items
                          for k in range(num_critiques))
                 for unaffected_item in unaffected_items), GRB.MINIMIZE)

    # Optimize
    m.optimize()

    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    lambdas = []
    for k in range(1 + num_critiques):
        optimal_lambda = m.getVars()[k].X
        lambdas.append(optimal_lambda)

    for k in range(num_critiques):
        critiqued_vector[query[k]] *= lambdas[k + 1]

    critique_score = predict_scores(matrix_U=reg.predict(
        critiqued_vector.reshape(1, -1)),
                                    matrix_V=item_latent)
    new_prediction = lambdas[
        0] * initial_prediction_u + critique_score.flatten()

    return new_prediction, lambdas
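The Gurobi model above is a plain linear program: the objective is linear in the lambdas, each lambda lives in [0, 1], and they must sum to one; the |affected| and |unaffected| counts balance the two sums when the sets differ in size. A minimal sketch of the same sum-to-one LP using scipy.optimize.linprog instead of Gurobi, assuming the linear objective coefficients have already been collected into a vector c (one entry per lambda):

import numpy as np
from scipy.optimize import linprog

def solve_sum_to_one_lp(c):
    """Minimize c @ lambdas subject to 0 <= lambda_k <= 1 and sum(lambdas) == 1."""
    n = len(c)
    result = linprog(c,
                     A_eq=np.ones((1, n)),   # sum-to-one constraint
                     b_eq=np.array([1.0]),
                     bounds=[(0.0, 1.0)] * n,
                     method='highs')
    return result.x

lambdas = solve_sum_to_one_lp(np.array([0.3, -1.2, 0.7]))
print(lambdas)  # the LP puts all weight on the most negative coefficient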
Example #12
def main(args):

    writer = SummaryWriter(log_dir=os.path.join('./logs', args.run_name))

    # Progress bar
    progress = WorkSplitter()
    if not os.path.exists("./checkpoint"):
        os.mkdir("./checkpoint")

    if args.emb_type == 'bert':
        emb_size = 768
    elif args.emb_type == 'xlmr':
        emb_size = 1024
    else:
        raise ValueError("emb_type must be 'bert' or 'xlmr'")

    # Load Data
    progress.section("Load Data")
    print("Embedding size is set to", emb_size)
    start_time = time.time()
    data = Data(args, args.path, args.train, args.valid, emb_size)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    if args.network_architecture == 'embedding_net':
        model = EmbeddingNet(data.n_token,
                             data.n_feature,
                             emb_size, [1024, 2000, 1000, 500, 100],
                             corruption=args.corruption)
    elif args.network_architecture == 'embedding_highway_net':
        model = EmbeddingHighWayNet(data.n_token, data.n_feature, emb_size,
                                    [1024, 2000, 1000, 500, 100])
    else:
        raise NotImplementedError(
            'either use embedding_net or embedding_highway_net')
    model.cuda()
    print(model)

    criterion = nn.BCEWithLogitsLoss()
    optim = torch.optim.AdamW(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.lamb)
    valid_loader = data.instance_a_valid_loader(args.batch)
    # train_loader = data.instance_a_train_loader(args.batch)

    global_step = 0
    progress.section("Train model")
    for epoch in range(1, args.epoch):

        total_loss = 0
        epoch_step = 0
        model.train()

        start_split = time.time()

        for split_i in range(args.num_splits):

            train_loader = data.instance_a_train_loader(args.batch)

            epoch_iterator = tqdm(train_loader, desc="Iteration")
            for _, batch in enumerate(epoch_iterator):
                token, feature, label, embedding = (
                    batch[0].float().cuda(), batch[1].float().cuda(),
                    batch[2].float().cuda(), batch[3].float().cuda())
                optim.zero_grad()
                logit = model(token, feature, embedding)
                loss = criterion(logit, label)
                loss.backward()
                optim.step()
                total_loss += loss.item()

                if global_step % 5000 == 0:
                    writer.add_scalar('Loss/train_running_avg',
                                      total_loss / (epoch_step + 1),
                                      global_step)
                    writer.add_scalar('Loss/train_batch', loss.item(),
                                      global_step)

                global_step += 1
                epoch_step += 1

            del train_loader
            gc.collect()

        print("This split took {} seconds ...".format(time.time() -
                                                      start_split))
        print("epoch{0} loss:{1:.4f}".format(epoch, total_loss))

        if epoch % 1 == 0:

            model.eval()

            with torch.no_grad():
                preds, labels = [], []
                valid_iterator = tqdm(valid_loader, desc="Validation")
                for _, batch in enumerate(valid_iterator):
                    token, feature, label, embedding = (
                        batch[0].float().cuda(), batch[1].float().cuda(),
                        batch[2], batch[3].float().cuda())
                    pred = torch.sigmoid(model(
                        token, feature, embedding)).detach().cpu().numpy()
                    labels.append(label)
                    preds.append(pred)

            labels = np.vstack(labels)
            preds = np.float64(np.vstack(preds))

            prauc_all = []
            rce_all = []

            for i, engage in enumerate(["reply", "retweet", "comment",
                                        "like"]):
                _prauc = compute_prauc(preds[:, i], labels[:, i])
                _rce = compute_rce(preds[:, i], labels[:, i])

                print(engage + ":")
                print(_prauc)
                print(_rce)
                prauc_all.append(_prauc)
                rce_all.append(_rce)

                writer.add_scalar('PRAUC/{}_val'.format(engage), _prauc, epoch)
                writer.add_scalar('RCE/{}_val'.format(engage), _rce, epoch)

            writer.add_scalar('PRAUC/mean_val', np.mean(prauc_all), epoch)
            writer.add_scalar('RCE/mean_val', np.mean(rce_all), epoch)

            torch.save(model.state_dict(),
                       "./checkpoint/{}_{}.ckpt".format(args.run_name, epoch))

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
Example #13
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.data_dir))
    print("Train File Name: {0}".format(args.train_set))
    if args.validation:
        print("Valid File Name: {0}".format(args.valid_set))
    print("Algorithm: {0}".format(args.model))
    if args.item:
        mode = "Item-based"
    else:
        mode = "User-based"
    print("Normalize: {0}".format(args.normalize))
    print("Mode: {0}".format(mode))
    print("Alpha: {0}".format(args.alpha))
    print("Rank: {0}".format(args.rank))
    print("Mode Dimension: {0}".format(args.mode_dim))
    print("Key Dimension: {0}".format(args.key_dim))
    print("Batch Size: {0}".format(args.batch_size))
    print("Optimizer: {0}".format(args.optimizer))
    print("Learning Rate: {0}".format(args.learning_rate))
    print("Lambda: {0}".format(args.lamb))
    print("SVD/Alter Iteration: {0}".format(args.iteration))
    print("Epoch: {0}".format(args.epoch))
    print("Corruption: {0}".format(args.corruption))
    print("Root: {0}".format(args.root))
    print("Evaluation Ranking Topk: {0}".format(args.topk))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    if args.shape is None:
        R_train = load_numpy(path=args.data_dir, name=args.train_set)
    else:
        # R_train = load_pandas(path=args.data_dir, name=args.train_set, shape=args.shape)
        R_train = load_csv(path=args.data_dir,
                           name=args.train_set,
                           shape=args.shape)

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    print("Train U-I Dimensions: {0}".format(R_train.shape))

    # Item-Item or User-User
    if args.item:
        RQ, Yt, Bias = models[args.model](R_train,
                                          embedded_matrix=np.empty((0)),
                                          mode_dim=args.mode_dim,
                                          key_dim=args.key_dim,
                                          batch_size=args.batch_size,
                                          optimizer=args.optimizer,
                                          learning_rate=args.learning_rate,
                                          normalize=args.normalize,
                                          iteration=args.iteration,
                                          epoch=args.epoch,
                                          rank=args.rank,
                                          corruption=args.corruption,
                                          gpu_on=args.gpu,
                                          lamb=args.lamb,
                                          alpha=args.alpha,
                                          seed=args.seed,
                                          root=args.root)
        Y = Yt.T
    else:
        Y, RQt, Bias = models[args.model](R_train.T,
                                          embedded_matrix=np.empty((0)),
                                          mode_dim=args.mode_dim,
                                          key_dim=args.key_dim,
                                          batch_size=args.batch_size,
                                          optimizer=args.optimizer,
                                          learning_rate=args.learning_rate,
                                          normalize=args.normalize,
                                          iteration=args.iteration,
                                          rank=args.rank,
                                          corruption=args.corruption,
                                          gpu_on=args.gpu,
                                          lamb=args.lamb,
                                          alpha=args.alpha,
                                          seed=args.seed,
                                          root=args.root)
        RQ = RQt.T

    # np.save('latent/U_{0}_{1}'.format(args.model, args.rank), RQ)
    # np.save('latent/V_{0}_{1}'.format(args.model, args.rank), Y)
    # if Bias is not None:
    #     np.save('latent/B_{0}_{1}'.format(args.model, args.rank), Bias)

    progress.section("Predict")
    prediction = predict(matrix_U=RQ,
                         matrix_V=Y,
                         bias=Bias,
                         topK=args.topk,
                         matrix_Train=R_train,
                         measure=args.sim_measure,
                         gpu=args.gpu)
    if args.validation:
        progress.section("Create Metrics")
        start_time = time.time()

        metric_names = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision']
        R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
        result = evaluate(prediction, R_valid, metric_names, [args.topk])
        print("-")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
Example #14
def main(args):
    # Progress bar
    progress = WorkSplitter()
    if not os.path.exists("./checkpoint"):
        os.mkdir("./checkpoint")

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    data = Data(args.path, args.train, args.valid)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    model = FeatureNet(data.n_token,
                       data.n_feature, [1024, 2000, 1000, 500, 100],
                       corruption=args.corruption)
    model.cuda()
    print(model)

    criterion = nn.BCEWithLogitsLoss()
    optim = torch.optim.AdamW(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.lamb)
    valid_loader = data.instance_a_valid_loader(args.batch)
    train_loader = data.instance_a_train_loader(args.batch)

    progress.section("Train model")
    for epoch in range(1, args.epoch):

        total_loss = 0
        model.train()
        epoch_iterator = tqdm(train_loader, desc="Iteration")
        for _, batch in enumerate(epoch_iterator):
            token, feature, label = (batch[0].float().cuda(),
                                     batch[1].float().cuda(),
                                     batch[2].float().cuda())
            optim.zero_grad()
            logit = model(token, feature)
            loss = criterion(logit, label)
            loss.backward()
            optim.step()
            total_loss += loss.item()

        print("epoch{0} loss:{1:.4f}".format(epoch, total_loss))
        if epoch % 1 == 0:
            model.eval()
            with torch.no_grad():
                preds, labels = [], []
                valid_iterator = tqdm(valid_loader, desc="Validation")
                for _, batch in enumerate(valid_iterator):
                    token, feature, label = (batch[0].float().cuda(),
                                             batch[1].float().cuda(),
                                             batch[2])
                    pred = torch.sigmoid(model(
                        token, feature)).detach().cpu().numpy()
                    labels.append(label)
                    preds.append(pred)

            labels = np.vstack(labels)
            preds = np.float64(np.vstack(preds))

            for i, engage in enumerate(["reply", "retweet", "comment",
                                        "like"]):
                print(engage + ":")
                print(compute_prauc(preds[:, i], labels[:, i]))
                print(compute_rce(preds[:, i], labels[:, i]))
            torch.save(model.state_dict(),
                       "./checkpoint/{}_{}.ckpt".format(args.run_name, epoch))

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
Example #15
if not os.path.exists("./data"):
    os.mkdir("./data")

train_dict = generate_dict_np(train_path)
t = {'tweet_ids': train_dict['tweet_ids']}

with open('./data/Train_tid.sav', 'wb') as f:
    joblib.dump(t, f)

## Create scalers
scaler_f = PowerTransformer(copy=False)
start_time = time.time()
s = len(train_dict['features'])
scaler_f.fit(train_dict['features'][np.random.choice(s, int(0.1 * s))].astype(np.float64, copy=False))
print("Elapsed: {0}".format(inhour(time.time() - start_time)))
print("fit feature scaler")

scaler_t = MinMaxScaler(copy=False)
start_time = time.time()
scaler_t.fit(train_dict['tokens'][np.random.choice(s, int(0.1 * s))])
print("Elapsed: {0}".format(inhour(time.time() - start_time)))
print("fit token scaler")

## Save scalers
with open('./data/f_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_f, f, protocol=4)

with open('./data/t_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_t, f, protocol=4)
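A matching sketch for inference time: load the pickled scalers back and apply them to new feature/token arrays. The arrays here are placeholders; their widths must match whatever generate_dict_np produced at fit time:

import pickle
import numpy as np

with open('./data/f_scaler.pkl', 'rb') as f:
    scaler_f = pickle.load(f)
with open('./data/t_scaler.pkl', 'rb') as f:
    scaler_t = pickle.load(f)

features = np.random.rand(4, 8)  # placeholder; width must match training features
tokens = np.random.rand(4, 8)    # placeholder; width must match training tokens
features = scaler_f.transform(features.astype(np.float64, copy=False))
tokens = scaler_t.transform(tokens)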
Example #16
def mmp(matrix_train, embedded_matrix=np.empty((0)), mode_dim=5, key_dim=3,
        batch_size=32, optimizer="Adam", learning_rate=0.001, normalize=True,
        iteration=4, epoch=20, lamb=100, rank=200, corruption=0.5, fb=False,
        seed=1, root=1, alpha=1, return_model=False, **unused):
    """
    PureSVD algorithm
    :param matrix_train: rating matrix
    :param embedded_matrix: item or user embedding matrix(side info)
    :param iteration: number of random SVD iterations
    :param rank: SVD top K eigenvalue ranks
    :param fb: facebook package or sklearn package. boolean
    :param seed: Random initialization seed
    :param unused: args that not applicable for this algorithm
    :return:
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embedded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embedded_matrix.T))
    progress.subsection("Create PMI matrix")
    pmi_matrix = get_pmi_matrix(matrix_input, root)
    progress.subsection("Randomized SVD")
    start_time = time.time()
    if fb:
        P, sigma, Qt = pca(pmi_matrix,
                           k=rank,
                           n_iter=iteration,
                           raw=True)
    else:
        P, sigma, Qt = randomized_svd(pmi_matrix,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)
    Q = Qt.T*np.sqrt(sigma)
    # TODO: Verify this. Seems better with this.
    if normalize:
        Q = (Q - np.mean(Q)) / np.std(Q)

    # Type has to match with Tensorflow graph implementation which uses float32
    if isinstance(Q[0][0], np.float64):
        Q = np.float32(Q)

    model = MultiModesPreferenceEstimation(input_dim=matrix_train.shape[1],
                                           embed_dim=rank,
                                           mode_dim=mode_dim,
                                           key_dim=key_dim,
                                           batch_size=batch_size,
                                           alpha=alpha,
                                           lamb=lamb,
                                           learning_rate=learning_rate,
                                           optimizer=Optimizer[optimizer],
                                           item_embeddings=Q)
    model.train_model(matrix_train, corruption, epoch)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    if return_model:
        return model

    RQ = model.get_RQ(matrix_input)
    Y = model.get_Y()
    #Bias = model.get_Bias()
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y.T, None
Example #17
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.path))
    print("Train File Name: {0}".format(args.dataset + args.train))
    print("Uniform Train File Name: {0}".format(args.dataset +
                                                args.unif_train))
    print("Valid File Name: {0}".format(args.dataset + args.valid))
    print("Algorithm: {0}".format(args.model))
    print("Way: {0}".format(args.way))
    print("Seed: {0}".format(args.seed))
    print("Batch Size: {0}".format(args.batch_size))
    print("Rank: {0}".format(args.rank))
    print("Lambda: {0}".format(args.lamb))
    print("Iteration: {0}".format(args.iter))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()

    train = load_numpy(path=args.path, name=args.dataset + args.train)

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    print("Train U-I Dimensions: {0}".format(train.shape))

    # Train Model
    valid = load_numpy(path=args.path, name=args.dataset + args.valid)

    unif_train = load_numpy(path=args.path,
                            name=args.dataset + args.unif_train)

    if args.model in ['DeepAutoRec', 'HintAE', 'SoftLabelAE']:
        RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = models[args.model](
            train,
            valid,
            dataset=args.dataset,
            matrix_unif_train=unif_train,
            iteration=args.iter,
            rank=args.rank,
            rank2=args.rank2,
            gpu_on=args.gpu,
            lam=args.lamb,
            seed=args.seed,
            batch_size=args.batch_size,
            way=args.way,
            confidence=args.confidence,
            step=args.step,
            tau=args.tau)

        save_path = 'latent/' + args.dataset
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        if args.way is None:
            np.save(save_path + '/U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(save_path + '/Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(save_path + '/X_{0}_{1}'.format(args.model, args.rank), X)
            np.save(save_path + '/Z_{0}_{1}'.format(args.model, args.rank), Z)
            np.save(save_path + '/K_{0}_{1}'.format(args.model, args.rank), K)
            if xBias is not None:
                np.save(
                    save_path + '/xB_{0}_{1}'.format(args.model, args.rank),
                    xBias)
                np.save(
                    save_path + '/yB_{0}_{1}'.format(args.model, args.rank),
                    yBias)
                np.save(
                    save_path + '/zB_{0}_{1}'.format(args.model, args.rank),
                    zBias)
                np.save(
                    save_path + '/kB_{0}_{1}'.format(args.model, args.rank),
                    kBias)
        else:
            np.save(
                save_path + '/' + args.way +
                '_U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(
                save_path + '/' + args.way +
                '_Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(
                save_path + '/' + args.way +
                '_X_{0}_{1}'.format(args.model, args.rank), X)
            np.save(
                save_path + '/' + args.way +
                '_Z_{0}_{1}'.format(args.model, args.rank), Z)
            np.save(
                save_path + '/' + args.way +
                '_K_{0}_{1}'.format(args.model, args.rank), K)
            if xBias is not None:
                np.save(
                    save_path + '/' + args.way +
                    '_xB_{0}_{1}'.format(args.model, args.rank), xBias)
                np.save(
                    save_path + '/' + args.way +
                    '_yB_{0}_{1}'.format(args.model, args.rank), yBias)
                np.save(
                    save_path + '/' + args.way +
                    '_zB_{0}_{1}'.format(args.model, args.rank), zBias)
                np.save(
                    save_path + '/' + args.way +
                    '_kB_{0}_{1}'.format(args.model, args.rank), kBias)

        progress.section("Predict")
        prediction = predict(matrix_U=RQ,
                             matrix_V=K.T,
                             matrix_Valid=valid,
                             bias=yBias,
                             gpu=args.gpu)

        progress.section("Evaluation")
        start_time = time.time()
        metric_names = ['NLL', 'AUC']
        result = evaluate(prediction, valid, metric_names, gpu=args.gpu)

        print("----Final Result----")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    else:
        RQ, X, xBias, Y, yBias = models[args.model](
            train,
            valid,
            dataset=args.dataset,
            matrix_unif_train=unif_train,
            iteration=args.iter,
            rank=args.rank,
            gpu_on=args.gpu,
            lam=args.lamb,
            lam2=args.lamb2,
            seed=args.seed,
            batch_size=args.batch_size,
            way=args.way,
            confidence=args.confidence,
            step=args.step)

        save_path = 'latent/' + args.dataset
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        if args.way is None:
            np.save(save_path + '/U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(save_path + '/Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(save_path + '/X_{0}_{1}'.format(args.model, args.rank), X)
            if xBias is not None:
                np.save(
                    save_path + '/xB_{0}_{1}'.format(args.model, args.rank),
                    xBias)
                np.save(
                    save_path + '/yB_{0}_{1}'.format(args.model, args.rank),
                    yBias)
        else:
            np.save(
                save_path + '/' + args.way +
                '_U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(
                save_path + '/' + args.way +
                '_Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(
                save_path + '/' + args.way +
                '_X_{0}_{1}'.format(args.model, args.rank), X)
            if xBias is not None:
                np.save(
                    save_path + '/' + args.way +
                    '_xB_{0}_{1}'.format(args.model, args.rank), xBias)
                np.save(
                    save_path + '/' + args.way +
                    '_yB_{0}_{1}'.format(args.model, args.rank), yBias)

        progress.section("Predict")
        prediction = predict(matrix_U=RQ,
                             matrix_V=Y.T,
                             matrix_Valid=valid,
                             bias=yBias,
                             gpu=args.gpu)

        progress.section("Evaluation")
        start_time = time.time()
        metric_names = ['NLL', 'AUC']
        result = evaluate(prediction, valid, metric_names, gpu=args.gpu)

        print("----Final Result----")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
Example #18
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")

    print("Data Directory: {}".format(args.data_dir))
    print("Algorithm: {}".format(args.model))
    print("Optimizer: {}".format(args.optimizer))
    print("Corruption Rate: {}".format(args.corruption))
    print("Learning Rate: {}".format(args.learning_rate))
    print("Epoch: {}".format(args.epoch))
    print("Lambda L2: {}".format(args.lamb_l2))
    print("Lambda Keyphrase: {}".format(args.lamb_keyphrase))
    print("Lambda Latent: {}".format(args.lamb_latent))
    print("Lambda Rating: {}".format(args.lamb_rating))
    print("Beta: {}".format(args.beta))
    print("Rank: {}".format(args.rank))
    print("Train Batch Size: {}".format(args.train_batch_size))
    print("Predict Batch Size: {}".format(args.predict_batch_size))
    print("Evaluation Ranking Topk: {}".format(args.topk))
    print("Validation Enabled: {}".format(args.enable_validation))

    # Load Data
    progress.section("Load Data")
    start_time = time.time()

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Keyphrase U-S Dimensions: {}".format(R_train_keyphrase.shape))

    if args.enable_validation:
        R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
        R_valid_keyphrase = load_numpy(path=args.data_dir,
                                       name=args.valid_keyphrase_set)
    else:
        R_valid = load_numpy(path=args.data_dir, name=args.test_set)
        R_valid_keyphrase = load_numpy(path=args.data_dir,
                                       name=args.test_keyphrase_set)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    progress.section("Preprocess Keyphrase Frequency")
    start_time = time.time()

    R_train_keyphrase[R_train_keyphrase != 0] = 1
    R_valid_keyphrase[R_valid_keyphrase != 0] = 1
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    progress.section("Train")
    start_time = time.time()

    model = models[args.model](matrix_train=R_train,
                               epoch=args.epoch,
                               lamb_l2=args.lamb_l2,
                               lamb_keyphrase=args.lamb_keyphrase,
                               lamb_latent=args.lamb_latent,
                               lamb_rating=args.lamb_rating,
                               beta=args.beta,
                               learning_rate=args.learning_rate,
                               rank=args.rank,
                               corruption=args.corruption,
                               optimizer=args.optimizer,
                               matrix_train_keyphrase=R_train_keyphrase)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    progress.section("Predict")
    start_time = time.time()

    rating_score, keyphrase_score = model.predict(R_train.todense())
    prediction = predict(rating_score, args.topk, matrix_Train=R_train)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    if args.enable_evaluation:
        progress.section("Create Metrics")
        start_time = time.time()

        metric_names = [
            'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
        ]
        result = evaluate(prediction, R_valid, metric_names, [args.topk])

        print("-")
        for metric in result.keys():
            print("{}:{}".format(metric, result[metric]))

        if keyphrase_score is not None:
            keyphrase_prediction = predict_keyphrase(keyphrase_score,
                                                     args.topk)
            keyphrase_result = evaluate(keyphrase_prediction,
                                        sparse.csr_matrix(R_valid_keyphrase),
                                        metric_names, [args.topk])

            print("-")
            for metric in keyphrase_result.keys():
                print("{}:{}".format(metric, keyphrase_result[metric]))

        print("Elapsed: {}".format(inhour(time.time() - start_time)))

    model.sess.close()
    tf.reset_default_graph()
Example #19
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.path))
    print("Train File Name: {0}".format(args.train))
    if args.validation:
        print("Valid File Name: {0}".format(args.valid))
    print("Algorithm: {0}".format(args.model))
    if args.item:
        mode = "Item-based"
    else:
        mode = "User-based"
    print("Mode: {0}".format(mode))
    print("Alpha: {0}".format(args.alpha))
    print("Rank: {0}".format(args.rank))
    print("Lambda: {0}".format(args.lamb))
    print("SVD/Alter Iteration: {0}".format(args.iter))
    print("Evaluation Ranking Topk: {0}".format(args.topk))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    if args.shape is None:
        R_train = load_numpy(path=args.path, name=args.train)
    else:
        # R_train = load_pandas(path=args.path, name=args.train, shape=args.shape)
        R_train = load_csv(path=args.path, name=args.train, shape=args.shape)
    print "Elapsed: {0}".format(inhour(time.time() - start_time))

    print("Train U-I Dimensions: {0}".format(R_train.shape))

    # Item-Item or User-User
    if args.item:
        RQ, Yt, Bias = models[args.model](R_train,
                                          embeded_matrix=np.empty((0)),
                                          iteration=args.iter,
                                          rank=args.rank,
                                          corruption=args.corruption,
                                          lam=args.lamb,
                                          alpha=args.alpha,
                                          seed=args.seed,
                                          root=args.root)
        Y = Yt.T
    else:
        Y, RQt, Bias = models[args.model](R_train.T,
                                          embeded_matrix=np.empty((0)),
                                          iteration=args.iter,
                                          rank=args.rank,
                                          corruption=args.corruption,
                                          lam=args.lamb,
                                          alpha=args.alpha,
                                          seed=args.seed,
                                          root=args.root)
        RQ = RQt.T

    # Save Files
    # progress.section("Save U-V Matrix")
    # start_time = time.time()
    # save_mxnet(matrix=RQ, path=args.path+mode+'/',
    #            name='U_{0}_{1}_{2}'.format(args.rank, args.lamb, args.model))
    # save_mxnet(matrix=Y, path=args.path+mode+'/',
    #            name='V_{0}_{1}_{2}'.format(args.rank, args.lamb, args.model))
    # print "Elapsed: {0}".format(inhour(time.time() - start_time))

    np.save('latent/U_{0}_{1}'.format(args.model, args.rank), RQ)
    np.save('latent/V_{0}_{1}'.format(args.model, args.rank), Y)
    if Bias is not None:
        np.save('latent/B_{0}_{1}'.format(args.model, args.rank), Bias)

    progress.section("Predict")
    prediction = predict(matrix_U=RQ,
                         matrix_V=Y,
                         bias=Bias,
                         topK=args.topk,
                         matrix_Train=R_train,
                         measure=args.sim_measure,
                         gpu=True)
    if args.validation:
        progress.section("Create Metrics")
        start_time = time.time()

        metric_names = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision']
        R_valid = load_numpy(path=args.path, name=args.valid)
        result = evaluate(prediction, R_valid, metric_names, [args.topk])
        print("-")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print "Elapsed: {0}".format(inhour(time.time() - start_time))
Example #20
def main(args):
    writer = SummaryWriter(log_dir=os.path.join('./logs', args.run_name))

    if args.emb_type == 'bert':
        emb_size = 768
    elif args.emb_type == 'xlmr':
        emb_size = 1024
    else:
        raise ValueError("emb_type must be 'bert' or 'xlmr'")

    # Progress bar
    progress = WorkSplitter()
    if not os.path.exists("./checkpoint"):
        os.mkdir("./checkpoint")

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    print("Embedding size is set to", emb_size)
    data = Data(args, args.path, args.emb_path, args.train, args.valid, emb_size)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # build model
    progress.section("Build Model")
    model = EmbeddingNet(data.n_token, data.n_feature, emb_size, [1024, 2000, 1000, 500, 100],
                         corruption=args.corruption)
    model.cuda()
    # model.load_state_dict(torch.load("./checkpoint/featurenet_v12_8.ckpt"))
    print(model)

    criterion = nn.BCEWithLogitsLoss()
    optim = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.lamb)
    valid_loader = data.instance_a_valid_loader(args.batch)
    # train_loader = data.instance_a_train_loader(args.batch)

    global_step = 0
    progress.section("Train model")
    scores = []

    scores = validate(0, valid_loader, model, writer, scores, args)

    for epoch in range(1, args.epoch):

        total_loss = 0
        epoch_step = 0
        model.train()

        for split_i in range(args.num_splits):

            train_loader = data.instance_a_train_loader(args.batch)

            global_step, total_loss, epoch_step = train(epoch_step, global_step, train_loader, model, optim, criterion, total_loss, writer)

            del train_loader
            gc.collect()

        print("epoch{0} loss:{1:.4f}".format(epoch, total_loss))

        if epoch % 1 == 0:

            scores = validate(epoch, valid_loader, model, writer, scores, args)

    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
Example #21
    def start_critiquing(self):
        self.get_initial_predictions() 

        for user in tqdm(self.test_users):
            start_time = time.time()
            
            # User id starts from 0
            self.row['user_id'] = user

            initial_prediction_items = predict_vector(rating_vector=self.prediction_scores[user],
                                                      train_vector=self.matrix_Train[user],
                                                      remove_train=True)
            # For keyphrase selection method 'diff'
            top_recommended_keyphrase_freq = get_item_keyphrase_freq(
                self.item_keyphrase_freq, item=initial_prediction_items[0])
            
            # The iteration will stop if the wanted item is in top n
            for target_rank in self.target_ranks:
                self.row['target_rank'] = target_rank
                
                # Pick wanted items in test items
                candidate_items = self.matrix_Test[user].nonzero()[1]
                train_items = self.matrix_Train[user].nonzero()[1]
                wanted_items = np.setdiff1d(candidate_items, train_items)
                
                for item in wanted_items:
                    # Item id starts from 0
                    self.row['item_id'] = item
                    
                    ## Get item name
                    # try:
                    #     self.row['item_name'] = get_restaurant_name(df_train, self.business_df,item)
                    # except:
                    #     self.row['item_name'] = 'NOT_FOUND'
                    
                    # Set the wanted item's initial rank as None
                    self.row['item_rank'] = None
                    # Set the wanted item's initial prediction score as None
                    self.row['item_score'] = None
                    
                    if self.keyphrase_selection_method == "random" or self.keyphrase_selection_method == "pop":
                        # Get the item's existing keyphrases (we can boost)
                        try:
                            remaining_keyphrases = self.item_keyphrase_freq[item].nonzero()[1]
                        except:
                            remaining_keyphrases = np.ravel(self.item_keyphrase_freq[item].nonzero())
                    if self.keyphrase_selection_method == "diff":
                        # For keyphrase selection method 'diff' 
                        target_keyphrase_freq = get_item_keyphrase_freq(self.item_keyphrase_freq,item = item)
                        diff_keyphrase_freq = target_keyphrase_freq - top_recommended_keyphrase_freq
                        remaining_keyphrases = np.argsort(np.ravel(diff_keyphrase_freq))[::-1][:self.max_wanted_keyphrase]
                        
                    self.row['num_existing_keyphrases'] = len(remaining_keyphrases)
                    
                    if len(remaining_keyphrases) == 0:
                        break
                    
                    self.row['iteration'] = 0
                    self.row['critiqued_keyphrase'] = None
                    self.row['result'] = None
                    self.df = self.df.append(self.row, ignore_index=True)

                    query = []
                    affected_items = np.array([])
                    
                    # Set up latent embedding
                    user_latent_embedding = [self.Y[user]]
                    
                    for iteration in range(self.max_iteration_threshold):
                        self.row['iteration'] = iteration + 1            
                        
                        if self.keyphrase_selection_method == "pop":
                            # Always critique the least popular keyphrase
                            critiqued_keyphrase = remaining_keyphrases[np.argmin(self.keyphrase_popularity[remaining_keyphrases])]
                            
                        elif self.keyphrase_selection_method == "random":
                            critiqued_keyphrase = np.random.choice(remaining_keyphrases, size=1, replace=False)[0]
            
                        elif self.keyphrase_selection_method == "diff":
                            critiqued_keyphrase = remaining_keyphrases[0]
                        
                        self.row['critiqued_keyphrase'] = critiqued_keyphrase
                        self.row['critiqued_keyphrase_name'] = self.keyphrases_names[critiqued_keyphrase]
                        query.append(critiqued_keyphrase)

                        # Get affected items (items have critiqued keyphrase)
                        current_affected_items = self.item_keyphrase_freq[:, critiqued_keyphrase].nonzero()[0]
                        affected_items = np.unique(np.concatenate((affected_items, current_affected_items))).astype(int)
                        unaffected_items = np.setdiff1d(range(self.num_items), affected_items)

                        if iteration == 0:
                            prediction_items = initial_prediction_items  # computed once per user

                        # Positions in the current ranking of affected vs. unaffected items
                        affected_items_mask = np.in1d(prediction_items, affected_items)
                        affected_items_index_rank = np.where(affected_items_mask)
                        unaffected_items_index_rank = np.where(~affected_items_mask)

                        # Concatenate the critique embedding to the user latent embedding.
                        # Build the critique vector: the user's frequency for the critiqued
                        # keyphrase, floored at 1 so unmentioned keyphrases still carry signal.
                        critiqued_vector = np.zeros(self.keyphrase_freq.shape[1])
                        critiqued_vector[critiqued_keyphrase] = max(self.keyphrase_freq[user, critiqued_keyphrase], 1)
                        
                        # Map the user critique into the latent space
                        k_ci = self.reg.predict(critiqued_vector.reshape(1, -1)).flatten()
                        user_latent_embedding.append(k_ci)

                        prediction_scores_u, thetas = lpranksvm3(initial_prediction_u=self.prediction_scores[user],
                                                                 keyphrase_freq=copy.deepcopy(self.keyphrase_freq),
                                                                 affected_items=np.intersect1d(affected_items, prediction_items[affected_items_index_rank[0][:20]]),
                                                                 unaffected_items=np.intersect1d(unaffected_items, prediction_items[unaffected_items_index_rank[0][:20]]),
                                                                 num_keyphrases=self.num_keyphrases,
                                                                 query=query,
                                                                 test_user=user,
                                                                 item_latent=self.RQ,
                                                                 reg=self.reg,
                                                                 user_latent_embedding=user_latent_embedding,
                                                                 item_keyphrase_freq=self.item_keyphrase_freq,
                                                                 Y=self.Y,
                                                                 lamb=self.lamb)
                        self.row['theta'] = thetas
                        prediction_items = predict_vector(rating_vector=prediction_scores_u,
                                                          train_vector=self.matrix_Train[user],
                                                          remove_train=False)
                        recommended_items = prediction_items
                        
                        # Current item rank
                        item_rank = np.where(recommended_items == item)[0][0]

                        self.row['item_rank'] = item_rank
                        self.row['item_score'] = prediction_scores_u[item]

                        if item_rank + 1 <= target_rank:
                            # Item is ranked within the target rank
                            self.row['result'] = 'successful'
                            self.df = self.df.append(self.row, ignore_index=True)
                            break
                        else:
                            remaining_keyphrases = np.setdiff1d(remaining_keyphrases, critiqued_keyphrase)
                            # Continue if keyphrases and iterations still remain
                            if len(remaining_keyphrases) > 0 and self.row['iteration'] < self.max_iteration_threshold:
                                self.row['result'] = None
                                self.df = self.df.append(self.row, ignore_index=True)
                            else:
                                # Otherwise, mark fail
                                self.row['result'] = 'fail'
                                self.df = self.df.append(self.row, ignore_index=True)
                                break
        
            print("User", user ,"Elapsed: {}".format(inhour(time.time() - start_time)))
        return self.df
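# Note: the critique-to-latent mapping above depends on a pretrained
# regressor (self.reg) that projects a keyphrase vector into the item
# latent space. The model behind self.reg is not shown in this snippet;
# a minimal sketch of how such a projection could be fit, assuming ridge
# regression is an acceptable choice and that RQ (item latent factors)
# and item_keyphrase_freq are available:
def fit_keyphrase_to_latent_sketch(item_keyphrase_freq, RQ, lam=1.0):
    from sklearn.linear_model import Ridge
    # Learn W with keyphrase_freq . W ~= RQ, so that a (near) one-hot
    # critique vector maps to a vector comparable with user embeddings.
    reg = Ridge(alpha=lam, fit_intercept=False)
    reg.fit(item_keyphrase_freq, RQ)
    return reg
# Usage sketch: k_ci = reg.predict(critiqued_vector.reshape(1, -1)).flatten()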
Example #22
0
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyper parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.path))
    print("Train File Name: {0}".format(args.dataset + args.train))
    print("Uniform Train File Name: {0}".format(args.dataset + args.unif_train))
    print("Valid File Name: {0}".format(args.dataset + args.valid))
    print("Algorithm: {0}".format(args.model))
    print("Way: {0}".format(args.way))
    print("Seed: {0}".format(args.seed))
    print("Batch Size: {0}".format(args.batch_size))
    print("Rank: {0}".format(args.rank))
    print("Lambda: {0}".format(args.lamb))
    print("Iteration: {0}".format(args.iter))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()

    train = load_numpy(path=args.path, name=args.dataset + args.train)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    print("Train U-I Dimensions: {0}".format(train.shape))

    # Train Model
    valid = load_numpy(path=args.path, name=args.dataset + args.valid)
    unif_train = load_numpy(path=args.path, name=args.dataset + args.unif_train)
    RQ, Y, uBias, iBias = models[args.model](train, valid, dataset=args.dataset, matrix_unif_train=unif_train,
                                             iteration=args.iter, rank=args.rank, gpu_on=args.gpu, lam=args.lamb,
                                             lam2=args.lamb2, seed=args.seed, batch_size=args.batch_size, way=args.way,
                                             confidence=args.confidence, step=args.step)

    save_path = 'latent/' + args.dataset
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    if args.way is None:
        np.save(save_path + '/U_{0}_{1}'.format(args.model, args.rank), RQ)
        np.save(save_path + '/V_{0}_{1}'.format(args.model, args.rank), Y)
        if uBias is not None:
            np.save(save_path + '/uB_{0}_{1}'.format(args.model, args.rank), uBias)
            np.save(save_path + '/iB_{0}_{1}'.format(args.model, args.rank), iBias)
    else:
        np.save(save_path + '/' + args.way + '_U_{0}_{1}'.format(args.model, args.rank), RQ)
        np.save(save_path + '/' + args.way + '_V_{0}_{1}'.format(args.model, args.rank), Y)
        if uBias is not None:
            np.save(save_path + '/' + args.way + '_uB_{0}_{1}'.format(args.model, args.rank), uBias)
            np.save(save_path + '/' + args.way + '_iB_{0}_{1}'.format(args.model, args.rank), iBias)

    progress.section("Predict")
    prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=valid, ubias=uBias, ibias=iBias, gpu=args.gpu)

    progress.section("Evaluation")
    start_time = time.time()
    metric_names = ['NLL', 'AUC']
    result = evaluate(prediction, valid, metric_names, gpu=args.gpu)

    print("----Final Result----")
    for metric in result.keys():
        print("{0}:{1}".format(metric, result[metric]))
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
Example #23
0
def weighted_lrec_items(matrix_train,
                        embeded_matrix=np.empty((0)),
                        iteration=4,
                        lam=80,
                        rank=200,
                        alpha=100,
                        gpu=True,
                        seed=1,
                        **unused):
    """
    Function used to achieve generalized projected lrec w/o item-attribute embedding
    :param matrix_train: user-item matrix with shape m*n
    :param embeded_matrix: item-attribute matrix with length n (each row represents one item)
    :param iteration: number of SVD iterations
    :param lam: parameter of penalty
    :param rank: the latent dimension/number of items
    :param alpha: weights of the U-I ratings
    :param gpu: whether use gpu power
    :return: prediction in sparse matrix
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Randomized SVD")
    start_time = time.time()
    P, sigma, Qt = randomized_svd(matrix_input,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    start_time = time.time()
    if gpu:
        import cupy as cp
        progress.subsection("Create Cacheable Matrices")
        # RQ = matrix_input.dot(sparse.csc_matrix(Qt).T).toarray()

        # sqrt sigma injection
        RQ = matrix_input.dot(sparse.csc_matrix(Qt.T *
                                                np.sqrt(sigma))).toarray()

        # Exact
        matrix_B = cp.array(RQ)
        matrix_BT = matrix_B.T
        matrix_A = matrix_BT.dot(matrix_B) + cp.array(
            (lam * sparse.identity(rank, dtype=np.float32)).toarray())

        # Approx
        # matrix_A = cp.array(sparse.diags(sigma * sigma + lam).todense())
        # matrix_B = cp.array(P*sigma)
        # matrix_BT = cp.array(matrix_B.T)
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))

        progress.subsection("Item-wised Optimization")
        start_time = time.time()

        # For loop
        m, n = matrix_train.shape
        Y = []
        alpha = cp.array(alpha, dtype=cp.float32)
        for i in tqdm(range(n)):
            vector_r = matrix_train[:, i]
            vector_y = per_item_gpu(vector_r, matrix_A, matrix_B, matrix_BT,
                                    alpha)
            y_i_gpu = cp.asnumpy(vector_y)
            y_i_cpu = np.copy(y_i_gpu)
            Y.append(y_i_cpu)
        Y = np.vstack(Y)
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    else:
        progress.subsection("Create Cacheable Matrices")
        RQ = matrix_input.dot(sparse.csc_matrix(Qt).T).toarray()

        # Exact
        matrix_B = RQ
        matrix_BT = RQ.T
        matrix_A = matrix_BT.dot(matrix_B) + (
            lam * sparse.identity(rank, dtype=np.float32)).toarray()

        # Approx
        # matrix_B = P * sigma
        # matrix_BT = matrix_B.T
        # matrix_A = sparse.diags(sigma * sigma + lam).todense()
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))

        progress.subsection("Item-wised Optimization")
        start_time = time.time()

        # For loop
        m, n = matrix_train.shape
        Y = []
        for i in tqdm(range(n)):
            vector_r = matrix_train[:, i]
            vector_y = per_item_cpu(vector_r, matrix_A, matrix_B, matrix_BT,
                                    alpha)
            y_i_cpu = vector_y
            Y.append(y_i_cpu)
        Y = np.vstack(Y)
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    return RQ, Y.T, None
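# Note: per_item_gpu/per_item_cpu are not shown in this snippet. A
# plausible CPU-side closed-form solve consistent with the cached
# matrices above, assuming WRMF-style confidence weights
# c_u = 1 + alpha * r_u on binary ratings (an assumption, not the
# confirmed implementation; matrix_BT is kept to mirror the call site):
def per_item_cpu_sketch(vector_r, matrix_A, matrix_B, matrix_BT, alpha):
    import numpy as np
    # vector_r is one sparse column of matrix_train
    r = np.ravel(vector_r.todense())
    observed = r.nonzero()[0]
    B_S = matrix_B[observed]  # latent rows of users who rated this item
    # Solve (B^T B + lam*I + alpha * B_S^T B_S) y = (1 + alpha) * B_S^T r_S
    lhs = matrix_A + alpha * B_S.T.dot(B_S)
    rhs = (1 + alpha) * B_S.T.dot(r[observed])
    return np.linalg.solve(lhs, rhs)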
Example #24
0
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Active Learning Algorithm: {}".format(args.active_model))
    print("Recommendation Algorithm: {}".format(args.rec_model))
    print("GPU: {}".format(args.gpu))
    print("Iterative: {}".format(args.iterative))
    print("Sample From All: {}".format(args.sample_from_all))
    print("Train Valid Test Split Ratio: {}".format(args.ratio))
    print("Learning Rate: {}".format(args.learning_rate))
    print("Rank: {}".format(args.rank))
    print("Lambda: {}".format(args.lamb))
    print("Epoch: {}".format(args.epoch))
    print("Active Learning Iteration: {}".format(args.active_iteration))
    print("Evaluation Ranking Topk: {}".format(args.topk))
    print("UCB Confidence: {}".format(args.confidence_interval))
    print("Number of Item per Active Iteration: {}".format(args.num_item_per_iter))
    print("UCB Number of Latent Sampling: {}".format(args.num_latent_sampling))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    R_train = load_numpy(path=args.path, name=args.train)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_active = load_numpy(path=args.path, name=args.active)
    print("Active U-I Dimensions: {}".format(R_active.shape))

    R_test = load_numpy(path=args.path, name=args.test)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    train_index = int(R_test.shape[0]*args.ratio[0])

    progress.section("Preparing Data")
    matrix_train, matrix_active, matrix_test, _ = filter_users(R_train,
                                                               R_active,
                                                               R_test,
                                                               train_index=train_index,
                                                               active_threshold=2*args.num_item_per_iter*args.active_iteration,
                                                               test_threshold=2*args.topk)

    m, n = matrix_train.shape

    history_items = np.array([])

    model = rec_models[args.rec_model](observation_dim=n, latent_dim=args.rank,
                                       batch_size=128, lamb=args.lamb,
                                       learning_rate=args.learning_rate,
                                       optimizer=Regularizer[args.optimizer])

    progress.section("Training")
    model.train_model(matrix_train[:train_index], args.corruption, args.epoch)

    for i in range(args.active_iteration):
        print('This is step {}\n'.format(i))
        print('The number of ones in train set is {}'.format(len(matrix_train[train_index:].nonzero()[0])))
        print('The number of ones in active set is {}'.format(len(matrix_active[train_index:].nonzero()[0])))

        progress.section("Predicting")
        observation = active_models[args.active_model](model=model,
                                                       matrix=matrix_train[train_index:].A,
                                                       ci=args.confidence_interval,
                                                       num_latent_sampling=args.num_latent_sampling)

        progress.section("Update Train Set")
        matrix_train, history_items = update_matrix(history_items, matrix_train,
                                                    matrix_active, observation,
                                                    train_index, args.iterative,
                                                    args.sample_from_all,
                                                    args.num_item_per_iter,
                                                    args.active_iteration, args.gpu)

        if not args.iterative:
            break

#    matrix_train = matrix_train + matrix_active
    print('The number of ones in train set is {}'.format(len(matrix_train[train_index:].nonzero()[0])))

    progress.section("Re-Training")
    model.train_model(matrix_train, args.corruption, args.epoch)

    progress.section("Re-Predicting")
    observation = active_models['Greedy'](model=model, matrix=matrix_train.A)

    result = {}
    for topk in [5, 10, 15, 20, 50]:
        predict_items, _ = sampling_predict(prediction_scores=observation[train_index:],
                                            topK=topk,
                                            matrix_train=matrix_train[train_index:],
                                            matrix_active=matrix_active[train_index:],
                                            sample_from_all=True,
                                            iterative=False,
                                            history_items=np.array([]),
                                            gpu=args.gpu)

        progress.section("Create Metrics")
        result.update(eval(matrix_test[train_index:], topk, predict_items))

    print(result)

    model.sess.close()
    tf.reset_default_graph()
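# Note: active_models is not shown in this snippet. A minimal sketch of
# what a UCB-style entry could look like, assuming the model exposes a
# stochastic inference(matrix) call that returns one score matrix per
# latent sample (hypothetical method name; the real API may differ):
def ucb_sketch(model, matrix, ci=1.96, num_latent_sampling=10, **unused):
    import numpy as np
    # Score items by mean + ci * std across latent samples so that
    # uncertain items are preferred when querying the active set.
    samples = np.stack([model.inference(matrix) for _ in range(num_latent_sampling)])
    return samples.mean(axis=0) + ci * samples.std(axis=0)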