def plrec(matrix_train, embeded_matrix=np.empty((0)), iteration=4, lam=80, rank=200, seed=1, **unused):
    """
    Function used to achieve generalized projected lrec w/o item-attribute embedding

    :param matrix_train: user-item matrix with shape m*n
    :param embeded_matrix: item-attribute matrix with length n (each row represents one item)
    :param iteration: number of power iterations used inside randomized SVD
    :param lam: parameter of penalty (L2 regularization weight on the linear map)
    :param rank: latent dimension of the low-rank factorization
    :param seed: random state passed to randomized_svd for reproducibility
    :return: tuple (user latent matrix RQ, linear weight matrix Y, None bias)
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    # Optionally append item-attribute side information as extra "users".
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Randomized SVD")
    start_time = time.time()
    P, sigma, Qt = randomized_svd(matrix_input,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)
    # Project users onto the item factors, with singular values folded in.
    RQ = matrix_input.dot(sparse.csc_matrix(Qt.T * np.sqrt(sigma)))
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    progress.subsection("Closed-Form Linear Optimization")
    start_time = time.time()
    # Ridge-regression normal equations: (RQ^T RQ + lam*I)^-1 RQ^T X
    pre_inv = RQ.T.dot(RQ) + lam * sparse.identity(rank, dtype=np.float32)
    inverse = inv(pre_inv)
    Y = inverse.dot(RQ.T).dot(matrix_input)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    return np.array(RQ.todense()), np.array(Y.todense()), None
def plrec(matrix_train, iteration=4, lamb=80, rank=200, seed=1, **unused):
    """
    Function used to achieve generalized projected lrec w/o item-attribute embedding

    :param matrix_train: user-item matrix with shape m*n
    :param iteration: number of power iterations in randomized svd
    :param lamb: parameter of penalty
    :param rank: latent dimension size
    :param seed: the seed of the pseudo random number generator to use when shuffling the data
    :return: prediction in sparse matrix
    """
    splitter = WorkSplitter()

    splitter.subsection("Randomized SVD")
    tic = time.time()
    _, sigma, Qt = randomized_svd(matrix_train,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)
    # Fold singular values into item factors, then project users onto them.
    item_factors = sparse.csc_matrix(Qt.T * np.sqrt(sigma))
    RQ = matrix_train.dot(item_factors)
    print("Elapsed: {}".format(inhour(time.time() - tic)))

    splitter.subsection("Closed-Form Linear Optimization")
    tic = time.time()
    # Ridge regression in closed form: Y = (RQ^T RQ + lamb*I)^-1 RQ^T X
    regularized_gram = RQ.T.dot(RQ) + lamb * sparse.identity(rank, dtype=np.float32)
    Y = sparse.linalg.inv(regularized_gram.tocsc()).dot(RQ.T).dot(matrix_train)
    print("Elapsed: {}".format(inhour(time.time() - tic)))

    return np.array(RQ.todense()), np.array(Y.todense()), None
def main(args):
    """Train a diversity/serendipity-aware model and optionally evaluate it."""
    # Progress bar
    progress = WorkSplitter()

    # Echo hyper parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Train File Name: {}".format(args.train))
    if args.validation:
        print("Valid File Name: {}".format(args.valid))
    print("Algorithm: {}".format(args.model))
    print("Lambda Diversity: {}".format(args.lambda_diversity))
    print("Lambda Serendipity: {}".format(args.lambda_serendipity))
    print("Nearest Neighbor Number: {}".format(args.k))
    print("Evaluation Ranking Topk: {}".format(args.topk))

    # Load training interactions
    progress.section("Loading Data")
    tic = time.time()
    R_train = load_numpy(path=args.path, name=args.train)
    print("Elapsed: {}".format(inhour(time.time() - tic)))
    print("Train U-I Dimensions: {}".format(R_train.shape))

    progress.section("Train")
    model = models[args.model]()
    model.train(R_train)

    progress.section("Predict")
    scores = model.predict(R_train,
                           k=args.k,
                           lambda_diversity=args.lambda_diversity,
                           lambda_serendipity=args.lambda_serendipity)
    prediction = predict(prediction_score=scores,
                         topK=args.topk,
                         matrix_Train=R_train)

    if args.validation:
        progress.section("Create Metrics")
        tic = time.time()
        metric_names = [
            'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
        ]
        R_valid = load_numpy(path=args.path, name=args.valid)
        result = evaluate(prediction, R_valid, metric_names, [args.topk])
        print("-")
        for name, score in result.items():
            print("{}:{}".format(name, score))
        print("Elapsed: {}".format(inhour(time.time() - tic)))
def main(args):
    """Run leaderboard inference with a pretrained embedding network and dump per-engagement CSVs."""
    # Progress bar
    progress = WorkSplitter()

    progress.section("Load Data")
    if args.emb_type == 'bert':
        emb_size = 768
    elif args.emb_type == 'xlmr':
        emb_size = 1024
    else:
        # Previously fell through with emb_size unbound (UnboundLocalError at
        # the Data(...) call below); fail fast with a clear message instead.
        raise ValueError("Unknown emb_type: {}".format(args.emb_type))

    # Load Data
    start_time = time.time()
    print("WARNING: Embedding size is set to", emb_size)
    data = Data(args, args.path, args.train, args.valid, emb_size, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    if args.network_architecture == 'embedding_net':
        model = EmbeddingNet(data.n_token, data.n_feature, emb_size,
                             [1024, 2000, 1000, 500, 100],
                             corruption=args.corruption)
    elif args.network_architecture == 'embedding_highway_net':
        model = EmbeddingHighWayNet(data.n_token, data.n_feature, emb_size,
                                    [1024, 2000, 1000, 500, 100])
    else:
        raise NotImplementedError('either use embedding_net or embedding_highway_net')
    model.cuda()
    print(model)
    model.load_state_dict(torch.load(args.checkpoint))
    print(model)

    lb_loader = data.instance_a_lb_loader(args.batch)
    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token, feature, tweet_lb, user_lb, embedding = batch[0].float().cuda(), batch[1].float().cuda(), batch[2], batch[3], batch[4].float().cuda()
            pred = torch.sigmoid(model(token, feature, embedding)).detach().cpu().numpy()
            if "Valid" in args.valid:
                # NOTE(review): in this branch 'user_lb' is never filled, so
                # pd.DataFrame(lbs) would see unequal column lengths — confirm
                # the valid-set loader really yields flat tweet labels only.
                lbs['tweet_lb'] += tweet_lb
            else:
                lbs['tweet_lb'] += tweet_lb[0]
                lbs['user_lb'] += user_lb[0]
            preds.append(pred)

    final_csv = pd.DataFrame(lbs)
    preds = np.float64(np.vstack(preds))
    if not os.path.exists(args.spath):
        os.makedirs(args.spath)
    print("Generating CSVs...")
    # One CSV per engagement type, columns: tweet id, user id, probability.
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        final_csv[engage] = preds[:, i]
        final_csv[['tweet_lb', 'user_lb', engage]].to_csv(
            os.path.join(args.spath, engage + '.csv'), index=False, header=False)
def main(args):
    """Run the multi-step critiquing simulation and persist the results table."""
    # Progress bar
    progress = WorkSplitter()

    # Echo hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    # Load interaction matrices and keyphrase frequency matrices.
    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))
    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))
    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    # Pick the tuned hyperparameters for the requested model.
    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name, 'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    # Yelp stores the item-keyphrase matrix transposed.
    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    tic = time.time()
    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         keyphrase_selection_method=args.keyphrase_selection_method,
                         topk=args.topk,
                         lamb=args.lamb)
    print("Final Time Elapsed: {}".format(inhour(time.time() - tic)))

    table_path = load_yaml('config/global.yml', key='path')['tables']
    save_dataframe_csv(results, table_path, args.save_path)
def acf(matrix_train, embeded_matrix=np.empty((0)), epoch=300, iteration=100, lamb=80,
        rank=100, key_dim=3, batch_size=32, optimizer="Adam", learning_rate=0.001,
        seed=1, root=1, fb=False, **unused):
    """Attentive Collaborative Filtering initialized from a PMI-matrix SVD.

    :param matrix_train: user-item interaction matrix
    :param embeded_matrix: optional item side-information appended as rows
    :param epoch: number of training epochs for the ACF model
    :param iteration: number of power iterations for the randomized SVD
    :param lamb: L2 regularization weight
    :param rank: latent dimension
    :param key_dim: attention key dimension
    :param fb: use the fbpca package instead of sklearn's randomized_svd
    :param root: PMI-matrix root parameter
    :return: tuple (user latent RQ, item latent Y, None bias)
    """
    # Removed leftover debug print of (epoch, lamb, rank).
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Create PMI matrix")
    pmi_matrix = get_pmi_matrix(matrix_input, root)

    progress.subsection("Randomized SVD")
    start_time = time.time()
    if fb:
        P, sigma, Qt = pca(pmi_matrix, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(pmi_matrix,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)
    # Item embeddings initialized with singular values folded in.
    Q = Qt.T * np.sqrt(sigma)

    m, n = matrix_input.shape
    model = ACF(m, n, rank, key_dim, lamb=lamb, batch_size=batch_size,
                learning_rate=learning_rate, optimizer=Optimizer[optimizer],
                item_embeddings=Q)
    model.train_model(matrix_input, epoch)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    RQ = model.get_RQ()
    Y = model.get_Y().T
    model.sess.close()
    tf.reset_default_graph()
    return RQ, Y, None
def eval(matrix_valid, topk, prediction):
    """Evaluate a prediction against the validation matrix and print metrics.

    NOTE: the name shadows the builtin ``eval``; kept for caller compatibility.
    """
    import time
    from utils.progress import inhour

    tic = time.time()
    metric_names = [
        'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
    ]
    result = evaluate(prediction, matrix_valid, metric_names, [topk])
    print("-")
    for name, score in result.items():
        print("{0}:{1}".format(name, score))
    print("Elapsed: {0}".format(inhour(time.time() - tic)))
    return result
def main(args):
    """Run leaderboard inference with a pretrained FeatureNet and dump per-engagement CSVs."""
    # Progress bar
    progress = WorkSplitter()

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    data = Data(args.path, args.train, args.valid, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    model = FeatureNet(data.n_token, data.n_feature, [1024, 2000, 1000, 500, 100])
    # Original called model.cuda() twice (redundant no-op); once is enough.
    model.cuda()
    print(model)
    model.load_state_dict(torch.load(args.checkpoint))
    print(model)

    lb_loader = data.instance_a_lb_loader(args.batch)
    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model = model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token, feature, tweet_lb, user_lb = batch[0].float().cuda(), batch[1].float().cuda(), batch[2], batch[3]
            pred = torch.sigmoid(model(token, feature)).detach().cpu().numpy()
            lbs['tweet_lb'] += tweet_lb[0]
            lbs['user_lb'] += user_lb[0]
            preds.append(pred)

    final_csv = pd.DataFrame(lbs)
    preds = np.float64(np.vstack(preds))
    if not os.path.exists(args.spath):
        os.makedirs(args.spath)
    print("Generating CSVs...")
    # One CSV per engagement type, columns: tweet id, user id, probability.
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        final_csv[engage] = preds[:, i]
        final_csv[['tweet_lb', 'user_lb', engage]].to_csv(
            os.path.join(args.spath, engage + '.csv'), index=False, header=False)
def pmi_svd(matrix_train, embeded_matrix=np.empty((0)), iteration=4, rank=200,
            fb=False, seed=1, root=1.1, **unused):
    """
    PureSVD algorithm

    :param matrix_train: rating matrix
    :param embeded_matrix: item or user embedding matrix(side info)
    :param iteration: number of random SVD iterations
    :param rank: SVD top K eigenvalue ranks
    :param fb: facebook package or sklearn package. boolean
    :param seed: Random initialization seed
    :param unused: args that not applicable for this algorithm
    :return:
    """
    splitter = WorkSplitter()

    # Optionally stack side information onto the interaction matrix.
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    splitter.subsection("Create PMI matrix")
    pmi_matrix = get_pmi_matrix_gpu(matrix_input, root)

    splitter.subsection("Randomized SVD")
    tic = time.time()
    if fb:
        P, sigma, Qt = pca(pmi_matrix, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(pmi_matrix,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)
    print("Elapsed: {0}".format(inhour(time.time() - tic)))

    # Return the raw left/right factors; no bias term for this algorithm.
    return P, Qt, None
def chain_item_item(matrix_train, embeded_matrix=np.empty((0)), iteration=7,
                    rank=200, fb=True, seed=1, chain=1, **unused):
    """Chained item-item similarity propagation on top of a randomized SVD."""
    splitter = WorkSplitter()

    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    splitter.subsection("Randomized SVD")
    tic = time.time()
    if fb:
        P, sigma, Qt = pca(matrix_input, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(matrix_input,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)

    # Project users into the item-factor space.
    RQ = matrix_input.dot(sparse.csc_matrix(Qt).T).toarray()
    # Sigma-weighted left factors yield the propagation operator SPPS.
    PS = P * sigma
    SPPS = PS.T.dot(PS)
    HRQ = RQ.dot(SPPS)
    # Each extra chain hop propagates through Q^T Q then SPPS again.
    if chain > 1:
        QTQ = Qt.dot(Qt.T)
        for _ in range(chain - 1):
            HRQ = HRQ.dot(QTQ).dot(SPPS)
    print("Elapsed: {0}".format(inhour(time.time() - tic)))

    return HRQ, Qt, None
def LP1SumToOneOptimize(initial_prediction_u, keyphrase_freq, affected_items,
                        unaffected_items, num_keyphrases, query, test_user,
                        item_latent, reg):
    """Solve a Gurobi LP that reweights the initial prediction and each critique.

    Finds lambdas (summing to one) that push affected items down relative to
    unaffected items, then returns the blended prediction.

    :param initial_prediction_u: the user's current item score vector
    :param keyphrase_freq: per-user keyphrase frequency matrix
    :param affected_items: items that should move due to the critique
    :param unaffected_items: items that should stay put
    :param num_keyphrases: total keyphrase count (unused here; kept for interface)
    :param query: indices of critiqued keyphrases
    :param test_user: row index of the user in keyphrase_freq
    :param item_latent: item latent factor matrix
    :param reg: fitted linear regressor mapping keyphrases to the latent space
    :return: (new_prediction, lambdas) — blended scores and the optimal weights
    """
    # Negated frequencies encode "remove this keyphrase" critiques.
    critiqued_vector = np.zeros(keyphrase_freq[0].shape)
    for q in query:
        critiqued_vector[q] = -keyphrase_freq[test_user][q]
    num_critiques = len(query)
    # W maps each (item, keyphrase) pair to a score contribution.
    W2 = reg.coef_
    W = item_latent.dot(W2)
    num_affected_items = len(affected_items)
    num_unaffected_items = len(unaffected_items)
    start_time = time.time()

    # Model
    m = Model("LP1SumToOneOptimize")

    # Assignment variables: lambda_0 weights the original prediction,
    # lambda_{k+1} weights critique k. All constrained to [0, 1].
    lambs = []
    for k in range(1 + num_critiques):
        lambs.append(
            m.addVar(lb=0, ub=1, vtype=GRB.CONTINUOUS, name="lamb%d" % k))
    m.addConstr((sum(lambs[k] for k in range(1 + num_critiques)) == 1),
                name="sum_to_one")

    # Objective: minimize affected-item scores relative to unaffected-item
    # scores; cross-multiplying by the opposite group's size balances the
    # two sums when the groups differ in size.
    m.setObjective(
        quicksum(lambs[0] * initial_prediction_u[affected_item] *
                 num_unaffected_items +
                 quicksum(lambs[k + 1] * critiqued_vector[query[k]] *
                          W[affected_item][query[k]] * num_unaffected_items
                          for k in range(num_critiques))
                 for affected_item in affected_items) -
        quicksum(lambs[0] * initial_prediction_u[unaffected_item] *
                 num_affected_items +
                 quicksum(lambs[k + 1] * critiqued_vector[query[k]] *
                          W[unaffected_item][query[k]] * num_affected_items
                          for k in range(num_critiques))
                 for unaffected_item in unaffected_items), GRB.MINIMIZE)

    # Optimize
    m.optimize()
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    # Extract optimal weights in variable order (lambda_0 first).
    lambdas = []
    for k in range(1 + num_critiques):
        optimal_lambda = m.getVars()[k].X
        lambdas.append(optimal_lambda)

    # Scale each critique by its optimal weight before re-predicting.
    for k in range(num_critiques):
        critiqued_vector[query[k]] *= lambdas[k + 1]
    critique_score = predict_scores(matrix_U=reg.predict(
        critiqued_vector.reshape(1, -1)),
                                    matrix_V=item_latent)
    new_prediction = lambdas[
        0] * initial_prediction_u + critique_score.flatten()
    return new_prediction, lambdas
def main(args):
    """Train the embedding network over data splits, validating and checkpointing each epoch."""
    writer = SummaryWriter(log_dir=os.path.join('./logs', args.run_name))
    # Progress bar
    progress = WorkSplitter()

    if not os.path.exists("./checkpoint"):
        os.mkdir("./checkpoint")

    # Embedding size is fixed by the pretrained encoder family.
    # NOTE(review): emb_size is unbound for any other emb_type — confirm the
    # argparser restricts choices to {'bert', 'xlmr'}.
    if args.emb_type == 'bert':
        emb_size = 768
    elif args.emb_type == 'xlmr':
        emb_size = 1024

    # Load Data
    progress.section("Load Data")
    print("Embedding size is set to", emb_size)
    start_time = time.time()
    data = Data(args, args.path, args.train, args.valid, emb_size)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    if args.network_architecture == 'embedding_net':
        model = EmbeddingNet(data.n_token, data.n_feature, emb_size,
                             [1024, 2000, 1000, 500, 100],
                             corruption=args.corruption)
    elif args.network_architecture == 'embedding_highway_net':
        model = EmbeddingHighWayNet(data.n_token, data.n_feature, emb_size,
                                    [1024, 2000, 1000, 500, 100])
    else:
        raise NotImplementedError(
            'either use embedding_net or embedding_highway_net')
    model.cuda()
    print(model)

    criterion = nn.BCEWithLogitsLoss()
    optim = torch.optim.AdamW(model.parameters(), lr=args.lr,
                              weight_decay=args.lamb)
    valid_loader = data.instance_a_valid_loader(args.batch)
    # train_loader = data.instance_a_train_loader(args.batch)
    global_step = 0
    progress.section("Train model")
    for epoch in range(1, args.epoch):
        total_loss = 0
        epoch_step = 0
        model.train()
        start_split = time.time()
        # The train set is sharded; a fresh loader is built per split to
        # bound memory, and freed (del + gc) once the split is consumed.
        for split_i in range(args.num_splits):
            train_loader = data.instance_a_train_loader(args.batch)
            epoch_iterator = tqdm(train_loader, desc="Iteration")
            for _, batch in enumerate(epoch_iterator):
                token, feature, label, embedding = batch[0].float().cuda(
                ), batch[1].float().cuda(), batch[2].float().cuda(
                ), batch[3].float().cuda()  #, batch[3].cuda()
                optim.zero_grad()
                logit = model(token, feature, embedding)
                loss = criterion(logit, label)
                loss.backward()
                optim.step()
                total_loss += loss.item()
                # Log running-average and per-batch loss every 5000 steps.
                if global_step % 5000 == 0:
                    writer.add_scalar('Loss/train_running_avg',
                                      total_loss / (epoch_step + 1),
                                      global_step)
                    writer.add_scalar('Loss/train_batch', loss.item(),
                                      global_step)
                global_step += 1
                epoch_step += 1
            del train_loader
            gc.collect()
            print("This split took {} seconds ...".format(time.time() -
                                                          start_split))
        print("epoch{0} loss:{1:.4f}".format(epoch, total_loss))
        # Validate every epoch (the % 1 guard is a leftover knob).
        if epoch % 1 == 0:
            model.eval()
            with torch.no_grad():
                preds, labels = [], []
                valid_iterator = tqdm(valid_loader, desc="Validation")
                for _, batch in enumerate(valid_iterator):
                    token, feature, label, embedding = batch[0].float().cuda(
                    ), batch[1].float().cuda(), batch[2], batch[3].float(
                    ).cuda()  #,batch[3].cuda()
                    pred = torch.sigmoid(model(
                        token, feature, embedding)).detach().cpu().numpy()
                    labels.append(label)
                    preds.append(pred)
                labels = np.vstack(labels)
                preds = np.float64(np.vstack(preds))
                # PRAUC/RCE per engagement head, plus their means.
                prauc_all = []
                rce_all = []
                for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
                    _prauc = compute_prauc(preds[:, i], labels[:, i])
                    _rce = compute_rce(preds[:, i], labels[:, i])
                    print(engage + ":")
                    print(_prauc)
                    print(_rce)
                    prauc_all.append(_prauc)
                    rce_all.append(_rce)
                    writer.add_scalar('PRAUC/{}_val'.format(engage), _prauc,
                                      epoch)
                    writer.add_scalar('RCE/{}_val'.format(engage), _rce,
                                      epoch)
                writer.add_scalar('PRAUC/mean_val', np.mean(prauc_all), epoch)
                writer.add_scalar('RCE/mean_val', np.mean(rce_all), epoch)
        # Checkpoint after every epoch.
        torch.save(model.state_dict(),
                   "./checkpoint/{}_{}.ckpt".format(args.run_name, epoch))
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
def main(args):
    """Train an item- or user-based latent model, predict top-k, and optionally evaluate."""
    # Progress bar
    progress = WorkSplitter()

    # Show hyper parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.data_dir))
    print("Train File Name: {0}".format(args.train_set))
    if args.validation:
        print("Valid File Name: {0}".format(args.valid_set))
    print("Algorithm: {0}".format(args.model))
    if args.item == True:
        mode = "Item-based"
    else:
        mode = "User-based"
    print("Normalize: {0}".format(args.normalize))
    print("Mode: {0}".format(mode))
    print("Alpha: {0}".format(args.alpha))
    print("Rank: {0}".format(args.rank))
    print("Mode Dimension: {0}".format(args.mode_dim))
    print("Key Dimension: {0}".format(args.key_dim))
    print("Batch Size: {0}".format(args.batch_size))
    print("Optimizer: {0}".format(args.optimizer))
    print("Learning Rate: {0}".format(args.learning_rate))
    print("Lambda: {0}".format(args.lamb))
    print("SVD/Alter Iteration: {0}".format(args.iteration))
    print("Epoch: {0}".format(args.epoch))
    print("Corruption: {0}".format(args.corruption))
    print("Root: {0}".format(args.root))
    print("Evaluation Ranking Topk: {0}".format(args.topk))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    if args.shape is None:
        R_train = load_numpy(path=args.data_dir, name=args.train_set)
    else:
        # R_train = load_pandas(path=args.data_dir, name=args.train_set, shape=args.shape)
        R_train = load_csv(path=args.data_dir, name=args.train_set, shape=args.shape)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    print("Train U-I Dimensions: {0}".format(R_train.shape))

    # Item-Item or User-User: the user-based path trains on the transposed
    # matrix and swaps the returned factor roles back afterwards.
    if args.item == True:
        RQ, Yt, Bias = models[args.model](R_train,
                                          embedded_matrix=np.empty((0)),
                                          mode_dim=args.mode_dim,
                                          key_dim=args.key_dim,
                                          batch_size=args.batch_size,
                                          optimizer=args.optimizer,
                                          learning_rate=args.learning_rate,
                                          normalize=args.normalize,
                                          iteration=args.iteration,
                                          epoch=args.epoch,
                                          rank=args.rank,
                                          corruption=args.corruption,
                                          gpu_on=args.gpu,
                                          lamb=args.lamb,
                                          alpha=args.alpha,
                                          seed=args.seed,
                                          root=args.root)
        Y = Yt.T
    else:
        # NOTE(review): unlike the item-based call above, epoch=args.epoch is
        # not forwarded here — confirm whether that is intentional.
        Y, RQt, Bias = models[args.model](R_train.T,
                                          embedded_matrix=np.empty((0)),
                                          mode_dim=args.mode_dim,
                                          key_dim=args.key_dim,
                                          batch_size=args.batch_size,
                                          optimizer=args.optimizer,
                                          learning_rate=args.learning_rate,
                                          normalize=args.normalize,
                                          iteration=args.iteration,
                                          rank=args.rank,
                                          corruption=args.corruption,
                                          gpu_on=args.gpu,
                                          lamb=args.lamb,
                                          alpha=args.alpha,
                                          seed=args.seed,
                                          root=args.root)
        RQ = RQt.T

    # np.save('latent/U_{0}_{1}'.format(args.model, args.rank), RQ)
    # np.save('latent/V_{0}_{1}'.format(args.model, args.rank), Y)
    # if Bias is not None:
    #     np.save('latent/B_{0}_{1}'.format(args.model, args.rank), Bias)

    progress.section("Predict")
    prediction = predict(matrix_U=RQ,
                         matrix_V=Y,
                         bias=Bias,
                         topK=args.topk,
                         matrix_Train=R_train,
                         measure=args.sim_measure,
                         gpu=args.gpu)

    if args.validation:
        progress.section("Create Metrics")
        start_time = time.time()
        metric_names = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision']
        R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
        result = evaluate(prediction, R_valid, metric_names, [args.topk])
        print("-")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
def main(args):
    """Train FeatureNet on token/feature inputs, validating and checkpointing each epoch."""
    # Progress bar
    progress = WorkSplitter()

    if not os.path.exists("./checkpoint"):
        os.mkdir("./checkpoint")

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    data = Data(args.path, args.train, args.valid)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    model = FeatureNet(data.n_token, data.n_feature,
                       [1024, 2000, 1000, 500, 100],
                       corruption=args.corruption)
    model.cuda()
    print(model)

    # Multi-label objective: one sigmoid head per engagement type.
    criterion = nn.BCEWithLogitsLoss()
    optim = torch.optim.AdamW(model.parameters(), lr=args.lr,
                              weight_decay=args.lamb)
    valid_loader = data.instance_a_valid_loader(args.batch)
    train_loader = data.instance_a_train_loader(args.batch)
    progress.section("Train model")
    for epoch in range(1, args.epoch):
        total_loss = 0
        model.train()
        epoch_iterator = tqdm(train_loader, desc="Iteration")
        for _, batch in enumerate(epoch_iterator):
            token, feature, label = batch[0].float().cuda(), batch[1].float(
            ).cuda(), batch[2].float().cuda()  #, batch[3].cuda()
            optim.zero_grad()
            logit = model(token, feature)
            loss = criterion(logit, label)
            loss.backward()
            optim.step()
            total_loss += loss.item()
        print("epoch{0} loss:{1:.4f}".format(epoch, total_loss))
        # Validate every epoch (the % 1 guard is a leftover knob).
        if epoch % 1 == 0:
            model.eval()
            with torch.no_grad():
                preds, labels = [], []
                valid_iterator = tqdm(valid_loader, desc="Validation")
                for _, batch in enumerate(valid_iterator):
                    token, feature, label = batch[0].float().cuda(
                    ), batch[1].float().cuda(), batch[2]  #,batch[3].cuda()
                    pred = torch.sigmoid(model(
                        token, feature)).detach().cpu().numpy()
                    labels.append(label)
                    preds.append(pred)
                labels = np.vstack(labels)
                preds = np.float64(np.vstack(preds))
                # Print PRAUC and RCE per engagement head.
                for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
                    print(engage + ":")
                    print(compute_prauc(preds[:, i], labels[:, i]))
                    print(compute_rce(preds[:, i], labels[:, i]))
        # Checkpoint after every epoch.
        torch.save(model.state_dict(),
                   "./checkpoint/{}_{}.ckpt".format(args.run_name, epoch))
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
# Fit feature/token scalers on a 10% sample of the training data and persist
# them (plus tweet ids) under ./data for later preprocessing runs.
if not os.path.exists("./data"):
    os.mkdir("./data")

train_dict = generate_dict_np(train_path)
t = {'tweet_ids': train_dict['tweet_ids']}
with open('./data/Train_tid.sav', 'wb') as f:
    joblib.dump(t, f)

## Create scalers
scaler_f = PowerTransformer(copy=False)
start_time = time.time()
s = len(train_dict['features'])
# Fit on a random 10% subsample to keep fitting tractable.
# NOTE(review): np.random.choice samples with replacement by default and is
# unseeded — confirm this is acceptable for scaler fitting.
scaler_f.fit(train_dict['features'][np.random.choice(s, int(0.1*s))].astype(np.float64, copy=False))
print("Elapsed: {0}".format(inhour(time.time() - start_time)))
print("fit feature scaler")

scaler_t = MinMaxScaler(copy=False)
start_time = time.time()
scaler_t.fit(train_dict['tokens'][np.random.choice(s, int(0.1*s))])
print("Elapsed: {0}".format(inhour(time.time() - start_time)))
print("fit token scaler")

## Save scalers
with open('./data/f_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_f, f, protocol=4)
with open('./data/t_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_t, f, protocol=4)
def mmp(matrix_train, embedded_matrix=np.empty((0)), mode_dim=5, key_dim=3,
        batch_size=32, optimizer="Adam", learning_rate=0.001, normalize=True,
        iteration=4, epoch=20, lamb=100, rank=200, corruption=0.5, fb=False,
        seed=1, root=1, alpha=1, return_model=False, **unused):
    """Multi-Modes Preference Estimation initialized from a PMI-matrix SVD.

    (The previous docstring said "PureSVD algorithm" — copy-paste leftover;
    this trains an MMP network on item embeddings obtained from the SVD.)

    :param matrix_train: rating matrix
    :param embedded_matrix: item or user embedding matrix (side info)
    :param mode_dim: number of preference modes
    :param key_dim: attention key dimension
    :param normalize: standardize item embeddings before training
    :param iteration: number of random SVD iterations
    :param epoch: number of MMP training epochs
    :param lamb: L2 regularization weight
    :param rank: SVD top K eigenvalue ranks
    :param corruption: input corruption rate during training
    :param fb: facebook package or sklearn package. boolean
    :param seed: random initialization seed
    :param root: PMI-matrix root parameter
    :param return_model: if True, return the trained model instead of factors
    :param unused: args that are not applicable for this algorithm
    :return: (RQ, Y.T, None) factor matrices, or the model if return_model
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embedded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embedded_matrix.T))

    progress.subsection("Create PMI matrix")
    pmi_matrix = get_pmi_matrix(matrix_input, root)

    progress.subsection("Randomized SVD")
    start_time = time.time()
    if fb:
        P, sigma, Qt = pca(pmi_matrix, k=rank, n_iter=iteration, raw=True)
    else:
        P, sigma, Qt = randomized_svd(pmi_matrix,
                                      n_components=rank,
                                      n_iter=iteration,
                                      power_iteration_normalizer='QR',
                                      random_state=seed)
    Q = Qt.T*np.sqrt(sigma)

    # TODO: Verify this. Seems better with this.
    if normalize:
        Q = (Q - np.mean(Q)) / np.std(Q)

    # Type has to match with Tensorflow graph implementation which uses float32
    if isinstance(Q[0][0], np.float64):
        Q = np.float32(Q)

    model = MultiModesPreferenceEstimation(input_dim=matrix_train.shape[1],
                                           embed_dim=rank,
                                           mode_dim=mode_dim,
                                           key_dim=key_dim,
                                           batch_size=batch_size,
                                           alpha=alpha,
                                           lamb=lamb,
                                           learning_rate=learning_rate,
                                           optimizer=Optimizer[optimizer],
                                           item_embeddings=Q)
    model.train_model(matrix_train, corruption, epoch)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    if return_model:
        return model

    RQ = model.get_RQ(matrix_input)
    Y = model.get_Y()
    #Bias = model.get_Bias()
    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y.T, None
def main(args):
    """Train an unbiased-recommendation model, save its latent factors, and evaluate NLL/AUC."""
    # Progress bar
    progress = WorkSplitter()

    # Show hyper parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.path))
    print("Train File Name: {0}".format(args.dataset + args.train))
    print("Uniform Train File Name: {0}".format(args.dataset + args.unif_train))
    print("Valid File Name: {0}".format(args.dataset + args.valid))
    print("Algorithm: {0}".format(args.model))
    print("Way: {0}".format(args.way))
    print("Seed: {0}".format(args.seed))
    print("Batch Size: {0}".format(args.batch_size))
    print("Rank: {0}".format(args.rank))
    print("Lambda: {0}".format(args.lamb))
    print("Iteration: {0}".format(args.iter))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    train = load_numpy(path=args.path, name=args.dataset + args.train)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    print("Train U-I Dimensions: {0}".format(train.shape))

    # Train Model
    valid = load_numpy(path=args.path, name=args.dataset + args.valid)
    unif_train = load_numpy(path=args.path, name=args.dataset + args.unif_train)

    # Deep models return a 4-layer factor stack (X/Y/Z/K plus biases);
    # everything else returns a 2-layer stack (X/Y plus biases).
    if args.model in ['DeepAutoRec', 'HintAE', 'SoftLabelAE']:
        RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = models[args.model](
            train,
            valid,
            dataset=args.dataset,
            matrix_unif_train=unif_train,
            iteration=args.iter,
            rank=args.rank,
            rank2=args.rank2,
            gpu_on=args.gpu,
            lam=args.lamb,
            seed=args.seed,
            batch_size=args.batch_size,
            way=args.way,
            confidence=args.confidence,
            step=args.step,
            tau=args.tau)

        save_path = 'latent/' + args.dataset
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # Persist every latent matrix; file names are prefixed by args.way
        # when a specific training "way" was requested.
        if args.way is None:
            np.save(save_path + '/U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(save_path + '/Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(save_path + '/X_{0}_{1}'.format(args.model, args.rank), X)
            np.save(save_path + '/Z_{0}_{1}'.format(args.model, args.rank), Z)
            np.save(save_path + '/K_{0}_{1}'.format(args.model, args.rank), K)
            if xBias is not None:
                np.save(
                    save_path + '/xB_{0}_{1}'.format(args.model, args.rank),
                    xBias)
                np.save(
                    save_path + '/yB_{0}_{1}'.format(args.model, args.rank),
                    yBias)
                np.save(
                    save_path + '/zB_{0}_{1}'.format(args.model, args.rank),
                    zBias)
                np.save(
                    save_path + '/kB_{0}_{1}'.format(args.model, args.rank),
                    kBias)
        else:
            np.save(
                save_path + '/' + args.way +
                '_U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(
                save_path + '/' + args.way +
                '_Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(
                save_path + '/' + args.way +
                '_X_{0}_{1}'.format(args.model, args.rank), X)
            np.save(
                save_path + '/' + args.way +
                '_Z_{0}_{1}'.format(args.model, args.rank), Z)
            np.save(
                save_path + '/' + args.way +
                '_K_{0}_{1}'.format(args.model, args.rank), K)
            if xBias is not None:
                np.save(
                    save_path + '/' + args.way +
                    '_xB_{0}_{1}'.format(args.model, args.rank), xBias)
                np.save(
                    save_path + '/' + args.way +
                    '_yB_{0}_{1}'.format(args.model, args.rank), yBias)
                np.save(
                    save_path + '/' + args.way +
                    '_zB_{0}_{1}'.format(args.model, args.rank), zBias)
                np.save(
                    save_path + '/' + args.way +
                    '_kB_{0}_{1}'.format(args.model, args.rank), kBias)

        progress.section("Predict")
        # Deep path scores users against the deepest layer K.
        prediction = predict(matrix_U=RQ,
                             matrix_V=K.T,
                             matrix_Valid=valid,
                             bias=yBias,
                             gpu=args.gpu)

        progress.section("Evaluation")
        start_time = time.time()
        metric_names = ['NLL', 'AUC']
        result = evaluate(prediction, valid, metric_names, gpu=args.gpu)

        print("----Final Result----")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    else:
        RQ, X, xBias, Y, yBias = models[args.model](
            train,
            valid,
            dataset=args.dataset,
            matrix_unif_train=unif_train,
            iteration=args.iter,
            rank=args.rank,
            gpu_on=args.gpu,
            lam=args.lamb,
            lam2=args.lamb2,
            seed=args.seed,
            batch_size=args.batch_size,
            way=args.way,
            confidence=args.confidence,
            step=args.step)

        save_path = 'latent/' + args.dataset
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        if args.way is None:
            np.save(save_path + '/U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(save_path + '/Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(save_path + '/X_{0}_{1}'.format(args.model, args.rank), X)
            if xBias is not None:
                np.save(
                    save_path + '/xB_{0}_{1}'.format(args.model, args.rank),
                    xBias)
                np.save(
                    save_path + '/yB_{0}_{1}'.format(args.model, args.rank),
                    yBias)
        else:
            np.save(
                save_path + '/' + args.way +
                '_U_{0}_{1}'.format(args.model, args.rank), RQ)
            np.save(
                save_path + '/' + args.way +
                '_Y_{0}_{1}'.format(args.model, args.rank), Y)
            np.save(
                save_path + '/' + args.way +
                '_X_{0}_{1}'.format(args.model, args.rank), X)
            if xBias is not None:
                np.save(
                    save_path + '/' + args.way +
                    '_xB_{0}_{1}'.format(args.model, args.rank), xBias)
                np.save(
                    save_path + '/' + args.way +
                    '_yB_{0}_{1}'.format(args.model, args.rank), yBias)

        progress.section("Predict")
        prediction = predict(matrix_U=RQ,
                             matrix_V=Y.T,
                             matrix_Valid=valid,
                             bias=yBias,
                             gpu=args.gpu)

        progress.section("Evaluation")
        start_time = time.time()
        metric_names = ['NLL', 'AUC']
        result = evaluate(prediction, valid, metric_names, gpu=args.gpu)

        print("----Final Result----")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
def main(args):
    """Train a rating+keyphrase model and report ranking metrics.

    Pipeline: print hyperparameters, load rating and keyphrase matrices,
    binarize keyphrase frequencies, fit the model selected by ``args.model``,
    predict top-k items, and (optionally) evaluate both rating and keyphrase
    predictions. Closes the model's TF session and resets the default graph
    at the end.

    :param args: parsed command-line namespace; attributes read here include
        data_dir, model, optimizer, corruption, learning_rate, epoch,
        lamb_l2/lamb_keyphrase/lamb_latent/lamb_rating, beta, rank,
        train_batch_size, predict_batch_size, topk, enable_validation,
        enable_evaluation, and the *_set file names.
    """
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Algorithm: {}".format(args.model))
    print("Optimizer: {}".format(args.optimizer))
    print("Corruption Rate: {}".format(args.corruption))
    print("Learning Rate: {}".format(args.learning_rate))
    print("Epoch: {}".format(args.epoch))
    print("Lambda L2: {}".format(args.lamb_l2))
    print("Lambda Keyphrase: {}".format(args.lamb_keyphrase))
    print("Lambda Latent: {}".format(args.lamb_latent))
    print("Lambda Rating: {}".format(args.lamb_rating))
    print("Beta: {}".format(args.beta))
    print("Rank: {}".format(args.rank))
    print("Train Batch Size: {}".format(args.train_batch_size))
    print("Predict Batch Size: {}".format(args.predict_batch_size))
    print("Evaluation Ranking Topk: {}".format(args.topk))
    print("Validation Enabled: {}".format(args.enable_validation))

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))
    # Train keyphrase matrix is densified here; the valid/test one below is
    # kept sparse and wrapped in csr_matrix only at evaluation time.
    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Keyphrase U-S Dimensions: {}".format(R_train_keyphrase.shape))
    # When validation is disabled, the test split is evaluated instead.
    if args.enable_validation:
        R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
        R_valid_keyphrase = load_numpy(path=args.data_dir,
                                       name=args.valid_keyphrase_set)
    else:
        R_valid = load_numpy(path=args.data_dir, name=args.test_set)
        R_valid_keyphrase = load_numpy(path=args.data_dir,
                                       name=args.test_keyphrase_set)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    progress.section("Preprocess Keyphrase Frequency")
    start_time = time.time()
    # Binarize keyphrase frequencies: any nonzero count becomes 1.
    # NOTE(review): R_valid_keyphrase is presumably a scipy sparse matrix
    # (no .toarray() above) — masked assignment works but may warn; confirm.
    R_train_keyphrase[R_train_keyphrase != 0] = 1
    R_valid_keyphrase[R_valid_keyphrase != 0] = 1
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    progress.section("Train")
    start_time = time.time()
    model = models[args.model](matrix_train=R_train,
                               epoch=args.epoch,
                               lamb_l2=args.lamb_l2,
                               lamb_keyphrase=args.lamb_keyphrase,
                               lamb_latent=args.lamb_latent,
                               lamb_rating=args.lamb_rating,
                               beta=args.beta,
                               learning_rate=args.learning_rate,
                               rank=args.rank,
                               corruption=args.corruption,
                               optimizer=args.optimizer,
                               matrix_train_keyphrase=R_train_keyphrase)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    progress.section("Predict")
    start_time = time.time()
    # Model returns both rating scores and (optionally) keyphrase scores.
    rating_score, keyphrase_score = model.predict(R_train.todense())
    prediction = predict(rating_score, args.topk, matrix_Train=R_train)
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    # NOTE(review): this gate is enable_evaluation while the settings printout
    # above reports enable_validation — confirm both flags exist on args.
    if args.enable_evaluation:
        progress.section("Create Metrics")
        start_time = time.time()
        metric_names = [
            'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
        ]
        result = evaluate(prediction, R_valid, metric_names, [args.topk])
        print("-")
        for metric in result.keys():
            print("{}:{}".format(metric, result[metric]))

        # Evaluate keyphrase ranking only when the model produces it.
        if keyphrase_score is not None:
            keyphrase_prediction = predict_keyphrase(keyphrase_score, args.topk)
            keyphrase_result = evaluate(keyphrase_prediction,
                                        sparse.csr_matrix(R_valid_keyphrase),
                                        metric_names, [args.topk])
            print("-")
            for metric in keyphrase_result.keys():
                print("{}:{}".format(metric, keyphrase_result[metric]))
        print("Elapsed: {}".format(inhour(time.time() - start_time)))

    # Release TF resources so repeated calls do not accumulate graphs.
    model.sess.close()
    tf.reset_default_graph()
def main(args):
    """Train one latent-factor model, save its factors, and optionally evaluate.

    Loads the training matrix, fits the model selected by ``args.model`` in
    item-based or user-based mode, saves the resulting U/V (and bias) factors
    under ``latent/``, computes top-k predictions, and — when
    ``args.validation`` is set — reports ranking metrics on the validation set.

    :param args: parsed command-line namespace; attributes read here include
        path, train, valid, validation, model, item, alpha, rank, lamb, iter,
        topk, shape, corruption, seed, root and sim_measure.
    """
    # Progress bar
    progress = WorkSplitter()

    # Show hyper parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.path))
    print("Train File Name: {0}".format(args.train))
    if args.validation:
        print("Valid File Name: {0}".format(args.valid))
    print("Algorithm: {0}".format(args.model))
    # Item-based trains on the U-I matrix directly; user-based on its transpose.
    mode = "Item-based" if args.item else "User-based"
    print("Mode: {0}".format(mode))
    print("Alpha: {0}".format(args.alpha))
    print("Rank: {0}".format(args.rank))
    print("Lambda: {0}".format(args.lamb))
    print("SVD/Alter Iteration: {0}".format(args.iter))
    print("Evaluation Ranking Topk: {0}".format(args.topk))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    if args.shape is None:
        R_train = load_numpy(path=args.path, name=args.train)
    else:
        R_train = load_csv(path=args.path, name=args.train, shape=args.shape)
    # Fixed: these were Python 2 `print` statements — syntax errors under
    # Python 3, which the rest of this file targets.
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    print("Train U-I Dimensions: {0}".format(R_train.shape))

    # Item-Item or User-User
    if args.item:
        RQ, Yt, Bias = models[args.model](R_train, embeded_matrix=np.empty((0)),
                                          iteration=args.iter, rank=args.rank,
                                          corruption=args.corruption,
                                          lam=args.lamb, alpha=args.alpha,
                                          seed=args.seed, root=args.root)
        Y = Yt.T
    else:
        # User-based: train on the transpose, then swap the factors back.
        Y, RQt, Bias = models[args.model](R_train.T, embeded_matrix=np.empty((0)),
                                          iteration=args.iter, rank=args.rank,
                                          corruption=args.corruption,
                                          lam=args.lamb, alpha=args.alpha,
                                          seed=args.seed, root=args.root)
        RQ = RQt.T

    # Save latent factors. Create the output directory if missing so np.save
    # does not fail on a fresh checkout.
    os.makedirs('latent', exist_ok=True)
    np.save('latent/U_{0}_{1}'.format(args.model, args.rank), RQ)
    np.save('latent/V_{0}_{1}'.format(args.model, args.rank), Y)
    if Bias is not None:
        np.save('latent/B_{0}_{1}'.format(args.model, args.rank), Bias)

    progress.section("Predict")
    prediction = predict(matrix_U=RQ, matrix_V=Y, bias=Bias, topK=args.topk,
                         matrix_Train=R_train, measure=args.sim_measure,
                         gpu=True)

    if args.validation:
        progress.section("Create Metrics")
        start_time = time.time()
        metric_names = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision']
        R_valid = load_numpy(path=args.path, name=args.valid)
        result = evaluate(prediction, R_valid, metric_names, [args.topk])
        print("-")
        for metric in result.keys():
            print("{0}:{1}".format(metric, result[metric]))
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
def main(args): writer = SummaryWriter(log_dir=os.path.join('./logs', args.run_name)) if args.emb_type == 'bert': emb_size = 768 elif args.emb_type == 'xlmr': emb_size = 1024 # Progress bar progress = WorkSplitter() if not os.path.exists("./checkpoint"): os.mkdir("./checkpoint") # Load Data progress.section("Load Data") start_time = time.time() print("Embedding size is set to", emb_size) data = Data(args, args.path, args.emb_path, args.train, args.valid, emb_size) print("Elapsed: {0}".format(inhour(time.time() - start_time))) # build model progress.section("Build Model") model = EmbeddingNet(data.n_token, data.n_feature, emb_size, [1024, 2000, 1000, 500, 100], corruption=args.corruption) model.cuda() # model.load_state_dict(torch.load("./checkpoint/featurenet_v12_8.ckpt")) print(model) criterion = nn.BCEWithLogitsLoss() optim = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.lamb) valid_loader = data.instance_a_valid_loader(args.batch) # train_loader = data.instance_a_train_loader(args.batch) global_step = 0 progress.section("Train model") scores = [] scores = validate(0, valid_loader, model, writer, scores, args) for epoch in range(1, args.epoch): total_loss = 0 epoch_step = 0 model.train() for split_i in range(args.num_splits): train_loader = data.instance_a_train_loader(args.batch) global_step, total_loss, epoch_step = train(epoch_step, global_step, train_loader, model, optim, criterion, total_loss, writer) del train_loader gc.collect() print("epoch{0} loss:{1:.4f}".format(epoch, total_loss)) if epoch % 1 == 0: scores = validate(epoch, valid_loader, model, writer, scores, args) print("Elapsed: {0}".format(inhour(time.time() - start_time)))
def start_critiquing(self):
    """Run the conversational-critiquing simulation over all test users.

    For every test user and every wanted (held-out) item, repeatedly critiques
    one keyphrase per iteration — chosen by ``self.keyphrase_selection_method``
    ('pop', 'random' or 'diff') — re-ranks with lpranksvm3, and records each
    step into ``self.df``. A trial ends 'successful' when the wanted item
    reaches the target rank, or 'fail' when keyphrases/iterations run out.

    :return: ``self.df``, the accumulated per-step result DataFrame.
    """
    self.get_initial_predictions()

    for user in tqdm(self.test_users):
        start_time = time.time()
        # User id starts from 0
        self.row['user_id'] = user
        initial_prediction_items = predict_vector(
            rating_vector=self.prediction_scores[user],
            train_vector=self.matrix_Train[user],
            remove_train=True)
        # For keyphrase selection method 'diff': keyphrase profile of the
        # current top-1 recommendation, compared against the target item later.
        top_recommended_keyphrase_freq = get_item_keyphrase_freq(
            self.item_keyphrase_freq, item=initial_prediction_items[0])
        # The iteration will stop if the wanted item is in top n
        for target_rank in self.target_ranks:
            self.row['target_rank'] = target_rank
            # Pick wanted items in test items (held-out, not seen in training).
            candidate_items = self.matrix_Test[user].nonzero()[1]
            train_items = self.matrix_Train[user].nonzero()[1]
            wanted_items = np.setdiff1d(candidate_items, train_items)

            for item in wanted_items:
                # Item id starts from 0
                self.row['item_id'] = item
                # Set the wanted item's initial rank as None
                self.row['item_rank'] = None
                # Set the wanted item's initial prediction score as None
                self.row['item_score'] = None

                # NOTE(review): if keyphrase_selection_method is none of
                # 'random'/'pop'/'diff', remaining_keyphrases stays unbound
                # from a prior loop iteration — confirm the allowed values.
                if self.keyphrase_selection_method == "random" or self.keyphrase_selection_method == "pop":
                    # Get the item's existing keyphrases (we can boost).
                    # NOTE(review): bare except — the fallback handles the
                    # dense-array case where .nonzero() returns a 1-tuple.
                    try:
                        remaining_keyphrases = self.item_keyphrase_freq[item].nonzero()[1]
                    except:
                        remaining_keyphrases = np.ravel(self.item_keyphrase_freq[item].nonzero())
                if self.keyphrase_selection_method == "diff":
                    # 'diff': critique the keyphrases where the target item
                    # most exceeds the currently top-recommended item.
                    target_keyphrase_freq = get_item_keyphrase_freq(
                        self.item_keyphrase_freq, item=item)
                    diff_keyphrase_freq = target_keyphrase_freq - top_recommended_keyphrase_freq
                    remaining_keyphrases = np.argsort(np.ravel(diff_keyphrase_freq))[::-1][:self.max_wanted_keyphrase]
                self.row['num_existing_keyphrases'] = len(remaining_keyphrases)
                # NOTE(review): this break abandons the remaining wanted items
                # for this target_rank, not just the current item — confirm.
                if len(remaining_keyphrases) == 0:
                    break
                # Iteration 0 row records the pre-critiquing state.
                self.row['iteration'] = 0
                self.row['critiqued_keyphrase'] = None
                self.row['result'] = None
                self.df = self.df.append(self.row, ignore_index=True)

                query = []
                affected_items = np.array([])
                # Set up latent embedding: start from the user's own factor,
                # then append one mapped critique embedding per iteration.
                user_latent_embedding = [self.Y[user]]

                for iteration in range(self.max_iteration_threshold):
                    self.row['iteration'] = iteration + 1
                    if self.keyphrase_selection_method == "pop":
                        # Always critique the least popular keyphrase
                        critiqued_keyphrase = remaining_keyphrases[np.argmin(self.keyphrase_popularity[remaining_keyphrases])]
                    elif self.keyphrase_selection_method == "random":
                        critiqued_keyphrase = np.random.choice(remaining_keyphrases, size=1, replace=False)[0]
                    elif self.keyphrase_selection_method == "diff":
                        # 'diff' keyphrases are pre-sorted by descending gap.
                        critiqued_keyphrase = remaining_keyphrases[0]
                    self.row['critiqued_keyphrase'] = critiqued_keyphrase
                    self.row['critiqued_keyphrase_name'] = self.keyphrases_names[critiqued_keyphrase]
                    query.append(critiqued_keyphrase)

                    # Get affected items (items have critiqued keyphrase);
                    # accumulated across iterations.
                    current_affected_items = self.item_keyphrase_freq[:, critiqued_keyphrase].nonzero()[0]
                    affected_items = np.unique(np.concatenate((affected_items, current_affected_items))).astype(int)
                    unaffected_items = np.setdiff1d(range(self.num_items), affected_items)

                    if iteration == 0:
                        prediction_items = initial_prediction_items
                    # calculated once for each user
                    affected_items_mask = np.in1d(prediction_items, affected_items)
                    affected_items_index_rank = np.where(affected_items_mask == True)
                    unaffected_items_index_rank = np.where(affected_items_mask == False)

                    # Concat critique embedding to the user latent embedding:
                    # build a one-hot-ish critique vector...
                    critiqued_vector = np.zeros(self.keyphrase_freq.shape[1])
                    critiqued_vector[critiqued_keyphrase] = max(self.keyphrase_freq[user, critiqued_keyphrase], 1)
                    # ...then map it into latent space via the learned regressor.
                    k_ci = self.reg.predict(critiqued_vector.reshape(1, -1)).flatten()
                    user_latent_embedding.append(k_ci)

                    # Re-rank: only the top-20 affected/unaffected items from
                    # the current ranking constrain the solve.
                    prediction_scores_u, thetas = lpranksvm3(
                        initial_prediction_u=self.prediction_scores[user],
                        keyphrase_freq=copy.deepcopy(self.keyphrase_freq),
                        affected_items=np.intersect1d(affected_items, prediction_items[affected_items_index_rank[0][:20]]),
                        unaffected_items=np.intersect1d(unaffected_items, prediction_items[unaffected_items_index_rank[0][:20]]),
                        num_keyphrases=self.num_keyphrases,
                        query=query,
                        test_user=user,
                        item_latent=self.RQ,
                        reg=self.reg,
                        user_latent_embedding=user_latent_embedding,
                        item_keyphrase_freq=self.item_keyphrase_freq,
                        Y=self.Y,
                        lamb=self.lamb)
                    self.row['theta'] = thetas
                    # remove_train=False: ranking now includes train items.
                    prediction_items = predict_vector(
                        rating_vector=prediction_scores_u,
                        train_vector=self.matrix_Train[user],
                        remove_train=False)
                    recommended_items = prediction_items

                    # Current item rank
                    item_rank = np.where(recommended_items == item)[0][0]
                    self.row['item_rank'] = item_rank
                    self.row['item_score'] = prediction_scores_u[item]

                    if item_rank + 1 <= target_rank:
                        # Items is ranked within target rank
                        self.row['result'] = 'successful'
                        self.df = self.df.append(self.row, ignore_index=True)
                        break
                    else:
                        # Never critique the same keyphrase twice.
                        remaining_keyphrases = np.setdiff1d(remaining_keyphrases, critiqued_keyphrase)
                        # Continue if more keyphrases and iterations remained
                        if len(remaining_keyphrases) > 0 and self.row['iteration'] < self.max_iteration_threshold:
                            self.row['result'] = None
                            self.df = self.df.append(self.row, ignore_index=True)
                        else:
                            # Otherwise, mark fail
                            self.row['result'] = 'fail'
                            self.df = self.df.append(self.row, ignore_index=True)
                            break

        print("User", user, "Elapsed: {}".format(inhour(time.time() - start_time)))
    return self.df
def main(args):
    """Train a debiasing model on biased + uniform data, save factors, evaluate.

    Loads the biased training matrix, the uniformly-collected training matrix
    and the validation matrix; fits the model selected by ``args.model``;
    saves U/V (and user/item bias) factors under ``latent/<dataset>`` with an
    optional ``args.way`` prefix; then reports NLL/AUC on validation.

    :param args: parsed command-line namespace; attributes read here include
        path, dataset, train, unif_train, valid, model, way, seed, batch_size,
        rank, lamb, lamb2, iter, gpu, confidence and step.
    """
    # Progress bar
    progress = WorkSplitter()

    # Show hyper parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {0}".format(args.path))
    print("Train File Name: {0}".format(args.dataset + args.train))
    print("Uniform Train File Name: {0}".format(args.dataset + args.unif_train))
    print("Valid File Name: {0}".format(args.dataset + args.valid))
    print("Algorithm: {0}".format(args.model))
    print("Way: {0}".format(args.way))
    print("Seed: {0}".format(args.seed))
    print("Batch Size: {0}".format(args.batch_size))
    print("Rank: {0}".format(args.rank))
    print("Lambda: {0}".format(args.lamb))
    print("Iteration: {0}".format(args.iter))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    train = load_numpy(path=args.path, name=args.dataset + args.train)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    print("Train U-I Dimensions: {0}".format(train.shape))

    # Train Model
    valid = load_numpy(path=args.path, name=args.dataset + args.valid)
    unif_train = load_numpy(path=args.path, name=args.dataset + args.unif_train)
    RQ, Y, uBias, iBias = models[args.model](train, valid,
                                             dataset=args.dataset,
                                             matrix_unif_train=unif_train,
                                             iteration=args.iter,
                                             rank=args.rank,
                                             gpu_on=args.gpu,
                                             lam=args.lamb,
                                             lam2=args.lamb2,
                                             seed=args.seed,
                                             batch_size=args.batch_size,
                                             way=args.way,
                                             confidence=args.confidence,
                                             step=args.step)

    # Persist factors; file names encode model and rank, with an optional
    # ``way`` prefix distinguishing training variants.
    save_path = 'latent/' + args.dataset
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if args.way is None:
        np.save(save_path + '/U_{0}_{1}'.format(args.model, args.rank), RQ)
        np.save(save_path + '/V_{0}_{1}'.format(args.model, args.rank), Y)
        # Bias terms are optional; both are saved together when present.
        if uBias is not None:
            np.save(save_path + '/uB_{0}_{1}'.format(args.model, args.rank),
                    uBias)
            np.save(save_path + '/iB_{0}_{1}'.format(args.model, args.rank),
                    iBias)
    else:
        np.save(save_path + '/' + args.way + '_U_{0}_{1}'.format(args.model, args.rank), RQ)
        np.save(save_path + '/' + args.way + '_V_{0}_{1}'.format(args.model, args.rank), Y)
        if uBias is not None:
            np.save(save_path + '/' + args.way + '_uB_{0}_{1}'.format(args.model, args.rank), uBias)
            np.save(save_path + '/' + args.way + '_iB_{0}_{1}'.format(args.model, args.rank), iBias)

    progress.section("Predict")
    prediction = predict(matrix_U=RQ,
                         matrix_V=Y,
                         matrix_Valid=valid,
                         ubias=uBias,
                         ibias=iBias,
                         gpu=args.gpu)

    progress.section("Evaluation")
    start_time = time.time()
    # Pointwise metrics (not ranking): negative log-likelihood and AUC.
    metric_names = ['NLL', 'AUC']
    result = evaluate(prediction, valid, metric_names, gpu=args.gpu)

    print("----Final Result----")
    for metric in result.keys():
        print("{0}:{1}".format(metric, result[metric]))
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))
def weighted_lrec_items(matrix_train, embeded_matrix=np.empty((0)), iteration=4,
                        lam=80, rank=200, alpha=100, gpu=True, seed=1, **unused):
    """
    Weighted projected LREC with a closed-form per-item solve.

    Factorizes the (optionally attribute-augmented) user-item matrix with a
    randomized SVD, then solves one ridge-style weighted least-squares problem
    per item, on GPU (CuPy) or CPU.

    :param matrix_train: user-item matrix with shape m*n
    :param embeded_matrix: item-attribute matrix with length n (each row represents one item)
    :param iteration: number of SVD power iterations
    :param lam: parameter of penalty
    :param rank: the latent dimension
    :param alpha: weights of the U-I ratings
    :param gpu: whether to run the per-item solves on GPU via CuPy
    :param seed: random seed for the randomized SVD
    :return: (RQ, Y.T, None) — user factors, item factors, no bias term
    """
    progress = WorkSplitter()
    matrix_input = matrix_train
    if embeded_matrix.shape[0] > 0:
        matrix_input = vstack((matrix_input, embeded_matrix.T))

    progress.subsection("Randomized SVD")
    start_time = time.time()
    P, sigma, Qt = randomized_svd(matrix_input,
                                  n_components=rank,
                                  n_iter=iteration,
                                  random_state=seed)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    start_time = time.time()
    if gpu:
        import cupy as cp
        progress.subsection("Create Cacheable Matrices")
        # sqrt(sigma) injection gives the exact projected user factors
        # (same form as plrec).
        RQ = matrix_input.dot(sparse.csc_matrix(Qt.T * np.sqrt(sigma))).toarray()
        matrix_B = cp.array(RQ)
        matrix_BT = matrix_B.T
        matrix_A = matrix_BT.dot(matrix_B) + cp.array(
            (lam * sparse.identity(rank, dtype=np.float32)).toarray())
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))

        progress.subsection("Item-wised Optimization")
        start_time = time.time()
        m, n = matrix_train.shape
        Y = []
        alpha = cp.array(alpha, dtype=cp.float32)
        # Fixed: xrange is Python 2 only — NameError on Python 3.
        for i in tqdm(range(n)):
            vector_r = matrix_train[:, i]
            vector_y = per_item_gpu(vector_r, matrix_A, matrix_B, matrix_BT,
                                    alpha)
            # Copy the result off the GPU before accumulating.
            Y.append(np.copy(cp.asnumpy(vector_y)))
        # Fixed: scipy.vstack was a deprecated numpy alias removed from SciPy;
        # the rows are NumPy arrays, so stack with numpy directly.
        Y = np.vstack(Y)
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    else:
        progress.subsection("Create Cacheable Matrices")
        # Fixed: made consistent with the GPU branch (and with plrec) —
        # previously the CPU branch used the un-scaled projection
        # matrix_input.dot(csc_matrix(Qt).T), yielding different factors
        # than the GPU path for the same inputs.
        RQ = matrix_input.dot(sparse.csc_matrix(Qt.T * np.sqrt(sigma))).toarray()
        matrix_B = RQ
        matrix_BT = RQ.T
        matrix_A = matrix_BT.dot(matrix_B) + (
            lam * sparse.identity(rank, dtype=np.float32)).toarray()
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))

        progress.subsection("Item-wised Optimization")
        start_time = time.time()
        m, n = matrix_train.shape
        Y = []
        for i in tqdm(range(n)):
            vector_r = matrix_train[:, i]
            Y.append(per_item_cpu(vector_r, matrix_A, matrix_B, matrix_BT,
                                  alpha))
        Y = np.vstack(Y)
        print("Elapsed: {0}".format(inhour(time.time() - start_time)))
    return RQ, Y.T, None
def main(args):
    """Run an active-learning loop on top of a trained recommender.

    Filters users into train/active/test slices, pre-trains the recommender on
    the training slice, then alternates predicting with the chosen
    active-learning policy and feeding newly-observed interactions back into
    the training matrix. Finally retrains, greedily re-predicts, and reports
    metrics at several top-k cutoffs. Closes the TF session at the end.

    :param args: parsed command-line namespace; attributes read here include
        path, train/active/test file names, active_model, rec_model, gpu,
        iterative, sample_from_all, ratio, learning_rate, rank, lamb, epoch,
        corruption, optimizer, active_iteration, topk, confidence_interval,
        num_item_per_iter and num_latent_sampling.
    """
    # Progress bar
    progress = WorkSplitter()

    # Show parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Active Learning Algorithm: {}".format(args.active_model))
    print("Recommendation Algorithm: {}".format(args.rec_model))
    print("GPU: {}".format(args.gpu))
    print("Iterative: {}".format(args.iterative))
    print("Sample From All: {}".format(args.sample_from_all))
    print("Train Valid Test Split Ratio: {}".format(args.ratio))
    print("Learning Rate: {}".format(args.learning_rate))
    print("Rank: {}".format(args.rank))
    print("Lambda: {}".format(args.lamb))
    print("Epoch: {}".format(args.epoch))
    print("Active Learning Iteration: {}".format(args.active_iteration))
    print("Evaluation Ranking Topk: {}".format(args.topk))
    print("UCB Confidence: {}".format(args.confidence_interval))
    print("Number of Item per Active Iteration: {}".format(args.num_item_per_iter))
    print("UCB Number of Latent Sampling: {}".format(args.num_latent_sampling))

    # Load Data
    progress.section("Loading Data")
    start_time = time.time()
    R_train = load_numpy(path=args.path, name=args.train)
    print("Train U-I Dimensions: {}".format(R_train.shape))
    R_active = load_numpy(path=args.path, name=args.active)
    print("Active U-I Dimensions: {}".format(R_active.shape))
    R_test = load_numpy(path=args.path, name=args.test)
    print("Test U-I Dimensions: {}".format(R_test.shape))
    print("Elapsed: {}".format(inhour(time.time() - start_time)))

    # Rows before train_index are pure training users; rows after are the
    # users the active-learning loop interacts with.
    train_index = int(R_test.shape[0]*args.ratio[0])

    progress.section("Preparing Data")
    # Keep only users with enough active/test interactions to support the
    # requested number of queries and evaluation cutoff.
    matrix_train, matrix_active, matrix_test, _ = filter_users(
        R_train, R_active, R_test,
        train_index=train_index,
        active_threshold=2*args.num_item_per_iter*args.active_iteration,
        test_threshold=2*args.topk)
    m, n = matrix_train.shape

    history_items = np.array([])

    model = rec_models[args.rec_model](observation_dim=n,
                                       latent_dim=args.rank,
                                       batch_size=128,
                                       lamb=args.lamb,
                                       learning_rate=args.learning_rate,
                                       optimizer=Regularizer[args.optimizer])

    progress.section("Training")
    model.train_model(matrix_train[:train_index], args.corruption, args.epoch)

    for i in range(args.active_iteration):
        print('This is step {} \n'.format(i))
        print('The number of ones in train set is {}'.format(len(matrix_train[train_index:].nonzero()[0])))
        print('The number of ones in active set is {}'.format(len(matrix_active[train_index:].nonzero()[0])))

        progress.section("Predicting")
        # Score candidate items for the active users only (dense .A view).
        observation = active_models[args.active_model](model=model,
                                                       matrix=matrix_train[train_index:].A,
                                                       ci=args.confidence_interval,
                                                       num_latent_sampling=args.num_latent_sampling)

        progress.section("Update Train Set")
        # Move the queried observations from the active set into training.
        matrix_train, history_items = update_matrix(history_items,
                                                    matrix_train,
                                                    matrix_active,
                                                    observation,
                                                    train_index,
                                                    args.iterative,
                                                    args.sample_from_all,
                                                    args.num_item_per_iter,
                                                    args.active_iteration,
                                                    args.gpu)

        # Non-iterative mode performs a single acquisition round.
        if not args.iterative:
            break

    print('The number of ones in train set is {}'.format(len(matrix_train[train_index:].nonzero()[0])))

    progress.section("Re-Training")
    # Retrain on the full matrix, including the newly-acquired interactions.
    model.train_model(matrix_train, args.corruption, args.epoch)

    progress.section("Re-Predicting")
    observation = active_models['Greedy'](model=model, matrix=matrix_train.A)

    result = {}
    for topk in [5, 10, 15, 20, 50]:
        predict_items, _ = sampling_predict(prediction_scores=observation[train_index:],
                                            topK=topk,
                                            matrix_train=matrix_train[train_index:],
                                            matrix_active=matrix_active[train_index:],
                                            sample_from_all=True,
                                            iterative=False,
                                            history_items=np.array([]),
                                            gpu=args.gpu)

        progress.section("Create Metrics")
        # NOTE(review): `eval` here is presumably a project metric function
        # shadowing the builtin — confirm and consider renaming at its source.
        result.update(eval(matrix_test[train_index:], topk, predict_items))
        print(result)

    # Release TF resources so repeated calls do not accumulate graphs.
    model.sess.close()
    tf.reset_default_graph()