def main(): """ Main module. Fits a mean-based predictor on training set, loaded from pickle, and predict votes for a test set, loaded from pickle, outputing to a file with predicted values and displaying training performance on stdout. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) reviews = load(open('%s/reviews-%d.pkl'% (_PKL_DIR, i), 'r')) predictor = fit_predictor(train) pred = [predictor(v) for v in train] truth = [v['vote'] for v in train] print 'TRAINING ERROR' print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) output = open('%s/%s-%d-0.dat' % (_OUTPUT_DIR, _PRED, i), 'w') for v in test: print >> output, predictor(v) output.close()
def main(): """ Main module. Fits a mean-based predictor on training set, loaded from pickle, and predict votes for a test set, loaded from pickle, outputing to a file with predicted values and displaying training performance on stdout. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) predictor = fit_predictor(train) pred = [predictor(v) for v in train] truth = [v['vote'] for v in train] print 'TRAINING ERROR' print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) output = open('%s/%s-%d-0.dat' % (_OUTPUT_DIR, _PRED, i), 'w') for v in test: print >> output, predictor(v) output.close()
def evaluate_regression(pred, votes, output):
  """ Evaluates predicted values using RMSE, a regression metric.

      Args:
        pred: a list of floats with predicted values.
        votes: a list of votes, represented as dictionaries, belonging to votes
          set.
        output: a file object to print output on.

      Returns:
        A float with the computed RMSE. The result is also printed to the
          output file.
  """
  truth = [v['vote'] for v in votes]
  rmse = calculate_rmse(pred, truth)
  print >> output, "RMSE: %f" % rmse
  return rmse
def main(): """ Predicts helpfulness votes using MF. Args: None. Returns: None. Results are printed to files. """ load_args() for i in xrange(NUM_SETS): print 'Reading pickles' train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) truth = [v['vote'] for v in train] for j in xrange(REP): print 'Fitting Model' model = MF_Model() model.fit(train) print 'Calculating Predictions' pred = model.predict(train) print 'TRAINING ERROR' print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) pred = model.predict(val) print 'Outputting validation prediction' output = open('%s/mf-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() pred = model.predict(test) print 'Outputting testing prediction' output = open( '%s/mf-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close()
def main(): """ Predicts helpfulness votes using MF. Args: None. Returns: None. Results are printed to files. """ load_args() for i in xrange(NUM_SETS): print 'Reading pickles' train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) truth = [v['vote'] for v in train] for j in xrange(REP): print 'Fitting Model' model = MF_Model() model.fit(train) print 'Calculating Predictions' pred = model.predict(train) print 'TRAINING ERROR' print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) pred = model.predict(val) print 'Outputting validation prediction' output = open('%s/mf-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() pred = model.predict(test) print 'Outputting testing prediction' output = open('%s/mf-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close()
def main(): """ Predicts helpfulness votes using MF. Args: None. Returns: None. Results are printed to files. """ load_args() for i in xrange(NUM_SETS): t = time() print 'Reading pickles' train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r')) truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Formatting input time: %f' % (time() - t) for j in xrange(REP): print 'Fitting Model' t = time() model = LR_Model() model.fit(X_train, y_train, qid_train) print 'Learning time: %f' % (time() - t) print 'Coefficients:' print model.w print 'Calculating Predictions' pred = model.predict(X_train) if _BIAS: bias.add_bias(train, reviews, pred) print 'TRAINING ERROR' print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) pred = model.predict(X_val) if _BIAS: bias.add_bias(val, reviews, pred) print 'Outputting validation prediction' output = open('%s/corasvr-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p in 
pred: print >> output, p output.close() t = time() pred = model.predict(X_test) if _BIAS: bias.add_bias(test, reviews, pred) print 'Prediction time: %f' % (time() - t) print 'Outputting testing prediction' output = open('%s/corasvr-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close()
def main(): """ Predicts votes by applying LambdaMART technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) print 'Creating average user (for mean imputation)' avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Outputting model' outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') train_index = output_model(X_train, y_train, qid_train, outfile) outfile.close() outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') val_index = output_model(X_val, None, qid_val, outfile) outfile.close() outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') test_index = output_model(X_test, None, qid_test, outfile) outfile.close() for j in xrange(REP): print 'Fitting model' print getoutput(('java -jar lib/ranklib/RankLib.jar -train ' '%s/rank_train-%s-%d.dat -save %s/lambdamart_model-%s-%d-%d.dat ' '-gmax 5 -ranker 6 -metric2t NDCG@5 -tree %d -leaf %d 
-shrinkage ' '%f') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, j, _T, _L, _ALPHA)) print 'Evaluating in train' print getoutput(('java -jar lib/ranklib/RankLib.jar -load ' '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_train-%s-%d.dat ' '-score %s/rank_pred_train-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \ (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i, j)) raw_pred = [] predfile = open('%s/rank_pred_train-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j), 'r') raw_pred = [float(p.strip().split()[2]) for p in predfile] predfile.close() pred = [raw_pred[k] for k in train_index] if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, j) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) print 'Predicting in validation' print getoutput(('java -jar lib/ranklib/RankLib.jar -load ' '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_val-%s-%d.dat ' '-score %s/rank_pred_val-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \ (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i, j)) predfile = open('%s/rank_pred_val-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j), 'r') raw_pred = [float(p.strip().split()[2]) for p in predfile] predfile.close() pred = [raw_pred[k] for k in val_index] if _BIAS: bias.add_bias(val, reviews, pred) output = open('%s/lambdamart-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() print 'Predicting in test' print getoutput(('java -jar lib/ranklib/RankLib.jar -load ' '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_test-%s-%d.dat ' '-score %s/rank_pred_test-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \ (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i, j)) predfile = open('%s/rank_pred_test-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j), 'r') raw_pred = [float(p.strip().split()[2]) for p 
in predfile] predfile.close() pred = [raw_pred[k] for k in test_index] if _BIAS: bias.add_bias(test, reviews, pred) output = open('%s/lambdamart-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close()
def predict(): """ Predicts votes by applying a SVR regressor technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): t = time() print 'Reading data' reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Formatting input time: %f' % (time() - t) t = time() model = SVR(C=_C, epsilon=_EPS, kernel=_KERNEL) model.fit(X_train, y_train) print 'Learning time: %f' % (time() - t) pred = model.predict(X_train) if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) pred = model.predict(X_val) if _BIAS: bias.add_bias(val, reviews, pred) output = open( '%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close() 
t = time() pred = model.predict(X_test) if _BIAS: bias.add_bias(test, reviews, pred) print 'Prediction time: %f' % (time() - t) output = open( '%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_OUTPUT_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close()
def main(): """ Main method performing fitting, prediction and outputting to file. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) trusts = load(open('%s/trusts.pkl' % _PKL_DIR, 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) f_train = map_features(train, reviews, users, sim, conn, trusts) f_val = map_features(val, reviews, users, sim, conn, trusts) f_test = map_features(test, reviews, users, sim, conn, trusts) scaler = fit_cap_scaler(f_train) f_train = scale_cap_features(scaler, f_train) f_val = scale_cap_features(scaler, f_val) f_test = scale_cap_features(scaler, f_test) for j in xrange(REP): print 'Creating variables' var_groups = create_variable_groups() populate_variables(var_groups, train, users, trusts, f_train) print 'Running EM' expectation_maximization(var_groups, train) print 'Calculating Predictions' pred = calculate_predictions(var_groups, train, users, trusts, f_train, sim, conn) print 'TRAINING ERROR' truth = [v['vote'] for v in train] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) print 'Outputting Validation Prediction' pred = calculate_predictions(var_groups, val, users, trusts, f_val, sim, conn) output = open('%s/cap-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() truth = [v['vote'] for v in val] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, calculate_avg_ndcg(val, reviews, pred, truth, RANK_SIZE)) print 'Outputting Test Prediction' pred = 
calculate_predictions(var_groups, test, users, trusts, f_test, sim, conn) output = open( '%s/cap-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() truth = [v['vote'] for v in test] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, calculate_avg_ndcg(test, reviews, pred, truth, RANK_SIZE))
def main(): """ Main method performing fitting, prediction and outputting to file. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading pickles' train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) train_reviews_ids = set([vote['review'] for vote in train]) train_reviews = {r_id: reviews[r_id] for r_id in train_reviews_ids} for j in xrange(REP): print 'Fitting Model' model = BETF_Model() for v in train: v['vote'] /= 5.0 for r_id in train_reviews: train_reviews[r_id]['rating'] /= 5.0 model.fit(train, train_reviews) print 'Calculating Predictions' pred = model.predict(train, reviews) for v in train: v['vote'] *= 5.0 for r_id in train_reviews: train_reviews[r_id]['rating'] *= 5.0 pred = [p * 5.0 for p in pred] truth = [v['vote'] for v in train] print 'TRAINING ERROR' print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) pred = model.predict(val, reviews) pred = [p * 5.0 for p in pred] print 'Outputting Validation Prediction' output = open( '%s/betf-k:%d,l:%f,r:%f,e:%f,i:%d-%d-%d.dat' % (_VAL_DIR, _K, _ALPHA, _BETA, _TOL, _ITER, i, j), 'w') for p in pred: print >> output, p output.close() truth = [v['vote'] for v in val] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, calculate_avg_ndcg(val, reviews, pred, truth, RANK_SIZE)) pred = model.predict(test, reviews) pred = [p * 5.0 for p in pred] print 'Outputting Test Prediction' output = open('%s/betf-k:%d,l:%f,r:%f,e:%f,i:%d-%d-%d.dat' % \ (_OUTPUT_DIR, _K, _ALPHA, _BETA, _TOL, _ITER, i, j), 'w') for p in pred: print >> output, p output.close() truth = [v['vote'] for v in test] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, 
calculate_avg_ndcg(test, reviews, pred, truth, RANK_SIZE))
def main(): """ Predicts helpfulness votes using MF. Args: None. Returns: None. Results are printed to files. """ load_args() for i in xrange(NUM_SETS): t = time() print 'Reading pickles' train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r')) truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Formatting input time: %f' % (time() - t) for j in xrange(REP): print 'Fitting Model' t = time() model = LR_Model() model.fit(X_train, y_train, qid_train) print 'Learning time: %f' % (time() - t) print 'Coefficients:' print model.w print 'Calculating Predictions' pred = model.predict(X_train) if _BIAS: bias.add_bias(train, reviews, pred) print 'TRAINING ERROR' print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % ( RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) pred = model.predict(X_val) if _BIAS: bias.add_bias(val, reviews, pred) print 'Outputting validation prediction' output = open( '%s/corasvr-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p 
in pred: print >> output, p output.close() t = time() pred = model.predict(X_test) if _BIAS: bias.add_bias(test, reviews, pred) print 'Prediction time: %f' % (time() - t) print 'Outputting testing prediction' output = open( '%s/corasvr-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close()
def predict(): """ Predicts votes by applying RankSVM technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) print 'Creating average user (for mean imputation)' avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Outputting model' outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') train_index = output_model(X_train, y_train, qid_train, outfile) outfile.close() outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') val_index = output_model(X_val, None, qid_val, outfile) outfile.close() outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') test_index = output_model(X_test, None, qid_test, outfile) outfile.close() print 'Fitting model' print getoutput(('lib/svm_rank/svm_rank_learn -c %f -w %s -t %s ' '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat') % (_C, _ALGO, _KERNEL, _DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i)) print 
getoutput(('lib/svm_rank/svm_rank_classify ' '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat ' '%s/rank_pred_train-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i)) raw_pred = [] predfile = open( '%s/rank_pred_train-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r') raw_pred = [float(p.strip()) for p in predfile] predfile.close() pred = [raw_pred[j] for j in train_index] if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) print 'Predicting in validation' print getoutput(('lib/svm_rank/svm_rank_classify ' '%s/rank_val-%s-%d.dat %s/rank_model-%s-%d-0.dat ' '%s/rank_pred_val-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i)) predfile = open( '%s/rank_pred_val-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r') raw_pred = [float(p.strip()) for p in predfile] predfile.close() pred = [raw_pred[j] for j in val_index] if _BIAS: bias.add_bias(val, reviews, pred) output = open('%s/svmrank-%s-%d-0.dat' % (_VAL_DIR, _CONF_STR, i), 'w') for p in pred: print >> output, p output.close() print 'Predicting in test' print getoutput(('lib/svm_rank/svm_rank_classify ' '%s/rank_test-%s-%d.dat %s/rank_model-%s-%d-0.dat ' '%s/rank_pred_test-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i)) predfile = open( '%s/rank_pred_test-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r') raw_pred = [float(p.strip()) for p in predfile] predfile.close() pred = [raw_pred[j] for j in test_index] if _BIAS: bias.add_bias(test, reviews, pred) output = open('%s/svmrank-%s-%d-0.dat' % (_OUTPUT_DIR, _CONF_STR, i), 'w') for p in pred: print >> output, p output.close()
def main(): """ Main method, which performs prediction and outputs to file. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) print 'Creating average user (for mean imputation)' avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) print 'Modeling' X_train = model_dyad(train, sim, conn, avg_sim, avg_conn) X_val = model_dyad(val, sim, conn, avg_sim, avg_conn) X_test = model_dyad(test, sim, conn, avg_sim, avg_conn) train_reviews = set([v['review'] for v in train]) test_reviews = set([v['review'] for v in val]).union(set([v['review'] for v in test])) X_item_train, item_train_key , X_item_test, item_test_key = \ model_items(reviews, users, train_reviews, test_reviews, avg_user) # train, test: same file, different scaling train_users = set([v['voter'] for v in train]) test_users = set([v['voter'] for v in val]).union(set([v['voter'] for v in test])) X_user_train, user_train_key, X_user_test, user_test_key = \ model_users(users, train_users, test_users, avg_user) print 'Scaling' dyad_scaler = fit_scaler('minmax', X_train) X_train = scale_features(dyad_scaler, X_train) X_val = scale_features(dyad_scaler, X_val) X_test = scale_features(dyad_scaler, X_test) item_scaler = fit_scaler('minmax', X_item_train) X_item_train = scale_features(item_scaler, X_item_train) X_item_test = scale_features(item_scaler, X_item_test) user_scaler = fit_scaler('minmax', X_user_train) X_user_train = scale_features(user_scaler, X_user_train) X_user_test = scale_features(user_scaler, X_user_test) X_item = 
vstack((X_item_train, X_item_test)) item_key = item_train_key + item_test_key X_user = vstack((X_user_train, X_user_test)) user_key = user_train_key + user_test_key print 'Outputting model' output_dyad('train', train, X_train, i) output_dyad('val', val, X_val, i) output_dyad('test', test, X_test, i) output_entity('item', X_item, item_key, i) output_entity('user', X_user, user_key, i) for j in xrange(REP): print 'Fitting model' print getoutput(('Rscript lib/rlfm/rlfm_fit.R %d %d %d %d %s %d %d ' '%s') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR)) print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d ' '%s train' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR)) predfile = open('%s/rlfm-%s-%d-%d.dat' % (_TRAIN_DIR, _CONF_STR, i, j), 'r') pred = [float(p.strip()) for p in predfile] predfile.close() truth = [v['vote'] for v in train] print len(pred) print len(truth) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) print 'Predicting in validation' print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d ' '%s val' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR)) print 'Predicting in test' print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d ' '%s test' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
def main(): """ Predicts votes by applying a LR regressor technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) model = Ridge(alpha=_BETA) # for standardized notation across algorithms, we consider alpha to be # learning rate of and beta, regularization weight model.fit(X_train, y_train) pred = model.predict(X_train) if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) pred = model.predict(X_val) if _BIAS: bias.add_bias(val, reviews, pred) output = open( '%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close() pred = model.predict(X_test) if _BIAS: 
bias.add_bias(test, reviews, pred) output = open( '%s/lr-r:%f,f:%s,b:%s,-%d-%d.dat' % (_OUTPUT_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close()
def main(): """ Predicts votes by applying a LR regressor technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) model = Ridge(alpha=_BETA) # for standardized notation across algorithms, we consider alpha to be # learning rate of and beta, regularization weight model.fit(X_train , y_train) pred = model.predict(X_train) if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) pred = model.predict(X_val) if _BIAS: bias.add_bias(val, reviews, pred) output = open('%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close() pred = model.predict(X_test) if _BIAS: 
bias.add_bias(test, reviews, pred) output = open('%s/lr-r:%f,f:%s,b:%s,-%d-%d.dat' % (_OUTPUT_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close()
def main(): """ Predicts votes by applying LambdaMART technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) print 'Creating average user (for mean imputation)' avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Outputting model' outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') train_index = output_model(X_train, y_train, qid_train, outfile) outfile.close() outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') val_index = output_model(X_val, None, qid_val, outfile) outfile.close() outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') test_index = output_model(X_test, None, qid_test, outfile) outfile.close() for j in xrange(REP): print 'Fitting model' print getoutput(( 'java -jar lib/ranklib/RankLib.jar -train ' '%s/rank_train-%s-%d.dat -save %s/lambdamart_model-%s-%d-%d.dat ' '-gmax 5 -ranker 6 -metric2t NDCG@5 -tree %d -leaf %d 
-shrinkage ' '%f') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, j, _T, _L, _ALPHA)) print 'Evaluating in train' print getoutput(('java -jar lib/ranklib/RankLib.jar -load ' '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_train-%s-%d.dat ' '-score %s/rank_pred_train-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \ (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i, j)) raw_pred = [] predfile = open( '%s/rank_pred_train-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j), 'r') raw_pred = [float(p.strip().split()[2]) for p in predfile] predfile.close() pred = [raw_pred[k] for k in train_index] if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, j) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) print 'Predicting in validation' print getoutput(('java -jar lib/ranklib/RankLib.jar -load ' '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_val-%s-%d.dat ' '-score %s/rank_pred_val-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \ (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i, j)) predfile = open( '%s/rank_pred_val-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j), 'r') raw_pred = [float(p.strip().split()[2]) for p in predfile] predfile.close() pred = [raw_pred[k] for k in val_index] if _BIAS: bias.add_bias(val, reviews, pred) output = open( '%s/lambdamart-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() print 'Predicting in test' print getoutput(('java -jar lib/ranklib/RankLib.jar -load ' '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_test-%s-%d.dat ' '-score %s/rank_pred_test-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \ (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i, j)) predfile = open( '%s/rank_pred_test-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j), 'r') raw_pred = [float(p.strip().split()[2]) 
for p in predfile] predfile.close() pred = [raw_pred[k] for k in test_index] if _BIAS: bias.add_bias(test, reviews, pred) output = open( '%s/lambdamart-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close()
def predict(): """ Predicts votes by applying RankSVM technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) print 'Creating average user (for mean imputation)' avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Outputting model' outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') train_index = output_model(X_train, y_train, qid_train, outfile) outfile.close() outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') val_index = output_model(X_val, None, qid_val, outfile) outfile.close() outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w') test_index = output_model(X_test, None, qid_test, outfile) outfile.close() print 'Fitting model' print getoutput(('lib/svm_rank/svm_rank_learn -c %f -w %s -t %s ' '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat') % (_C, _ALGO, _KERNEL, _DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i)) print 
getoutput(('lib/svm_rank/svm_rank_classify ' '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat ' '%s/rank_pred_train-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i)) raw_pred = [] predfile = open('%s/rank_pred_train-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r') raw_pred = [float(p.strip()) for p in predfile] predfile.close() pred = [raw_pred[j] for j in train_index] if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) print 'Predicting in validation' print getoutput(('lib/svm_rank/svm_rank_classify ' '%s/rank_val-%s-%d.dat %s/rank_model-%s-%d-0.dat ' '%s/rank_pred_val-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i)) predfile = open('%s/rank_pred_val-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r') raw_pred = [float(p.strip()) for p in predfile] predfile.close() pred = [raw_pred[j] for j in val_index] if _BIAS: bias.add_bias(val, reviews, pred) output = open('%s/svmrank-%s-%d-0.dat' % (_VAL_DIR, _CONF_STR, i), 'w') for p in pred: print >> output, p output.close() print 'Predicting in test' print getoutput(('lib/svm_rank/svm_rank_classify ' '%s/rank_test-%s-%d.dat %s/rank_model-%s-%d-0.dat ' '%s/rank_pred_test-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i)) predfile = open('%s/rank_pred_test-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r') raw_pred = [float(p.strip()) for p in predfile] predfile.close() pred = [raw_pred[j] for j in test_index] if _BIAS: bias.add_bias(test, reviews, pred) output = open('%s/svmrank-%s-%d-0.dat' % (_OUTPUT_DIR, _CONF_STR, i), 'w') for p in pred: print >> output, p output.close()
def main(): """ Main method, which performs prediction and outputs to file. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) print 'Creating average user (for mean imputation)' avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) print 'Modeling' X_train = model_dyad(train, sim, conn, avg_sim, avg_conn) X_val = model_dyad(val, sim, conn, avg_sim, avg_conn) X_test = model_dyad(test, sim, conn, avg_sim, avg_conn) train_reviews = set([v['review'] for v in train]) test_reviews = set([v['review'] for v in val ]).union(set([v['review'] for v in test])) X_item_train, item_train_key , X_item_test, item_test_key = \ model_items(reviews, users, train_reviews, test_reviews, avg_user) # train, test: same file, different scaling train_users = set([v['voter'] for v in train]) test_users = set([v['voter'] for v in val]).union(set([v['voter'] for v in test])) X_user_train, user_train_key, X_user_test, user_test_key = \ model_users(users, train_users, test_users, avg_user) print 'Scaling' dyad_scaler = fit_scaler('minmax', X_train) X_train = scale_features(dyad_scaler, X_train) X_val = scale_features(dyad_scaler, X_val) X_test = scale_features(dyad_scaler, X_test) item_scaler = fit_scaler('minmax', X_item_train) X_item_train = scale_features(item_scaler, X_item_train) X_item_test = scale_features(item_scaler, X_item_test) user_scaler = fit_scaler('minmax', X_user_train) X_user_train = scale_features(user_scaler, X_user_train) X_user_test = scale_features(user_scaler, X_user_test) X_item 
= vstack((X_item_train, X_item_test)) item_key = item_train_key + item_test_key X_user = vstack((X_user_train, X_user_test)) user_key = user_train_key + user_test_key print 'Outputting model' output_dyad('train', train, X_train, i) output_dyad('val', val, X_val, i) output_dyad('test', test, X_test, i) output_entity('item', X_item, item_key, i) output_entity('user', X_user, user_key, i) for j in xrange(REP): print 'Fitting model' print getoutput( ('Rscript lib/rlfm/rlfm_fit.R %d %d %d %d %s %d %d ' '%s') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR)) print getoutput( 'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d ' '%s train' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR)) predfile = open( '%s/rlfm-%s-%d-%d.dat' % (_TRAIN_DIR, _CONF_STR, i, j), 'r') pred = [float(p.strip()) for p in predfile] predfile.close() truth = [v['vote'] for v in train] print len(pred) print len(truth) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) print 'Predicting in validation' print getoutput( 'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d ' '%s val' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR)) print 'Predicting in test' print getoutput( 'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d ' '%s test' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
def predict(): """ Predicts votes by applying a SVR regressor technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): t = time() print 'Reading data' reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) print 'Formatting input time: %f' % (time() - t) t = time() model = SVR(C=_C, epsilon=_EPS, kernel=_KERNEL) model.fit(X_train , y_train) print 'Learning time: %f' % (time() - t) pred = model.predict(X_train) if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, 0) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) pred = model.predict(X_val) if _BIAS: bias.add_bias(val, reviews, pred) output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close() 
t = time() pred = model.predict(X_test) if _BIAS: bias.add_bias(test, reviews, pred) print 'Prediction time: %f' % (time() - t) output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_OUTPUT_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w') for p in pred: print >> output, p output.close()
def main(): """ Main method performing fitting, prediction and outputting to file. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) trusts = load(open('%s/trusts.pkl' % _PKL_DIR, 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) f_train = map_features(train, reviews, users, sim, conn, trusts) f_val = map_features(val, reviews, users, sim, conn, trusts) f_test = map_features(test, reviews, users, sim, conn, trusts) scaler = fit_cap_scaler(f_train) f_train = scale_cap_features(scaler, f_train) f_val = scale_cap_features(scaler, f_val) f_test = scale_cap_features(scaler, f_test) for j in xrange(REP): print 'Creating variables' var_groups = create_variable_groups() populate_variables(var_groups, train, users, trusts, f_train) print 'Running EM' expectation_maximization(var_groups, train) print 'Calculating Predictions' pred = calculate_predictions(var_groups, train, users, trusts, f_train, sim, conn) print 'TRAINING ERROR' truth = [v['vote'] for v in train] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE)) print 'Outputting Validation Prediction' pred = calculate_predictions(var_groups, val, users, trusts, f_val, sim, conn) output = open('%s/cap-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() truth = [v['vote'] for v in val] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(val, reviews, pred, truth, RANK_SIZE)) print 'Outputting Test Prediction' pred = 
calculate_predictions(var_groups, test, users, trusts, f_test, sim, conn) output = open('%s/cap-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w') for p in pred: print >> output, p output.close() truth = [v['vote'] for v in test] print '-- RMSE: %f' % calculate_rmse(pred, truth) print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(test, reviews, pred, truth, RANK_SIZE))
if __name__ == '__main__': print 'Reading pickles' train = load(open('%s/train%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r')) test = load(open('%s/test%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r')) overall_mean = float(sum([float(v['vote']) for v in train])) / len(train) print 'Fitting Model' model = BiasModel() model.fit(train) print 'Calculating Predictions' pred = model.predict(train) print 'TRAINING ERROR' truth = [v['vote'] for v in train] rmse = calculate_rmse(pred, truth) print 'RMSE: %s' % rmse for i in xrange(5, 21, 5): score = calculate_ndcg(pred, truth, i) print 'NDCG@%d: %f' % (i, score) print 'Outputing Prediction' pred = model.predict(test) output = open('%s/bias%.2f.dat' % (_OUTPUT_DIR, _SAMPLE * 100), 'w') for p in pred: print >> output, overall_mean if isnan(p) else p output.close()
    # NOTE(review): stray tail of a definition that begins before this chunk;
    # it appears to add a per-product bias term to one prediction entry, but
    # the enclosing method is not visible here -- left byte-identical.
    pred[index] += self.product_bias[product]

# Script entry point: fits a BiasModel on the sampled training pickle,
# reports training RMSE and NDCG@{5,10,15,20} on stdout, and writes test
# predictions to file, replacing NaN predictions with the overall training
# mean vote.
if __name__ == '__main__':
  print 'Reading pickles'
  train = load(open('%s/train%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r'))
  test = load(open('%s/test%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r'))
  # Mean of all training votes; used below to impute NaN predictions.
  overall_mean = float(sum([float(v['vote']) for v in train])) / len(train)
  print 'Fitting Model'
  model = BiasModel()
  model.fit(train)
  print 'Calculating Predictions'
  pred = model.predict(train)
  print 'TRAINING ERROR'
  truth = [v['vote'] for v in train]
  rmse = calculate_rmse(pred, truth)
  print 'RMSE: %s' % rmse
  for i in xrange(5, 21, 5):
    score = calculate_ndcg(pred, truth, i)
    print 'NDCG@%d: %f' % (i, score)
  print 'Outputing Prediction'
  pred = model.predict(test)
  output = open('%s/bias%.2f.dat' % (_OUTPUT_DIR, _SAMPLE * 100), 'w')
  for p in pred:
    # Fall back to the global mean when the model could not score p.
    print >> output, overall_mean if isnan(p) else p
  output.close()
def main(): """ Predicts votes by applying a GBRT regressor technique. Args: None. Returns: None. """ load_args() for i in xrange(NUM_SETS): print 'Reading data' reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r')) users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r')) train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r')) test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r')) val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r')) sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r')) conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r')) train_truth = [v['vote'] for v in train] if _BIAS: bias = BiasModel() train = bias.fit_transform(train, reviews) avg_user = compute_avg_user(users) avg_sim = compute_avg_model(sim) avg_conn = compute_avg_model(conn) X_train, y_train, qid_train = generate_input(reviews, users, sim, conn, train, avg_user, avg_sim, avg_conn) X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user, avg_sim, avg_conn) X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, avg_user, avg_sim, avg_conn) scaler = fit_scaler('minmax', X_train) X_train = scale_features(scaler, X_train) X_val = scale_features(scaler, X_val) X_test = scale_features(scaler, X_test) for j in xrange(REP): model = GradientBoostingRegressor(loss=_LOSS, learning_rate=_ALPHA, n_estimators=_T, max_depth=_MAX_D, subsample=_SUBSAMPLE, max_features=_MAX_F, random_state=(int(time() * 1000000) % 1000000)) model.fit(X_train, y_train) pred = model.predict(X_train) if _BIAS: bias.add_bias(train, reviews, pred) print '~ Training error on set %d repetition %d' % (i, j) print 'RMSE: %f' % calculate_rmse(pred, train_truth) print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred, train_truth, RANK_SIZE)) pred = model.predict(X_val) if _BIAS: bias.add_bias(val, reviews, pred) output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F), 
_FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w') for p in pred: print >> output, p output.close() pred = model.predict(X_test) if _BIAS: bias.add_bias(test, reviews, pred) output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat' % (_OUTPUT_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F), _FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w') for p in pred: print >> output, p output.close()