def train_rls():
    #Select regparam with leave-query-out cross-validation,
    #where instances related to a single sentence form
    #together a query
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    #list of sentence ids
    qids_train = np.loadtxt("train_2000_qids.txt")
    qids_test = np.loadtxt("test_2000_qids.txt")
    regparams = [2.**i for i in range(-10, 10)]
    learner = LeaveQueryOutRankRLS(X_train, Y_train, qids_train,
                                   regparams=regparams, measure=cindex)
    lqo_perfs = learner.cv_performances
    P_test = learner.predict(X_test)
    print("leave-query-out performances " + str(lqo_perfs))
    print("chosen regparam %f" % learner.regparam)
    partition = map_ids(qids_test)
    #compute the ranking accuracy separately for each test query
    test_perfs = []
    for query in partition:
        #skip queries where all instances have the same score,
        #since in that case cindex is undefined
        if np.var(Y_test[query]) != 0:
            perf = cindex(Y_test[query], P_test[query])
            test_perfs.append(perf)
    test_perf = np.mean(test_perfs)
    print("test cindex %f" % test_perf)
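The function snippets in this section assume a handful of module-level imports that are not repeated in each listing. A plausible set is sketched below; the reader and measure paths follow the legacy scripts at the end of this section, while the locations of RLS, QueryRankRLS, LeaveQueryOutRankRLS, sqerror and map_ids are assumptions that may differ between RLScore versions.

#Assumed module-level imports for the train_rls/plot_rls/print_stats snippets.
#Lines marked "assumed" are guesses, not confirmed by the listings themselves.
import numpy as np
import matplotlib.pyplot as plt

from rlscore.learner import RLS, QueryRankRLS, LeaveQueryOutRankRLS   #assumed
from rlscore.measure import cindex, sqerror                           #sqerror assumed
from rlscore.utilities.cross_validation import map_ids                #assumed
from rlscore.utilities.reader import read_sparse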
def train_rls():
    #Train QueryRankRLS with the default regparam and estimate ranking
    #accuracy with leave-query-out cross-validation, where instances
    #related to a single sentence form together a fold
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    #list of sentence ids
    qids_train = np.loadtxt("train_2000_qids.txt")
    qids_test = np.loadtxt("test_2000_qids.txt")
    learner = QueryRankRLS(X_train, Y_train, qids_train)
    P_test = learner.predict(X_test)
    folds = map_ids(qids_train)
    perfs = []
    for fold in folds:
        if np.var(Y_train[fold]) != 0:
            #holdout predictions for the left-out query
            P = learner.holdout(fold)
            c = cindex(Y_train[fold], P)
            perfs.append(c)
    perf = np.mean(perfs)
    print("leave-query-out cross-validation cindex %f" % perf)
    partition = map_ids(qids_test)
    test_perfs = []
    #compute the ranking accuracy separately for each test query
    for query in partition:
        #skip queries where all instances have the same score,
        #since in that case cindex is undefined
        if np.var(Y_test[query]) != 0:
            perf = cindex(Y_test[query], P_test[query])
            test_perfs.append(perf)
    test_perf = np.mean(test_perfs)
    print("test cindex %f" % test_perf)
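Both evaluation loops above skip queries whose labels are all identical. The reason is that cindex, the concordance index, is the fraction of label-discordant instance pairs whose predicted ordering agrees with the true ordering; with constant labels there are no such pairs and the measure is undefined. A common convention counts prediction ties as half-correct; the small self-contained sketch below (an illustration, not RLScore's implementation) uses that convention.

def toy_cindex(y, p):
    #Fraction of pairs with y[i] != y[j] whose predictions are ordered
    #consistently with the labels; prediction ties count as half.
    pairs, agreements = 0, 0.0
    for i in range(len(y)):
        for j in range(i + 1, len(y)):
            if y[i] == y[j]:
                continue
            pairs += 1
            if (p[i] - p[j]) * (y[i] - y[j]) > 0:
                agreements += 1.0
            elif p[i] == p[j]:
                agreements += 0.5
    if pairs == 0:
        #all labels equal: no comparable pairs, cindex undefined
        raise ValueError("cindex is undefined when all true scores are equal")
    return agreements / pairs

print(toy_cindex([1, 2, 3], [0.2, 0.1, 0.5]))  #2 of 3 pairs correct -> 0.666...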
def train_rls():
    #Select regparam with leave-one-out cross-validation
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    learner = RLS(X_train, Y_train)
    best_regparam = None
    best_error = float("inf")
    #exponential grid of possible regparam values
    log_regparams = range(-15, 16)
    for log_regparam in log_regparams:
        regparam = 2.**log_regparam
        #RLS is re-trained with the new regparam, this
        #is very fast due to a computational short-cut
        learner.solve(regparam)
        #Leave-one-out cross-validation predictions, also fast
        #due to a computational short-cut
        P_loo = learner.leave_one_out()
        e = sqerror(Y_train, P_loo)
        print("regparam 2**%d, loo-error %f" % (log_regparam, e))
        if e < best_error:
            best_error = e
            best_regparam = regparam
    learner.solve(best_regparam)
    P_test = learner.predict(X_test)
    print("best regparam %f loo-error %f" % (best_regparam, best_error))
    print("test error %f" % sqerror(Y_test, P_test))
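The "computational short-cut" mentioned in the comments refers to the closed-form leave-one-out identity for regularized least-squares: with the hat matrix H = X(X^T X + lambda*I)^{-1} X^T and in-sample predictions yhat = H y, the prediction for instance i made by a model trained without instance i is (yhat_i - H_ii*y_i) / (1 - H_ii), so all leave-one-out predictions follow from a single fit. The numpy check below verifies the identity on a tiny random problem; it is illustrative only and is not RLScore's code.

#Check the closed-form leave-one-out identity for primal regularized
#least-squares against brute-force retraining on a toy problem.
rng = np.random.RandomState(0)
X, y, lam = rng.randn(20, 5), rng.randn(20), 1.0

def ridge_weights(X_tr, y_tr):
    return np.linalg.solve(X_tr.T @ X_tr + lam * np.eye(X_tr.shape[1]), X_tr.T @ y_tr)

H = X @ np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T)
yhat = H @ y
#shortcut: all leave-one-out predictions from the single full fit
loo_fast = (yhat - np.diag(H) * y) / (1.0 - np.diag(H))
#brute force: retrain once per instance, each time leaving it out
loo_slow = np.array([X[i] @ ridge_weights(np.delete(X, i, 0), np.delete(y, i))
                     for i in range(20)])
print(np.allclose(loo_fast, loo_slow))  #True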
def plot_rls():
    #Plot leave-one-out, leave-sentence-out and test set errors
    #as a function of the regularization parameter
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    #list of sentence ids
    ids = np.loadtxt("train_2000_qids.txt")
    #mapped to a list of lists, where each list
    #contains indices for one fold
    folds = map_ids(ids)
    learner = RLS(X_train, Y_train)
    #exponential grid of possible regparam values
    log_regparams = range(-15, 16)
    kfold_errors = []
    loo_errors = []
    test_errors = []
    for log_regparam in log_regparams:
        regparam = 2.**log_regparam
        #RLS is re-trained with the new regparam, this
        #is very fast due to a computational short-cut
        learner.solve(regparam)
        #K-fold cross-validation
        perfs = []
        for fold in folds:
            #computes holdout predictions, where instances
            #in the fold are left out of the training set
            P = learner.holdout(fold)
            perfs.append(sqerror(Y_train[fold], P))
        e_kfold = np.mean(perfs)
        kfold_errors.append(e_kfold)
        P_loo = learner.leave_one_out()
        e_loo = sqerror(Y_train, P_loo)
        loo_errors.append(e_loo)
        P_test = learner.predict(X_test)
        e_test = sqerror(Y_test, P_test)
        test_errors.append(e_test)
    plt.semilogy(log_regparams, loo_errors, label="leave-one-out")
    plt.semilogy(log_regparams, kfold_errors, label="leave-sentence-out")
    plt.semilogy(log_regparams, test_errors, label="test error")
    plt.xlabel(r"$\log_2(\lambda)$")
    plt.ylabel("mean squared error")
    plt.legend(loc=3)
    plt.show()
def print_stats():
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    ids = np.loadtxt("train_2000_qids.txt", dtype=int)
    folds = map_ids(ids)
    print("Parse data set characteristics")
    print("Training set: %d instances, %d features" % X_train.shape)
    print("Instances grouped into %d sentences" % len(folds))
def train_rls():
    #Select regparam with k-fold cross-validation,
    #where instances related to a single sentence form
    #together a fold
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    #list of sentence ids
    ids = np.loadtxt("train_2000_qids.txt")
    #mapped to a list of lists, where each list
    #contains indices for one fold
    folds = map_ids(ids)
    learner = RLS(X_train, Y_train)
    best_regparam = None
    best_error = float("inf")
    #exponential grid of possible regparam values
    log_regparams = range(-15, 16)
    for log_regparam in log_regparams:
        regparam = 2.**log_regparam
        #RLS is re-trained with the new regparam, this
        #is very fast due to a computational short-cut
        learner.solve(regparam)
        #K-fold cross-validation
        P = np.zeros(Y_train.shape)
        for fold in folds:
            #computes holdout predictions, where instances
            #in the fold are left out of the training set
            P[fold] = learner.holdout(fold)
        e = sqerror(Y_train, P)
        print("regparam 2**%d, k-fold error %f" % (log_regparam, e))
        if e < best_error:
            best_error = e
            best_regparam = regparam
    learner.solve(best_regparam)
    P_test = learner.predict(X_test)
    print("best regparam %f k-fold error %f" % (best_regparam, best_error))
    print("test error %f" % sqerror(Y_test, P_test))
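Several snippets above rely on a map_ids helper to turn the array of sentence/query ids into a list of index lists, one per distinct id, which then serves as the fold partition. If it is not available in your RLScore installation, a minimal equivalent is sketched below; this is an illustration, not the library's implementation.

def map_ids(ids):
    #Group row indices by their query/sentence id; returns a list of
    #index lists, one per distinct id, in order of first appearance.
    ids = np.asarray(ids)
    groups = {}
    order = []
    for index, qid in enumerate(ids):
        if qid not in groups:
            groups[qid] = []
            order.append(qid)
        groups[qid].append(index)
    return [groups[qid] for qid in order]

#e.g. map_ids([0, 0, 1, 1, 1, 2]) -> [[0, 1], [2, 3, 4], [5]]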
import numpy as np
from rlscore.learner.rls import KfoldRLS
from rlscore.utilities.reader import read_folds
from rlscore.utilities.reader import read_sparse
from rlscore.measure import auc

train_labels = np.loadtxt("./legacy_tests/data/class_train.labels")
test_labels = np.loadtxt("./legacy_tests/data/class_test.labels")
folds = read_folds("./legacy_tests/data/folds.txt")
train_features = read_sparse("./legacy_tests/data/class_train.features")
test_features = read_sparse("./legacy_tests/data/class_test.features")
kwargs = {}
kwargs["measure"] = auc
kwargs["regparams"] = [2**i for i in range(-10, 11)]
kwargs["Y"] = train_labels
kwargs["X"] = train_features
kwargs["folds"] = folds
learner = KfoldRLS(**kwargs)
grid = kwargs["regparams"]
perfs = learner.cv_performances
for i in range(len(grid)):
    print("parameter %f cv_performance %f" % (grid[i], perfs[i]))
P = learner.predict(test_features)
test_perf = auc(test_labels, P)
print("test set performance: %f" % test_perf)
import numpy as np
from rlscore.learner.mmc import MMC
from rlscore.utilities.reader import read_sparse
from rlscore.measure import auc

train_labels = np.loadtxt("./legacy_tests/data/class_train.labels")
test_labels = np.loadtxt("./legacy_tests/data/class_test.labels")
train_features = read_sparse("./legacy_tests/data/class_train.features")
test_features = read_sparse("./legacy_tests/data/class_test.features")
kwargs = {}
kwargs["Y"] = train_labels
kwargs["X"] = train_features
kwargs["regparam"] = 1
learner = MMC(**kwargs)
P = learner.predict(test_features)
test_perf = auc(test_labels, P)
print("test set performance: %f" % test_perf)
import numpy as np
from rlscore.learner.query_rankrls import QueryRankRLS
from rlscore.utilities.reader import read_qids
from rlscore.utilities.reader import read_sparse
from rlscore.measure import cindex
from rlscore.measure.measure_utilities import UndefinedPerformance
from rlscore.measure.measure_utilities import qids_to_splits

train_labels = np.loadtxt("./legacy_tests/data/rank_train.labels")
test_labels = np.loadtxt("./legacy_tests/data/rank_test.labels")
train_qids = read_qids("./legacy_tests/data/rank_train.qids")
test_features = read_sparse("./legacy_tests/data/rank_test.features")
train_features = read_sparse("./legacy_tests/data/rank_train.features")
test_qids = read_qids("./legacy_tests/data/rank_test.qids")
kwargs = {}
kwargs["Y"] = train_labels
kwargs["X"] = train_features
kwargs["qids"] = train_qids
kwargs["regparam"] = 1
learner = QueryRankRLS(**kwargs)
P = learner.predict(test_features)
test_qids = qids_to_splits(test_qids)
perfs = []
for query in test_qids:
    try:
        perf = cindex(test_labels[query], P[query])
        perfs.append(perf)
    except UndefinedPerformance:
        #cindex is undefined for queries with constant labels; skip them
        pass
test_perf = np.mean(perfs)
print("test set performance: %f" % test_perf)
import numpy as np
from rlscore.learner.cg_rankrls import CGRankRLS
from rlscore.utilities.reader import read_qids
from rlscore.utilities.reader import read_sparse
from rlscore.measure import cindex
from rlscore.measure.measure_utilities import UndefinedPerformance
from rlscore.measure.measure_utilities import qids_to_splits

train_labels = np.loadtxt("./legacy_tests/data/rank_train.labels")
test_labels = np.loadtxt("./legacy_tests/data/rank_test.labels")
train_qids = read_qids("./legacy_tests/data/rank_train.qids")
test_features = read_sparse("./legacy_tests/data/rank_test.features")
train_features = read_sparse("./legacy_tests/data/rank_train.features")
test_qids = read_qids("./legacy_tests/data/rank_test.qids")
kwargs = {}
kwargs["Y"] = train_labels
kwargs["X"] = train_features
kwargs["qids"] = train_qids
kwargs["regparam"] = 1
learner = CGRankRLS(**kwargs)
P = learner.predict(test_features)
test_qids = qids_to_splits(test_qids)
perfs = []
for query in test_qids:
    try:
        perf = cindex(test_labels[query], P[query])
        perfs.append(perf)
    except UndefinedPerformance:
        #cindex is undefined for queries with constant labels; skip them
        pass
test_perf = np.mean(perfs)
print("test set performance: %f" % test_perf)