def test_holdout(self):
    """Verify that ``holdout`` matches retraining without the held-out rows.

    For every pairing of the fixture feature matrices (``Xtrain1``,
    ``Xtrain2``) with the fixture label sets (``Ytrain1``, ``Ytrain2``),
    the rows whose query id equals 1 are held out.  A learner trained on
    all rows must give the same ``holdout`` predictions as ``predict`` on a
    learner trained on the remaining rows only.  The comparison is made
    for the default (linear) setup, with ``bias=3.0``, with a Gaussian
    kernel, and again after re-solving both learners over a range of
    regularization parameters.  Finally several index lists are checked
    to make ``holdout`` raise ``IndexError``.
    """

    def agree(full, reduced, features, held_out):
        # Fast holdout first, then the prediction of the learner that was
        # trained without the held-out rows; both must coincide.
        fast = full.holdout(held_out)
        retrained = reduced.predict(features[held_out])
        assert_allclose(fast, retrained)

    def agree_over_regparams(full, reduced, features, held_out, exponents):
        # Re-solve both learners with regparam 2**e and compare again.
        for e in exponents:
            full.solve(2**e)
            reduced.solve(2**e)
            agree(full, reduced, features, held_out)

    for X in [self.Xtrain1, self.Xtrain2]:
        for Y in [self.Ytrain1, self.Ytrain2]:
            m = X.shape[0]
            qids, _ = generate_qids(m)
            qids = np.array(qids)
            hoindices = np.where(qids == 1)[0]
            hocompl = list(set(range(m)) - set(hoindices))

            # Holdout with linear kernel
            rls1 = QueryRankRLS(X, Y, qids)
            rls2 = QueryRankRLS(X[hocompl], Y[hocompl], qids[hocompl])
            agree(rls1, rls2, X, hoindices)

            # Holdout with bias, followed by fast regularization
            rls1 = QueryRankRLS(X, Y, qids, bias=3.0)
            rls2 = QueryRankRLS(X[hocompl], Y[hocompl], qids[hocompl],
                                bias=3.0)
            agree(rls1, rls2, X, hoindices)
            agree_over_regparams(rls1, rls2, X, hoindices, range(-5, 5))

            # Kernel holdout, followed by fast regularization
            rls1 = QueryRankRLS(X, Y, qids, kernel="GaussianKernel",
                                gamma=0.01)
            rls2 = QueryRankRLS(X[hocompl], Y[hocompl], qids[hocompl],
                                kernel="GaussianKernel", gamma=0.01)
            agree(rls1, rls2, X, hoindices)
            agree_over_regparams(rls1, rls2, X, hoindices, range(-15, 15))

            # Incorrect indices: each list must be rejected by the last
            # (Gaussian kernel) full learner.
            # NOTE(review): presumably out-of-range, negative and duplicated
            # indices; why [0, 4, 8] is invalid is not visible here - confirm.
            for bad_indices in ([0, 3, 100], [-1, 0, 2], [1, 1, 2], [0, 4, 8]):
                self.assertRaises(IndexError, rls1.holdout, bad_indices)
def test_holdout(self):
    """Check fast holdout predictions against explicit retraining.

    Iterates over all combinations of the fixture feature matrices and
    label sets.  The rows with query id 1 form the hold-out set; a learner
    trained on every row is asked for ``holdout`` predictions, and these
    are compared with ``predict`` from a second learner trained only on
    the complement rows.  Three learner configurations are exercised
    (default, ``bias=3.0``, Gaussian kernel); the latter two are also
    re-solved over a sweep of regularization parameters ``2**i``.  The
    test ends by asserting that ``holdout`` raises ``IndexError`` for a
    set of index lists.
    """
    # (extra constructor keywords, exponents i for the regparam sweep 2**i)
    configurations = [
        ({}, ()),
        ({"bias": 3.0}, range(-5, 5)),
        ({"kernel": "GaussianKernel", "gamma": 0.01}, range(-15, 15)),
    ]
    rejected_index_lists = [[0, 3, 100], [-1, 0, 2], [1, 1, 2], [0, 4, 8]]

    for X in [self.Xtrain1, self.Xtrain2]:
        for Y in [self.Ytrain1, self.Ytrain2]:
            n_rows = X.shape[0]
            raw_qids, _ = generate_qids(n_rows)
            qids = np.array(raw_qids)
            held_out = np.where(qids == 1)[0]
            kept = list(set(range(n_rows)) - set(held_out))

            for extra, exponents in configurations:
                full = QueryRankRLS(X, Y, qids, **extra)
                reduced = QueryRankRLS(X[kept], Y[kept], qids[kept], **extra)
                fast = full.holdout(held_out)
                naive = reduced.predict(X[held_out])
                assert_allclose(fast, naive)

                # Fast regularization: re-solve both learners and re-compare.
                for exponent in exponents:
                    full.solve(2**exponent)
                    reduced.solve(2**exponent)
                    fast = full.holdout(held_out)
                    naive = reduced.predict(X[held_out])
                    assert_allclose(fast, naive)

            # Incorrect indices, checked against the last learner built
            # above (the Gaussian kernel one).
            # NOTE(review): the reason each list is invalid is not visible
            # from this block - confirm against QueryRankRLS.holdout.
            for bad in rejected_index_lists:
                self.assertRaises(IndexError, full.holdout, bad)
def train_rls():
    """Train QueryRankRLS and print cross-validation and test cindex.

    Reads the ``train_2000_*`` and ``test_2000_*`` text files, trains a
    ``QueryRankRLS`` learner on the training data and predicts the test
    data.  Two figures are printed:

    * the mean cindex over leave-query-out holdout predictions, where the
      instances sharing a query (sentence) id form one fold, and
    * the mean cindex over the test queries.

    Queries whose labels all have the same value are skipped in both
    averages, since cindex is undefined for them.
    """
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    # The second argument is the column count of the training matrix.
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    # One sentence id per instance.
    qids_train = np.loadtxt("train_2000_qids.txt")
    qids_test = np.loadtxt("test_2000_qids.txt")

    learner = QueryRankRLS(X_train, Y_train, qids_train)
    P_test = learner.predict(X_test)

    # Leave-query-out cross-validation on the training set.
    cv_scores = []
    for fold in map_ids(qids_train):
        if np.var(Y_train[fold]) == 0:
            continue
        held_out_predictions = learner.holdout(fold)
        cv_scores.append(cindex(Y_train[fold], held_out_predictions))
    cv_mean = np.mean(cv_scores)
    print("leave-query-out cross-validation cindex %f" % cv_mean)

    # Ranking accuracy computed separately for each test query.
    test_scores = []
    for query in map_ids(qids_test):
        if np.var(Y_test[query]) == 0:
            continue
        test_scores.append(cindex(Y_test[query], P_test[query]))
    test_mean = np.mean(test_scores)
    print("test cindex %f" % test_mean)
def train_rls():
    """Fit QueryRankRLS, then report leave-query-out and test cindex.

    Training and test data are loaded from the ``train_2000_*`` /
    ``test_2000_*`` text files.  After training, the learner's ``holdout``
    method is used to obtain predictions for each training query in turn
    (all instances of one sentence form a fold), and the mean cindex over
    those folds is printed.  The same per-query cindex average is then
    printed for the test predictions.  Queries with zero label variance
    are left out because cindex is undefined when all scores are equal.
    """
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    # Sentence ids, one per instance.
    qids_train = np.loadtxt("train_2000_qids.txt")
    qids_test = np.loadtxt("test_2000_qids.txt")

    learner = QueryRankRLS(X_train, Y_train, qids_train)
    P_test = learner.predict(X_test)

    # Each fold holds the indices of one training query.
    folds = map_ids(qids_train)
    fold_cindices = [
        cindex(Y_train[fold], learner.holdout(fold))
        for fold in folds
        if np.var(Y_train[fold]) != 0
    ]
    print("leave-query-out cross-validation cindex %f" % np.mean(fold_cindices))

    # Same per-query evaluation on the test predictions.
    partition = map_ids(qids_test)
    query_cindices = [
        cindex(Y_test[query], P_test[query])
        for query in partition
        if np.var(Y_test[query]) != 0
    ]
    print("test cindex %f" % np.mean(query_cindices))
def testLabelRankRLS(self):
    """Compare fast QueryRankRLS holdout with naive retraining.

    Random data with 100 rows and 400 features is split into six queries;
    the first five rows (exactly query 0) are held out.  Four learners are
    built: primal and dual (precomputed linear kernel ``X * X.T``) learners
    on all rows, and "naive" primal and dual learners on the remaining
    rows only.  For each regularization parameter ``2**-5 .. 2**4`` the
    hold-out predictions of the naive learners and the ``holdout`` results
    of the full learners are printed and asserted to agree to five decimal
    places.  A closed-form prediction ("Dumb HO") is printed for reference
    but not asserted.
    """
    print("Testing the cross-validation routines of the QueryRankRLS module.\n")

    np.random.seed(100)
    floattype = np.float64

    m, n = 100, 400  # data, features
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    Xtrain = np.asmatrix(np.random.rand(m, n))
    K = Xtrain * Xtrain.T
    ylen = 1
    Y = np.asmatrix(np.zeros((m, ylen), dtype=floattype))
    Y[:, 0] = np.sum(Xtrain, 1)

    labelcount = 5

    hoindices = list(range(labelcount))
    # Sorted explicitly: the complement rows must line up with qidlist_cv
    # below, which must not depend on set iteration order.
    hocompl = sorted(set(range(m)) - set(hoindices))

    # Rows 0-4 keep query id 0; the remaining ids cover these row ranges.
    qidlist = [0] * m
    query_ranges = [(5, 12), (12, 32), (32, 34), (34, 85), (85, 100)]
    for qid, (start, stop) in enumerate(query_ranges, start=1):
        qidlist[start:stop] = [qid] * (stop - start)
    qidlist_cv = qidlist[labelcount:]

    # L = I - P * P.T, where P is the query-membership indicator matrix
    # with each column scaled by 1 / sqrt(query size).
    objcount = max(qidlist) + 1
    P = np.asmatrix(np.zeros((m, objcount), dtype=np.float64))
    for i, qid in enumerate(qidlist):
        P[i, qid] = 1.
    labelcounts = np.sum(P, axis=0)
    P = np.divide(P, np.sqrt(labelcounts))
    D = np.asmatrix(np.ones((1, m), dtype=np.float64))
    L = np.multiply(np.eye(m), D) - P * P.T

    Kcv = K[np.ix_(hocompl, hocompl)]
    Lcv = L[np.ix_(hocompl, hocompl)]

    Xcv = Xtrain[hocompl]
    Xtest = Xtrain[hoindices]
    Yho = Y[hocompl]

    # Learners trained on all rows (used for fast holdout).
    primalrls = QueryRankRLS(X=Xtrain, Y=Y, qids=qidlist)
    dualrls = QueryRankRLS(X=K, kernel='PrecomputedKernel', Y=Y,
                           qids=qidlist)

    # Learners trained without the held-out rows (naive holdout).
    primalrls_naive = QueryRankRLS(X=Xcv, Y=Yho, qids=qidlist_cv)
    dualrls_naive = QueryRankRLS(X=Kcv, kernel='PrecomputedKernel', Y=Yho,
                                 qids=qidlist_cv)

    testkm = K[np.ix_(hocompl, hoindices)]

    for loglambda in range(-5, 5):
        regparam = 2. ** loglambda
        # Blank separator line; a bare ``print`` is a no-op in Python 3.
        print()
        print("Regparam 2^%1d" % loglambda)

        # Closed-form reference prediction, printed for inspection only.
        dumb_ho = (testkm.T
                   * la.inv(Lcv * Kcv + regparam * np.eye(Lcv.shape[0]))
                   * Lcv * Yho)
        print(str(np.squeeze(np.array(dumb_ho.T))) + ' Dumb HO')

        predhos = []
        primalrls_naive.solve(regparam)
        predho = primalrls_naive.predictor.predict(Xtest)
        print(str(predho.T) + ' Naive HO (primal)')
        predhos.append(predho)

        dualrls_naive.solve(regparam)
        predho = dualrls_naive.predictor.predict(testkm.T)
        print(str(predho.T) + ' Naive HO (dual)')
        predhos.append(predho)

        primalrls.solve(regparam)
        predho = np.squeeze(primalrls.holdout(hoindices))
        print(str(predho.T) + ' Fast HO (primal)')
        predhos.append(predho)

        dualrls.solve(regparam)
        predho = np.squeeze(dualrls.holdout(hoindices))
        print(str(predho.T) + ' Fast HO (dual)')
        predhos.append(predho)

        # The naive primal prediction is the reference for the other three.
        predho0 = predhos.pop(0)
        for predho in predhos:
            self.assertEqual(predho0.shape, predho.shape)
            for row in range(predho.shape[0]):
                self.assertAlmostEqual(predho0[row], predho[row], places=5)
def testLabelRankRLS(self):
    """Check that fast holdout agrees with retraining on the complement.

    Builds a random 100 x 400 data set whose rows are grouped into six
    queries and holds out the first five rows, which make up query 0.
    Primal and dual (``'PrecomputedKernel'`` on ``X * X.T``) QueryRankRLS
    learners are trained both on all rows and on the non-held-out rows
    only.  For regularization parameters ``2**-5`` through ``2**4`` the
    test prints each variant's hold-out predictions and asserts that the
    naive dual, fast primal and fast dual results match the naive primal
    result to five decimal places.  A closed-form "Dumb HO" prediction is
    printed as well, without being asserted.
    """
    print("Testing the cross-validation routines of the QueryRankRLS module.\n")

    np.random.seed(100)
    floattype = np.float64

    m, n = 100, 400  # data, features
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function.
    Xtrain = np.asmatrix(np.random.rand(m, n))
    K = Xtrain * Xtrain.T
    ylen = 1
    Y = np.asmatrix(np.zeros((m, ylen), dtype=floattype))
    Y[:, 0] = np.sum(Xtrain, 1)

    labelcount = 5

    hoindices = list(range(labelcount))
    # Ascending order is required so that row k of the reduced data
    # corresponds to qidlist_cv[k]; sort instead of trusting set order.
    hocompl = sorted(set(range(m)) - set(hoindices))

    # Query ids per row: rows below the first boundary stay in query 0.
    qidlist = [0] * m
    boundaries = [5, 12, 32, 34, 85, 100]
    for qid in range(1, len(boundaries)):
        for h in range(boundaries[qid - 1], boundaries[qid]):
            qidlist[h] = qid
    qidlist_cv = qidlist[labelcount:]

    # Indicator matrix of query membership, columns scaled by
    # 1 / sqrt(query size); L is then I - P * P.T.
    objcount = max(qidlist) + 1
    P = np.asmatrix(np.zeros((m, objcount), dtype=np.float64))
    for i, qid in enumerate(qidlist):
        P[i, qid] = 1.
    labelcounts = np.sum(P, axis=0)
    P = np.divide(P, np.sqrt(labelcounts))
    D = np.asmatrix(np.ones((1, m), dtype=np.float64))
    L = np.multiply(np.eye(m), D) - P * P.T

    Kcv = K[np.ix_(hocompl, hocompl)]
    Lcv = L[np.ix_(hocompl, hocompl)]

    Xcv = Xtrain[hocompl]
    Xtest = Xtrain[hoindices]
    Yho = Y[hocompl]

    # Full-data learners, used through their fast ``holdout`` method.
    rpool = {"X": Xtrain, "Y": Y, "qids": qidlist}
    primalrls = QueryRankRLS(**rpool)

    rpool = {"X": K, "kernel": "PrecomputedKernel", "Y": Y,
             "qids": qidlist}
    dualrls = QueryRankRLS(**rpool)

    # Learners that never see the held-out rows.
    rpool = {"X": Xcv, "Y": Yho, "qids": qidlist_cv}
    primalrls_naive = QueryRankRLS(**rpool)

    rpool = {"X": Kcv, "kernel": "PrecomputedKernel", "Y": Yho,
             "qids": qidlist_cv}
    dualrls_naive = QueryRankRLS(**rpool)

    testkm = K[np.ix_(hocompl, hoindices)]

    loglambdas = range(-5, 5)
    for loglambda in loglambdas:
        regparam = 2. ** loglambda
        # Python 3 needs the call parentheses to emit the blank line.
        print()
        print("Regparam 2^%1d" % loglambda)

        # Reference prediction from the explicit formula; printed only.
        regularized = Lcv * Kcv + regparam * np.eye(Lcv.shape[0])
        print(str(np.squeeze(np.array((testkm.T * la.inv(regularized) * Lcv * Yho).T))) + ' Dumb HO')

        predhos = []
        primalrls_naive.solve(regparam)
        predho = primalrls_naive.predictor.predict(Xtest)
        print(str(predho.T) + ' Naive HO (primal)')
        predhos.append(predho)

        dualrls_naive.solve(regparam)
        predho = dualrls_naive.predictor.predict(testkm.T)
        print(str(predho.T) + ' Naive HO (dual)')
        predhos.append(predho)

        primalrls.solve(regparam)
        predho = np.squeeze(primalrls.holdout(hoindices))
        print(str(predho.T) + ' Fast HO (primal)')
        predhos.append(predho)

        dualrls.solve(regparam)
        predho = np.squeeze(dualrls.holdout(hoindices))
        print(str(predho.T) + ' Fast HO (dual)')
        predhos.append(predho)

        # Compare every later variant with the first (naive primal) one.
        predho0 = predhos.pop(0)
        for predho in predhos:
            self.assertEqual(predho0.shape, predho.shape)
            for row in range(predho.shape[0]):
                self.assertAlmostEqual(predho0[row], predho[row], places=5)