def test_logistic_regression_equivalence_prediction(verbose=False):
    """Check that EmbeddingUpdater reproduces logistic-regression scores.

    A ``linear_model.LogisticRegression`` is fitted on the centered iris
    samples whose target differs from 2. Its coefficients and intercept are
    loaded, through ``logistic2embeddings``, into two identically configured
    ``EmbeddingUpdater`` instances. The predictions returned by each updater
    must be within 1e-2 (Euclidean norm) of the linear scores
    ``x . coef^T + intercept``.

    NOTE(review): a function with this same name is defined again further
    down in this file; that later definition replaces this one when the
    module is imported.

    Args:
        verbose: when true, print the first three predictions of each model.
    """
    iris = datasets.load_iris()
    keep = iris.target != 2
    features = iris.data[keep]
    labels = iris.target[keep]
    features -= np.mean(features, axis=0)

    # number of entities is the dimension with 2 slots and 1 intercept
    n_ents = features.shape[1] + 3

    reference = linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
    reference.fit(features, labels)
    prob = reference.predict_proba(features)
    n_show = 3
    scores = np.dot(features, reference.coef_.T) + reference.intercept_

    # Both updaters are built from exactly the same settings.
    updater_kwargs = dict(
        rank=1,
        n_ents=n_ents,
        n_slots=2,
        reg=1.0,
        max_epochs=500,
        verbose=False,
        preprocessing=multitask_to_tuples,
    )

    first = EmbeddingUpdater(**updater_kwargs)
    first.logistic2embeddings(coefs=reference.coef_,
                              intercept=reference.intercept_)
    # The targets returned by predict replace the labels from here on, which
    # mirrors the original data flow of this test.
    pred_first, labels, _ = first.predict(features, labels)

    second = EmbeddingUpdater(**updater_kwargs)
    second.logistic2embeddings(coefs=reference.coef_,
                               intercept=reference.intercept_)
    pred_second, labels, _ = second.predict(features, labels)

    if verbose:
        print('linear_model.LogisticRegression predictions:', prob[:n_show])
        print('logistic regression prediction: ',
              1.0 / (np.exp(-scores[:n_show]) + 1.0))
        print('EmbeddingUpdater predictions:',
              1.0 / (np.exp(-pred_first[:n_show]) + 1.0))
        print('same:', 1.0 / (np.exp(-pred_second[:n_show]) + 1.0))

    assert np.linalg.norm(scores - pred_first) < 1e-2
    assert np.linalg.norm(scores - pred_second) < 1e-2
def test_logistic_regression_equivalence_learning(verbose=False):
    """Check that a trained EmbeddingUpdater matches logistic regression.

    Only the first iris feature is used, restricted to samples whose target
    differs from 2 and centered. A ``linear_model.LogisticRegression`` with
    ``C=1e10`` provides the reference scores. Three sets of
    ``EmbeddingUpdater`` predictions are compared against them, each within
    1e-1 in Euclidean norm:

    * an updater loaded with the reference coefficients, before fitting;
    * the same updater after ``fit``;
    * a second updater that is only fitted (labelled "random initialization"
      in the verbose output).

    NOTE(review): a function with this same name is defined again further
    down in this file; that later definition replaces this one when the
    module is imported.

    Args:
        verbose: forwarded to both updaters; when true, also print the
            predictions at sample indices 0 and 99.
    """
    iris = datasets.load_iris()
    keep = iris.target != 2
    features = iris.data[keep, 0:1]
    labels = iris.target[keep]
    features -= np.mean(features, axis=0)
    n_ents = features.shape[1] + 3
    idx_show = [0, 99]

    c = 1e10  # no regularization
    reference = linear_model.LogisticRegression(C=c, penalty='l2', tol=1e-6)
    reference.fit(features, labels)
    scores = np.dot(features, reference.coef_.T) + reference.intercept_

    updater_kwargs = dict(
        rank=1,
        n_ents=n_ents,
        n_slots=2,
        reg=1e-10,
        max_epochs=500,
        verbose=verbose,
        preprocessing=multitask_to_tuples,
    )

    # Updater initialised from the reference solution ("oracle").
    oracle = EmbeddingUpdater(**updater_kwargs)
    oracle.logistic2embeddings(coefs=reference.coef_,
                               intercept=reference.intercept_)
    # The targets returned by this first predict call replace the labels for
    # every later fit/predict call, which mirrors the original data flow.
    pred_before, labels, _ = oracle.predict(features, labels)
    oracle.fit(features, labels)
    pred_oracle, _, _ = oracle.predict(features, labels)

    # Updater that is fitted without loading the reference solution.
    scratch = EmbeddingUpdater(**updater_kwargs)
    scratch.fit(features, labels)
    pred_scratch, _, _ = scratch.predict(features, labels)

    if verbose:
        report = (
            ('logistic regression prediction:\n', scores),
            ('EmbeddingUpdater predictions before learning:\n', pred_before),
            ('EmbeddingUpdater predictions after learning (oracle initialization): \n',
             pred_oracle),
            ('EmbeddingUpdater predictions after learning (random initialization): \n',
             pred_scratch),
        )
        for title, raw in report:
            print(title, 1.0 / (np.exp(-raw[idx_show]) + 1.0))

    for candidate in (pred_before, pred_oracle, pred_scratch):
        assert np.linalg.norm(scores - candidate) < 1e-1
def test_larcqy_logistic(verbose=False):
    """Sweep the regularization of an EmbeddingUpdater and print test metrics.

    The data comes either from ``toy_dual_supervision_data`` (when the local
    ``toy`` switch is on) or from ``load_area_aspects_data`` for the
    'waterside' aspect. For each of ten ``reg`` values evenly spaced between
    0.1 and 20, and each epoch budget in the list (currently only 500), a
    model is fitted on the training split; the AUC and negative
    log-likelihood on the test split are then printed. Nothing is asserted.

    Args:
        verbose: currently unused; every model is built with ``verbose=False``.
    """
    # factorization of the parameters with multiple linear regressions
    toy = False
    np.random.seed(1)

    if toy:
        # load toy data
        data, rank, _emb0_val = toy_dual_supervision_data()
        data_train, data_test = train_test_split(data)

        def per_query_max(queries):
            # Largest value found in the first field of the tuples of each query.
            return [np.max([np.max(t[0]) for t in qc[0]]) for qc in queries]

        n_ents = 1 + np.max(per_query_max(data_train) + per_query_max(data_test))
    else:
        from factorix.demos.urban.urban_data_loading import load_area_aspects_data

        # alternative aspect sets: {'multicultural'}, {'posh'}
        aspects = {'waterside'}
        nw, na = 1000000, 300
        word_limits = dict(max_n_words_per_area=nw, max_n_words_per_aspect=na)
        data_train, voc = load_area_aspects_data(aspects, 'train', **word_limits)
        # The vocabulary built on the training split is passed to the test load.
        data_test, voc = load_area_aspects_data(aspects, 'test', vocab=voc,
                                                **word_limits)
        rank = 1
        n_ents = len(voc.index)

    regs = np.linspace(0.1, 20, 10)
    max_epochs_list = [500]  # range(25, 500, 25)

    for reg in regs:
        for max_epochs in max_epochs_list:
            # NOTE(review): reg is passed positionally as the third argument,
            # whereas other tests in this file pass n_slots and reg by
            # keyword - confirm that the third positional parameter is reg.
            model = EmbeddingUpdater(rank, n_ents, reg,
                                     max_epochs=max_epochs, verbose=False)
            model.fit(data_train)
            pred, y, test_nll = model.predict(data_test)
            test_auc = eval_auc(pred, y)
            print('reg: ', reg, 'niter: ', max_epochs,
                  ', auc: ', test_auc, ', nll: ', test_nll)
def test_logistic_regression_equivalence_learning(verbose=False):
    """Compare EmbeddingUpdater training against a logistic-regression fit.

    The test uses the first iris feature only, restricted to samples whose
    target differs from 2 and centered. Reference scores come from a
    ``linear_model.LogisticRegression`` with ``C=1e10``. The predictions of
    an updater loaded with the reference coefficients (before and after
    ``fit``), and of a second updater that is only fitted, must each be
    within 1e-1 (Euclidean norm) of the reference scores.

    NOTE(review): an earlier function in this file has the same name. Being
    later, this definition is the one left bound in the module namespace,
    unless another follows outside the visible part of the file.

    Args:
        verbose: forwarded to both updaters; when true, also print the
            predictions at sample indices 0 and 99.
    """

    def sigmoid(values):
        # Logistic function, written exactly as the test originally had it.
        return 1.0 / (np.exp(-values) + 1.0)

    def make_updater():
        # Both updaters in this test share one configuration.
        return EmbeddingUpdater(rank=1, n_ents=n_ents, n_slots=2, reg=1e-10,
                                max_epochs=500, verbose=verbose,
                                preprocessing=multitask_to_tuples)

    iris = datasets.load_iris()
    x_all, y_all = iris.data, iris.target
    selected = y_all != 2
    inputs = x_all[selected, 0:1]
    targets = y_all[selected]
    inputs -= np.mean(inputs, 0)
    n_ents = inputs.shape[1] + 3
    idx_show = [0, 99]

    c = 1e10  # no regularization
    logreg = linear_model.LogisticRegression(C=c, penalty='l2', tol=1e-6)
    logreg.fit(inputs, targets)
    scores = inputs.dot(logreg.coef_.T) + logreg.intercept_

    # Start from the logistic-regression solution, predict, then keep training.
    warm = make_updater()
    warm.logistic2embeddings(coefs=logreg.coef_, intercept=logreg.intercept_)
    # predict's second return value becomes the targets used by every later
    # call, which mirrors the original data flow.
    pred_init, targets, _nll = warm.predict(inputs, targets)
    warm.fit(inputs, targets)
    pred_warm, _y2, _nll2 = warm.predict(inputs, targets)

    # Train a fresh updater without loading the reference coefficients.
    cold = make_updater()
    cold.fit(inputs, targets)
    pred_cold, _y3, _nll3 = cold.predict(inputs, targets)

    if verbose:
        print('logistic regression prediction:\n', sigmoid(scores[idx_show]))
        print('EmbeddingUpdater predictions before learning:\n',
              sigmoid(pred_init[idx_show]))
        print('EmbeddingUpdater predictions after learning (oracle initialization): \n',
              sigmoid(pred_warm[idx_show]))
        print('EmbeddingUpdater predictions after learning (random initialization): \n',
              sigmoid(pred_cold[idx_show]))

    tolerance = 1e-1
    assert np.linalg.norm(scores - pred_init) < tolerance
    assert np.linalg.norm(scores - pred_warm) < tolerance
    assert np.linalg.norm(scores - pred_cold) < tolerance
def test_logistic_regression_equivalence_prediction(verbose=False):
    """Verify that loaded logistic-regression weights give matching predictions.

    A ``linear_model.LogisticRegression`` is fitted on the centered iris
    samples whose target differs from 2. Two ``EmbeddingUpdater`` instances
    with identical settings are loaded with its coefficients and intercept
    via ``logistic2embeddings``. The predictions of each must be within 1e-2
    (Euclidean norm) of the linear scores ``x . coef^T + intercept``.

    NOTE(review): an earlier function in this file has the same name. Being
    later, this definition is the one left bound in the module namespace,
    unless another follows outside the visible part of the file.

    Args:
        verbose: when true, print the first three predictions of each model.
    """

    def sigmoid(values):
        # Logistic function, written exactly as the test originally had it.
        return 1.0 / (np.exp(-values) + 1.0)

    iris = datasets.load_iris()
    x_all, y_all = iris.data, iris.target
    selected = y_all != 2
    inputs = x_all[selected]
    targets = y_all[selected]
    inputs -= np.mean(inputs, 0)

    # number of entities is the dimension with 2 slots and 1 intercept
    n_ents = inputs.shape[1] + 3

    logreg = linear_model.LogisticRegression(C=1.0, penalty='l2', tol=1e-6)
    logreg.fit(inputs, targets)
    prob = logreg.predict_proba(inputs)
    n_show = 3
    scores = inputs.dot(logreg.coef_.T) + logreg.intercept_

    # Build, load and query two identically configured updaters in turn.
    # Each predict call hands back the targets that feed the next call, which
    # mirrors the original data flow.
    collected = []
    for _ in range(2):
        updater = EmbeddingUpdater(rank=1, n_ents=n_ents, n_slots=2, reg=1.0,
                                   max_epochs=500, verbose=False,
                                   preprocessing=multitask_to_tuples)
        updater.logistic2embeddings(coefs=logreg.coef_,
                                    intercept=logreg.intercept_)
        pred, targets, _nll = updater.predict(inputs, targets)
        collected.append(pred)
    pred2, pred3 = collected

    if verbose:
        print('linear_model.LogisticRegression predictions:', prob[0:n_show])
        print('logistic regression prediction: ', sigmoid(scores[0:n_show]))
        print('EmbeddingUpdater predictions:', sigmoid(pred2[0:n_show]))
        print('same:', sigmoid(pred3[0:n_show]))

    tolerance = 1e-2
    assert np.linalg.norm(scores - pred2) < tolerance
    assert np.linalg.norm(scores - pred3) < tolerance