def test_end_to_end_logistic_regression():
    """End-to-end smoke test: train on well-separated clusters, then check
    that the learned parameters label random points with a sensible mix of
    classes and label two extreme points exactly as expected.
    """
    pos, neg = logistic.generate_well_separable(100, 0.50)
    # graph_pos_neg(pos, neg)
    X = logistic.vstack([pos, neg])
    y = logistic.hstack([
        np.array([1] * len(pos)),
        np.array([0] * len(neg)),
    ])
    random_points = logistic.generate_random_points(
        100, center=np.array([2, 2]), scale=np.array([5, 5]))
    # theta = logistic.logistic_gradient_descent(X, y)
    thetaC = logistic.fast_logistic_gradient_descent(X, y)
    theta = thetaC  # pure-Python solver disabled above; reuse the fast result
    # assert np.allclose(theta, thetaC)

    # Both parameter vectors (currently aliases) must split the random
    # points into a non-trivial mix of 0s and 1s.
    for params in (theta, thetaC):
        predicted = logistic.label_data(random_points, params, binarize=True)
        assert sum(1 for lbl in predicted if lbl == 0) > 10
        assert sum(1 for lbl in predicted if lbl == 1) > 10

    # Two extreme points must be labeled negative and positive respectively.
    extremes = np.array([[-1, -1], [11, 11]])
    for params in (theta, thetaC):
        predicted = logistic.label_data(extremes, params, binarize=True)
        assert np.allclose([0, 1], predicted)
        assert not np.allclose([1, 1], predicted)
def normalize_pu_nonnegative_data(pos_sample, unlabeled, v_p, v_u):
    """Same as above but works for non-negative data.

    Fits a scaler on the stacked training data (positive sample + unlabeled)
    and applies it to all four splits. Returns the transformed splits plus
    the fitted objects and the transform function.
    """
    stacked = logistic.vstack([pos_sample, unlabeled])
    # Decorrelation is currently disabled: the NMF object is constructed so
    # it can be returned with the other fitted objects, but it is never
    # fitted or applied.
    # decorrelater = sklearn.decomposition.PCA(whiten=False)
    decorrelater = sklearn.decomposition.NMF()
    # decorrelater.fit(stacked)
    # NOTE(review): `sklearn.preprocessing.Scaler` was renamed
    # `StandardScaler` in later scikit-learn releases — confirm the pinned
    # scikit-learn version before upgrading.
    transformer = sklearn.preprocessing.Scaler()
    transformer.fit(stacked)

    # fixer = lambda d: transformer.transform(decorrelater.transform(d))
    def fixer(arr):
        return transformer.transform(arr)

    transformed = (fixer(pos_sample), fixer(unlabeled), fixer(v_p), fixer(v_u))
    return (transformed, (decorrelater, transformer, fixer))
# Script fragment (Python 2): caches the swissprot matrices to .npy files,
# then sweeps the labeled-positive fraction `cp`, building train/validation
# splits for each setting and fitting estimators.
# NOTE(review): this chunk is truncated — the trailing `t = (` tuple is cut
# off at the end of the visible source.
numpy.save(os.path.join(folder, 'data.neg.swissprot.npy'), neg.todense())
numpy.save(os.path.join(folder, 'data.test_pos.swissprot.npy'), test_pos.todense())
print 'read data...'
table = []
for cp in [1.0, 0.5, 0.1, 0.7, 0.6, 0.4, 0.3, 0.2, 0.9, 0.8]:
    # split out the validation set separately
    # (80% of each array goes to training, the rest to validation)
    split = lambda a: logistic.sample_split(a, int(0.8 * a.shape[0]))
    half_pos, v_pos = split(pos)
    half_neg, v_neg = split(neg)
    half_test_pos, v_test_pos = split(test_pos)
    # figure out the subset to sample (c)
    u = logistic.vstack([half_neg, half_test_pos])
    pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)
    # create validation set the same way
    u = logistic.vstack([v_neg, v_test_pos])
    v_p, v_u = logistic.sample_positive(cp, v_pos, u)
    print 'set up data...'
    data = (pos_sample, unlabeled, v_p, v_u)
    #data, fixers = normalize_pu_nonnegative_data(*data)
    print 'not-normalized...'
    #print 'normalized...'
    _, estimators = logistic.calculate_estimators(*data, max_iter=100)
    t = (
def add_x2_y2(a):
    """Accepts an (N,2) array, adds 2 more columns which are first col
    squared, second col squared.
    """
    first_col_sq = a[:, 0] ** 2
    second_col_sq = a[:, 1] ** 2
    # Stack as rows against the transpose, then transpose back so the
    # squared terms become the third and fourth columns.
    stacked = logistic.vstack([a.T, first_col_sq, second_col_sq])
    return stacked.T
# Script fragment (Python 2): loads the cached swissprot matrices, then
# sweeps the labeled-positive fraction `cp` with 50/50 train/validation
# splits and fits estimators for each setting.
# NOTE(review): this chunk is truncated — the trailing `t = (...)` tuple is
# cut off after `len(half_test_pos),` at the end of the visible source.
pos, neg, test_pos = (np.load(os.path.join(folder, "data.%s.swissprot.npy" % d)) for d in npy_filenames)
print "read data..."
# set up data
table = []
for cp in [1.0, 0.5, 0.1, 0.7, 0.6, 0.4, 0.3, 0.2, 0.9, 0.8]:
    # split out the validation set separately
    # (integer halving: half to training, the rest to validation)
    split_half = lambda a: logistic.sample_split(a, len(a) / 2)
    half_pos, v_pos = split_half(pos)
    half_neg, v_neg = split_half(neg)
    half_test_pos, v_test_pos = split_half(test_pos)
    # figure out the subset to sample (c)
    u = logistic.vstack([half_neg, half_test_pos])
    pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)
    # create validation set the same way
    u = logistic.vstack([v_neg, v_test_pos])
    v_p, v_u = logistic.sample_positive(cp, v_pos, u)
    print "set up data..."
    _, estimators = logistic.calculate_estimators(pos_sample, unlabeled, v_p, v_u)
    t = (
        cp,
        len(half_pos),
        len(half_neg),
        len(half_test_pos),