# Split the held-out positives the same way the train/validation data was split.
half_test_pos, v_test_pos = split(test_pos)

# Unlabeled pool = negatives + held-out positives; draw the labeled-positive
# sample at labeling rate cp.
u = logistic.vstack([half_neg, half_test_pos])
pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)

# Build the validation set with the identical sampling procedure.
u = logistic.vstack([v_neg, v_test_pos])
v_p, v_u = logistic.sample_positive(cp, v_pos, u)
print('set up data...')

data = (pos_sample, unlabeled, v_p, v_u)
# NOTE: normalization deliberately disabled for this run.
# data, fixers = normalize_pu_nonnegative_data(*data)
print('not-normalized...')

_, estimators = logistic.calculate_estimators(*data, max_iter=100)

# Realized label frequency: floor(|P| * cp) / (|test P| + |P|).
realized_c = float(int(half_pos.shape[0] * cp)) / (
    half_test_pos.shape[0] + half_pos.shape[0])
t = (
    cp,
    half_pos.shape[0],
    half_neg.shape[0],
    half_test_pos.shape[0],
    estimators,
    realized_c,
)
table.append(t)
print(t)
# Convert the PU training matrices to CSR sparse format for the solver.
pos_sample = scipy.sparse.csr_matrix(pos_sample)
unlabeled = scipy.sparse.csr_matrix(unlabeled)

# Fully-labeled evaluation set: positives (label 1) stacked above negatives (label 0).
testX = np.vstack([pos, neg])
testy = np.hstack([
    np.array([1] * pos.shape[0]),
    np.array([0] * neg.shape[0]),
])

# Standardize features to zero mean / unit variance.
# FIX: sklearn.preprocessing.Scaler was renamed StandardScaler and the old
# name has been removed from scikit-learn; the original line raises
# AttributeError on any modern install.
# NOTE(review): the scaler is fit on the evaluation data itself — presumably
# intentional for this experiment, but confirm it is not meant to be fit on
# training data instead.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(testX)
testX = scaler.transform(testX)

data = (pos_sample, unlabeled, v_p, v_u)
# data, fixers = logistic.normalize_pu_data(*data)
params, estimators = logistic.calculate_estimators(*data, max_iter=1000)
theta, thetaM, b = params

t = ('vf:', vf, 'c:', c) + estimators
print(t)
table.append(t)

# Fit LR on the true labels for comparison against the PU estimate.
(thetaTrue, _, _), _ = logistic.calculate_estimators(
    *(pos, neg, v_p, v_u), max_iter=1000)

# Scatter plot of the two classes (first two feature dimensions).
fig = pyplot.figure()
ax = fig.add_subplot(111)
ax.scatter(pos[:, 0], pos[:, 1], s=6, c='b', marker='+')
ax.scatter(neg[:, 0], neg[:, 1], s=6, c='r', marker='o', lw=0)
for cp in [1.0, 0.5, 0.1, 0.7, 0.6, 0.4, 0.3, 0.2, 0.9, 0.8]:
    # Hold out half of every collection as a separate validation split.
    def split_half(a):
        return logistic.sample_split(a, len(a) // 2)

    half_pos, v_pos = split_half(pos)
    half_neg, v_neg = split_half(neg)
    half_test_pos, v_test_pos = split_half(test_pos)

    # Unlabeled pool = negatives + held-out positives; sample labeled
    # positives at rate cp.
    u = logistic.vstack([half_neg, half_test_pos])
    pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)

    # Same construction for the validation data.
    u = logistic.vstack([v_neg, v_test_pos])
    v_p, v_u = logistic.sample_positive(cp, v_pos, u)
    print("set up data...")

    _, estimators = logistic.calculate_estimators(pos_sample, unlabeled, v_p, v_u)

    # Realized label frequency: floor(|P| * cp) / (|test P| + |P|).
    realized_c = float(int(len(half_pos) * cp)) / (
        len(half_test_pos) + len(half_pos))
    t = (
        cp,
        len(half_pos),
        len(half_neg),
        len(half_test_pos),
        estimators,
        realized_c,
    )
    table.append(t)
    print(t)
# 80/20 train/validation split on row count.
def split(a):
    return logistic.sample_split(a, int(0.8 * a.shape[0]))

half_pos, v_pos = split(pos)
half_neg, v_neg = split(neg)
half_test_pos, v_test_pos = split(test_pos)

# Unlabeled pool = negatives + held-out positives; sample labeled positives
# at rate cp.
u = logistic.vstack([half_neg, half_test_pos])
pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)

# Validation set built the same way.
u = logistic.vstack([v_neg, v_test_pos])
v_p, v_u = logistic.sample_positive(cp, v_pos, u)
print('set up data...')

data = (pos_sample, unlabeled, v_p, v_u)
# NOTE: normalization deliberately disabled for this run.
# data, fixers = normalize_pu_nonnegative_data(*data)
print('not-normalized...')

_, estimators = logistic.calculate_estimators(*data, max_iter=100)

# Realized label frequency: floor(|P| * cp) / (|test P| + |P|).
realized_c = float(int(half_pos.shape[0] * cp)) / (
    half_test_pos.shape[0] + half_pos.shape[0])
t = (
    cp,
    half_pos.shape[0],
    half_neg.shape[0],
    half_test_pos.shape[0],
    estimators,
    realized_c,
)
table.append(t)
print(t)
# Hold out half of every collection as a separate validation split.
def split_half(a):
    return logistic.sample_split(a, len(a) // 2)

half_pos, v_pos = split_half(pos)
half_neg, v_neg = split_half(neg)
half_test_pos, v_test_pos = split_half(test_pos)

# Unlabeled pool = negatives + held-out positives; sample labeled positives
# at rate cp.
u = logistic.vstack([half_neg, half_test_pos])
pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)

# Same construction for the validation data.
u = logistic.vstack([v_neg, v_test_pos])
v_p, v_u = logistic.sample_positive(cp, v_pos, u)
print('set up data...')

_, estimators = logistic.calculate_estimators(pos_sample, unlabeled, v_p, v_u)

# Realized label frequency: floor(|P| * cp) / (|test P| + |P|).
realized_c = float(int(len(half_pos) * cp)) / (
    len(half_test_pos) + len(half_pos))
t = (
    cp,
    len(half_pos),
    len(half_neg),
    len(half_test_pos),
    estimators,
    realized_c,
)
table.append(t)
print(t)
# Convert the PU training matrices to CSR sparse format for the solver.
pos_sample = scipy.sparse.csr_matrix(pos_sample)
unlabeled = scipy.sparse.csr_matrix(unlabeled)

# Fully-labeled evaluation set: positives (label 1) stacked above negatives (label 0).
testX = np.vstack([pos, neg])
testy = np.hstack([
    np.array([1] * pos.shape[0]),
    np.array([0] * neg.shape[0]),
])

# Standardize features to zero mean / unit variance.
# FIX: sklearn.preprocessing.Scaler was renamed StandardScaler and the old
# name has been removed from scikit-learn; the original line raises
# AttributeError on any modern install.
# NOTE(review): the scaler is fit on the evaluation data itself — presumably
# intentional for this experiment, but confirm it is not meant to be fit on
# training data instead.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(testX)
testX = scaler.transform(testX)

data = (pos_sample, unlabeled, v_p, v_u)
# data, fixers = logistic.normalize_pu_data(*data)
params, estimators = logistic.calculate_estimators(*data, max_iter=1000)
theta, thetaM, b = params

t = ('vf:', vf, 'c:', c) + estimators
print(t)
table.append(t)

# Fit LR on the true labels for comparison against the PU estimate.
(thetaTrue, _, _), _ = logistic.calculate_estimators(
    *(pos, neg, v_p, v_u), max_iter=1000)