def linear_gd_error(df, Y): binary = utils.check_binary(df[Y]) model = gd.gradient(df, df[Y], 0.00001, max_iterations=50) print model predict = gd.predict(df, model, binary) print predict error = mystats.get_error(predict, df_train[Y], binary) return error
def logistic_gd(df_train, df_test, Y): """ logistic gradient descent """ binary = utils.check_binary(df_train[Y]) model = gd.logistic_gradient(df_train, df_train[Y], 0.1, max_iterations=5) print model predict = gd.predict(df_train, model, binary, True) print predict error_train = mystats.get_error(predict, df_train[Y], binary) predict = gd.predict(df_test, model, binary, True) print predict error_test = mystats.get_error(predict, df_test[Y], binary) return [error_train, error_test]
def linear_gd(df_train, df_test, Y): """ linear gradient descent """ binary = utils.check_binary(df_train[Y]) model = gd.gradient(df_train, df_train[Y], 0.00001, max_iterations=50) print model predict = gd.predict(df_train, model, binary) print predict error_train = mystats.get_error(predict, df_train[Y], binary) predict = gd.predict(df_test, model, binary) print predict error_test = mystats.get_error(predict, df_test[Y], binary) return [error_train, error_test]
def testLogisticGradient(): """ logistic gradient descent """ df_test, df_train = utils.split_test_and_train(utils.load_and_normalize_spam_data()) Y = 'is_spam' binary = utils.check_binary(df_train[Y]) model = gd.logistic_gradient(df_train, df_train[Y], .1, max_iterations=5) #print model #raw_input() predict = gd.predict(df_train, model, binary, True) print predict error_train = mystats.get_error(predict, df_train[Y], binary) #raw_input() predict = gd.predict(df_test, model, binary, True) print predict error_test = mystats.get_error(predict, df_test[Y], binary) print 'error train {} error_test {}'.format(error_train, error_test) return [error_train, error_test]
def testGradient(): # Great success with subset test, train = utils.load_and_normalize_housing_set() df_full = pd.DataFrame(train) subset_size = 100 df = utils.train_subset(df_full, ['CRIM', 'TAX', 'B', 'MEDV'], n=subset_size) dfX = pd.DataFrame([df['CRIM'], df['TAX']]).transpose() print len(dfX) print dfX #raw_input() fit = gd.gradient(dfX, df['MEDV'].head(subset_size), .5, max_iterations=300) print 'read v fit' print len(dfX) print df['MEDV'].head(10) print fit data = gd.add_col(gd.pandas_to_data(dfX), 1) print np.dot(data, fit)
def k_folds_linear_gd(df_test, df_train, Y): k = 10 df_test = gd.pandas_to_data(df_test) k_folds = partition_folds(df_test, k) model = Model_w() theta = None for ki in range(k - 1): print "k fold is {}".format(k) data, truth = get_data_and_truth(k_folds[ki]) binary = True model.update(gd.gradient(data, np.array(truth), 0.00001, max_iterations=5, binary=binary)) print model.w if theta is None: theta, max_acc = get_best_theta(data, truth, model.w, binary, False) predict = gd.predict_data(data, model.w, binary, False, theta) error = mystats.get_error(predict, truth, binary) print "Error for fold {} is {} with theta = {}".format(k, error, theta) test, truth = get_data_and_truth(k_folds[k - 1]) predict = gd.predict_data(test, model.w, binary, False, theta) test_error = mystats.get_error(predict, truth, binary) return [error, test_error]
def testGradient_by_columns(df, cols): # fail df = utils.train_subset(df, cols, n=len(df)) #dfX = pd.DataFrame([df['CRIM'], df['TAX']]).transpose() print len(df) print df #raw_input() fit = gd.gradient(df, df['MEDV'].head(len(df)), .00001, max_iterations=5000) print 'read v fit' print len(df) print df['MEDV'].head(10) print fit print np.dot(df, fit)
def get_best_theta(data, truth, model, binary, logistic):
    """Sweep 100 candidate thresholds across [min(model), max(model)]
    and return (best_theta, best_score).

    BUG FIX: the original computed
        theta = modmin + float(theta_i) / (modmax - modmin)
    which divides the step index BY the range instead of scaling it by
    the range, so the candidates never interpolate between modmin and
    modmax (and it raised ZeroDivisionError when modmax == modmin).
    The corrected form below has neither problem.

    NOTE(review): mystats.get_error is treated as a score to maximize
    here (larger is better), matching the original's `acc > max_acc`
    comparison -- confirm that is its contract.  The `logistic`
    parameter is accepted but unused, as in the original.
    """
    modmin = min(model)
    modmax = max(model)
    span = modmax - modmin
    best_theta = None
    max_acc = 0
    for theta_i in range(100):
        theta = modmin + (theta_i / 100.0) * span
        predict = gd.predict_data(data, model, binary, False, theta)
        acc = mystats.get_error(predict, truth, binary)
        if best_theta is None or acc > max_acc:
            best_theta = theta
            max_acc = acc
    return best_theta, max_acc
def testGradient2():
    """Fit gd.gradient on a tiny synthetic linear target (10x2 random X)."""
    features = np.random.random(size=[10, 2])
    # target is an exact linear function of the features plus a bias
    target = .5 * features[:, 0] + 2 * features[:, 1] + 3
    frame = pd.DataFrame(data=features)
    weights = gd.gradient(frame, target, .05)
def testGradSynth(): data, y = get_test_data() df = pd.DataFrame(data, columns=["x0", "x1"]) print gd.gradient(df, y, .5, max_iterations=30) pass
def testLogGradient2(): X = np.random.random(size=[10, 2]) y = utils.sigmoid(X[:, 0]* .5 + 2 * X[:, 1] + 3) df = pd.DataFrame(data=X) w = gd.logistic_gradient(df, y, .05) print w