def linear_gd_error(df, Y):
    binary = utils.check_binary(df[Y])
    model = gd.gradient(df, df[Y], 0.00001, max_iterations=50)
    print model
    predict = gd.predict(df, model, binary)
    print predict
    error = mystats.get_error(predict, df_train[Y], binary)
    return error
def logistic_gd(df_train, df_test, Y):
    """ logistic gradient descent """
    binary = utils.check_binary(df_train[Y])
    model = gd.logistic_gradient(df_train, df_train[Y], 0.1, max_iterations=5)
    print model
    predict = gd.predict(df_train, model, binary, True)
    print predict
    error_train = mystats.get_error(predict, df_train[Y], binary)
    predict = gd.predict(df_test, model, binary, True)
    print predict
    error_test = mystats.get_error(predict, df_test[Y], binary)
    return [error_train, error_test]
def linear_gd(df_train, df_test, Y):
    """ linear gradient descent """
    binary = utils.check_binary(df_train[Y])
    model = gd.gradient(df_train, df_train[Y], 0.00001, max_iterations=50)
    print model
    predict = gd.predict(df_train, model, binary)
    print predict
    error_train = mystats.get_error(predict, df_train[Y], binary)
    predict = gd.predict(df_test, model, binary)
    print predict
    error_test = mystats.get_error(predict, df_test[Y], binary)
    return [error_train, error_test]
def testLogisticGradient():
    """ logistic gradient descent """
    df_test, df_train = utils.split_test_and_train(utils.load_and_normalize_spam_data())
    Y = 'is_spam'
    binary = utils.check_binary(df_train[Y])
    model = gd.logistic_gradient(df_train, df_train[Y], .1, max_iterations=5)
    #print model
    #raw_input()
    predict = gd.predict(df_train, model, binary, True)
    print predict
    error_train = mystats.get_error(predict, df_train[Y], binary)
    #raw_input()
    predict = gd.predict(df_test, model, binary, True)
    print predict
    error_test = mystats.get_error(predict, df_test[Y], binary)
    print 'error train {} error_test {}'.format(error_train, error_test)
    return [error_train, error_test]
def testGradient():  # Great success with subset
    test, train = utils.load_and_normalize_housing_set()
    df_full = pd.DataFrame(train)
    subset_size = 100
    df = utils.train_subset(df_full, ['CRIM', 'TAX', 'B', 'MEDV'], n=subset_size)
    dfX = pd.DataFrame([df['CRIM'], df['TAX']]).transpose()
    print len(dfX)
    print dfX
    #raw_input()

    fit = gd.gradient(dfX, df['MEDV'].head(subset_size), .5, max_iterations=300)

    print 'read v fit'
    print len(dfX)
    print df['MEDV'].head(10)
    print fit
    data = gd.add_col(gd.pandas_to_data(dfX), 1)
    print np.dot(data, fit)
def k_folds_linear_gd(df_test, df_train, Y):
    k = 10
    df_test = gd.pandas_to_data(df_test)
    k_folds = partition_folds(df_test, k)
    model = Model_w()
    theta = None
    for ki in range(k - 1):
        print "k fold is {}".format(k)
        data, truth = get_data_and_truth(k_folds[ki])
        binary = True
        model.update(gd.gradient(data, np.array(truth), 0.00001, max_iterations=5, binary=binary))
        print model.w
        if theta is None:
            theta, max_acc = get_best_theta(data, truth, model.w, binary, False)
        predict = gd.predict_data(data, model.w, binary, False, theta)
        error = mystats.get_error(predict, truth, binary)
        print "Error for fold {} is {} with theta =  {}".format(k, error, theta)
    test, truth = get_data_and_truth(k_folds[k - 1])
    predict = gd.predict_data(test, model.w, binary, False, theta)
    test_error = mystats.get_error(predict, truth, binary)
    return [error, test_error]
def testGradient_by_columns(df, cols):  # fail
    df = utils.train_subset(df, cols, n=len(df))
    #dfX = pd.DataFrame([df['CRIM'], df['TAX']]).transpose()
    print len(df)
    print df
    #raw_input()

    fit = gd.gradient(df, df['MEDV'].head(len(df)), .00001, max_iterations=5000)
    print 'read v fit'
    print len(df)
    print df['MEDV'].head(10)
    print fit
    print np.dot(df, fit)
def get_best_theta(data, truth, model, binary, logistic):
    """Sweep 100 candidate decision thresholds across [min(model), max(model)]
    and return the best one.

    Args:
        data: rows to classify with gd.predict_data.
        truth: ground-truth labels for data.
        model: weight vector; its min/max bound the threshold search range.
        binary: passed through to predict/error.
        logistic: unused here (predict_data is called with logistic=False).

    Returns:
        (best_theta, best_score) - the winning threshold and its score.
        NOTE(review): the score comes from mystats.get_error but is maximized
        as if it were accuracy - confirm get_error's orientation.
    """
    best_theta = None
    best_score = 0
    modmin = min(model)
    modmax = max(model)
    span = modmax - modmin
    for step in range(100):
        # BUG FIX: the original computed modmin + step/span, which neither
        # spans [modmin, modmax] nor survives span == 0 (ZeroDivisionError).
        # Scale the fraction step/100 by the span instead.
        theta = modmin + (float(step) / 100.0) * span
        predict = gd.predict_data(data, model, binary, False, theta)
        score = mystats.get_error(predict, truth, binary)
        if best_theta is None or score > best_score:
            best_theta = theta
            best_score = score
    return best_theta, best_score
def testGradient2():
    """ fit linear gradient descent to a small random synthetic set """
    features = np.random.random(size=[10, 2])
    # Target is an exact linear function of the two features plus a bias.
    target = .5 * features[:, 0] + 2 * features[:, 1] + 3
    frame = pd.DataFrame(data=features)
    w = gd.gradient(frame, target, .05)
# Exemplo n.º 10  (scraped example marker; commented out so the file parses)
def testGradSynth():
    data, y = get_test_data()
    df = pd.DataFrame(data, columns=["x0", "x1"])
    print gd.gradient(df, y, .5, max_iterations=30)
    pass
# Exemplo n.º 11  (scraped example marker; commented out so the file parses)
def testLogGradient2():
    X = np.random.random(size=[10, 2])
    y = utils.sigmoid(X[:, 0]* .5 + 2 * X[:, 1] + 3)
    df = pd.DataFrame(data=X)
    w = gd.logistic_gradient(df, y, .05)
    print w