Example #1
def s(x):
    log1,log2 = logistic_regression.predict(x)
    svm1,svm2 = SVM.predict(x)
    nb1,nb2 = NaiveBayes.predict(x)
    # stack each base model's two class-probability columns side by side
    X = np.concatenate([log1.reshape(-1, 1), log2.reshape(-1, 1),
                        svm1.reshape(-1, 1), svm2.reshape(-1, 1),
                        nb1.reshape(-1, 1), nb2.reshape(-1, 1)], axis=1)
    prediction = model.predict(X)
    return prediction
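Example #1 is the inference half of a stacking ensemble: the two-column probability outputs of three base classifiers become meta-features for a second-level model. The snippet never shows how `model` was fitted; below is a minimal sketch of one way to train it, where the held-out split and the scikit-learn meta-learner are assumptions, not part of the original.

# A sketch of training the stacking meta-learner used in s(x) above.
# Assumes each base model's predict(x) returns two probability columns,
# as in the snippet; x_holdout / y_holdout are hypothetical names.
import numpy as np
from sklearn.linear_model import LogisticRegression

def build_meta_features(x):
    cols = []
    for base in (logistic_regression, SVM, NaiveBayes):  # fitted base models
        p0, p1 = base.predict(x)
        cols += [p0.reshape(-1, 1), p1.reshape(-1, 1)]
    return np.concatenate(cols, axis=1)

# fit the second-level model on held-out data so the meta-learner never
# sees probabilities produced on the base models' own training rows
X_meta = build_meta_features(x_holdout)
model = LogisticRegression().fit(X_meta, y_holdout)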
Example #2
def run_logistic_regression(Train_X, Train_Y, Test_X, Test_Y):

    print('Logistic_Regression:')

    print('\nTraining begins...')
    m, _ = Train_X.shape
    Train_X = np.concatenate([np.ones(shape=(m, 1)), Train_X], axis=1)
    start_time = time.time()
    theta, accuracy, J_history, it = lr.train(Train_X, Train_Y)
    end_time = time.time()
    print('iterations: {it}'.format(it=it))
    print('time_taken: {time} sec'.format(time=(end_time - start_time)))
    print('accuracy: {acc}'.format(acc=accuracy))

    plot_J_history(it, J_history, 'Logistic Regression')

    print('\nValidation on test data:')
    m, _ = Test_X.shape
    Test_X = np.concatenate([np.ones(shape=(m, 1)), Test_X], axis=1)
    res = lr.predict(theta, Test_X)
    accuracy = lr.get_accuracy(res, Test_Y)
    print('validation accuracy: {acc}'.format(acc=accuracy))

    print('\nTop 30 Spam Predictors:')
    features = cd.get_features()
    feature_w = zip(features, theta[1:, 0])
    sorted_feature_w = sorted(feature_w,
                              key=lambda t: (t[1], t[0]),
                              reverse=True)
    for i in range(30):
        print('%s\t\t%.2f' % (sorted_feature_w[i][0], sorted_feature_w[i][1]))

    print('\n\n')
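Example #2 leans on two helpers, `lr.predict` and `lr.get_accuracy`, that are not shown. A minimal sketch of what they could look like, assuming the usual sigmoid hypothesis thresholded at 0.5:

# A sketch of the lr helpers used above, not the actual module:
# sigmoid hypothesis thresholded at 0.5, plus a match-rate accuracy.
import numpy as np

def predict(theta, X):
    h = 1.0 / (1.0 + np.exp(-X.dot(theta)))  # h(x) = sigmoid(X @ theta)
    return (h >= 0.5).astype(int)

def get_accuracy(predictions, y):
    # fraction of predictions that match the labels
    return np.mean(np.ravel(predictions) == np.ravel(y))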
Example #3
def test_logreg_engines_reg():
    data = pd.read_csv('data/ex2data2.txt', header=None)
    # pandas Series have no .reshape; go through .values first
    x1 = data[0].values.reshape(-1, 1)
    x2 = data[1].values.reshape(-1, 1)

    y = data[2].values.reshape(-1, 1)

    x = util.mapfeat(x1,x2)
    theta = np.zeros(x.shape[1])
    l = 1.0

    print('initial cost %f' % logreg.cost(theta, x, y, l))
    print(logreg.gradient(theta, x, y, l).shape)

    param, neval, status = opt.fmin_tnc(func=logreg.cost, x0=theta, fprime=logreg.gradient, args=(x, y, l))
    print(param)
    print('Neval %d status %d\n' % (neval, status))
    print('Cost: %f' % logreg.cost(param, x, y, l))
    
    p = logreg.predict(param, x)
    # compare predictions from the trained parameters with the ground truth
    print('accuracy %f' % (np.sum(np.logical_not(np.logical_xor(p, data[2]))) / float(y.size)))
    util.plot_dec_boundary(param,x,y,True)
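`fmin_tnc` expects a cost function and gradient that take a flat 1-D theta. A sketch of the regularized versions example #3 passes in, consistent with the `(theta, x, y, l)` call signature; the real `logreg` module may differ:

# A sketch of logreg.cost / logreg.gradient as regularized logistic
# regression with the (theta, x, y, l) signature used above. The
# bias term theta[0] is conventionally left unpenalized.
import numpy as np

def cost(theta, x, y, l):
    m = y.size
    h = 1.0 / (1.0 + np.exp(-x.dot(theta.reshape(-1, 1))))
    data_term = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) / m
    reg_term = (l / (2.0 * m)) * np.sum(theta[1:] ** 2)
    return float(data_term + reg_term)

def gradient(theta, x, y, l):
    m = y.size
    h = 1.0 / (1.0 + np.exp(-x.dot(theta.reshape(-1, 1))))
    grad = (x.T.dot(h - y) / m).ravel()
    grad[1:] += (l / m) * theta[1:]  # no regularization on the bias
    return grad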
Example #4
def main_mz():
    dataMat, labelMat = load_data("record.csv")
    weights = lr_train(dataMat, labelMat)
    save_model('model.txt', weights)
    test_data, test_label = load_data("record_test.csv")
    ans = predict(test_data, weights.T)
    n = shape(ans)[0]
    for i in range(n):
        # print each row where the prediction disagrees with the test label
        if ans[i][0] != test_label[i, 0]:
            print(i, test_data[i, 1:], ans[i, 0], test_label[i, 0])
Example #5
async def predict_results(req):
    # unpack the JSON request body to get the input values passed
    # from the frontend; req.json gives it to us as a dictionary

    values = req.json  #values is a dictionary
    prediction = predict(values['age'], values['income'])
    # Other algorithms could be plugged in here and their results compared.

    print('prediction says:', prediction)

    return response.json(prediction)
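The `req`/`response` pair above looks like a Sanic handler. A sketch of how such a coroutine might be wired into an app; the route path and port are assumptions:

# Hypothetical wiring for the async handler above, assuming Sanic
# (req.json and response.json match its API). Path and port are made up.
from sanic import Sanic

app = Sanic("prediction_service")
app.add_route(predict_results, "/predict", methods=["POST"])

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)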
Example #6
def predict_house(data, weights):
    try:
        houses = ['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff']
        _, X = tools.generic_preprocess(data, 'mean')
        weights = weights.drop(weights.columns[0], axis=1)
        students = data.loc[:, 'Hogwarts House'].to_frame()

        for i, house in enumerate(houses):
            theta = np.array(weights.iloc[i:i + 1]).reshape(X.shape[1], 1)
            p = logreg.predict(X, theta)
            students[house] = p

        students = students.drop(columns=['Hogwarts House'])
        predictions = students.idxmax(axis=1)
    except Exception:
        tools.error_exit('Failed to predict houses.')
    return predictions
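Example #6 is one-vs-all classification: each house gets its own binary classifier, and `idxmax` picks the house whose classifier is most confident. For that to work, `logreg.predict` must return probabilities rather than hard 0/1 labels; a sketch under that assumption:

# Sketch: in a one-vs-all setup the per-house predictor should return
# sigmoid probabilities, so idxmax can compare confidences across houses.
import numpy as np

def predict(X, theta):
    return 1.0 / (1.0 + np.exp(-X.dot(theta)))  # P(house | x)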
Example #7
def test_logreg_accept_scores():
    data = pd.read_csv('data/ex2data1.txt', header=None)
    x1 = data[0].values.reshape(-1, 1)
    x2 = data[1].values.reshape(-1, 1)
    x3 = x1 ** 2
    #x4 = x1 ** 4

    x0 = np.ones((data[0].size, 1))
    x = np.hstack([x0, x1, x2, x3])
    #x = np.hstack([x0, x1, x2, x3, x4])

    y = data[2].values.reshape(-1, 1)
    l = 0.0025 #squared term

    #theta must be an array to work with fmin_tnc x0 param
    theta = np.zeros(x[0].size)

    print('initial cost %f' % logreg.cost(theta, x, y, l))

    # fmin_tnc requires x0 to be an array
    #param, neval, status = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(x, y), approx_grad=True, epsilon=0.000000000001)
    param, neval, status = opt.fmin_tnc(func=logreg.cost, x0=theta, fprime=logreg.gradient, args=(x, y, l), maxfun=1000)
    print(param)
    print('Neval %d status %d\n' % (neval, status))
    print('Cost: %f' % logreg.cost(param, x, y, 0))
    #print('COE: %f' % sigmoid(np.dot(np.array([1, 45, 85, (45**2), (45**4)]), param.T)))
    #print('COE: %f' % sigmoid(np.dot(np.array([1, 75.3, 46.3, (75.3**2), (75.3**4)]), param.T)))
    #print('COE: %f' % sigmoid(np.dot(np.array([1, 82.2, 41.9, (82.2**2), (82.2**4)]), param.T)))
    print('COE: %f' % util.sigmoid(np.dot(np.array([1, 45, 85, 45**2]), param.T)))
    print('COE: %f' % util.sigmoid(np.dot(np.array([1, 75.3, 46.3, 75.3**2]), param.T)))
    print('COE: %f' % util.sigmoid(np.dot(np.array([1, 82.2, 41.9, 82.2**2]), param.T)))

    p = logreg.predict(param, x)
    # compare predictions from the trained parameters with the ground truth
    print('accuracy %f' % (np.sum(np.logical_not(np.logical_xor(p, data[2]))) / float(y.size)))

    util.plot_dec_boundary(param, x, y, False)
    
Example #8
def get_stats(examples, alt_examples, w):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    confidences = np.zeros((np.size(examples, 0), 2))

    for i in range(np.size(examples, 0)):
        # get the class of the example
        ex_out = examples[i][-1]

        # get predicted output
        confidence = predict(alt_examples[i], w)
        out_pred = np.round(confidence)
        confidences[i][0] = ex_out
        confidences[i][1] = confidence

        if out_pred == 1:
            if ex_out == 1:
                TP += 1
            else:
                FP += 1
        else:
            if ex_out == 0:
                TN += 1
            else:
                FN += 1

    TOT = float(len(examples))
    accuracy = float(TP + TN) / (TOT)
    if TP == 0 and FP == 0:
        print('no positive classifications, cannot compute precision')
        precision = 0
    else:
        precision = float(TP) / float(TP + FP)
    if TP == 0 and FN == 0:
        print('no positive examples, cannot compute recall')
        recall = 0
    else:
        recall = float(TP) / float(TP + FN)
    return accuracy, precision, recall, confidences
Example #9
lambda_ = 1

# set options for optimize.minimize
options = {'maxiter': 100}

res = optimize.minimize(costFunctionReg,
                        initial_theta, (X, y, lambda_),
                        jac=True,
                        method='TNC',
                        options=options)

# the fun property of OptimizeResult object returns
# the value of costFunction at optimized theta
cost = res.fun

# the optimized theta is in the x property of the result
theta = res.x

utils.plotDecisionBoundary(plot_data, theta, X, y)
pyplot.xlabel('Microchip Test 1')
pyplot.ylabel('Microchip Test 2')
pyplot.legend(['y = 1', 'y = 0'])
pyplot.grid(False)
pyplot.title('lambda = %0.2f' % lambda_)
pyplot.show()

# Compute accuracy on our training set
p = predict(theta, X)

print('Train Accuracy: %.1f %%' % (np.mean(p == y) * 100))
print('Expected accuracy (with lambda = 1): 83.1 % (approx)\n')
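Because the call above passes `jac=True`, `scipy.optimize.minimize` expects `costFunctionReg` to return both the cost and its gradient. A sketch consistent with regularized logistic regression; the original implementation may differ:

# costFunctionReg must return (cost, gradient) since jac=True above.
# A sketch, assuming 1-D theta and y and a regularized logistic loss.
import numpy as np

def costFunctionReg(theta, X, y, lambda_):
    m = y.size
    h = 1.0 / (1.0 + np.exp(-X.dot(theta)))
    J = (-y.dot(np.log(h)) - (1 - y).dot(np.log(1 - h))) / m
    J += (lambda_ / (2.0 * m)) * np.sum(theta[1:] ** 2)
    grad = X.T.dot(h - y) / m
    grad[1:] += (lambda_ / m) * theta[1:]  # bias term unregularized
    return J, grad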
Example #10
import numpy as np

from load import load, load_feature
from logistic_regression import logistic_regression, predict

if __name__ == '__main__':
    features = ["Pclass", "Sex"]
    x, y = load(features, "train.csv")
    theta = np.zeros((x.shape[0], 1))
    theta = logistic_regression(x.transpose(), y, theta, 0.1, 500)
    x_test = load(features, "test.csv")
    y_predict = predict(x_test, theta)
    ids = load_feature("PassengerId", "data/test.csv")
    # print(ids)
    with open("data/ans.csv", 'w') as f:
        f.write("PassengerId,Survived\n")
        for i in range(len(ids)):  # 418 rows in the Titanic test set
            f.write('{},{}\n'.format(ids[i], y_predict[i][0]))

    # f = open("features.txt", "a")
    # for i in range(len(features)):
    #     f.write(features[i] + ", ")
    # f.write("\t" + str(accuracy) + "\n")
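The imported `logistic_regression` and `predict` are not shown. From the call `logistic_regression(x.transpose(), y, theta, 0.1, 500)`, a plausible reading is batch gradient descent with learning rate 0.1 for 500 iterations; the sketch below assumes that, plus a samples-by-features layout after the transpose:

# A sketch of the imported helpers, inferred from the call site.
# Shapes assumed: X (m, n), y (m, 1), theta (n, 1).
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def logistic_regression(X, y, theta, alpha, num_iters):
    m = X.shape[0]
    for _ in range(num_iters):
        grad = X.T.dot(sigmoid(X.dot(theta)) - y) / m
        theta = theta - alpha * grad  # one batch gradient-descent step
    return theta

def predict(X, theta):
    return (sigmoid(X.dot(theta)) >= 0.5).astype(int)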
Example #11
    cost = result.fun

    # print the minimum cost found by scipy.optimize after the given number of iterations
    print('Cost at theta found by scipy.optimize.minimize: %f' % cost)

    # plot the decision boundary
    plot_data(X_original, y, show=False)
    u = np.linspace(-1, 1.5, 50)
    v = np.linspace(-1, 1.5, 50)
    z = np.zeros((u.size, v.size))
    for i in range(u.size):
        for j in range(v.size):
            z[i, j] = map_feature(u[i], v[j]).dot(theta)
    plot.contour(u, v, z.T, 0)
    plot.show()

    # print predictions (per the plot, test results near 0 at the center mean a higher pass probability)
    test_data = [[0.1, -0.1], [-0.7, 0.2], [0.8, -0.1], [1.0, -1.0]]
    for data in test_data:
        test1 = data[0]
        test2 = data[1]
        x = map_feature(np.array(test1), np.array(test2))
        prob = sigmoid(np.array(x).dot(theta))
        print('Probability that a microchip with test results {} and {} passes quality assurance: {}'.format(
            test1, test2, prob))

    # print the training accuracy
    predictions = predict(theta, X)
    accuracy = 100 * np.mean(predictions == y)
    print('Train accuracy: %0.2f %%' % accuracy)
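`map_feature` is what lets a linear theta produce the curved contour above: it expands the two raw scores into polynomial terms. A sketch of the usual expansion; the degree is an assumption:

# A sketch of map_feature: degree-d polynomial expansion of two
# features into [1, x1, x2, x1^2, x1*x2, x2^2, ...]. Degree 6 is the
# common choice for this dataset but is an assumption here.
import numpy as np

def map_feature(x1, x2, degree=6):
    x1, x2 = np.atleast_1d(x1), np.atleast_1d(x2)
    terms = [np.ones_like(x1, dtype=float)]
    for i in range(1, degree + 1):
        for j in range(i + 1):
            terms.append((x1 ** (i - j)) * (x2 ** j))
    return np.stack(terms, axis=-1)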
Example #12
            correctBayes += 1
    if correctBayes > maxBayes:
        maxBayes = correctBayes
        maxHyperParametersIndexBayes = i
    if correctAdaboost > maxAdaboost:
        maxAdaboost = correctAdaboost
        maxHyperParametersIndexAdaboost = i

print("Training logistic regression")
for i in range(len(logregrHyperParameters)):
    print(i)
    correctlogregr = 0
    w8z = logistic_regression.train(sortedIGs, trainMailsList,
                                    logregrHyperParameters[i])
    for incoming in development_mails:
        if (logistic_regression.predict(w8z, incoming, sortedIGs) >
                0.5) == incoming[-1]:
            correctlogregr += 1
    if correctlogregr > maxlogregr:
        maxlogregr = correctlogregr
        maxHyperParametersIndexlogregr = i

# now test and plot
mypath = join("pu_corpora_public", "pu3", "part" + "10")
test_mails = []
for file in listdir(mypath):
    with open(join(mypath, file), "r") as f:
        templist = []
        for line in f:
            for word in line.split():
                if word not in templist and word.isnumeric():  #len(word) > 1
Example #13
import plot
import logistic_regression as lr

data = lr.load_data('iris_data.csv')  # Load the data

plot.scatter_plot(
    data, ['Iris-setosa', 'Iris-versicolor'])  # Scatter plot of the data

X, y = lr.split(data)  # Split into data and labels

X_train, X_test, y_train, y_test = lr.train_test_split(
    X, y)  # Split all the data into training and testing set

theta = lr.SGD(X_train, y_train)  # Run SGD to calculate optimal theta
print('\nCalculated theta:\n {}'.format(theta))

hypothesis = lr.predict(X_test, theta)  # Test the model

lr.accuracy(hypothesis, y_test)

plot.boundary(data, ['Iris-setosa', 'Iris-versicolor'],
              theta)  # Plot the decision boundary
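`lr.SGD` is called with no hyperparameters, so they must default inside the module. A sketch of a per-sample stochastic-gradient-descent trainer for the logistic loss; the learning rate, epoch count, and shuffling are assumptions:

# A sketch of what lr.SGD might do: shuffle each epoch and take one
# logistic-loss gradient step per sample. Defaults are assumptions.
import numpy as np

def SGD(X, y, alpha=0.01, epochs=100, seed=0):
    rng = np.random.default_rng(seed)
    theta = np.zeros(X.shape[1])
    for _ in range(epochs):
        for i in rng.permutation(X.shape[0]):
            h = 1.0 / (1.0 + np.exp(-X[i].dot(theta)))
            theta -= alpha * (h - y[i]) * X[i]  # single-sample gradient
    return theta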
Example #14
print("Calculated GD = \n", grad)

# Compute and display cost and gradient with non-zero theta
test_theta = np.array([[-24], [0.2], [0.2]])
J = lr.compute_cost(test_theta, X1, y)
grad = lr.gradient_descent(test_theta, X1, y)

print('Cost at test theta: {:7.3f}'.format(J))  # ans = 0.218
print('Gradient at test theta: \n', grad)  # ans = [[0.043], [2.566], [2.647]]

# overlay the decision boundary on the data
# but, first compute the optimized theta for global min :: ans = [[-25.161], [0.206], [0.201]]
theta = lr.optimizer_func(initial_theta, X1, y)
print('Computed theta: ', theta)
theta = np.vstack(theta)

# now compute the decision boundary
lr.decision_boundary(theta, X1, y)

# test the model by running a prediction  :: ans = 0.775 +/- 0.002
# for a student with score 45 on exam 1 and score 85 on exam 2
X_test = np.array([1, 45, 85])
prob = lr.sigmoid(np.dot(X_test, theta))
print("Probability of student with scores {} getting admitted = {}".format(
    X_test[[1, 2]], prob))

# calculate the overall accuracy of our model :: ans = 89.0
p = lr.predict(theta, X1)
accuracy = np.sum(np.equal(p, y)) / m
print("Accuracy of the model = {:7.3f}%".format(accuracy * 100))