def get_train_data(limit=-1):
    """Load, augment and scale the training set.

    ``limit`` is forwarded unchanged to ``rd.read_train``. The loaded samples
    and labels are then passed through ``rd.nudge_dataset``, and the resulting
    samples through ``scale``.

    Returns:
        A ``(samples, labels)`` pair: the scaled samples and the labels
        produced by the augmentation step.
    """
    print('Loading train data')
    samples, labels = rd.read_train(limit=limit)

    print('Augmenting data set')
    samples, labels = rd.nudge_dataset(samples, labels)

    print('Scaling data')
    return scale(samples), labels
예제 #2
0
def get_train_data(limit=-1):
    """Return the training data after augmentation and scaling.

    The work happens in three stages, each announced on stdout:

    1. ``rd.read_train(limit=limit)`` reads the data and its targets.
    2. ``rd.nudge_dataset`` augments both.
    3. ``scale`` is applied to the data only; the targets are returned as-is.
    """
    # Stage 1: read.
    print('Loading train data')
    data, target = rd.read_train(limit=limit)

    # Stage 2: augment (data and targets are replaced together).
    print('Augmenting data set')
    data, target = rd.nudge_dataset(data, target)

    # Stage 3: scale the data; targets are left untouched.
    print('Scaling data')
    data = scale(data)

    return data, target
예제 #3
0
def main():
    """Train the network, report training metrics and save the model.

    Steps, in order:

    1. Read the training set with ``read_train`` and fit with ``nn_model``.
    2. Predict on the training set and print precision, recall and F-score
       for the labels 0-9.
    3. Read the test set with ``read_test`` and predict on it.
    4. Write costs, predictions, the ``W1``/``W2``/``b1``/``b2`` parameters
       and the hyper-parameters to ``model.json``.
    5. Plot the cost curve.

    The model is written to disk *before* the plot is displayed:
    ``plt.show()`` blocks until the window is closed and may fail when no
    display is available, and neither should be able to discard the result
    of a full training run.
    """
    print("Reading dataset...")
    X_train, Y_train, Y_train_raw = read_train()
    print("Dataset ready.")

    print("Start training...")
    learning_rate = 0.045
    num_iterations = 2500
    parameters, costs = nn_model(X_train,
                                 Y_train,
                                 num_iterations=num_iterations,
                                 learning_rate=learning_rate,
                                 print_cost=True)
    print("Training finished.")

    print("Predicting train set...")
    Y_prediction_train = predict(X_train, parameters)
    # The raw labels are reshaped into a column with X_train.shape[1] rows
    # before being compared with the predictions.
    precision, recall, fscore, _ = precision_recall_fscore_support(
        Y_train_raw.reshape((X_train.shape[1], 1)),
        Y_prediction_train,
        labels=list(range(10)))
    print("Precision:", precision)
    print("Recall:", recall)
    print("Fscore:", fscore)

    print("Reading test set...")
    X_test = read_test()
    print("Predicting test set...")
    Y_prediction_test = predict(X_test, parameters)
    print("Predicted.")

    costs = np.squeeze(costs)

    # Everything is converted with .tolist() so that it is JSON-serializable.
    d = {
        "costs": costs.tolist(),
        "Y_prediction_test": Y_prediction_test.tolist(),
        "Y_prediction_train": Y_prediction_train.tolist(),
        "W1": parameters["W1"].tolist(),
        "W2": parameters["W2"].tolist(),
        "b1": parameters["b1"].tolist(),
        "b2": parameters["b2"].tolist(),
        "learning_rate": learning_rate,
        "num_iterations": num_iterations
    }

    print("Saving model...")
    # Serialize first so a serialization error cannot leave a truncated file.
    jd = json.dumps(d)
    with open("model.json", "w") as f:
        f.write(jd)
    print("Model saved.")

    # Plot last: showing the figure blocks until the window is closed.
    plt.plot(costs)
    plt.ylabel("cost")
    plt.xlabel("iterations")
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
def classifyRF(train_file="train.csv", test_file="test.csv", trees=70,
               output_file="submit.csv"):
    """Train a random forest and write its test-set predictions to a CSV.

    Args:
        train_file: Passed to ``rd.read_train`` as ``file_name``.
        test_file: Passed to ``rd.read_test`` as ``file_name``.
        trees: Number of trees in the forest (``n_estimators``).
        output_file: Path of the CSV to write. Defaults to ``'submit.csv'``,
            the location this function always used before the parameter
            existed.

    The CSV has two columns: ``ImageId``, numbered from 1, and ``Label``,
    the predicted value for the corresponding test row.
    """
    # Read the training features and labels, then the test data.
    print("Reading train data")
    X, y = rd.read_train(file_name=train_file)
    print("Augmenting dataset")
    X, y = rd.nudge_dataset(X, y)
    print("Reading test data")
    test_data = rd.read_test(file_name=test_file)

    # Only the number of trees is configured; every other classifier
    # parameter keeps its default.
    rfc = RandomForestClassifier(n_estimators=trees)
    print("Training classifier")
    rfc.fit(X, y)
    predictions = rfc.predict(test_data)

    # Submission format: one row per test sample, ids starting at 1.
    pd.DataFrame({
        "ImageId": range(1, len(predictions) + 1),
        "Label": predictions,
    }).to_csv(output_file, index=False, header=True)
예제 #5
0
def classifyRF(train_file="train.csv", test_file="test.csv", trees=70,
               output_file="submit.csv"):
    """Fit a random forest on the training file and save a submission CSV.

    Args:
        train_file: File name handed to ``rd.read_train``.
        test_file: File name handed to ``rd.read_test``.
        trees: How many trees the forest is built with (``n_estimators``).
        output_file: Destination of the submission CSV. The default,
            ``'submit.csv'``, matches the previously hard-coded path, so
            existing callers are unaffected.

    The written CSV has a header row and the columns ``ImageId`` (1-based
    row number) and ``Label`` (the prediction for that row).
    """
    print("Reading train data")
    X, y = rd.read_train(file_name=train_file)
    print("Augmenting dataset")
    X, y = rd.nudge_dataset(X, y)
    print("Reading test data")
    test_data = rd.read_test(file_name=test_file)

    # Quick baseline: apart from the tree count, the classifier is left at
    # its defaults.
    print("Training classifier")
    rfc = RandomForestClassifier(n_estimators=trees)
    rfc.fit(X, y)
    predictions = rfc.predict(test_data)

    # ImageId is 1-based, one id per prediction.
    image_ids = range(1, len(predictions) + 1)
    submission = pd.DataFrame({"ImageId": image_ids, "Label": predictions})
    submission.to_csv(output_file, index=False, header=True)
예제 #6
0
#Let's test out what number of trees is best on a forest!
import numpy as np
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# Read the training set and pass it through the augmentation step.
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

# Collectors for the cross-validation scoring below, which is currently
# disabled; they stay empty for now.
scores = []
scores_std = []

# Progress marker so it is obvious that loading finished.
print('Start learning...')

# Forest sizes to evaluate; the largest ones may be more than is needed.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]

for n_trees in forests:
    print("This forest has {} trees!".format(n_trees))
    forest = RandomForestClassifier(n_trees)

    # Disabled cross-validation scoring:
    #   score = cross_validation.cross_val_score(forest, X, y)
    #   scores.append(np.mean(score))
    #   scores_std.append(np.std(score))

    # The forest size is embedded in the name given to the evaluation.
    plot_path = "plots_extended/RandomForest_{}_trees.png".format(n_trees)
    e.evaluate_classifier(forest, X, y, name=plot_path)

# Disabled summary of the cross-validation results:
#   print('Score: ', np.array(scores))
#   print('Std  : ', np.array(scores_std))
예제 #7
0
#Let's test out what number of trees is best on a forest!
import numpy as np
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# --- Data -------------------------------------------------------------------
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

# --- Experiment setup -------------------------------------------------------
# Result lists for the cross-validation path; that path is commented out
# further down, so nothing is appended to them at the moment.
scores = []
scores_std = []

# Printed so a run that is still alive can be told from one that crashed
# while loading.
print('Start learning...')

# Candidate tree counts; the last few might be excessive.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]
name_template = "plots_extended/RandomForest_{}_trees.png"

# --- Sweep ------------------------------------------------------------------
for tree_count in forests:
    print("This forest has {} trees!".format(tree_count))
    e.evaluate_classifier(
        RandomForestClassifier(tree_count),
        X,
        y,
        name=name_template.format(tree_count),
    )
    # Cross-validation alternative (disabled); it needs the classifier bound
    # to a name first:
    #   score = cross_validation.cross_val_score(classifier, X, y)
    #   scores.append(np.mean(score))
    #   scores_std.append(np.std(score))

# Reporting for the disabled cross-validation path:
#   print('Score: ', np.array(scores))
#   print('Std  : ', np.array(scores_std))