示例#1
0
def cross_validationcv(model, verbose=0):
    """Print 5-fold cross-validation scores for the estimator built by ``model``.

    Training data is loaded through ``GalaxyData`` using the ``raw_1`` feature
    extractor with feature scaling turned off.

    Args:
        model: callable invoked as ``model(features, solutions, verbose)``.
            It must return a 2-tuple; the first element is used as the
            estimator and the second is ignored.
        verbose: forwarded unchanged to ``model``.
    """
    galaxy_data = GalaxyData(feature_extraction.raw_1, scale_features=False)
    X, y = galaxy_data.get_training_data()

    # Build the estimator; only the first element of the returned pair is used.
    estimator, _unused = model(X, y, verbose)

    # Score on 5 folds with the RMSE scorer, using every available core.
    fold_scores = cross_validation.cross_val_score(
        estimator, X, y, scoring=rmse_scorer, cv=5, n_jobs=-1)
    print(fold_scores)

    mean_score = sum(fold_scores) / len(fold_scores)
    print("Cross validation error: ", mean_score)
示例#2
0
def competition_run():
    """Train the default model and save its predictions for the test set.

    Loads training and test data from a default-constructed ``GalaxyData``,
    fits ``models.default_model`` on the training data, predicts the test
    features and hands the predictions to ``GalaxyData.save_solution``.
    """
    galaxy_data = GalaxyData()

    train_X, train_y = galaxy_data.get_training_data()
    # The second element returned for the test set is not needed here.
    test_X, _ = galaxy_data.get_test_data()

    # Fit, then predict.
    # NOTE(review): the literal 5 is presumably the verbosity level, as in the
    # other model(features, solutions, verbose) call sites -- confirm.
    fitted_model, used_columns = models.default_model(train_X, train_y, 5)
    test_predictions = models.predict(fitted_model, test_X, used_columns)

    galaxy_data.save_solution(test_predictions)
示例#3
0
def grid_search_cv(model, verbose=0):
    """Grid-search ``min_samples_split`` for the estimator built by ``model``.

    Training data is loaded through ``GalaxyData`` using the ``hog_features``
    extractor with feature scaling turned off. The search runs 5-fold
    cross-validation scored by ``rmse_scorer`` on all available cores and
    prints ``grid_scores_`` when done.

    Args:
        model: callable invoked as ``model(features, solutions, verbose)``.
            It must return a 2-tuple; the first element is used as the
            estimator and the second is ignored.
        verbose: forwarded unchanged to ``model``.
    """
    data = GalaxyData(feature_extraction.hog_features, scale_features=False)

    (features, solutions) = data.get_training_data()

    # Train and Predict Model
    (clf, _) = model(features, solutions, verbose)

    # The key must exactly match the estimator's constructor argument name:
    # GridSearchCV rejects unknown parameter names, so the former spelling
    # 'min_sample_split' (missing the "s") made the search fail.
    # NOTE(review): newer scikit-learn tree estimators read float values as a
    # fraction of the sample count -- confirm this range is what is intended.
    parameters = {'min_samples_split': [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 1e-3, 1e-2, 1e-1]}

    gs = grid_search.GridSearchCV(clf, param_grid=parameters, scoring=rmse_scorer, n_jobs=-1,
            cv=5, verbose=5)
    gs.fit(features, solutions)
    print(gs.grid_scores_)
示例#4
0
def run_training_test(model, verbose=0):
    """Train ``model`` on the training data and print its RMSE on the test data.

    Data is loaded through ``GalaxyData`` using the ``raw_9`` feature extractor
    with feature scaling turned off.

    Args:
        model: callable invoked as ``model(features, solutions, verbose)``.
            It must return a ``(estimator, columns)`` pair, which is passed
            on to ``models.predict``.
        verbose: forwarded unchanged to ``model``.
    """
    galaxy_data = GalaxyData(feature_extraction.raw_9, scale_features=False)

    # Same load order as before: test set first, then training set.
    held_out_X, held_out_y = galaxy_data.get_test_data()
    train_X, train_y = galaxy_data.get_training_data()

    # Fit on the training set and predict the held-out features.
    fitted_model, used_columns = model(train_X, train_y, verbose)
    held_out_predictions = models.predict(fitted_model, held_out_X, used_columns)

    # Compare the predictions against the known test solutions.
    print(evaluate.get_rmse(held_out_y, held_out_predictions))
示例#5
0
def run(model, verbose=0):
    """Train ``model`` on a training split and print validation/training errors.

    Data is loaded through ``GalaxyData`` using the ``hog_features`` extractor
    with feature scaling turned off, then split with
    ``split_training_and_validation_data(50)``.

    Args:
        model: callable invoked as ``model(features, solutions, verbose)``.
            It must return a ``(estimator, columns)`` pair.
        verbose: forwarded unchanged to ``model``.
    """
    # Load the data and split into training and validation sets
    # NOTE(review): 50 is presumably the validation percentage -- confirm
    # against GalaxyData.split_training_and_validation_data.
    data = GalaxyData(feature_extraction.hog_features, scale_features=False)

    (training_features, training_solutions,
     validation_features, validation_solutions) = data.split_training_and_validation_data(50)

    # Train and Predict Model
    (clf, columns) = model(training_features, training_solutions, verbose)
    # NOTE(review): this prediction is never read below; the call is kept in
    # case models.predict has side effects. Drop it if it does not.
    predicted_validation_solutions = models.predict(clf, validation_features, columns)

    # Evaluate Predictions
    valid_rmse = evaluate.get_errors_clf(clf, validation_features, validation_solutions)
    train_rmse = evaluate.get_errors_clf(clf, training_features, training_solutions)

    # print() calls replace the Python 2 print statements, which are a syntax
    # error on Python 3. The extra space after the label reproduces the
    # separator the print statement inserted, so the output is unchanged.
    print(" Validation RMSE:  " + str(valid_rmse))
    print(" Training RMSE:  " + str(train_rmse))
示例#6
0
def extract_features(extraction_method, index=None, percent_subset=100, classification=False):
    """Extract features and reduced solutions for a selection of galaxies.

    Args:
        extraction_method: feature extraction method handed to ``GalaxyData``
            (see ``feature_extraction``).
        index: index of the galaxies to process. When ``None`` the universe is
            not restricted and all galaxies are processed.
        percent_subset: size, in percent, of the subset of the data to return.
            At 100 the full training data is returned; otherwise the data is
            split and ``100 - percent_subset`` is passed as the split size.
        classification: forwarded to ``get_reduced_solutions``.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds the features and ``y`` the output
        of ``get_reduced_solutions`` applied to the loaded solutions.
    """
    galaxy_data = GalaxyData(extraction_method, scale_features=False)
    if index is not None:
        galaxy_data.set_restricted_universe(index)

    if percent_subset != 100:
        # Keep only the first two parts of the split; the other two are dropped.
        held_out_percent = 100 - percent_subset
        features, raw_solutions, _, _ = galaxy_data.split_training_and_validation_data(
            held_out_percent, competition=True)
    else:
        features, raw_solutions = galaxy_data.get_training_data(competition=True)

    return (features, get_reduced_solutions(raw_solutions, classification=classification))
示例#7
0
import numpy as np

from evaluate import cross_validate
from galaxy_data import GalaxyData

from sklearn import (ensemble, cross_validation)

# Load the galaxy data without feature scaling.
data = GalaxyData(scale_features=False)
X_train, y_train = data.get_training_data()
X_test, y_test = data.get_test_data()

# Single-estimator random forest, using all cores and verbose progress output.
clf = ensemble.RandomForestRegressor(
    n_estimators=1,
    n_jobs=-1,
    verbose=5,
)

# NOTE(review): the trailing 2 is presumably the number of folds -- confirm
# against evaluate.cross_validate.
scores = cross_validate(clf, X_train, y_train, 2)

# Average the scores over the first axis of the returned array.
mean_score = sum(scores) / float(scores.shape[0])

print(scores)
print(mean_score)
示例#8
0
import random
import numpy as np
import pandas as pd
import SimpleCV as cv

from sklearn import svm
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

import evaluate
import feature_extraction
from galaxy_data import GalaxyData

# Load the training solutions, indexed by galaxy id, and keep only the two
# Class1 answer columns.
solutions_raw = pd.read_csv("./input_data/training_solutions_rev1.csv", index_col="GalaxyID")
solutions = solutions_raw[["Class1.1", "Class1.2"]]

upper_threshold = 1

# Boolean-frame indexing keeps the frame's shape and turns every cell below
# the threshold into NaN.
solutions = solutions[(solutions >= upper_threshold)]

# Drop galaxies where neither column reached the threshold.
solutions = solutions.dropna(how='all')

# Replace the remaining NaN cells with 0. fillna is the vectorized form of the
# former per-cell applymap(lambda ...) call, which is deprecated in recent
# pandas releases.
solutions = solutions.fillna(0)

# Restrict the galaxy data to the selected galaxies and load their features.
data = GalaxyData(feature_extraction.raw)
data.set_restricted_universe(solutions.index)
(feature_vectors, _) = data.get_training_data()

示例#9
0
from galaxy_data import GalaxyData
from sklearn import (grid_search, ensemble)

import evaluate
import pickle

# Load the training data without feature scaling.
data = GalaxyData(scale_features=False)
(X_train, y_train) = data.get_training_data()

clf = ensemble.RandomForestRegressor(n_estimators=100, max_features='log2')

# Candidate values for the forest's min_samples_split parameter.
parameters = {'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

# 5-fold grid search scored by evaluate.get_rmse_clf, using all cores.
gs = grid_search.GridSearchCV(clf, param_grid=parameters, scoring=evaluate.get_rmse_clf, n_jobs=-1,
        cv=5, verbose=5)
gs.fit(X_train, y_train)
print(gs.grid_scores_)

# Persist the fitted search. The context manager closes the file, and so
# flushes the pickle to disk, even if dumping raises.
with open("min_samples_split_rf", "wb") as results_file:
    pickle.dump(gs, results_file)