示例#1
0
#data_to_use = postprocess.remove_columns_with_large_number_of_missing_values(data_to_use, 0.95, deepcopy=False)

# Evaluate a SelectKBest + Lasso pipeline on a holdout made of the final rows.
#
# The holdout is split off BEFORE the imputer is fitted. Fitting the imputer on
# the full data set (as was done previously) lets the reserve rows influence
# the fill values used for training, which leaks holdout information into the
# model and makes the reported scores optimistic.
holdout_size = 200

data_in_reserve = data_to_use[-holdout_size:]
outcomes_in_reserve = outcomes_to_use[-holdout_size:]

data_to_use = data_to_use[:-holdout_size]
outcomes_to_use = outcomes_to_use[:-holdout_size]

# Learn the fill values from the training rows only, then apply those same
# values to the reserve rows so both sets go through an identical transform.
imputation = Imputer(missing_values='NaN',
                     strategy='most_frequent',
                     verbose=20)
data_to_use = imputation.fit_transform(data_to_use)
data_in_reserve = imputation.transform(data_in_reserve)

# Keep the 100 best-scoring features (f_regression), then fit a Lasso model.
feature_sel = SelectKBest(score_func=f_regression, k=100)
regressor = Lasso(fit_intercept=True)

pipeline = Pipeline([('select', feature_sel), ('regression', regressor)])

pipeline.fit(data_to_use, outcomes_to_use)

# Score the fitted pipeline on the untouched reserve rows.
prediction = pipeline.predict(data_in_reserve)
mean_squared_error = general_f.mean_squared_error(prediction,
                                                  outcomes_in_reserve)
print("Mean squared error: " + str(mean_squared_error))

r_squared_prediction = pipeline.score(data_in_reserve, outcomes_in_reserve)
print("R^2 error: " + str(r_squared_prediction))

# Plot predicted against actual outcomes and write the figure to disk.
fig = lar.plot_predict_actual_pairs(prediction, outcomes_in_reserve)
fig.savefig("temp3.pdf")
    # NOTE(review): the enclosing loop header is outside this view; presumably
    # it iterates over cross-validation splits that supply training_indices and
    # testing_indices -- confirm against the full file.

    # Fit the pipeline on this fold's training rows, then predict the outcomes
    # of the fold's testing rows.
    pipeline.fit(data_to_use[training_indices],
                 outcomes_to_use[training_indices])
    prediction = pipeline.predict(data_to_use[testing_indices])

    # Accumulate this fold's predicted and actual outcomes so that an overall
    # error can be computed once every fold has been processed.
    predicted_outcomes.extend(prediction)
    actual_outcomes.extend(outcomes_to_use[testing_indices])

    # Calculate an R^2 value for the fold on its testing rows and record it.
    r_squared_prediction = pipeline.score(data_to_use[testing_indices],
                                          outcomes_to_use[testing_indices])
    r_squared_error.append(r_squared_prediction)

# Compute the mean squared error over the predictions pooled from every fold.
mse = general_f.mean_squared_error(predicted_outcomes, actual_outcomes)


def _print_summary(header, values):
    """Print `header`, then the mean, median and percentile pair of `values`."""
    # NOTE(review): 2.28 / 97.72 look like the +/-2 standard-deviation tail
    # percentiles of a normal distribution rather than 2.5 / 97.5 -- confirm
    # that this is the interval the "95%" label is meant to describe.
    lower_upper = (2.28, 97.72)
    print(header)
    print(np.mean(values))
    print(np.median(values))
    print(np.percentile(values, lower_upper, interpolation='linear'))


# NOTE(review): if general_f.mean_squared_error returns a single number, the
# three MSE statistics below will all coincide -- confirm what it returns.
_print_summary("Mean, median, 95% confidence intervals for MSE:", mse)

print("")
_print_summary("Mean, median, 95% confidence intervals for R^2:",
               r_squared_error)

# Three blank lines to separate this report from whatever is printed next.
print("")
print("")
print("")
num = 0

# Replace 'NaN' entries in the data using the 'most_frequent' strategy.
imputation = Imputer(missing_values='NaN',
                     strategy='most_frequent',
                     verbose=10)
h = imputation.fit_transform(data_to_use)

# Earlier pipeline-based variants, kept for reference:
#pipeline_2 = Pipeline([("imputer", Imputer(missing_values='NaN',
#                                           strategy='most_frequent')),
#                    ("regression", LinearRegression(n_jobs=4))])

#pipeline = Pipeline([("regression", LinearRegression(n_jobs=4))])

regressor = LinearRegression(n_jobs=4, fit_intercept=True)

# NOTE(review): despite its name, this variable holds whatever fit() returns
# (presumably the fitted estimator), not a transformed copy of the data.
data_to_use_transformed = regressor.fit(h, outcomes_to_use)

# Predict on the very rows the model was fitted on (an in-sample check).
data_test = regressor.predict(h)
print(len(data_test))
mean_squared_error = general_f.mean_squared_error(data_test, outcomes_to_use)
print("Mean squared error: " + str(mean_squared_error))

r_squared_prediction = regressor.score(h, outcomes_to_use)
print("R^2 error: " + str(r_squared_prediction))

# Show the first 25 recorded outcomes next to the first 25 predictions.
print(outcomes_to_use[0:25])
print(data_test[0:25])

# List every row whose prediction is at least 1e-6 away from its outcome.
for i, predicted in enumerate(data_test):
    actual = outcomes_to_use[i]
    if abs(predicted - actual) >= 1e-6:
        print(str(predicted) + "    " + str(actual))