# Hold-out evaluation of a feature-selection + Lasso regression pipeline.
#
# Reads:  data_to_use, outcomes_to_use (defined earlier in the script).
# Writes: data_to_use / outcomes_to_use (rebound to the training portion),
#         data_in_reserve / outcomes_in_reserve (the held-back portion),
#         imputation, feature_sel, regressor, pipeline, prediction,
#         mean_squared_error, r_squared_prediction, fig, and "temp3.pdf".

# Disabled preprocessing step, kept for reference.
#data_to_use = postprocess.remove_columns_with_large_number_of_missing_values(data_to_use, 0.95, deepcopy=False)

# Number of trailing samples held back for the final evaluation.
n_reserve = 200

# Split BEFORE imputing.  Fitting the imputer on every row would let the
# reserve rows influence the most-frequent values used to fill the training
# data, i.e. information would leak from the evaluation set into training.
raw_training_data = data_to_use[:-n_reserve]
raw_reserve_data = data_to_use[-n_reserve:]
outcomes_in_reserve = outcomes_to_use[-n_reserve:]
outcomes_to_use = outcomes_to_use[:-n_reserve]

# Learn the fill values from the training rows only, then apply the very same
# fitted imputer to the reserve rows so both sets share one column layout.
imputation = Imputer(missing_values='NaN', strategy='most_frequent', verbose=20)
data_to_use = imputation.fit_transform(raw_training_data)
data_in_reserve = imputation.transform(raw_reserve_data)

# Keep the 100 best-scoring features (f_regression), then fit a Lasso model.
# Selection lives inside the pipeline, so it is fitted on training rows only.
feature_sel = SelectKBest(score_func=f_regression, k=100)
regressor = Lasso(fit_intercept=True)
pipeline = Pipeline([('select', feature_sel), ('regression', regressor)])
pipeline.fit(data_to_use, outcomes_to_use)

# Evaluate on the reserve rows.
prediction = pipeline.predict(data_in_reserve)
mean_squared_error = general_f.mean_squared_error(prediction, outcomes_in_reserve)
print("Mean squared error: " + str(mean_squared_error))

# pipeline.score() on the reserve rows; the script reports it as R^2.
r_squared_prediction = pipeline.score(data_in_reserve, outcomes_in_reserve)
print("R^2 error: " + str(r_squared_prediction))

# Plot predicted vs. actual outcomes and save the figure.
fig = lar.plot_predict_actual_pairs(prediction, outcomes_in_reserve)
fig.savefig("temp3.pdf")
# NOTE(review): the statements down to `r_squared_error.append(...)` use
# training_indices / testing_indices and the original comments speak of "the
# fold", so they presumably run once per cross-validation fold inside a loop
# whose header is not part of this excerpt -- re-indent to match that loop
# when splicing this back in.

# Fit the pipeline on this fold's training rows.
fold_train_data = data_to_use[training_indices]
fold_train_outcomes = outcomes_to_use[training_indices]
pipeline.fit(fold_train_data, fold_train_outcomes)

# Predict this fold's testing rows.
fold_test_data = data_to_use[testing_indices]
fold_test_outcomes = outcomes_to_use[testing_indices]
prediction = pipeline.predict(fold_test_data)

# Pool the predicted and actual values across folds.
predicted_outcomes.extend(prediction)
actual_outcomes.extend(fold_test_outcomes)

# Record the pipeline's score on the testing rows (reported below as R^2).
r_squared_prediction = pipeline.score(fold_test_data, fold_test_outcomes)
r_squared_error.append(r_squared_prediction)

# Summarise the error over the pooled predictions from all folds.
# NOTE(review): if general_f.mean_squared_error returns a single number, the
# mean, median and percentiles printed for MSE all equal that number --
# confirm whether per-fold MSE values were intended here.
mse = general_f.mean_squared_error(predicted_outcomes, actual_outcomes)

# Percentiles 2.28 and 97.72 bracket the central ~95.4% of the values.
summaries = (
    ("Mean, median, 95% confidence intervals for MSE:", mse),
    ("Mean, median, 95% confidence intervals for R^2:", r_squared_error),
)
for heading, values in summaries:
    print(heading)
    print(np.mean(values))
    print(np.median(values))
    print(np.percentile(values, (2.28, 97.72), interpolation='linear'))
    print("")

# Two extra blank lines separate this report from whatever is printed next.
print("")
print("")
# Sanity check: impute the data, fit an ordinary linear regression on ALL of
# it, predict the same rows back, and list every sample whose prediction
# differs from the recorded outcome by at least 1e-6.

num = 0

# Fill missing entries with each column's most frequent value.
imputation = Imputer(missing_values='NaN', strategy='most_frequent', verbose=10)
h = imputation.fit_transform(data_to_use)

# Earlier pipeline-based variants, kept for reference.
#pipeline_2 = Pipeline([("imputer", Imputer(missing_values='NaN',
#                                           strategy='most_frequent')),
#                       ("regression", LinearRegression(n_jobs=4))])
#pipeline = Pipeline([("regression", LinearRegression(n_jobs=4))])

regressor = LinearRegression(n_jobs=4, fit_intercept=True)
# Despite its name, this holds whatever regressor.fit() returns, not a
# transformed copy of the data.
data_to_use_transformed = regressor.fit(h, outcomes_to_use)

# Predict on the very rows the model was fitted on.
data_test = regressor.predict(h)
print(len(data_test))

mean_squared_error = general_f.mean_squared_error(data_test, outcomes_to_use)
print("Mean squared error: " + str(mean_squared_error))

r_squared_prediction = regressor.score(h, outcomes_to_use)
print("R^2 error: " + str(r_squared_prediction))

# Show the first 25 actual outcomes, then the first 25 predictions.
print(outcomes_to_use[0:25])
print(data_test[0:25])

# Report each (predicted, actual) pair that disagrees by 1e-6 or more.
for i, predicted in enumerate(data_test):
    actual = outcomes_to_use[i]
    deviation = abs(predicted - actual)
    if deviation >= 1e-6:
        print(" ".join([str(predicted), str(actual)]))