try: plt.plot(ages, reg.predict(ages), color="blue") except NameError: pass plt.scatter(ages, net_worths) plt.show() ### identify and remove the most outlier-y points cleaned_data = [] try: predictions = reg.predict(ages_train) # cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train ) ages, net_worths, errors = outlierCleaner( predictions, ages_train, net_worths_train ) except NameError: print("your regression object doesn't exist, or isn't name reg") print("can't make predictions to use in identifying outliers") ## only run this code if cleaned_data is returning data ages = numpy.reshape( numpy.array(ages), (len(ages), 1)) net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1)) ### refit your cleaned data! try: reg.fit(ages, net_worths) print(reg.coef_) print(reg.score(ages_test, net_worths_test))
### Draw the regression line over the scatter plot. The NameError guard lets
### the scatter plot still be drawn when no object named reg has been defined.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()


### identify and remove the most outlier-y points
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
except NameError:
    # print(...) with one string argument behaves the same on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")


### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    # Split the cleaned 3-tuples into three parallel sequences.
    ages, net_worths, errors = zip(*cleaned_data)
    # Back to column vectors: one row per data point, a single column.
    ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
### The first feature must be "poi".
#features_list = ['poi','salary'] # You will need to use more features
#features_list = ['poi','salary','total_stock_value'] # You will need to use more features
#features_list = ['poi','salary','total_stock_value','from_this_person_to_poi','from_poi_to_this_person'] # You will need to use more features

### Feature names are read from the file named by the first command-line
### argument, one name per line (only the trailing newline is stripped).
features_filename = sys.argv[1]
with open(features_filename) as features_file:
    features_list = [line.rstrip('\n') for line in features_file]

### Load the dictionary containing the dataset
### The file is opened in binary mode because pickle.load needs a bytes stream
### on Python 3, and the with-block closes the handle once loading is done.
### NOTE(review): unpickling can execute arbitrary code -- only load a trusted file.
with open("final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

### Task 2: Remove outliers
data_dict = outlierCleaner(data_dict)

### Task 3: Create new feature(s)
data_dict = addFeatures(data_dict)

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### by convention, n_rows is the number of data points ### and n_columns is the number of features ages = numpy.reshape( numpy.array(ages), (len(ages), 1)) net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1)) from sklearn.cross_validation import train_test_split ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42) ### fill in a regression here! Name the regression object reg so that ### the plotting code below works, and you can see what your regression looks like from sklearn.linear_model import LinearRegression reg = LinearRegression() reg.fit(ages_train, net_worths_train) print reg.score(ages_test, net_worths_test) outlierCleaner(reg.predict(ages_train), ages_train,net_worths_train) try: plt.plot(ages, reg.predict(ages), color="blue") except NameError: pass plt.scatter(ages, net_worths) plt.show() ### identify and remove the most outlier-y points cleaned_data = [] try: predictions = reg.predict(ages_train)
# The plotting code is deliberately disabled by wrapping it in a string literal.
'''
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()
'''


### identify and remove the most outlier-y points
cleaned_data = []
try:
    # Predictions are made on the full ages array here.
    predictions = reg.predict(ages)
    cleaned_data = outlierCleaner( predictions, ages, net_worths )
except NameError:
    # print(...) with one string argument behaves the same on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")


### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    # Split the cleaned 3-tuples into three parallel sequences.
    ages, net_worths, errors = zip(*cleaned_data)
    # Back to column vectors: one row per data point, a single column.
    ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
pass plt.scatter(ages, net_worths) plt.show() ### identify and remove the most outlier-y points import pandas as pd data = pd.DataFrame(predictions) data[1] = ages_train data[2] = net_worths_train cleaned_data = [] try: predictions = reg.predict(ages_train) c_data = outlierCleaner(data) '''c_data = c_data.reset_index() cl_data = c_data[int(len(c_data)*0.1):] cl_data = cl_data.sort(columns='index', ascending=True) cl_data = cl_data.reset_index(drop=True)''' clean_data = [tuple(x) for x in cl_data.values] cleaned_data = clean_data except NameError: print "your regression object doesn't exist, or isn't name reg" print "can't make predictions to use in identifying outliers"
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### by convention, each row is one data point and each column one feature,
### so both arrays are reshaped into single-column matrices
ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))

ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
    ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)

### Second model, fitted only on the points that survive outlier cleaning.
clf_after_cleaned = LinearRegression()
cleaned_d = outlierCleaner(reg.predict(ages_train), ages_train, net_worths_train)

# Element 0 of each cleaned entry is used as the age and element 1 as the net
# worth. They are reshaped explicitly to (n, 1) so that fit() always receives
# a 2-D feature matrix, whether outlierCleaner yields scalars or length-1 arrays.
age_cleaned = numpy.reshape(
    numpy.array([e[0] for e in cleaned_d]), (len(cleaned_d), 1))
net_worth_cleaned = numpy.reshape(
    numpy.array([e[1] for e in cleaned_d]), (len(cleaned_d), 1))
clf_after_cleaned.fit(age_cleaned, net_worth_cleaned)

# Slopes first, then test-set scores, for the original and the cleaned model.
print(reg.coef_)
print(clf_after_cleaned.coef_)
print(reg.score(ages_test, net_worths_test))
print(clf_after_cleaned.score(ages_test, net_worths_test))

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()
try: plt.plot(batter_ages_flat, reg.predict(batter_ages_flat), color="blue") except NameError: pass plt.scatter(batter_ages_flat, batting_avgs_flat) plt.xlabel("ages") plt.ylabel("batting averages") plt.show() # identify and remove the most outlier-y points cleaned_data = [] try: predictions = reg.predict(batter_ages_train) cleaned_data = outlierCleaner(predictions, batter_ages_train, batting_avgs_train) except NameError: print "can't make predictions to use in identifying outliers" # only run this code if cleaned_data is returning data if len(cleaned_data) > 0: ages, avgs, errors = zip(*cleaned_data) ages = np.reshape(np.array(ages), (len(ages), 1)) avgs = np.reshape(np.array(avgs), (len(avgs), 1)) # refit the data try: reg.fit(ages, avgs) print 'slope after outlier removal:', reg.coef_ print 'score on test data after outlier removal:', reg.score(batter_ages_test, batting_avgs_test)
### Draw the regression line over the labelled scatter plot. The NameError
### guard lets the scatter plot still be drawn when reg has not been defined.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.xlabel("ages")
plt.ylabel("net worths")
plt.show()


### identify and remove the most outlier-y points
cleaned_data = []
try:
    net_worths_pred = reg.predict(ages_train)
    cleaned_data = outlierCleaner( net_worths_pred, ages_train, net_worths_train )
except NameError:
    # print(...) with one string argument behaves the same on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")


### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    # Split the cleaned 3-tuples into three parallel sequences.
    ages, net_worths, errors = zip(*cleaned_data)
    # Back to column vectors: one row per data point, a single column.
    ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
### fill in a regression here! Name the regression object reg so that ### the plotting code below works, and you can see what your regression looks like try: plt.plot(ages, reg.predict(ages), color="blue") except NameError: pass plt.scatter(ages, net_worths) plt.show() ### identify and remove the most outlier-y points cleaned_data = [] try: predictions = reg.predict(ages) cleaned_data = outlierCleaner(predictions, ages, net_worths) except NameError: print "your regression object doesn't exist, or isn't name reg" print "can't make predictions to use in identifying outliers" ### only run this code if cleaned_data is returning data if len(cleaned_data) > 0: ages, net_worths, errors = zip(*cleaned_data) ages = numpy.reshape(numpy.array(ages), (len(ages), 1)) net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1)) ### refit your cleaned data! try: reg.fit(ages, net_worths) plt.plot(ages, reg.predict(ages), color="blue") except NameError:
def test_outliearCleaner():
    """Run outlierCleaner on a small hand-made data set and print the result.

    The predictions equal the ages, and net_worths matches them everywhere
    except index 4 (10 instead of 5), so that is the only point with a
    non-zero residual. Nothing is asserted; the output is printed for
    manual inspection.
    """
    predictions = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    ages = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    net_worths = np.array([1, 2, 3, 4, 10, 6, 7, 8, 9, 10])

    output = outlier_cleaner.outlierCleaner(predictions, ages, net_worths)
    # print(...) with a single argument works on both Python 2 and Python 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(output)
print "Testing R^2 =", reg.score(ages_test, net_worths_test) #print "Predictions =", reg.predict(ages_train).flatten() try: plt.plot(ages, reg.predict(ages), color="blue") except NameError: pass plt.scatter(ages, net_worths) plt.show() ### identify and remove the most outlier-y points cleaned_data = [] try: predictions = reg.predict(ages_train) cleaned_data = outlierCleaner(predictions.flatten(), ages_train.flatten(), net_worths_train.flatten()) except NameError: print "your regression object doesn't exist, or isn't named reg" print "can't make predictions to use in identifying outliers" ### only run this code if cleaned_data is returning data if len(cleaned_data) > 0: ages, net_worths, errors = zip(*cleaned_data) ages = numpy.reshape(numpy.array(ages), (len(ages), 1)) net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1)) ### refit your cleaned data! try: reg.fit(ages, net_worths) print "Slope with outliers removed =", reg.coef_ print "Testing R^2 =", reg.score(ages_test, net_worths_test)