# Overlay the regression line on the plot when a fitted model is available.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # An undefined name in the call above (e.g. no reg object yet) simply
    # skips the line; nothing is reported.
    pass
# The raw points are plotted outside the try block, then the figure is shown.
plt.scatter(ages, net_worths)
plt.show()


### Identify and remove the most outlier-y points.
# cleaned_data is initialised here but never reassigned in this block: the
# outlierCleaner result is unpacked straight into three names instead.
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    # Earlier form, kept for reference:
    # cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
    # NOTE(review): this three-way unpack only succeeds if outlierCleaner
    # returns exactly three items (presumably ages, net worths, errors) --
    # confirm against its definition, which is not visible here.
    ages, net_worths, errors = outlierCleaner( predictions, ages_train, net_worths_train )
except NameError:
    # Reached when any name used in the try body is undefined.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")



## Reshape both series into column arrays of shape (n, 1).
## NOTE(review): nothing guards these two lines, so they also run when the
## try block above fell into the NameError handler -- confirm that is intended.
ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))

### refit your cleaned data!
try:
    reg.fit(ages, net_worths)
    print(reg.coef_)
    print(reg.score(ages_test, net_worths_test))


# Draw the regression line if a model named reg can be used.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # Missing names (such as reg) are tolerated: the line is just omitted.
    pass
# Scatter the data points and display the figure.
plt.scatter(ages, net_worths)
plt.show()


### identify and remove the most outlier-y points
# cleaned_data stays empty unless outlierCleaner actually runs, which lets the
# guard below skip the reshaping step.
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
except NameError:
    # print(...) with a single parenthesised string behaves identically on
    # Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")

### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    # zip(*...) transposes the cleaned entries into three parallel tuples,
    # unpacked here as ages, net worths and errors.
    ages, net_worths, errors = zip(*cleaned_data)
    # Rebind ages / net_worths as column arrays of shape (n, 1).
    ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
Пример #3
0
### The first feature must be "poi".
# Earlier hard-coded selections, kept as examples of the expected contents:
#features_list = ['poi','salary'] # You will need to use more features
#features_list = ['poi','salary','total_stock_value'] # You will need to use more features
#features_list = ['poi','salary','total_stock_value','from_this_person_to_poi','from_poi_to_this_person'] # You will need to use more features

# The feature names are read from the file given as the first command-line
# argument, one name per line.
features_filename = sys.argv[1]

# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open(features_filename) as features_file:
    features_list = [line.rstrip('\n') for line in features_file]

  
### Load the dictionary containing the dataset
# Pickle data must be read from a binary-mode file: text mode ("r") fails on
# Python 3 and can corrupt data on platforms that translate line endings.
# The with-block also closes the handle, which the original never did.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# datasets from a trusted source.
with open("final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

### Task 2: Remove outliers

# Rebinds data_dict to whatever outlierCleaner returns.
# NOTE(review): outlierCleaner is called here with the dataset as its only
# argument -- confirm against its definition, which is not visible here.
data_dict = outlierCleaner(data_dict)

### Task 3: Create new feature(s)

# Rebinds data_dict to the value returned by addFeatures.
data_dict = addFeatures(data_dict)

### Store to my_dataset for easy export below.
# Plain alias: my_dataset and data_dict refer to the same object (no copy).
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
# The result of targetFeatureSplit is unpacked as (labels, features), in that order.
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### by convention, n_rows is the number of data points
### and n_columns is the number of features
# Reshape both series into column arrays of shape (n, 1).
ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
# sklearn.cross_validation was deprecated and later removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection. Fall back to the old
# location only for installations that predate the new module.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 10% of the points for testing; the fixed seed keeps the split
# reproducible between runs.
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
# Score on the held-out split. print(...) with one argument works on both
# Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
print(reg.score(ages_test, net_worths_test))

# NOTE(review): the return value of this call is discarded, so it has no
# effect here unless outlierCleaner has side effects -- confirm it is needed.
outlierCleaner(reg.predict(ages_train), ages_train,net_worths_train)



# Plot the fitted line when reg is available.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # No usable name (e.g. reg undefined): leave the line out silently.
    pass
# The data points are drawn outside the try block, then the figure is shown.
plt.scatter(ages, net_worths)
plt.show()


### identify and remove the most outlier-y points
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
'''

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()
'''

### identify and remove the most outlier-y points
# This variant predicts on, and cleans, the full ages / net_worths data.
# cleaned_data stays empty if the regression object cannot be used.
cleaned_data = []
try:
    predictions = reg.predict(ages)
    cleaned_data = outlierCleaner( predictions, ages, net_worths )
except NameError:
    # Function-call form of print: valid on Python 2 and Python 3 alike
    # (the print statement is a SyntaxError on Python 3).
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")

### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    # Transpose the cleaned entries into three parallel tuples.
    ages, net_worths, errors = zip(*cleaned_data)
    # Rebind ages / net_worths as column arrays of shape (n, 1).
    ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
Пример #6
0

# Add the regression line to the figure if a model named reg exists.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # Undefined names are ignored here; only the line is skipped.
    pass
# Scatter plot of the raw data, followed by displaying the figure.
plt.scatter(ages, net_worths)
plt.show()


### identify and remove the most outlier-y points
# Default to an empty result so the guard below is safe even when the
# regression object is unavailable.
cleaned_data = []

try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
except NameError:
    # Parenthesised single-argument print runs unchanged on Python 2 and 3;
    # the statement form does not parse on Python 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")

### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    # Split the cleaned entries into three parallel tuples.
    ages, net_worths, errors = zip(*cleaned_data)
    # Rebind ages / net_worths as column arrays of shape (n, 1).
    ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
    pass
plt.scatter(ages, net_worths)
plt.show()


### identify and remove the most outlier-y points
import pandas as pd

# Collect predictions, training ages and training net worths as columns
# 0, 1 and 2 of a single DataFrame.
# NOTE(review): predictions is read here before the assignment inside the try
# block below, so it must already exist from earlier code -- confirm.
data = pd.DataFrame(predictions)
data[1] = ages_train
data[2] = net_worths_train
cleaned_data = []

try:
    predictions = reg.predict(ages_train)
    c_data = outlierCleaner(data)

    # Disabled post-processing, kept for reference:
    #   c_data = c_data.reset_index()
    #   cl_data = c_data[int(len(c_data)*0.1):]
    #   cl_data = cl_data.sort(columns='index', ascending=True)
    #   cl_data = cl_data.reset_index(drop=True)
    # NOTE(review): with the lines above disabled, cl_data is not assigned
    # anywhere in this block. Unless it is defined elsewhere, the next line
    # raises NameError, which the handler below reports as a missing
    # regression object and which leaves cleaned_data empty. Decide whether
    # c_data was meant here.
    clean_data = [tuple(x) for x in cl_data.values]
    cleaned_data = clean_data
except NameError:
    # print(...) form works on Python 2 and Python 3; the print statement is
    # a SyntaxError on Python 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")




# Convert both series to arrays and give each the column shape (n, 1).
ages = numpy.array(ages).reshape(len(ages), 1)
net_worths = numpy.array(net_worths).reshape(len(net_worths), 1)
from sklearn.model_selection import train_test_split
# 90/10 train/test split with a fixed seed so the split is reproducible.
(ages_train, ages_test,
 net_worths_train, net_worths_test) = train_test_split(
    ages, net_worths, test_size=0.1, random_state=42)

### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like

from sklearn.linear_model import LinearRegression

# First model: fitted on the full training split.
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)

# Second model: fitted only on what outlierCleaner keeps from the training
# split, using the first model's predictions as input to the cleaner.
clf_after_cleaned = LinearRegression()
cleaned_d = outlierCleaner(reg.predict(ages_train), ages_train,
                           net_worths_train)
# Element 0 of each cleaned entry is used as the age and element 1 as the net
# worth. NOTE(review): the resulting array shapes depend on what
# outlierCleaner stores in each entry -- confirm they are 2-D as fit() needs.
age_cleaned = numpy.array([e[0] for e in cleaned_d])
net_worth_cleaned = numpy.array([e[1] for e in cleaned_d])
clf_after_cleaned.fit(age_cleaned, net_worth_cleaned)

# Print the coefficients of both models, then both scores on the same test
# split, for a before/after comparison.
print(reg.coef_)
print(clf_after_cleaned.coef_)
print(reg.score(ages_test, net_worths_test))
print(clf_after_cleaned.score(ages_test, net_worths_test))

# Overlay reg's predictions over ages as a blue line.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # Skip the line quietly if a name used above is not defined.
    pass
# The data points are plotted outside the try block; then show the figure.
plt.scatter(ages, net_worths)
plt.show()
# Same plot layout, applied to the batting data: regression line first.
try:
    plt.plot(batter_ages_flat, reg.predict(batter_ages_flat), color="blue")
except NameError:
    # An undefined name in the call above only suppresses the line.
    pass
# Scatter the batting averages against ages, label both axes and display.
plt.scatter(batter_ages_flat, batting_avgs_flat)
plt.xlabel("ages")
plt.ylabel("batting averages")
plt.show()


# identify and remove the most outlier-y points
# cleaned_data remains an empty list when the predictions cannot be made.
cleaned_data = []
try:
    predictions = reg.predict(batter_ages_train)
    cleaned_data = outlierCleaner(predictions, batter_ages_train, batting_avgs_train)
except NameError:
    # Single-argument print(...) is valid on both Python 2 and Python 3;
    # the print statement is a SyntaxError on Python 3.
    print("can't make predictions to use in identifying outliers")


# only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    ages, avgs, errors = zip(*cleaned_data)
    ages = np.reshape(np.array(ages), (len(ages), 1))
    avgs = np.reshape(np.array(avgs), (len(avgs), 1))

    # refit the data
    try:
        reg.fit(ages, avgs)
        print 'slope after outlier removal:', reg.coef_
        print 'score on test data after outlier removal:', reg.score(batter_ages_test, batting_avgs_test)
Пример #10
0
# Draw the regression line when a model named reg is defined.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # Without the needed names the line is omitted and nothing is reported.
    pass
# Scatter the points, label the axes and show the figure.
plt.scatter(ages, net_worths)
plt.xlabel("ages")
plt.ylabel("net worths")
plt.show()


### identify and remove the most outlier-y points
# Start from an empty result so the guard below can tell whether cleaning ran.
cleaned_data = []
try:
    net_worths_pred = reg.predict(ages_train)
    cleaned_data = outlierCleaner( net_worths_pred, ages_train, net_worths_train )
except NameError:
    # print as a function call: identical output on Python 2 and Python 3,
    # whereas the statement form is a SyntaxError on Python 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")

### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    # Transpose the cleaned entries into three parallel tuples.
    ages, net_worths, errors = zip(*cleaned_data)
    # Rebind ages / net_worths as column arrays of shape (n, 1).
    ages       = numpy.reshape( numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
Пример #11
0
### fill in a regression here!  Name the regression object reg so that
### the plotting code below works, and you can see what your regression looks like

# Regression line first; it needs a model bound to the name reg.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # If a name used above is undefined, only the line is dropped.
    pass
# Plot the data points (outside the try block) and show the figure.
plt.scatter(ages, net_worths)
plt.show()

### identify and remove the most outlier-y points
# This variant predicts on, and cleans, the full ages / net_worths data.
cleaned_data = []
try:
    predictions = reg.predict(ages)
    cleaned_data = outlierCleaner(predictions, ages, net_worths)
except NameError:
    # print(...) with one string works the same on Python 2 and Python 3;
    # the bare print statement does not parse on Python 3.
    print("your regression object doesn't exist, or isn't name reg")
    print("can't make predictions to use in identifying outliers")

### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    ages, net_worths, errors = zip(*cleaned_data)
    ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))

    ### refit your cleaned data!
    try:
        reg.fit(ages, net_worths)
        plt.plot(ages, reg.predict(ages), color="blue")
    except NameError:
Пример #12
0
def test_outliearCleaner():
    """Run outlierCleaner on ten hand-made points and print the result.

    Predictions and ages are both 1..10; net_worths matches them except at
    the fifth point (10 instead of 5), giving one point with a non-zero
    residual. Nothing is asserted: the output is printed for manual
    inspection.
    """
    predictions = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    ages = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    net_worths = np.array([1, 2, 3, 4, 10, 6, 7, 8, 9, 10])
    output = outlier_cleaner.outlierCleaner(predictions, ages, net_worths)
    # Function-call form so the module also parses on Python 3.
    print(output)
# Report the score on the held-out split. Formatting the value into a single
# string keeps the output "Testing R^2 = <value>" identical on Python 2 and
# Python 3 (a two-argument print statement is a SyntaxError on Python 3).
print("Testing R^2 = %s" % (reg.score(ages_test, net_worths_test),))

# Debug aid, left disabled:
# print("Predictions = %s" % (reg.predict(ages_train).flatten(),))

# Overlay the regression line if reg is usable, then plot the data points.
try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    # A missing name only suppresses the line.
    pass
plt.scatter(ages, net_worths)
plt.show()

### identify and remove the most outlier-y points
# cleaned_data stays empty when the regression object cannot be used.
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    # All three inputs are flattened to 1-D before being handed to the cleaner.
    cleaned_data = outlierCleaner(predictions.flatten(), ages_train.flatten(),
                                  net_worths_train.flatten())
except NameError:
    # print(...) with a single string is valid on Python 2 and Python 3;
    # the print statement is a SyntaxError on Python 3.
    print("your regression object doesn't exist, or isn't named reg")
    print("can't make predictions to use in identifying outliers")

### only run this code if cleaned_data is returning data
if len(cleaned_data) > 0:
    ages, net_worths, errors = zip(*cleaned_data)
    ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
    net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))

    ### refit your cleaned data!
    try:
        reg.fit(ages, net_worths)
        print "Slope with outliers removed =", reg.coef_
        print "Testing R^2 =", reg.score(ages_test, net_worths_test)