Пример #1
def addFeatures(data_dict, features_list):
    """Add a combined 'shared_messages' feature to every record in data_dict.

    For each person, the three POI e-mail features are rescaled and then
    summed; a value of 'NaN' (the missing-value marker) counts as 0. The
    sum is stored under data_dict[person]['shared_messages'].

    Args:
        data_dict: mapping of person -> feature dict. It is modified in
            place: each inner dict gains a 'shared_messages' entry. The
            existing feature values are left at their original scale
            because the rescaling is done on a deep copy.
        features_list: returned unchanged.
            NOTE(review): 'shared_messages' is not appended here, so the
            caller presumably adds it itself -- confirm.

    Returns:
        The tuple (data_dict, features_list).
    """
    # NOTE(review): the original call to rescale() was truncated after the
    # first list element. The list below is reconstructed from the keys the
    # loop reads -- confirm against the original source.
    email_features = [
        'from_poi_to_this_person',
        'from_this_person_to_poi',
        'shared_receipt_with_poi',
    ]

    # Rescale a deep copy so the caller's feature values are not changed
    # (copying the whole dict is wasteful, but simple). Rescaling puts the
    # three counts on a level playing field before they are added together.
    # rescale() is defined outside this block; the loop below assumes it
    # returns a dict with the same person keys as its input.
    data_dict_rescale = rescale(deepcopy(data_dict), email_features)

    cnt = 0
    for person, features in data_dict_rescale.items():
        # Missing values are stored as the string 'NaN'; count them as zero.
        shared_messages = sum(
            0 if features[name] == 'NaN' else features[name]
            for name in email_features
        )
        # Write the new feature into the caller's dict, not the scaled copy.
        data_dict[person]['shared_messages'] = shared_messages
        cnt += 1

    # Diagnostic output: both numbers should match if every person was seen.
    # Single-argument parenthesised print gives identical output on Python 2.
    print("data_dict size: " + str(len(data_dict)))
    print("cnt: " + str(cnt))
    return (data_dict, features_list)
Пример #2
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

from sklearn import neighbors

# Build a k-nearest-neighbours classifier; k is passed as n_neighbors.
k = 3
clf = neighbors.KNeighborsClassifier(n_neighbors=k)

# Perform feature re-scaling.
# NOTE: only the input features should be rescaled.
# in_features is a deep copy of the whole features_list, so every name in
# that list is handed to rescale() (defined outside this excerpt).
# NOTE(review): if features_list also contains the target label, the label
# would be rescaled too -- confirm against where features_list is built.
in_features = copy.deepcopy(features_list)
my_dataset = rescale(my_dataset, in_features)

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
# Record the start time so the evaluation run below can be timed
# (presumably time is time.time -- its import is outside this excerpt).
t0 = time()

# Evaluate clf on the rescaled dataset; test_classifier is defined outside
# this excerpt (per the comment above, it is the provided testing script).
test_classifier(clf, my_dataset, features_list)

# Report the elapsed time, rounded to three decimal places.
print "time:", round(time() -t0, 3), "s"

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.