# Ignore the new feature, as it messes up PCA
data_dict = pickle.load(open("data/own_data_dict.pkl", "rb"))
features_list = getallFeatures(data_dict)
data = featureFormat(data_dict, features_list, sort_keys=True)

# Scale features to [0, 1] with min-max scaling so no single feature
# dominates the PCA:
mins = np.min(data, axis=0)
maxs = np.max(data, axis=0)
data = (data - mins) / (maxs - mins)

labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = \
    stratifiedShuffleSplit(features, labels)

### Do some PCA
pca = PCA.doPCA(features_train, n=4)
transformed_train = pca.transform(features_train)

# Do some hyperparameter validation:
best_svc, svc_grid_scores = ClassifySVM.gridsearch(
    transformed_train, labels_train
)
svmfit = ClassifySVM.train(transformed_train, labels_train, best_svc)

test_classifier(svmfit, data)
dump_classifier_and_data(svmfit, data_dict, features_list)
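# The helpers used above (stratifiedShuffleSplit, PCA.doPCA, and
# ClassifySVM.gridsearch/train) live in project-local modules that are not
# shown here. Below is a minimal sketch of what they might look like on top
# of scikit-learn; the parameter grid, scoring choice, and return values are
# assumptions for illustration, not the project's actual code.

import numpy as np
from sklearn.decomposition import PCA as SklearnPCA
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.svm import SVC


def stratified_shuffle_split_sketch(features, labels, test_size=0.3,
                                    random_state=42):
    """One stratified train/test split that preserves the class balance."""
    features, labels = np.asarray(features), np.asarray(labels)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                      random_state=random_state)
    train_idx, test_idx = next(splitter.split(features, labels))
    return (features[train_idx], features[test_idx],
            labels[train_idx], labels[test_idx])


def doPCA_sketch(features_train, n=4):
    """Fit an n-component PCA on the training features only."""
    return SklearnPCA(n_components=n).fit(features_train)


def gridsearch_sketch(features_train, labels_train):
    """Grid-search SVC hyperparameters; return the best estimator and scores."""
    param_grid = {"C": [1, 10, 100, 1000], "gamma": [0.001, 0.01, 0.1, 1]}
    grid = GridSearchCV(SVC(kernel="rbf"), param_grid, scoring="f1", cv=5)
    grid.fit(features_train, labels_train)
    return grid.best_estimator_, grid.cv_results_


def train_sketch(features_train, labels_train, best_svc):
    """Refit the grid-searched classifier on the full training split."""
    return best_svc.fit(features_train, labels_train)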
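# getFeatureImportance, used in the feature-selection sweep below, is also
# project-local. A plausible sketch using sklearn's SelectKBest follows; the
# ANOVA F-score ranking and the returned (name, score) pairs are assumptions,
# not the project's actual implementation.

from sklearn.feature_selection import SelectKBest, f_classif


def getFeatureImportance_sketch(features_train, labels_train,
                                features_list, k):
    """Keep the k best features by ANOVA F-score.

    Returns a list of (feature name, score) pairs for the kept features and
    the fitted selector. features_list[0] is assumed to be the label
    ('poi'), so feature columns map to features_list[1:].
    """
    selector = SelectKBest(f_classif, k=k).fit(features_train, labels_train)
    kept = selector.get_support(indices=True)
    importance = [(features_list[i + 1], selector.scores_[i]) for i in kept]
    return importance, selector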
results = {}
for k in range(1, len(features_list)):
    print(k)
    # Get the importance scores and a fitted selector for the top k features:
    importance, selector = getFeatureImportance(features_train, labels_train,
                                                features_list, k)

    ### Next, try with the selector
    selector_train = selector.transform(features_train)

    # Do some hyperparameter validation:
    best_svc, svc_grid_scores = ClassifySVM.gridsearch(
        selector_train, labels_train
    )
    svmfit = ClassifySVM.train(selector_train, labels_train, best_svc)

    precision, recall, f1, f2 = test_classifier(svmfit, data)
    results[k] = [precision, recall, f1, f2]

pprint.pprint(results)
pickle.dump(results, open('featureselection_orig.pkl', "wb"))

pcaresults = {}
for k in range(1, 7):
    print(k)
    ### Do some PCA
    pca = PCA.doPCA(features_train, n=k)
    transformed_train = pca.transform(features_train)

    # Do some hyperparameter validation:
    best_svc, svc_grid_scores = ClassifySVM.gridsearch(
        transformed_train, labels_train