def test(): """ 测试用函数 """ my_dataset = data_dict clean_outliers(my_dataset) # new_feature(my_dataset) features_list = gen_features(my_dataset) list_nan = check_nan(my_dataset, n=5) features_list = [f for f in features_list if f not in list_nan] print(features_list) ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features_rs = targetFeatureSplit(data) features_rs = MinMaxScaler().fit_transform(features_rs) labels_train, labels_test, features_train, features_test = train_test_split( labels, features_rs, test_size=0.3, random_state=42) clf = RandomForestClassifier(max_depth=5, n_estimators=3, min_samples_split=2, min_samples_leaf=2, random_state=36) clf.fit(features_train, labels_train) print(clf.feature_importances_) pred = clf.predict(features_test) print(clf.score(features_test, labels_test)) print(metrics.precision_score(labels_test, pred)) print(metrics.recall_score(labels_test, pred)) dump_classifier_and_data(clf, my_dataset, features_list) tester.main()
def menu():
    print('''
\u001b[38;5;9;1mdP .d88888b dP
\u001b[38;5;10;1m88 88. "' 88
\u001b[38;5;11;1m88 .dP `Y88888b. d8888P 88d888b. .d8888b. dP. .dP
\u001b[38;5;12;1m88888" `8b 88 88' `88 88' `88 `8bd8'
\u001b[38;5;13;1m88 `8b. d8' .8P 88 88 88. .88 .d88b.
\u001b[38;5;14;1mdP `YP Y88888P dP dP `88888P' dP' `dP
\u001b[0m ''')
    is_running = True
    while is_running:
        choice = program_menu()
        if choice == "Sample recorder":
            sample_recorder.main()
        elif choice == "Trainer":
            trainer.main()
        elif choice == "Evaluator":
            evaluator.main()
        else:
            tester.main()
        again = get_binary_validation("Do you want to load another program?", False)
        if not again:
            is_running = False
def runGSCV(pipeline, parameters, cv, flist):
    gs = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
    t0 = time()
    gs.fit(features, labels)
    print 'done in %0.3fs' % (time() - t0)
    print 'best score %3.3f' % gs.best_score_
    print 'best params: %s' % gs.best_params_
    gs_scores = gs.grid_scores_
    # for key in gs_scores:
    #     print key
    print '------- best features'
    # Assumes the pipeline's first step is a scoring selector (e.g. SelectKBest).
    best_features = sorted(zip(flist[1:],
                               gs.best_estimator_.steps[0][1].scores_,
                               gs.best_estimator_.steps[0][1].get_support()),
                           reverse=True, key=lambda x: x[1])
    # print best_features
    for feature in best_features:  # renamed from `list`, which shadowed the builtin
        if feature[2]:
            print feature
    print '----- running tester'
    dump_classifier_and_data(gs.best_estimator_.steps[1][1], my_dataset, flist)
    tester.main()
    return None
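# runGSCV reads `steps[0][1].scores_`, so it assumes the pipeline's first step is
# a scoring selector such as SelectKBest. A minimal, hypothetical call; the grid
# values and k choices are illustrative, and features, labels, my_dataset and
# features_list are assumed to be in scope as in the other snippets:
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline([('kbest', SelectKBest(f_classif)),
                     ('tree', DecisionTreeClassifier(random_state=42))])
parameters = {'kbest__k': [5, 8, 10],
              'tree__min_samples_split': [2, 10, 20]}
cv = StratifiedShuffleSplit(labels, n_iter=100, random_state=42)
runGSCV(pipeline, parameters, cv, features_list)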
def test(path, tests):
    print "Starting tests... (this might take a while)"
    print "----------------"
    stdout = sys.stdout
    sys.stdout = Logger(os.path.join(curPath, "results", "results.txt"))
    try:
        tester.main(path, tests)
    except Exception, e:
        print "Failed to execute tests: ", e
    finally:
        # Restore the stdout saved above so logging stops at the Logger file.
        sys.stdout = stdout
def learn_svm(my_dataset, features_list, scaler, cvalues, iterations):
    for cvalue in cvalues:
        for maxiter in iterations:
            svm_clf = svm.SVC(kernel='rbf', C=cvalue, max_iter=maxiter)
            clf = Pipeline([('scaler', scaler), ('SVM', svm_clf)])
            dump_classifier_and_data(clf, my_dataset, features_list)
            print 'C', cvalue, '#ofiterations', maxiter
            tester.main()
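# A hypothetical invocation of learn_svm; the scaler choice and the C / max_iter
# grids are illustrative (max_iter=-1 tells SVC to run with no iteration cap):
from sklearn.preprocessing import MinMaxScaler

learn_svm(my_dataset, features_list, MinMaxScaler(),
          cvalues=[1, 10, 100, 1000], iterations=[100, 1000, -1])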
def test_code(features, labels):
    import tester
    from sklearn.cross_validation import train_test_split
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.5, random_state=42)
    from sklearn.metrics import average_precision_score
    # print "Average Precision-recall score = {0:0.5f}".format(average_precision_score(labels_train, labels_test))
    tester.main()
def learn_DT(my_dataset, features_list, scaler):
    # min_samples_split = 10
    dt = DecisionTreeClassifier()
    clf = Pipeline([('scaler', scaler), ('classifier', dt)])
    data = featureFormat(my_dataset, features_list)
    labels, features = targetFeatureSplit(data)
    clf.fit(features, labels)
    dump_classifier_and_data(clf, my_dataset, features_list)
    tester.main()
def computeExternalTestResult(clf, data, feature_list):
    import tester
    try:
        return tester.main(clf=clf, dataset=data, feature_list=feature_list)
    except ValueError as e:
        print("ERROR: Exception occurred running tester: {}".format(e))
        return {"Accuracy": 0, "F1": 0, "Precision": 0, "Recall": 0}
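# computeExternalTestResult expects a tester.main variant that accepts keyword
# arguments and returns a metrics dict (the fallback dict above shows the keys).
# A hypothetical comparison loop built on top of it:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

candidates = {'nb': GaussianNB(),
              'tree': DecisionTreeClassifier(random_state=42)}
results = {name: computeExternalTestResult(c, my_dataset, features_list)
           for name, c in candidates.items()}
best = max(results, key=lambda name: results[name]["F1"])
print("best candidate by F1: {}".format(best))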
def main(): """ 执行函数 1.清理异常值 2.生成特征列表 3.移除NaN值最多的N个特征 4.分离特征和Labels 5.选出5个最佳的特征 6.依次根据f1, recall, accuracy, precision利用GridSearchCV选出最佳分类器 7.用选出的分类器计算分数 """ ### Task 2: Remove outliers ### Task 3: Create new feature(s) ### Store to my_dataset for easy export below. my_dataset = data_dict clean_outliers(my_dataset) # new_feature(my_dataset) features_list = gen_features(my_dataset) list_nan = check_nan(my_dataset, n=5) features_list = [f for f in features_list if f not in list_nan] ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) # draw_plot(data) features = MinMaxScaler().fit_transform(features) sb = SelectKBest(chi2, k=3) features = sb.fit_transform(features, labels) list_score = ['f1', 'recall', 'accuracy', 'precision'] dict_result = {} for scoring in list_score: print('---====[{0}]====---'.format(scoring)) clf = choose_best_est(features, labels, scoring) dict_result[scoring] = clf print('\n>>>>>>>>>>>>>>>>>>>>>>\n') for cate, clf in dict_result.items(): print('++++++++++++++++++++++++++') print(cate) print('++++++++++++++++++++++++++') dump_classifier_and_data(clf, my_dataset, features_list) tester.main()
def assignment():
    x = "assign"
    send_commands(conn, x)
    file1 = open('assignment.py', 'wb')
    print('file opened successfully')
    n = conn.recv(128)
    data1 = n
    # print(data1)
    while True:
        n = conn.recv(128)
        data1 = data1 + n
        # print("\nCHUNK written")
        if len(n) < 128:
            break
    file1.write(data1)
    file1.close()
    return tester.main('assignment.py')
def __evaluate_one_ref_hypothesis_pair(self, refs, hyps):
    """Write each reference/hypothesis pair to its own file, then run the tester.

    :param refs: iterable of reference strings
    :param hyps: iterable of hypothesis strings; 'nan' marks a missing hypothesis
    :return: the tester's evaluation result
    """
    # Dump the data into the corresponding files
    for index, pair in enumerate(zip(refs, hyps)):
        file_ref_nm = self.reference_store_loc + '/ref' + str(index) + '.txt'
        file_hyp_nm = self.hypothesis_store_loc + '/gen' + str(index) + '.txt'
        # `with` ensures both files are closed after writing
        with open(file_ref_nm, 'w') as ref_file, open(file_hyp_nm, 'w') as hyp_file:
            ref_file.write(str(pair[0]))
            if pair[1] != 'nan':
                hyp_file.write(str(pair[1]))
            else:
                hyp_file.write('')
    # Call the tester function to get the evaluations
    return tester.main()
def runTest(self, clf, features_list):
    print "Test result on stratified cross-validation data..."
    dump_classifier_and_data(clf, self.data_dict, features_list)
    tester.main()
    return
# In[203]:

print
print "*********************************************************************************************************"
print "Model Evaluation - baseline performance before any newly engineered features / feature selection / tuning"
print "*********************************************************************************************************"

# data = featureFormat(my_dataset, features_list, sort_keys=True)
# labels, features = targetFeatureSplit(data)

# Create and test the Gaussian Naive Bayes classifier
clf = GaussianNB()
tester.dump_classifier_and_data(clf, my_dataset, features_list_org_all)
tester.main()
print "*********************************************************************************************************"

# Create and test the Decision Tree classifier
clf = DecisionTreeClassifier()
tester.dump_classifier_and_data(clf, my_dataset, features_list_org_all)
tester.main()
print "*********************************************************************************************************"

# Create and test the K-Means clustering model (used here as a 2-cluster classifier)
clf = KMeans(n_clusters=2)
tester.dump_classifier_and_data(clf, my_dataset, features_list_org_all)
tester.main()
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = [
    'poi',
    'salary',
    'bonus',
    'from_poi_to_this_person',
    'from_this_person_to_poi'
]

data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Preliminary feature modeling
import tester

clf = GaussianNB()
tester.dump_classifier_and_data(clf, my_dataset, features_list)
print tester.main()

clf = DecisionTreeClassifier()
tester.dump_classifier_and_data(clf, my_dataset, features_list)
print tester.main()

clf = SVC(kernel="rbf", C=10000)
tester.dump_classifier_and_data(clf, my_dataset, features_list)
print tester.main()

clf = AdaBoostClassifier(n_estimators=10)
tester.dump_classifier_and_data(clf, my_dataset, features_list)
print tester.main()

clf = KNeighborsClassifier(n_neighbors=2)
tester.dump_classifier_and_data(clf, my_dataset, features_list)
def main():
    tester.main()
# Example starting point. Try investigating other evaluation techniques!
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
                 ('reducer', PCA(random_state=42)),
                 ('selector', SelectKBest()),
                 ('classifier', tree.DecisionTreeClassifier())])

param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__splitter': ['best', 'random'],
    'classifier__min_samples_split': [2, 4, 8, 16, 32],
    'classifier__class_weight': ['balanced', None],
    'selector__k': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 'all']
}

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
grid_search = GridSearchCV(pipe, param_grid, scoring='f1', cv=sss)
grid = grid_search.fit(features_train, labels_train)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(grid_search.best_estimator_, my_dataset, features_list)
main()
def learn_bayes(my_dataset, features_list, scaler):
    gnb = GaussianNB()
    ppl = Pipeline([('scaler', scaler), ('classifier', gnb)])
    dump_classifier_and_data(ppl, my_dataset, features_list)
    tester.main()
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
clf = DecisionTreeClassifier(random_state=42)
pipe = Pipeline(steps=[('fss', fss), ('clf', clf)])

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
cv = StratifiedShuffleSplit(labels, n_iter=50, random_state=42)
param_grid = [
    {'clf__min_samples_split': [2, 5, 10, 15, 20, 50],
     'clf__criterion': ['gini', 'entropy'],
     'clf__max_features': ['sqrt', 'log2', None],
     'clf__class_weight': [None, 'balanced'],
     'clf__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20, 50],
     'fss__k': range(1, len(features_list))}
]
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='f1', n_jobs=1)
grid.fit(features, labels)
clf = grid.best_estimator_

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
main()
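# If fss is a SelectKBest instance, as the fss__k grid parameter suggests, the
# winning feature subset can be read back off the fitted pipeline. A sketch,
# assuming features_list[0] is the 'poi' label as in the other snippets:
selector = clf.named_steps['fss']
selected = [f for f, keep in zip(features_list[1:], selector.get_support()) if keep]
print 'selected features:', selected
print 'best CV f1: %.3f' % grid.best_score_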
def algorithm_tester(clf):
    tester.dump_classifier_and_data(clf, my_dataset, features_list)
    return tester.main()
def evaluate(clf, my_dataset, features_list):
    dump_classifier_and_data(clf, my_dataset, features_list)
    print '{1}Udacity\'s Evaluation:{0}'.format(color.Normal, color.BlinkBlue)
    return main()  # main() here is tester.py's main
# Select the features with non-zero importance, sort them, and build features_list
features_importance = []
for i in range(len(clf.feature_importances_)):
    if clf.feature_importances_[i] > 0:
        features_importance.append([df.columns[i + 1], clf.feature_importances_[i]])
features_importance.sort(key=lambda x: x[1], reverse=True)

print "Feature selection with DecisionTreeClassifier:"
for f_i in features_importance:
    print f_i

features_list = [x[0] for x in features_importance]
features_list.insert(0, 'poi')

# Decision Tree classifier with default parameters
clf = DecisionTreeClassifier(random_state=75)
my_dataset = df[features_list].to_dict(orient='index')
tester.dump_classifier_and_data(clf, my_dataset, features_list)
tester.main()

# Random Forest with default parameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=75)
clf.fit(df.ix[:, 1:], np.ravel(df.ix[:, :1]))

# Selecting the features with non-zero importance, sorting and creating features_list for the model
features_importance = []
for i in range(len(clf.feature_importances_)):
    if clf.feature_importances_[i] > 0:
        features_importance.append([df.columns[i + 1], clf.feature_importances_[i]])
features_importance.sort(key=lambda x: x[1], reverse=True)

print "Feature selection with Random Forest:"
for f_i in features_importance[:11]:
    print f_i

features_list = [x[0] for x in features_importance]
class B(tester.Testable):

    class BErr(BaseException):
        pass

    def __init__(self):
        pass  # print('Creating B')

    def prepareToBeTested(self):
        pass  # print('Preparing B')

    def test1(self):
        # print('Running B.test1')
        self.checkFalse(False)

    def test2(self):
        # print('Running B.test2')
        self.checkTrue(0 / 0)  # 0 / 0 raises ZeroDivisionError, so this test deliberately fails

    def funcB(self):
        # print('In funcB')
        # uncomment the next line to pass test3
        pass
        # raise self.BErr()

    def test3(self):
        # print('In test3')
        self.checkRaises(self.funcB, self.BErr)


if __name__ == '__main__':
    tester.main()
print(kf)
count = 0
for trainIndex, testIndex in kf.split(labels):
    features_train = [features[index] for index in trainIndex]
    features_test = [features[index] for index in testIndex]
    labels_train = [labels[index] for index in trainIndex]
    labels_test = [labels[index] for index in testIndex]
    clf = DecisionTreeClassifier(min_samples_split=6)
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    accuracy = accuracy_score(labels_test, pred)
    t1 = time()
    print 'Accuracy', round(accuracy, 2)
    print "Precision: ", round(precision_score(labels_test, pred), 2)
    print "Recall: ", round(recall_score(labels_test, pred), 2)
    print "DecisionTree Clf algo. time", round(time() - t1, 3), 'sec'

# # Example starting point. Try investigating other evaluation techniques!
# from sklearn.cross_validation import train_test_split
# features_train, features_test, labels_train, labels_test = \
#     train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

ts.dump_classifier_and_data(clf, my_dataset, features_list)
ts.main()
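# The loop above prints metrics fold by fold. A sketch of collecting fold-level
# precision/recall and reporting their means instead, reusing the same kf,
# features and labels:
precisions, recalls = [], []
for trainIndex, testIndex in kf.split(labels):
    fold_clf = DecisionTreeClassifier(min_samples_split=6)
    fold_clf.fit([features[i] for i in trainIndex], [labels[i] for i in trainIndex])
    pred = fold_clf.predict([features[i] for i in testIndex])
    precisions.append(precision_score([labels[i] for i in testIndex], pred))
    recalls.append(recall_score([labels[i] for i in testIndex], pred))
print 'mean precision:', round(sum(precisions) / len(precisions), 2)
print 'mean recall:', round(sum(recalls) / len(recalls), 2)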