def GaussianNB(feature_list, dataset):
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    test_classifier(clf, dataset, feature_list)
    # score = clf.
    return clf
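# Every snippet in this collection relies on test_classifier from the course's
# tester.py. As a point of reference, here is a minimal sketch of the evaluation
# it is assumed to perform (stratified shuffle split cross-validation with
# aggregated precision/recall); the real tester.py also reports accuracy, F1, F2
# and the raw confusion counts, and featureFormat/targetFeatureSplit are assumed
# to come from the course's feature_format helper module:
def sketch_test_classifier(clf, dataset, feature_list, folds=1000):
    from sklearn.cross_validation import StratifiedShuffleSplit
    from feature_format import featureFormat, targetFeatureSplit
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_pos = false_pos = false_neg = 0
    for train_idx, test_idx in cv:
        X_train = [features[i] for i in train_idx]
        y_train = [labels[i] for i in train_idx]
        X_test = [features[i] for i in test_idx]
        y_test = [labels[i] for i in test_idx]
        clf.fit(X_train, y_train)
        for pred, truth in zip(clf.predict(X_test), y_test):
            if pred == 1 and truth == 1:
                true_pos += 1
            elif pred == 1 and truth == 0:
                false_pos += 1
            elif pred == 0 and truth == 1:
                false_neg += 1
    precision = 1.0 * true_pos / max(true_pos + false_pos, 1)
    recall = 1.0 * true_pos / max(true_pos + false_neg, 1)
    print clf
    print "Precision: %0.5f  Recall: %0.5f" % (precision, recall)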
def RandomForest(feature_list, dataset):
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier()
    test_classifier(clf, dataset, feature_list)
    imp = clf.feature_importances_
    print_importance(feature_list, imp)
    return clf
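# print_importance is not defined in this snippet; a hypothetical helper with the
# assumed behaviour (pair each feature name with its importance and print them in
# descending order) might look like this:
def print_importance(feature_list, importances):
    # feature_list[0] is 'poi' (the label), so importances align with feature_list[1:]
    pairs = sorted(zip(feature_list[1:], importances), key=lambda t: t[1], reverse=True)
    for name, imp in pairs:
        print "%-35s %.4f" % (name, imp)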
def tune_classifier(classifier, clf_params, max_features):
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    features_list = get_feature_list()

    ### Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = get_data()

    ### Extract features and labels from dataset for local testing
    features_list = features_list[0:max_features + 1]
    data, labels, features = get_features_and_labels(my_dataset, features_list)

    ### Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info:
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
    from sklearn.cross_validation import train_test_split
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # Testing
    clf = GridSearchCV(classifier, param_grid=clf_params, scoring=make_scorer(f1_score))
    clf.fit(features_train, labels_train)
    clf_final = clf.best_estimator_
    print "The best estimator = ", clf_final
    test_classifier(clf_final, my_dataset, features_list, 1000)
def decisionTree(feature_list, dataset):
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    test_classifier(clf, dataset, feature_list)
    print clf.feature_importances_
    return clf
def iterPipe(num1, num2):
    for i in range(num1, num2 + 1):
        # estimators = [('scaling', StandardScaler()), ('reduce_dim', PCA()),
        #               ('dtc', DTC(min_samples_split=i*2))]
        # estimators = [('reduce_dim', PCA(n_components=2)), ('dtc', DTC(min_samples_split=i))]
        # clfIter = Pipeline(estimators)
        # clfIter.set_params(reduce_dim__n_components=3)
        clfIter = DTC(min_samples_split=i)
        test_classifier(clfIter, my_dataset, features_list)
def KNN(feature_list, dataset):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    knn = KNeighborsClassifier()
    # feature scale
    estimators = [('scale', StandardScaler()), ('knn', knn)]
    clf = Pipeline(estimators)
    # use the function's own arguments rather than module-level globals
    test_classifier(clf, dataset, feature_list)
    return clf
def setup_and_test(my_dataset, features_list, classifier):
    # Dump classifier and features list, so we can test them
    dump_classifier_and_data(classifier, my_dataset, features_list)
    # load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    # Run testing script
    test_classifier(clf, dataset, feature_list)
    return
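# A hypothetical call of setup_and_test; the feature names below are illustrative
# placeholders, not the author's chosen list:
# my_dataset = pickle.load(open("final_project_dataset.pkl", "r"))
# setup_and_test(my_dataset, ['poi', 'salary', 'bonus'], GaussianNB())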
def tuneDT(feature_list, dataset):
    from sklearn.grid_search import GridSearchCV
    from sklearn import tree
    tree_clf = tree.DecisionTreeClassifier()
    parameters = {'criterion': ('gini', 'entropy'),
                  'splitter': ('best', 'random')}
    clf = GridSearchCV(tree_clf, parameters, scoring='recall')
    # use the function's own arguments rather than module-level globals
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
def detect_poi():
    ### Load the dictionary containing the dataset
    data_dict = pickle.load(open("final_project_dataset.pkl", "r"))

    ### Task 1: Remove outliers
    data_dict.pop('TOTAL', 0)

    ### Task 2: Select what features
    ### 'stk_pay_ratio', 'to_poi_ratio', 'from_poi_ratio', 'bonus_salary_ratio'
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    my_dataset = data_dict
    stk_pay_ratio(my_dataset)
    from_poi_ratio(my_dataset)
    to_poi_ratio(my_dataset)
    bonus_salary_ratio(my_dataset)

    ### Task 3: Feature Selection
    ### Generate a set of 15 feature lists from these 4 features
    ### This way, all possible combinations of these features are tested
    all_features_list = fList_set()

    ### Because of the small size of the dataset, the script uses stratified
    ### shuffle split cross validation in tester.py
    metrics = []
    clf = GaussianNB()
    ### ptest uses stratified shuffle split cross validation and calculates the precision
    ### Find the precision for every list
    for i in range(0, 15):
        metrics.append(ptest(clf, my_dataset, all_features_list[i]))

    ### Go for the feature list that produces the best precision.
    ### For this dataset only, it is harder to get a high precision.
    best = np.array(metrics).argmax()

    ### Run test_classifier to print evaluation metrics to console
    test_classifier(clf, my_dataset, all_features_list[best])

    ### Now use the same feature list to run the decision tree classifier
    features_list = all_features_list[best]

    ### Task 4: Try a variety of classifiers
    samples_split_values = [2, 4]
    samples_leaf_values = [1, 2]
    for split in samples_split_values:
        for leaf in samples_leaf_values:
            clf = tree.DecisionTreeClassifier(min_samples_split=split,
                                              min_samples_leaf=leaf)
            test_classifier(clf, my_dataset, features_list)
            print_feature_importances(features_list, clf)

    ### Choose best classifier and feature set
    clf = GaussianNB()

    ### Dump classifier, dataset, and features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
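# ptest is referenced above but not shown; a sketch of what it is assumed to do,
# based on the comment in detect_poi (stratified shuffle split cross-validation
# returning only the average precision):
def ptest(clf, dataset, feature_list, folds=100):
    import numpy as np
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.metrics import precision_score
    from feature_format import featureFormat, targetFeatureSplit
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features = np.array(features)
    labels = np.array(labels)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    precisions = []
    for train_idx, test_idx in cv:
        clf.fit(features[train_idx], labels[train_idx])
        pred = clf.predict(features[test_idx])
        precisions.append(precision_score(labels[test_idx], pred))
    return np.mean(precisions)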
def tuneKmeans(feature_list, dataset):
    from sklearn.cluster import KMeans
    from sklearn.grid_search import GridSearchCV
    km_clf = KMeans(n_clusters=2, tol=0.001)
    parameters = {'n_clusters': (2, 10)}
    clf = GridSearchCV(km_clf, parameters, scoring='recall')
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
    return clf.best_estimator_
def explore_scores():
    for n in features:
        for c in n_neighbor:
            for d in weights:
                for e in algorithm:
                    for f in leaf_size:
                        for g in p:
                            for h in metric:
                                feature = features_select(n)
                                pipeline = Pipeline([
                                    ('normalization', scaler),
                                    ('classifier', KNeighborsClassifier(
                                        n_neighbors=c, weights=d, algorithm=e,
                                        leaf_size=f, p=g, metric=h))])
                                test_classifier(pipeline, enron_data, feature)
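# The nested loops above enumerate the full Cartesian product of KNN settings by
# hand; an equivalent, more compact sketch using sklearn's ParameterGrid (same
# search, different bookkeeping -- not the original author's code):
from sklearn.grid_search import ParameterGrid

def explore_scores_grid():
    param_grid = {'n_neighbors': n_neighbor, 'weights': weights,
                  'algorithm': algorithm, 'leaf_size': leaf_size,
                  'p': p, 'metric': metric}
    for n in features:
        feature = features_select(n)
        for params in ParameterGrid(param_grid):
            pipeline = Pipeline([('normalization', scaler),
                                 ('classifier', KNeighborsClassifier(**params))])
            test_classifier(pipeline, enron_data, feature)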
def tuneKNN(feature_list, dataset):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.grid_search import GridSearchCV
    knn = KNeighborsClassifier()
    # feature scale
    estimators = [('scale', StandardScaler()), ('knn', knn)]
    pipeline = Pipeline(estimators)
    parameters = {'knn__n_neighbors': [1, 8],
                  'knn__algorithm': ('ball_tree', 'kd_tree', 'brute', 'auto')}
    clf = GridSearchCV(pipeline, parameters, scoring='recall')
    # use the function's own arguments rather than module-level globals
    test_classifier(clf, dataset, feature_list)
    print '###best_params'
    print clf.best_params_
def getRF():
    print "==============="
    print "RandomForests"
    print "==============="
    for score in scores:
        print score
        print
        # parameters = {'n_estimators': range(10, 150, 10), 'criterion': ['gini', 'entropy'],
        #               'min_samples_split': range(2, 8, 2)}
        parameters = {'rf__n_estimators': range(10, 150, 10),
                      'rf__criterion': ['gini', 'entropy'],
                      'rf__min_samples_split': range(2, 8, 2),
                      'selector__k': range(3, 22, 1)}
        gs = grid_search.GridSearchCV(rf_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getAda():
    print "==============="
    print "AdaBoost"
    print "==============="
    for score in scores:
        print score
        print
        # parameters = {'n_estimators': range(50, 100, 1),
        #               'learning_rate': [x * 0.01 for x in range(100, 160, 1)]}
        parameters = {'ada__n_estimators': range(1, 100, 20),
                      'ada__learning_rate': [x * 0.01 for x in range(100, 160, 10)],
                      'selector__k': range(3, 22, 1)}
        gs = grid_search.GridSearchCV(ada_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getKNN():
    print "==============="
    print "KNeighborsClassifier"
    print "==============="
    for score in scores:
        print score
        print
        # parameters = {'n_neighbors': range(2, 10, 2), 'weights': ['distance', 'uniform'],
        #               'metric': ['minkowski', 'euclidean']}
        parameters = {'knn__n_neighbors': range(2, 10, 2),
                      'knn__weights': ['distance', 'uniform'],
                      'knn__metric': ['minkowski', 'euclidean'],
                      'selector__k': range(3, 20, 1)}
        gs = grid_search.GridSearchCV(knn_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getSVC():
    print "==============="
    print "SVC"
    print "==============="
    for score in scores:
        print score
        print
        parameters = {'sv__C': [0.01, 0.1, 1, 500, 1000, 5000, 10000, 50000, 100000],
                      'sv__kernel': ['linear'],
                      'selector__k': range(3, 22, 1)}
        # 'sv__gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1, 10, 100, 500, 1000],
        gs = grid_search.GridSearchCV(sv_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def getNB():
    print "==============="
    print "GaussianNB"
    print "==============="
    for score in scores:
        print score
        print
        parameters = {'selector__k': range(3, 22, 1)}
        gs = grid_search.GridSearchCV(nb_pipe, parameters, scoring=score, cv=cv)
        gs.fit(features, labels)

        # This is the model you pass to tester.py
        clf = gs.best_estimator_
        print " "
        print "Optimal Model - by Grid Search"
        print clf
        print " "
        best_parameters = gs.best_estimator_.get_params()
        print " "
        print "Best Parameters - by Grid Search"
        print best_parameters
        print " "

        labels_pred = gs.predict(features)

        # Print Results (will print the Grid Search score)
        print "Grid Search Classification report:"
        print " "
        print classification_report(labels, labels_pred)
        print ' '

        # Print Results (will print the tester.py score)
        print "tester.py Classification report:"
        print " "
        test_classifier(clf, my_dataset, features_list)
        print " "
        print
def train_test():
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=42)
    clf = DecisionTreeClassifier(random_state=42)
    clf.fit(features_train, labels_train)
    print test_classifier(clf, my_dataset, features_list)

    ### Print feature importance in order
    features_imp = {}
    for i in xrange(len(features_list) - 1):
        features_imp[features_list[1 + i]] = clf.feature_importances_[i]
    pprint(sorted(features_imp.items(), key=operator.itemgetter(1), reverse=True))
def main():
    data_dict = pickle.load(open("final_project_dataset.pkl", "r"))
    my_dataset = data_dict
    my_dataset = AddFeatures(my_dataset)

    # Exclude using discretion.
    Exc1 = ["email_address"]
    # Replaced by creating better versions of the features
    Exc2 = ["to_messages", "from_messages",
            "from_this_person_to_poi", "from_poi_to_this_person"]
    # Exclude because highly correlated with stronger features
    Exc3 = ["deferral_payments", "expenses", "deferred_income",
            "restricted_stock_deferred", "director_fees", "long_term_incentive",
            "bonus", "total_payments", "salary", "total_stock_value",
            "restricted_stock", "exercised_stock_options", "other"]
    exclude = Exc1 + Exc2 + Exc3

    # QueryDataSet(my_dataset)
    # ShowCorrel(my_dataset)

    features_list = next(my_dataset.itervalues()).keys()
    for i in exclude:
        features_list.remove(i)
    # move "poi" to the front, as required by featureFormat/test_classifier
    features_list.insert(0, features_list.pop(features_list.index("poi")))

    data = featureFormat(my_dataset, features_list, sort_keys=True)

    ### Extract features and labels from dataset for local testing
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.1, random_state=42, stratify=labels)

    # NOTE: one of the tuning helpers below must be uncommented so that clf is
    # defined before the dump/test calls.
    # clf = TuneSVM(features, labels, features_list)
    # clf = TuneKNN(features, labels, features_list)
    # clf = NoTuneDT(features, labels, features_list)
    # clf = TuneDT(features, labels, features_list)

    # "poi" is already at the front of features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
    test_classifier(clf, my_dataset, features_list)
def train_and_predict(first, second):
    # trains the model and returns the value of the desired evaluation metric
    features_list = ["poi", first, second]
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    from sklearn.naive_bayes import GaussianNB
    from sklearn import tree

    if dt:
        clf = tree.DecisionTreeClassifier()
    else:
        clf = GaussianNB()

    # Assumes a locally modified test_classifier that can return F1 or precision
    # instead of only printing them.
    if f1:
        return test_classifier(clf, my_dataset, features_list, return_F1=True)
    else:
        return test_classifier(clf, my_dataset, features_list, return_precision=True)
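# A hypothetical driver for train_and_predict that scores every pair of candidate
# features and keeps the best pair; the candidate names are examples drawn from
# elsewhere in this collection, not the author's exact search space:
from itertools import combinations

def best_feature_pair(candidates):
    scored = [(train_and_predict(a, b), (a, b)) for a, b in combinations(candidates, 2)]
    return max(scored)

# best_feature_pair(['salary', 'bonus', 'exercised_stock_options', 'expenses'])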
def get_top_features_all_data(X_df, y_df, grid_searcher, top_N=9):
    '''Give an estimate of the model produced by grid_search using features
    selected with ExtraTreesClassifier on the entire dataset before searching
    for a model.

    In general, this may produce overly optimistic results since there is
    leakage from the test dataset when selecting features using the entire
    dataset. This is to show that this can improve cross-validated internal
    testing over choosing kbest within each cross-validation fold, but is
    still overly optimistic if the model were to be used on completely new data.

    Args:
        X_df: Pandas dataframe of features used to predict.
        y_df: Pandas dataframe of labels being predicted.
        grid_searcher: GridSearchCV object being searched over for optimal
            tuning parameters.
        top_N: Top N features to retain based on feature importances obtained
            from the ExtraTreesClassifier estimator used in the
            top_N_features() function.

    Returns:
        A list of the top N features that were selected to be fed into the
        GridSearchCV object.

    Prints:
        Test results from the 1000 cross-validation splits testing in tester.py
    '''
    top_N_features = top_importances(X_df, y_df, top_N=top_N)
    top_N_names = list(top_N_features.index)
    X_df = X_df[top_N_names]
    features_list = ['poi'] + list(top_N_names)
    grid_searcher.fit(X_df, y_df)
    clf = grid_searcher.best_estimator_
    my_dataset = combine_to_dict(features_df=X_df, labels_df=y_df)
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    test_classifier(clf, my_dataset, features_list)
    return top_N_features
def find_best_features(feature_names, features, labels, classifier_fun,
                       search_grid, normalize_data=False):
    results = []
    processed_features = np.array(features)
    processed_labels = labels
    if normalize_data:
        scaler = StandardScaler()
        processed_features = scaler.fit_transform(processed_features, processed_labels)

    feature_selector = SelectKBest(k="all")
    feature_selector.fit(processed_features, processed_labels)
    ranked_features = sorted(zip(feature_names, feature_selector.scores_),
                             key=lambda t: t[1], reverse=True)
    ranked_feature_names = [t[0] for t in ranked_features]
    logging.info("Scored features:\n%s", pprint.pformat(ranked_features))
    logging.info("Ranked feature names: %s", ranked_feature_names)

    for k in range(1, len(feature_names) + 1):
        logging.info("Selecting %s best feature(s)", k)
        selected_feature_names = ranked_feature_names[:k]
        logging.info("Selected features: %s", selected_feature_names)
        feature_indices = [feature_names.index(f) for f in selected_feature_names]
        feature_subset = processed_features[:, feature_indices]

        clf = classifier_fun(random_state=98123)
        logging.info("Tuning classifier parameters.")
        clf_tune = grid_search.GridSearchCV(
            clf, search_grid, n_jobs=-1,
            cv=StratifiedShuffleSplit(labels, n_iter=1000, random_state=42),
            scoring="f1")
        clf_tune.fit(feature_subset, processed_labels)
        logging.info("Scores:\n%s", pprint.pformat(clf_tune.grid_scores_))
        logging.info("Best parameters: %s with score %s",
                     clf_tune.best_params_, clf_tune.best_score_)

        clf = classifier_fun(random_state=1987341, **clf_tune.best_params_)
        logging.info("Testing classifier.")
        precision, recall, f1 = test_classifier(clf, feature_subset, processed_labels)
        results.append((k, precision, recall, f1))

    logging.info("Best features:\n%s", pprint.pformat(results))
def analyze_feats(each_feature_set, my_dataset,
                  scoresheet_highest_accuracy, scoresheet_highest_precision):
    data = featureFormat(my_dataset, each_feature_set, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = (
        train_test_split(features, labels, test_size=0.5, random_state=42))

    ################## For each feature set, tune the SVC parameters and
    # return the best SVC estimator
    # tuned_parameters = [{'kernel': ['rbf'], 'C': [1, 3, 10, 100, 1000],
    #                      'degree': [1, 2, 3]}]
    # score = 'precision'
    # clf = GridSearchCV(SVM, tuned_parameters)
    # clf.fit(features_train, labels_train)
    # SVM = clf.best_estimator_
    # print SVM

    # For each feature set, tune the decision tree parameters and return the
    # best estimator
    DT_tuned_parameters = [{'min_samples_split': [30, 40, 50]}]
    # score = 'precision'
    dt_clf = GridSearchCV(tree.DecisionTreeClassifier(), DT_tuned_parameters)
    dt_clf.fit(features_train, labels_train)
    DT = dt_clf.best_estimator_
    print DT
    classifier_type = [DT]
    # continue

    # run each type of classifier and return results
    try:
        total_results = []
        for index, each_clf in enumerate(classifier_type):
            results = test_classifier(each_clf, my_dataset, each_feature_set)
            print each_feature_set, results
            total_results.append(results)

        # for a given feature set, find the classifier with highest
        # precision/accuracy and store it in a list
        for index, num in enumerate(total_results):
            if num[1] == max([accuracy[1] for accuracy in total_results]):
                # print "Highest accuracy: \t", num[0], num[1]
                scoresheet_highest_accuracy.append(
                    [each_feature_set, total_results[index]])
            if num[1] == max([precision[1] for precision in total_results]):
                # print "Highest precision: \t", num[0], num[1]
                scoresheet_highest_precision.append(
                    [each_feature_set, total_results[index]])
    except:
        pass
def try_all_k_best(max=13):
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.25, random_state=42)

    for k in range(1, max + 1):
        pipe = Pipeline([('impute', Imputer(strategy='median')),
                         ('select', SelectKBest(k=k)),
                         ('classify', LogisticRegressionCV())])
        pipe.fit(features_train, labels_train)
        total_predictions, accuracy, precision, recall, f1, f2 = \
            test_classifier(pipe, my_dataset, features_list, folds=1000)
        acc.append(accuracy)
        prec.append(precision)
        reca.append(recall)
def one_feature_predict(features_list, my_dataset):
    all = []
    for i in features_list:
        if i != 'poi':
            l = []
            l.append('poi')
            l.append(i)
            all.append(l)
    # print all

    mycolumns = ['feature_list', 'accuracy', 'precision', 'recall', 'f1', 'f2']
    resultdf = pd.DataFrame(columns=mycolumns)
    for item in all:
        data = featureFormat(my_dataset, item, sort_keys=True)
        labels, features = targetFeatureSplit(data)
        clf = tree.DecisionTreeClassifier(min_samples_split=4)
        clf.fit(features, labels)
        resultdf.loc[len(resultdf)] = (test_classifier(clf, my_dataset, item))
    return resultdf
def analyze_feats(each_feature_set, my_dataset, classifier_type,
                  scoresheet_highest_accuracy, scoresheet_highest_precision,
                  scoresheet_highest_recall):
    # run each type of classifier and return results
    try:
        total_results = []
        for index, each_clf in enumerate(classifier_type):
            results, feature_importances = test_classifier(
                each_clf, my_dataset, each_feature_set)
            if len(feature_importances) > 0:
                print "####CLF NAME", each_clf
                print "#####Length of feature_importances", len(feature_importances)
                importances = zip(np.mean(feature_importances, axis=0),
                                  each_feature_set[1:])
                importances = sorted(importances, key=lambda i: i[0], reverse=True)
                print "#####Length of importances", len(importances)
                print importances
            print each_feature_set, results
            total_results.append(results)
        print "total_results", total_results

        # for a given feature set, find the classifier with highest
        # precision/accuracy and store it in a list
        for index, num in enumerate(total_results):
            if num[1] == max([accuracy[1] for accuracy in total_results]):
                print "Highest accuracy: \t", num[0], num[1]
                scoresheet_highest_accuracy.append(
                    [each_feature_set, total_results[index]])
            if num[2] == max([precision[2] for precision in total_results]):
                print "Highest precision: \t", num[0], num[2]
                scoresheet_highest_precision.append(
                    [each_feature_set, total_results[index]])
            if num[3] == max([recall[3] for recall in total_results]):
                print "Highest recall: \t", num[0], num[3]
                scoresheet_highest_recall.append(
                    [each_feature_set, total_results[index]])
    except:
        pass
    return
def tuneNB():
    for i in range(1, 20):
        acc = []
        prec = []
        reca = []
        testing_features_list = [u'poi']
        for feature in features_list_score_order:
            testing_features_list.append(feature)
            pipe = Pipeline([('impute', Imputer(strategy='median')),
                             ('classify', GaussianNB(priors=[(i / 2.) * .1,
                                                             (1 - (i / 2.) * .1)]))])
            total_predictions, accuracy, precision, recall, f1, f2 = \
                test_classifier(pipe, my_dataset, testing_features_list, folds=200)
            acc.append(accuracy)
            prec.append(precision)
            reca.append(recall)
        acc_all.append(acc)
        prec_all.append(prec)
        reca_all.append(reca)
        results_dict['prec' + str(i)] = prec
        results_dict['reca' + str(i)] = reca
        results_dict['acc' + str(i)] = acc
    'min_samples_leaf': [x for x in range(1, 10, 2)],
    'max_depth': [None, 1, 2, 4, 8, 12, 18],
    'max_features': ['log2', 'sqrt']
}

rfc = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rfc, param_list, cv=5, verbose=3, n_jobs=-1)
clf_ = clf.fit(features, labels)

print clf.best_score_
print clf.best_estimator_
print "Training Set Score:", clf.score(X_train, y_train)
print "Validation Set Score:", clf.score(X_val, y_val)

## Cross Validation of Model
test_classifier(clf.best_estimator_, features, labels, folds=100)

###########################################################################
## Make predictions
test_features = scl.fit_transform(
    df_test.drop(['Survived', 'PassengerId'], axis=1).values)
clf.best_estimator_.fit(features, labels)
predictions = clf.best_estimator_.predict(test_features)
output_df = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': pd.Series(predictions)
})
output_df = output_df.astype('Int64')
    'total_stock_value', 'prop_to_poi', 'prop_from_poi'
]  # You will need to use more features

from tester import test_classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=42)
clf2 = GaussianNB()
clf3 = RandomForestClassifier(random_state=42)
clf4 = SGDClassifier(random_state=42)

test_classifier(clf, fp, features_list, folds=1000)
test_classifier(clf2, fp, features_list, folds=1000)
test_classifier(clf3, fp, features_list, folds=1000)
test_classifier(clf4, fp, features_list, folds=1000)

### Result from initial classifier using all features
### DecisionTreeClassifier   Accuracy: 0.80840  Precision: 0.26327  Recall: 0.24300  F1: 0.25273  F2: 0.24680  # BEST: higher overall precision, recall & F1
### GaussianNB               Accuracy: 0.83920  Precision: 0.32890  Recall: 0.19800  F1: 0.24719  F2: 0.21512
### RandomForestClassifier   Accuracy: 0.86073  Precision: 0.42811  Recall: 0.13250  F1: 0.20237  F2: 0.15373
### SGDClassifier            Accuracy: 0.52980  Precision: 0.10665  Recall: 0.34250  F1: 0.16265  F2: 0.23747

# Using Decision Tree Classifier to find attributes of importance
fp = pd.DataFrame(fp)
fp = fp.transpose()
X = fp.drop(['poi'], axis=1)
# dt = DecisionTreeClassifier()
t0 = time()
grid_obj = GridSearchCV(dt, parameters, scoring='f1', cv=sss)
print "======== Decision Tree (Optimized) ========"
print("DecisionTree tuning: %r" % round(time() - t0, 3))

# TODO: Fit the grid search object to the training data and find the optimal parameters
t0 = time()
grid_obj = grid_obj.fit(features, labels)
print("DecisionTree fitting: %r" % round(time() - t0, 3))

# Get the estimator
dt = grid_obj.best_estimator_

## Print the parameters
print dt.get_params(), '\n'

print 'Result of feature_list without the newly created feature:'
test_classifier(dt, my_dataset, features_list_without_create_feature, folds=100)
print 'Result of feature_list with the newly created feature:'
test_classifier(dt, my_dataset, features_list, folds=100)

clf = dt

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
    clf = clf.fit(feature_train, target_train)
    accuracy_grid = clf.score(feature_train, target_train)
    print "Best estimator found by grid search:"
    print clf.best_estimator_
    return clf.best_estimator_, accuracy_grid

###############################################################################
enron_method_svm = SVR(kernel='rbf')
param_grid_svm = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
[clf_grid, acc5] = val_grid(enron_method_svm, features_train, labels_train, param_grid_svm)
print acc5

[accuracy, precision, recall, f1, f2] = test_classifier(
    clf_svm, pd.DataFrame(data_dict), features_list, folds=1000)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf_reg, my_dataset, features_list)
# parameters = {
#     'anova__k': (2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21),
#     'anova__k': [2,4,6,8,10,12,14,16,18,20],
#     'pca__whiten': [True, False],
#     'pca__n_components': [6,8,10,11],                 # For use with PCA
#     'clf__min_samples_split': [2,10,20,30,40,50],     # for use with DecisionTree
#     'clf__criterion': ['gini','entropy'],             # for use with DecisionTree
#     'clf__n_estimators': [50,100,200],                # For use with Adaboost
#     'clf__C': [1, 10, 100, 1e3, 5e3, 1e4, 5e4, 1e5],  # for use with SVM
#     'clf__gamma': [0.0001, 0.0005, 0.001, 0.005,      # for use with SVM
#                    0.01, 0.1],
#     'clf__kernel': ['linear','rbf','poly']            # for use with SVM
# }

## Create Cross Validation object for use in GridSearchCV
# cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)

## Apply GridSearchCV to the dataset
# clf = GridSearchCV(clf, parameters, scoring = 'f1', cv=cv)
# clf.fit(features, labels)

## Set the best performing combination of parameters as the new classifier
# clf = clf.best_estimator_

## Use included tester function to assess performance using cross validation
test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
## The DT algorithm results will be displayed while running tester.py
# dt_pred = dt_clf.predict(features_test)
# print "DT best accuracy:", accuracy_score(dt_pred, labels_test)
# print "DT Precision:", precision_score(labels_test, dt_pred)
# print "DT Recall:", recall_score(labels_test, dt_pred)

###########################################################################################
## Run tester.py
selected_features = [
    'poi', 'total_payments', 'total_stock_value', 'salary', 'bonus',
    'fraction_from_poi', 'fraction_to_poi'
]
dump_classifier_and_data(dt_clf_best, enron_less_outliers, selected_features)
test_classifier(dt_clf_best, enron_less_outliers, selected_features)

###########################################################################################
# ## Decision Tree Using SelectK in GridSearchCV
#
# print "Check performance of Decision Tree Using SelectK in GridSearchCV"
#
# data = featureFormat(enron_less_outliers, features_all, sort_keys=True)
# labels, features = targetFeatureSplit(data)
#
# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
#     features, labels, test_size=0.2, random_state=42)
# dt = tree.DecisionTreeClassifier(random_state=42)
#
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
print "Tuning parameters of classifiers"
print

# Naive Bayes classifier
print "Performing Grid Search of Naive Bayes classification"
param_dict_NB = {'feature_selection__k': range(5, len(features_list))}
gs = grid_search(steps_NB, param_dict_NB, features, labels)
gs_clf_NB = gs.best_estimator_
print '\n Score Metrics Naive Bayes Classifier'
test_classifier(gs_clf_NB, data_dict, features_list, folds=1000)
print

# The rest of the parameter tuning is commented out because of execution time;
# NB turned out to be the classifier with the better scores.
'''
# Decision Tree classifier
print "Performing Grid Search of Decision Tree classification"
param_dict_DT = {'feature_selection__k': range(5, len(features_list)),
                 'Decission_Tree__criterion': ['gini', 'entropy'],
                 'Decission_Tree__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
gs = grid_search(steps_DT, param_dict_DT, features, labels)
gs_clf_DT = gs.best_estimator_
print '\n Score Metrics Decision Tree Classifier'
test_classifier(gs_clf_DT, data_dict, features_list, folds=100)
                              false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf

## running through different fold inputs of k-fold cross-validation
# folds = [2, 3, 5, 10]
# for each in folds:
#     test_classifier_kfold(clf, my_dataset, features_list, each)

## test the algorithm multiple times and obtain the accuracy, precision, and recall averages
tot_accuracy = 0
tot_precision = 0
tot_recall = 0
i = 0
while i < 10:
    accuracy, precision, recall = test_classifier(clf, my_dataset, features_list)
    tot_accuracy += accuracy
    tot_precision += precision
    tot_recall += recall
    i += 1
print tot_accuracy / float(10), tot_precision / float(10), tot_recall / float(10)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(clf, my_dataset, features_list)
cv = StratifiedShuffleSplit(labels, folds, random_state=42)
for train_idx, test_idx in cv:
    features_train = []
    features_test = []
    labels_train = []
    labels_test = []
    for ii in train_idx:
        features_train.append(features[ii])
        labels_train.append(labels[ii])
    for jj in test_idx:
        features_test.append(features[jj])
        labels_test.append(labels[jj])

## Initial algorithms scores
clf_AB = AdaBoostClassifier()
tester.test_classifier(clf_AB, data_dict, features_list)

clf_RBF = SVC(kernel='rbf', max_iter=1000)
tester.test_classifier(clf_RBF, data_dict, features_list)

clf_RF = RandomForestClassifier()
tester.test_classifier(clf_RF, data_dict, features_list)

clf_SVC = SVC(kernel='linear', max_iter=1000)
tester.test_classifier(clf_SVC, data_dict, features_list)

clf_NB = GaussianNB()
tester.test_classifier(clf_NB, data_dict, features_list)

clf_KNN = KNeighborsClassifier()
tester.test_classifier(clf_KNN, data_dict, features_list)

# ### Task 5: Tune your classifier to achieve better than .3 precision and recall
# ### using our testing script. Check the tester.py script in the final project
# ### folder for details on the evaluation method, especially the test_classifier
# ### function. Because of the small size of the dataset, the script uses
tree_features = {}
# show which features correspond to average importances
for idx, elem in enumerate(analyzed_features_list[1:]):
    tree_features[elem] = get[idx]
print tree_features

fin_feat_tree = [
    'poi', '%frompoi', 'shared_receipt_with_poi',
    'exercised_stock_options', 'expenses'
]

# Gaussian NB comparison
clf = GaussianNB()
test_classifier(clf, my_dataset, fin_feat_kbest, folds=1000)
clf = GaussianNB()
test_classifier(clf, my_dataset, fin_feat_tree, folds=1000)

# preparing ground for DT classifier
data = featureFormat(my_dataset, fin_feat_kbest, sort_keys=True)
data2 = featureFormat(my_dataset, fin_feat_tree, sort_keys=True)
labels, features = targetFeatureSplit(data)
labels2, features2 = targetFeatureSplit(data2)

tuned_parameters_tree = [{
    'max_depth': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4]
}]
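# A sketch of how the grid above would presumably be fed to GridSearchCV, using the
# same stratified shuffle split pattern seen elsewhere in this collection; the cv
# object, scoring choice, and fold count are assumptions, not the author's settings:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

cv_sss = StratifiedShuffleSplit(labels, 100, random_state=42)
gs_tree = GridSearchCV(DecisionTreeClassifier(random_state=42),
                       tuned_parameters_tree, scoring='f1', cv=cv_sss)
gs_tree.fit(features, labels)
test_classifier(gs_tree.best_estimator_, my_dataset, fin_feat_kbest, folds=1000)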
# grid_search.fit(features, labels)
# pprint.pprint(grid_search.grid_scores_)

# use K-best to rank the best features
k_best = SelectKBest()
k_best.fit(features, labels)
results_list = zip(k_best.get_support(), features_list[1:], k_best.scores_)
results_list = sorted(results_list, key=lambda x: x[2], reverse=True)
# print the scores for each feature
pprint.pprint(results_list)

# use feature_importances_ from a decision tree classifier to rank the best features
from tester import test_classifier, dump_classifier_and_data
from sklearn import tree
clf_test = tree.DecisionTreeClassifier()
test_classifier(clf_test, my_dataset, features_list)
importance = clf_test.feature_importances_
for i in range(len(importance)):
    print features_list[i + 1] + ": " + str(importance[i])

# Using the ranking from K-best and merging it with the more important features
# obtained from the decision tree classifier, I decided to select the best of both sets:
features_list = [
    'poi', 'exercised_stock_options', 'total_stock_value', 'salary',
    'fraction_to_poi', 'restricted_stock', 'shared_receipt_with_poi'
]

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
labels, features = targetFeatureSplit(data)

# In[7]:

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
clf = GaussianNB()
test_classifier(clf, my_dataset, features_list)

clf1 = tree.DecisionTreeClassifier()
test_classifier(clf1, my_dataset, features_list)

clf2 = AdaBoostClassifier()
test_classifier(clf2, my_dataset, features_list)

clf3 = KNeighborsClassifier(n_neighbors=4)
test_classifier(clf3, my_dataset, features_list)

# In[8]:

from sklearn.neighbors.nearest_centroid import NearestCentroid
clf4 = NearestCentroid()
test_classifier(clf4, my_dataset, features_list)
]))
feature_list_default.insert(
    0, feature_list_default.pop(feature_list_default.index("poi")))

# test different classifiers with the default feature set
# (for more details, please see the notebook)
for clf in [
        GaussianNB(),
        KMeans(),
        LogisticRegression(class_weight="balanced"),
        SVC(class_weight="balanced"),
        ADA(),
        DT(class_weight="balanced"),
        RF(class_weight="balanced")
]:
    tester.test_classifier(clf, final_dataset, feature_list_default)

### 4.1: Evaluate the impact of feature engineering on classification performance
### (for more details, please refer to the notebook). Test only with SVC (the best classifier).
print "BASELINE PERFORMANCE - default feature set and SVC with linear kernel"
print "--------------------------------------------------------------------"
clf = SVC(kernel="linear", class_weight="balanced")
tester.test_classifier(clf, final_dataset, feature_list_default)

print "EXTENDED FEATURE SET 1 PERFORMANCE - default feature set + TF-IDF features and SVC with linear kernel"
print "--------------------------------------------------------------------"
selected_feature_list = feature_list_default + [
    'word_feature_2', 'word_feature_3'
]
clf = SVC(kernel="linear", class_weight="balanced")
tester.test_classifier(clf, final_dataset, selected_feature_list)
parameters_NB = dict(SelectKBest__k=range(1, 10))
pipeline = sklearn.pipeline.Pipeline(steps_NB)
grid = GridSearchCV(pipeline, param_grid=parameters_NB, cv=cv, scoring='f1')
grid.fit(features_train, labels_train)
predict = grid.predict(features_test)
report = classification_report(labels_test, predict)
best_params = grid.best_params_
# print report
print "PARAMETERS USED:"
print best_params
print grid.best_score_

clf_GNB = grid.best_estimator_
print "TUNED CLASSIFICATION REPORT:"
test_classifier(clf_GNB, my_dataset, RF_features_list, folds=1000)

# overwrite features_list
features_list = RF_features_list

# tuned Random Forest **without** SKB
steps_RF = [
    ('minmax', mms),
    # ('SelectKBest', skb),
    ('random_forest', clf_RF)
]
parameters_RF = dict(
    # SelectKBest__k=[6],
    random_forest__criterion=['gini'],
    random_forest__n_estimators=[9],
    random_forest__min_samples_split=[2],
# Removed because it did not perform well
# ab_pca = {"PCA__n_components": range(4, 7), "PCA__whiten": [True, False]}
# ab_k.update(ab_pca)
ab_k.update(ab_params)
enron.get_best_parameters_reports(pipe_ab, ab_k, features, labels)


if __name__ == '__main__':
    '''
    GAUSSIAN NAIVE BAYES
    '''
    # Set up the classifier to use
    clf = GaussianNB()
    # Evaluate with cross-validation via tester.test_classifier
    print "Gaussian Naive Bayes : \n", tester.test_classifier(
        clf, my_dataset, best_features_list)
    """
    Gaussian Naive Bayes :
    GaussianNB(priors=None)
    Accuracy: 0.84380  Precision: 0.40058  Recall: 0.34550  F1: 0.37101  F2: 0.35527
    Total predictions: 15000  True positives: 691  False positives: 1034
    False negatives: 1309  True negatives: 11966
    None
    """

    '''
    LOGISTIC REGRESSION
    '''
    # Use the tuning helper to get the best parameters for each algorithm
    # tune_logistic_regression()
    """
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

# GaussianNB
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
test_classifier(clf, my_dataset, features_list, folds=1000)

# Decision tree
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_leaf=1)
test_classifier(clf, my_dataset, features_list, folds=1000)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
test_classifier(clf, my_dataset, features_list, folds=1000)

# k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=4)
test_classifier(clf, my_dataset, features_list)
for i, clf in enumerate(classifiers):
    print 'Step 1: ', i, names[i]
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    print "Precision: ", precision_score(labels_test, pred)
    print "Recall: ", recall_score(labels_test, pred)
    print "F1: ", f1_score(labels_test, pred)
    print 'done...'

for clf in classifiers:
    test_classifier(clf, my_dataset, features_list)
    print '-------------------------------------------------------------------------------'

#%% Step 2:
classifier_opt = []
parameters = [
    dict(),
    dict(n_neighbors=range(1, 20, 1), weights=['uniform', 'distance']),
    dict(criterion=['gini', 'entropy'],
         min_samples_split=range(10, 30, 1),
         min_samples_leaf=range(1, 11, 1)),
    dict(criterion=['gini', 'entropy'],
         n_estimators=[5, 8, 10, 12, 25],
precision_knn = []
recall_knn = []

# Apply SelectKBest to each classifier for k=1 to k=19
for i in range(1, 20):
    k = SelectKBest(f_classif, k=i)
    features_new = k.fit_transform(features, labels)
    selected_features_index = k.get_support()
    selected_features_list = features_list[selected_features_index]
    selected_features_list = np.insert(selected_features_list, 0, 'poi')
    print "===================="
    print "Selected features: ", selected_features_list

    # Decision Tree
    clf = tree.DecisionTreeClassifier()
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds=1000)
    precision_tree.append(pre)
    recall_tree.append(rec)

    # Naive Bayes
    clf = naive_bayes.GaussianNB()
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds=1000)
    precision_nb.append(pre)
    recall_nb.append(rec)

    # K Nearest Neighbors
    clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    pre, rec = test_classifier(clf, my_dataset, selected_features_list, folds=1000)
    precision_knn.append(pre)
    recall_knn.append(rec)
# Out of the three, SVC seems to be the most accurate.

# In[89]:

from sklearn.tree import DecisionTreeClassifier
from tester import test_classifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit

test_classifier(DecisionTreeClassifier(random_state=1),
                enron_data, features_final, folds=100)

tree = DecisionTreeClassifier()
parameters = {'tree__criterion': ('gini', 'entropy'),
              'tree__splitter': ('best', 'random'),
              'tree__min_samples_split': [2, 10, 20],
              'tree__max_depth': [10, 15, 20, 25, 30],
              'tree__max_leaf_nodes': [5, 10, 30]}

# use scaling in GridSearchCV
Min_Max_scaler = preprocessing.MinMaxScaler()
# features = Min_Max_scaler.fit_transform(features)
pipeline = Pipeline(steps=[('scaler', Min_Max_scaler),
                           ('pca', PCA(n_components=2)),
                           ('tree', tree)])
cv = StratifiedShuffleSplit(target, 100, random_state=42)
               ('NB', nb_Clf), ('SVM', svm_Clf)]
results = []
names = []
scoring = 'accuracy'

df1 = df
f_list = df.columns
f_list = f_list[1:len(f_list) - 1]
f_list = map(str, f_list)
f_features = ['poi']
for fn in f_list:
    f_features.append(fn)
print(f_features)

test_classifier(l_Clf, my_dataset, f_features, folds=45)
test_classifier(lda_Clf, my_dataset, f_features, folds=45)
test_classifier(knn_Clf, my_dataset, f_features, folds=45)
test_classifier(rf_Clf, my_dataset, f_features, folds=45)
print("\n\n")

df['salary_bonus_ratio'] = df.salary.div(df.bonus)
df.loc[~np.isfinite(df['salary_bonus_ratio']), 'salary_bonus_ratio'] = 0
df['salary_expense_ratio'] = df.salary.div(df.expenses)
df.loc[~np.isfinite(df['salary_expense_ratio']), 'salary_expense_ratio'] = 0
features_list.append('salary_bonus_ratio')
features_list.append('salary_expense_ratio')
df = df[features_list]
df = df.apply(np.sqrt, axis=1)
sd = StandardScaler()
fsl = FeatureSel(k_best=5, pca_comp=5)
# clf = Pipeline([("fsl", fsl), ("sd", sd), ("lvc", LinearSVC(C=0.000001))])
clf = Pipeline([("fsl", fsl), ("sd", sd), ("lvc", LinearSVC())])
gscv = GridSearchCV(clf,
                    {"lvc__C": np.logspace(-6, -1, 5),
                     "fsl__k_best": [1, 5, 10],
                     "fsl__pca_comp": [0, 5, 10]},
                    scoring="recall", verbose=0)
gscv.fit(np.array(features), np.array(labels))

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
test_classifier(gscv.best_estimator_, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(gscv.best_estimator_, my_dataset, features_list)
sss = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.3, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=sss)

### Tried different parameters for StratifiedShuffleSplit and GridSearchCV
# sss = StratifiedShuffleSplit(n_iter=20, test_size=0.5, random_state=5)
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=sss, verbose=10, scoring='f1')
# grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=sss, error_score=0, scoring='f1')
# print "Grid Search: ", grid_search
# print(grid_search.best_estimator_.steps)
# print "\n", "Best parameters are: ", grid_search.best_params_, "\n"

grid_search.fit(features, labels)
clf = grid_search.best_estimator_

### Use tester.py to test the best model found
from tester import test_classifier
# Use test_classifier to evaluate the model selected by GridSearchCV
print "\n", "Tester Classification report - StratifiedShuffleSplit:"
test_classifier(clf, data_dict, features_list)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
# print features_list
dump_classifier_and_data(clf, my_dataset, features_list)
print("Training time : {}".format(end_fitting - start_fitting)) start_predicting = time() svc_pred = svc_grid.predict(features_test) end_predicting = time() print("Predicting time : {}".format(end_predicting - start_predicting)) svc_accuracy = accuracy_score(svc_pred, labels_test) print('SVC accuracy score : {}'.format(svc_accuracy)) print "f1 score :", f1_score(svc_pred, labels_test) print "precision score :", precision_score(svc_pred, labels_test) print "recall score :", recall_score(svc_pred, labels_test) svc_best_estimator = svc_grid.best_estimator_ print(svc_best_estimator) test_classifier(nb_grid.best_estimator_, my_dataset, features_list) #Checking the affect of new feature on the final classifier test_features_list = [ 'poi', 'total_stock_value', 'exercised_stock_options', 'bonus', 'deferred_income', 'long_term_incentive', 'restricted_stock', 'salary', 'total_payments', 'other', 'shared_receipt_with_poi', 'fraction_from_this_person_to_poi' ] print "\n=================Effect of new feature on final classifier=================" test_classifier(nb_grid.best_estimator_, my_dataset, test_features_list) ###Task 6: Dump your classifier, dataset, and features_list so anyone can
}
dtc_clf = sklearn.tree.DecisionTreeClassifier()
dtcclf = grid_search.GridSearchCV(dtc_clf, parameters, scoring=scoring, cv=cv)
dtcclf.fit(features, labels)
print 'best estimator:', dtcclf.best_estimator_
print 'best score:', dtcclf.best_score_
print 'Processing time:', round(time() - t0, 3), 's'

# Classifier validation
## DecisionTreeClassifier Validation No. 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
test_classifier(dtc_best_clf, enron_data, eng_feature_list)
print 'Processing time:', round(time() - t0, 3), 's'

## DecisionTreeClassifier Validation No. 2 (Randomized, partitioned trials, n=1,000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
evaluate.evaluate_clf(dtc_best_clf, features, labels, num_iters=1000, test_size=0.3)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print 'Processing time:', round(time() - t0, 3), 's'
# Importing the decision tree classifier from the scikit-learn package.
from sklearn import tree

# Creating a classifier with the optimized parameters.
clf_ada = tree.DecisionTreeClassifier(splitter='best',
                                      criterion='gini',
                                      class_weight='balanced',
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      max_depth=5,
                                      max_leaf_nodes=4)

# Testing the classifier.
tester.test_classifier(clf=clf_ada, dataset=my_dataset, feature_list=features_list)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
# Please, find more information in the Jupyter Notebook.
# Example starting point. Try investigating other evaluation techniques!

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
# from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

features_train_zero, features_test_zero, labels_train_zero, labels_test_zero = train_test_split(
    features, labels, test_size=0.3, random_state=42)

clf = GaussianNB()
clf.fit(features_train_zero, labels_train_zero)
print test_classifier(clf, my_dataset, features_list, folds=1000)

# #### Task 5: Tune your classifier to achieve better than .3 precision and recall
# #### using our testing script. Check the tester.py script in the final project
# #### folder for details on the evaluation method, especially the test_classifier
# #### function. Because of the small size of the dataset, the script uses
# #### stratified shuffle split cross validation. For more info:
# #### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
#
# ## Example starting point. Try investigating other evaluation techniques!
# from sklearn.cross_validation import train_test_split
# features_train, features_test, labels_train, labels_test = \
#     train_test_split(features, labels, test_size=0.3, random_state=42)
#
# #### Task 6: Dump your classifier, dataset, and features_list so anyone can
                                                     random_state=42,
                                                     stratify=labels)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# clf = DecisionTreeClassifier(min_samples_split=100)
# clf = SVC(C=10.0, gamma=0.001)
# clf = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=100), algorithm="SAMME")
clf = KNeighborsClassifier(n_neighbors=5, weights="distance", algorithm="auto")

test_classifier(clf, data_dict, selected_features_list)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# # tuned_parameters = {"criterion": ("gini", "entropy"), "max_depth": (None, 1, 2, 5, 7, 10), "min_samples_split": (10, 100, 250)}
# # tuned_parameters = {"C": (10.0, 100.0, 1000.0), "gamma": (1e-3, 1e-4)}
# # tuned_parameters = {"n_estimators": (50, 100, 150, 200), "learning_rate": (1.0, 1.5, 2.0), "algorithm": ("SAMME", "SAMME.R")}
# tuned_parameters = {"n_neighbors": (1, 5, 10, 15), "weights": ("uniform", "distance")}
#
# gs = GridSearchCV(clf, tuned_parameters, cv=10)
# gs.fit(X_train, y_train)
clf_NB = GaussianNB()
parm = {}
clf_NB = Pipeline([('scaler', scaler), ('gnb', clf_NB)])
gs = GridSearchCV(clf_NB, parm)
gs.fit(features_train, labels_train)
clf_NB = gs.best_estimator_
print "\nGaussianNB score:\n", clf_NB.score(features_train, labels_train)
print "GaussianNB score time:", round(time() - t1, 3), "s"

## Test Point
print "\nGaussianNB:\n", test_classifier(clf_NB, my_dataset, features_list)

## 2. Decision Tree Classifier
t2 = time()
parms = {'criterion': ['gini', 'entropy'],
         'min_samples_split': [2, 5, 10, 20],
         'max_depth': [None, 2, 5, 10],
         'splitter': ['random', 'best'],
         'max_leaf_nodes': [None, 5, 10, 20]}
clf_DT = tree.DecisionTreeClassifier()
gs = GridSearchCV(clf_DT, parms)
parameters = {'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10],
              'min_samples_split': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'criterion': ('gini', 'entropy')}
dt_clf = DecisionTreeClassifier(random_state=42)
cv = cross_validation.StratifiedShuffleSplit(labels, n_iter=10)
clf = GridSearchCV(dt_clf, parameters, cv=cv, scoring='f1')
clf.fit(features, labels)
predictor = clf.predict(features_test)
dt_best_estimator = clf.best_estimator_
precision = precision_score(labels_test, predictor)
recall = recall_score(labels_test, predictor)
f1 = f1_score(labels_test, predictor)  # renamed to avoid shadowing sklearn's f1_score
print "Best score: %f" % clf.best_score_
print dt_best_estimator
print "processing time:", round(time() - t0, 3), "s"

# Classifier validation
## DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
test_classifier(dt_best_estimator, my_dataset, my_features_list)
print 'Processing time:', round(time() - t0, 3), 's'

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(dt_best_estimator, my_dataset, my_features_list)
### Create StratifiedKFold
skf = StratifiedKFold(labels_train, random_state=42)

### DecisionTreeClassifier
algo_name = "DecisionTreeClassifier"
print algo_name
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([('scaler', MinMaxScaler()),
                     ('kbest', SelectKBest(f_classif)),
                     ('dtc', DecisionTreeClassifier(random_state=42))])
grid_search = GridSearchCV(pipeline,
                           {'kbest__k': range(1, 16),
                            'dtc__min_samples_split': [1, 2, 3],
                            'dtc__max_depth': [None, 10, 5]},
                           scoring='f1', cv=skf)
grid_search.fit(features_train, labels_train)
clf = grid_search.best_estimator_
perf_dict[algo_name] = test_classifier(clf, my_dataset, features_list)
parm_dict[algo_name] = grid_search.best_params_

### Print SelectKBest scores, note these are the same for all classifiers
kbest_scores = clf.named_steps['kbest'].scores_
feature_scores = {}
for i in xrange(1, len(features_list)):
    feature_scores[features_list[i]] = kbest_scores[i - 1]
feature_scores = sorted(feature_scores.items(), key=operator.itemgetter(1), reverse=True)
i = 1
for f in feature_scores:
    print "|{}|{}|{}|".format(i, f[0], f[1])
    i += 1
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.

### Extract features and labels from my_dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
folds = 1000
cv = StratifiedShuffleSplit(labels, folds, random_state=42)

## Setting up 3 classifiers with feature scaling
## Gaussian NB
print "Gaussian NB classifier output:"
NB_clf = Pipeline(steps=[('scaling', preprocessing.MinMaxScaler()),
                         ('classifier', GaussianNB())])
t0 = time()
tester.test_classifier(NB_clf, my_dataset, features_list)
print "Gaussian NB run time:", round(time() - t0, 3), "s"

## KMeans
print "KMeans classifier output:"
KM_clf = Pipeline(steps=[('scaling', preprocessing.MinMaxScaler()),
                         ('classifier', KMeans(n_clusters=2))])
t0 = time()
tester.test_classifier(KM_clf, my_dataset, features_list)
print "KMeans run time:", round(time() - t0, 3), "s"

## Decision tree
print "Decision Tree classifier output:"
DT_clf = Pipeline(steps=[(
#### keep the engineered features added to data_dict
my_dataset = data_dict

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

##### Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(class_weight='auto', random_state=42)

from time import time
from tester import test_classifier
t0 = time()
test_classifier(rf, data_dict, features_list, folds=100)
print("Random forest fitting time: %rs" % round(time() - t0, 3))

###### AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier(random_state=42)
t0 = time()
test_classifier(ab, data_dict, features_list, folds=100)
print("AdaBoost fitting time: %rs" % round(time() - t0, 3))

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
        'metric': ['manhattan', 'minkowski', 'euclidean'],
        'weights': ['distance', 'uniform']
    },
    cv=cv,
    scoring='f1')
knn.fit(features, labels)
print 'K Nearest Neighbors best estimator: ', knn.best_estimator_
print 'K Nearest Neighbors best parameters: ', knn.best_params_
print 'K Nearest Neighbors best score: ', knn.best_score_
# tester.test_classifier(knn.best_estimator_, my_dataset, best_features)

# Pipeline
print "Pipelining..."
pipeline = Pipeline([('normalization', scaler),
                     ('classifier', knn.best_estimator_)])
tester.test_classifier(pipeline, my_dataset, best_features)

# Tune K Means
kmeans = GridSearchCV(
    KMeans(),
    param_grid={
        'n_clusters': [2],
        'tol': [0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01],
        'max_iter': [300, 200, 400, 500, 600, 700],
        'init': ['k-means++', 'random'],
        'copy_x': [True, False]
    },
    cv=cv,
    scoring='f1')
kmeans.fit(features, labels)
print 'K Means best estimator: ', kmeans.best_estimator_