예제 #1
0
def test():
    """Quick local experiment: fit a small random forest and run the tester.

    Cleans outliers from the module-level ``data_dict``, builds the feature
    list minus the features reported by ``check_nan(..., n=5)``, scales the
    features to [0, 1], trains on a 70/30 split, prints the feature
    importances, accuracy, precision and recall, then dumps the classifier
    and runs ``tester.main()``.
    """
    dataset = data_dict
    clean_outliers(dataset)

    # new_feature(dataset)

    candidate_features = gen_features(dataset)
    nan_heavy = check_nan(dataset, n=5)
    selected = []
    for name in candidate_features:
        if name not in nan_heavy:
            selected.append(name)

    print(selected)

    ### Extract features and labels from dataset for local testing
    formatted = featureFormat(dataset, selected, sort_keys=True)
    labels, raw_features = targetFeatureSplit(formatted)
    scaled = MinMaxScaler().fit_transform(raw_features)

    # Labels are passed first, so the split returns the label halves first.
    split = train_test_split(labels, scaled, test_size=0.3, random_state=42)
    y_train, y_test, x_train, x_test = split

    forest_params = {
        'max_depth': 5,
        'n_estimators': 3,
        'min_samples_split': 2,
        'min_samples_leaf': 2,
        'random_state': 36,
    }
    forest = RandomForestClassifier(**forest_params)
    forest.fit(x_train, y_train)
    print(forest.feature_importances_)

    predictions = forest.predict(x_test)
    print(forest.score(x_test, y_test))
    print(metrics.precision_score(y_test, predictions))
    print(metrics.recall_score(y_test, predictions))

    dump_classifier_and_data(forest, dataset, selected)
    tester.main()
예제 #2
0
def menu():
    """Print the coloured banner, then launch sub-programs until the user stops.

    Each round asks ``program_menu()`` for a choice, runs the ``main()`` of
    the matching module (anything that is not "Sample recorder", "Trainer"
    or "Evaluator" falls through to ``tester``), and then asks whether to
    load another program.
    """
    banner = '''
\u001b[38;5;9;1mdP       .d88888b    dP
\u001b[38;5;10;1m88       88.    "'   88
\u001b[38;5;11;1m88  .dP  `Y88888b. d8888P 88d888b. .d8888b. dP.  .dP
\u001b[38;5;12;1m88888"         `8b   88   88'  `88 88'  `88  `8bd8'
\u001b[38;5;13;1m88  `8b. d8'   .8P   88   88       88.  .88  .d88b.
\u001b[38;5;14;1mdP   `YP  Y88888P    dP   dP       `88888P' dP'  `dP
\u001b[0m
    '''
    print(banner)

    while True:
        choice = program_menu()

        # Resolve the module lazily so only the chosen one is looked up.
        if choice == "Sample recorder":
            selected = sample_recorder
        elif choice == "Trainer":
            selected = trainer
        elif choice == "Evaluator":
            selected = evaluator
        else:
            selected = tester
        selected.main()

        if not get_binary_validation("Do you want to load another program ?",
                                     False):
            break
예제 #3
0
def runGSCV(pipeline, parameters, cv, flist):
    """Grid-search ``pipeline`` for the best F1 score, report it, run the tester.

    The search is fitted on the module-level ``features`` and ``labels``.
    After fitting, the best score and parameters are printed, followed by
    the features kept by the first pipeline step, sorted by descending
    score. Finally the second pipeline step of the best estimator is dumped
    together with the module-level ``my_dataset`` and ``tester.main()`` runs.

    Args:
        pipeline: estimator pipeline handed to ``GridSearchCV``. Its first
            step must expose ``scores_`` and ``get_support()`` (presumably a
            univariate selector such as SelectKBest -- confirm with callers).
        parameters: parameter grid for the search.
        cv: cross-validation strategy passed through to ``GridSearchCV``.
        flist: feature names; the first entry is skipped when pairing names
            with scores (presumably the label column -- confirm).

    Returns:
        None.
    """
    gs = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')

    t0 = time()
    gs.fit(features, labels)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('done in %0.3fs' % (time() - t0))

    print('best score %3.3f' % gs.best_score_)
    print('best params: %s' % gs.best_params_)
    # NOTE: gs.grid_scores_ is intentionally not read. Its value was never
    # used here, and the attribute no longer exists in scikit-learn >= 0.20,
    # where touching it raises AttributeError.

    print('------- best features')

    selector = gs.best_estimator_.steps[0][1]
    best_features = sorted(zip(flist[1:],
                               selector.scores_,
                               selector.get_support()),
                           reverse=True,
                           key=lambda entry: entry[1])

    for entry in best_features:
        # entry is (name, score, selected); only report selected features.
        if entry[2]:
            print(entry)

    print('----- running tester')
    dump_classifier_and_data(gs.best_estimator_.steps[1][1], my_dataset, flist)
    tester.main()

    return None
예제 #4
0
def test(path, tests):
    """Run ``tester.main(path, tests)`` with stdout redirected to a results log.

    While the tests run, ``sys.stdout`` is replaced by a ``Logger`` writing to
    ``<curPath>/results/results.txt``. The original stream is restored
    afterwards, whether or not the tests raised.

    Args:
        path: forwarded unchanged to ``tester.main``.
        tests: forwarded unchanged to ``tester.main``.
    """
    print("Starting tests...(this might take a while)")
    print("----------------")
    stdout = sys.stdout
    sys.stdout = Logger(os.path.join(curPath, "results", "results.txt"))
    try:
        tester.main(path, tests)
    except Exception as e:
        # Report the failure instead of silently discarding it. This is
        # printed while the Logger is still installed, so it goes wherever
        # the Logger sends output.
        print("Failed to execute tests: %s" % (e,))
    finally:
        # Always undo the redirection so later output is not captured.
        sys.stdout = stdout
예제 #5
0
def learn_svm(my_dataset, features_list, scaler, cvalues, iterations):

    for cvalue in cvalues:
        for maxiter in iterations:
            svm_clf = svm.SVC(kernel='rbf', C=cvalue, max_iter=maxiter)
            clf = Pipeline([('scaler', scaler), ('SVM', svm_clf)])
            dump_classifier_and_data(clf, my_dataset, features_list)
            print 'C', cvalue, '#ofiterations', maxiter
            tester.main()
예제 #6
0
def test_code(features, labels):
    """Run the project's ``tester`` script.

    Args:
        features: not used; accepted so existing callers keep working.
        labels: not used; accepted so existing callers keep working.
    """
    # NOTE: no train/test split is made here. tester.main() is called without
    # arguments, so nothing derived from ``features``/``labels`` could reach
    # it, and importing train_test_split from sklearn.cross_validation fails
    # on scikit-learn >= 0.20, where that module no longer exists.
    import tester

    #print "Average Precision-recall score = {0:0.5f}".format(average_precision_score(labels_train, labels_test))

    tester.main()
예제 #7
0
def learn_DT(my_dataset, features_list, scaler):
    """Fit a scaler + decision-tree pipeline, dump it and run the tester.

    The pipeline is fitted on the features and labels extracted from
    ``my_dataset`` for ``features_list`` before being dumped.
    """
    #min_samples_split = 10

    pipeline = Pipeline([
        ('scaler', scaler),
        ('classifier', DecisionTreeClassifier()),
    ])

    formatted = featureFormat(my_dataset, features_list)
    labels, features = targetFeatureSplit(formatted)
    pipeline.fit(features, labels)

    dump_classifier_and_data(pipeline, my_dataset, features_list)
    tester.main()
예제 #8
0
def computeExternalTestResult(clf, data, feature_list):
    """Return the result of ``tester.main`` for the given classifier and data.

    If the tester raises ``ValueError``, the error is printed and a dict of
    zeroed metrics (Accuracy, F1, Precision, Recall) is returned instead.
    Other exceptions propagate to the caller.
    """
    import tester

    try:
        outcome = tester.main(clf=clf, dataset=data, feature_list=feature_list)
    except ValueError as err:
        print("ERROR: Exception occurred running tester: {}".format(err))
        return {"Accuracy": 0, "F1": 0, "Precision": 0, "Recall": 0}
    return outcome
예제 #9
0
def main():
    """Run the full selection-and-evaluation workflow.

    Steps:
    1. Clean outliers from the dataset.
    2. Generate the feature list.
    3. Drop the features reported by ``check_nan(..., n=5)``.
    4. Split the formatted data into labels and features.
    5. Scale the features and keep the 3 best ones (chi-squared SelectKBest).
    6. For each of f1, recall, accuracy and precision, pick a classifier via
       ``choose_best_est``.
    7. Dump each chosen classifier and score it with ``tester.main()``.
    """
    ### Task 2: Remove outliers
    ### Task 3: Create new feature(s)
    ### Store to my_dataset for easy export below.
    dataset = data_dict
    clean_outliers(dataset)
    # new_feature(dataset)

    candidate_features = gen_features(dataset)
    nan_heavy = check_nan(dataset, n=5)
    kept_features = []
    for name in candidate_features:
        if name not in nan_heavy:
            kept_features.append(name)

    ### Extract features and labels from dataset for local testing
    formatted = featureFormat(dataset, kept_features, sort_keys=True)
    labels, features = targetFeatureSplit(formatted)
    # draw_plot(formatted)
    scaled = MinMaxScaler().fit_transform(features)
    selector = SelectKBest(chi2, k=3)
    features = selector.fit_transform(scaled, labels)

    best_by_metric = {}
    for metric in ('f1', 'recall', 'accuracy', 'precision'):
        print('---====[{0}]====---'.format(metric))
        best_by_metric[metric] = choose_best_est(features, labels, metric)

    print('\n>>>>>>>>>>>>>>>>>>>>>>\n')
    rule = '++++++++++++++++++++++++++'
    for metric, estimator in best_by_metric.items():
        print(rule)
        print(metric)
        print(rule)
        # NOTE(review): the classifiers were chosen on the 3 selected
        # features, but the full kept feature list is dumped -- confirm
        # this is intended.
        dump_classifier_and_data(estimator, dataset, kept_features)
        tester.main()
def assignment():
    """Request the assignment from the peer, save it and run the tester on it.

    Sends the "assign" command over the module-level ``conn``, reads the
    reply in 128-byte chunks until a chunk shorter than 128 bytes arrives,
    writes the collected bytes to ``assignment.py`` and returns
    ``tester.main('assignment.py')``.

    Returns:
        Whatever ``tester.main('assignment.py')`` returns.
    """
    chunk_size = 128

    send_commands(conn, "assign")
    # The context manager closes the file even if recv() or write() raises.
    with open('assignment.py', 'wb') as out_file:
        print('file opened successfully')
        chunks = []
        while True:
            chunk = conn.recv(chunk_size)
            chunks.append(chunk)
            # A short (or empty) chunk marks the end of the transfer. The
            # first chunk is checked too, so a payload under 128 bytes does
            # not trigger a second, blocking recv().
            # NOTE(review): a payload whose length is an exact multiple of
            # 128 still waits for one more recv(); the protocol carries no
            # explicit length -- confirm with the sender side.
            if len(chunk) < chunk_size:
                break
        # Join once instead of repeatedly concatenating bytes objects.
        out_file.write(b"".join(chunks))

    return tester.main('assignment.py')
예제 #11
0
    def __evaluate_one_ref_hypothesis_pair(self, refs, hyps):
        """Write each reference/hypothesis pair to disk and run the tester.

        Pair number ``i`` is written to ``<reference_store_loc>/ref<i>.txt``
        and ``<hypothesis_store_loc>/gen<i>.txt``. A hypothesis equal to the
        string ``'nan'`` produces an empty hypothesis file.

        :param refs: iterable of references; each is written via ``str()``.
        :param hyps: iterable of hypotheses, paired positionally with ``refs``.
        :return: the value returned by ``tester.main()``.
        """
        # Dump the data into the corresponding files
        for index, pair in enumerate(zip(refs, hyps)):
            file_ref_nm = self.reference_store_loc + '/ref' + str(index) + '.txt'
            file_hyp_nm = self.hypothesis_store_loc + '/gen' + str(index) + '.txt'
            # Close (and therefore flush) every file before the tester runs;
            # otherwise buffered text may not be on disk yet.
            with open(file_ref_nm, 'w') as ref_file:
                ref_file.write(str(pair[0]))
            with open(file_hyp_nm, 'w') as hyp_file:
                if pair[1] != 'nan':
                    hyp_file.write(str(pair[1]))
                else:
                    hyp_file.write('')
        # Call the tester function to get the evaluations
        return tester.main()
예제 #12
0
 def runTest(self, clf, features_list):
     """Dump ``clf`` with this object's ``data_dict`` and run the tester."""
     print("test result on stratified cross validation data....")
     dump_classifier_and_data(clf, self.data_dict, features_list)
     tester.main()
예제 #13
0
# In[203]:


# Baseline run: each estimator is dumped with the original full feature list
# and scored by the tester, before any feature engineering or tuning.
separator = "*********************************************************************************************************"

print("")
print(separator)
print("Model Evaluation - Baseline Performance before any New engineered features / Feature Selection / Tuning")
print(separator)

#data = featureFormat(my_dataset, features_list, sort_keys = True)
#labels, features = targetFeatureSplit(data)


# Gaussian Naive Bayes baseline
clf = GaussianNB()
tester.dump_classifier_and_data(clf, my_dataset, features_list_org_all)
tester.main()

print(separator)

# Decision tree baseline
clf = DecisionTreeClassifier()
tester.dump_classifier_and_data(clf, my_dataset, features_list_org_all)
tester.main()

print(separator)

# K-Means (two clusters) baseline
clf = KMeans(n_clusters=2)
tester.dump_classifier_and_data(clf, my_dataset, features_list_org_all)
tester.main()
예제 #14
0
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary', 'bonus',
                 'from_poi_to_this_person', 'from_this_person_to_poi']

data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Preliminary feature modelling: dump several untuned classifiers and
### print what the tester returns for each.
import tester

for clf in (GaussianNB(),
            DecisionTreeClassifier(),
            SVC(kernel="rbf", C=10000),
            AdaBoostClassifier(n_estimators=10)):
    tester.dump_classifier_and_data(clf, my_dataset, features_list)
    print(tester.main())

# The k-nearest-neighbours model is only dumped here, not scored.
clf = KNeighborsClassifier(n_neighbors=2)
tester.dump_classifier_and_data(clf, my_dataset, features_list)
예제 #15
0
def main():
    """Run the tester by delegating to ``tester.main()``; its result is discarded."""
    tester.main()
예제 #16
0
# Example starting point. Try investigating other evaluation techniques!
# Hold out 30% of the data; the grid search below only sees the rest.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

# Standardise -> PCA -> univariate selection -> decision tree.
pipe = Pipeline(steps=[
    ('scaler', preprocessing.StandardScaler()),
    ('reducer', PCA(random_state=42)),
    ('selector', SelectKBest()),
    ('classifier', tree.DecisionTreeClassifier()),
])

# Keys follow the "<step name>__<parameter>" convention of the pipeline above.
param_grid = dict(
    classifier__criterion=['gini', 'entropy'],
    classifier__splitter=['best', 'random'],
    classifier__min_samples_split=[2, 4, 8, 16, 32],
    classifier__class_weight=['balanced', None],
    selector__k=list(range(2, 21, 2)) + ['all'],
)

# Ten stratified shuffles, each keeping 20% for validation, scored by F1.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid,
                           scoring='f1', cv=sss)
grid = grid_search.fit(features_train, labels_train)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(grid_search.best_estimator_, my_dataset,
                         features_list)
main()
예제 #17
0
def learn_bayes(my_dataset, features_list, scaler):
    """Dump a scaler + Gaussian Naive Bayes pipeline and run the tester.

    The pipeline is dumped unfitted, together with ``my_dataset`` and
    ``features_list``.
    """
    steps = [('scaler', scaler), ('classifier', GaussianNB())]
    pipeline = Pipeline(steps)
    dump_classifier_and_data(pipeline, my_dataset, features_list)
    tester.main()
예제 #18
0
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
clf = DecisionTreeClassifier(random_state=42)
pipe = Pipeline([('fss', fss), ('clf', clf)])

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
cv = StratifiedShuffleSplit(labels, n_iter=50, random_state=42)

# One grid: tree hyper-parameters ("clf__*") crossed with the number of
# features kept by the "fss" step.
param_grid = [
    {
        'clf__min_samples_split': [2, 5, 10, 15, 20, 50],
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_features': ['sqrt', 'log2', None],
        'clf__class_weight': [None, 'balanced'],
        'clf__max_depth': list(range(3, 11)) + [20, 50],
        'fss__k': range(1, len(features_list)),
    },
]
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1',
                    cv=cv, n_jobs=1)
grid.fit(features, labels)
# From here on, clf is the best pipeline found by the search.
clf = grid.best_estimator_

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
main()
예제 #19
0
def main():
    """Run the tester by delegating to ``tester.main()``; its result is discarded."""
    tester.main()
예제 #20
0
def algorithm_tester(clf):
    """Dump ``clf`` and return the result of ``tester.main()``.

    The classifier is dumped together with the module-level ``my_dataset``
    and ``features_list``.
    """
    tester.dump_classifier_and_data(clf, my_dataset, features_list)
    return tester.main()
예제 #21
0
def evaluate(clf, my_dataset, features_list):
    """Dump the classifier and data, print a heading, return ``main()``'s result."""
    dump_classifier_and_data(clf, my_dataset, features_list)
    heading = '{1}Udacity\'s Evaluation:{0}'.format(color.Normal, color.BlinkBlue)
    print(heading)
    outcome = main()  # from tester.py
    return outcome
예제 #22
0
# Keep only the features the fitted model gave a non-zero importance, most
# important first. Importance i is paired with df.columns[i + 1], i.e. the
# first column of df is skipped (presumably the 'poi' label -- confirm).
features_importance = []
for i, importance in enumerate(clf.feature_importances_):
    if importance > 0:
        features_importance.append([df.columns[i + 1], importance])
features_importance.sort(key=lambda x: x[1], reverse=True)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Feature selection with DecisionTreeClassifier:")
for f_i in features_importance:
    print(f_i)
features_list = [x[0] for x in features_importance]
features_list.insert(0, 'poi')

# Decision Tree Classifier with standard parameters
clf = DecisionTreeClassifier(random_state=75)
my_dataset = df[features_list].to_dict(orient='index')
tester.dump_classifier_and_data(clf, my_dataset, features_list)
tester.main()

# Random Forest with standard parameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=75)
# Positional slicing: every column after the first is a feature, the first
# column is the target. .iloc replaces DataFrame.ix, which was removed from
# pandas; for a frame with string column labels the two select the same data.
clf.fit(df.iloc[:, 1:], np.ravel(df.iloc[:, :1]))
# Select the features with non-zero importance, sort them, and build the
# features_list for the model.
features_importance = []
for i, importance in enumerate(clf.feature_importances_):
    if importance > 0:
        features_importance.append([df.columns[i + 1], importance])
features_importance.sort(key=lambda x: x[1], reverse=True)
print("Feature selection with Random Forest:")
# Only the top 11 are printed; features_list below keeps all of them.
for f_i in features_importance[:11]:
    print(f_i)
features_list = [x[0] for x in features_importance]
예제 #23
0
class B(tester.Testable):
    """Sample ``tester.Testable`` subclass exercising the check helpers.

    As written, only ``test1`` makes a check that can hold: ``test2`` raises
    while evaluating its argument, and ``test3`` expects an exception that
    ``funcB`` does not currently raise.
    """

    class BErr(BaseException):
        """Exception type that ``test3`` expects ``funcB`` to raise."""
        pass

    def __init__(self):
        """Do nothing; the debug print is commented out."""
        pass  #print('Creating B')

    def prepareToBeTested(self):
        """Do nothing; presumably a set-up hook called by the tester -- confirm."""
        pass  #print('Preparing B')

    def test1(self):
        """Call ``checkFalse`` with ``False``."""
        #print('Running B.test1')
        self.checkFalse(False)

    def test2(self):
        """Raise ZeroDivisionError: ``0 / 0`` is evaluated before ``checkTrue`` runs."""
        #print('Running B.test2')
        self.checkTrue(0 / 0)

    def funcB(self):
        """Do nothing unless the ``raise`` below is uncommented."""
        #print('In funcB')
        # uncomment the next line to pass test3
        pass  #raise self.BErr()

    def test3(self):
        """Ask ``checkRaises`` to verify that ``funcB`` raises ``BErr``."""
        #print('In test3')
        self.checkRaises(self.funcB, self.BErr)


# Run the tester only when this file is executed as a script.
if __name__ == '__main__':
    tester.main()
예제 #24
0
print(kf)
count=0
for trainIndex, testIndex in kf.split(labels):
    features_train = [features[index] for index in trainIndex]
    features_test =  [features[index] for index in testIndex]
    labels_train =   [labels[index] for index in trainIndex]
    labels_test =    [labels[index] for index in testIndex]

clf = DecisionTreeClassifier(min_samples_split=6)
clf = clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
accuracy = accuracy_score(labels_test,pred)
t1 = time()
print 'Accuracy',round(accuracy,2)
print "Precision: ",round(precision_score(labels_test,pred),2)
print "Recall: ", round(recall_score(labels_test,pred),2)
print "DecisionTree Clf algo. time",round(time()-t1, 3),'sec'

# # Example starting point. Try investigating other evaluation techniques!
# from sklearn.cross_validation import train_test_split
# features_train, features_test, labels_train, labels_test = \
#     train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

ts.dump_classifier_and_data(clf, my_dataset, features_list)
ts.main()