Example #1

# Shared imports assumed by all five examples below; `util` is the
# project's local helper module (data loaders, CV runner, plotters).
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import util

def run_test():
    """ set up a series of CV grid searches """
    train_size = 1000
    test_size = 1000

    #test_types=['USPS']
    #test_types=['MNIST']
    test_types = ['MNIST', 'USPS']

    clf_type = 'kNN'
    rerun_CV = True
    #rerun_CV=False

    # setup grid search parameters
    # intentionally incomplete and restricted, change as desired
    num_cv_folds = 10
    param_names = ['n_neighbors', 'weights']
    param_values = [range(1, 6, 1), ['uniform', 'distance']]
    param_string_types = [False, True]

    print('Running', clf_type, 'CV grid search tests...')
    for test_type in test_types:
        print('Running CV on dataset', test_type, '...')
        if test_type == 'MNIST':
            train, train_label, _, _ = util.MNIST_loader(1,
                                                         train_size,
                                                         1,
                                                         test_size,
                                                         echo=False)
        else:
            train, train_label, _, _ = util.USPS_loader(1,
                                                        train_size,
                                                        1,
                                                        test_size,
                                                        echo=False)

        for param_name, param_value, param_str_type in zip(
                param_names, param_values, param_string_types):
            print('... on parameter', param_name)
            if rerun_CV:
                params = {param_name: param_value}
                np.random.seed(0)  # need this, no random_state on CV and kNN
                # check unlisted default settings vs intended analysis
                # default n_neighbors=3 for the weights cv
                clf_cv = GridSearchCV(KNeighborsClassifier(
                    algorithm='ball_tree', n_neighbors=3),
                                      param_grid=params,
                                      cv=num_cv_folds,
                                      verbose=1)
                util.run_CV(clf_cv, clf_type, test_type, train, train_label,
                            param_name, param_value)

            # plot from files
            util.plotterB(str(clf_type + '_grid_search_' + param_name +
                              '_mean_' + test_type + '.csv'),
                          str(clf_type + '_grid_search_' + param_name +
                              '_mean_std_' + test_type + '.csv'),
                          str(param_name + ' (' + test_type + ')'),
                          str('Accuracy (' + test_type + ')'),
                          string=param_str_type)
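
util.run_CV is a project-local helper whose body is not shown here. Judging from the CSV names that util.plotterB consumes above, a minimal sketch of what it presumably does (fit the search, then write the per-setting mean and fold-to-fold std of the CV accuracy) could look like the following; the column layout is an assumption:

def run_CV(clf_cv, clf_type, test_type, train, train_label,
           param_name, param_value):
    """ hypothetical sketch of the util.run_CV helper used above """
    clf_cv.fit(train, train_label)  # run the full CV grid search
    base = clf_type + '_grid_search_' + param_name
    # mean accuracy per candidate setting, and its spread across folds
    pd.DataFrame({param_name: list(param_value),
                  'Accuracy': clf_cv.cv_results_['mean_test_score']}
                 ).to_csv(base + '_mean_' + test_type + '.csv', index=False)
    pd.DataFrame({param_name: list(param_value),
                  'Accuracy': clf_cv.cv_results_['std_test_score']}
                 ).to_csv(base + '_mean_std_' + test_type + '.csv', index=False)
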
Example #2

def run_test():
    """ set up a series of CV grid searches """
    train_size = 1000
    test_size = 1000

    #test_types=['USPS']
    #test_types=['MNIST']
    test_types = ['MNIST', 'USPS']

    clf_type = 'MLP'
    rerun_CV = True
    #rerun_CV=False

    # setup grid search parameters
    # intentionally incomplete and restricted, change as desired
    num_cv_folds = 10
    param_names = ['activation', 'hidden_layer_sizes']
    param_values = [['logistic', 'tanh', 'relu'], range(10, 60, 10)]
    param_string_types = [True, False]

    print('Running', clf_type, 'CV grid search tests...')
    for test_type in test_types:
        print('Running CV on dataset', test_type, '...')
        if test_type == 'MNIST':
            train, train_label, _, _ = util.MNIST_loader(1,
                                                         train_size,
                                                         1,
                                                         test_size,
                                                         echo=False)
        else:
            train, train_label, _, _ = util.USPS_loader(1,
                                                        train_size,
                                                        1,
                                                        test_size,
                                                        echo=False)

        for param_name, param_value, param_str_type in zip(
                param_names, param_values, param_string_types):
            print('... on parameter', param_name)
            if rerun_CV:
                params = {param_name: param_value}
                # check unlisted default settings vs intended analysis
                clf_cv = GridSearchCV(MLPClassifier(hidden_layer_sizes=100,
                                                    solver='adam',
                                                    learning_rate='adaptive',
                                                    random_state=0),
                                      param_grid=params,
                                      cv=num_cv_folds,
                                      verbose=1)
                util.run_CV(clf_cv, clf_type, test_type, train, train_label,
                            param_name, param_value)

            # plot from files
            util.plotterB(str(clf_type + '_grid_search_' + param_name +
                              '_mean_' + test_type + '.csv'),
                          str(clf_type + '_grid_search_' + param_name +
                              '_mean_std_' + test_type + '.csv'),
                          str(param_name + ' (' + test_type + ')'),
                          str('Accuracy (' + test_type + ')'),
                          string=param_str_type)
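
Once a search finishes, the winning setting can be read straight off the fitted object; a quick check, assuming clf_cv is the fitted search from the loop above:

print(clf_cv.best_params_)  # e.g. {'hidden_layer_sizes': 50}
print(clf_cv.best_score_)   # mean cross-validated accuracy of that setting
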
Example #3
def run_test():
    """ set up a series of CV grid searches """
    train_size = 1000
    test_size = 1000

    #test_types=['USPS']
    #test_types=['MNIST']
    test_types = ['MNIST', 'USPS']

    clf_type = 'Decision_Tree'
    rerun_CV = True
    #rerun_CV=False

    # setup grid search parameters
    # intentionally incomplete and restricted, change as desired
    num_cv_folds = 10
    param_names = ['max_depth', 'min_samples_leaf']
    param_values = [range(1, 6, 1), range(1, 6, 1)]

    print('Running', clf_type, 'CV grid search tests...')
    for test_type in test_types:
        print('Running CV on dataset', test_type, '...')
        if test_type == 'MNIST':
            train, train_label, _, _ = util.MNIST_loader(1,
                                                         train_size,
                                                         1,
                                                         test_size,
                                                         echo=False)
        else:
            train, train_label, _, _ = util.USPS_loader(1,
                                                        train_size,
                                                        1,
                                                        test_size,
                                                        echo=False)

        for param_name, param_value in zip(param_names, param_values):
            print('... on parameter', param_name)
            if rerun_CV:
                params = {param_name: param_value}
                # check unlisted default settings vs intended analysis
                clf_cv = GridSearchCV(DecisionTreeClassifier(
                    random_state=0, criterion='gini'),
                                      param_grid=params,
                                      cv=num_cv_folds,
                                      verbose=1)
                #clf_cv=GridSearchCV(DecisionTreeClassifier(random_state=0,criterion='entropy'),param_grid=params,cv=num_cv_folds)
                util.run_CV(clf_cv, clf_type, test_type, train, train_label,
                            param_name, param_value)

            # plot from files
            util.plotterB(str(clf_type + '_grid_search_' + param_name +
                              '_mean_' + test_type + '.csv'),
                          str(clf_type + '_grid_search_' + param_name +
                              '_mean_std_' + test_type + '.csv'),
                          str(param_name + ' (' + test_type + ')'),
                          str('Accuracy (' + test_type + ')'))
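
The commented-out line above swaps the split criterion from 'gini' to 'entropy'. For a quick side-by-side of the two outside the grid-search machinery, a sketch along these lines works, assuming train and train_label are already loaded:

from sklearn.model_selection import cross_val_score

for criterion in ('gini', 'entropy'):
    scores = cross_val_score(
        DecisionTreeClassifier(random_state=0, criterion=criterion),
        train, train_label, cv=10)
    print(criterion, 'mean CV accuracy:', scores.mean())
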
Example #4
def run_test():
    """ set up a series of CV grid searches """
    # reduced size to shorten execution time, change as desired
    train_size = 500
    test_size = 500

    #test_types=['USPS']
    #test_types=['MNIST']
    test_types = ['MNIST', 'USPS']

    clf_type = 'SVM'
    rerun_CV = True
    #rerun_CV=False

    # setup grid search parameters
    # intentionally incomplete and restricted, change as desired
    # PCA is to shorten run time, but might not be advisable for analysis
    # reduced cv to shorten execution time, change as desired
    num_cv_folds = 5
    num_pca = None
    #num_pca=30
    param_names = ['estimator__kernel', 'estimator__C']
    param_values = [['linear', 'rbf', 'poly'], np.logspace(0, 5, 10)]
    param_string_types = [True, False]

    print('Running', clf_type, 'CV grid search tests...')
    print('... some settings might take a very long time!')
    for test_type in test_types:
        print('Running CV on dataset', test_type, '...')
        if test_type == 'MNIST':
            train, train_label, _, _ = util.MNIST_loader(1,
                                                         train_size,
                                                         1,
                                                         test_size,
                                                         echo=False)
        else:
            train, train_label, _, _ = util.USPS_loader(1,
                                                        train_size,
                                                        1,
                                                        test_size,
                                                        echo=False)

        # some datasets/settings might need PCA to shorten execution
        if num_pca is not None:
            print('... running PCA pre-processing')
            pca = PCA(n_components=num_pca)
            train = pca.fit_transform(train)

        for param_name, param_value, param_str_type in zip(
                param_names, param_values, param_string_types):
            print('... on parameter', param_name)
            if rerun_CV:
                params = {param_name: param_value}
                # check unlisted default settings vs intended analysis
                clf_cv = GridSearchCV(OneVsRestClassifier(
                    estimator=SVC(C=1.0, degree=3, random_state=0)),
                                      param_grid=params,
                                      cv=num_cv_folds,
                                      verbose=2)
                util.run_CV(clf_cv, clf_type, test_type, train, train_label,
                            param_name, param_value)

            # plot from files
            util.plotterB(str(clf_type + '_grid_search_' + param_name +
                              '_mean_' + test_type + '.csv'),
                          str(clf_type + '_grid_search_' + param_name +
                              '_mean_std_' + test_type + '.csv'),
                          str(param_name + ' (' + test_type + ')'),
                          str('Accuracy (' + test_type + ')'),
                          string=param_str_type,
                          log_scale=True)
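
As the comments warn, fitting PCA on all of train before cross-validating leaks information across the CV folds. A methodologically safer variant keeps PCA inside each fold via a Pipeline; this is a sketch rather than the project's code, and the step names ('pca', 'ovr') are arbitrary:

from sklearn.pipeline import Pipeline

pipe = Pipeline([('pca', PCA(n_components=30)),
                 ('ovr', OneVsRestClassifier(SVC(C=1.0, degree=3,
                                                 random_state=0)))])
# nested parameter names gain the pipeline step as a prefix
params = {'ovr__estimator__C': np.logspace(0, 5, 10)}
clf_cv = GridSearchCV(pipe, param_grid=params, cv=5, verbose=2)
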
Example #5

def run_test(test_type='MNIST'):
    """ run test over different training sizes """
    print('Running tests, all classifiers over different training sizes...')
    
    # set default (check scikit for defaults) or simple/reasonable classifier settings
    # replace with tuned settings if desired
    # note that different tuned settings might be needed for best performance on each dataset
    clf1=DecisionTreeClassifier(random_state=0)
    clf2=KNeighborsClassifier(n_neighbors=1)
    clf3=GradientBoostingClassifier(n_estimators=50,learning_rate=1.0,max_depth=10,random_state=0)
    clf4=svm.SVC(kernel='linear',C=1.0,random_state=0)
    clf5=MLPClassifier(hidden_layer_sizes=100,solver='adam',activation='relu',
                       learning_rate='adaptive',random_state=0)
    
    clf_list=[clf1,clf2,clf3,clf4,clf5]
    clf_names=['Decision Tree','kNN','Boosted Trees','SVM','MLP']
    
    # build the per-classifier CSV names for the chosen dataset
    prefix = 'MNIST' if test_type == 'MNIST' else 'USPS'
    clf_file_tags = ['Decision_Trees', 'kNN', 'Boosted_Trees', 'SVM', 'MLP']
    f_names_train = [prefix + '_' + tag + '_by_training_size_vs_train_data.csv'
                     for tag in clf_file_tags]
    f_names_val = [prefix + '_' + tag + '_by_training_size_vs_validation_data.csv'
                   for tag in clf_file_tags]
    f_names_test = [prefix + '_' + tag + '_by_training_size_vs_test_data.csv'
                    for tag in clf_file_tags]
    
    # define train and test size as desired
    train_size = 1000
    test_size = 1000

    for clf, clf_name, f_name_train, f_name_val, f_name_test in zip(
            clf_list, clf_names, f_names_train, f_names_val, f_names_test):
        print('Running', clf_name, '...')
        err_train_list = []
        err_val_list = []
        err_test_list = []

        # test different values for the training size
        # note: the 80/20 split below leaves 800 training samples, so the
        # 900 step reuses all 800 (slicing past the end is harmless)
        train_size_list = np.arange(100, 1000, 100)

        if test_type == 'MNIST':
            train, train_label, test, test_label = util.MNIST_loader(
                1, train_size, 1, test_size, echo=False)
        else:
            train, train_label, test, test_label = util.USPS_loader(
                1, train_size, 1, test_size, echo=False)

        X_train, X_val, y_train, y_val = train_test_split(
            train, train_label, test_size=0.2, random_state=0)
        print('... train and val set size', X_train.shape, X_val.shape)
        
        for i in train_size_list:
            clf.fit(X_train[:i], y_train[:i])

            acc_train = clf.score(X_train[:i], y_train[:i])
            acc_val = clf.score(X_val, y_val)
            acc_test = clf.score(test, test_label)

            err_train_list.append(1.0 - acc_train)
            err_val_list.append(1.0 - acc_val)
            err_test_list.append(1.0 - acc_test)
            #print('... train, val, and test accuracy (error rate) at train size:',acc_train,acc_val,acc_test,i)

        print('... done, min_train, max_train',
              np.min(err_train_list), np.max(err_train_list))
        print('... done, min_val, max_val',
              np.min(err_val_list), np.max(err_val_list))
        print('... done, min_test, max_test',
              np.min(err_test_list), np.max(err_test_list))
        
        df_train = pd.DataFrame({'Classifier': [clf_name] * len(err_train_list),
                                 'Size': train_size_list,
                                 'Error': err_train_list})
        df_val = pd.DataFrame({'Classifier': [clf_name] * len(err_val_list),
                               'Size': train_size_list,
                               'Error': err_val_list})
        df_test = pd.DataFrame({'Classifier': [clf_name] * len(err_test_list),
                                'Size': train_size_list,
                                'Error': err_test_list})

        df_train.to_csv(f_name_train, index=False, header=True)
        df_val.to_csv(f_name_val, index=False, header=True)
        df_test.to_csv(f_name_test, index=False, header=True)
        
    return f_names_train, f_names_val, f_names_test
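
The returned file lists make it easy to plot the learning curves afterwards. A minimal sketch with matplotlib; the Size/Error/Classifier columns come from the DataFrames above, while the styling is an assumption:

import matplotlib.pyplot as plt

f_names_train, f_names_val, f_names_test = run_test('MNIST')
for f_name in f_names_val:
    df = pd.read_csv(f_name)
    plt.plot(df['Size'], df['Error'], label=df['Classifier'][0])
plt.xlabel('Training size')
plt.ylabel('Validation error')
plt.legend()
plt.show()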