예제 #1
0
파일: dt.py 프로젝트: skeate/cs7641-p1
def run_dt(data, title, solved_params=None):
    """
    Fit a pruned decision tree on ``data`` and save its confusion matrix.

    Args:
        data: a ``(x, y, pipeline)`` triple; ``pipeline`` is an iterable of
            ``(name, transformer)`` steps that are placed ahead of the tree
            in the scikit-learn ``Pipeline``.
        title: dataset label, forwarded to ``basicResults`` and used in the
            name of the output CSV file.
        solved_params: optional dict of pipeline parameters (``DT__*``
            keys). When given, the hyperparameter search is skipped and the
            pipeline is fitted directly with these values.

    Side effects:
        Prints progress messages and the row-normalised confusion matrix,
        and writes that matrix to ``./output/DT_<title>_confusion.csv``.
    """
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('DT', dtclf_pruned()),
    ])
    print("Splitting into train/test")
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    if solved_params is None:
        print("Doing a GridSearch for best hyperparameters")
        params = {
            'DT__criterion': ['gini', 'entropy'],
            'DT__alpha': ALPHAS,
            'DT__class_weight': ['balanced'],
            'DT__min_samples_split': [2, 3, 4, 5],
        }
        # NOTE(review): basicResults is a project helper; its return value
        # is used below as an already-fitted estimator -- confirm.
        clf = basicResults(pipe, x_train, y_train, x_test,
                           y_test, params, 'DT', title)
    else:
        print("Using pre-solved hyperparameters")
        clf = pipe.set_params(**solved_params)
        # set_params() only configures the pipeline; it has to be fitted
        # before predict() can be called on it.
        clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    conf = confusion_matrix(y_test, y_pred)
    # Normalise each row so entries are fractions of the true class.
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/DT_{}_confusion.csv'.format(title), conf, delimiter=',', fmt='%.2f')
예제 #2
0
파일: bdt.py 프로젝트: skeate/cs7641-p1
def run_boost(data, dataset, dtparams=None):
    """
    Grid-search an AdaBoost ensemble of pruned trees and save its confusion matrix.

    Args:
        data: a ``(x, y, pipeline)`` triple; ``pipeline`` is an iterable of
            ``(name, transformer)`` steps placed ahead of the booster.
        dataset: dataset label, forwarded to ``basicResults`` and used in
            the name of the output CSV file.
        dtparams: optional dict of keyword arguments for the base
            ``dtclf_pruned`` tree. ``None`` (the default) means no extra
            arguments.

    Side effects:
        Prints the row-normalised confusion matrix and writes it to
        ``./output/Boosted_<dataset>_confusion.csv``.
    """
    # A None default avoids sharing one mutable dict between calls.
    base_tree = dtclf_pruned(**(dtparams or {}))
    x, y, pipeline = data
    # NOTE(review): newer scikit-learn releases rename `base_estimator` to
    # `estimator`; confirm the pinned version before upgrading.
    pipe = Pipeline([
        *pipeline,
        ('Boost',
         ensemble.AdaBoostClassifier(algorithm='SAMME',
                                     base_estimator=base_tree)),
    ])
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    params = {
        'Boost__n_estimators': [2**i for i in range(8)],
        'Boost__algorithm': ['SAMME', 'SAMME.R'],
    }
    # NOTE(review): basicResults is a project helper; its return value is
    # used below as an already-fitted estimator -- confirm.
    clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                       'boosted', dataset)
    conf = confusion_matrix(y_test, clf.predict(x_test))
    # Normalise each row so entries are fractions of the true class.
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/Boosted_{}_confusion.csv'.format(dataset),
               conf,
               delimiter=',',
               fmt='%.2f')
예제 #3
0
# Candidate values for the tree's `alpha` parameter. The order is kept as
# is because it defines the order in which the grid is explored.
alphas = [
    -1,
    -1e-3,
    -(1e-3) * 10**-0.5,
    -1e-2,
    -(1e-2) * 10**-0.5,
    -1e-1,
    -(1e-1) * 10**-0.5,
    0,
    (1e-1) * 10**-0.5,
    1e-1,
    (1e-2) * 10**-0.5,
    1e-2,
    (1e-3) * 10**-0.5,
    1e-3,
]

# Adult pipeline: standardise the features, then fit the pruned tree.
# (The madelon variant of this experiment is disabled in this script.)
pipeA = Pipeline([
    ('Scale', StandardScaler()),
    ('DT', dtclf_pruned(random_state=55)),
])

# Hyperparameter grid, keyed by `<step>__<parameter>`.
params = dict(
    DT__criterion=['gini', 'entropy'],
    DT__alpha=alphas,
    DT__class_weight=['balanced'],
)

# basicResults is a project helper; the object it returns exposes
# `best_params_`, which is kept as the final configuration for adult.
adult_clf = basicResults(
    pipeA,
    adult_trgX, adult_trgY,
    adult_tstX, adult_tstY,
    params, 'DT', 'adult',
)
adult_final_params = adult_clf.best_params_
예제 #4
0
# Separate features from the target column. `axis` is passed by keyword
# because pandas 2.0 no longer accepts it positionally in DataFrame.drop.
adultX = adult.drop('income', axis=1).copy().values
adultY = adult['income'].copy().values

cancer = pd.read_hdf('cancer.hdf', 'cancer')
cancerX = cancer.drop('class', axis=1).copy().values
cancerY = cancer['class'].copy().values

# Candidate values for the base tree's `alpha` parameter.
alphas = [-1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
          -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5,
          1e-2, (1e-3) * 10**-0.5, 1e-3]

# Stratified 75/25 train/test splits with a fixed seed.
adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
    adultX, adultY, test_size=0.25, random_state=0, stratify=adultY)
cancer_trgX, cancer_tstX, cancer_trgY, cancer_tstY = ms.train_test_split(
    cancerX, cancerY, test_size=0.25, random_state=0, stratify=cancerY)

# Base learners for the boosted ensembles.
cancer_base = dtclf_pruned(criterion='entropy', class_weight='balanced', random_state=55)
adult_base = dtclf_pruned(criterion='entropy', class_weight='balanced', random_state=55)
OF_base = dtclf_pruned(criterion='gini', class_weight='balanced', random_state=55)

# Grid-search spaces: ensemble size crossed with the base tree's alpha.
paramsA = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
           'Boost__base_estimator__alpha': alphas}

paramsM = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
           'Boost__base_estimator__alpha': alphas}

cancer_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1, base_estimator=cancer_base, random_state=55)
adult_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1, base_estimator=adult_base, random_state=55)
예제 #5
0
from sklearn import metrics, preprocessing

from helpers import dtclf_pruned
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
import numpy as np

# Load the semicolon-separated table, label-encode every column, and
# convert to an ndarray so rows and columns can be sliced positionally.
df = pd.read_csv("student-prf.csv", sep=';', header=0)
df = df.apply(preprocessing.LabelEncoder().fit_transform)
df = np.array(df)

# Feature columns: 0..24 plus 28..31. Column 26 is the prediction target.
all_column = np.concatenate((np.arange(25), [28, 29, 30, 31]))

# The first 400 rows are used for training, the remaining rows for testing.
X = df[:400, all_column]
y = df[:400, 26]
expected = df[400:, 26]

# Fit one 5-tree AdaBoost ensemble per alpha and report on the held-out rows.
for j, alpha in enumerate([-1, -0.01, 0, 0.01, 0.1]):
    boost = AdaBoostClassifier(dtclf_pruned(alpha=alpha), n_estimators=5)
    boost.fit(X, y)
    predicted = boost.predict(df[400:, all_column])
    print('Booster number {}'.format(j))
    for i, dt in enumerate(boost.estimators_, start=1):
        print('pruned tree {}. Alpha is {}. There are {} nodes'.format(
            i, dt.alpha, dt.numNodes()))
    report = metrics.classification_report(expected, predicted)
    print("Classification report for classifier %s:\n%s\n" % (boost, report))
    stratify=adultY)
# Stratified 70/30 train/test splits with a fixed seed.
_split_options = dict(test_size=0.3, random_state=0)
(mushrooms_trgX, mushrooms_tstX,
 mushrooms_trgY, mushrooms_tstY) = ms.train_test_split(
     mushroomsX, mushroomsY, stratify=mushroomsY, **_split_options)
(redwine_trgX, redwine_tstX,
 redwine_trgY, redwine_tstY) = ms.train_test_split(
     redwineX, redwineY, stratify=redwineY, **_split_options)

# Candidate values for the base tree's `alpha` parameter.
alphas = [
    -1,
    -1e-3,
    -(1e-3) * 10**-0.5,
    -1e-2,
    -(1e-2) * 10**-0.5,
    -1e-1,
    -(1e-1) * 10**-0.5,
    0,
    (1e-1) * 10**-0.5,
    1e-1,
    (1e-2) * 10**-0.5,
    1e-2,
    (1e-3) * 10**-0.5,
    1e-3,
]

# Base learners: each dataset gets its own, identically configured tree
# instance; the OF_base tree uses the gini criterion instead of entropy.
_entropy_tree_options = dict(criterion='entropy',
                             class_weight='balanced',
                             random_state=55)
adult_base = dtclf_pruned(**_entropy_tree_options)
mushrooms_base = dtclf_pruned(**_entropy_tree_options)
redwine_base = dtclf_pruned(**_entropy_tree_options)
OF_base = dtclf_pruned(criterion='gini',
                       class_weight='balanced',
                       random_state=55)

# Define parameters for grid search cross validation
#paramsA= {'Boost__n_estimators':[1,2,5,10,20,30,40,50],'Boost__learning_rate':[(2**x)/100 for x in range(8)]+[1]}
paramsA = {
    'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
예제 #7
0
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

from helpers import dtclf_pruned
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
import numpy as np
# read_csv yields a DataFrame; convert it to an ndarray so rows and
# columns can be sliced positionally.
df = pd.read_csv("alpha-recognition.csv")
df = np.array(df)

# Column 0 is the label, the remaining columns are features. The first
# 16000 rows are used for training and the rest for testing.
X = df[:16000, 1:]
y = df[:16000, 0]
expected = df[16000:, 0]

# Fit one pruned tree per alpha and report its size and test-set metrics.
for j, alpha in enumerate(
        [-1000, -0.1, -0.01, -0.001, -0.0001, 0, 0.0001, 0.01, 0.1, 10]):
    boost = dtclf_pruned(alpha=alpha)
    boost.fit(X, y)
    predicted = boost.predict(df[16000:, 1:])
    print('Booster number {}'.format(j))
    print('There are {} nodes'.format(boost.numNodes()))
    report = metrics.classification_report(expected, predicted)
    print("Classification report for classifier %s:\n%s\n" % (boost, report))
예제 #8
0
def main():
    """
    Run the AdaBoost experiments on the ``cars`` and ``madelon`` datasets.

    Steps:
        1. Load both tables from ``data/processed/datasets.hdf`` and make
           stratified 70/30 train/test splits (``random_state=0``).
        2. Grid-search ``n_estimators`` and the base tree's ``alpha`` for
           each dataset through the project helper ``basicResults``.
        3. Apply the best parameters and call the project helpers
           ``makeTimingCurve`` and ``iterationLC`` for each dataset.
        4. Repeat ``iterationLC`` with a fixed ``alpha = -1`` configuration,
           tagged ``Boost_OF`` (presumably the over-fitting baseline --
           confirm).
    """
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    # Candidate values for the base tree's `alpha` parameter.
    alphas = [
        -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
        -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5,
        1e-2, (1e-3) * 10**-0.5, 1e-3
    ]

    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    madelon_base = dtclf_pruned(criterion='gini',
                                class_weight='balanced',
                                random_state=55)
    cars_base = dtclf_pruned(criterion='entropy',
                             class_weight='balanced',
                             random_state=55)

    # Grid-search spaces: ensemble size crossed with the base tree's alpha.
    paramsA = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }
    paramsM = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }

    madelon_booster = AdaBoostClassifier(algorithm='SAMME',
                                         learning_rate=1,
                                         base_estimator=madelon_base,
                                         random_state=55)
    cars_booster = AdaBoostClassifier(algorithm='SAMME',
                                      learning_rate=1,
                                      base_estimator=cars_base,
                                      random_state=55)

    # Madelon: scale, then four rounds of random-forest feature selection
    # (each keeps features at or above the median importance), then boost.
    cull_steps = [('Cull{}'.format(i),
                   SelectFromModel(RandomForestClassifier(random_state=i),
                                   threshold='median'))
                  for i in range(1, 5)]
    pipeM = Pipeline([('Scale', StandardScaler()),
                      *cull_steps,
                      ('Boost', madelon_booster)])

    # Cars: scale, then boost.
    pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', cars_booster)])

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                               madelon_tstY, paramsM, 'Boost', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            paramsA, 'Boost', 'cars')

    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_
    OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}

    # n_estimators values swept by iterationLC for each dataset.
    madelon_iterations = {
        'Boost__n_estimators':
        [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    }
    cars_iterations = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]}

    pipeM.set_params(**madelon_final_params)
    pipeA.set_params(**cars_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'Boost', 'madelon')
    makeTimingCurve(carsX, carsY, pipeA, 'Boost', 'cars')

    # Re-apply the tuned parameters before each iteration curve.
    pipeM.set_params(**madelon_final_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                madelon_iterations, 'Boost', 'madelon')
    pipeA.set_params(**cars_final_params)
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                cars_iterations, 'Boost', 'cars')

    # Same curves with the fixed alpha = -1 configuration.
    pipeM.set_params(**OF_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                madelon_iterations, 'Boost_OF', 'madelon')
    pipeA.set_params(**OF_params)
    # The cars pipeline must be evaluated on the cars split; it was
    # previously fed the madelon split while being labelled 'cars'.
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                cars_iterations, 'Boost_OF', 'cars')
예제 #9
0
def main():
    """
    Tune and analyse a pruned decision tree on the white-wine quality data.

    Steps:
        1. Read ``data/winequality_white.csv`` and derive a binary target:
           1 when ``quality >= 7``, otherwise 0. The first 11 columns are
           the features.
        2. Make a stratified 70/30 train/test split (``random_state=0``).
        3. Grid-search criterion and ``alpha`` through the project helper
           ``basicResults`` and apply the best parameters to the pipeline.
        4. Call the project helpers ``makeTimingCurve`` and
           ``DTpruningVSnodes`` with the tuned pipeline.

    NOTE: the adult-income variant of this experiment is disabled; its
    commented-out preprocessing was removed from this function.
    """
    # Alpha candidates: 0.001 .. 0.049 in steps of 0.001, plus 0.
    alphas = np.append(np.arange(0.001, 0.05, 0.001), 0)
    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('DT', dtclf_pruned(random_state=55))])
    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__alpha': alphas,
        'DT__class_weight': ['balanced']
    }

    # Data parsing for the wine quality dataset.
    wine_data = pd.read_csv('data/winequality_white.csv')
    wine_data['category'] = wine_data['quality'] >= 7

    wineX = wine_data[wine_data.columns[0:11]].values
    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    wineY = wine_data['category'].values.astype(int)

    wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(
        wineX, wineY, test_size=0.3, random_state=0, stratify=wineY)
    wine_clf = basicResults(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                            params, 'DT', 'wine')
    wine_final_params = wine_clf.best_params_
    pipeA.set_params(**wine_final_params)
    makeTimingCurve(wineX, wineY, pipeA, 'DT', 'wine')

    DTpruningVSnodes(pipeA, alphas, wine_trgX, wine_trgY, 'wine')
예제 #10
0
def main():
    """
    Run the pruned decision-tree experiments on ``cars`` and ``madelon``.

    Steps:
        1. Load both tables from ``data/processed/datasets.hdf`` and make
           stratified 70/30 train/test splits (``random_state=0``).
        2. Grid-search criterion and ``alpha`` for each dataset through the
           project helper ``basicResults``.
        3. Apply the best parameters and call the project helpers
           ``makeTimingCurve`` and ``DTpruningVSnodes`` for each dataset.
    """
    # Load data. `axis` is passed by keyword: pandas 2.0 no longer accepts
    # it positionally in DataFrame.drop.
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    # Candidate values for the tree's `alpha` parameter.
    alphas = [
        -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
        -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5,
        1e-2, (1e-3) * 10**-0.5, 1e-3
    ]

    # Madelon: scale, then four rounds of random-forest feature selection
    # (each keeps features at or above the median importance), then the tree.
    cull_steps = [('Cull{}'.format(i),
                   SelectFromModel(RandomForestClassifier(random_state=i),
                                   threshold='median'))
                  for i in range(1, 5)]
    pipeM = Pipeline([('Scale', StandardScaler()),
                      *cull_steps,
                      ('DT', dtclf_pruned(random_state=55))])

    # Cars: scale, then the tree.
    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('DT', dtclf_pruned(random_state=55))])

    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__alpha': alphas,
        'DT__class_weight': ['balanced']
    }

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                               madelon_tstY, params, 'DT', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            params, 'DT', 'cars')

    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_

    pipeM.set_params(**madelon_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'DT', 'madelon')
    pipeA.set_params(**cars_final_params)
    makeTimingCurve(carsX, carsY, pipeA, 'DT', 'cars')

    DTpruningVSnodes(pipeM, alphas, madelon_trgX, madelon_trgY, 'madelon')
    DTpruningVSnodes(pipeA, alphas, cars_trgX, cars_trgY, 'cars')
예제 #11
0
def main():
    """
    Run the AdaBoost experiments on the white-wine quality data.

    Steps:
        1. Read ``data/winequality_white.csv`` and derive a binary target:
           1 when ``quality >= 7``, otherwise 0. The first 11 columns are
           the features.
        2. Make a stratified 70/30 train/test split (``random_state=0``).
        3. Grid-search ``n_estimators`` and the base tree's ``alpha``
           through the project helper ``basicResults``.
        4. Apply the best parameters and call the project helpers
           ``makeTimingCurve`` and ``iterationLC``.
        5. Repeat ``iterationLC`` with a fixed ``alpha = -1`` configuration,
           tagged ``Boost_OF`` (presumably the over-fitting baseline --
           confirm).

    NOTE: the adult-income variant of this experiment is disabled; its
    commented-out code and the objects that only served it were removed.
    """
    wine_data = pd.read_csv('data/winequality_white.csv')
    wine_data['category'] = wine_data['quality'] >= 7

    wineX = wine_data[wine_data.columns[0:11]].values
    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    wineY = wine_data['category'].values.astype(int)

    # Alpha candidates: 0.001 .. 0.049 in steps of 0.001, plus 0.
    alphas = np.append(np.arange(0.001, 0.05, 0.001), 0)

    wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(
        wineX, wineY, test_size=0.3, random_state=0, stratify=wineY)

    wine_base = dtclf_pruned(criterion='gini',
                             class_weight='balanced',
                             random_state=55)

    # Grid-search space: ensemble size crossed with the base tree's alpha.
    paramsA = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }

    wine_booster = AdaBoostClassifier(algorithm='SAMME',
                                      learning_rate=1,
                                      base_estimator=wine_base,
                                      random_state=55)

    pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', wine_booster)])

    wine_clf = basicResults(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                            paramsA, 'Boost', 'wine')

    wine_final_params = wine_clf.best_params_
    OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}

    # n_estimators values swept by iterationLC.
    wine_iterations = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]}

    pipeA.set_params(**wine_final_params)
    makeTimingCurve(wineX, wineY, pipeA, 'Boost', 'wine')

    # Re-apply the tuned parameters to the pipeline that is evaluated next.
    # This was previously applied to an otherwise unused second pipeline
    # that shared the same booster instance.
    pipeA.set_params(**wine_final_params)
    iterationLC(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                wine_iterations, 'Boost', 'wine')

    # Same curve with the fixed alpha = -1 configuration.
    pipeA.set_params(**OF_params)
    iterationLC(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                wine_iterations, 'Boost_OF', 'wine')
redwineY = redwine['quality'].copy().values

# Stratified splits with a fixed seed. Adult uses only a subsample
# (train_size=0.1666, test_size=0.05); red wine is split 70/30.
(adult_trgX, adult_tstX,
 adult_trgY, adult_tstY) = ms.train_test_split(adultX,
                                               adultY,
                                               test_size=0.05,
                                               train_size=0.1666,
                                               random_state=0,
                                               stratify=adultY)
(redwine_trgX, redwine_tstX,
 redwine_trgY, redwine_tstY) = ms.train_test_split(redwineX,
                                                   redwineY,
                                                   test_size=0.3,
                                                   random_state=0,
                                                   stratify=redwineY)

# Decision-tree pipelines: standardise the features, then fit the pruned
# tree. Each dataset gets its own pipeline instance.
pipeA = Pipeline([
    ('Scale', StandardScaler()),
    ('DT', dtclf_pruned(random_state=55)),
])
pipeR = Pipeline([
    ('Scale', StandardScaler()),
    ('DT', dtclf_pruned(random_state=55)),
])

# Fixed hyperparameters for each dataset, keyed by `<step>__<parameter>`.
adult_final_params = dict(
    DT__alpha=0.0031622776601683794,
    DT__class_weight='balanced',
    DT__criterion='entropy',
)
redwine_final_params = dict(
    DT__alpha=-0.0316227766016838,
    DT__class_weight='balanced',
    DT__criterion='entropy',
)