Example #1
def find_best_feature(selected_features, X_train, C_param, y_train, feature):
    # decide on which features we are using (selected_features + feature)
    features_in_use = np.append(selected_features, feature)
    X_train_filt = X_train[:, features_in_use]
    X_train_filt_ranked = bin_rank(X_train_filt)

    # Fit a Categorical Naive Bayes model (a Logistic Regression alternative is kept commented out below)
    lr = CategoricalNB(alpha=1.0,
                       fit_prior=True,
                       class_prior=None,
                       min_categories=5).fit(X_train_filt_ranked, y_train)
    #lr = LogisticRegression(C=C_param, random_state=0).fit(X_train_filt_ranked, y_train)
    # Get the accuracy on the training set
    accu_train = lr.score(X_train_filt_ranked, y_train)
    # Tuple format = (feature number, training accuracy found)
    return (feature, accu_train)
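# `bin_rank` is defined elsewhere in the source project and is not shown here.
# A minimal sketch of one possible implementation, assuming it quantile-bins each
# continuous column into ordinal integer codes (n_bins=5 matches the min_categories=5
# passed to CategoricalNB above; both the binning strategy and the parameters are
# assumptions, not the original code):
from sklearn.preprocessing import KBinsDiscretizer

def bin_rank(X, n_bins=5):
    # Discretize every column into ordinal bins so CategoricalNB receives
    # non-negative integer-coded categories.
    kbd = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    return kbd.fit_transform(X).astype(int)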
Example #2
def selected_feature_check(data, X_train, y_train, selected_features,
                           selected_features_by_name, C_param, best_feature):
    # Save the previous selected features to check later if we have made a change
    prev_selected_features = selected_features.copy()
    prev_selected_features_by_name = selected_features_by_name.copy()
    # Add the best feature to the list
    selected_features = np.append(selected_features, best_feature)
    selected_features_by_name.append(data.columns[best_feature])
    feature_removal_score = {}

    # Iterate through each feature in the list and remove it
    for feature in selected_features:
        temp_features = np.setdiff1d(selected_features, np.array([feature]))
        X_train_filt = X_train[:, temp_features]
        X_train_filt_ranked = bin_rank(X_train_filt)
        # Fit a Categorical Naive Bayes model (a Logistic Regression alternative is kept commented out below)
        lr = CategoricalNB(alpha=1.0,
                           fit_prior=True,
                           class_prior=None,
                           min_categories=5).fit(X_train_filt_ranked, y_train)
        #lr = LogisticRegression(C=C_param, random_state=0).fit(X_train_filt_ranked, y_train)
        # Get the accuracy on the training set
        accu_train = lr.score(X_train_filt_ranked, y_train)
        feature_removal_score[feature] = accu_train

    # Find the feature whose removal yields the highest training accuracy
    max_key = max(feature_removal_score,
                  key=lambda k: feature_removal_score[k])
    selected_features = np.setdiff1d(selected_features, np.array([max_key]))
    max_key_name = data.columns[max_key]
    selected_features_by_name = list(
        set(selected_features_by_name) - set([max_key_name]))

    # Check whether we made any change; if not, let the caller know this function is no longer needed
    if np.array_equal(selected_features, prev_selected_features):
        print(
            "-----------------------------------------------------------------------------------------------"
        )
        print(
            "We have found the unchanged set, thus from now on we are just adding to our selected features.\n"
        )
        print(
            "-----------------------------------------------------------------------------------------------"
        )
        return max_key, selected_features, selected_features_by_name, True
    else:
        return max_key, selected_features, selected_features_by_name, False
Example #3
NB.fit(X_train_new, y_train)

prediction['Naive Bayes'] = NB.predict(X_test_new)

#accuracy, precision, recall, confusion matrix
print("Acurracy:")
print(accuracy_score(y_test, prediction['Naive Bayes']))
print("\n")
print("Classfication report:")
print(classification_report(y_test, prediction['Naive Bayes']))
print("\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Naive Bayes']))

#scoring with train data
print('train score:', NB.score(X_train_new, y_train))

# scoring with test data
print('test score:', NB.score(X_test_new, y_test))

NB.predict_proba(X_test_new)
"""# Random Forest"""

#reassemble training dataset
#for numerical features, use the ones selected in Logistic Regression
#for categorical features, use the datasets after applying label encoding

#training dataset
x = X_sm_num.drop(columns=['hour', 'N1', 'N2', 'N5', 'N6', 'N7'], axis=1)
train1 = x.join(X_sm_c1).join(X_sm['newlabel'])
train1
Example #4
#[1. 1. 1. 1. 1.]

mlp1.fit(x_train, y_train)
mlpscores = cross_val_score(mlp1, x_train, y_train, cv=5)
print("MLPClassifier Cross Validation Attempt 1: " + str(mlpscores))
#[0.98242531 0.99032542 0.98504837 0.9876869  0.98416887]

mlp2 = MLPClassifier(max_iter=8)
mlp2.fit(x_train, y_train)
mlpscores = cross_val_score(mlp2, x_train, y_train, cv=5)
print("MLPTreeClassifier Cross Validation Attempt 2: " + str(mlpscores))
#Increasing max_iter to 8 raises the cross-val scores compared to the previous attempt.
#[0.99912127 0.99824099 0.99912049 0.99912049 0.99736148]

cnb1.fit(x_train, y_train)
cnb1.score(x_test, y_test)
cnbscores = cross_val_score(cnb1, x_train, y_train, cv=5)
print("CategoricalNB Cross Validation Attempt 1: " + str(cnbscores))
#[0.94112478 0.94547054 0.93755497 0.93491645 0.92524186]

cnb2 = CategoricalNB(alpha=0.2)
cnb2.fit(x_train, y_train)
cnbscores = cross_val_score(cnb2, x_train, y_train, cv=5)
print("CategoricalNB Cross Validation Attempt 2: " + str(cnbscores))
#Adjusting alpha to 0.2 increases the cross-val scores.
#[0.95342707 0.96042216 0.95602463 0.95074758 0.93667546]

rfc1.fit(x_train, y_train)
rfcscores = cross_val_score(rfc1, x_train, y_train, cv=5)
print("RandomForestClassifer Cross Validation Attempt 1: " + str(rfcscores))
#[0.99472759 0.99560246 0.99472296 1.         0.99472296]
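# The two CategoricalNB attempts above differ only in `alpha`; a small sweep makes the
# comparison systematic. A minimal sketch reusing the x_train / y_train already defined
# above (assumed to be non-negative integer-encoded, as CategoricalNB requires):
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import CategoricalNB

for alpha in (1.0, 0.5, 0.2, 0.1):
    scores = cross_val_score(CategoricalNB(alpha=alpha), x_train, y_train, cv=5)
    print("alpha={}: mean CV accuracy = {:.4f}".format(alpha, scores.mean()))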
Example #5
def get_solid_20(data,
                 X_train,
                 y_train,
                 X_val,
                 y_val,
                 thread_Pool,
                 selected_features=[],
                 selected_features_by_name=[],
                 C_param=1):
    # Getting all the features and remembering to remove the isLumA column
    features = [i for i in range(data.shape[1] - 1)]
    # Removing from the available features what we already have in the selected features
    features = np.setdiff1d(features, selected_features)
    # Making sure our selected features contain what we already have
    selected_features = selected_features
    selected_features_by_name = selected_features_by_name
    highest_accu = 0
    best_feature = None
    selected_features_dict = []
    # Boolean flag indicating whether we are to continue trying to swap out the 20 original features
    isFinishedSwapping = False
    isExit = False
    i = 0

    # Iterate for the number of rounds we want to perform (Note: max_rounds != number of features we will have at the end)
    while True:
        # Keeping track of time, to monitor how long it takes for each iteration
        start = time.time()
        best_feature = None
        highest_accu = 0
        result = []
        # Create a pool of thread_Pool worker processes
        pool = multiprocessing.Pool(thread_Pool)
        # Create a partial function so the fixed arguments can be passed alongside each candidate feature
        func = partial(find_best_feature, selected_features, X_train, C_param,
                       y_train)
        # Gather the results
        result = pool.map(func, features)
        pool.close()
        pool.join()
        # Iterate through the results and find the best feature
        for res in result:
            if res[1] > highest_accu:
                highest_accu = res[1]
                best_feature = res[0]

        # Add the best feature to the list
        if not isFinishedSwapping:
            # We still haven't converged on our starting selected features, so run the function to update them
            best_feature, selected_features, selected_features_by_name, isFinishedSwapping = selected_feature_check(
                data, X_train, y_train, selected_features,
                selected_features_by_name, C_param, best_feature)
            if isFinishedSwapping:
                isExit = True
        else:
            isExit = True
            # We have converged, so now we just add the best feature
            selected_features = np.append(selected_features, best_feature)
            selected_features_by_name.append(data.columns[best_feature])

        # Train the model again with these selected features
        X_train_filt = X_train[:, selected_features]
        X_val_filt = X_val[:, selected_features]
        # Rank the data
        X_train_filt_ranked = bin_rank(X_train_filt)
        X_val_filt_ranked = bin_rank(X_val_filt)
        # Train the model
        lr = CategoricalNB(alpha=1.0,
                           fit_prior=True,
                           class_prior=None,
                           min_categories=5).fit(X_train_filt_ranked, y_train)
        #lr = LogisticRegression(C=C_param, random_state=0).fit(X_train_filt_ranked, y_train)
        # Measure Validation accuracy with the features
        accu_val = lr.score(X_val_filt_ranked, y_val)
        # Remove the feature found from the list of features
        features = np.setdiff1d(features, np.array([best_feature]))
        # Measure time
        times = time.time() - start

        # Populate dictionary of info and add it to our list and continue
        selected_features_dict.append({
            "Feature": selected_features_by_name.copy(),
            "Iteration": i + 1,
            "Training accuracy": highest_accu,
            "Validation Accuracy": accu_val,
            "Time": times
        })
        print("Round:", i + 1, "complete")
        i = i + 1
        if isExit:
            break
    print("Finished: Total time can  be found below:")
    # return a dictionary
    return (selected_features_dict, highest_accu)
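Example #6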
def naive_bayes(x_train, y_train):
    model = CategoricalNB()
    model.fit(x_train, y_train)
    score = model.score(x_train, y_train)
    return score
Example #7
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import brier_score_loss

X, y = make_blobs(n_samples=[500, 500],
                  centers=[[0.0, 0.0], [2.0, 2.0]],
                  cluster_std=[0.5, 0.5],
                  random_state=0,
                  shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=420)

# Binning: convert the data into categorical form
from sklearn.preprocessing import KBinsDiscretizer
kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
kbd.fit(X_train)
X_train_ = kbd.transform(X_train)
X_test_ = kbd.transform(X_test)

print('Modeling with categorical (binned) values')
cnb = CategoricalNB()
cnb.fit(X_train_, y_train)
print('test accuracy: {}'.format(cnb.score(X_test_, y_test)))
print('test brier_score_loss: {}'.format(
    brier_score_loss(y_test, cnb.predict_proba(X_test_)[:, 1], pos_label=1)))
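# The binning and CategoricalNB steps above can also be chained in a Pipeline, so the
# discretizer is fit on the training split only and reapplied automatically at score
# time. A minimal sketch using the same X_train / X_test / y_train / y_test defined above:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(
    KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile'),
    CategoricalNB())
pipe.fit(X_train, y_train)
print('pipeline test accuracy: {}'.format(pipe.score(X_test, y_test)))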
Example #8
def scoring_comparison(base_path, datasets, verbose=1, test_size=0.3, seed=None, n_iterations=30):
    column_names = ["dataset",
                    "custom_training_score",
                    "custom_test_score",
                    "categorical_training_score",
                    "categorical_test_score"]
    data = []
    clf_no_encoding = NaiveBayes(encode_data=True)
    clf_categorical_sklearn = CategoricalNB()
    
    datasets_iter = tqdm(datasets, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    c = CustomOrdinalFeatureEncoder()
    l = CustomLabelEncoder()
    
    for dataset in datasets_iter:
        dataset_name, label = dataset
        data_filename = f"{dataset_name}.data.csv"
        test_filename = f"{dataset_name}.test.csv"
        X, y = get_X_y_from_database(base_path=base_path,
                                     name = dataset_name,
                                     data = data_filename, 
                                     test = test_filename, 
                                     label = label)
        custom_train = []
        custom_test = []

        sklearn_train = []
        sklearn_test = []


        X = c.fit_transform(X)
        y = l.fit_transform(y)
        for iteration in range(n_iterations):
            if verbose:
                datasets_iter.set_postfix({"Dataset": dataset_name, "seed":iteration})
                datasets_iter.refresh()
            try:
                X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                    test_size=test_size,
                                                                    random_state=seed + iteration,
                                                                    shuffle=True,
                                                                    stratify=y)
            except ValueError:
                # Not enough samples of some class to stratify on y
                X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                    test_size=test_size,
                                                                    random_state=seed + iteration,
                                                                    shuffle=True)
            # Fit
            clf_no_encoding.fit(X_train, y_train)
            clf_categorical_sklearn.min_categories = [
                1 + np.max(np.concatenate([X_train[:, j], X_test[:, j]]))
                for j in range(X_train.shape[1])
            ]
            clf_categorical_sklearn.fit(X_train, y_train)
            
            
            # Predict
            custom_train.append(clf_no_encoding.score(X_train, y_train))
            custom_test.append(clf_no_encoding.score(X_test, y_test))
            sklearn_train.append(clf_categorical_sklearn.score(X_train, y_train))
            sklearn_test.append(clf_categorical_sklearn.score(X_test, y_test))
        data.append([dataset_name, np.mean(custom_train), np.mean(custom_test),
                     np.mean(sklearn_train), np.mean(sklearn_test)])
    return pd.DataFrame(data, columns=column_names)
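# Note on the `min_categories` handling above: it is set so that category values that
# happen to appear only in the test split do not raise an IndexError at predict time.
# The same idea can be expressed through the constructor parameter; a small self-contained
# sketch with synthetic integer-coded data (shapes and values are illustrative only):
import numpy as np
from sklearn.naive_bayes import CategoricalNB

X_all = np.array([[0, 1], [1, 2], [2, 0], [0, 3]])
y_all = np.array([0, 1, 0, 1])
n_cats = 1 + X_all.max(axis=0)          # per-feature category counts over all data
clf = CategoricalNB(min_categories=n_cats).fit(X_all[:3], y_all[:3])
print(clf.predict(X_all[3:]))           # category 3 in column 1 is unseen in training but still valid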
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/grbruns/cst383/master/heart.csv')
df['output'] = df['output'] - 1

predictors = ['chestpain', 'exercise']
X = df[predictors].values
y = df['output'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

blind_prediction = np.median(y_train)  # the median of a 0/1 target is the majority class, so this is a majority-class baseline
print((y_test == blind_prediction).mean())

clf = CategoricalNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)  # per-class posterior probabilities, one column per class

clf.score(X_test, y_test)

df.describe()
X['Sex'] = X['Sex'].apply(lambda x: 1 if x == 'female' else 0)
X.head()


X.columns[X.isna().any()]
X.describe()

X.fillna(X.mean(), inplace=True)
X.describe()


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
model = CategoricalNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)
model.score(X_train, y_train)

data = pd.read_csv('spam.csv.xlsx')
data.head()
from sklearn.feature_extraction.text import CountVectorizer
data.describe()
data.groupby('Category').describe()
data['spam'] = data['Category'].apply(lambda x: 1 if x == 'spam' else 0)
data.head()
X_train, X_test, y_train, y_test = train_test_split(data.Message, data.spam)

v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:2]
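# The spam snippet above stops right after vectorizing the training text. A minimal
# continuation, assuming the intended classifier is MultinomialNB (the usual Naive Bayes
# variant for word-count features; CategoricalNB would treat each raw count as a distinct
# category):
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_count, y_train)
X_test_count = v.transform(X_test.values)
print('spam test accuracy:', nb.score(X_test_count, y_test))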