Пример #1
0
def classifier(data, y, model="forest"):
    """Fit and return a scikit-learn classifier selected by name.

    data : training feature matrix, passed straight to ``est.fit``.
    y    : target labels; must support ``.nunique()`` (e.g. a pandas
           Series) when ``model == "logistic"``.
    model: one of "forest", "tree", "extra", "logistic", "svm",
           "boost" or "neural" (default "forest").

    Returns the fitted estimator.

    Raises ValueError for an unknown ``model`` name (previously the
    function fell through and crashed with UnboundLocalError at
    ``est.fit``).
    """
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression as lr
        # Multinomial solver is only needed for more than two classes.
        cases = y.nunique()
        if cases > 2:
            est = lr(solver="newton-cg", multi_class="multinomial")
        else:
            est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVC as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPClassifier as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    else:
        raise ValueError("unknown model name: %r" % (model,))

    est.fit(data, y)
    return est
Пример #2
0
def regression(data, y, model="forest"):
    """Fit and return a scikit-learn regressor selected by name.

    data : training feature matrix, passed straight to ``est.fit``.
    y    : regression targets.
    model: one of "forest", "tree", "extra", "linear", "svm",
           "boost" or "neural" (default "forest").

    Returns the fitted estimator.

    Raises ValueError for an unknown ``model`` name (previously the
    function fell through and crashed with UnboundLocalError at
    ``est.fit``).  The dead ``cases = y.nunique()`` in the linear
    branch (a leftover from the classifier twin) was removed.
    """
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "linear":
        from sklearn.linear_model import LinearRegression as lr
        est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVR as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPRegressor as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    else:
        raise ValueError("unknown model name: %r" % (model,))

    est.fit(data, y)
    return est
Пример #3
0
def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    """Train three classifiers on 1%/10%/100% of the training data and
    visualize the collected metrics; returns the third (boosted) model."""
    # The three candidate learners, all seeded for reproducibility.
    learners = [dtc(random_state=13), rfc(random_state=13), abc(random_state=13)]

    # Sample counts for 1%, 10% and 100% of the training set (integer
    # division keeps the counts ints, as train_predict expects).
    n_full = len(y_train)
    sample_sizes = [n_full // 100, n_full // 10, n_full]

    # Collect train/predict metrics per learner per sample size.
    results = {}
    for learner in learners:
        scores = {}
        for idx, n_samples in enumerate(sample_sizes):
            scores[idx] = train_predict(learner, n_samples, X_train, y_train, X_test, y_test)
        results[learner.__class__.__name__] = scores

    # Run metrics visualization for the three supervised learning models chosen
    vs.evaluate(results, accuracy, fscore)
    return learners[-1]
Пример #4
0
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn version 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0],
                           n_iter=10,
                           test_size=0.20,
                           random_state=0)

    # Decision tree regressor to tune.
    regressor = dtc()

    # Search 'max_depth' over 1..10.
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    # Wrap the project's performance metric as a scorer.
    scoring_fnc = make_scorer(performance_metric)

    # BUG FIX: cv_sets was created but never passed to the grid search, so
    # GridSearchCV silently fell back to its default CV strategy instead of
    # the shuffled splits built above.
    grid = gscv(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
Пример #5
0
# NOTE(review): garbled scrape — `DecisionTree` has no body (SyntaxError as
# written) and the statements below sit at module level, referring to
# module-level x/y/dtc/cross_validate defined elsewhere. Kept byte-identical;
# the `socres` typo is preserved because it is a runtime identifier.
def DecisionTree():

w
scoring = ['precision_macro', 'recall_macro','f1_macro']
clf = dtc()
socres = cross_validate(clf,x,y,scoring=scoring,cv=10,return_train_score=False)
socres
clf.predict(x)
 def __init__(self, pathToData):
     """Configure a boosting experiment on the 'Letter' dataset.

     pathToData: path to the data file consumed by the experiment runner.
     """
     self.dataFilePath = pathToData
     self.algoname = 'Boosting'
     self.datasetName = 'Letter'
     # Balanced class weights on the base tree to offset class skew.
     self.baseEstimater = dtc(class_weight='balanced')
     # x = {'base_estimator': self.baseEstimater,
     #      'base_estimator__max_depth': 15}
     # abc presumably aliases AdaBoostClassifier ('SAMME' is its discrete
     # boosting algorithm option) — confirm against the module imports.
     self.classifier = abc(base_estimator=self.baseEstimater,
                           algorithm='SAMME')
     # self.classifier.set_params(**x)
     self.cv = 5  # number of cross-validation folds used downstream
Пример #7
0
def decision_tree_implementation(df, x, x_train, y_train, x_test, y_test):
    """Fit a decision tree, render it with graphviz, and print test metrics."""
    print("Decision Tree")
    print("*************")

    clf = dtc(random_state=0)
    clf.fit(x_train, y_train)

    # Export the fitted tree and render it to "mushroom.pdf" via graphviz.
    dot_source = tree.export_graphviz(clf,
                                      out_file=None,
                                      feature_names=x.columns,
                                      filled=True,
                                      special_characters=True)
    graphviz.Source(dot_source).render("mushroom")

    # Evaluate on the held-out split.
    predictions = clf.predict(x_test)
    print(classification_report(y_test, predictions))
Пример #8
0
def classification(file, X, Y, x, y):
    """Exhaustively sweep decision-tree hyperparameters.

    Fits one tree per combination on (X, Y), scores each on (x, y), and
    hands the accumulated accuracies and parameter lists to ``_results``.
    The non-criterion axes (splitter, max_depth, ...) come from
    module-level sequences defined elsewhere in the file.
    """
    combos = []
    accuracies = []
    criterion = ['gini', 'entropy']
    for combo in it.product(criterion, splitter, max_depth, min_samples_split,
                            min_samples_leaf, min_weight_fraction_leaf,
                            max_features, random_state, max_leaf_nodes,
                            min_impurity_decrease, min_impurity_split,
                            class_weight, presort):
        model = dtc(*combo)
        model.fit(X, Y)
        accuracies.append(model.score(x, y))
        combos.append(list(combo))
    _results(file, accuracies, combos)
Пример #9
0
    def classification(self, metric, folds, alphas, graph):
        """Cross-validate a suite of baseline classifiers and report scores.

        Each model is scored with stratified k-fold CV on self.Xt_train /
        self.yt_train; results are tabulated sorted by mean score and
        optionally box-plotted.

        NOTE(review): `metric` and `alphas` are accepted but never used in
        this method — scoring falls back to each estimator's default;
        confirm intent with callers.
        """
        size = 1.3 * self.report_width // 10

        # Candidate models keyed by display name (knnc, dtc, logitc, ... are
        # module-level aliases for sklearn estimators).
        models = {}
        models["K nearest neighbors classifier K2"]  = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]  = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)        
        models["Decision tree classifier"]           = dtc()
        models["Logistic classifier"]                = logitc()
        models["SVM classifier with RBF kernel"]     = svc(gamma='scale')
        models["SVM classifier with linear kernel"]  = svc(kernel='linear')
        models["Gaussian naive bayes"]               = gnbc()
        models["Bernoulli naive bayes"]              = bnbc()
        models["SGD classifier"]                     = sgdc(max_iter=10000)
        models["Random forest classifier"]           = rfc(n_estimators=100)
        models["Gradient boosting classifier"]       = gbc()
        self.models = models

        print('\n')
        print(self.report_width * '*', '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        for model_name in models:
            # error_score=np.nan keeps one failing fold from aborting the sweep.
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, error_score=np.nan)  
            results.append(cv_scores)
            names.append(model_name)
        print(self.report_width * '*', '')
        # Tabulate mean, std and coefficient of variation per classifier.
        report = pd.DataFrame({'Classifier': names, 'Score': results})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        display(report)
        print('\n')
        if graph:
            # Box plot of the per-fold score distributions, one box per model.
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0)
            plt.show()             
        return None
Пример #10
0
def run_min_samples_leaf(training_data, training_labels, validation_data, validation_labels):
    """Sweep min_samples_leaf over 1..50, plot both accuracy curves, and
    return the leaf size with the best validation accuracy."""
    leaf_values = range(1, 51)

    train_scores = []
    valid_scores = []
    for leaf in leaf_values:
        print('Processing min samples leaf: ' + str(leaf) + '/' +
              str(len(leaf_values)))
        clf = dtc(criterion='entropy', min_samples_leaf=leaf)
        (train_acc, valid_acc) = get_training_accuracy.run(clf, training_data,
                                                           training_labels,
                                                           validation_data,
                                                           validation_labels)
        train_scores.append(train_acc)
        valid_scores.append(valid_acc)
        # Rewind the console cursor so the progress line overwrites itself.
        print(CURSOR_UP_ONE + ERASE_LINE + CURSOR_UP_ONE)

    # Plot data ------------------------------------------------------------------------------------
    # Convert fractions to percentages for the plot.
    train_scores = [score * 100 for score in train_scores]
    valid_scores = [score * 100 for score in valid_scores]

    pylab.plot(leaf_values, train_scores)
    pylab.plot(leaf_values, valid_scores)

    pylab.xlabel('Min Samples Leaf')
    pylab.ylabel('Accuracy (% out of 100)')
    pylab.title('Training and Validation Accuracy as function of Min Samples Leaf')
    pylab.legend(['Training Accuracy', 'Validation Accuracy'], loc=2)
    pylab.grid(True)
    pylab.savefig("Accuracy_vs_Min_Samples_Leaf.png")
    #pylab.show()
    pylab.close()
    pylab.clf()
    # End plot data --------------------------------------------------------------------------------

    # Pick the leaf size whose validation accuracy is highest.
    best_index, _best_accuracy = max(enumerate(valid_scores), key=itemgetter(1))
    return leaf_values[best_index]
Пример #11
0
    def decisionTree(self, screens, sym1, sym2, sym3, sym4, sym5):
        """Predict a disease from five symptoms and show it in a Tk text box.

        Fits a fresh decision tree on (self.X, self.Y), prints its accuracy
        on the held-out split, then classifies the binary symptom vector
        built from sym1..sym5 and writes the matching disease name (or
        "Not Found") into the orange Text widget.
        """
        self.decisionText = Text(screens,
                                 height=1,
                                 width=30,
                                 bg="orange",
                                 fg="black")
        self.decisionText.grid(row=1, column=7, padx=10)

        self.decisionTreeClass = dtc()
        self.decisionTreeClass = self.decisionTreeClass.fit(self.X, self.Y)

        # Check the accuracy of the algorithm (fraction, then raw count).
        self.YPrediction = self.decisionTreeClass.predict(self.XTest)
        print(accs(self.YTest, self.YPrediction))
        print(accs(self.YTest, self.YPrediction, normalize=False))

        self.clientSymp = [sym1, sym2, sym3, sym4, sym5]

        # Binary-encode the client's symptoms against the known symptom list.
        for s in range(0, len(self.symptoms)):
            for t in self.clientSymp:
                if t == self.symptoms[s]:
                    self.newList[s] = 1

        self.inputs = [self.newList]
        self.predictions = self.decisionTreeClass.predict(self.inputs)
        self.predicted = self.predictions[0]

        # NOTE(review): the loop variable `a` is read again after the loop
        # (self.diseases[a] below) — this relies on Python leaking the last
        # loop value and assumes self.diseases is non-empty; verify.
        self.ans = 'no'
        for a in range(0, len(self.diseases)):
            if self.predicted == a:
                self.ans = 'yes'
                break

        if self.ans == 'yes':
            self.decisionText.delete("1.0", END)
            self.decisionText.insert(END, self.diseases[a])
        else:
            self.decisionText.delete("1.0", END)
            self.decisionText.insert(END, "Not Found")
Пример #12
0
def trainClassifier():
    """Fit an entropy decision tree on the preprocessed training file.

    preprocess/prepare/trainfile are module-level names defined elsewhere;
    prepare() presumably returns (features, labels) — confirm upstream.
    """
    trainMat=[]
    training=[]
    trainMat=preprocess(trainfile)
    training=prepare(trainMat)
    labels=training[1]

    '''
    Fitting the training data into the decision tree
    '''
    
    topicClf = dtc(criterion='entropy',random_state=0)
    topicClf.fit(training[0],labels)
    
    
    #Cross validating the results using 10% of the training set as the test set 
    '''
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    training[0], labels, test_size=0.1, random_state=0)
    print "Cross Validation Score"
    print topicClf.score(X_test, y_test)                                        
    '''
    '''
Пример #13
0
# Pipeline step: log-transform + scale the numeric Titanic columns.
scale_num = ('scale_num',
             Scale_NumCols(['Age', 'SibSp', 'Parch', 'Fare'], take_log=True))
pipeline = Pipeline([('deal_na', Deal_NAs()),
                     ('encode_cat', Encode_CatCols(drop=['Name', 'Ticket'])),
                     scale_num])
#X_prepared = pipeline.fit_transform(X_)
X_train_p = pipeline.fit_transform(X_train)
X_vali_p = pipeline.transform(X_vali)

from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import AdaBoostClassifier as abc
from sklearn.ensemble import GradientBoostingClassifier as gbc
# NOTE(review): `model` is rebound several times below; only the last
# assignment (gradient boosting) is actually fitted.
model = lr(C=1)
model = dtc(min_samples_split=10, max_features=5)
model = abc(dtc(max_depth=4), n_estimators=100)
model = gbc(n_estimators=200)
#model = rfc(n_estimators=200 ,min_samples_split = 5)
model.fit(X_train_p, Y_train)
# print(model.score(X_train_p, Y_train))
# print(model.score(X_vali_p, Y_vali))
# coef_df = pd.DataFrame({'name':X_train_p.columns.tolist(), 'coef':model.coef_[0]})
# coef_df.sort_values('coef', ascending = False)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
Y_pred = model.predict(X_vali_p)
print(classification_report(Y_vali, Y_pred))

print(submit.head())
# One-hot encode Embarked for its categories [C, S, Q].
train["Embarked_C"] = train["Embarked"] == "C"
train["Embarked_S"] = train["Embarked"] == "S"
train["Embarked_Q"] = train["Embarked"] == "Q"
test["Embarked_C"] = test["Embarked"] == "C"
test["Embarked_S"] = test["Embarked"] == "S"
test["Embarked_Q"] = test["Embarked"] == "Q"

#train
# feature_names = ["Pclass","Sex_encode","Fare_fillin","Embarked_C","Embarked_S","Embarked_Q",]
feature_names = ["Pclass","Sex_encode","Embarked_C","Embarked_S","Embarked_Q",]

X_train = train[feature_names]
y_train = train['Survived']
X_test = test[feature_names]

# NOTE(review): despite the `dtc` alias, this imports ExtraTreeClassifier.
from sklearn.tree import ExtraTreeClassifier as dtc

model = dtc(max_depth=5)
predictions = model.fit(X_train, y_train).predict(X_test)

print (predictions)

submission = pd.read_csv("./data/titanic/gender_submission.csv", index_col="PassengerId")
submission["Survived"] = predictions

submission.to_csv("./data/titanic/result_decisiontree.csv")

# the accuracy is 0.77990 (above 77%)
                                                    random_state=0)

print(
    cl('X_train shape : {}'.format(X_train.shape), attrs=['bold'],
       color='red'))
print(cl('X_test shape : {}'.format(X_test.shape), attrs=['bold'],
         color='red'))
print(
    cl('y_train shape : {}'.format(y_train.shape),
       attrs=['bold'],
       color='green'))
print(
    cl('y_test shape : {}'.format(y_test.shape), attrs=['bold'],
       color='green'))

model = dtc(criterion='entropy', max_depth=4)
model.fit(X_train, y_train)

pred_model = model.predict(X_test)

print(
    cl('Accuracy of the model is {:.0%}'.format(
        accuracy_score(y_test, pred_model)),
       attrs=['bold']))

feature_names = df.columns[:5]
target_names = df['Drug'].unique().tolist()

plot_tree(model,
          feature_names=feature_names,
          class_names=target_names,
Пример #16
0
#%%
import math


def validation(l1, l2):
    """Return the match rate of two equal-length lists as a floored
    percentage string like "66%".

    When the lengths differ a warning is printed and -1 is returned
    (mixed return type preserved from the original contract).
    """
    if len(l1) != len(l2):
        print("two list is not same length")
        return -1
    matches = sum(1 for a, b in zip(l1, l2) if a == b)
    return str(math.floor(matches / len(l1) * 100)) + "%"


#%%
from sklearn.tree import DecisionTreeClassifier as dtc
# Baseline: sklearn decision tree trained on the train split, scored on test.
model = dtc()
predictions = model.fit(X=train[features],
                        y=train[label]).predict(X=test[features])
print(validation(predictions, list(test[label])))

#%%
# Comparison run with the project's `cb.Classifier` on the same split.
# (The bare expressions below are notebook-style cell outputs, not no-ops.)
model = cb.Classifier()
model.class_names
model.fit(train, features, label)
test[features]
predictions = model.getPredictions(test[features], mode='CLASS')
predictions
print(validation(predictions, list(test[label])))
        for l1 in fbank_feat[50:100, :]:
            for l2 in l1:
                fl2.append(l2)

        fl = ['%.4f' % elem for elem in fl]
        fl2 = ['%.4f' % elem for elem in fl2]

        for l1 in fl:
            fList.append(l1)
        for l1 in fl2:
            fList.append(l1)

        mfList.append(fList)
        fList = []
        n = n + 1


path1 = '/home/hp/Desktop/Trainingsamplesmono/'
path2 = '/home/hp/Desktop/set9/'
clf = dtc()  # this class is used to make decision tree
build(path1)  # presumably populates module-level mfList/labels — confirm

clf.fit(mfList, labels)
# Export the fitted tree to PDF for inspection.
dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("iris1.pdf")
mfList = []  #clear mflist
build(path2)  # rebuild the feature list from the second sample set
res = clf.predict(mfList)  # prediction of sentiments
print(res)
Пример #18
0
    from sklearn import datasets
    from sklearn.tree import DecisionTreeClassifier as dtc
    from sklearn.metrics import auc as skauc
    iris_data = datasets.load_iris()

    vals = iris_data["data"]
    target = iris_data["target"]

    from numpy import random, c_ as add_col
    shuffled_data = add_col[vals, target]
    random.shuffle(shuffled_data)

    vals = shuffled_data[:, :-1]
    target = shuffled_data[:, -1]

    dt = dtc(criterion="gini", splitter="best")

    tpr = []
    fpr = []
    for i in range(1, vals.shape[0] - 1):
        classifier = dt.fit(X=vals[:i, :], y=target[:i])

        preds_array = classifier.predict(vals[i:, :])
        true_labels = target[i:]

        # Class "0" vs all
        cm = Valclass.confusion_matrix(true_labels,
                                       preds_array,
                                       warnings=False)
        if cm is not None:
            aux1 = Valclass.tpr(cm)
Пример #19
0
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)
# create train and test lists (column 0 = bag-of-words, column 1 = one-hot tag)
train_x = list(training[:,0])
train_y = list(training[:,1])
# Entropy decision tree with a fixed seed for reproducibility.
model=dtc(criterion = "entropy", random_state=100)
model.fit(train_x,train_y)
# save all of our data structures
#pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( "training_data", "wb" ) )
# restore all of our data structures
#data = pickle.load( open( "training_data", "rb" ) )
#words = data['words']
#classes = data['classes']
#train_x = data['train_x']
#train_y = data['train_y']
def clean_up_sentence(sentence):
    """Tokenize *sentence* and reduce every token to its lower-case stem."""
    tokens = nltk.word_tokenize(sentence)
    return [stemmer.stem(token.lower()) for token in tokens]
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as dtc  #to train the model
from sklearn.model_selection import train_test_split as tts  #to split train and test
from sklearn.metrics import accuracy_score as asc  #to calculate the accuracy

# Load the dataset; every column except 'genre' is a feature.
my_data = pd.read_csv('music.csv')
# BUG FIX: the feature frame was named `input`, shadowing the builtin.
features = my_data.drop(columns=['genre'])
target = my_data['genre']
X_train, X_test, y_train, y_test = tts(
    features, target, test_size=0.2)  #it returns a tuple of size 4
my_model = dtc()
my_model.fit(X_train, y_train)  #training data
print(X_test)
pred = my_model.predict(X_test)

accuracy = asc(y_test, pred)
print(pred)
print('the accurancy is: ', accuracy)
from cross_validation import cross_validation as CV
import matplotlib.pyplot as plt
from feature_selection import feature_selection

#Loading data
x_train = np.loadtxt('../Data/x_train.txt')
y_train_binary = np.loadtxt('../Data/y_train_binary.txt')
x_test = np.loadtxt('../Data/x_test.txt')
y_test_binary = np.loadtxt('../Data/y_test_binary.txt')
x_orig_train = np.loadtxt('../Data/x_orig_train.txt')
y_orig_train_binary = np.loadtxt('../Data/y_orig_train_binary.txt')
x_final_test = np.loadtxt('../Data/x_final_test.txt')
y_final_test_binary = np.loadtxt('../Data/y_final_test_binary.txt')

#Modeling classifier
clf = dtc(max_depth = 3)

#Calling feature selection methods
fs = feature_selection()
#clf,x_train,x_test,x_final_test,y_out = fs.PCASelection(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
clf,x_train,x_test,x_final_test,y_out = fs.KBest(x_train,y_train_binary,x_test,y_test_binary,x_final_test,clf)
#clf.fit (x_train,y_train_binary)
#y_out = clf.predict(x_test)

#Printing scores
# BUG FIX: the two prints below were Python 2 print statements (a
# SyntaxError on Python 3); converted to print() calls — the comma-joined
# arguments produce identical space-separated output.
score = clf.score(x_test,y_test_binary)
print("Score : ", score)
print("Precision recall f-score support : ", prfs(y_test_binary,y_out))


#Cross validation
Пример #22
0

        
'''
Step.3 Training the model
'''
trainMat=[]
training=[]
trainMat=preprocess(trainfile)
training=prepare(trainMat)
labels=training[1]
#Decision Tree
#topicClf=dtc(criterion='entropy',random_state=0)
#topicClf=dtc(random_state=0)
#topicClf = RFC(n_estimators=12, max_features=5, random_state=0)
# ovr presumably aliases OneVsRestClassifier, wrapping the entropy tree for
# multi-label topic output — confirm against the (unseen) imports.
topicClf=ovr(dtc(criterion='entropy',random_state=0))
topicClf.fit(training[0],labels)
#print topicClf.multilabel_ 
#scores = cross_val_score(topicClf, training[0], labels)
#print "Mean2: ",scores.mean()    
                            
'''
topicClf = dtc(max_depth=None, min_samples_split=1,random_state=0)
scores = cross_val_score(topicClf, training[0], labels)
print "Mean1: ", scores.mean() 

                         

topicClf = ETC(n_estimators=10, max_depth=None,min_samples_split=1, random_state=0)
scores = cross_val_score(topicClf, training[0], labels)
print "Mean3: ",scores.mean()   
Пример #23
0
# Sanity-check the sizes of the train/test splits.
print(train.shape, test.shape)
# train[features] # training X

#%%
import math
def validation(l1, l2):
    """Percentage (floored, formatted as 'NN%') of positions where the two
    lists agree; prints a warning and returns -1 on a length mismatch."""
    if len(l1) != len(l2):
        print("two list is not same length")
        return -1
    hits = 0
    for left, right in zip(l1, l2):
        if left == right:
            hits += 1
    return str(math.floor(hits / len(l1) * 100)) + "%"


#%%
from sklearn.tree import DecisionTreeClassifier as dtc
# Baseline sklearn decision tree scored with the validation() helper defined
# earlier in this example.
model = dtc()
predictions = model.fit(X=train[features], y=train[label]).predict(X=test[features])
print(validation(predictions, list(test[label])))

#%%
# Comparison run with the project's cb.Classifier on the same split.
# (The bare expressions are notebook-style cell outputs, not no-ops.)
model = cb.Classifier()
model.class_names
model.fit(train, features, label)
test[features]
predictions = model.getPredictions(test[features], mode='CLASS')
predictions
print(validation(predictions, list(test[label])))
Пример #24
0
    'Percentage_of_disposable_income', 'Duration_in_Present_Residence',
    'Age_in_years', 'No_of_Credits_at_this__Bank', 'No_of_dependents',
    'Creditability'
]

categorical_columns = []

# Every column not listed in numerical_columns is treated as categorical
# (cast to str so get_dummies encodes it).
for i in df.columns:
    if i not in numerical_columns:
        df[i] = df[i].astype(str)
        categorical_columns.append(i)

# One-hot encode the categorical columns alongside the originals.
for i in categorical_columns:
    dummies = pd.get_dummies(df[i], prefix=i)
    df = pd.concat([df, dummies], axis=1)

y = df['Creditability']
x = df.drop('Creditability', axis=1)
#print(x.info())
print(x.shape)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Baseline decision tree; report train then test accuracy.
clf = dtc(random_state=0)
clf.fit(x_train, y_train)
print(clf.score(x_train, y_train))
print(clf.score(x_test, y_test))
Пример #25
0
sex = 'F'
scaler = StandardScaler()

# Keep only rows for the selected sex; the Sex column is then redundant.
data_partial = data[data['Sex'] == sex].drop('Sex', axis=1)
# corr_matrix_f, corr_matrix_m = data_f.corr(), data_m.corr()
# plot_corr_matrices(corr_matrix_f, corr_matrix_m)

y = data_partial['EmoState']
X = scaler.fit_transform(data_partial.drop('EmoState', axis=1))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=71)

# Candidate models compared below with 10-fold cross-validated accuracy.
models = (('DTC', dtc()), ('SVM', svc(C=10)), ('KNN', knc(n_neighbors=10)),
          ('SGDC', sgdc()), ('GNBC', gnbc()), ('MLPC',
                                               mlpc(max_iter=1000,
                                                    learning_rate='adaptive')))
results = []
names = []
seed = 13
scoring = 'accuracy'

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(model,
                                                 X_train,
                                                 y_train,
                                                 cv=kfold,
                                                 scoring=scoring)
Пример #26
0
	model.fit( X , y )

	X_test = [ row[ :-1 ] for row in test_data ]
	y_real = [ row[ -1 ] for row in test_data ]
	y_pred = model.predict( X_test )
	print report( y_real , y_pred )
	tp = lambda x : 1 if x == 'spam' else 0
	real = [ tp( v ) for v in y_real ]
	pred = [ tp( v ) for v in y_pred ]
	print mean_absolute_error( real , pred )
	print mean_squared_error( real , pred )

# NOTE(review): Python 2 fragment (print statement, tab indentation) — code
# kept byte-identical.
if __name__ == '__main__' :
	# Expect the train and test CSV paths as the two CLI arguments.
	if len( sys.argv ) > 2 :
		train_fpath , test_fpath = sys.argv[ 1: ]
		train_data = import_csv( train_fpath )
		test_data = import_csv( test_fpath )
		''' DECISION TREE '''
		cf = dtc( criterion = 'gini' , max_depth = 50 )
		classify( cf , train_data , test_data , 'decision_tree' )
		
		''' NEAREST NEIGHBORS '''
		cf = knc( n_neighbors = 1 , metric = 'hamming' )
		classify( cf , train_data , test_data , 'knearest_neighbors' )
		
		''' NAIVE BAYES '''
		cf = mnb( alpha = 100.0 )
		classify( cf , train_data , test_data , 'naive_bayes' )
	else :
		print "Usage python %s [train_csv_file] [test_csv_file]" % sys.argv[ 0 ]
Пример #27
0
import matplotlib.pyplot as plt
diab = pd.read_csv('diabetes.csv')
# print(diab)
# print(diab.columns)
# print(diab.isna().sum())
# print(diab.shape)
# print(diab.tail(5))
# Feature columns — presumably plasma glucose, insulin and BMI; confirm
# against the dataset's data dictionary.
X = diab[['plas', 'insu', 'mass']]
y = diab['class']
le = LabelEncoder()
y = le.fit_transform(y)
# print(y)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=109)

# Depth-1 tree (a decision stump) keeps the rendered plot below readable.
model = dtc(criterion="gini", max_depth=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_pred, y_test))

fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=100)

fn=['plas', 'insu', 'mass']
cn=['possitive', 'negative']

tree.plot_tree(model,feature_names = fn,class_names=cn,filled = True, );
plt.show()
text_representation = tree.export_text(model)
print(text_representation)
Пример #28
0
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn import metrics

n_class = 3
colors = 'ryb'
step = 0.2

iris = load_iris()

# Fit one tree per 2-feature pair and draw its decision contours.
for pairdx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2,
                                                                        3]]):
    x = iris.data[:, pair]
    y = iris.target

    clf = dtc()
    clf.fit(x, y)
    plt.subplot(2, 3, pairdx + 1)

    # Mesh over the (padded) range of the two features for the contour plot.
    x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
    y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    cs = plt.contour(xx, yy, z, cmap=plt.cm.RdYlBu)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
# NOTE(review): Python 2 fragment (print statements) — code kept byte-identical.
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as dtc
from time import time
from random_dataset import random_dataset
import numpy as np
#from sklearn.metrics import accuracy_score

# Training Model
features_train, labels_train, features_test, labels_test = random_dataset()

model1 = GaussianNB()
model2 = SVC(kernel='linear')
model3 = dtc()

# Time the fit and report held-out accuracy for each classifier in turn.
t1 = time()
model1.fit(features_train, labels_train)
print "Training time NB : ", round(time() - t1, 3), "s"

accuracy1 = model1.score(features_test, labels_test)
print "Accuracy of NB:", accuracy1

t2 = time()
model2.fit(features_train, labels_train)
print "Training time SVM : ", round(time() - t2, 3), "s"

accuracy2 = model2.score(features_test, labels_test)
print "Accuracy of SVM:", accuracy2

t3 = time()
model3.fit(features_train, labels_train)
Пример #30
0
    validation_data[missing_headers] = validation_data[missing_headers].applymap(lambda x: False)

# Mirror columns from the validation frame into the training frame and
# zero-fill them. NOTE(review): `columns.diff` presumably computes a header
# set difference — a plain pandas Index exposes `.difference`, not `.diff`;
# confirm the frame type.
missing_headers = validation_data.columns.diff(training_data.columns)
if len(missing_headers) > 0:
    training_data[missing_headers] = validation_data[missing_headers]
    training_data[missing_headers] = training_data[missing_headers].applymap(lambda x: False)

# Process Decision Tree: tune max_depth and min_samples_leaf separately.
best_max_depth = classifier_run.run_max_depth(training_data, training_labels,
                                              validation_data, validation_labels)
best_min_samples_leaf = classifier_run.run_min_samples_leaf(training_data, training_labels,
                                                            validation_data, validation_labels)
print('Optimal max depth was: ' + str(best_max_depth))
print('Optimal min samples leaf: ' + str(best_min_samples_leaf))

# Refit with the tuned hyperparameters and export the top of the tree to PDF.
clf = dtc(criterion='entropy', max_depth=best_max_depth, min_samples_leaf=best_min_samples_leaf)
clf.fit(training_data, training_labels)
dot_data = StringIO.StringIO()
export_graphviz(clf, out_file=dot_data, max_depth=2) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("Decision_Tree.pdf") 

# Random-forest sweeps: default settings vs. a constrained variant.
(best_n_estimator, best_n_estimator_accuracy) = classifier_run.run_random_forest(
                                                   training_data, training_labels,
                                                   validation_data, validation_labels)
(best_n_estimator_modified, 
 best_n_estimator_modified_accuracy) = classifier_run.run_random_forest(
                                            training_data, training_labels, validation_data,
                                            validation_labels, best_max_depth=30,
                                            best_min_samples_leaf=1)
Пример #31
0
# Evaluate the (externally constructed) linear SVM on the training features.
svm_classifier = clf.fit(training_features, training_labels)
predictions = svm_classifier.predict(training_features)
print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions, training_labels)
print("Training data\t" + str(precision))
predictions = svm_classifier.predict(test_features)
#precision = calculate_precision(predictions,test_gold_labels)
#print("Test data\t" + str(precision))
#Real time tesing
#real_time_test(svm_classifier,vocabulary_mv)

##Decision tree algorithm
from sklearn.tree import DecisionTreeClassifier as dtc

# Shallow gini tree, seeded for reproducibility.
clf_gini = dtc(criterion="gini",
               random_state=100,
               max_depth=3,
               min_samples_leaf=5)
dt_classifier = clf_gini.fit(training_features, training_labels)
predictions = dt_classifier.predict(training_features)
print("Precision of DecisionTreeClassifier is")
# NOTE(review): precision is computed from TRAINING-set predictions against
# test_gold_labels — likely a bug in the original example; verify intent.
precision = calculate_precision(predictions, test_gold_labels)
print("Test data\t" + str(precision))
#Real time tesing
real_time_test(dt_classifier, vocabulary_mv)
##Implementation of logistice regression
from sklearn.linear_model import LinearRegression as lr

lmModel = lr()
lm_classifier = lmModel.fit(training_features, training_labels)
predictions = lm_classifier.predict(test_features)
print("Precision of LinearRegression is")
Пример #32
0
# Label-encode the remaining categorical feature columns in place so the
# feature matrix X is fully numeric for the decision tree.
X[:, 1] = le_sex.transform(X[:, 1])
le_BP = pproc.LabelEncoder()
# Fitting on the explicit category list fixes the integer coding regardless
# of which values actually occur in the data.
le_BP.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = le_BP.transform(X[:, 2])
le_chol = pproc.LabelEncoder()
le_chol.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 3] = le_chol.transform(X[:, 3])
print(X[0:5])
# Target: the drug prescribed to each patient.
y = data['Drug']
print(y[0:5])
from sklearn.model_selection import train_test_split as tts
# 70/30 split; the fixed random_state makes the split reproducible.
X_trn, X_test, y_trn, y_test = tts(X, y, test_size=0.3, random_state=3)
print(X_trn.shape)
print(y_trn.shape)
#modelling from here now
drugtree = dtc(criterion='entropy', max_depth=4)
drugtree.fit(X_trn, y_trn)
predtree = drugtree.predict(X_test)
# Eyeball a few predictions against the held-out labels before scoring.
print(predtree[0:5])
print(y_test[0:5])
#finding the accuracy of model
from sklearn import metrics
import matplotlib.pyplot as plt
print("decision tree accuracy:", metrics.accuracy_score(y_test, predtree))
#visualization
# NOTE(review): sklearn.externals.six was removed in scikit-learn 0.23; on
# modern versions use `from io import StringIO` instead.
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
dot_data = StringIO()
# Output file for the rendered tree image (used further down, off-screen).
filename = "drugtree.png"
Пример #33
0
# Axis labels for the confusion-matrix heatmap drawn just above (off-screen).
plt.xlabel('Predicted')
plt.ylabel('Truth')

print(classification_report(y_test, y_pred))

#applying k-fold cross validation
# 10-fold CV of the previously fitted classifier on the training split only.
from sklearn.model_selection import cross_val_score as cvs
accuracies = cvs(estimator=classifier,X=x_train,y=y_train,cv=10)
print(accuracies.mean())
print(accuracies.std())

"""Decision Tree"""

#fitting decision tree classifier to the training set
# NOTE: this rebinds `classifier`, replacing the model evaluated above.
from sklearn.tree import DecisionTreeClassifier as dtc
classifier = dtc(criterion='entropy' , random_state=0)
classifier.fit(x_train, y_train)

#predicting the test set results
y_pred=classifier.predict(x_test)

from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the decision tree, rendered as a seaborn heatmap.
cm=confusion_matrix(y_test, y_pred)
plt.figure(figsize = (5,5))
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')

print(classification_report(y_test, y_pred))
Пример #34
0
import sys
import sklearn
from classifier_utils import *
from sklearn.tree import DecisionTreeClassifier as dtc

if __name__ == '__main__':
    if len(sys.argv) > 3:
        # Expected arguments: csv path, split criterion, and max tree depth.
        infilepath, crit, depth = sys.argv[1:]
        data = import_csv(infilepath)
        cf = dtc(criterion=crit, max_depth=int(depth))
        stats = cross_validation(data, cf)
        print("PARAMS: criterion=%s , max_depth=%s" % (crit, depth))
        print_stats(stats)
    else:
        # BUG FIX: the usage text previously advertised "[neighbors]
        # [distance]" (a KNN signature copied from elsewhere); this script
        # actually takes a split criterion and a max depth.
        print("Usage python %s [csv_file] [criterion] [max_depth]" % sys.argv[0])
# Report the accuracies of the hand-rolled implementation computed above.
print(f"Manual Training Accuracy: {training_accuracy:.2%}")
print(f"Manual Test Accuracy: {test_accuracy:.2%}")

# =============================================================================
# Compare to actual function using pandas and sklearn
# =============================================================================

df = pd.read_csv("iris.csv")
# Stratify on species so both splits keep the same class proportions.
train, test = train_test_split(df,
                               train_size=.75,
                               stratify=df["species"],
                               random_state=7)

target = ["species"]

X_train = train.drop(target, axis=1)
y_train = train[target]

X_test = test.drop(target, axis=1)
y_test = test[target]

clf = dtc(max_depth=5, min_samples_split=4)
clf.fit(X_train, y_train)

# Find accuracy of sklearn implementation
training_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)

print(f"Sklearn Train Score: {training_accuracy:.2%}")
# BUG FIX: removed the stray comma the original printed after the colon
# ("Sklearn Test Score:, 95.00%").
print(f"Sklearn Test Score: {test_accuracy:.2%}")
Пример #36
0
def main():
    """Manually cross-validate KNN over k=1..50 on 5 folds of the iris CSV,
    then train KNN (k=2) and a decision tree on each remaining fold and
    compare them on a held-out split.

    Usage: script takes the CSV path as its first CLI argument.
    """
    df = getDF(sys.argv[1])
    # Shuffle once so the five 30-row folds are not ordered by class.
    df_shuffled = df.sample(frac=1)
    df = df_shuffled.reset_index(drop=True)

    df_list = []
    for start in range(0, 150, 30):
        df_list.append(df[start:start + 30])

    Columns = ['sepal_l', 'sepal_w', 'petal_l', 'petal_w', 'result']
    train_knn = pd.DataFrame(columns=Columns)
    test_knn = pd.DataFrame(columns=Columns)
    k_vals = {}
    all_accuracies = []
    for fold in range(5):
        print(fold)
        # Fold `fold` is the test set; the other four folds form training.
        test_knn = pd.concat([test_knn, df_list[fold]])

        for j in range(5):
            if fold != j:
                train_knn = pd.concat([train_knn, df_list[j]])

        x_test_knn = test_knn.loc[:, :'petal_w']
        y_test_knn = test_knn.loc[:, ['result']]

        x_train_knn = train_knn.loc[:, :'petal_w']
        y_train_knn = train_knn.loc[:, ['result']]

        # Accuracy of KNN for every k in 1..50 on this fold.
        # BUG FIX: the inner loops below originally reused `i` and shadowed
        # the outer fold index; renamed for clarity and safety.
        accuracies = []
        for k in range(1, 51):
            knn_classifier = knn(n_neighbors=k)
            knn_classifier.fit(x_train_knn, y_train_knn)
            knn_y_pred = knn_classifier.predict(x_test_knn)
            accuracies.append(accuracy_score(y_test_knn, knn_y_pred))

        all_accuracies.append(accuracies)
        tot = 0

        # Running average of accuracy per k across the folds seen so far.
        for idx in range(50):
            for fold_acc in all_accuracies:
                tot += fold_acc[idx]
            # BUG FIX: divide by the number of folds accumulated so far,
            # not a hard-coded 5 — the intermediate averages printed below
            # were understated on the first four folds.
            tot = tot / len(all_accuracies)
            k_vals[idx + 1] = tot
            tot = 0

        print(k_vals)
        # Reset the accumulators for the next fold.
        test_knn = pd.DataFrame(columns=Columns)
        train_knn = pd.DataFrame(columns=Columns)

    print(k_vals)

    num = 1
    for training_df in df_list[1:]:
        x_train = training_df.loc[:, :'petal_w']
        y_train = training_df.loc[:, ['result']]

        knn_classifier = knn(n_neighbors=2)
        knn_classifier.fit(x_train, y_train)

        dtree_classifier = dtc(criterion='entropy',
                               random_state=100,
                               max_depth=8,
                               min_samples_leaf=4)
        dtree_classifier.fit(x_train, y_train)

        # NOTE(review): x_test / y_test are not defined in this function;
        # presumably module-level globals hold a held-out split — confirm.
        knn_y_pred = knn_classifier.predict(x_test)
        dtree_y_pred = dtree_classifier.predict(x_test)

        # print('{}: Dtree Accuracy - {}'.format(num, accuracy_score(y_test, dtree_y_pred)))
        # print('Report: {}'.format(classification_report(y_test, dtree_y_pred)))

        # print('{}: KNN Accuracy - {}'.format(num, accuracy_score(y_test, knn_y_pred)))
        # print('Report: {}'.format(classification_report(y_test, knn_y_pred)))

        # print('-'*50)
        num += 1
    return
Пример #37
0
# Target: trade direction per row of the covid DataFrame.
y = covid['Direction']

covid.info()

# print(set(alc["Series_reference"]))
# Series_reference_encoded = le.fit_transform(alc["Series_reference"])
# print('Series_reference_encoded', Series_reference_encoded)

from sklearn.model_selection import train_test_split

# 90/10 split; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.10,
                                                    random_state=50)

# Depth-1 tree = a single decision stump on the most informative feature.
model = dtc(criterion="entropy", max_depth=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(set(covid['Direction']))
# accuracy_score is symmetric, so the (y_pred, y_test) argument order is fine.
print("Accuracy:", metrics.accuracy_score(y_pred, y_test))

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=266)

# Feature and class names for the tree plot.
# NOTE(review): fn/cn are hard-coded; confirm they match X's columns and the
# actual classes in covid['Direction'].
fn = ['Value', 'Cumulative', 'Year', 'Commodity', 'Country']
cn = ['right', 'left', 'center']

tree.plot_tree(model, feature_names=fn, class_names=cn, filled=True)

# Also dump a plain-text rendering of the fitted stump.
text_representation = tree.export_text(model)
print(text_representation)
    def classification(self, metric: str, folds: int, printt: bool = True, graph: bool = False) -> None:
        """Benchmark a battery of sklearn classifiers with stratified K-fold CV.

        Each candidate model is scored with cross_val_score on the
        transformed training split (self.Xt_train / self.yt_train) and the
        results are collected into a table ranked by mean score.

        Args:
            metric: scoring string forwarded to cross_val_score (e.g. 'accuracy').
            folds: number of StratifiedKFold splits.
            printt: when True, print the ranked report table.
            graph: when True, draw a box plot of per-model CV scores.

        Side effects: stores the candidate estimators in self.models, the
        ranked table in self.report_performance and, when graph=True,
        appends the figure to self.graphs_model. Always returns None.
        """
        size = self.graph_width

        # Binary vs multiclass target decides which logistic variants to try.
        if len(self.y.iloc[:,0].unique()) > 2:
            struct = 'multiclass'
        else:
            struct = 'binary'

        # significant model setup differences should be list as different models
        models = {}
        models["Linear discriminant analysis"]          = ldac()
        models["Nearest centroid classifier euclidian"] = ncc(metric='euclidean')
        models["Nearest centroid classifier manhattan"] = ncc(metric='manhattan')
        models["K nearest neighbors classifier K2"]     = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]     = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"]    = knnc(n_neighbors=10)
        models["Decision tree classifier"]              = dtc()
        models["Gaussian naive bayes"]                  = gnbc()
        models["Bernoulli naive bayes"]                 = bnbc(binarize=0.5)
        models["Multinomial naive bayes"]               = mnbc()
        models["SGD classifier"]                        = sgdc(max_iter=10000)
        models["Ridge classifier"]                      = rc()

        # SVMs scale poorly with sample count; only try them on small data.
        if len(self.Xt_train) < 10000:
            models["SVM classifier RBF"]                = svc(gamma='scale')
            models["SVM classifier Linear"]             = svc(kernel='linear')
            models["SVM classifier Poly"]               = svc(kernel='poly')

        # Ensemble models only when the data is small or very low-dimensional.
        if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
            models["Gradient boosting classifier"]      = gbc()
            models["Random forest classifier"]          = rfc(n_estimators=100)

        if struct == 'multiclass':
            models["Logistic classifier multinomial"]   = logitc(multi_class='multinomial', solver='lbfgs')
            models["Logistic classifier auto"]          = logitc(multi_class='auto')
            models["Logistic One vs Rest"]              = ovrc(logitc())
            models["Logistic One vs One"]               = ovoc(logitc())

        if struct == 'binary':
            models["Logistic classifier"]               = logitc(max_iter=2000)

        self.models = models

        # Score every candidate; error_score=np.nan keeps one failing model
        # from aborting the whole comparison.
        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        et = []  # elapsed wall-clock time per model
        for model_name in models:
            start = time.time()
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train, cv=kf, scoring=metric, error_score=np.nan)
            results.append(cv_scores)
            names.append(model_name)
            et.append((time.time() - start))
            #print(model_name, time.time() - start)
        # Build the ranked report: mean/std of CV scores plus the coefficient
        # of variation (std as a % of the mean), best model first.
        report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        report.reset_index(inplace=True, drop=True)
        self.report_performance = report

        if printt:
            print('\n')
            print(self.report_width * '*', '\n*')
            print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
            print(self.report_width * '*', '')
            print(report)
            print('\n')

        if graph:
            # Box plot of the raw per-fold scores for each model.
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0, bottom=0.25)
            self.graphs_model.append(fig)
            plt.show()
        return None