def LDA(x, y, onlycv=False, testacc=False, smote=False):
    """Train and evaluate a Linear Discriminant Analysis classifier.

    Parameters:
        x, y: feature matrix and target vector.
        onlycv: if True, print only the cross-validation accuracy.
        testacc: if True (and onlycv is False), also print the test
            (validation) accuracy.
        smote: if True, oversample the training set with SMOTE and use the
            SMOTE-aware cross-validation helper.
    """
    if not smote:
        print('LDA classifier')
    else:
        print('LDA classifier with SMOTE')

    x_train, x_test, y_train, y_test = master.split(x, y)
    if smote:
        x_train, y_train = master.SMOTE(x_train, y_train)

    # NOTE: shrinkage with a different solver (a form of regularization) was
    # tried, but accuracy was roughly the same, so the default is kept.
    clf = LinearDiscriminantAnalysis()
    clf.fit(x_train, y_train)
    y_pred_test = clf.predict(x_test)
    # accuracy_score(y_true, y_pred): ground truth goes first.
    score_test = metrics.accuracy_score(y_test, y_pred_test)

    if testacc and not onlycv:
        print('Test score accuracy is:', score_test)
        print()

    cv_accuracy = master.cross_val(clf, x, y)
    if smote:
        cv_accuracy = master.cv_SMOTE(clf, x, y)
    # BUG FIX: the old message said "for k=5", copy-pasted from kNN; LDA has
    # no such hyper-parameter.
    print("10-fold cross validation accuracy is:", cv_accuracy)
    print()

    if not onlycv:
        matrix = metrics.confusion_matrix(y_test,
                                          y_pred_test,
                                          normalize="true")
        print("Confusion matrix normalized by true categories (rows):")
        print(matrix)
        print()
        print('Classification report:')
        # BUG FIX: classification_report(y_true, y_pred) — the arguments were
        # swapped, which silently swaps precision and recall per class.
        print(sklearn.metrics.classification_report(y_test, y_pred_test))
        print()
def randomForest(x,
                 y,
                 feature_names,
                 search=False,
                 cv=True,
                 onlycv=False,
                 crit_cv='gini',
                 n_cv=100,
                 smote=False):
    """Train and evaluate a Random Forest classifier.

    Parameters:
        x, y: feature matrix and target vector.
        feature_names: attribute names, used to report feature importances.
        search: if True, grid-search criterion x n_estimators and print a
            table of validation accuracies.
        cv: if True, fit a forest with (crit_cv, n_cv) and print its
            10-fold cross-validation accuracy.
        onlycv: if True (with cv=True), skip confusion matrix, report and
            feature importances.
        crit_cv: split criterion for the cross-validation fit.
        n_cv: number of estimators for the cross-validation fit.
        smote: if True, oversample the training set with SMOTE and use the
            SMOTE-aware cross-validation helper.
    """
    if not smote:
        print('Random Forest Classifier')
    else:
        print('Random Forest Classifier with SMOTE')

    # NOTE(review): other classifiers in this file use master.split(); this
    # one calls train_test_split directly — kept as-is to preserve behavior.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
    if smote:
        x_train, y_train = master.SMOTE(x_train, y_train)

    # Grid search over criterion and n_estimators.
    estimator_grid = [10, 50, 100, 200, 500]
    print_e = ['Test score with entropy criterion']
    print_g = ['Test score with gini criterion']
    if search:
        for crit in ['entropy', 'gini']:
            for n in estimator_grid:
                classifier = RandomForestClassifier(n_estimators=n,
                                                    criterion=crit,
                                                    random_state=0,
                                                    min_samples_leaf=40)
                classifier.fit(x_train, y_train)
                y_pred_test = classifier.predict(x_test)
                # accuracy_score(y_true, y_pred): ground truth goes first.
                score_test = metrics.accuracy_score(y_test, y_pred_test)
                if crit == 'entropy':
                    print_e.append(score_test)
                else:
                    print_g.append(score_test)
        header = ["n={}".format(i) for i in estimator_grid]
        print(tabulate([print_e, print_g], headers=header))
        print()

    if cv:
        classifier = RandomForestClassifier(n_estimators=n_cv,
                                            criterion=crit_cv,
                                            random_state=0)
        classifier.fit(x_train, y_train)
        y_pred_test = classifier.predict(x_test)
        cv_accuracy = master.cross_val(classifier, x, y)
        if smote:
            cv_accuracy = master.cv_SMOTE(classifier, x, y)
        print(
            "10-fold cross validation accuracy for {} estimators and {} criterion is:"
            .format(n_cv, crit_cv), cv_accuracy)
        print()

        if not onlycv:
            # Confusion matrix normalized by true class (rows).
            cm = confusion_matrix(y_test, y_pred_test, normalize='true')
            print("Confusion matrix for {} estimators and {} criterion is:".
                  format(n_cv, crit_cv))
            print(cm)
            print()
            print("Report del test set per {} estimators and {} criterion is:".
                  format(n_cv, crit_cv))
            # BUG FIX: classification_report(y_true, y_pred) — the arguments
            # were swapped, which swaps precision and recall per class.
            print(sklearn.metrics.classification_report(y_test, y_pred_test))
            print()
            print('The features sorting by descending importance are:')
            ranked = sorted(zip(feature_names,
                                classifier.feature_importances_),
                            key=lambda pair: pair[1],
                            reverse=True)
            for pair in ranked:
                if pair[1] > 0:
                    print(pair)
            print()
def decisionTree(x, y, feature_names, onlycv=False, smote=False):
    """Train and evaluate a Decision Tree classifier.

    The attribute names are required for a readable feature-importance
    printout and tree plot. Note that DecisionTreeClassifier does not accept
    categorical attributes.

    Parameters:
        x, y: feature matrix and target vector.
        feature_names: attribute names for importances and the tree plot.
        onlycv: if True, print only the cross-validation accuracy.
        smote: if True, oversample the training set with SMOTE and use the
            SMOTE-aware cross-validation helper.
    """
    if not smote:
        print('Decision Tree Classifier')
    else:
        print('Decision Tree Classifier with SMOTE')

    x_train, x_test, y_train, y_test = master.split(x, y)

    if smote:
        x_train, y_train = master.SMOTE(x_train, y_train)

    # Entropy with min_samples_leaf=15 gave cv accuracy ~0.91055;
    # gini with defaults gave ~0.89514, so entropy is kept.
    clf = tree.DecisionTreeClassifier(criterion='entropy',
                                      random_state=0,
                                      min_samples_leaf=15)
    clf = clf.fit(x_train, y_train)

    y_pred_test = clf.predict(x_test)

    # Confusion matrix normalized by true class (rows).
    cm = confusion_matrix(y_test, y_pred_test, normalize="true")

    if not onlycv:
        print('The features sorting by descending importance are:')
        ranked = sorted(zip(feature_names, clf.feature_importances_),
                        key=lambda pair: pair[1],
                        reverse=True)
        for pair in ranked:
            if pair[1] > 0:
                print(pair)
        print()

        fig = plt.figure(figsize=(25, 20))
        tree.plot_tree(clf,
                       feature_names=feature_names,
                       class_names=['fail', 'pass'],
                       filled=True)
        # NOTE(review): "decistion" is a typo, but the path is kept byte-for-
        # byte in case other scripts reference this exact filename.
        fig.savefig("./decision_tree/decistion_tree.png")
        plt.close('all')

        print("Confusion matrix normalized by true categories (rows):")
        print(cm)
        print()
        print("Report del test set")
        # BUG FIX: classification_report(y_true, y_pred) — the arguments were
        # swapped, which swaps precision and recall per class.
        print(sklearn.metrics.classification_report(y_test, y_pred_test))

    cv_accuracy = master.cross_val(clf, x, y)
    if smote:
        cv_accuracy = master.cv_SMOTE(clf, x, y)
    print('Cross validation accuracy:', cv_accuracy)
def SVM(x,
        y,
        search=False,
        cv=True,
        C_cv=0.1,
        mode_cv='linear',
        onlycv=False,
        smote=False):
    """Train and evaluate a Support Vector Machine classifier.

    Parameters:
        x, y: feature matrix and target vector.
        search: if True, grid-search C for both RBF and linear kernels,
            print score tables and plot the linear-kernel error curves.
        cv: if True, fit an SVC with (C_cv, mode_cv) and print its
            10-fold cross-validation accuracy.
        C_cv: penalty parameter for the cross-validation fit.
        mode_cv: kernel for the cross-validation fit.
        onlycv: if True (with cv=True), skip confusion matrix and report.
        smote: if True, oversample the training set with SMOTE and use the
            SMOTE-aware cross-validation helper.
    """
    if not smote:
        print('SVM classifier')
    else:
        print('SVM classifier with SMOTE')

    x_train, x_test, y_train, y_test = master.split(x, y)

    if smote:
        x_train, y_train = master.SMOTE(x_train, y_train)
    # Normalization is probably irrelevant here: it would only deform the
    # separating hyperplane.

    if search:
        c_grid = [0.001, 0.01, 0.1, 1, 10, 100]

        # --- RBF kernel (C values scaled by 100) ---
        score_train = []
        score_test = []
        print1 = ['Train score']
        print2 = ['Val score']
        for c in np.multiply(c_grid, 100):
            clf = sklearn.svm.SVC(C=c, kernel='rbf').fit(x_train, y_train)
            # accuracy_score(y_true, y_pred): ground truth goes first.
            train_acc = metrics.accuracy_score(y_train, clf.predict(x_train))
            val_acc = metrics.accuracy_score(y_test, clf.predict(x_test))
            score_train.append(train_acc)
            score_test.append(val_acc)
            print1.append(train_acc)
            print2.append(val_acc)

        header = [' '] + ["C={}".format(i) for i in np.multiply(c_grid, 100)]
        print('RBF kernel')
        print(tabulate([print1, print2], headers=header))
        print()

        # --- Linear kernel ---
        score_train = []
        score_test = []
        print1 = ['Train score']
        print2 = ['Val score']
        for c in c_grid:
            clf = sklearn.svm.SVC(C=c, kernel='linear').fit(x_train, y_train)
            train_acc = metrics.accuracy_score(y_train, clf.predict(x_train))
            val_acc = metrics.accuracy_score(y_test, clf.predict(x_test))
            score_train.append(train_acc)
            score_test.append(val_acc)
            print1.append(train_acc)
            print2.append(val_acc)

        header = [' '] + ["C={}".format(i) for i in c_grid]
        print('Linear kernel')
        print(tabulate([print1, print2], headers=header))
        print()
        # The linear kernel probably works better because many attributes are
        # 0-1 valued and linearly separable; the best linear C seems to be
        # 0.01.

        # Plot train and validation score error (linear-kernel results).
        error_train = np.ones(len(c_grid)) - score_train
        error_test = np.ones(len(c_grid)) - score_test
        plt.plot(c_grid, error_train, label="train score error")
        plt.plot(c_grid, error_test, label="val score error")
        plt.xscale('log')
        plt.xlabel('C')
        plt.ylabel('Score error')
        plt.title(
            'SVM with linear kernel train and val score accuracy for different C'
        )
        plt.legend()
        plt.show()

    # Confusion matrix and cross-validation accuracy.
    if cv:
        clf = sklearn.svm.SVC(C=C_cv, kernel=mode_cv).fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        cv_accuracy = master.cross_val(clf, x, y)
        if smote:
            cv_accuracy = master.cv_SMOTE(clf, x, y)
        print(
            "10-fold cross validation accuracy for C={} and {} kernel is:".
            format(C_cv, mode_cv), cv_accuracy)
        print()

        if not onlycv:
            matrix = metrics.confusion_matrix(y_test, y_pred, normalize="true")
            print(
                "Confusion matrix for C={} and {} kernel normalized by true categories (rows):"
                .format(C_cv, mode_cv))
            print(matrix)
            print()
            print(
                "Report del test set per fattore di penalizzazione C={} and {} kernel"
                .format(C_cv, mode_cv))
            # BUG FIX: classification_report(y_true, y_pred) — the arguments
            # were swapped, which swaps precision and recall per class.
            print(sklearn.metrics.classification_report(y_test, y_pred))
def kNN(x,
        y,
        onlynum=False,
        search=False,
        cv=True,
        k_cv=9,
        onlycv=False,
        smote=False,
        select='all',
        return_clf=False):
    """Train and evaluate a k-Nearest-Neighbors classifier.

    KNN doesn't work well on datasets with many features, in particular with
    sparse matrices (our case, since we have a lot of categorical data):
    https://medium.com/cracking-the-data-science-interview/k-nearest-neighbors-who-are-close-to-you-19df59b97e7d

    Parameters:
        x, y: feature matrix and target vector.
        onlynum: if True, keep only numerical attributes
            (see master.select_numerical).
        search: if True, search the optimal k over odd values 1..19 and plot
            the train/validation error curves.
        cv: if True, fit with n_neighbors=k_cv and print cross-validation
            accuracy (plus confusion matrix and report unless onlycv).
        k_cv: number of neighbors for the cross-validation fit.
        onlycv: if True (requires cv=True), print only the cv accuracy.
        smote: if True, oversample the training set with SMOTE and use the
            SMOTE-aware cross-validation helper.
        select: forwarded to master.select_numerical when onlynum is True.
        return_clf: if True, return the fitted classifier.

    Returns:
        The fitted KNeighborsClassifier if return_clf is True, else None.
    """
    if not smote:
        print('kNN classifier')
    else:
        print('kNN classifier with SMOTE')

    # Train/test split (features are scaled, which matters for distance-based
    # methods like KNN).
    if onlynum:
        x = master.select_numerical(x, select=select)
    x_train, x_test, y_train, y_test = master.split(x, y, scaled=True)
    if smote:
        x_train, y_train = master.SMOTE(x_train, y_train)

    if search:
        k_grid = range(1, 21, 2)
        score_train = []
        score_test = []
        print1 = ['Train score']
        print2 = ['Val score']

        # Fit KNN for each odd k from 1 to 19 and track train/val accuracy
        # to study overfitting.
        for k in k_grid:
            neigh = KNeighborsClassifier(n_neighbors=k)
            neigh.fit(x_train, y_train)
            # accuracy_score(y_true, y_pred): ground truth goes first.
            train_acc = metrics.accuracy_score(y_train,
                                               neigh.predict(x_train))
            val_acc = metrics.accuracy_score(y_test, neigh.predict(x_test))
            score_train.append(train_acc)
            score_test.append(val_acc)
            print1.append(train_acc)
            print2.append(val_acc)

        header = [' '] + ["k={}".format(k) for k in k_grid]
        print(tabulate([print1, print2], headers=header))
        print()

        # Plot of train and validation error for the different values of K.
        error_train = np.ones(len(k_grid)) - score_train
        error_test = np.ones(len(k_grid)) - score_test
        plt.plot(k_grid, error_train, label="train error")
        plt.plot(k_grid, error_test, label="val error")
        plt.xlabel('K')
        plt.ylabel('Score error')
        plt.title('KNN train and val score error for different values of K')
        plt.legend()
        plt.show()

    # CAUTION: the best parameter should not be chosen on the test set; with
    # a proper train/val/test split our test set acts as validation, and the
    # cross-validation score is used as the accuracy estimate.

    # Refit KNN to display the confusion matrix.
    if cv:
        neigh = KNeighborsClassifier(n_neighbors=k_cv)
        neigh.fit(x_train, y_train)
        y_pred = neigh.predict(x_test)
        cv_accuracy = master.cross_val(neigh, x, y)
        if smote:
            cv_accuracy = master.cv_SMOTE(neigh, x, y)
        matrix = metrics.confusion_matrix(y_test, y_pred, normalize="true")
        # BUG FIX: the old messages hard-coded "k=9" even when k_cv differed.
        print("10-fold cross validation accuracy for k={} is:".format(k_cv),
              cv_accuracy)
        print()
        if not onlycv:
            print(
                "Confusion matrix for k={} normalized by true categories (rows):"
                .format(k_cv))
            print(matrix)
            print()
            print('Classification report:')
            # BUG FIX: classification_report(y_true, y_pred) — the arguments
            # were swapped, which swaps precision and recall per class.
            print(sklearn.metrics.classification_report(y_test, y_pred))
            print()
    if return_clf:
        if not cv:
            # BUG FIX: previously this raised NameError when cv=False (or
            # returned the last search model, k=19). Fit the requested model.
            neigh = KNeighborsClassifier(n_neighbors=k_cv)
            neigh.fit(x_train, y_train)
        return neigh
def logistic_regression(x,
                        y,
                        C_cv=0.1,
                        search=False,
                        cv=True,
                        onlycv=False,
                        smote=False,
                        return_clf=False):
    """Train and evaluate a Logistic Regression classifier.

    No standardization step is needed for logistic regression.

    Parameters:
        x, y: feature matrix and target vector.
        C_cv: penalty parameter for the cross-validation fit.
        search: if True, search the penalization parameter C over a log grid
            and plot the train/validation error curves.
        cv: if True, fit with C=C_cv and print cross-validation accuracy
            (plus confusion matrix and report unless onlycv).
        onlycv: if True (with cv=True), print only the cv accuracy.
        smote: if True, oversample the training set with SMOTE and use the
            SMOTE-aware cross-validation helper.
        return_clf: if True, return the fitted classifier.

    Returns:
        The fitted LogisticRegression if return_clf is True, else None.
    """
    if not smote:
        print('Logistic regression classifier')
    else:
        print('Logistic regression classifier with SMOTE')

    x_train, x_test, y_train, y_test = master.split(x, y)
    if smote:
        x_train, y_train = master.SMOTE(x_train, y_train)

    # Search for a better penalization parameter C.
    if search:
        c_grid = [0.001, 0.01, 0.1, 1, 10, 100]
        score_train = []
        score_test = []
        print1 = ['Train score']
        print2 = ['Val score']
        for c in c_grid:
            clf = sklearn.linear_model.LogisticRegression(C=c,
                                                          max_iter=1000).fit(
                                                              x_train, y_train)
            # accuracy_score(y_true, y_pred): ground truth goes first.
            train_acc = metrics.accuracy_score(y_train, clf.predict(x_train))
            val_acc = metrics.accuracy_score(y_test, clf.predict(x_test))
            score_train.append(train_acc)
            score_test.append(val_acc)
            print1.append(train_acc)
            print2.append(val_acc)

        header = [' '] + ["C={}".format(i) for i in c_grid]
        print(tabulate([print1, print2], headers=header))
        print()

        # Plot train and validation score error.
        error_train = np.ones(len(c_grid)) - score_train
        error_test = np.ones(len(c_grid)) - score_test
        plt.plot(c_grid, error_train, label="train score error")
        plt.plot(c_grid, error_test, label="val score error")
        plt.xscale('log')
        plt.xlabel('C')
        plt.ylabel('Score error')
        plt.title(
            'Logistic Regression train and val score error for different values of C'
        )
        plt.legend()
        plt.show()

    # Cross validation with the chosen C.
    if cv:
        clf = sklearn.linear_model.LogisticRegression(C=C_cv, max_iter=1000)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        cv_accuracy = master.cross_val(clf, x, y)
        if smote:
            cv_accuracy = master.cv_SMOTE(clf, x, y)
        print("10-fold cross validation accuracy for C={} is:".format(C_cv),
              cv_accuracy)
        print()

        if not onlycv:
            matrix = metrics.confusion_matrix(y_test, y_pred, normalize="true")
            print(
                "Confusion matrix for C={} normalized by true categories (rows):"
                .format(C_cv))
            print(matrix)
            print()
            print("Report del test set per fattore di penalizzazione C=", C_cv)
            # BUG FIX: classification_report(y_true, y_pred) — the arguments
            # were swapped, which swaps precision and recall per class.
            print(sklearn.metrics.classification_report(y_test, y_pred))
            print()

    if return_clf:
        if not cv:
            # BUG FIX: previously this raised NameError when cv=False and
            # search=False (or returned the last search model, C=100). Fit
            # the requested model instead.
            clf = sklearn.linear_model.LogisticRegression(
                C=C_cv, max_iter=1000).fit(x_train, y_train)
        return clf