Пример #1
0
def dataset3_svc_sgd(dataset):
    """Select features with a LinearSVC, then train, report and save an 'sgd' model.

    Steps, in order:
      1. Split ``dataset`` into the feature columns (everything except
         ``'labels'``) and the ``'labels'`` column.
      2. Pass an L1-penalised ``LinearSVC`` (C=0.1, dual=False) to
         ``FeatureSelection.select_from_model_feature_elimination`` and keep
         the second returned value as the model input.
      3. Call ``MachineLearning.train_best_model('sgd')``, print the test-set
         score and plot the ROC curve.
      4. Pickle the trained model under ``crmapp/ml_models/``.

    Args:
        dataset: table indexed with ``.loc`` / ``.columns`` that contains a
            ``'labels'`` column (presumably a pandas DataFrame -- confirm
            against the caller).

    Returns:
        None. Results are printed, plotted and written to disk.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    model_lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)

    # The selector returns six values; only the second one (named as the
    # transformed feature matrix in the original code) is needed here. The
    # full unpacking is kept so a change in the returned arity is still caught.
    (_x_fit, x_transformed, _columns, _importances, _importances_df,
     _dataset_features) = fselect.select_from_model_feature_elimination(
         model=model_lsvc)

    ml = MachineLearning(x_transformed, labels, classes=['pos', 'neg'])

    filename = 'crmapp/ml_models/dataset3_all_'

    print('best model sgd')
    best_sgd_model = ml.train_best_model('sgd')

    print('score_test_set_sgd')
    print(ml.score_testset(best_sgd_model))

    print('roc curve')
    ml.plot_roc_curve(best_sgd_model)

    # Use a context manager so the file is flushed and closed deterministically,
    # even if pickling raises.
    with open(filename + 'sgd_model_15092020.sav', 'wb') as model_file:
        pickle.dump(best_sgd_model, model_file)
Пример #2
0
def dataset3_univariate_svm(dataset):
    """Run univariate feature selection, then train and report an 'svm' model.

    Steps, in order:
      1. Split ``dataset`` into the feature columns (everything except
         ``'labels'``) and the ``'labels'`` column.
      2. Call ``FeatureSelection.univariate`` with ``mutual_info_classif``,
         ``mode='k_best'`` and ``param=500`` and keep the second returned
         value as the model input.
      3. Call ``MachineLearning.train_best_model('svm')``, print the test-set
         score and plot the ROC curve.

    Unlike the sibling pipelines, this one does not write the model to disk.

    Args:
        dataset: table indexed with ``.loc`` / ``.columns`` that contains a
            ``'labels'`` column (presumably a pandas DataFrame -- confirm
            against the caller).

    Returns:
        None. Results are printed and plotted.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    # Select K best, scored with mutual_info_classif.
    # The selector returns five values; only the second one is used. The full
    # unpacking is kept so a change in the returned arity is still caught.
    _x_fit, x_transformed, _columns, _scores, _dataset_features = \
        fselect.univariate(score_func=mutual_info_classif, mode='k_best',
                           param=500)

    ml = MachineLearning(x_transformed, labels, classes=['pos', 'neg'])

    # Train and evaluate the model.
    print('best model svm')
    best_svm_model = ml.train_best_model('svm')

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))

    print('roc curve')
    ml.plot_roc_curve(best_svm_model)
Пример #3
0
def dataset1_tree_svm(dataset):
    """Select features with an ExtraTrees model, then train, report and save an 'svm' model.

    Steps, in order:
      1. Split ``dataset`` into the feature columns (everything except
         ``'labels'``) and the ``'labels'`` column.
      2. Pass an ``ExtraTreesClassifier`` with 50 estimators to
         ``FeatureSelection.select_from_model_feature_elimination`` and keep
         the second returned value as the model input.
      3. Call ``MachineLearning.train_best_model('svm')``, print the test-set
         score and plot the ROC curve.
      4. Pickle the trained model under ``crmapp/ml_models/``.

    Args:
        dataset: table indexed with ``.loc`` / ``.columns`` that contains a
            ``'labels'`` column (presumably a pandas DataFrame -- confirm
            against the caller).

    Returns:
        None. Results are printed, plotted and written to disk.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    # Tree-based selector model with 50 estimators.
    model_tree = ExtraTreesClassifier(n_estimators=50)

    # The selector returns six values; only the second one is needed here. The
    # full unpacking is kept so a change in the returned arity is still caught.
    (_x_fit, x_transformed, _columns, _importances, _importances_df,
     _dataset_features) = fselect.select_from_model_feature_elimination(
         model=model_tree)

    ml = MachineLearning(x_transformed, labels, classes=['pos', 'neg'])

    filename = 'crmapp/ml_models/dataset1_all_'

    # Train and evaluate the model.
    print('best model svm')
    best_svm_model = ml.train_best_model('svm')

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))

    print('roc curve')
    ml.plot_roc_curve(best_svm_model)

    # Use a context manager so the file is flushed and closed deterministically,
    # even if pickling raises.
    with open(filename + 'svm_model_15092020.sav', 'wb') as model_file:
        pickle.dump(best_svm_model, model_file)
Пример #4
0
def test_machine_learning(dataset):
    """Train, save and evaluate a series of models on the full feature set.

    Steps, in order:
      1. Print ``dataset.shape`` and split ``dataset`` into the feature
         columns (everything except ``'labels'``) and the ``'labels'`` column.
      2. For each of 'svm', 'rf', 'sgd', 'gboosting' and 'lr': call
         ``MachineLearning.train_best_model`` and pickle the result under
         ``crmapp/ml_models/``.
      3. Call ``features_importances`` for the 'svm' and 'rf' models.
      4. Train and pickle the 'nn', 'gnb' and 'knn' models.
      5. Plot a validation curve for the 'svm' model over ``clf__C``.
      6. Print the test-set score of the 'svm', 'rf', 'sgd', 'gboosting' and
         'lr' models.

    Args:
        dataset: table indexed with ``.loc`` / ``.columns`` that exposes
            ``.shape`` and contains a ``'labels'`` column (presumably a
            pandas DataFrame -- confirm against the caller).

    Returns:
        None. Results are printed, plotted and written to disk.
    """
    print(dataset.shape)

    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    ml = MachineLearning(x_original, labels, classes=['pos', 'neg'])

    filename = 'crmapp/ml_models/dataset3_all_'

    def train_and_save(key, description=None):
        """Train the best ``key`` model, pickle it and return it."""
        print('best model ' + (description or key))
        model = ml.train_best_model(key)
        # Context manager: the file is closed even if pickling raises.
        with open(filename + key + '_model_24082020.sav', 'wb') as model_file:
            pickle.dump(model, model_file)
        return model

    best_svm_model = train_and_save('svm')
    best_rf_model = train_and_save('rf')
    best_sgd_model = train_and_save('sgd')
    best_gboosting_model = train_and_save('gboosting', 'gradient boosting')
    best_lr_model = train_and_save('lr')

    # Feature importances are only requested for svm and rf; the calls for
    # sgd, gboosting and lr were disabled in the original code.
    ml.features_importances(best_svm_model, 'svm')
    ml.features_importances(best_rf_model, 'rf')

    # These models are trained and saved but not scored below.
    train_and_save('nn')
    train_and_save('gnb')
    train_and_save('knn')

    # Validation curve for the svm model over the clf__C parameter.
    print('plot validation_svm')
    ml.plot_validation_curve(
        best_svm_model,
        param_name='clf__C',
        param_range=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100])

    # The original reported the rf score twice and never scored the sgd
    # model; the duplicate is replaced by the sgd report.
    scored_models = (
        ('svm', best_svm_model),
        ('rf', best_rf_model),
        ('sgd', best_sgd_model),
        ('gboosting', best_gboosting_model),
        ('lr', best_lr_model),
    )
    for key, model in scored_models:
        print('score_test_set_' + key)
        print(ml.score_testset(model))