Example #1
def dataset3_svc_sgd(dataset):

    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    model_lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)

    # feature selection from the L1-penalised LinearSVC defined above
    X_fit_model, X_transf_model, column_selected, feature_importances, feature_importances_DF, dataset_features = \
        fselect.select_from_model_feature_elimination(model=model_lsvc)

    ml = MachineLearning(X_transf_model, labels, classes=['pos', 'neg'])

    filename = 'crmapp/ml_models/dataset3_all_'

    print('best model sgd')
    best_sgd_model = ml.train_best_model('sgd')

    print('score_test_set_sgd')
    print(ml.score_testset(best_sgd_model))

    print('roc curve')
    ml.plot_roc_curve(best_sgd_model)

    pickle.dump(best_sgd_model, open(filename + 'sgd_model_15092020.sav',
                                     'wb'))
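
The snippets in these examples share a preamble of imports that is not shown. Below is a minimal sketch of what that preamble might look like: the pickle and scikit-learn imports are standard, while the import paths for FeatureSelection, MachineLearning, read_dataset and get_dataset are assumptions about the project's own modules and may differ in the actual codebase.

import pickle

from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import mutual_info_classif

# assumed project-local helpers; the real module paths may differ
from crmapp.feature_selection import FeatureSelection
from crmapp.machine_learning import MachineLearning
from crmapp.utils import read_dataset, get_dataset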
Example #2
def dataset3_univariate_svm(dataset):

    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    model_lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)

    # SelectKBest: keep the 500 highest-scoring features according to mutual_info_classif
    X_fit_univariate, X_transf_univariate, column_selected, scores, dataset_features = \
        fselect.univariate(score_func=mutual_info_classif, mode='k_best', param=500)

    ml = MachineLearning(X_transf_univariate, labels, classes=['pos', 'neg'])

    filename = 'crmapp/ml_models/dataset3_all_'

    # train and evaluate the best model
    print('best model svm')
    best_svm_model = ml.train_best_model('svm')

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))

    print('roc curve')
    ml.plot_roc_curve(best_svm_model)
Example #3
def dataset1_tree_svm(dataset):

    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    model_tree = ExtraTreesClassifier(n_estimators=50)

    # select from model: ExtraTreesClassifier with 50 estimators
    X_fit_model, X_transf_model, column_selected, feature_importances, feature_importances_DF, dataset_features = \
        fselect.select_from_model_feature_elimination(model=model_tree)

    ml = MachineLearning(X_transf_model, labels, classes=['pos', 'neg'])

    filename = 'crmapp/ml_models/dataset1_all_'

    # train and evaluate the best model
    print('best model svm')
    best_svm_model = ml.train_best_model('svm')

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))

    print('roc curve')
    ml.plot_roc_curve(best_svm_model)

    pickle.dump(best_svm_model, open(filename + 'svm_model_15092020.sav',
                                     'wb'))
Example #4
def predictions(model,
                dataset_file='dataset2_usar.csv',
                dataset='descriptors.sav',
                gap=1):

    dataset_prev = read_dataset(dataset_file)

    dataset_prev_X = dataset_prev.loc[:, 'seq']

    dataset_prev_y = dataset_prev.loc[:, 'labels']

    model = pickle.load(open(model, 'rb'))

    # dataset = test_preprocess(get_dataset(dataset))

    dataset = get_dataset(dataset)

    x_original = dataset.loc[:, dataset.columns != 'labels']

    labels = dataset.loc[:, 'labels']

    ml = MachineLearning(x_original, labels, classes=['pos', 'neg'])
    """
    results = {'total':len(dataset_prev_X),
               'true_pos':0,
               'true_neg':0,
               'false_pos':0,
               'false_neg':0}
    """

    prevs = {}

    for i in range(len(dataset_prev_X)):

        seq = dataset_prev_X[i]

        print(i, seq)

        current_label = dataset_prev_y[i]

        result = ml.predict_window(model,
                                   seq=seq,
                                   x=None,
                                   window_size=len(seq),
                                   gap=gap,
                                   features=[],
                                   names=None,
                                   y=None,
                                   filename=None)

        # df = ml.predict(model, x=X_test, seqs=y_test)

        prevs[i] = (result, current_label)
        """
        if result['probability'][0] > 0.5 and current_label == 'pos':
            results['true_pos'] += 1
        elif result['probability'][0] <= 0.5 and current_label == 'neg':
            results['true_neg'] += 1
        elif result['probability'][0] <= 0.5 and current_label == 'pos':
            results['false_neg'] += 1
        elif result['probability'][0] > 0.5 and current_label == 'neg':
            results['false_pos'] += 1
        """

    return prevs
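
A possible way to call the predictions helper above is sketched below. The model path reuses the file written in Example #3 and the dataset arguments are the function's own defaults; the returned dict maps each sequence index to a (prediction result, true label) pair.

# hypothetical call; the model file is the one saved in Example #3
prevs = predictions(model='crmapp/ml_models/dataset1_all_svm_model_15092020.sav',
                    dataset_file='dataset2_usar.csv',
                    dataset='descriptors.sav',
                    gap=1)

for i, (result, true_label) in prevs.items():
    print(i, true_label, result)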
Example #5
def test_machine_learning(dataset):
    # split the dataset into features and labels
    # dataset = pd.read_csv(r'datasets/dataset1_test_clean_fselection.csv', delimiter=',')

    print(dataset.shape)

    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    # create the MachineLearning object
    ml = MachineLearning(x_original, labels, classes=['pos', 'neg'])

    filename = 'crmapp/ml_models/dataset3_all_'

    # train the models and save them to disk
    print('best model svm')
    best_svm_model = ml.train_best_model('svm')

    pickle.dump(best_svm_model, open(filename + 'svm_model_24082020.sav',
                                     'wb'))

    print('best model rf')
    best_rf_model = ml.train_best_model('rf')

    pickle.dump(best_rf_model, open(filename + 'rf_model_24082020.sav', 'wb'))

    print('best model sgd')
    best_sgd_model = ml.train_best_model('sgd')

    pickle.dump(best_sgd_model, open(filename + 'sgd_model_24082020.sav',
                                     'wb'))

    print('best model gradient boosting')
    best_gboosting_model = ml.train_best_model('gboosting')

    pickle.dump(best_gboosting_model,
                open(filename + 'gboosting_model_24082020.sav', 'wb'))

    print('best model lr')
    best_lr_model = ml.train_best_model('lr')

    pickle.dump(best_lr_model, open(filename + 'lr_model_24082020.sav', 'wb'))

    # feature importance of models
    ml.features_importances(best_svm_model, 'svm')
    ml.features_importances(best_rf_model, 'rf')
    # ml.features_importances(best_sgd_model,'sgd')
    # ml.features_importances(best_gboosting_model,'gboosting')
    # ml.features_importances(best_lr_model,'lr')

    print('best model nn')
    best_nn_model = ml.train_best_model('nn')

    pickle.dump(best_nn_model, open(filename + 'nn_model_24082020.sav', 'wb'))

    print('best model gnb')
    best_gnb_model = ml.train_best_model('gnb')

    pickle.dump(best_gnb_model, open(filename + 'gnb_model_24082020.sav',
                                     'wb'))

    print('best model knn')
    best_knn_model = ml.train_best_model('knn')

    pickle.dump(best_knn_model, open(filename + 'knn_model_24082020.sav',
                                     'wb'))

    # plot validation curve
    print('plot validation_svm')
    ml.plot_validation_curve(
        best_svm_model,
        param_name='clf__C',
        param_range=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100])

    # print('plot validation_gboosting')
    # ml.plot_validation_curve(best_gboosting_model, param_name='clf__n_estimators',
    #                          param_range=[ 1, 10,100,500])

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))

    print('score_test_set_rf')
    print(ml.score_testset(best_rf_model))

    print('score_test_set_gboosting')
    print(ml.score_testset(best_gboosting_model))

    print('score_test_set_lr')
    print(ml.score_testset(best_lr_model))
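
Every model trained in this last example is persisted with pickle, so it can be reloaded later in the same way Example #4 loads its model. A minimal sketch, assuming the files written above exist:

import pickle

# reload the SVM model saved by test_machine_learning
filename = 'crmapp/ml_models/dataset3_all_'
best_svm_model = pickle.load(open(filename + 'svm_model_24082020.sav', 'rb'))
print(best_svm_model)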