def dataset3_svc_sgd(dataset):
    """Select features with an L1 LinearSVC, then train, evaluate and save an SGD model.

    The feature matrix is every column of ``dataset`` except ``'labels'``.
    Features are reduced through
    ``FeatureSelection.select_from_model_feature_elimination`` using
    ``LinearSVC(C=0.1, penalty="l1", dual=False)``; the transformed matrix is
    handed to ``MachineLearning`` with classes ``['pos', 'neg']``.

    Args:
        dataset: table exposing ``.loc``, ``.columns`` and a ``'labels'``
            column (presumably a pandas DataFrame -- confirm with callers).

    Returns:
        None. Progress and the test-set score are printed, the ROC curve is
        plotted, and the trained model is pickled to
        ``crmapp/ml_models/dataset3_all_sgd_model_15092020.sav``.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)
    model_lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)

    # The selector returns six values; only the transformed matrix is used
    # here, but the full unpacking is kept so an arity change still fails loudly.
    (_x_fit, x_selected, _columns, _importances,
     _importances_df, _dataset_features) = \
        fselect.select_from_model_feature_elimination(model=model_lsvc)

    ml = MachineLearning(x_selected, labels, classes=['pos', 'neg'])
    filename = 'crmapp/ml_models/dataset3_all_'

    print('best model sgd')
    best_sgd_model = ml.train_best_model('sgd')

    print('score_test_set_sgd')
    print(ml.score_testset(best_sgd_model))

    print('roc curve')
    ml.plot_roc_curve(best_sgd_model)

    # Context manager guarantees the file is flushed and closed even if
    # pickling raises.
    with open(filename + 'sgd_model_15092020.sav', 'wb') as model_file:
        pickle.dump(best_sgd_model, model_file)
def dataset3_univariate_svm(dataset):
    """Select features univariately, then train and evaluate an SVM model.

    The feature matrix is every column of ``dataset`` except ``'labels'``.
    Features are reduced through ``FeatureSelection.univariate`` with
    ``score_func=mutual_info_classif``, ``mode='k_best'`` and ``param=500``
    (presumably keeping the 500 best-scoring features -- confirm against
    ``FeatureSelection``).

    Args:
        dataset: table exposing ``.loc``, ``.columns`` and a ``'labels'``
            column (presumably a pandas DataFrame -- confirm with callers).

    Returns:
        None. Progress and the test-set score are printed and the ROC curve
        is plotted. The trained model is not written to disk.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    # Select KBest with mutual_info_classif as the scoring function.
    # The selector returns five values; only the transformed matrix is used.
    (_x_fit, x_selected, _columns, _scores, _dataset_features) = \
        fselect.univariate(score_func=mutual_info_classif,
                           mode='k_best', param=500)

    ml = MachineLearning(x_selected, labels, classes=['pos', 'neg'])

    print('best model svm')
    best_svm_model = ml.train_best_model('svm')

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))

    print('roc curve')
    ml.plot_roc_curve(best_svm_model)
def dataset1_tree_svm(dataset):
    """Select features with an extra-trees model, then train, evaluate and save an SVM.

    The feature matrix is every column of ``dataset`` except ``'labels'``.
    Features are reduced through
    ``FeatureSelection.select_from_model_feature_elimination`` using
    ``ExtraTreesClassifier(n_estimators=50)``; the transformed matrix is
    handed to ``MachineLearning`` with classes ``['pos', 'neg']``.

    Args:
        dataset: table exposing ``.loc``, ``.columns`` and a ``'labels'``
            column (presumably a pandas DataFrame -- confirm with callers).

    Returns:
        None. Progress and the test-set score are printed, the ROC curve is
        plotted, and the trained model is pickled to
        ``crmapp/ml_models/dataset1_all_svm_model_15092020.sav``.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    fselect = FeatureSelection(dataset, x_original, labels)

    # Select-from-model with a tree classifier using 50 estimators.
    model_tree = ExtraTreesClassifier(n_estimators=50)

    # The selector returns six values; only the transformed matrix is used
    # here, but the full unpacking is kept so an arity change still fails loudly.
    (_x_fit, x_selected, _columns, _importances,
     _importances_df, _dataset_features) = \
        fselect.select_from_model_feature_elimination(model=model_tree)

    ml = MachineLearning(x_selected, labels, classes=['pos', 'neg'])
    filename = 'crmapp/ml_models/dataset1_all_'

    print('best model svm')
    best_svm_model = ml.train_best_model('svm')

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))

    print('roc curve')
    ml.plot_roc_curve(best_svm_model)

    # Context manager guarantees the file is flushed and closed even if
    # pickling raises.
    with open(filename + 'svm_model_15092020.sav', 'wb') as model_file:
        pickle.dump(best_svm_model, model_file)
def test_machine_learning(dataset):
    """Train, pickle and evaluate a suite of models on ``dataset`` without feature selection.

    The feature matrix is every column of ``dataset`` except ``'labels'``.
    Models are trained through ``MachineLearning.train_best_model`` in this
    order: svm, rf, sgd, gboosting, lr, then (after feature importances for
    svm and rf) nn, gnb, knn. Each trained model is pickled to
    ``crmapp/ml_models/dataset3_all_<key>_model_24082020.sav``.

    Afterwards a validation curve over ``clf__C`` is plotted for the SVM and
    the test-set score of svm, rf, sgd, gboosting and lr is printed.

    Args:
        dataset: table exposing ``.shape``, ``.loc``, ``.columns`` and a
            ``'labels'`` column (presumably a pandas DataFrame -- confirm
            with callers).

    Returns:
        None. All results are printed, plotted or written to disk.
    """
    print(dataset.shape)
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']

    ml = MachineLearning(x_original, labels, classes=['pos', 'neg'])
    filename = 'crmapp/ml_models/dataset3_all_'
    trained = {}

    def train_and_save(key, label):
        """Train the best ``key`` model, pickle it and remember it in ``trained``."""
        print('best model ' + label)
        model = ml.train_best_model(key)
        # Context manager closes the file even if pickling raises.
        with open(filename + key + '_model_24082020.sav', 'wb') as model_file:
            pickle.dump(model, model_file)
        trained[key] = model

    # First batch; the printed label for gboosting differs from its key.
    for key, label in (
        ('svm', 'svm'),
        ('rf', 'rf'),
        ('sgd', 'sgd'),
        ('gboosting', 'gradient boosting'),
        ('lr', 'lr'),
    ):
        train_and_save(key, label)

    # Feature importances are only computed for svm and rf.
    ml.features_importances(trained['svm'], 'svm')
    ml.features_importances(trained['rf'], 'rf')

    for key in ('nn', 'gnb', 'knn'):
        train_and_save(key, key)

    print('plot validation_svm')
    ml.plot_validation_curve(
        trained['svm'],
        param_name='clf__C',
        param_range=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    )

    # Report the test-set score of every model from the first batch,
    # including sgd (rf must not be reported twice in its place).
    for key in ('svm', 'rf', 'sgd', 'gboosting', 'lr'):
        print('score_test_set_' + key)
        print(ml.score_testset(trained[key]))