def dataset3_svc_sgd(dataset):
    """Feature-select dataset3 with an L1 LinearSVC, train/evaluate the best
    SGD model, and persist it to disk.

    :param dataset: DataFrame with feature columns plus a 'labels' column.
    Side effects: prints scores, shows a ROC plot, writes the pickled model
    under crmapp/ml_models/.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']
    fselect = FeatureSelection(dataset, x_original, labels)
    # L1-penalised linear SVC drives select-from-model feature elimination.
    model_lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)
    X_fit_model, X_transf_model, column_selected, feature_importances, \
        feature_importances_DF, dataset_features = \
        fselect.select_from_model_feature_elimination(model=model_lsvc)
    ml = MachineLearning(X_transf_model, labels, classes=['pos', 'neg'])
    filename = 'crmapp/ml_models/dataset3_all_'
    print('best model sgd')
    best_sgd_model = ml.train_best_model('sgd')
    print('score_test_set_sgd')
    print(ml.score_testset(best_sgd_model))
    print('roc curve')
    ml.plot_roc_curve(best_sgd_model)
    # FIX: the original passed a bare open(...) to pickle.dump, leaking the
    # file handle; a context manager guarantees it is closed.
    with open(filename + 'sgd_model_15092020.sav', 'wb') as fh:
        pickle.dump(best_sgd_model, fh)
def dataset3_univariate_svm(dataset):
    """Select the 500 best features of dataset3 via univariate KBest
    (mutual information) and train/evaluate the best SVM model.

    :param dataset: DataFrame with feature columns plus a 'labels' column.
    Side effects: prints scores and shows a ROC plot. Nothing is persisted.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']
    fselect = FeatureSelection(dataset, x_original, labels)
    # FIX: removed unused `model_lsvc` and `filename` locals — this path
    # uses univariate selection only and never pickles a model.
    # KBest with mutual_info_classif, keeping the 500 highest-scoring features.
    X_fit_univariate, X_transf_univariate, column_selected, scores, \
        dataset_features = \
        fselect.univariate(score_func=mutual_info_classif, mode='k_best',
                           param=500)
    ml = MachineLearning(X_transf_univariate, labels, classes=['pos', 'neg'])
    print('best model svm')
    best_svm_model = ml.train_best_model('svm')
    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))
    print('roc curve')
    ml.plot_roc_curve(best_svm_model)
def dataset1_tree_svm(dataset):
    """Feature-select dataset1 with an ExtraTrees model, train/evaluate the
    best SVM model, and persist it to disk.

    :param dataset: DataFrame with feature columns plus a 'labels' column.
    Side effects: prints scores, shows a ROC plot, writes the pickled model
    under crmapp/ml_models/.
    """
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']
    fselect = FeatureSelection(dataset, x_original, labels)
    # Select-from-model driven by an ExtraTrees classifier (50 estimators).
    model_tree = ExtraTreesClassifier(n_estimators=50)
    X_fit_model, X_transf_model, column_selected, feature_importances, \
        feature_importances_DF, dataset_features = \
        fselect.select_from_model_feature_elimination(model=model_tree)
    ml = MachineLearning(X_transf_model, labels, classes=['pos', 'neg'])
    filename = 'crmapp/ml_models/dataset1_all_'
    print('best model svm')
    best_svm_model = ml.train_best_model('svm')
    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))
    print('roc curve')
    ml.plot_roc_curve(best_svm_model)
    # FIX: the original passed a bare open(...) to pickle.dump, leaking the
    # file handle; a context manager guarantees it is closed.
    with open(filename + 'svm_model_15092020.sav', 'wb') as fh:
        pickle.dump(best_svm_model, fh)
def predictions(model, dataset_file='dataset2_usar.csv', dataset='descriptors.sav', gap=1):
    """Run the pickled `model` over every sequence in `dataset_file` using a
    whole-sequence sliding window.

    :param model: path to a pickled classifier (.sav file).
    :param dataset_file: CSV with 'seq' and 'labels' columns to predict on.
    :param dataset: descriptor dataset name passed to get_dataset().
    :param gap: window step forwarded to predict_window().
    :return: dict mapping row position -> (prediction result, true label).
    """
    dataset_prev = read_dataset(dataset_file)
    dataset_prev_X = dataset_prev.loc[:, 'seq']
    dataset_prev_y = dataset_prev.loc[:, 'labels']
    # FIX: close the model file (the original leaked the handle).
    # NOTE(security): pickle.load on an arbitrary path executes code on
    # load — only use with trusted model files.
    with open(model, 'rb') as fh:
        model = pickle.load(fh)
    dataset = get_dataset(dataset)
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset.loc[:, 'labels']
    ml = MachineLearning(x_original, labels, classes=['pos', 'neg'])
    prevs = {}
    # Iterate sequences and labels in lockstep instead of positional
    # range(len(...)) indexing (robust to non-default pandas indexes too).
    for i, (seq, current_label) in enumerate(zip(dataset_prev_X,
                                                 dataset_prev_y)):
        print(i, seq)
        result = ml.predict_window(model, seq=seq, x=None,
                                   window_size=len(seq), gap=gap,
                                   features=[], names=None, y=None,
                                   filename=None)
        prevs[i] = (result, current_label)
    return prevs
def test_machine_learning(dataset):
    """Train, persist, and evaluate a battery of model families on `dataset`.

    :param dataset: DataFrame with feature columns plus a 'labels' column.
    Side effects: prints shapes/scores, writes one pickled model per family
    under crmapp/ml_models/, and shows validation-curve / feature-importance
    plots.
    """
    print(dataset.shape)
    x_original = dataset.loc[:, dataset.columns != 'labels']
    labels = dataset['labels']
    ml = MachineLearning(x_original, labels, classes=['pos', 'neg'])
    filename = 'crmapp/ml_models/dataset3_all_'

    def _save(trained_model, tag):
        # Persist a trained model; the `with` block guarantees the handle is
        # closed (the original leaked one open() per dump).
        with open(filename + tag + '_model_24082020.sav', 'wb') as fh:
            pickle.dump(trained_model, fh)

    print('best model svm')
    best_svm_model = ml.train_best_model('svm')
    _save(best_svm_model, 'svm')
    print('best model rf')
    best_rf_model = ml.train_best_model('rf')
    _save(best_rf_model, 'rf')
    print('best model sgd')
    best_sgd_model = ml.train_best_model('sgd')
    _save(best_sgd_model, 'sgd')
    print('best model gradient boosting')
    best_gboosting_model = ml.train_best_model('gboosting')
    _save(best_gboosting_model, 'gboosting')
    print('best model lr')
    best_lr_model = ml.train_best_model('lr')
    _save(best_lr_model, 'lr')

    # Feature importance of the models that support it.
    ml.features_importances(best_svm_model, 'svm')
    ml.features_importances(best_rf_model, 'rf')

    print('best model nn')
    best_nn_model = ml.train_best_model('nn')
    _save(best_nn_model, 'nn')
    print('best model gnb')
    best_gnb_model = ml.train_best_model('gnb')
    _save(best_gnb_model, 'gnb')
    print('best model knn')
    best_knn_model = ml.train_best_model('knn')
    _save(best_knn_model, 'knn')

    # Validation curve over the SVM regularisation strength.
    print('plot validation_svm')
    ml.plot_validation_curve(best_svm_model, param_name='clf__C',
                             param_range=[0.00001, 0.0001, 0.001, 0.01,
                                          0.1, 1, 10, 100])

    print('score_test_set_svm')
    print(ml.score_testset(best_svm_model))
    print('score_test_set_rf')
    print(ml.score_testset(best_rf_model))
    # FIX: the original printed the rf score twice and never scored sgd —
    # presumably a copy-paste slip; the duplicate now reports sgd.
    print('score_test_set_sgd')
    print(ml.score_testset(best_sgd_model))
    print('score_test_set_gboosting')
    print(ml.score_testset(best_gboosting_model))
    print('score_test_set_lr')
    print(ml.score_testset(best_lr_model))