def models_compare(x, y):
    """Train several regressors on a 70/30 split of (x, y), overlay each
    model's predicted-vs-actual regression plot, and print evaluation
    metrics per model via Evaluationmatrix.

    Args:
        x: feature matrix.
        y: target values (ratings).
    """
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

    # (label, trainer, plot color) — one row per model.  Replaces six
    # copy-pasted fit/plot/evaluate blocks with a single loop; call order
    # and colors are unchanged from the original.
    model_specs = [
        ("SVM", models.SVM, "red"),
        ("TREE", models.TREE, "green"),
        ("RIDGE", models.RIDGE, "orange"),
        ("KNN", models.KNN, "yellow"),
        ("LR", models.LR, "blue"),
        ("RFR", models.RFR, "black"),
    ]
    for label, trainer, color in model_specs:
        # Each models.* helper trains on the train split and returns
        # predictions for X_test.
        predictions = trainer(X_train, y_train, X_test)
        sns.regplot(predictions, y_test, color=color, label=label)
        Evaluationmatrix(y_test, predictions, label)

    plt.title('Models Comparison')
    plt.xlabel('Predicted Ratings')
    plt.ylabel('Actual Ratings')
    plt.legend()
    plt.show()
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id, feature_selection, oversampling, survival, undersampling):
    '''Execute a learning task using the specified algorithm.

    Performs Pearson feature selection, trains the model named by `alg`,
    saves results/features to CSV under `out_dir`, and returns
    (model, best_features, [fname, fpr, tpr, auc]).  Returns None when the
    algorithm produced no results or `alg` is unrecognized.
    '''
    # Number of features kept by feature selection.  Historical
    # per-configuration values (survival/aggregation combinations used
    # k=150 or k=220) kept for reference:
    # if survival == True and aggregation == True: k=150
    # if survival == True and aggregation == False: k=220
    # if survival == False and aggregation == True: k=150
    # if survival == False and aggregation == False: k=220
    k = 220

    # perform feature selection
    new_X, best_features, headers = fs.pearson_fs(X, y, headers, k, feature_selection, survival)

    # Execute algorithm.  `features` stays None for algorithms that do not
    # report per-feature weights (CART, SVM).
    # BUGFIX: previously an unrecognized `alg` left `results` unbound and
    # the `if not results` check below raised NameError; the explicit
    # initialization makes that case return None cleanly.  The last three
    # branches were plain `if` in the original — `elif` is equivalent here
    # since `alg` matches at most one name.
    results = None
    features = None
    model = None
    if alg == 'DT':
        results, model = ML.CART(new_X, y, best_features, out_dir + "{}.dot".format(fname), headers, oversampling, undersampling)
    elif alg == 'RF':
        results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, n_estimators=200)
    elif alg == 'RFsmall':
        results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, n_estimators=100)
    elif alg == 'SVM':
        results, model = ML.SVM(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'LR':
        results, features, model = ML.LR(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'XGBoost':
        results, features, model = ML.XGBoost(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'COX':
        results, features, model = ML.COX(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'survSVM':
        results, features, model = ML.survSVM(new_X, y, best_features, oversampling, undersampling)
    elif alg == 'GBS':
        results, features, model = ML.GradientBoostingSurvival(new_X, y, best_features, oversampling, undersampling)

    if not results:
        return

    if survival == False:
        in_out.save_results(out_dir + fname + '.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y), len(y)])
    # else:
    #     in_out.save_results(out_dir+fname+'.csv', ["CI"], results, [sum(y),len(y)])

    # Replaces the fragile `'features' in locals()` check with an explicit
    # None sentinel — same effect, no locals() introspection.
    if features is not None:
        features = features.flatten()
        in_out.save_features(out_dir + "features_" + fname + '.csv', zip(headers[1:-1], features))

    return model, best_features, [fname] + results[0:3]
def predict():
    """Load the trained article/title encoders and the LR classifier,
    then run prediction over the test data loader and save results."""
    # prepare data_loader and vocab
    use_by_article = False
    if use_by_article:
        _, data_loader_test, vocab = prepare_byarticle_data()
    else:
        _, _, data_loader_test, vocab = prepare_data('./data_new/preprocessed_new_{}', constant.batch_size)

    if constant.use_bert:
        from pytorch_pretrained_bert import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        state = torch.load("bert_model/pytorch_model.bin")
        bert_model.load_state_dict(state)
        # The same fine-tuned BERT encodes both articles and titles.
        article_model = bert_model
        title_model = bert_model
        # print("finish bert model loading")
        LR = models.Classifier(hidden_dim1=768, hidden_dim2=768)
        classifer_state = torch.load("bert_model/classifier.bin")
        LR.load_state_dict(classifer_state)
    else:
        # Basic LSTM model.
        # BUGFIX(review): in the original the `else:` line was commented
        # out, so these assignments ran unconditionally and clobbered the
        # BERT models built above.  Restored the branch — confirm against
        # the training script.
        article_model = models.LSTM(vocab=vocab, embedding_size=constant.emb_dim,
                                    hidden_size=constant.hidden_dim,
                                    num_layers=constant.n_layers,
                                    pretrain_emb=constant.pretrain_emb)
        title_model = models.LSTM(vocab=vocab, embedding_size=constant.emb_dim,
                                  hidden_size=constant.hidden_dim_tit,
                                  num_layers=constant.n_layers,
                                  pretrain_emb=constant.pretrain_emb)
        LR = models.LR(hidden_dim1=constant.hidden_dim, hidden_dim2=constant.hidden_dim_tit)

    # load parameters
    article_model = load_model(article_model, model_name="article_model")
    title_model = load_model(title_model, model_name="title_model")
    LR = load_model(LR, model_name="LR")
    if constant.USE_CUDA:
        article_model.cuda()
        title_model.cuda()
        LR.cuda()

    # predict and save result in result folder
    # NOTE(review): this zero-arg `predict` shadows the multi-arg
    # `predict(...)` it calls here; presumably the latter is imported from
    # another module and this call resolves to it at module level — verify,
    # otherwise this recurses with the wrong arity and raises TypeError.
    predict(article_model, title_model, LR, data_loader_test, name="bypublisher", print_pred=True)
def define_solver(version, input_nc, input_width, input_height, **kwargs):
    """Instantiate a solver architecture selected by `version`.

    Versions:
        1 - logistic regressor
        2 - MLP with 2 hidden layers (LeNet-style)
        3 - MLP with a single hidden layer (MNIST LeNet)
        4 - GAP + 2 FC layers
        5 - MLP with a single hidden layer (AlexNet-style)
        6 - GAP + 1 FC layer

    Raises:
        NotImplementedError: if `version` is not one of the above.
    """
    solver_by_version = {
        1: models.LR,
        2: models.MLP_LeNet,
        3: models.MLP_LeNetMNIST,
        4: models.Solver_GAP_TwoFClayers,
        5: models.MLP_AlexNet,
        6: models.Solver_GAP_OneFClayers,
    }
    solver_cls = solver_by_version.get(version)
    if solver_cls is None:
        raise NotImplementedError("Specified solver module not available!")
    return solver_cls(input_nc, input_width, input_height, **kwargs)
import config
import models


def huber_approx_obj(preds, dtrain):
    '''
    xgboost optimizing function for mean absolute error

    Implements the pseudo-Huber loss (a smooth approximation of MAE)
    and returns the (gradient, hessian) pair xgboost's custom-objective
    API expects.

    NOTE(review): `np` is not imported in this chunk — presumably
    `import numpy as np` appears elsewhere in the file; verify.
    '''
    d = preds - dtrain  # add .get_labels() for xgb.train()
    h = 1  # h is delta in the graphic (the pseudo-Huber slope parameter)
    scale = 1 + (d / h) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt          # first derivative of pseudo-Huber loss
    hess = 1 / scale / scale_sqrt  # second derivative
    return grad, hess


# NOTE(review): this dict rebinds the name `models`, shadowing the
# `models` module imported above.  The RHS is evaluated before the
# rebinding so construction works, but any later `models.<Class>`
# attribute access in this file will fail — consider renaming the dict
# (e.g. MODELS) if the module is still needed below.
models = {
    "dt": models.DecisionTree(),
    "rf": models.RandomForest(),
    "lr": models.LR(),
    "xgb": models.XGBoost(),
    "svm": models.SVM(),
    "lgb": models.LGB(),
    # "mlp": models.MLP(),
    "lstm": models.LSTM()
}

# to get the final accuracy, calculate the mean and the mean absolute error should be the percentage of the
# performance since he wants to see performance
# Threshold the neural-net probabilities on the resampled test set into
# hard 0/1 labels, then evaluate.
for i in range(0, len(X_res_test)):
    if (nn_res_pred[i] > 0.5):
        nn_res_pred[i] = 1
    else:
        nn_res_pred[i] = 0
nn_res_score = modelling.evaluation(nn, X_res_test, y_res_test, nn_res_pred)

# Same thresholding and evaluation on the original (non-resampled) test set.
nn_pred = nn.predict(X_test)
for i in range(0, len(X_test)):
    if (nn_pred[i] > 0.5):
        nn_pred[i] = 1
    else:
        nn_pred[i] = 0
nn_score = modelling.evaluation(nn, X_test, y_test, nn_pred)

# Logistic regression: grid-search hyperparameters on the resampled train set.
lr = models.LR()
lr_opt, lr_opt_params = modelling.find_hyperparams(lr, parameters_lr, \
                                                   X_res_train, \
                                                   y_res_train, \
                                                   search_method="gridsearch", \
                                                   cv=10)

# NOTE(review): the grid-search result above is immediately discarded —
# `lr_opt` is rebound to a hard-coded LogisticRegression(C=30) here.
# Confirm whether C=30 came from a previous search run or the rebinding
# is a leftover experiment.
from sklearn.linear_model import LogisticRegression
lr_opt = LogisticRegression(C=30)
lr_opt.fit(X_res_train, y_res_train)
lr_res_score = modelling.evaluation(lr_opt, X_res_test, y_res_test, lr_opt.predict(X_res_test))
lr_score = modelling.evaluation(lr_opt, X_test, y_test, lr_opt.predict(X_test))

#svm = models.linSVM()
#svm_opt, svm_opt_params = modelling.find_hyperparams(svm, parameters_lin_svm, \
def LR_model_training(train_ds, test_ds, textField, labelField, max_features, batch_size,
                      no_epochs, saver_path, saver_name, results_file,
                      earlystopping=early_stopping):
    """Train a logistic-regression text classifier 5 times on TF-IDF features.

    Each iteration re-splits train_ds 70/30 (stratified), fits the model with
    early stopping + checkpointing, and scores it on test_ds.  Mean AUC / F1 /
    micro-F1 / macro-F1 / training time are printed and appended to
    `results_file`.

    Returns:
        test_ds with an added "LR_prediction" column holding the predictions
        from the FINAL iteration only.
    """
    AUC_scores = []
    F1_scores = []
    micro_F1_scores = []
    macro_F1_scores = []
    training_time = []

    for i in range(5):
        print("iteration" + str(i))
        start_time = time.time()

        # split the train dataset into train and validation
        X_train, X_valid, y_train, y_valid = train_test_split(
            train_ds[textField], train_ds[labelField],
            test_size=0.3, stratify=train_ds[labelField])

        # NOTE(review): the vectorizer is fitted on ALL of train_ds, which
        # includes the rows split off as validation above — confirm this
        # mild leakage into the validation vocabulary is intended.
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, 3))
        vectorizer = vectorizer.fit(train_ds[textField])
        X_train = vectorizer.transform(X_train)
        X_valid = vectorizer.transform(X_valid)
        X_test = vectorizer.transform(test_ds[textField])

        saver = ModelCheckpoint(saver_path + "/" + saver_name)

        # Logistic regression
        LR_model = models.LR(X_train.shape[1])
        print("model created")
        LR_training_history = LR_model.fit(
            X_train, y_train,
            epochs=no_epochs,
            batch_size=batch_size,
            validation_data=[X_valid, y_valid],
            callbacks=[earlystopping, saver],
            verbose=0)

        predicted_labels = LR_model.predict(X_test)

        # Compute each metric once (the original recomputed every score for
        # the print and again for the append).
        rounded = np.rint(predicted_labels)
        auc = roc_auc_score(test_ds[labelField], predicted_labels)
        f1 = f1_score(test_ds[labelField], rounded)
        micro_f1 = f1_score(test_ds[labelField], rounded, average="micro")
        macro_f1 = f1_score(test_ds[labelField], rounded, average="macro")
        print("AUC score", auc)
        print("F1 score", f1)
        print("micro F1 score", micro_f1)
        print("macro F1 score", macro_f1)

        exc_time = time.time() - start_time
        AUC_scores.append(auc)
        F1_scores.append(f1)
        macro_F1_scores.append(macro_f1)
        micro_F1_scores.append(micro_f1)
        training_time.append(exc_time)

        keras.backend.clear_session()
        print("End iteration" + str(i))

    # Only the last iteration's predictions are kept on test_ds.
    test_ds["LR_prediction"] = predicted_labels

    print("AUC_avg", np.mean(AUC_scores))
    print("f1_avg", np.mean(F1_scores))
    print("macro_f1_avg", np.mean(macro_F1_scores))
    print("micro_f1_avg", np.mean(micro_F1_scores))

    # BUGFIX: the file was opened/closed manually; `with` guarantees the
    # handle is closed even if a write raises.  Written content is
    # byte-identical to the original (including the "Excution" typo, kept
    # so downstream parsers of results_file are unaffected).
    with open(results_file, "w") as f:
        f.write(saver_name)
        f.write("\n")
        f.write("AUC_mean: " + str(np.mean(AUC_scores)))
        f.write("\n")
        f.write("F1_mean: " + str(np.mean(F1_scores)))
        f.write("\n")
        f.write("macro_F1_mean: " + str(np.mean(macro_F1_scores)))
        f.write("\n")
        f.write("micro_F1_mean: " + str(np.mean(micro_F1_scores)))
        f.write("\n")
        f.write("Excution Time: " + str(np.mean(training_time)))
        f.write("--------------------------------------------------------------------------------")
        f.write("\n")

    return test_ds


print("Done!")
return avg_best, test_best data_loader_tr, data_loader_val, data_loader_test, vocab = prepare_data( '/home/nayeon/fakenews/data_new/preprocessed_new_{}_wtitle.pickle', constant.batch_size) if constant.use_bert: article_model = models.LSTM(vocab=vocab, embedding_size=constant.emb_dim, hidden_size=constant.hidden_dim, num_layers=constant.n_layers, pretrain_emb=constant.pretrain_emb) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') title_model = BertModel.from_pretrained('bert-base-uncased') LR = models.LR(hidden_dim1=constant.hidden_dim, hidden_dim2=768) elif constant.use_utransformer: article_model = models.UTransformer( vocab=vocab, embedding_size=constant.emb_dim, hidden_size=constant.hidden_dim, num_layers=constant.max_hops_article, num_heads=constant.num_heads, total_key_depth=constant.key_value_depth, total_value_depth=constant.key_value_depth, filter_size=constant.filter_size_article, input_dropout=constant.input_dropout, layer_dropout=constant.layer_dropout, attention_dropout=constant.attention_dropout, relu_dropout=constant.relu_dropout) title_model = models.UTransformer(