def models_compare(x, y):
    """Fit six models on one random 70/30 split and overlay their fits.

    For each model, the matching helper on ``models`` is called with
    ``(X_train, y_train, X_test)``; its return value is plotted against
    ``y_test`` with ``sns.regplot`` and passed to ``Evaluationmatrix``
    together with the model's label. All six fits share one figure,
    which is shown at the end.

    Args:
        x: Feature data handed to ``train_test_split``.
        y: Target data handed to ``train_test_split``.

    Note:
        The split is unseeded (no ``random_state``), so every call sees a
        different train/test partition.
    """
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

    # (label, colour) per model, in plotting order. The label doubles as the
    # attribute name of the fitting helper on ``models``.
    model_styles = (
        ('SVM', 'red'),
        ('TREE', 'green'),
        ('RIDGE', 'orange'),
        ('KNN', 'yellow'),
        ('LR', 'blue'),
        ('RFR', 'black'),
    )

    for label, color in model_styles:
        fit_predict = getattr(models, label)
        predictions = fit_predict(X_train, y_train, X_test)
        # x/y are passed by keyword: newer seaborn releases make them
        # keyword-only, and the keyword form also works on older ones.
        sns.regplot(x=predictions, y=y_test, color=color, label=label)
        Evaluationmatrix(y_test, predictions, label)

    plt.title('Models Comparison')
    plt.xlabel('Predicted Ratings')
    plt.ylabel('Actual Ratings')
    plt.legend()
    plt.show()
Пример #2
0
def execute_with_algorithm(alg, X, y, fname, headers, out_dir, record_id, feature_selection, oversampling, survival, undersampling):
	'''Execute a learning task using the specified algorithm.

	Runs Pearson feature selection (fixed k=220), trains the model chosen
	by ``alg`` through the matching ``ML`` helper, and writes result files
	under ``out_dir``.

	Args:
		alg: One of 'DT', 'RF', 'RFsmall', 'SVM', 'LR', 'XGBoost', 'COX',
			'survSVM' or 'GBS'.
		X: Feature data passed to feature selection.
		y: Targets; also used for the ``[sum(y), len(y)]`` summary that is
			saved with the results.
		fname: Base name for every output file.
		headers: Column headers; replaced by those returned from feature
			selection.
		out_dir: Output prefix, concatenated directly with file names (so
			it must already end with a path separator).
		record_id: Unused by this function.
		feature_selection: Forwarded to ``fs.pearson_fs``.
		oversampling: Forwarded to the ``ML`` helper.
		survival: Forwarded to ``fs.pearson_fs``; when falsy the results
			are also saved to ``<fname>.csv``.
		undersampling: Forwarded to the ``ML`` helper.

	Returns:
		``None`` when the helper produced no results, otherwise the tuple
		``(model, best_features, [fname] + results[0:3])``.

	Raises:
		ValueError: If ``alg`` is not a supported algorithm name.
	'''
	known_algorithms = ('DT', 'RF', 'RFsmall', 'SVM', 'LR', 'XGBoost', 'COX', 'survSVM', 'GBS')
	if alg not in known_algorithms:
		# Fail before the (potentially expensive) feature selection instead of
		# hitting an unbound ``results`` further down.
		raise ValueError("unknown algorithm {!r}; expected one of: {}".format(alg, ', '.join(known_algorithms)))

	# number of features kept by the selection step
	k = 220

	# perform feature selection
	new_X, best_features, headers = fs.pearson_fs(X, y, headers, k, feature_selection, survival)

	# DT and SVM return no per-feature scores; stays None for those
	features = None

	# execute algorithm
	if alg == 'DT':
		results, model = ML.CART(new_X, y, best_features, out_dir + "{}.dot".format(fname), headers, oversampling, undersampling)
	elif alg == 'RF':
		results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, n_estimators=200)
	elif alg == 'RFsmall':
		results, features, model = ML.RF(new_X, y, best_features, oversampling, undersampling, n_estimators=100)
	elif alg == 'SVM':
		results, model = ML.SVM(new_X, y, best_features, oversampling, undersampling)
	elif alg == 'LR':
		results, features, model = ML.LR(new_X, y, best_features, oversampling, undersampling)
	elif alg == 'XGBoost':
		results, features, model = ML.XGBoost(new_X, y, best_features, oversampling, undersampling)
	elif alg == 'COX':
		results, features, model = ML.COX(new_X, y, best_features, oversampling, undersampling)
	elif alg == 'survSVM':
		results, features, model = ML.survSVM(new_X, y, best_features, oversampling, undersampling)
	else:  # alg == 'GBS', guaranteed by the validation above
		results, features, model = ML.GradientBoostingSurvival(new_X, y, best_features, oversampling, undersampling)

	if not results:
		return None

	# results are only written for the non-survival setting
	if not survival:
		in_out.save_results(out_dir + fname + '.csv', ["fpr", "tpr", "auc", "cm"], results, [sum(y), len(y)])

	if features is not None:
		features = features.flatten()
		in_out.save_features(out_dir + "features_" + fname + '.csv', zip(headers[1:-1], features))

	return model, best_features, [fname] + results[0:3]
Пример #3
0
def predict() -> None:
    """Build the article/title encoders and the classifier, then run prediction.

    Loads the test data loader and vocabulary, constructs either the BERT
    based models or the LSTM based models depending on ``constant.use_bert``,
    restores their saved parameters, optionally moves them to the GPU and
    finally calls ``predict(...)`` with the models and the test loader.

    NOTE(review): the last statement calls ``predict`` with arguments, but
    this function is itself named ``predict`` and takes none. Unless the
    module rebinds ``predict`` to another callable after this definition,
    that call re-enters this function and raises ``TypeError`` -- confirm
    which prediction routine was intended.
    """
    # prepare data_loader and vocab
    # use_by_article is hard-coded to False, so the else branch always runs
    use_by_article = False
    if use_by_article:
        _, data_loader_test, vocab = prepare_byarticle_data()
    else:
        _, _, data_loader_test, vocab = prepare_data('./data_new/preprocessed_new_{}', constant.batch_size)
    
    if constant.use_bert:
        from pytorch_pretrained_bert import BertTokenizer, BertModel
        # tokenizer is created here but not used again inside this function
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        # overwrite the pretrained weights with the locally saved state dict
        state = torch.load("bert_model/pytorch_model.bin")
        bert_model.load_state_dict(state)
        # the same BERT instance encodes both the article and the title
        article_model = bert_model
        title_model = bert_model
        # print("finish bert model loading")
        LR = models.Classifier(hidden_dim1=768, hidden_dim2=768)
        classifer_state = torch.load("bert_model/classifier.bin")
        LR.load_state_dict(classifer_state)
        # 
    else:
        # for basic LSTM model
        article_model = models.LSTM(vocab=vocab, 
                        embedding_size=constant.emb_dim, 
                        hidden_size=constant.hidden_dim, 
                        num_layers=constant.n_layers,
                        pretrain_emb=constant.pretrain_emb
                        )
        # the title encoder differs from the article encoder only in hidden size
        title_model = models.LSTM(vocab=vocab,
                        embedding_size=constant.emb_dim,
                        hidden_size=constant.hidden_dim_tit,
                        num_layers=constant.n_layers,
                        pretrain_emb=constant.pretrain_emb
                        )
        LR = models.LR(hidden_dim1=constant.hidden_dim, hidden_dim2=constant.hidden_dim_tit)

        # load parameters
        article_model = load_model(article_model, model_name="article_model")
        title_model = load_model(title_model, model_name="title_model")
        LR = load_model(LR, model_name="LR")

    if constant.USE_CUDA:
        article_model.cuda()
        title_model.cuda()
        LR.cuda()

    # predict and save result in result folder
    # NOTE(review): `predict` here resolves to this very function (see docstring)
    predict(article_model, title_model, LR, data_loader_test, name="bypublisher", print_pred=True)
Пример #4
0
def define_solver(version, input_nc, input_width, input_height, **kwargs):
    """Instantiate the solver module selected by ``version``.

    The chosen class from ``models`` is called with
    ``(input_nc, input_width, input_height, **kwargs)`` and the resulting
    object is returned.

    Raises:
        NotImplementedError: If ``version`` matches none of the known ids.
    """
    # (version id, attribute name on ``models``)
    solver_table = (
        (1, "LR"),                      # logistic regressor
        (2, "MLP_LeNet"),               # MLP with 2 hidden layers
        (3, "MLP_LeNetMNIST"),          # MLP with a single hidden layer (MNIST LeNet)
        (4, "Solver_GAP_TwoFClayers"),  # GAP + 2 FC layers
        (5, "MLP_AlexNet"),             # MLP with a single hidden layer in AlexNet
        (6, "Solver_GAP_OneFClayers"),  # GAP + 1 FC layer
    )

    for candidate, attr_name in solver_table:
        if version == candidate:
            # resolve the class only for the selected entry
            solver_cls = getattr(models, attr_name)
            return solver_cls(input_nc, input_width, input_height, **kwargs)

    raise NotImplementedError("Specified solver module not available!")
Пример #5
0
import config
import models


def huber_approx_obj(preds, dtrain, h=1):
    '''
    Gradient and Hessian of the pseudo-Huber loss, usable as a custom
    xgboost objective (a smooth, twice-differentiable stand-in for mean
    absolute error).

    With d = preds - dtrain the loss is h**2 * (sqrt(1 + (d / h)**2) - 1),
    which gives

        grad = d / sqrt(1 + (d / h)**2)
        hess = 1 / (1 + (d / h)**2) ** 1.5

    preds:  array of predictions
    dtrain: array of targets, subtracted element-wise from preds
    h:      delta of the pseudo-Huber loss (the "delta in the graphic");
            defaults to 1, the value that used to be hard-coded

    Returns the tuple (grad, hess).

    NOTE(review): the residual is taken as first argument minus second
    argument. Confirm the caller passes predictions first; if the
    arguments arrive as (labels, predictions) the gradient sign is flipped.
    '''
    d = preds - dtrain  # use dtrain.get_label() when called from xgb.train()
    scale = 1 + (d / h)**2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1 / scale / scale_sqrt
    return grad, hess


# Registry mapping a short model key to a freshly constructed model instance.
# NOTE(review): this assignment rebinds the name `models`, which until this
# point refers to the `models` module imported at the top of the file. The
# right-hand side is evaluated before the rebinding, so the literal itself
# works, but any later `models.<attr>` lookup in this module will hit this
# dict instead of the module.
models = {
    "dt": models.DecisionTree(),
    "rf": models.RandomForest(),
    "lr": models.LR(),
    "xgb": models.XGBoost(),
    "svm": models.SVM(),
    "lgb": models.LGB(),
    # "mlp": models.MLP(),
    "lstm": models.LSTM()
}

# To get the final accuracy, calculate the mean. The mean absolute error should be
# reported as a percentage of the performance, since performance is what he wants to see.
Пример #6
0
# Threshold the network outputs on the resampled test set at 0.5, in place.
for i in range(len(X_res_test)):
    nn_res_pred[i] = 1 if nn_res_pred[i] > 0.5 else 0
nn_res_score = modelling.evaluation(nn, X_res_test, y_res_test, nn_res_pred)

# Same thresholding for the predictions on the original test set.
nn_pred = nn.predict(X_test)
for i in range(len(X_test)):
    nn_pred[i] = 1 if nn_pred[i] > 0.5 else 0
nn_score = modelling.evaluation(nn, X_test, y_test, nn_pred)

# Grid search over the logistic-regression hyper-parameters (10-fold CV).
lr = models.LR()
lr_opt, lr_opt_params = modelling.find_hyperparams(
    lr,
    parameters_lr,
    X_res_train,
    y_res_train,
    search_method="gridsearch",
    cv=10,
)

from sklearn.linear_model import LogisticRegression
# lr_opt is rebound here: the estimator returned by the search above is
# replaced by a LogisticRegression with a fixed C=30.
lr_opt = LogisticRegression(C=30)
lr_opt.fit(X_res_train, y_res_train)
lr_res_score = modelling.evaluation(
    lr_opt, X_res_test, y_res_test, lr_opt.predict(X_res_test)
)
lr_score = modelling.evaluation(
    lr_opt, X_test, y_test, lr_opt.predict(X_test)
)

#svm = models.linSVM()
#svm_opt, svm_opt_params = modelling.find_hyperparams(svm, parameters_lin_svm, \
def LR_model_training(train_ds, test_ds, textField, labelField, max_features,
                      batch_size, no_epochs, saver_path,
                      saver_name, results_file, earlystopping=early_stopping):
    """Train the logistic-regression model five times and report mean scores.

    Each of the five iterations draws a fresh stratified 70/30
    train/validation split from ``train_ds``, fits a TF-IDF vectorizer
    (1- to 3-grams, ``max_features`` terms) on the full training text,
    trains ``models.LR`` and scores it on ``test_ds``.

    Args:
        train_ds: Training data, indexed by ``textField`` / ``labelField``.
        test_ds: Test data, indexed the same way; modified in place.
        textField: Key of the text field.
        labelField: Key of the label field.
        max_features: Vocabulary size limit for the TF-IDF vectorizer.
        batch_size: Batch size passed to ``fit``.
        no_epochs: Number of epochs passed to ``fit``.
        saver_path: Directory part of the checkpoint path.
        saver_name: File part of the checkpoint path; also written as the
            first line of the results file.
        results_file: Path of the text file that receives the mean scores
            (opened in "w" mode, so existing content is replaced).
        earlystopping: Callback passed to ``fit`` next to the checkpoint
            callback.

    Returns:
        ``test_ds`` with the predictions of the last iteration stored under
        the key ``"LR_prediction"``.
    """
    AUC_scores = []
    F1_scores = []
    micro_F1_scores = []
    macro_F1_scores = []
    training_time = []

    y_test = test_ds[labelField]

    for i in range(5):
        print("iteration" + str(i))
        start_time = time.time()
        # split the train dataset into train and validation
        X_train, X_valid, y_train, y_valid = train_test_split(
            train_ds[textField], train_ds[labelField],
            test_size=0.3, stratify=train_ds[labelField])

        # the vectorizer is fitted on the full training text, i.e. on the
        # validation rows as well
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, 3))
        vectorizer = vectorizer.fit(train_ds[textField])

        X_train = vectorizer.transform(X_train)
        X_valid = vectorizer.transform(X_valid)
        X_test = vectorizer.transform(test_ds[textField])

        saver = ModelCheckpoint(saver_path + "/" + saver_name)
        # Logistic regression
        LR_model = models.LR(X_train.shape[1])
        print("model created")
        LR_model.fit(
            X_train,
            y_train,
            epochs=no_epochs,
            batch_size=batch_size,
            # a tuple is the documented form for validation_data
            validation_data=(X_valid, y_valid),
            callbacks=[earlystopping, saver],
            verbose=0)

        predicted_labels = LR_model.predict(X_test)
        rounded_labels = np.rint(predicted_labels)

        # compute every metric once and reuse it for printing and averaging
        auc = roc_auc_score(y_test, predicted_labels)
        f1 = f1_score(y_test, rounded_labels)
        micro_f1 = f1_score(y_test, rounded_labels, average="micro")
        macro_f1 = f1_score(y_test, rounded_labels, average="macro")
        print("AUC score", auc)
        print("F1 score", f1)
        print("micro F1 score", micro_f1)
        print("macro F1 score", macro_f1)

        exc_time = time.time() - start_time
        AUC_scores.append(auc)
        F1_scores.append(f1)
        macro_F1_scores.append(macro_f1)
        micro_F1_scores.append(micro_f1)
        training_time.append(exc_time)
        keras.backend.clear_session()
        print("End iteration" + str(i))

    # only the predictions of the last iteration are kept
    test_ds["LR_prediction"] = predicted_labels
    print("AUC_avg", np.mean(AUC_scores))
    print("f1_avg", np.mean(F1_scores))
    print("macro_f1_avg", np.mean(macro_F1_scores))
    print("micro_f1_avg", np.mean(micro_F1_scores))

    report_lines = [
        saver_name,
        "AUC_mean: " + str(np.mean(AUC_scores)),
        "F1_mean: " + str(np.mean(F1_scores)),
        "macro_F1_mean: " + str(np.mean(macro_F1_scores)),
        "micro_F1_mean: " + str(np.mean(micro_F1_scores)),
        # label spelling kept as-is in case the file is parsed elsewhere
        "Excution Time: " + str(np.mean(training_time)),
        # the separator now sits on its own line; it used to be glued to the
        # execution-time value because of a missing newline
        "--------------------------------------------------------------------------------",
    ]
    # the context manager closes the file even if a write fails
    with open(results_file, "w") as f:
        f.write("\n".join(report_lines))
        f.write("\n")
    return test_ds
Пример #8
0
    return avg_best, test_best

# prepare_data returns four values, unpacked here as the train, validation
# and test data loaders plus the vocabulary.
# NOTE(review): the path is a hard-coded absolute template with a `{}`
# placeholder -- presumably filled in per split inside prepare_data; confirm,
# and consider making it configurable.
data_loader_tr, data_loader_val, data_loader_test, vocab = prepare_data(
    '/home/nayeon/fakenews/data_new/preprocessed_new_{}_wtitle.pickle',
    constant.batch_size)

if constant.use_bert:
    article_model = models.LSTM(vocab=vocab,
                                embedding_size=constant.emb_dim,
                                hidden_size=constant.hidden_dim,
                                num_layers=constant.n_layers,
                                pretrain_emb=constant.pretrain_emb)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    title_model = BertModel.from_pretrained('bert-base-uncased')
    LR = models.LR(hidden_dim1=constant.hidden_dim, hidden_dim2=768)
elif constant.use_utransformer:
    article_model = models.UTransformer(
        vocab=vocab,
        embedding_size=constant.emb_dim,
        hidden_size=constant.hidden_dim,
        num_layers=constant.max_hops_article,
        num_heads=constant.num_heads,
        total_key_depth=constant.key_value_depth,
        total_value_depth=constant.key_value_depth,
        filter_size=constant.filter_size_article,
        input_dropout=constant.input_dropout,
        layer_dropout=constant.layer_dropout,
        attention_dropout=constant.attention_dropout,
        relu_dropout=constant.relu_dropout)
    title_model = models.UTransformer(