def k_fold_plot(data, k, rate, lambs, iteration, category):
    # Assumes numpy as np, pandas as pd, and the project's data_normalized
    # helper and LogisticRegression class are in scope.
    costList = []
    df = data[0]    # training DataFrame; last column is the label
    tdf = data[1]   # held-out test DataFrame
    lens = len(df.columns)
    data_split = np.array_split(df, k)
    best_l = 0      # index of the best lambda found so far
    max_acc = -np.inf
    for l in range(len(lambs)):
        lam = lambs[l]
        accuracy = 0
        for i in range(0, k, 1):
            # Hold out fold i for validation and train on the remaining folds.
            dfk = pd.concat([df, data_split[i]]).drop_duplicates(keep=False)
            vdfk = data_split[i]
            X = dfk.iloc[:, 0:lens - 1]
            Y = dfk.iloc[:, lens - 1:lens]
            pX = vdfk.iloc[:, 0:lens - 1]
            pY = vdfk.iloc[:, lens - 1:lens]
            nX, nPx = data_normalized(np.array(X), np.array(pX))
            nX = pd.DataFrame(nX).astype(float)
            nPx = pd.DataFrame(nPx).astype(float)
            model = LogisticRegression(np.zeros((1, len(nX.columns)), float))
            costList = np.append(
                costList, model.fit(nX, np.array(Y), rate, lam, iteration))
            prediction = model.predict(nPx, category)
            accuracy += model.evaluate_acc(pY, prediction)
        mean_acc = accuracy / k
        if mean_acc > max_acc:
            max_acc = mean_acc
            best_l = l
    # Retrain on the full training set with the best lambda and evaluate on
    # the held-out test set.
    X = df.iloc[:, 0:lens - 1]
    Y = df.iloc[:, lens - 1:lens]
    pX = tdf.iloc[:, 0:lens - 1]
    pY = tdf.iloc[:, lens - 1:lens]
    nX, nPx = data_normalized(np.array(X), np.array(pX))
    nX = pd.DataFrame(nX).astype(float)
    nPx = pd.DataFrame(nPx).astype(float)
    model = LogisticRegression(np.zeros((1, len(nX.columns)), float))
    costList = np.append(
        costList, model.fit(nX, np.array(Y), rate, lambs[best_l], iteration))
    prediction = model.predict(nPx, category)
    acc = model.evaluate_acc(pY, prediction)
    matrix = model.confusion_matrix(pY, prediction, category)
    print(matrix)
    return acc, costList
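# A hypothetical invocation of k_fold_plot, assuming the data_normalized
# helper and LogisticRegression class above are importable; the DataFrames,
# hyperparameter values, and category argument are synthetic placeholders.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
train_df = pd.DataFrame(rng.normal(size=(100, 4)))
train_df[3] = (train_df[0] > 0).astype(int)   # last column holds the label
test_df = pd.DataFrame(rng.normal(size=(30, 4)))
test_df[3] = (test_df[0] > 0).astype(int)
acc, costs = k_fold_plot((train_df, test_df), k=5, rate=0.1,
                         lambs=[0.01, 0.1, 1.0], iteration=1000, category=2)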
def lg_k_folds(X_train, y_train, lr, b, epochs, lamda, bias, k=5, verbose=False):
    results = {'accuracy': [], 'recall': [], 'precision': []}
    metric_means = {}
    accuracy = Accuracy()
    recall = Recall()
    precision = Precision()
    chunk_size = int(len(X_train) / k)
    logistic_regression = LogisticRegression(bias)
    for i in range(0, len(X_train), chunk_size):
        end = i + chunk_size if i + chunk_size <= len(X_train) else len(X_train)
        new_X_valid = X_train[i:end]
        new_y_valid = y_train[i:end]
        new_X_train = np.concatenate([X_train[:i], X_train[end:]])
        new_y_train = np.concatenate([y_train[:i], y_train[end:]])
        logistic_regression.fit(new_X_train, new_y_train, lr, b, epochs,
                                lamda, verbose=verbose)
        predictions = logistic_regression.predict(new_X_valid)
        results['accuracy'].append(accuracy(new_y_valid, predictions))
        results['recall'].append(recall(new_y_valid, predictions))
        results['precision'].append(precision(new_y_valid, predictions))
    metric_means['accuracy'] = np.mean(results['accuracy'])
    metric_means['recall'] = np.mean(results['recall'])
    metric_means['precision'] = np.mean(results['precision'])
    return metric_means
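# A minimal usage sketch for lg_k_folds, assuming the LogisticRegression,
# Accuracy, Recall and Precision classes used above are in scope; the data
# and hyperparameter values below are synthetic placeholders.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 3))
y_demo = (X_demo[:, 0] > 0).astype(float)
means = lg_k_folds(X_demo, y_demo, lr=0.01, b=16, epochs=50,
                   lamda=None, bias=True, k=5)
print(means)  # mean accuracy / recall / precision across the 5 folds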
class LogisticRegressionExperiment(object):
    def __init__(self):
        self._data_set = get_pick_data("LogisticRegression")
        self._num_features = self._data_set.dynamic_features.shape[1]
        self._time_steps = 1
        self._n_output = 1
        self._model_format()
        self._check_path()

    def _model_format(self):
        learning_rate, max_loss, max_pace, ridge, batch_size, hidden_size, epoch, dropout = lr_setup.all
        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=batch_size,
            epochs=epoch,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            dropout=dropout,
            max_pace=max_pace,
            ridge=ridge)

    def _check_path(self):
        if not os.path.exists("result_9_16_0"):
            os.makedirs("result_9_16_0")
        self._filename = "result_9_16_0" + "/" + self._model.name + " " + \
            time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        n_output = 1
        dynamic_features = self._data_set.dynamic_features
        labels = self._data_set.labels
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)
        train_dynamic_features, test_dynamic_features, train_labels, test_labels = \
            split_logistic_data(dynamic_features, labels)
        for i in range(5):
            # Rebalance each training fold before fitting.
            train_dynamic_res, train_labels_res = imbalance_preprocess(
                train_dynamic_features[i], train_labels[i], 'LogisticRegression')
            train_set = DataSet(train_dynamic_res, train_labels_res)
            test_set = DataSet(test_dynamic_features[i].reshape(-1, 92),
                               test_labels[i].reshape(-1, 1))
            self._model.fit(train_set, test_set)
            y_score = self._model.predict(test_set)
            tol_pred = np.vstack((tol_pred, y_score))
            tol_label = np.vstack((tol_label, test_labels[i]))
            print("Cross validation: {} of {}".format(i + 1, 5),
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        tol_test_index = np.arange(labels.shape[0] * labels.shape[1])
        evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        self._model.close()
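# A hypothetical driver for the experiment class above; running it performs
# the five train/test splits and presumably writes its evaluation output
# under result_9_16_0 via evaluate().
if __name__ == "__main__":
    experiment = LogisticRegressionExperiment()
    experiment.do_experiments()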
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import expit
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


def logistic_test():
    # Assumes the local LogisticRegression / LinearRegression implementations
    # (exposing .theta and .bias) are in scope.
    X, y = make_classification(
        n_features=1,
        n_classes=2,
        n_redundant=0,
        n_informative=1,
        n_clusters_per_class=1,
        class_sep=0.75,
        shuffle=True,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=0)
    df_test = pd.DataFrame(data=[X_test.flatten(), y_test]).T
    df_test.columns = ["X", "y"]

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    score = [1 if yi == yi_pred else 0 for yi, yi_pred in zip(y_test, y_pred)]
    print(np.sum(score) / len(score))  # test-set accuracy

    # and plot the result
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.scatter(X_train.ravel(), y_train, color="black", zorder=20)
    df_test["proba"] = expit(X_test * lr.theta + lr.bias).ravel()
    df_test = df_test.sort_values("X")
    plt.plot(df_test["X"], df_test["proba"], color="red", linewidth=3)

    ols = LinearRegression()
    ols.fit(X_train, y_train)
    plt.plot(X_test, ols.theta * X_test + ols.bias, linewidth=1)
    plt.axhline(0.5, color=".5")

    plt.ylabel("y")
    plt.xlabel("X")
    plt.xticks(range(-5, 10))
    plt.yticks([0, 0.5, 1])
    plt.ylim(-0.25, 1.25)
    plt.xlim(-2, 2)
    plt.legend(
        ("Logistic Regression Model", "Linear Regression Model"),
        loc="lower right",
        fontsize="small",
    )
    plt.tight_layout()
    plt.show()
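# The red curve plotted above is the logistic (sigmoid) response that the
# fitted model's probabilities follow: sigmoid(z) = 1 / (1 + exp(-z)) with
# z = theta * x + bias. scipy exposes this as expit; a standalone check:
import numpy as np
from scipy.special import expit

z = np.linspace(-5.0, 5.0, 11)
assert np.allclose(expit(z), 1.0 / (1.0 + np.exp(-z)))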
autoencoder.eval()
classifier.eval()
predictions = list()
labels = list()
corr = 0  # count of correctly classified examples
for idx, (data_batch, targets_batch, _) in enumerate(test_loader):
    if args.num_certify is not None and idx >= args.num_certify:
        break
    time_start = time.time()
    data_batch = data_batch.double()
    # Classify in the autoencoder's latent space.
    latent_data = autoencoder.encode(data_batch)
    y_pred = classifier.predict(latent_data).detach()
    predictions.append(y_pred.cpu().unsqueeze(0))
    labels.append(targets_batch.detach().cpu())
    if y_pred == targets_batch[0]:
        corr += 1
    # Build the per-example batches the constraint oracle expects.
    x_batches, y_batches = list(), list()
    k = 1
    for i in range(oracle.constraint.n_tvars):
        x_batches.append(data_batch[i:i + k])
        y_batches.append(targets_batch[i:i + k])
    if oracle.constraint.n_gvars > 0:
        domains = oracle.constraint.get_domains(x_batches, y_batches)
x_0 = scaleFeature(x_0)
x_1 = scaleFeature(x_1)
X = np.transpose(np.vstack((x_0, x_1)))  # Stack the features into an (n, 2) matrix

lrClassifier.train(X, y, 5, 500)          # Train the model
predictions = lrClassifier.predict(X)     # Training-set predictions
evaluateBinaryClassifier(predictions, y)  # Evaluate training-set predictions

# Plot the data and the learned decision boundary
plt.plot(x_0[np.where(y == 0)], x_1[np.where(y == 0)], 'o', c="b")
plt.plot(x_0[np.where(y == 1)], x_1[np.where(y == 1)], 'o', c="r")
plt.xlabel("x_0")
plt.ylabel("x_1")
plt.title("Learned Decision Boundary for the Generated Dataset")

# Generate the decision boundary
boundary_x = np.linspace(-0.5, 0.5, 25)
param = lrClassifier.parameters
boundary_y = (-1 / param[2]) * (param[0] + boundary_x * param[1])

# Plot the decision boundary
plt.plot(boundary_x, boundary_y, c="k")
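# Where the boundary formula comes from: the classifier predicts class 1 when
#   sigmoid(theta_0 + theta_1 * x_0 + theta_2 * x_1) >= 0.5,
# i.e. when theta_0 + theta_1 * x_0 + theta_2 * x_1 >= 0. Setting that
# argument to zero and solving for x_1 yields the line plotted above:
#   x_1 = -(theta_0 + theta_1 * x_0) / theta_2
# which matches boundary_y = (-1 / param[2]) * (param[0] + boundary_x * param[1]),
# assuming param = [theta_0, theta_1, theta_2].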
class LogisticRegressionExperiment(object):
    def __init__(self, event_type):
        self._event_type = event_type
        self._data_set = read_data(event_type)
        self._num_features = self._data_set.dynamic_feature.shape[2]
        self._time_steps = self._data_set.dynamic_feature.shape[1]
        self._n_output = self._data_set.labels.shape[1]
        print(event_type)
        self._model_format()
        self._check_path()

    def _model_format(self):
        if self._event_type == "qx":
            learning_rate, max_loss, max_pace, lasso, ridge = lr_qx_setup.all
        elif self._event_type == "cx":
            learning_rate, max_loss, max_pace, lasso, ridge = lr_cx_setup.all
        else:
            learning_rate, max_loss, max_pace, lasso, ridge = lr_xycj_setup.all
        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=ExperimentSetup.batch_size,
            epochs=ExperimentSetup.epochs,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            max_pace=max_pace,
            lasso=lasso,
            ridge=ridge)

    def _check_path(self):
        if not os.path.exists("average_result_cx_TEST" + self._event_type):
            os.makedirs("average_result_cx_TEST" + self._event_type)
        self._filename = "average_result_cx_TEST" + self._event_type + "/" + \
            self._model.name + " " + \
            time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        dynamic_feature = self._data_set.dynamic_feature
        labels = self._data_set.labels
        kf = sklearn.model_selection.StratifiedKFold(
            n_splits=ExperimentSetup.kfold, shuffle=False)
        n_output = labels.shape[1]  # number of classes
        tol_test_index = np.zeros(shape=0, dtype=np.int32)
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)
        i = 1
        # k-fold cross-validation
        for train_idx, test_idx in kf.split(X=dynamic_feature, y=labels.reshape(-1)):
            train_dynamic = dynamic_feature[train_idx]
            train_y = labels[train_idx]
            # Handle the imbalanced training data with SMOTE oversampling.
            train_dynamic_res, train_y_res = imbalance_preprocess(
                train_dynamic, train_y)
            test_dynamic = dynamic_feature[test_idx]
            test_y = labels[test_idx]
            train_set = DataSet(train_dynamic_res, train_y_res)
            test_set = DataSet(test_dynamic, test_y)
            self._model.fit(train_set, test_set, self._event_type)
            y_score = self._model.predict(test_set)
            tol_test_index = np.concatenate((tol_test_index, test_idx))
            tol_pred = np.vstack((tol_pred, y_score))
            tol_label = np.vstack((tol_label, test_y))
            print("Cross validation: {} of {}".format(i, ExperimentSetup.kfold),
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            i += 1
        evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        self._model.close()
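# A hypothetical driver, assuming "qx" is one of the supported event types
# (the class also handles "cx" and falls back to lr_xycj_setup otherwise):
if __name__ == "__main__":
    LogisticRegressionExperiment("qx").do_experiments()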
k_priv_train = gaussian_kernel(x_priv_train, x_priv_train)

# loop over the logistic regression model parameter space
mdl = LogisticRegression()
params = mdl.hyper_parameters()
for p in params:
    # train the model
    t_start = time.time()
    success = mdl.train(x_norm_train, y_train, gamma=p['gamma'])
    # did we succeed?
    if success:
        # test the model with linear features
        y_hat = mdl.predict(x_norm_valid)
        # get metrics
        recall, precision, f1 = metrics(y_valid, y_hat)
        save_result(t, 'lr', 'linear', recall, precision, f1, C=None, gamma=p['gamma'])
        # print result
        t_elapsed = time.time() - t_start
        print('Logistic Regression w/ gamma = {:.2e}'.format(p['gamma']) +
              ' | Precision = {:.4f}, Recall = {:.4f}, F1 = {:.4f}'.format(precision, recall, f1) +
              ' | Time = {:.2f} seconds'.format(t_elapsed))

# loop over the SVM model parameter space
mdl = SVM()
params = mdl.hyper_parameters()
for p in params:
pre = metric['precision']
print(
    f'\n\nLearning rate {best_lr}: accuracy={acc}\trecall={rec}\tprecision={pre}'
)
print('*************\n\n')

# 4 - Logistic regression with mini-batch and ridge regularization
# 4.a - Fit the selected model
print("BEST MODEL OBTAINED (Least Squares)")
print(f'Hyperparameters: bias: {best_bias} \t Learning Rate {best_lr}')
logistic_regression = LogisticRegression(best_bias)
logistic_regression.fit(X_train, y_train.reshape(-1, 1), best_lr, b, epochs, None)
predictions = logistic_regression.predict(X_test)
metrics = [Accuracy(), Precision(), Recall()]
results = {}
for metric in metrics:
    name = metric.__class__.__name__
    results[name] = metric(y_test, predictions[:, 0])
    print('{metric}: {value}'.format(metric=name, value=results[name]))
print('*************\n\n')
"""
A logistic regression model is trained with ridge regularization in the cost
function: a second term, known as the shrinkage penalty, is added to the
least-squares-based cost. Its effect is that the coefficients minimizing the
expression stay small, tending toward zero as lambda grows; essentially it
constrains the norm of the parameter vector. The advantage of this method
follows from the bias-variance trade-off: as lambda grows, the model becomes
more rigid, with the consequent reduction in variance and increase in bias.
The result should be better performance on the test set, because the model
gains the ability to generalize.
"""
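# A minimal sketch of the ridge-penalized cost described above, assuming a
# sigmoid hypothesis and the least-squares base loss mentioned in the text;
# the function name and arguments are illustrative, not part of the code above.
import numpy as np

def ridge_cost(w, X, y, lam):
    """Least-squares loss on sigmoid outputs plus the shrinkage penalty lam * ||w||^2."""
    h = 1.0 / (1.0 + np.exp(-X @ w))              # sigmoid predictions
    least_squares = np.mean((h - y) ** 2)          # base loss
    return least_squares + lam * np.sum(w ** 2)    # larger lam -> smaller coefficients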