Example #1
    def __init__(self):
        # Load the bank marketing data and fit on the dummy-variable columns
        # (18..36). DataFrame.ix was removed in pandas 1.0; use .iloc instead.
        bank_full = pd.read_csv('data/bank_full_w_dummy_vars.csv')
        X = bank_full.iloc[:, 18:37].values
        y = bank_full.iloc[:, 17].values
        LogReg = LogisticRegression()
        LogReg.fit(X, y)
        self.model = LogReg
Example #2
File: kfolds.py Project: mluzu/iia
def lg_k_folds(X_train, y_train, lr, b, epochs, lamda, bias, k=5, verbose=False):
    results = {
        'accuracy': [],
        'recall': [],
        'precision': []
    }
    metric_means = {}
    accuracy = Accuracy()
    recall = Recall()
    precision = Precision()
    chunk_size = len(X_train) // k

    logistic_regression = LogisticRegression(bias)

    # Slide a validation window of chunk_size over the data; when len(X_train)
    # is not divisible by k, the remainder becomes one extra, smaller fold.
    for i in range(0, len(X_train), chunk_size):
        end = min(i + chunk_size, len(X_train))
        new_X_valid = X_train[i: end]
        new_y_valid = y_train[i: end]
        new_X_train = np.concatenate([X_train[: i], X_train[end:]])
        new_y_train = np.concatenate([y_train[: i], y_train[end:]])
        logistic_regression.fit(new_X_train, new_y_train, lr, b, epochs, lamda, verbose=verbose)
        predictions = logistic_regression.predict(new_X_valid)

        results['accuracy'].append(accuracy(new_y_valid, predictions))
        results['recall'].append(recall(new_y_valid, predictions))
        results['precision'].append(precision(new_y_valid, predictions))

    metric_means['accuracy'] = np.mean(results['accuracy'])
    metric_means['recall'] = np.mean(results['recall'])
    metric_means['precision'] = np.mean(results['precision'])

    return metric_means
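
A minimal smoke test for lg_k_folds on synthetic binary data may help; this is a sketch that assumes the project's custom LogisticRegression, Accuracy, Recall and Precision classes are importable, and the lr, b, epochs and lamda values below are illustrative placeholders, not tuned settings.

import numpy as np

# Hypothetical call; hyperparameter values are placeholders.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = (X[:, 0] > 0).astype(int)

means = lg_k_folds(X, y, lr=0.01, b=16, epochs=50, lamda=0.0, bias=True, k=5)
print(means)  # {'accuracy': ..., 'recall': ..., 'precision': ...}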
Example #3
def k_fold_plot(data, k, rate, lambs, iteration, category):
    costList = []
    df = data[0]
    tdf = data[1]
    lens = len(df.columns)
    data_split = np.array_split(df, k)
    best_l = 0
    max_acc = -np.inf

    for l, lam in enumerate(lambs):
        accuracy = 0
        for i in range(k):
            # Training folds: all rows of df that are not in the i-th split
            # (drop_duplicates with keep=False removes the overlapping rows).
            dfk = pd.concat([df, data_split[i]]).drop_duplicates(keep=False)
            vdfk = data_split[i]

            X = dfk.iloc[:, 0:lens - 1]
            Y = dfk.iloc[:, lens - 1:lens]

            pX = vdfk.iloc[:, 0:lens - 1]
            pY = vdfk.iloc[:, lens - 1:lens]

            nX, nPx = data_normalized(np.array(X), np.array(pX))
            nX = pd.DataFrame(nX).astype(float)
            nPx = pd.DataFrame(nPx).astype(float)

            model = LogisticRegression(np.zeros((1, len(nX.columns)), float))
            costList = np.append(
                costList, model.fit(nX, np.array(Y), rate, lam, iteration))
            prediction = model.predict(nPx, category)
            accuracy += model.evaluate_acc(pY, prediction)

        mean_acc = accuracy / k
        if mean_acc > max_acc:
            max_acc = mean_acc
            best_l = l

    X = df.iloc[:, 0:lens - 1]
    Y = df.iloc[:, lens - 1:lens]

    pX = tdf.iloc[:, 0:lens - 1]
    pY = tdf.iloc[:, lens - 1:lens]

    nX, nPx = data_normalized(np.array(X), np.array(pX))
    nX = pd.DataFrame(nX).astype(float)
    nPx = pd.DataFrame(nPx).astype(float)

    model = LogisticRegression(np.zeros((1, len(nX.columns)), float))
    costList = np.append(
        costList, model.fit(nX, np.array(Y), rate, lambs[best_l], iteration))
    prediction = model.predict(nPx, category)
    acc = model.evaluate_acc(pY, prediction)
    matrix = model.confusion_matrix(pY, prediction, category)
    print(matrix)

    return acc, costList
Example #4
class LogisticRegressionExperiment(object):
    def __init__(self):
        self._data_set = get_pick_data("LogisticRegression")
        self._num_features = self._data_set.dynamic_features.shape[1]
        self._time_steps = 1
        self._n_output = 1
        self._model_format()
        self._check_path()

    def _model_format(self):
        learning_rate, max_loss, max_pace, ridge, batch_size, hidden_size, epoch, dropout = lr_setup.all
        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=batch_size,
            epochs=epoch,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            dropout=dropout,
            max_pace=max_pace,
            ridge=ridge)

    def _check_path(self):
        if not os.path.exists("result_9_16_0"):
            os.makedirs("result_9_16_0")
        self._filename = "result_9_16_0" + "/" + self._model.name + " " + \
                         time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        n_output = 1
        dynamic_features = self._data_set.dynamic_features
        labels = self._data_set.labels
        # tol_test_index = np.zeros(shape=0, dtype=np.int32)
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)
        train_dynamic_features, test_dynamic_features, train_labels, test_labels = \
            split_logistic_data(dynamic_features, labels)
        for i in range(5):
            train_dynamic_res, train_labels_res = imbalance_preprocess(
                train_dynamic_features[i], train_labels[i],
                'LogisticRegression')
            train_set = DataSet(train_dynamic_res, train_labels_res)
            test_set = DataSet(test_dynamic_features[i].reshape(-1, 92),
                               test_labels[i].reshape(-1, 1))
            self._model.fit(train_set, test_set)
            y_score = self._model.predict(test_set)
            tol_pred = np.vstack((tol_pred, y_score))
            tol_label = np.vstack((tol_label, test_labels[i]))
            print("Cross validation: {} of {}".format(i + 1, 5),
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

        tol_test_index = np.arange(labels.shape[0] * labels.shape[1])
        evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        self._model.close()
Example #5
    def train_lr(self, cid):
        params = {
            "offline_model_dir": PROJECT_ROOT + "/ltr/weights/lr",
        }
        params.update(self.params_common)

        X_train, X_valid = self.load_data_by_id("train", cid), self.load_data_by_id("vali", cid)

        model = LogisticRegression("ranking", params, self.logger)
        model.fit(X_train, validation_data=X_valid)
        model.save_session()
Example #6
def logistic_test():
    # Build a simple 1-D, two-class classification problem.
    X, y = make_classification(
        n_features=1,
        n_classes=2,
        n_redundant=0,
        n_informative=1,
        n_clusters_per_class=1,
        class_sep=0.75,
        shuffle=True,
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=0)

    df_test = pd.DataFrame(data=[X_test.flatten(), y_test]).T
    df_test.columns = ["X", "y"]

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    score = [1 if yi == yi_pred else 0 for yi, yi_pred in zip(y_test, y_pred)]
    print(np.sum(score) / len(score))

    # and plot the result
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.scatter(X_train.ravel(), y_train, color="black", zorder=20)

    # Predicted probability curve from the fitted logistic model.
    df_test["proba"] = expit(X_test * lr.theta + lr.bias).ravel()
    df_test = df_test.sort_values("X")
    plt.plot(df_test["X"], df_test["proba"], color="red", linewidth=3)

    ols = LinearRegression()
    ols.fit(X_train, y_train)
    plt.plot(X_test, ols.theta * X_test + ols.bias, linewidth=1)
    plt.axhline(0.5, color=".5")

    plt.ylabel("y")
    plt.xlabel("X")
    plt.xticks(range(-5, 10))
    plt.yticks([0, 0.5, 1])
    plt.ylim(-0.25, 1.25)
    plt.xlim(-2, 2)
    plt.legend(
        ("Logistic Regression Model", "Linear Regression Model"),
        loc="lower right",
        fontsize="small",
    )
    plt.tight_layout()
    plt.show()
Example #7
class LogisticRegressionExperiment(object):
    def __init__(self, event_type):
        self._event_type = event_type
        self._data_set = read_data(event_type)
        self._num_features = self._data_set.dynamic_feature.shape[2]
        self._time_steps = self._data_set.dynamic_feature.shape[1]
        self._n_output = self._data_set.labels.shape[1]
        print(event_type)
        self._model_format()
        self._check_path()

    def _model_format(self):
        if self._event_type == "qx":
            learning_rate, max_loss, max_pace, lasso, ridge = lr_qx_setup.all
        elif self._event_type == "cx":
            learning_rate, max_loss, max_pace, lasso, ridge = lr_cx_setup.all
        else:
            learning_rate, max_loss, max_pace, lasso, ridge = lr_xycj_setup.all
        self._model = LogisticRegression(
            num_features=self._num_features,
            time_steps=self._time_steps,
            n_output=self._n_output,
            batch_size=ExperimentSetup.batch_size,
            epochs=ExperimentSetup.epochs,
            output_n_epoch=ExperimentSetup.output_n_epochs,
            learning_rate=learning_rate,
            max_loss=max_loss,
            max_pace=max_pace,
            lasso=lasso,
            ridge=ridge)

    def _check_path(self):
        if not os.path.exists("average_result_cx_TEST" + self._event_type):
            os.makedirs("average_result_cx_TEST" + self._event_type)
        self._filename = "average_result_cx_TEST" + self._event_type + "/" + self._model.name + " " + time.strftime(
            "%Y-%m-%d-%H-%M-%S", time.localtime())

    def do_experiments(self):
        dynamic_feature = self._data_set.dynamic_feature
        labels = self._data_set.labels
        kf = sklearn.model_selection.StratifiedKFold(
            n_splits=ExperimentSetup.kfold, shuffle=False)

        n_output = labels.shape[1]  # classes

        tol_test_index = np.zeros(shape=0, dtype=np.int32)
        tol_pred = np.zeros(shape=(0, n_output))
        tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)
        i = 1
        for train_idx, test_idx in kf.split(X=dynamic_feature,
                                            y=labels.reshape(-1)):  # k-fold cross validation
            train_dynamic = dynamic_feature[train_idx]
            train_y = labels[train_idx]
            train_dynamic_res, train_y_res = imbalance_preprocess(
                train_dynamic, train_y)  # balance the training set with SMOTE oversampling

            test_dynamic = dynamic_feature[test_idx]
            test_y = labels[test_idx]

            train_set = DataSet(train_dynamic_res, train_y_res)
            test_set = DataSet(test_dynamic, test_y)

            self._model.fit(train_set, test_set, self._event_type)

            y_score = self._model.predict(test_set)

            tol_test_index = np.concatenate((tol_test_index, test_idx))
            tol_pred = np.vstack((tol_pred, y_score))
            tol_label = np.vstack((tol_label, test_y))
            print(
                "Cross validation: {} of {}".format(i, ExperimentSetup.kfold),
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            i += 1
        evaluate(tol_test_index, tol_label, tol_pred, self._filename)
        self._model.close()
Example #8
acc = metric['accuracy']
rec = metric['recall']
pre = metric['precision']
print(
    f'\n\nLearning rate {best_lr}: accuracy={acc}\trecall={rec}\tprecision={pre}'
)
print('*************\n\n')

# 4 - Logistic regression with mini-batch and ridge regularization

# 4.a - Fit of the selected model

print("BEST MODEL OBTAINED (least squares)")
print(f'Hyperparameters: bias: {best_bias} \t learning rate: {best_lr}')
logistic_regression = LogisticRegression(best_bias)
logistic_regression.fit(X_train, y_train.reshape(-1, 1), best_lr, b, epochs,
                        None)
predictions = logistic_regression.predict(X_test)
metrics = [Accuracy(), Precision(), Recall()]
results = {}
for metric in metrics:
    name = metric.__class__.__name__
    results[name] = metric(y_test, predictions[:, 0])
    print('{metric}: {value}'.format(metric=name, value=results[name]))
print('*************\n\n')
"""
A logistic regression model is trained with ridge regularization added to the cost function:
a second term, known as the shrinkage penalty, is appended to the least-squares-based cost.
Its effect is that the coefficients minimizing the expression stay small, tending toward zero
as lambda grows; in essence it constrains the norm of the parameter vector.
The advantage of this method comes from the bias-variance trade-off: as lambda grows the model
becomes more rigid, with a consequent reduction in variance and increase in bias. The result
should be better generalization when lambda is chosen well.
"""
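
As a concrete reference for the shrinkage penalty described above, here is a minimal NumPy sketch of a ridge-penalized least-squares cost; the function name and signature are illustrative, not part of the project code.

import numpy as np

def ridge_cost(w, X, y, lam):
    # Least-squares data term plus the shrinkage penalty lam * ||w||^2.
    residuals = X @ w - y
    return np.mean(residuals ** 2) + lam * np.sum(w ** 2)

As lam grows the penalty dominates and the minimizing w is pulled toward zero: the model becomes more rigid, variance drops, and bias rises, as described above.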