Example #1
File: main.py Project: YanpeiTian/CS229
def main(train_path, valid_path, save_path):
    """Problem: Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    x_valid, y_valid = util.load_dataset(valid_path, add_intercept=False)
    x_train = x_train[:, 1:]
    x_valid = x_valid[:, 1:]

    # normalize the data (skip the binary last column), using training statistics:
    mu = np.mean(x_train[:, :-1], axis=0)
    sigma = np.std(x_train[:, :-1], axis=0)
    x_train[:, :-1] = (x_train[:, :-1] - mu) / sigma
    x_valid[:, :-1] = (x_valid[:, :-1] - mu) / sigma

    # add intercept for logistic regression:
    x_train = util.add_intercept(x_train)
    x_valid = util.add_intercept(x_valid)

    clf = logistic.LogisticRegression(step_size=1, max_iter=100000000)
    clf.fit(x_train, y_train)

    y_pred_prob = clf.predict(x_valid)
    y_pred = y_pred_prob.round()

    print(classification_report(y_valid, y_pred))
    print(confusion_matrix(y_valid, y_pred))
    print(np.sum(y_valid))

    np.savetxt(save_path, y_pred)
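The snippet standardizes only the continuous columns and reuses the training mean and standard deviation on the validation split. A minimal, self-contained sketch of that pattern on toy data (shapes and column layout are hypothetical):

import numpy as np

# Toy data: two continuous columns plus one binary column (left unscaled).
rng = np.random.default_rng(0)
x_train = np.column_stack([rng.normal(5, 2, 100), rng.normal(-1, 3, 100),
                           rng.integers(0, 2, 100).astype(float)])
x_valid = np.column_stack([rng.normal(5, 2, 40), rng.normal(-1, 3, 40),
                           rng.integers(0, 2, 40).astype(float)])

# Standardize continuous features with statistics from the training split only.
mu = x_train[:, :-1].mean(axis=0)
sigma = x_train[:, :-1].std(axis=0)
x_train[:, :-1] = (x_train[:, :-1] - mu) / sigma
x_valid[:, :-1] = (x_valid[:, :-1] - mu) / sigma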
Example #2
def backward_selection(x_train, y_train, x_valid, y_valid):
    n = x_train.shape[0]  # number of examples
    d = x_train.shape[1]  # number of features
    # Wrapper feature selection: backward search
    remove_list = []
    F_list = np.arange(d).tolist()
    score_all = []
    index = np.arange(d).tolist()
    i = 0  # iteration times
    while len(F_list) > 0:
        i += 1
        remove_f = []
        score_f = []
        for k in range(d):
            if k in F_list:
                remove_f.append(k)
                f = F_list[:]
                f.remove(k)
                x_train_f = x_train[:, f]
                x_valid_f = x_valid[:, f]
                # add intercept for logistic regression:
                x_train_f = util.add_intercept(x_train_f)
                x_valid_f = util.add_intercept(x_valid_f)
                clf = logistic.LogisticRegression(step_size=1,
                                                  max_iter=100000000,
                                                  verbose=False)
                clf.fit(x_train_f, y_train)
                y_pred_f_prob = clf.predict(x_valid_f)
                y_pred_f = y_pred_f_prob.round()
                f_accuracy = np.mean(y_pred_f == y_valid)
                score_f.append(f_accuracy)
                print('Acc = %.6f' % (f_accuracy), f)
        best_score = np.amax(score_f)
        best_f_index = np.argwhere(score_f == best_score)
        best_f_index = best_f_index.flatten().tolist()

        remove_all = True
        if remove_all:
            for f_index in best_f_index:
                best_f = remove_f[f_index]
                remove_list.append(best_f)
                F_list.remove(best_f)
                score_all.append(best_score)
                index[len(remove_list) - 1] = i
                print('')
                print('Acc_best = %.6f' % (best_score), F_list)
                print('')
        else:
            if len(best_f_index) == 1:
                f_index = best_f_index[0]
            else:  # more than one best choice
                f_index = random.choice(best_f_index)
            best_f = remove_f[f_index]
            remove_list.append(best_f)
            F_list.remove(best_f)
            score_all.append(best_score)
            print('Acc_best = %.6f' % (best_score), F_list)

    return remove_list, score_all, index
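The function above specializes a generic backward-elimination loop to logistic-regression validation accuracy. A minimal skeleton of that loop with a stand-in score function (purely illustrative; here stopping once a single feature remains):

import numpy as np

def backward_eliminate(score_fn, d):
    # At each round, drop the feature whose removal yields the best score.
    remaining = list(range(d))
    history = []
    while len(remaining) > 1:
        scores = {k: score_fn([f for f in remaining if f != k])
                  for k in remaining}
        worst = max(scores, key=scores.get)
        remaining.remove(worst)
        history.append((worst, scores[worst]))
    return remaining, history

# Toy score function: pretend low-index features are better.
print(backward_eliminate(lambda f: -sum(f), 5))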
Example #3
    def fit(self, x, y):
        """Run Newton's Method to minimize J(theta) for logistic regression.

        Args:
            x: Training example inputs. Shape (m, n).
            y: Training example labels. Shape (m,).
        """
        if self.intercept is True:
            x = util.add_intercept(x)

        g = lambda z: 1 / (1 + np.exp(-z))
        m, n = x.shape

        # initialize theta
        if self.theta is None:
            self.theta = np.zeros(n)

        # optimize theta
        while True:
            theta = self.theta
            # compute gradient
            h = g(x.dot(theta))
            G = -(1 / m) * x.T.dot(y - h)

            # compute Hessian: H = (1/m) * X^T diag(h * (1 - h)) X
            H = (1 / m) * (x.T * (h * (1 - h))).dot(x)
            H_inv = np.linalg.inv(H)

            # update
            self.theta = theta - H_inv.dot(G)

            # if norm is small, terminate
            if np.linalg.norm(self.theta - theta, ord=1) < self.eps:
                break
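For reference, the logistic-regression Hessian is H = (1/m) * X^T diag(h * (1 - h)) X with h = g(X theta). A self-contained sketch of one Newton update on toy data (not the repo's util/logistic modules):

import numpy as np

rng = np.random.default_rng(0)
m, n = 50, 3
x = np.column_stack([np.ones(m), rng.normal(size=(m, n - 1))])
y = (rng.random(m) < 0.5).astype(float)
theta = np.zeros(n)

g = lambda z: 1 / (1 + np.exp(-z))
h = g(x.dot(theta))                          # hypothesis h_theta(x)
G = -(1 / m) * x.T.dot(y - h)                # gradient of J(theta)
H = (1 / m) * (x.T * (h * (1 - h))).dot(x)   # X^T diag(h(1-h)) X / m
theta = theta - np.linalg.solve(H, G)        # one Newton step (solve beats inv)
print(theta)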
Example #4
def forward_selection(x_train, y_train, x_valid, y_valid):
    n = x_train.shape[0]  # number of examples
    d = x_train.shape[1]  # number of features
    # Wrapper feature selection: forward search
    F_list = []
    score_all = []
    for i in range(d):
        add_f = []
        score_f = []
        for k in range(d):
            if k not in F_list:
                add_f.append(k)
                f = F_list + [k]
                x_train_f = x_train[:, f]
                x_valid_f = x_valid[:, f]
                # add intercept for logistic regression:
                x_train_f = util.add_intercept(x_train_f)
                x_valid_f = util.add_intercept(x_valid_f)
                clf = logistic.LogisticRegression(step_size=1,
                                                  max_iter=100000000,
                                                  verbose=False)
                clf.fit(x_train_f, y_train)
                y_pred_f_prob = clf.predict(x_valid_f)
                y_pred_f = y_pred_f_prob.round()
                f_accuracy = np.mean(y_pred_f == y_valid)
                score_f.append(f_accuracy)
                print(f, f_accuracy)
        print(score_f)
        best_score = np.max(score_f)
        best_f_index = np.argwhere(score_f == best_score)
        best_f_index = best_f_index.flatten().tolist()
        if len(best_f_index) == 1:
            best_f_index = best_f_index[0]
        else:  # more than one best choice
            best_f_index = random.choice(best_f_index)
        best_f = add_f[best_f_index]
        F_list.append(best_f)
        score_all.append(best_score)
        print('%.8f' % (best_score), F_list)

    best_score_all = np.max(score_all)
    best_score_index = np.argmax(score_all)
    F_best = F_list[:int(best_score_index) + 1]

    return F_best, F_list, score_all
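When several candidate features tie for the best score, the code above breaks the tie at random. A small sketch of that idiom (np.flatnonzero is a compact equivalent of argwhere(...).flatten(); np.isclose is used here instead of exact float equality):

import random
import numpy as np

scores = [0.91, 0.93, 0.93, 0.88]
best = np.max(scores)
ties = np.flatnonzero(np.isclose(scores, best)).tolist()
pick = ties[0] if len(ties) == 1 else random.choice(ties)
print(best, ties, pick)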
Example #5
    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,).
        """
        # *** START CODE HERE ***
        x = util.add_intercept(x)
        return x.dot(self.theta) >= 0
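The comparison above returns a boolean array; when 0/1 labels are needed (e.g., for np.savetxt or accuracy counts), an explicit cast does it. A tiny sketch with made-up numbers:

import numpy as np

theta = np.array([0.5, -1.0, 2.0])
x = np.array([[1.0, 2.0], [3.0, -1.0]])
x = np.column_stack([np.ones(len(x)), x])   # add intercept column
labels = (x.dot(theta) >= 0).astype(int)    # booleans -> 0/1 labels
print(labels)                               # [1 0]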
Example #6
def main(tau, train_path, eval_path):
    """Problem 5(b): Locally weighted regression (LWR)

    Args:
        tau: Bandwidth parameter for LWR.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
    """
    # Load training set
    x_train_org, y_train, x_eval_org, y_eval, data_frame = util.load_dataset_new(train_path, eval_path)

    # Feature scaling: fit the scaler on the training split, reuse it on eval
    sc_X = StandardScaler()
    x_train = util.add_intercept(sc_X.fit_transform(x_train_org))
    x_eval = util.add_intercept(sc_X.transform(x_eval_org))
    #all_zeros = np.where(~x_train.any(axis=0))[0]
    #print(all_zeros)


    print("Train shape:" + str(x_train.shape))
    print("Eval shape:" + str(x_eval.shape))
    # Fit a LWR model
    clf = LocallyWeightedLinearRegression(tau)
    clf.fit(x_train, y_train, 0.1)
    y_train_out_real = np.dot(x_train, clf.theta)

    #print(y_train_out)
    p_eval = clf.predict(x_eval)

    def give_error(y_out, y):
        # despite its name, this returns accuracy: the fraction of exact matches
        cnt = 0
        for i in range(len(y_out)):
            if y_out[i] == y[i]:
                cnt += 1
        return cnt / len(y_out)
    #print(give_error(p_eval,y_eval))
    print(p_eval, y_eval)
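The scaler is fit on the training split and only applied to the evaluation split. A minimal sklearn sketch of that pattern:

import numpy as np
from sklearn.preprocessing import StandardScaler

x_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
x_eval = np.array([[2.0, 25.0]])

sc = StandardScaler()
x_train_s = sc.fit_transform(x_train)  # learn mean/std on the training split
x_eval_s = sc.transform(x_eval)        # reuse those statistics on eval
print(x_eval_s)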
Example #7
def main_h(train_path, eval_path, fig_path):
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # it's okay without / (1. + median)
    median = np.median(x_train, axis=0)
    x_train = np.log((1. + x_train) / (1. + median))
    x_train = util.add_intercept(x_train)

    # Train a logistic regression classifier
    lr = p01b_logreg.LogisticRegression()
    lr.fit(x_train, y_train)

    # Train a GDA classifier
    gda = p01e_gda.GDA()
    gda.fit(x_train[:, 1:], y_train)

    # Plot decision boundary on top of validation set
    x_val, y_val = util.load_dataset(eval_path, add_intercept=False)
    x_val = np.log((1. + x_val) / (1. + median))
    x_val = util.add_intercept(x_val)
    plot_all(x_val, y_val, lr.theta, gda.theta, fig_path)
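The log transform above maps each column's median to 0 and compresses large values. A tiny sketch:

import numpy as np

x = np.array([[0., 3., 9.],
              [1., 1., 4.]])
median = np.median(x, axis=0)
x_log = np.log((1. + x) / (1. + median))  # a value equal to the column median maps to 0
print(x_log)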
Example #8
File: gda.py Project: g-ych/CS229
    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (n_examples, dim).

        Returns:
            Outputs of shape (n_examples,).
        """
        # *** START CODE HERE ***
        x = util.add_intercept(x)
        probs = sigmoid(x.dot(self.theta))
        preds = (probs >= 0.5).astype(int)
        return preds
Example #9
    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,).
        """
        # *** START CODE HERE ***
        # we do not assume that intercept is added.
        def sigmoid(z): return 1 / (1 + np.exp(-z))
        x = util.add_intercept(x)
        preds = (sigmoid(x.dot(self.theta.T)) >= 0.5).astype('int')
        return preds
Example #10
    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,).
        """
        # *** START CODE HERE ***
        # we do not assume that intercept is added.
        sigmoid = lambda z: 1 / (1 + np.exp(-z))
        x = util.add_intercept(x)
        probs = sigmoid(x.dot(self.theta))
        preds = (probs >= 0.5).astype(int)
        return preds
Example #11
    def predict(self, x, p):
        """Make a prediction given new inputs x.

        Args:
            p: Cut-off probability
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,).
        """
        sigmoid = lambda z: 1 / (1 + np.exp(- z))
        if self.intercept is True:
            x = util.add_intercept(x)

        probs = sigmoid(np.dot(x, self.theta))
        preds = (probs >= p).astype(np.float64)
        return preds
Example #12
    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,).
        """

        # *** START CODE HERE ***
        def sigmoid(z):
            return 1 / (1 + np.exp(-z))

        x = util.add_intercept(x)
        probability = sigmoid(x.dot(self.theta))
        predictions = (probability >= 0.5).astype(int)
        return predictions
Example #13
    def predict(self, x, p=None):
        """Make a prediction given new inputs x.

        Args:
            p: Cut-off probability
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,).
        """
        if self.intercept is True:
            x = util.add_intercept(x)

        g = lambda z: 1 / (1 + np.exp(-z))
        preds = g(x.dot(self.theta))
        if p is not None:
            preds = (preds >= p).astype(np.float64)
        return preds
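This variant returns probabilities when p is None and hard labels otherwise. A quick sketch of both modes with made-up scores:

import numpy as np

g = lambda z: 1 / (1 + np.exp(-z))
probs = g(np.array([-2.0, 0.0, 1.5]))
print(probs)                               # approx [0.1192 0.5 0.8176]
print((probs >= 0.5).astype(np.float64))   # [0. 1. 1.]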
Example #14
def main(lr, train_path, eval_path, pred_path):
    """Problem 3(d): Poisson regression with gradient ascent.

    Args:
        lr: Learning rate for gradient ascent.
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load training set
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Fit a Poisson Regression model
    x_train_ex = util.add_intercept(x_train)
    model = PoissonRegression(step_size=lr, max_iter=1000)
    model.fit(x_train_ex, y_train)

    # Run on the validation set, and use np.savetxt to save outputs to pred_path
    x_val, _ = util.load_dataset(eval_path, add_intercept=True)
    np.savetxt(pred_path, model.predict(x_val))
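PoissonRegression here is the repo's own class; for intuition, gradient ascent on the Poisson log-likelihood uses the canonical-link gradient X^T (y - exp(X theta)). A self-contained sketch of one step on toy data (this is an assumption about what fit does, not the repo's code):

import numpy as np

rng = np.random.default_rng(0)
m = 100
x = np.column_stack([np.ones(m), rng.normal(size=m)])
y = rng.poisson(lam=np.exp(0.5 + 0.3 * x[:, 1]))

theta = np.zeros(2)
lr = 1e-3
# one ascent step on the mean log-likelihood
theta = theta + lr * x.T.dot(y - np.exp(x.dot(theta))) / m
print(theta)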
Example #15
    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (m, n).

        Returns:
            Outputs of shape (m,)
        """
        # *** START CODE HERE ***
        x = util.add_intercept(x)
        m, n = x.shape
        g = lambda z: 1 / (1 + np.exp(-z))
        test = g(x.dot(self.theta))
        y_pred = test.copy()
        for i in range(m):
            if y_pred[i] > 0.5:
                y_pred[i] = 1
            else:
                y_pred[i] = 0
        return y_pred
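The element-wise loop above can be written as one vectorized expression; a one-line equivalent (note the strict > 0.5, matching the loop):

import numpy as np

test = np.array([0.2, 0.5, 0.9])
y_pred = np.where(test > 0.5, 1.0, 0.0)
print(y_pred)  # [0. 0. 1.]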
Example #16
def main(train_path, valid_path, save_path):
    """Problem: Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier
    classification = GDA()
    classification.fit(x_train, y_train)

    # Plot decision boundary on validation set
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)
    util.plot(x_eval, y_eval, classification.theta,
              save_path.replace('.txt', '.png'))
    x_eval = util.add_intercept(x_eval)
    np.savetxt(save_path, classification.predict(x_eval))
Example #17
def main(train_path, eval_path, pred_path):
    """Problem 1(e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Load dataset
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)

    # *** START CODE HERE ***
    # Train a GDA classifier
    clf = GDA()
    clf.fit(x_train, y_train)
    # Plot decision boundary on validation set
    x_val, y_val = util.load_dataset(eval_path, add_intercept=False)
    y_pred = clf.predict(x_val)
    x_val = util.add_intercept(x_val)
    print(clf.theta_0)
    # Use np.savetxt to save outputs from validation set to pred_path
    util.plot(x_val, y_val, clf.theta_0, '{}.png'.format(pred_path))
    np.savetxt(pred_path, y_pred)
Example #18
def main(file1):
    print("Running main")
    train_path = "output/flights_pass_1_na_0.csv"
    eval_path = "testinput/all_test_with_failures_clean.csv"
    #X, Y, X_test, Y_test, dataset = util.load_dataset_new(train_path, eval_path)
    x_train_org, y_train, x_valid_org, y_valid, dataset = util.load_dataset_new(
        train_path, eval_path)

    sc_X = StandardScaler()
    x_train = util.add_intercept(sc_X.fit_transform(x_train_org))
    x_valid = util.add_intercept(sc_X.transform(x_valid_org))

    # Plot correlation matrix
    corr_after_dropping = dataset.corr()
    labels = corr_after_dropping.columns.values
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(corr_after_dropping, vmin=-1, vmax=1)
    fig.colorbar(cax)
    fig.colorbar(cax)
    ticks = np.arange(0, len(corr_after_dropping.columns), 1)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    #ax.set_xticklabels(labels, size=1)
    ax.set_yticklabels(labels, size=5)
    plot_path = 'output/correlation_plot'
    plt.savefig(plot_path)

    ##Scatter
    #headers = list(dataset.columns.values)
    ##scatter = pd.DataFrame(dataset, columns=headers)
    #scatter = pd.DataFrame(dataset)
    #my_scatter = scatter_matrix(scatter)
    #plt.savefig("output/flightscatter")

    def give_error(y_out, y):
        # despite its name, this returns accuracy: the fraction of exact matches
        cnt = 0
        for i in range(len(y_out)):
            if y_out[i] == y[i]:
                cnt += 1
        return cnt / len(y_out)

    ##Normal Eq
    tau = 0.1
    lwr = LinearReg_normal_eq_locally_weighted(tau)
    lwr.x_train = x_train
    lwr.y_train = y_train
    lwr.x_valid = x_valid
    theta_train = lwr.fit(x_train, y_train, 0.05)

    y_train_out = sigmoid(x_train, theta_train)
    y_valid_out_ne = sigmoid(x_valid, theta_train)

    y_train_out_1 = np.where(y_train_out > 0.65, 1, 0)
    y_valid_out_ne_1 = np.where(y_valid_out_ne > 0.65, 1, 0)

    print(give_error(y_valid_out_ne_1, y_valid))
    print(give_error(y_train_out_1, y_train))
    ##print(y_valid_out_ne_1)
    #print(y_valid_out_ne)
    ##print(y_valid)
    ##LWR
    ###tau_array = np.array([10])
    ###r2_valid_lwr = 0
    ###for i in range(0, len(tau_array)):
    ###    lwr.tau = tau_array[i]
    ###    y_valid_out_lwr = lwr.predict(x_valid)
    ###    y_valid_out_lwr_1 = np.where(y_valid_out_lwr > 0.65, 1, 0)
    ###    print(give_error(y_valid_out_lwr_1, y_valid))

    ##Gradient descent
    linear_reg = LinearRegression_gradient_descent()
    linear_reg.x_train = x_train
    linear_reg.y_train = y_train
    l1_l2_factor = np.array([1, 2])
    ##learning_rate = 4.85e-5
    lambda_array = np.array([10, 0.5])
    learning_rate = 1e-5
    cost_limit = 1e-12
    r2_train_gd = 0
    r2_valid_gd = 0
    for i in range(0, len(l1_l2_factor)):
        ##for i in range(0, 0):
        theta_train = linear_reg.fit(x_train, y_train, lambda_array[i],
                                     learning_rate, cost_limit,
                                     l1_l2_factor[i])

        y_train_out = linear_reg.predict(x_train)
        y_valid_out = linear_reg.predict(x_valid)

        y_train_out_1 = np.where(y_train_out > 0.6, 1, 0)
        y_valid_out_1 = np.where(y_valid_out > 0.6, 1, 0)

        print(give_error(y_valid_out_1, y_valid))
        print(give_error(y_train_out_1, y_train))
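The correlation-matrix plot in the middle of main boils down to a small matshow recipe. A self-contained sketch with random data:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame(np.random.default_rng(0).normal(size=(50, 4)),
                  columns=list('abcd'))
corr = df.corr()

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(corr, vmin=-1, vmax=1)   # correlations live in [-1, 1]
fig.colorbar(cax)
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(corr.columns)
ax.set_yticklabels(corr.columns)
fig.savefig('correlation_plot.png')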
Example #19
import util
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.decomposition import PCA

iris = datasets.load_iris()

from sklearn.preprocessing import StandardScaler

train_path = "output/flights_pass_1_na_0.csv"
eval_path = "testinput/all_test_with_failures_clean.csv"
#X, Y, X_test, Y_test, dataset = util.load_dataset_new(train_path, eval_path)
x_train_org, y, x_valid_org, y_eval, dataset = util.load_dataset_new(train_path, eval_path)
sc_X = StandardScaler()
X_Train = util.add_intercept(sc_X.fit_transform(x_train_org))
X_Test = util.add_intercept(sc_X.transform(x_valid_org))

##X = iris.data
##y = iris.target
X = X_Train

pca = PCA(n_components=2)
Xreduced = pca.fit_transform(X)
Xtestreduced = pca.transform(X_Test)

def give_error(y_out, y):
    cnt = 0
    cntfour = 0
    for i in range(len(y_out)):
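PCA here projects the scaled features to two components, fitting on the training matrix and reusing the learned projection on the test matrix. A minimal sketch:

import numpy as np
from sklearn.decomposition import PCA

x_train = np.random.default_rng(0).normal(size=(100, 6))
x_test = np.random.default_rng(1).normal(size=(20, 6))

pca = PCA(n_components=2)
x_train_2d = pca.fit_transform(x_train)   # fit on the training split
x_test_2d = pca.transform(x_test)         # reuse the learned projection
print(x_train_2d.shape, pca.explained_variance_ratio_)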
Example #20
def main(profile, input_col, label_col, ransac, cross, profile_test):
    """
    :param profile: read profile_id, int type
    :param input_col: list of X name
    :param label_col: list of y name
    :param cross: whether it is a cross test between two profiles
    :param profile_test: the test profile if cross == True
    :return: n/a
    """
    path = 'profile_data/Profile_' + str(profile) + '.csv'
    if not cross:
        x, y = util.load_dataset(path, input_col, label_col)
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=5)
    else:
        x_train, y_train = util.load_dataset(path, input_col, label_col)

    n_label = len(label_col)  # number of label columns
    n_input = len(input_col)  # number of input columns

    # initialization for theta
    theta = np.zeros((n_label, n_input + 1))

    if cross:
        # test path
        path = 'profile_data/Profile_' + str(profile_test) + '.csv'
        x_test, y_test = util.load_dataset(path, input_col, label_col)

    high_score = 0
    for i in range(n_label):
        print('For label', label_col[i], " : ")
        kNeighbor(x_train, y_train[:, i], x_test, y_test[:, i])
        if not ransac:
            reg = LinearRegression().fit(x_train, y_train[:, i])
            theta[i, 0] = reg.intercept_
            theta[i, 1:] = reg.coef_
        else:
            reg = RANSACRegressor().fit(x_train, y_train[:, i])
            theta[i, 0] = reg.estimator_.intercept_
            theta[i, 1:] = reg.estimator_.coef_

        # score = reg.score(x_train, y_train[:, i])
        scores = cross_val_score(reg,
                                 x_test,
                                 y_test[:, i],
                                 cv=5,
                                 scoring='neg_mean_squared_error')
        score = -scores.mean()  # mean squared error (sign flipped from neg MSE)
        if score > high_score:  # tracks the largest MSE across labels
            high_score = score
            label = label_col[i]
        x_new = util.add_intercept(x_test)
        pred = x_new.dot(theta[i, :])
        den = pd.DataFrame({
            'Actual': y_test[:, i],
            'Prediction': pred,
        })
        p = den.plot.kde()
        fig = p.get_figure()
        fig.savefig("density_plot/" + label_col[i] + '_density.png')
        print('Multiple Linear Regression MSE for', label_col[i], 'is', score,
              '+-',
              scores.std() * 2)
    print('In profile', profile, 'the highest MSE is for', label, ':',
          high_score)

    # if only one input, we can plot it
    if n_input == 1:
        for i in range(n_label):
            save_path = "plots/" + \
                        input_col[0] + '_vs_' + label_col[i] + '.png'
            util.plot(x_test, y_test[:, i], theta[i, :], input_col[0],
                      label_col[i], save_path)
    print("theta is: ")
    print(theta)
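cross_val_score with scoring='neg_mean_squared_error' returns negated MSEs, so the sign is flipped back before reporting, as above. A self-contained sketch:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
x = rng.normal(size=(60, 2))
y = x.dot([1.5, -2.0]) + rng.normal(scale=0.1, size=60)

scores = cross_val_score(LinearRegression(), x, y, cv=5,
                         scoring='neg_mean_squared_error')
mse = -scores.mean()                     # flip sign back to a positive MSE
print(mse, '+-', scores.std() * 2)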