예제 #1
0
def generate_prediction(x_tr_0, y_tr_0, x_tr_1, y_tr_1, x_tr_2, y_tr_2, x_tr_3,
                        y_tr_3, x_te_0, x_te_1, x_te_2, x_te_3, jet_num_te):
    """Predict labels for a test set that was pre-split by jet_num.

    One least-squares model is fitted per group (0 to 3) on a polynomial
    expansion of that group's training data, and each model labels the
    matching test group. The four per-group predictions are then interleaved
    into a single list that follows the order of ``jet_num_te``.

    Args:
        x_tr_0 .. x_tr_3: training features of groups 0 to 3.
        y_tr_0 .. y_tr_3: training targets of groups 0 to 3.
        x_te_0 .. x_te_3: test features of groups 0 to 3.
        jet_num_te: group number of every test sample, in original order.

    Returns:
        list: one predicted label per entry of ``jet_num_te``.
    """
    # Predetermined polynomial degree of each group.
    degrees = (9, 15, 13, 12)
    train_groups = ((x_tr_0, y_tr_0), (x_tr_1, y_tr_1),
                    (x_tr_2, y_tr_2), (x_tr_3, y_tr_3))
    test_groups = (x_te_0, x_te_1, x_te_2, x_te_3)

    # Fit the four models first ...
    group_weights = []
    for (features, targets), degree in zip(train_groups, degrees):
        fitted, _ = least_squares(targets, build_poly(features, degree))
        group_weights.append(fitted)

    # ... then label each test group with its own model.
    group_labels = [
        predict_labels(fitted, build_poly(features, degree))
        for fitted, features, degree in zip(group_weights, test_groups, degrees)
    ]

    # Re-assemble the predictions in the original sample order: each group
    # keeps a cursor pointing at its next unused prediction.
    cursors = [0, 0, 0, 0]
    predicted_y_te = []
    for jet_num in jet_num_te:
        # Anything that is not 0, 1 or 2 is routed to the last group.
        group = next((g for g in (0, 1, 2) if jet_num == g), 3)
        predicted_y_te.append(group_labels[group][cursors[group]])
        cursors[group] += 1

    return predicted_y_te
예제 #2
0
    def __init__(self, model_name, w=None, learning_param=None, debug=True):
        """Store the weights, set up debugging and bind model-specific callables.

        Args:
            model_name: one of 'logistic_regression',
                'reg_logistic_regression', 'least_squares_GD',
                'ridge_regression' or 'least_squares'. Any other value leaves
                ``model_output``, ``compute_loss``, ``predict_output`` and
                ``learn`` unset.
            w: initial weights, stored unchanged on ``self.w``.
            learning_param: mapping read for the keys the chosen model needs
                ('max_iters', 'gamma' and/or 'lambda_'); it is not read for
                'least_squares' or for an unrecognised model name.
            debug: when truthy, ``self.dbg`` is a ``debugger.Debugger``
                built with ``['loss', 'w']``; otherwise ``self.dbg`` is None.
        """
        self.w = w
        self.dbg = debugger.Debugger(['loss', 'w']) if debug else None

        # Depending on the chosen model, pick the appropriate output, loss,
        # prediction and learning functions.
        if model_name == 'logistic_regression':
            self.model_output = misc.lr_output
            self.compute_loss = cost.compute_loss_ce
            self.predict_output = misc.map_prediction

            n_iters, step = learning_param['max_iters'], learning_param['gamma']
            self.learn = lambda y, x, w, dbg: impl.logistic_regression(
                y, x, w, n_iters, step, dbg)

        elif model_name == 'reg_logistic_regression':
            self.model_output = misc.lr_output
            self.compute_loss = cost.compute_loss_reg_ce
            self.predict_output = misc.map_prediction

            n_iters, step = learning_param['max_iters'], learning_param['gamma']
            penalty = learning_param['lambda_']
            self.learn = lambda y, x, w, dbg: impl.reg_logistic_regression(
                y, x, penalty, w, n_iters, step, dbg)

        elif model_name == 'least_squares_GD':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            n_iters, step = learning_param['max_iters'], learning_param['gamma']
            self.learn = lambda y, x, w, dbg: impl.least_squares_GD(
                y, x, w, n_iters, step, dbg)

        elif model_name == 'ridge_regression':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            penalty = learning_param['lambda_']
            # The closed-form solver ignores the initial weights and debugger.
            self.learn = lambda y, x, w, dbg: impl.ridge_regression(
                y, x, penalty)

        elif model_name == 'least_squares':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            # The closed-form solver ignores the initial weights and debugger.
            self.learn = lambda y, x, w, dbg: impl.least_squares(y, x)
예제 #3
0
def impute_lr(data):
    """Replace -999 placeholders in place with least-squares predictions.

    For every column that contains at least one -999, a least-squares model
    is fitted on the rows that contain no -999 at all, using the columns
    without any -999 as predictors. Each -999 cell of that column is then
    overwritten with the model's prediction for its row.

    Args:
        data: 2-D numpy array; it is modified in place.

    Returns:
        The same ``data`` array, after the replacements.
    """
    # A single boolean mask replaces the repeated per-row / per-column
    # membership scans (which were quadratic in the number of rows).
    missing = data == -999
    col_has_missing = missing.any(axis=0)
    clear_cols = np.flatnonzero(~col_has_missing)
    dirty_cols = np.flatnonzero(col_has_missing)
    clear_rows = np.flatnonzero(~missing.any(axis=1))

    # Snapshot of the complete rows, used as the training set for every fit.
    clear_samples = np.copy(data[clear_rows, :])

    for feature in dirty_cols:
        weights = imp.least_squares(clear_samples[:, feature],
                                    clear_samples[:, clear_cols])[0]
        # Only clear columns are read and only this dirty column is written,
        # so the precomputed mask stays valid throughout the loop.
        rows = np.flatnonzero(missing[:, feature])
        predictors = data[rows][:, clear_cols]
        data[rows, feature] = np.ravel(np.dot(predictors, weights))

    return data
예제 #4
0
def cross_validation(y, x, k_indices, k, degree, index_to_be_skewed):
    """Fit least squares on all folds but ``k`` and score it on both splits.

    Both partitions are min-max transformed with the training partition's
    per-column min and max, the columns in ``index_to_be_skewed`` are mapped
    through ``log(v + 1)``, and the features are expanded to degree
    ``degree``. The held-out expansion is shifted and scaled (all columns but
    the first) with the mean and std returned for the training expansion.

    Returns:
        tuple: ``(loss_tr, loss_te, min_tr, max_tr)``; both losses are the
        negated accuracy, on the training and held-out partitions.
    """
    held_out = k_indices[k]
    train_rows = np.array([
        row
        for fold_no, fold in enumerate(k_indices)
        if fold_no != k
        for row in fold
    ])

    x_train, y_train = x[train_rows], y[train_rows]
    x_test, y_test = x[held_out], y[held_out]

    min_tr = np.min(x_train, axis=0)
    max_tr = np.max(x_train, axis=0)

    # Training partition: scale, de-skew, expand and normalise.
    x_train = min_max_transform(x_train, min_tr, max_tr)
    x_train[:, index_to_be_skewed] = np.log(x_train[:, index_to_be_skewed] + 1)
    x_train_poly, mean_train, std_train = expand_and_normalize_X(x_train, degree)

    # Held-out partition: reuse the training min, max, mean and std.
    x_test = min_max_transform(x_test, min_tr, max_tr)
    x_test[:, index_to_be_skewed] = np.log(x_test[:, index_to_be_skewed] + 1)
    x_test_poly = build_poly(x_test, degree)
    x_test_poly[:, 1:] = (x_test_poly[:, 1:] - mean_train) / std_train

    w, _ = m.least_squares(y_train, x_train_poly)

    loss_tr = -accuracy(y_train, predict_labels(w, x_train_poly))
    loss_te = -accuracy(y_test, predict_labels(w, x_test_poly))
    return loss_tr, loss_te, min_tr, max_tr
예제 #5
0
def cross_validation(y, x, degree, k, k_indices, method, error, feature_augmentation, hyperparams):
    """Train ``method`` on every fold but ``k`` and evaluate it on fold ``k``.

    Args:
        y, x: full targets and features; rows are selected via ``k_indices``.
        degree: polynomial degree handed to ``build_poly``.
        k: index of the held-out fold.
        k_indices: 2-D array whose row ``i`` holds the sample indices of fold ``i``.
        method: 'rr', 'ls', 'lsGD', 'lsSGD', 'log' or 'rlog'.
        error: forwarded to ``compute_loss`` for the non-logistic methods.
        feature_augmentation: when truthy, augmented features are built with
            ``feat_augmentation`` and forwarded to ``build_poly``.
        hyperparams: sequence read positionally. 'rr' reads [0]; 'lsGD' and
            'log' read [0], [1], [2]; 'lsSGD' reads [0] to [3]; 'rlog' reads
            [3], [1], [2]. The last element is always passed as
            ``suppr_outliers`` for the training partition.

    Returns:
        tuple: ``(loss_tr, loss_te, w, acc)``.

    Raises:
        NotImplementedError: if ``method`` is not one of the names above.
    """
    from helpers_data import feature_processing, feat_augmentation, standardize, build_poly
    from implementations import ridge_regression, least_squares, least_squares_GD, least_squares_SGD, logistic_regression, reg_logistic_regression

    # Fold k is held out; the remaining folds are flattened into the train set.
    fold_ids = np.arange(k_indices.shape[0])
    te_indice = k_indices[k]
    tr_indice = k_indices[fold_ids != k].reshape(-1)

    y_tr, x_tr = y[tr_indice], x[tr_indice]
    y_te, x_te = y[te_indice], x[te_indice]

    # The median returned for the train partition is reused on the test one.
    x_tr, y_tr, median = feature_processing(
        x_tr, y_tr, 'mean', replace_feature=True,
        suppr_outliers=hyperparams[-1], threshold=3, ref_median=[])
    x_te, y_te, _ = feature_processing(
        x_te, y_te, 'mean', replace_feature=True,
        suppr_outliers=False, threshold=3, ref_median=median)

    tx_tr_aug, tx_te_aug = [], []
    if feature_augmentation:
        tx_tr_aug, index = feat_augmentation(x_tr, 0.003)
        tx_te_aug, _ = feat_augmentation(x_te, 0.003, False, index)

    # Polynomial expansion, then standardisation with the train statistics.
    tx_tr = build_poly(x_tr, degree, feature_augmentation, tx_tr_aug)
    tx_te = build_poly(x_te, degree, feature_augmentation, tx_te_aug)
    tx_tr, mean, std = standardize(tx_tr)
    tx_te, _, _ = standardize(tx_te, mean, std)

    if method == 'log':
        # logistic regression
        w, _ = logistic_regression(
            y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2])
        loss_tr = cal_loglike(y_tr, tx_tr, w)
        loss_te = cal_loglike(y_te, tx_te, w)
    elif method == 'rlog':
        # regularised logistic regression
        w, _ = reg_logistic_regression(
            y_tr, tx_tr, hyperparams[3], np.zeros(tx_tr.shape[1]),
            hyperparams[1], hyperparams[2])
        loss_tr = cal_loglike_r(y_tr, tx_tr, w, hyperparams[3])
        loss_te = cal_loglike_r(y_te, tx_te, w, hyperparams[3])
    else:
        if method == 'rr':
            # ridge regression
            w, _ = ridge_regression(y_tr, tx_tr, hyperparams[0])
        elif method == 'ls':
            # least squares
            w, _ = least_squares(y_tr, tx_tr)
        elif method == 'lsGD':
            # gradient descent
            w, _ = least_squares_GD(
                y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2])
        elif method == 'lsSGD':
            # stochastic gradient descent
            w, _ = least_squares_SGD(
                y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2],
                hyperparams[3])
        else:
            raise NotImplementedError
        loss_tr = compute_loss(y_tr, tx_tr, w, error)
        loss_te = compute_loss(y_te, tx_te, w, error)

    y_pred = predict_labels(np.array(w).T, tx_te)
    acc = accuracy(y_te, y_pred)

    return loss_tr, loss_te, w, acc
def cross_validation(y,
                     tx,
                     mlfunction,
                     split_number=5,
                     lambda_=1e-6,
                     gamma=0.001):
    """Run ``mlfunction`` under k-fold cross validation and average the scores.

    Args:
        y: targets.
        tx: feature matrix, indexed by row.
        mlfunction: name of the learner: 'ridge_regression', 'least_squares',
            'logistic_regression', 'reg_logistic_regression',
            'least_squares_sgd' or 'least_squares_gd'.
        split_number: number of folds requested from ``build_k_indices``.
        lambda_: passed to the ridge and regularised logistic learners.
        gamma: passed to the GD and SGD least-squares learners.

    Returns:
        tuple: mean train loss, mean test loss, mean train accuracy and mean
        test accuracy over all folds, or None when ``mlfunction`` is not
        recognised (a message is printed in that case).
    """
    # Per-fold metrics; every fold appends one value to each list.
    train_loss_ = []
    test_loss_ = []
    train_accuracy_ = []
    test_accuracy_ = []

    k_indices = build_k_indices(len(y), split_number)

    for ki in range(len(k_indices)):
        # Fold ki is the test set; all remaining samples form the train set.
        test_idx = np.asarray(k_indices[ki])
        train_idx = np.delete(np.arange(len(y)), test_idx)

        train_tX = tx[train_idx]
        train_y = y[train_idx]

        test_tX = tx[test_idx]
        test_y = y[test_idx]

        if mlfunction == 'ridge_regression':
            w, _ = impl.ridge_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares':
            w, _ = impl.least_squares(train_y, train_tX)
        elif mlfunction == 'logistic_regression':
            w, _ = impl.logistic_regression(train_y, train_tX)
        elif mlfunction == 'reg_logistic_regression':
            w, _ = impl.reg_logistic_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares_sgd':
            w, _ = impl.least_squares_SGD(train_y, train_tX, gamma)
        elif mlfunction == 'least_squares_gd':
            w, _ = impl.least_squares_GD(train_y, train_tX, gamma)
        else:
            print('ERROR: ml_function not recognized')
            print(
                'ridge_regression, least_squares, least_squares_gd, least_squares_sgd, logistic_regression, reg_logistic_regression'
            )
            return None

        train_loss_.append(impl.compute_loss_mse(train_y, train_tX, w))
        test_loss_.append(impl.compute_loss_mse(test_y, test_tX, w))

        # Append rather than rebind: rebinding kept only the last fold's
        # accuracy, so the returned "mean" was not an average over folds.
        train_accuracy_.append(impl.compute_accuracy(train_y, train_tX, w))
        test_accuracy_.append(impl.compute_accuracy(test_y, test_tX, w))

    return np.mean(train_loss_), np.mean(test_loss_), np.mean(
        train_accuracy_), np.mean(test_accuracy_)
예제 #7
0
def get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size):
    """Train the model selected by name and return its learned weights.

    Each trainer returns a pair; only its first element (the weights) is
    returned, the second element is discarded.

    Parameters
    ----------
    model: string
        One of "MSE_GD", "MSE_SGD", "MSE_OPT", "MSE_OPT_REG", "LOG_GD",
        "LOG_REG_GD", "LOG_REG_L1" or "MSE_GD_L1".
    y: ndarray
        The labels
    tx: ndarray
        The feature matrix
    initial_w: ndarray
        The initial weights (iterative models only)
    max_iters: integer
        The number of steps to run (iterative models only)
    gamma:
        The step size (iterative models only)
    lambda_:
        The regularization parameter (regularized models only)
    batch_size: integer
        The batch size (only used by "MSE_SGD")

    Returns
    -------
    The learned weights

    Raises
    ------
    UnknownModel
        If ``model`` is none of the names listed above.
    """
    if model == "MSE_GD":
        weights, _ = least_squares_GD(y, tx, initial_w, max_iters, gamma)
        return weights

    if model == "MSE_SGD":
        weights, _ = least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma)
        return weights

    if model == "MSE_OPT":
        weights, _ = least_squares(y, tx)
        return weights

    if model == "MSE_OPT_REG":
        weights, _ = ridge_regression(y, tx, lambda_)
        return weights

    if model == "LOG_GD":
        weights, _ = logistic_regression(y, tx, initial_w, max_iters, gamma)
        return weights

    if model == "LOG_REG_GD":
        weights, _ = reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma)
        return weights

    if model == "LOG_REG_L1":
        weights, _ = reg_logistic_regression_L1(y, tx, lambda_, initial_w, max_iters, gamma)
        return weights

    if model == "MSE_GD_L1":
        weights, _ = least_squares_GD_L1(y, tx, lambda_, initial_w, max_iters, gamma)
        return weights

    raise UnknownModel
예제 #8
0
def cross_validation_ls(y, x, k_indices, k):
    """Fit least squares without fold ``k`` and score it on fold ``k``.

    Returns:
        tuple: ``(mse_te, opt_w)``, the value of ``imp.compute_mse`` on the
        held-out fold and the fitted weights.
    """
    held_out = k_indices[k]

    # Held-out fold versus everything that remains once it is deleted.
    x_test, y_test = x[held_out], y[held_out]
    x_train = np.delete(x, [held_out], axis=0)
    y_train = np.delete(y, [held_out], axis=0)

    opt_w, _ = imp.least_squares(y_train, x_train)
    return imp.compute_mse(y_test, x_test, opt_w), opt_w
예제 #9
0
def test_least_squares(y_train, tx_train, y_test, tx_test):
    """
    Fit least_squares on the training split and report the prediction
    accuracy obtained on the test split.
    Args:
        y_train: training labels after the splitting
        tx_train: training features after the splitting
        y_test: test labels after the splitting
        tx_test: test features after the splitting
    """
    print('\nTesting least_squares...')
    # Only the weights are needed; the second returned value is ignored.
    weights, _ = least_squares(y_train, tx_train)
    report_prediction_accuracy(y_test, tx_test, weights)
    print('... testing completed.')
def find_optimal_w(tX, y, implementation, log_initial_w, log_max_iters,
                   log_gamma, decreasing_gamma, log_regulator, ridge_lambda):
    """
    Train the selected implementation and return its weights.

    Parameters
    ----------

    tX: array
        The feature matrix
    y: array
        The output
    implementation: integer
        0 for least squares, 1 for ridge regression, 2 for regularized
        logistic regression; any other value trains nothing
    log_initial_w: array
        initial weights for the regularized logistic regression
    log_max_iters: integer
        number of iterations for the regularized logistic regression
    log_gamma: float
        gamma parameter for the regularized logistic regression
    decreasing_gamma:
        forwarded as the last argument of the regularized logistic regression
    log_regulator: float
        lambda for the regularized logistic regression
    ridge_lambda: float
        lambda for the ridge regression

    Return
    ------

    optimal_w = array
        The trained weights, or None when ``implementation`` is not 0, 1 or 2.

    """
    if implementation == 0:
        weights, _ = impl.least_squares(y, tX)
        return weights

    if implementation == 1:
        weights, _ = impl.ridge_regression(y, tX, ridge_lambda)
        return weights

    if implementation == 2:
        weights, _ = impl.reg_logistic_regression(
            y, tX, log_regulator, log_initial_w, log_max_iters, log_gamma,
            decreasing_gamma)
        return weights

    # Unknown selector: nothing was trained.
    return None
예제 #11
0
def fill_missing_values(X_, deg=1, tresh=1, lambda_=1e-7):
    """Fill missing feature values with least-squares predictions.

    ``find_bad_features`` maps each feature that has missing (-999) entries
    to the row indices of those entries. Features whose missing ratio is
    below ``tresh`` are filled: a least-squares model is fitted on the rows
    where the feature is known, using the features without any missing value
    as predictors, and the missing rows receive the model's prediction. The
    other bad features are dropped from the result.

    Args:
        X_: 2-D array of samples by features; it is copied, not modified.
        deg: unused; kept for interface compatibility.
        tresh: features with a missing ratio >= ``tresh`` are dropped.
        lambda_: unused; kept for interface compatibility.

    Returns:
        Array holding the clean and filled features, in original order.
    """
    X = X_.copy()
    unknown_dict = find_bad_features(X)
    bad_features = list(unknown_dict.keys())

    # Split the bad features into those worth filling and those to drop.
    features_to_fill = [
        i for i in bad_features if (len(unknown_dict[i]) / len(X)) < tresh
    ]
    features_to_ignore = [i for i in bad_features if i not in features_to_fill]

    # Features without any missing value serve as predictors.
    clean_features = np.delete(np.arange(len(X.T)), bad_features)
    clean_X = X.T[clean_features]

    for i in features_to_fill:
        bad_rows = unknown_dict[i]
        known_rows = list(np.delete(np.arange(len(X)), bad_rows))

        w, _ = impl.least_squares(X.T[i][known_rows], clean_X.T[known_rows])
        y_bad = np.dot(clean_X.T[bad_rows], w)

        # y_bad is aligned with bad_rows: the n-th prediction belongs to the
        # n-th missing row. Indexing it with the feature index wrote one and
        # the same value into every missing cell, or raised IndexError.
        for row, value in zip(bad_rows, y_bad):
            X[row, i] = value

    feat_to_conserve = np.delete(np.arange(len(X.T)), features_to_ignore)
    return X.T[feat_to_conserve].T
예제 #12
0
# Hyper-parameter search. X_train / y_train and the cross_validation_*
# helpers are defined outside this excerpt.

# Cross validation over lambda
avg_test_accuracy_RR = cross_validation_RR(X_train, y_train, k_fold=4, seed=1)

# Cross validation over gamma
avg_test_accuracy_LR = cross_validation_LR(X_train, y_train, k_fold=4, seed=1)

# Cross validation over both gamma and lambda
g, l, avg_test_accuracy_RLR = cross_validation_RLR(X_train, y_train, k_fold=4, seed=1)

#%% Testing functions
# Each call below rebinds `w` and `loss`, so only the result of the last call
# (reg_logistic_regression) reaches the plot and the predictive step. The
# random initial weights are not seeded (the seed line is commented out), so
# runs are not reproducible.
#np.random.seed(42)

gamma = 0.2
lambda_ = 4E-5

# Closed-form least squares.
w, loss = least_squares(y = y_train, tx = X_train)
# Least squares by stochastic gradient descent, small random start.
w, loss = least_squares_SGD(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)
# Ridge regression.
w, loss = ridge_regression(y = y_train, tx = X_train, lambda_ = lambda_)
# Logistic regression; note the much larger random start (scaled by 10).
w, loss = logistic_regression(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*10, max_iters = 125000, gamma = gamma)
# Regularized logistic regression; this is the `w` used from here on.
w, loss = reg_logistic_regression(y = y_train, tx = X_train, lambda_ = lambda_, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)

# Plot the final weight vector.
plt.plot(w)

#%% Predictive step
# y_test holds the raw linear scores X_test @ w; no sigmoid or threshold is
# applied here.
y_test = X_test @ w

# Distribution of the raw scores.
plt.hist(y_test, bins=200)
예제 #13
0
def cross_validation(x,
                     y,
                     k,
                     mode,
                     gamma=None,
                     lambda_=None,
                     max_iters=None,
                     initial_w=None):
    """Run k-fold cross validation for the model selected by ``mode``.

    The data is split into ``k`` consecutive chunks (no shuffling). For each
    fold the model is trained on the other chunks and scored on the held-out
    chunk. Linear modes predict the positive class where the score is at or
    above the mean held-out score; logistic modes predict it where the
    sigmoid of the score exceeds 0.5.

    INPUT:
    @x : input data, dimensions (NxD)
    @y : target labels; the value 1 marks the positive class
    @k : number of folds
    @mode : 'linear_regression_eq', 'ridge_regression_eq',
            'linear_regression_GD', 'linear_regression_SGD',
            'logistic_regression' or 'reg_logistic_regression'
    @gamma, @max_iters, @initial_w : forwarded to the iterative trainers
    @lambda_ : forwarded to ridge regression
    OUTPUT:
    (acc, tpr, fpr, losses) : per-fold lists of accuracy, true positive rate,
    false positive rate and training loss
    RAISES:
    ValueError if ``mode`` is not one of the names above.
    """
    linear_modes = ('linear_regression_eq', 'ridge_regression_eq',
                    'linear_regression_GD', 'linear_regression_SGD')
    logistic_modes = ('logistic_regression', 'reg_logistic_regression')
    # Fail with a clear message instead of an unbound-local error later on.
    if mode not in linear_modes and mode not in logistic_modes:
        raise ValueError('Unknown mode: {!r}'.format(mode))

    x_split = np.array_split(x, k, axis=0)
    y_split = np.array_split(y, k, axis=0)

    acc = list()
    tpr = list()
    fpr = list()
    losses = list()

    for fold in range(k):
        # Every chunk except the current one forms the training set.
        x_train = np.concatenate(
            [x_split[i] for i in range(k) if i != fold], axis=0)
        y_train = np.concatenate(
            [y_split[i] for i in range(k) if i != fold], axis=0)
        x_val = x_split[fold]
        y_val = y_split[fold]

        if mode == 'linear_regression_eq':
            update, loss = imp.least_squares(y_train, x_train)
        elif mode == 'ridge_regression_eq':
            update, loss = imp.ridge_regression(y_train, x_train, lambda_)
        elif mode == 'linear_regression_GD':
            update, loss = imp.least_squares_GD(y_train, x_train, initial_w,
                                                max_iters, gamma)
        elif mode == 'linear_regression_SGD':
            update, loss = imp.least_squares_SGD(y_train, x_train, initial_w,
                                                 max_iters, gamma)
        elif mode == 'logistic_regression':
            update, loss = imp.logistic_regression(y_train, x_train, initial_w,
                                                   max_iters, gamma)
        else:
            # NOTE(review): lambda_ is not forwarded here although this is the
            # regularized variant; confirm against
            # imp.reg_logistic_regression's signature.
            update, loss = imp.reg_logistic_regression(y_train, x_train,
                                                       initial_w, max_iters,
                                                       gamma)
        losses.append(loss)

        predictions = np.dot(x_val, update)
        if mode in logistic_modes:
            # Logistic models are thresholded on the predicted probability.
            # An unconditional mean-threshold used to overwrite this decision.
            pr_bool = H.sigmoid(predictions) > 0.5
        else:
            pr_bool = predictions >= np.mean(predictions)

        y_bool = y_val == 1
        correct = pr_bool == y_bool
        tp = np.logical_and(correct, y_bool)
        fp = np.logical_and(np.logical_not(correct), pr_bool)

        acc.append(sum(correct) / float(len(y_val)))
        tpr.append(sum(tp) / float(sum(y_bool)))
        fpr.append(sum(fp) / float(sum(np.logical_not(y_bool))))
    return acc, tpr, fpr, losses
예제 #14
0
from sklearn.metrics import r2_score


# Linear regression: compare the project's closed-form, GD and SGD solvers
# on the same standardized data, then fit scikit-learn's LinearRegression.

print("Linear Regession \n ---------------- \n")
# NOTE(review): if `datasets` is sklearn.datasets, load_boston was removed in
# scikit-learn 1.2; this line only runs on older versions.
X, y = datasets.load_boston(return_X_y = True)
X, _, _ = implementations.standardize_numpy(X)
# Prepend a column of ones so the first weight acts as the intercept.
tx = np.c_[np.ones(X.shape[0]), X]

initial_w = np.zeros(tx.shape[1])

max_iters = 1000
gamma = 0.01

# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground truth has to come first.
w_lr, loss_lr = implementations.least_squares(y, tx)
y_pred_lr = tx @ w_lr
print(f"Linear regression eq: {r2_score(y, y_pred_lr)}")

w_lr_gd, loss_lr_gd = implementations.least_squares_GD(y, tx, initial_w,
                     max_iters, gamma, verbose=False)
y_pred_lr_gd = tx @ w_lr_gd
print(f"Linear regression gd: {r2_score(y, y_pred_lr_gd)}")

w_lr_sgd, loss_lr_sgd = implementations.least_squares_SGD(y, tx, initial_w,
                      max_iters, gamma, verbose=False)
y_pred_lr_sgd = tx @ w_lr_sgd
print(f"Linear regression sgd: {r2_score(y, y_pred_lr_sgd)}")

# scikit-learn fit on X (without the ones column).
reg = LinearRegression().fit(X, y)
y_pred_sk = reg.predict(X)
예제 #15
0
"""
Load the datasets, train a model, and create a Kaggle submission for the first 
Machine Learning project

Authors: Kirill IVANOV, Matthias RAMIREZ, Nicolas TALABOT
"""

### Import modules and datasets
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from implementations import least_squares
from utilities import split_data, preprocess_data

# Load both CSV files. y_test is loaded but not used in this script.
y_train, x_train, ids_train = load_csv_data("train.csv")
y_test, x_test, ids_test = load_csv_data("test.csv")

# Parameters
seed = 3        # forwarded to split_data
degree = 11     # forwarded to preprocess_data for both train and test data
ratio = 0.66    # forwarded to split_data; presumably the training share - TODO confirm

# Learn the model
# preprocess_data also returns the mean/std that are re-applied to the test
# data below.
tx, x_mean, x_std = preprocess_data(x_train, degree)
# Only the x_tr / y_tr part of the split is used for fitting; x_te / y_te are
# not used afterwards in this script.
x_tr, y_tr, x_te, y_te = split_data(tx, y_train, ratio, seed)
w, loss_tr = least_squares(y_tr, x_tr)

# Create a Kaggle submission
# The test data is preprocessed with the training mean/std rather than its own.
x_kaggle,_,_ = preprocess_data(x_test, degree, compute_mean_std=False, \
                               x_mean=x_mean, x_std=x_std)
y_pred = predict_labels(w, x_kaggle)
create_csv_submission(ids_test, y_pred, "run_submission.csv")
예제 #16
0
def cross_validation(y, tX, gamma, method='logistic_regression'):
    """Four-fold cross validation of the selected method.

    @param y: labels
    @param tX: feature matrix of shape (N, D)
    @param gamma: learning rate, used by the iterative methods
    @param method: 'logistic_regression', 'reg_logistic_regression',
        'least_squares_GD', 'least_squares_SGD', 'least_squares' or
        'ridge_regression'
    @return : (average accuracy over the four folds, list of fold accuracies)
    @raise Exception: if ``method`` is not one of the names above
    """
    _, D = tX.shape

    # Training parameters
    max_iters = 100

    # Cross validation parameters
    seed = 1
    k_fold = 4
    k_indices = build_k_indices(y, k_fold, seed)

    acc = []

    for k in range(k_fold):
        # Fold k is the test set; the other folds are stacked for training.
        yTr = np.array([])
        xTr = np.zeros((0, D))
        for i in range(k_fold):
            if i == k:
                yTe = y[k_indices[i]]
                xTe = tX[k_indices[i]]
            else:
                yTr = np.append(yTr, y[k_indices[i]], axis=0)
                xTr = np.append(xTr, tX[k_indices[i]], axis=0)

        initial_w = np.zeros(tX.shape[1])
        if method == 'logistic_regression':
            # The logistic trainers are given column-vector weights.
            initial_w = np.zeros((tX.shape[1], 1))
            w, loss = logistic_regression(yTr, xTr, initial_w, max_iters,
                                          gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'reg_logistic_regression':
            initial_w = np.zeros((tX.shape[1], 1))
            lambda_ = 0.1
            w, loss = reg_logistic_regression(yTr, xTr, lambda_, initial_w,
                                              max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'least_squares_GD':
            w, loss = least_squares_GD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares_SGD':
            w, loss = least_squares_SGD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares':
            w, loss = least_squares(yTr, xTr)
            y_label = predict_labels(w, xTe)
        elif method == 'ridge_regression':
            w, loss = ridge_regression(yTr, xTr, 0.1)
            y_label = predict_labels(w, xTe)
        else:
            raise Exception('Invalid method')

        corr = [
            True if i == yTe[ind] else False for ind, i in enumerate(y_label)
        ]
        # Divide by the real fold size: N / k_fold is not the fold size when
        # the folds do not hold exactly N / k_fold samples each.
        fold_size = len(corr)
        acc.append(sum(corr) / fold_size if fold_size else 0.0)
    return (sum(acc) / k_fold), acc
예제 #17
0

# tx0 subset. Column 0 of tX0_dropped_distribution is used as row indices
# into y; the remaining columns are the features.
print("Starting cross validation for the tx0 dataset")
print("##################################")

# The first returned value is used below as the polynomial degree of the
# final fit.
min_degree0,min_loss0=cross_validation_demo(y[tX0_dropped_distribution[:,0].astype(int)], tX0_dropped_distribution[:,1:],1,16,index_to_be_skewed0)

# Record the min, max, mean and std of the data set used for the final fit so
# they can be re-applied to the testing set later
min0= np.min(tX0_dropped_distribution[:,1:],axis=0)
max0=np.max(tX0_dropped_distribution[:,1:],axis=0)
tx0=min_max_transform(tX0_dropped_distribution[:,1:],min0,max0)
# log(v + 1) on the columns listed in index_to_be_skewed0.
tx0[:,index_to_be_skewed0]= np.log(tx0[:,index_to_be_skewed0]+1)
tx0_norm,mean0,std0=expand_and_normalize_X(tx0,min_degree0)


# Final least-squares fit on the whole tx0 subset.
w0,loss0=m.least_squares(y[tX0_dropped_distribution[:,0].astype(int)],tx0_norm)
# Bare tuple expression: shows the values in a notebook/REPL cell and has no
# effect when run as a script.
min_degree0,min_loss0,loss0





# This accuracy is measured on the same data the weights were fitted on.
print("Accuracy of best w found for tx0",accuracy(y[tX0_dropped_distribution[:,0].astype(int)],predict_labels(w0,tx0_norm)))
print("##################################")



# Same procedure for the tx1 subset.
print("Starting cross validation for the tx1 dataset")
print("##################################")

min_degree1,min_loss1=cross_validation_demo(y[tX1_dropped_distribution[:,0].astype(int)], tX1_dropped_distribution[:,1:],1,16,index_to_be_skewed1)