def load_clean_csv(data_path,
                   sub_sample=False,
                   missing_val="ignore",
                   normalized=True):
    """Load clean csv, specify data_path, sub_sample(True/False), missing_val(ignore, avg, median), normalized(True/False)
    Return yb, input_data, and ids"""
    yb, input_data, ids = load_csv_data(data_path, sub_sample)
    missing_ind = get_missing_index(input_data)

    incomplete_features = np.unique(np.where(input_data == -999.0)[1])

    if (missing_val == "avg"):
        mean = np.mean(input_data[~missing_ind], 0)
        for i in incomplete_features:
            np.place(input_data[:, i], input_data[:, i] == -999, mean[i])
    elif (missing_val == "median"):
        median = np.median(input_data[~missing_ind], 0)
        for i in incomplete_features:
            np.place(input_data[:, i], input_data[:, i] == -999, median[i])
    else:
        yb = yb[~missing_ind]
        input_data = input_data[~missing_ind]
        ids = ids[~missing_ind]

    if normalized:
        input_m = np.mean(input_data, 0)
        input_std = np.std(input_data, 0)
        input_data = (input_data - input_m) / input_std

    return yb, input_data, ids
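A minimal usage sketch of the function above; the CSV path is illustrative, and `load_csv_data` / `get_missing_index` are assumed to be importable from the project's helpers.

# Hypothetical usage of load_clean_csv; the path is an assumption.
yb, input_data, ids = load_clean_csv("data/train.csv",
                                     sub_sample=False,
                                     missing_val="median",
                                     normalized=True)
print(yb.shape, input_data.shape, ids.shape)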
Example #2
def process_data(path, inv_log=False):
    """Process the data before using it doing some engineering featuring

        :param path: path of the dataset
        :param inv_log: apply log on the positive columns of the dataset
        :return: y, processed data, masks based on pri_jet_num, ids
    """
    y, X, ids = helpers.load_csv_data(path)

    dict_mask_jets_train = helpers_us.get_jet_masks(X)

    new_X = []

    for i in range(len(dict_mask_jets_train)):
        new_X.append(np.delete(X[dict_mask_jets_train[i]], [22, 29], axis=1))

    for i in range(len(dict_mask_jets_train)):
        undefined_columns = [j for j in range(len(new_X[i][0])) if (new_X[i][:, j] < -900).all()]
        new_X[i] = np.delete(new_X[i], undefined_columns, axis=1)

    for i in range(len(dict_mask_jets_train)):
        for j in range(len(new_X[i][0])):
            col = new_X[i][:, j]
            m = np.mean(col[col >= -900])  # mean over the defined entries of the column
            col[col < -900] = m  # replace undefined (-999) entries with the column mean
            new_X[i][:, j] = col

    if inv_log:
        new_X = helpers_us.log_f(new_X)

    for i in range(1, len(dict_mask_jets_train)):
        new_X[i], x_mean, x_std = helpers_us.standardize(new_X[i])

    return y, new_X, dict_mask_jets_train, ids
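The helper `helpers_us.get_jet_masks` is not shown here; a plausible sketch, assuming it groups rows by the PRI_jet_num value stored in column 22 (events with three or more jets lumped together), would be:

# Sketch only, not the project's actual helper.
def get_jet_masks_sketch(x, jet_col=22):
    return {0: x[:, jet_col] == 0,
            1: x[:, jet_col] == 1,
            2: x[:, jet_col] == 2,
            3: x[:, jet_col] >= 3}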
Example #3
def main():
    """
    Tests the six mandatory implementations on the raw data sets. Splits the
    original training set into a new training set and a test set with the ratio
    of the new training set to the old one being 0.8. Reports the percentage of 
    correct predictions for each method. As a side note, standardization of the 
    data helps algorithms that use gradient descent, hence standardized features
    are used in those iterative algorithms.
    """
    y, tx, _ = load_csv_data('train.csv')
    y_train, tx_train, y_test, tx_test = train_test_split(y, tx, 0.8)
    standardized_tx_train, mean_tx_train, std_tx_train = standardize(tx_train)
    standardized_tx_test, _, _ = standardize(tx_test, mean_tx_train,
                                             std_tx_train)
    test_least_squares_GD(y_train, standardized_tx_train, y_test,
                          standardized_tx_test)
    test_least_squares_SGD(y_train, standardized_tx_train, y_test,
                           standardized_tx_test)
    test_least_squares(y_train, tx_train, y_test, tx_test)
    test_ridge_regression(y_train, tx_train, y_test, tx_test)
    y_train = change_labels_logistic(y_train)
    y_test = change_labels_logistic(y_test)
    test_logistic_regression(y_train, standardized_tx_train, y_test,
                             standardized_tx_test)
    test_reg_logistic_regression(y_train, standardized_tx_train, y_test,
                                 standardized_tx_test)
Example #4
def load_data_sets(y_train_jets, tx_train_jets, ids_train_jets, y_test_jets,
                   tx_test_jets, ids_test_jets):
    print(
        '\nLoading the processed training and test set data for each jet number...'
    )
    for jet_num in range(4):
        y_train, tx_train, ids_train = load_csv_data(training_files[jet_num])
        y_train_jets.append(y_train)
        tx_train_jets.append(tx_train)
        ids_train_jets.append(ids_train)
        y_test, tx_test, ids_test = load_csv_data(test_files[jet_num])
        y_test_jets.append(y_test)
        tx_test_jets.append(tx_test)
        ids_test_jets.append(ids_test)
        print('\nTraining and test set data for jet ', str(jet_num),
              ' is loaded.')
    print('\n... done.')
Example #5
def load(trainFile, testFile):
    """
    Builds various numpy arrays from the given .csv format training
    and test sets.
    Args:
        trainFile: file name/path for the input training set
        testFile: file name/path for the input test set
    Returns: 
        y_train: labels in the training set as a numpy array
        tx_train: features in the training set as a numpy array
        ids_train: ids of the training data points as a numpy array
        y_test: labels in the test set as a numpy array
        tx_test: features in the test set as a numpy array
        ids_test: ids of the test data points as a numpy array
    """
    print('\nLoading the raw training and test set data...')
    y_train, tx_train, ids_train = load_csv_data(trainFile)
    y_test, tx_test, ids_test = load_csv_data(testFile)
    print('\n... finished.')
    return y_train, tx_train, ids_train, y_test, tx_test, ids_test
Example #6
def load_data(change_labels=True):
    """
    Loads the training and testing data from disk.
    Args:
        change_labels: Convert the labels from -1/1 to 0/1 for logistic regression.
    """
    train_path = "../data/train.csv"
    test_path = "../data/test.csv"

    print('Reading from file {}'.format(train_path))
    y, tx, ids = load_csv_data(train_path, sub_sample=False)
    y = np.expand_dims(y, axis=1)

    if change_labels:
        y = np.where(y == -1, 0, y)

    print('Reading from file {}'.format(test_path))
    _, tx_submission, _ = load_csv_data(test_path, sub_sample=False)

    return tx, y, tx_submission
Example #7
def main(param):
    # load train set
    y, x, i = load_csv_data('data/train.csv', sub_sample=False)
    # load test set
    y_test, x_test, i_test = load_csv_data('data/test.csv', sub_sample=False)
    # Reshape y
    y = y.reshape(y.shape[0], 1)
    # Preprocess x (remove features with a lot of -999 values)
    x = remove_columns(x)

    # Number of subsets for cross-validation
    N_TOTAL_FOLDS = 1
    accuracies = []
    x, y = shuffle_data(x, y)
    for k in range(0, x.shape[0], x.shape[0] // N_TOTAL_FOLDS):
        accuracy, y_predictions, w = crossvalidation(y, x, k, N_TOTAL_FOLDS,
                                                     param)
        accuracies.append(accuracy)
        print(accuracies)

    #plot_result(lambdas,accuracies)
    submission(x_test, w, i_test)
Example #8
    def eval_train(self):
        if self._tX is None:
            y, tX, _ = load_csv_data(self._DATA_TRAIN_PATH)
            self._y, self._tX = self.prepare_all_data(y, tX)
            self._tX_orig = self._tX.copy()
            self._y_orig = self._y.copy()
            self._y, self._tX = self._prepare_model_data(self._y, self._tX)
            self._orig_train = True

        y_pred = self._predict(self._tX)

        # performance of model 5 on train dataset
        acc = 1 - sum(abs(self._y_orig - y_pred) / 2) / self._y_orig.shape[0]
        print('Total accuracy: ' + str(acc))
        return acc
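For labels in {-1, +1}, the accuracy expression above reduces to the fraction of matching predictions, since |y - y_pred| / 2 is 1 for a misclassified sample and 0 otherwise; a standalone check (not part of the class):

import numpy as np

y_true = np.array([1, -1, 1, -1])
y_hat = np.array([1, 1, 1, -1])
acc_formula = 1 - np.sum(np.abs(y_true - y_hat) / 2) / y_true.shape[0]
acc_mean = np.mean(y_true == y_hat)
assert acc_formula == acc_mean  # both equal 0.75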
Example #9
    def predict_test(self, x=None, ids=None):
        if x is None or ids is None:
            if self._orig_test is False:
                _, _tX_test, self._ids_test = load_csv_data(
                    self._DATA_TEST_PATH)
                _, self._tX_test = self.prepare_all_data(None, _tX_test)
                self._tX_orig = self._tX_test.copy()
                _, self._tX_test = self._prepare_model_data(
                    None, self._tX_test)
                self._orig_test = True
        else:
            _, self._tX_test = self.prepare_all_data(None, x.copy())
            self._tX_orig = self._tX_test.copy()
            self._ids_test = ids.copy()
            _, self._tX_test = self._prepare_model_data(None, self._tX_test)
            self._orig_test = False

        y_test_pred = self._predict(self._tX_test)
        create_csv_submission(self._ids_test, y_test_pred, self._output_path)
Example #10
    def train(self, y=None, x=None):
        if y is None or x is None:
            if self._orig_train is False:
                self._x_mean, self._x_std = None, None
                y, tX, _ = load_csv_data(self._DATA_TRAIN_PATH)
                self._y, self._tX = self.prepare_all_data(y, tX)
                self._tX_orig = self._tX.copy()
                self._y_orig = self._y.copy()
                self._y, self._tX = self._prepare_model_data(self._y, self._tX)
                self._orig_train = True
        else:
            self._x_mean, self._x_std = None, None
            self._y, self._tX = self.prepare_all_data(y.copy(), x.copy())
            self._tX_orig = self._tX.copy()
            self._y_orig = self._y.copy()
            self._y, self._tX = self._prepare_model_data(self._y, self._tX)
            self._orig_train = False

        self._train_model()

        return self.eval_train()
Example #11
fold_count = 1
seed = 2

# optimization
gd_func = gradient_descent.logistic_L2_gradient_descent
max_iters = 7000
gamma = 0.08

# lambdas (to find with grid search)
lambdas = np.linspace(10, 13, num=6)
lambda_best = 0

################################################################################
#                                    read data                                 #
################################################################################
(y, X, id) = proj1_helpers.load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

# y is categorical, so we want integers (-1, 1) instead of floats (-1.0, 1.0).
# Modified here instead of in load_csv_data, because we may not be allowed to
# change the provided functions.
y = y.astype(int)

# The cost and gradient formulas for the logistic function expect 0/1 labels
# so that some terms vanish from the equations.
y[np.where(y == -1)] = 0

################################################################################
#                                   clean data                                 #
################################################################################
# one-hot coding for "PRI_jet_num" (column 22)
(id, y, X) = clean_data.one_hot_PRI_jet_num(id, y, X)
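`clean_data.one_hot_PRI_jet_num` is not shown; a minimal sketch of the idea, assuming numpy is imported as np and column 22 holds PRI_jet_num with values 0-3, replaces the categorical column with four indicator columns:

def one_hot_pri_jet_num_sketch(ids, y, x, jet_col=22):
    # Hypothetical helper, not the project's implementation.
    indicators = np.stack([(x[:, jet_col] == v).astype(float) for v in range(4)], axis=1)
    return ids, y, np.hstack((np.delete(x, jet_col, axis=1), indicators))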
Example #12
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

from proj1_helpers import load_csv_data
import numpy as np

ITERATIONS = 50000
SAMPLE_SIZE = 250000  # at most 250000; step = 250000 / SAMPLE_SIZE

y, X, _, _ = load_csv_data('all/train.csv', step=int(250000. / SAMPLE_SIZE))

# Cleans dataset by removing all features that admit undefined values.
undef_features = [i for i, feature in enumerate(X.T) if -999 in feature]
X = np.delete(X, undef_features, axis=1)

clf = LogisticRegression(solver='newton-cg', max_iter=ITERATIONS).fit(X, y)
print(cross_validate(clf, X, y, scoring=['accuracy', 'precision']))
Example #13
import numpy as np

from implementations import ridge_regression
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from data_processing import process_data, build_poly

print("Loading data\n")

# Loading data from csv files
y_tr, tx_tr, ids_tr = load_csv_data("data/train.csv")
y_te, tx_te, ids_te = load_csv_data("data/test.csv")

# Hyper-parameters definitions
degree = 7
lambda_ = 0.00025

# Preprocessing data: cleaning, standardizing and adding a constant column
tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)

# Feature augmentation through polynomials
tx_tr = build_poly(tx_tr, degree)
tx_te = build_poly(tx_te, degree)

# Training with ridge regression
print("Training the model\n")
weights, _ = ridge_regression(y_tr, tx_tr, lambda_)

# Computing prediction vector
y_pred = predict_labels(weights, tx_te)

# Creating file for submission
Example #14
from feature_selection import compute_log, compute_theta, compute_physics
from correction_rate import cross_validation, print_score
from evaluation import predict_regression_labels
from costs import sigmoid

# edit if train.csv and test.csv in not in ../data/
dat_dir = '../data/'

############################
## Training
############################
print('training started')

# load the training set
print('loading the training dataset...')
y_train_pre, tx_train, ids_train = load_csv_data(dat_dir + "train.csv",
                                                 sub_sample=False)
print('data loaded...')

y_train = y_train_pre.reshape(y_train_pre.shape[0], 1)

# construct the features using log()
index_log = [
    0, 1, 2, 4, 5, 6, 7, 9, 10, 12, 16, 21, 23, 24, 25, 26, 27, 28, 29
]
tx_log, mean_log, std_log = compute_log(tx_train, index_log)

# construct the features using cosine()
index_theta = [14, 15, 17, 18, 20]
tx_theta, mean_theta, std_theta = compute_theta(tx_train, index_theta)

# construct the features with physical meaning:
Example #15

def predict(w, x_test, small=-1, big=1):
    '''Returns the prediction for x_test with w, for two values small and big. The prediction is done by choosing the nearest value'''
    y_pred = x_test @ w
    sep_val = (small + big) / 2
    y_pred[y_pred < sep_val] = small
    y_pred[y_pred >= sep_val] = big

    return y_pred
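A tiny usage sketch of `predict`; the weights and rows are illustrative values only, and numpy is assumed to be imported as np.

# For w = [1, -2] and rows [3, 1] and [0, 2], x_test @ w gives [1, -4],
# so with the default -1/1 labels the predictions are [1, -1].
print(predict(np.array([1., -2.]), np.array([[3., 1.], [0., 2.]])))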


# ------------------------------- BEGINNING -------------------------------
print('Reading data')

yb_full, input_data_full, ids_full = load_csv_data('data/train.csv')
yb_test, input_data_test, ids_test = load_csv_data('data/test.csv')

# Shuffle the data so that the subsample is picked at random
np.random.seed(16)
per = np.random.permutation(250000)

# Picking subsamples
yb, input_data, ids = yb_full[per][::10], input_data_full[per][::10], ids_full[
    per][::10]

print('Data read')
print('Treating data')

# Separating each np.array into 4 sub-arrays by category (number of jets, i.e. column 22)
input_data_by_22, ids_by_22, yb_by_22 = separate_by_col22(input_data, ids, yb)
Example #16
import numpy as np
import proj1_helpers as helper
import data_preprocessing as preprocess
import multi_models_splitter as multi
import implementations as imp
from cross_validation import k_fold_cross_validation
import os

# Load training data
y_train, tx_train, ids_train = helper.load_csv_data('../all/train.csv')

# Load test data
_, tx_test, ids = helper.load_csv_data('../all/test.csv')

# Seed the random number generator with a fixed value for consistent results
np.random.seed(20181028)

# Parameters
degrees = [3, 5, 6, 8, 10, 12]
lambdas = np.logspace(-9, 0, 10)
k_cross_val = [5]

# Best results
best_pred_score = 0.0
best_weights = 0
best_tx = 0

# Best parameters
best_degree = 0
best_lambda = 0.0
best_k = 0
Example #17
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
import numpy as np
from datetime import datetime
from created_helpers import *

print("loading data")
y_train, x_train, ids_train = load_csv_data("train.csv")
y_test, x_test, ids_test = load_csv_data("test.csv")


# same ridge_regression as in implementations.py, but returning
# only the weights (without the loss)
def ridge_regression(y, tx, lambda_):
    """implement ridge regression."""
    aI = 2 * tx.shape[0] * lambda_ * np.identity(tx.shape[1])
    a = tx.T.dot(tx) + aI
    b = tx.T.dot(y)
    return np.linalg.solve(a, b)
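The closed form above solves the regularized normal equations (X^T X + 2 N lambda I) w = X^T y; with lambda_ = 0 it reduces to ordinary least squares, which gives a quick sanity check on random data (standalone, not part of the original script):

# Sanity check: with lambda_ = 0 the solution matches ordinary least squares.
rng = np.random.default_rng(0)
tx_demo = rng.normal(size=(100, 5))
y_demo = rng.normal(size=100)
assert np.allclose(ridge_regression(y_demo, tx_demo, lambda_=0.0),
                   np.linalg.lstsq(tx_demo, y_demo, rcond=None)[0])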


# cross_validation code taken from lab
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
Example #18
File: run.py Project: Battleman/Space-ML
COMBINED_DEGREES = params['COMBINED_DEGREES']
SIMPLE_DEGREES = params['SIMPLE_DEGREES']
TAN_HYP_DEGREES = params['TAN_HYP_DEGREES']
INVERSE_LOG_DEGREES = params['INVERSE_LOG_DEGREES']
ROOT_DEGREES = params['ROOT_DEGREES']
NUM_SETS = params['NUM_SETS']
DATA_TRAIN_PATH = params['DATA_TRAIN_PATH']
DATA_TEST_PATH = params['DATA_TEST_PATH']
OUTPUT_PATH = params['OUTPUT_PATH']
CACHE = params['CACHE']
LAMBDAS = params['lambdas']
#########
# Load CSV
#########
print("Loading CSV")
y, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)

_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

#########
# Preprocess
#########
print("Preprocessing")
(XS_TRAIN, MASKS_TRAIN) = preprocessing(tX_train)
(XS_TEST, MASKS_TEST) = preprocessing(tX_test)

# placeholder for submission
y_submission = np.zeros(tX_test.shape[0])

# compute for each subset of PRI_JET_NUM
for i in range(NUM_SETS):
Example #19
import numpy as np

from proj1_helpers import (create_csv_submission, predict_labels,
                           load_csv_data)

from implementations import (least_squares_GD, least_squares_SGD,
                             least_squares, ridge_regression,
                             logistic_regression, normalize_data,
                             delete_missing_values, replace_data,
                             reg_logistic_regression)

#load data from train set
y, tX, ids = load_csv_data("train.csv")

# change [-1, 1] labels to [0, 1]
y = y / 2 + 0.5

N, d = tX.shape
# initial weights, randomly generated
w0 = 10 * np.random.rand(d + 1, 1)

# replace -999 values with the mean of the other ones
tX = replace_data(tX)
# normalize data to std 1 and 0 mean
tX = normalize_data(tX)

w, L = reg_logistic_regression(y,
                               tX,
                               lambda_=0.001,
                               initial_w=w0,
                               max_iters=10,
Example #20
File: code.py Project: GTaf/ML_project_1
import proj1_helpers
import implementations
import numpy as np

print("Extracting dataset")
y_train, X_train, id1 = proj1_helpers.load_csv_data("train.csv", True)
y_test, X_test, id2 = proj1_helpers.load_csv_data("test.csv", True)
print(X_train.shape)

batch_size = 128

print("Splitting dataset into batch")
X_batch = np.array_split(X_train, int(X_train.shape[0] / batch_size))
y_batch = np.array_split(y_train, int(y_train.shape[0] / batch_size))

print(X_batch[0].shape)

w, _ = implementations.ridge_regression(y_batch[0], X_batch[0], 1)  # ridge regression on the first batch

print(w)

y_pred = proj1_helpers.predict_labels(w, X_test)

s = 0
tot = 0
for i, y in enumerate(y_pred):
    if y == y_test[i]:
        s += 1
    tot += 1

print(s / tot)
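The loop above just counts matching labels; an equivalent one-liner, assuming y_pred and y_test are numpy arrays of equal length, is:

print(np.mean(y_pred == y_test))  # same accuracy as the loop above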
Example #21
"""
run.py is used to launch the application of weights on a test dataset and serialize the results.
"""


def load_npy(*npy_paths):
    """
    Returns numpy arrays serialized at npy_paths.
    Args:
        npy_paths : a sequence of serialized np.arrays files paths.
    Returns:
        Deserialized numpy arrays
    """
    return (np.load(p) for p in npy_paths)


# Load the test dataset
_, test_data, test_ids, _ = load_csv_data('all/test.csv')

# Load the weights, feature masks and parameters (mean, std_dev)
weights, clean_features, parameters = load_npy('all/weights.npy',
                                               'all/clean_features.npy',
                                               'all/parameters.npy')

# Runs the weights against the test dataset
pri_jet_num_idx = 22
polynomial_degree = 3
predictions = model_predictions(test_data, weights, pri_jet_num_idx,
                                clean_features, parameters, polynomial_degree)

create_csv_submission(test_ids, predictions, 'all/predictions.csv')
Example #22
def get_data(use_preexisting=True,
             save_preprocessed=True,
             z_outlier=False,
             feature_expansion=False,
             correlation_analysis=False,
             class_equalizer=False,
             M=4,
             z_value=3.0):
    """
    Data supplying function.

    This function has the purpose of loading data and applying preprocessing.
    It includes many features such as downloading the data from the github
    repository, saving the data (for fast reuse), applying different
    preprocessing algorithms, etc...

    Args:
        use_preexisting (bool): if enabled and previously preprocessed data
                                files exist, reuse them instead of recomputing
        save_preprocessed (bool): if enabled, save the preprocessed data to disk
        z_outlier (Union[int, bool]): if enabled, perform z-score outlier
                            detection
        feature_expansion (bool): if enabled, perform polynomial feature
                                    expansion
        correlation_analysis (Union[int, bool]): if enabled, perform correlation
                                        analysis and remove highly correlated
                                        features
        class_equalizer (Union[int, bool]): if enabled, perform class balancing
        M (Union[int, list]): feature expansion parameter per group
        z_value (Union[float, list]): outlier detection threshold per group

    Returns:
        list: groups of training samples
        list: corresponding groups of training labels
        list: corresponding indexes of affiliated training rows
        list: groups of test samples
        list: corresponding groups of test labels
        list: corresponding indexes of affiliated test rows
        list: test set ids (for creating submissions)

    """

    if os.path.isdir(config.DATA_PATH) and os.path.isdir(
            config.PREPROCESSED_PATH) and use_preexisting:
        print("[*] Using previously preprocessed Data")
        groups_tr_X = np.load(config.PREPROCESSED_X_TR_GROUPS_NPY,
                              allow_pickle=True)
        groups_tr_Y = np.load(config.PREPROCESSED_Y_TR_GROUPS_NPY,
                              allow_pickle=True)
        indc_list_tr = np.load(config.PREPROCESSED_GROUP_INDEX_TR_NPY,
                               allow_pickle=True)
        groups_te_X = np.load(config.PREPROCESSED_X_TE_GROUPS_NPY,
                              allow_pickle=True)
        groups_te_Y = np.load(config.PREPROCESSED_Y_TE_GROUPS_NPY,
                              allow_pickle=True)
        indc_list_te = np.load(config.PREPROCESSED_GROUP_INDEX_TE_NPY,
                               allow_pickle=True)
        ids_te = np.load(config.PREPROCESSED_IDS_TE_GROUPS_NPY,
                         allow_pickle=True)

    else:
        if not (os.path.isdir(config.DATA_PATH)
                and os.path.isfile(config.TRAIN_DATA_CSV_PATH)
                and os.path.isfile(config.TEST_DATA_CSV_PATH)):
            Path(config.DATA_PATH).mkdir(exist_ok=True)
            download_url(config.TRAIN_URL, config.TRAIN_DATA_CSV_PATH)
            download_url(config.TEST_URL, config.TEST_DATA_CSV_PATH)

        print("[*] Creating preprocessed Data")

        # load data from csv files
        Y_tr, X_tr, ids_tr = load_csv_data(config.TRAIN_DATA_CSV_PATH)
        Y_te, X_te, ids_te = load_csv_data(config.TEST_DATA_CSV_PATH)

        groups_tr_Y, groups_tr_X, indc_list_tr = split_groups(Y_tr, X_tr)
        groups_te_Y, groups_te_X, indc_list_te = split_groups(Y_te, X_te)

        nr_groups_tr = len(indc_list_tr)

        # convert scalar options to per-group lists
        z_outlier = make_to_list(z_outlier)
        class_equalizer = make_to_list(class_equalizer)
        correlation_analysis = make_to_list(correlation_analysis)
        M = make_to_list(M)

        for indx in range(nr_groups_tr):
            # perform z outlier detection
            if z_outlier[indx]:
                groups_tr_X[indx] = z_score_outlier_detection(
                    groups_tr_X[indx], thresh=z_value)
                groups_te_X[indx] = z_score_outlier_detection(
                    groups_te_X[indx], thresh=z_value)

            # perform correlation analysis
            if correlation_analysis[indx]:
                groups_tr_X[indx], columns_to_keep = corr_filter(
                    groups_tr_X[indx], threshold=0.95)
                groups_te_X[indx] = groups_te_X[indx][:, columns_to_keep]

            # perform class equalization
            if class_equalizer[indx]:
                groups_tr_X[indx], groups_tr_Y[
                    indx] = class_imbalance_equalizer(groups_tr_X[indx],
                                                      groups_tr_Y[indx])

            # perform feature expansion
            if feature_expansion:
                groups_tr_X[indx] = augment_features_polynomial(
                    groups_tr_X[indx], M=M[indx])
                groups_te_X[indx] = augment_features_polynomial(
                    groups_te_X[indx], M=M[indx])

            # standardize features
            groups_tr_X[indx] = standardize(groups_tr_X[indx])
            groups_te_X[indx] = standardize(groups_te_X[indx])

            # add bias
            groups_tr_X[indx] = add_bias(groups_tr_X[indx])
            groups_te_X[indx] = add_bias(groups_te_X[indx])

            print(f"\t [+]Group {indx + 1} finished!")

        if save_preprocessed:
            Path(config.PREPROCESSED_PATH).mkdir(exist_ok=True)

            np.save(config.PREPROCESSED_X_TR_GROUPS_NPY,
                    groups_tr_X,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TR_GROUPS_NPY,
                    groups_tr_Y,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_X_TE_GROUPS_NPY,
                    groups_te_X,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TE_GROUPS_NPY,
                    groups_te_Y,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TR_NPY,
                    indc_list_tr,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TE_NPY,
                    indc_list_te,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_IDS_TE_GROUPS_NPY,
                    ids_te,
                    allow_pickle=True)
            print("[+] Saved Preprocessed Data")

    return groups_tr_X, groups_tr_Y, indc_list_tr, groups_te_X, groups_te_Y, indc_list_te, ids_te
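A minimal call sketch using the defaults shown above; the paths and any preprocessed files come from the module's config, which is assumed to be set up.

# Hypothetical usage; relies on the config module referenced above.
(groups_tr_X, groups_tr_Y, indc_list_tr,
 groups_te_X, groups_te_Y, indc_list_te, ids_te) = get_data(use_preexisting=True)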
Example #23
File: run.py Project: reslbesl/CS433_ML
from data_utils import feature_transform, standardise, standardise_to_fixed
from implementation_variants import logistic_regression_mean

cwd = path.dirname(__file__)

SEED = 42
DATA_PATH = '../data/'

# Training hyperparameters (obtained through procedure in Run.ipynb)
MAX_ITERS = 50000
GAMMA = 0.01
THRESHOLD = 1e-7

if __name__ == "__main__":
    # Load train data
    y_train, x_train, _ = load_csv_data(path.join(DATA_PATH, 'train.csv'))

    # Apply feature transform
    fx_train = feature_transform(x_train)

    # Standardise to mean and s.d.
    fx_train, mu_train, sigma_train = standardise(fx_train)

    # Add offset term
    tx_train = np.c_[np.ones(len(y_train)), fx_train]

    # Initialise training
    w_initial = np.ones(tx_train.shape[1])

    # Run gradient descent
    w, loss = logistic_regression_mean(y_train,
Example #24
import numpy as np

import preprocessing as prep
import feature_engineering as f_e
import local_prediction as pred
import proj1_helpers as helpers
import params

if __name__ == '__main__':
    # Training set preprocessing and feature engineering
    print('Train set:')
    y, tX, ids = helpers.load_csv_data(params.DATA_TRAIN_PATH)
    y_preprocessed, tX_preprocessed, ids_preprocessed, masks, counts = prep.preprocess(
        y, tX, ids)
    tX_improved = f_e.feature_engineer(tX_preprocessed)

    # In case we want to test our model locally by splitting our data
    if params.LOCAL_PREDICTION:
        pred.locally_predict(tX_improved, y_preprocessed, counts)
    else:
        print('Test set:')
        y_test, tX_test, ids_test = helpers.load_csv_data(
            params.DATA_TEST_PATH)
        y_test_preprocessed, tX_test_preprocessed, ids_test_preprocessed, masks_test, counts_test = prep.preprocess(
            y_test, tX_test, ids_test)
        tX_test_improved = f_e.feature_engineer(tX_test_preprocessed)
        log_initial_ws = []
        for i in range(len(tX_test_improved)):
            log_initial_ws.append(np.repeat(0, tX_test_improved[i].shape[1]))
        optimal_ws = pred.find_optimal_ws_grouped(
            tX_improved, y_preprocessed, params.IMPLEMENTATION, log_initial_ws,
Example #25
                accuracies.append(
                    pred.locally_predict(tX,
                                         y,
                                         counts[group_number],
                                         implementation=2,
                                         group=False,
                                         max_iter=max_iter,
                                         gamma=gamma,
                                         log_lambda=log_lambda))
    argmax = np.flip(np.argsort(accuracies), axis=0)[0]
    print(max_iter_range[argmax], gamma_range[argmax],
          log_lambda_range[argmax], accuracies[argmax])


if __name__ == "__main__":
    y, tX, ids = helpers.load_csv_data(params.DATA_TRAIN_PATH)
    for replace_unwanted_value in true_false:
        for std in true_false:
            print('\t\treplace_unwanted_value = {}'.format(
                replace_unwanted_value))
            print('\t\tstd = {}'.format(std))
            y_grouped_preprocessed, tX_grouped_preprocessed, ids_grouped_preprocessed, masks, counts = \
                prep.preprocess(y, tX, ids, std=std, replace_unwanted_value=replace_unwanted_value)
            for ones_column in true_false:
                for feature_multiplication in true_false:
                    print('\t\tones_column = {}'.format(ones_column))
                    print('\t\tfeature_multiplication = {}'.format(
                        feature_multiplication))
                    tX_improved = f_e.feature_engineer(
                        tX_grouped_preprocessed[0],
                        group=False,
Example #26
import numpy as np
from proj1_helpers import load_csv_data
from implementations import least_squares_SGD

yb, input_data, ids = load_csv_data('all/train.csv', step=50)
losses, w = least_squares_SGD(yb,
                              input_data,
                              initial_w=np.zeros(30),
                              batch_size=1,
                              max_iters=30,
                              gamma=0.5)
with open('output.txt', 'w') as fp:
    print(yb, input_data, losses, w, file=fp)
Example #27
# Simply a copy of the Test Set Prediction notebook

# It takes nearly 20 seconds, be patient

import numpy as np
from cross_validation import cross_validation
from polynomial import build_poly
from implementations import ridge_regression
from proj1_helpers import load_csv_data
from proj1_helpers import predict_labels
from proj1_helpers import create_csv_submission

USE_PRETRAINED_WEIGHTS = False

# Read the test set
test_set = load_csv_data('../data/test.csv')
y_test, X_test, ids, columns = test_set

# Selected columns
selected_features = np.array([1, 3, 9, 10, 11, 13, 21, 22, 23])
selected_features = np.sort(np.append(selected_features, [0, 4, 5, 6, 12]))

# Log transformed columns
log_transformed_columns = [3, 9, 10, 13, 21]

for i in log_transformed_columns:
    X_test[np.where(X_test[:, i] != -999),
           i] = np.log(X_test[np.where(X_test[:, i] != -999), i] + 1)
    X_test[np.where(X_test[:, i] == -999), i] = -999  # leave undefined entries as -999

# Select the correct features
Example #28
[email protected]
"""
import os
import numpy as np

from proj1_helpers import load_csv_data
from cross_validation import optimize_model
#from tests.test_helpers import load_data
from helpers import _standardize

# Resets cpu core task affinity
os.system("taskset -p 0xff %d" % os.getpid())

# ## Load the training data into feature matrix, class labels, and event ids:
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# # ## Our code
# # split categorical variable (23)
# tX = np.vstack((tX.T, (tX[:, 22] == 0).astype(int))).T
# tX = np.vstack((tX.T, (tX[:, 22] == 1).astype(int))).T
# tX = np.vstack((tX.T, (tX[:, 22] == 2).astype(int))).T
# tX = np.vstack((tX.T, (tX[:, 22] == 3).astype(int))).T
# tX.shape
#
#
# model1_datapoints_no = tX.shape[0] - sum(tX[:, 4] == -999)
# tX1 = tX[tX[:, 4] != -999]
# tX1 = np.delete(tX1, 0, axis=1)
# y1 = y[tX[:, 4] != -999]
# (tX1.shape, y1.shape)
Example #29
                           load_csv_data, predict_01_labels)

from implementations import (
    replace_data, normalize_data, remove_outliers, oversample, one_hot_encode,
    polynomial_expansion, least_squares_GD, least_squares_SGD, least_squares,
    ridge_regression, logistic_regression, reg_logistic_regression,
    cross_validation_OLS, cross_validation_SGD, cross_validation_RR,
    cross_validation_LR, cross_validation_RLR_gamma,
    cross_validation_RLR_lambda)

##########################################################################
#### Loading data
##########################################################################

# load data from train set
_y_train, _tX_train, ids_train = load_csv_data("train.csv")

# change [-1, 1] labels to [0, 1]
y_train = _y_train / 2 + 0.5

##########################################################################
#### Data pre-processing
##########################################################################

# replace -999 values with the mean of the other ones
tX_train = replace_data(_tX_train)

# Get the one-hot-encoded columns for later
one_hot_columns = one_hot_encode(tX_train, 22)

# normalize data to std 1 and 0 mean
Example #30
import numpy as np
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from implementations import ridge_regression
from helpers import build_poly, build_k_indices, normalize

DATA_PATH = '../data/'
lambda_ = 1e-20
degree = 13
seed = 12
k_fold = 7

# We work with the training data in this notebook
y, x, ids = load_csv_data(DATA_PATH + 'train.csv')

x, col_mean, xmin, xmax = normalize(x)


def cross_validation(y, x, k_indices, k, lambda_, degree):

    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te, y_tr = y[te_indice], y[tr_indice]
    x_te, x_tr = x[te_indice], x[tr_indice]

    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)

    w, _ = ridge_regression(y_tr, tx_tr, lambda_)

    y_tr_pred = predict_labels(w, tx_tr)