# --- Script-level grid search over (degree, lambda, k) for ridge regression ---
# Tracks the best cross-validated prediction score seen so far and the
# weights that produced it.
best_pred_score = 0.0
best_weights = 0
best_tx = 0

# Best parameters
best_degree = 0
best_lambda = 0.0
best_k = 0

# Normalizing data
# NOTE(review): the return value is ignored, so normalize_features presumably
# mutates tx_train/tx_test in place — confirm against its definition.
preprocess.normalize_features(tx_train, tx_test)

print("Starting computations\n")

for degree in degrees: # For each degree...
    # The polynomial expansion depends only on degree, so it is hoisted out
    # of the lambda/k loops to avoid recomputing it per combination.
    processed_tx_train = preprocess.build_poly(tx_train, degree)
    for lambda_ in lambdas: # For each lambda...
        for k in k_cross_val: # For each k...

            print("Trying (degree, lambda, k) = (" + str(degree) + ", " + str(lambda_) + ", " + str(k) + ")")

            # Use ridge_regression to compute our model
            weights, pred_score = k_fold_cross_validation(y_train, processed_tx_train, k, imp.ridge_regression, [lambda_])

            print("Got predictions score = " + str(pred_score) + "\n")

            if pred_score > best_pred_score:
                # Update best results
                # NOTE(review): best_degree/best_lambda/best_k (and best_tx) are
                # initialized above but never assigned in the visible span —
                # the updates were presumably in code cut off after this chunk;
                # verify against the original file.
                best_weights = np.copy(weights)
                best_pred_score = pred_score
# --- Example #2 (scraped from run.py, project: mvujas/mlproject1) ---
def transformation_pipeline(x, col_to_index_mapping=col_to_index_mapping, transformation_memory=None):
    """Apply the full feature-engineering pipeline to a raw data matrix.

    The same transformation must be applied to training and test data using
    the *training* statistics (means, stddevs, missing-value mask), so every
    fitted quantity is stored in ``transformation_memory``.

    Args:
        x: 2-D numpy array of raw features; missing values are encoded as -999.
        col_to_index_mapping: dict mapping feature names to column indices
            (default is the module-level mapping captured at definition time).
        transformation_memory: ``None`` when fitting on training data (a new
            memory dict is created and returned); pass the dict returned from
            the training call to apply the identical transformation to test data.

    Returns:
        Tuple ``(tx, transformation_memory)``: the transformed feature matrix
        with a prepended bias column, and the (possibly newly created) memory.
    """
    # Fitting mode: memory is created here and filled with training statistics.
    training = transformation_memory is None
    if training:
        transformation_memory = {}

    tx = np.copy(x)  # Copy so the caller's array is never mutated.

    # Binary indicator columns: one per feature that (on the training set)
    # contains at least one missing value. np.any is the idiomatic form of
    # the original np.max over a boolean mask — identical result.
    if training:
        columns_with_missing_values = np.any(tx == -999, axis=0)
        transformation_memory['columns_with_missing_values'] = columns_with_missing_values
    missing_columns_binary = (tx[:, transformation_memory['columns_with_missing_values']] == -999)\
              .astype(int)

    # Replace missing values with NaN so nanmean/nanstd ignore them below.
    tx[tx == -999.] = np.nan

    # Standardize every non-categorical column ('PRI_jet_num' and any derived
    # columns excluded) using statistics fitted on the training data.
    base_standardize_col_idx = [col_to_index_mapping[key] for key in col_to_index_mapping if 'PRI_jet_num' not in key]
    base_standardize_cols = tx[:, base_standardize_col_idx]
    if training:
        transformation_memory['base_mean'] = np.nanmean(base_standardize_cols, axis=0)
        transformation_memory['base_stddev'] = np.nanstd(base_standardize_cols, axis=0)
    tx[:, base_standardize_col_idx] = (base_standardize_cols - transformation_memory['base_mean']) \
          / transformation_memory['base_stddev']

    # After standardization each column's mean is 0, so imputing NaNs with 0
    # is mean imputation. Done after standardizing so the fitted statistics
    # are not biased by the imputed values.
    tx[np.isnan(tx)] = 0

    # One-hot encode the categorical 'PRI_jet_num' and drop the last level to
    # avoid perfect collinearity. The updated column mapping returned by the
    # helper was never used downstream (selection below is positional), so it
    # is discarded explicitly.
    tx, _ = data_preprocessing.one_hot_transformation(tx, 'PRI_jet_num', col_to_index_mapping)
    tx = tx[:, :-1]

    # Augment features with their sin and cos.
    sins = np.sin(tx)
    coses = np.cos(tx)
    tx = np.concatenate((tx, sins, coses), axis=1)

    # Keep only the best features (determined offline via backwards attribute
    # selection); indices refer to the sin/cos-augmented matrix above.
    first_selection_attr = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 19, 21, 24, 29, 30, 32, 33, 34, 35, 39, 40, 41, 43, 44, 46, 48, 49, 51, 56, 57, 58, 61, 62, 64, 65, 66, 67, 68, 71, 73, 74, 75, 78, 80, 81, 86, 87, 90, 93, 94, 95]
    tx = tx[:, first_selection_attr]

    d = tx.shape[1]

    # Polynomial degrees 2 and 3 of the selected features, standardized with
    # training-set statistics.
    poly = data_preprocessing.build_poly(tx, list(range(d)), [2, 3])
    if training:
        transformation_memory['poly_mean'] = np.nanmean(poly, axis=0)
        transformation_memory['poly_stddev'] = np.nanstd(poly, axis=0)
    poly = (poly - transformation_memory['poly_mean']) / transformation_memory['poly_stddev']

    # Final matrix: pairwise feature products, standardized polynomial terms,
    # and the missing-value indicator columns computed at the start.
    tx = np.c_[
               data_preprocessing.build_pairwise_alt(tx, list(range(d))),
               poly, missing_columns_binary]

    # Prepend the bias (intercept) column.
    tx = data_preprocessing.prepend_bias_column(tx)

    return tx, transformation_memory