# Hyperparameter grid search with ridge regression and k-fold cross-validation.
# Assumes numpy is imported as np and that the project modules providing
# `preprocess` and `imp` (the regression implementations) are available.
best_pred_score = 0.0
best_weights = 0
best_tx = 0
# Best parameters
best_degree = 0
best_lambda = 0.0
best_k = 0

# Normalize the data
preprocess.normalize_features(tx_train, tx_test)

print("Starting computations\n")

for degree in degrees:  # For each degree...
    processed_tx_train = preprocess.build_poly(tx_train, degree)
    for lambda_ in lambdas:  # For each lambda...
        for k in k_cross_val:  # For each k...
            print("Trying (degree, lambda, k) = ("
                  + str(degree) + ", " + str(lambda_) + ", " + str(k) + ")")

            # Use ridge_regression to compute our model
            weights, pred_score = k_fold_cross_validation(
                y_train, processed_tx_train, k, imp.ridge_regression, [lambda_])

            print("Got predictions score = " + str(pred_score) + "\n")

            if pred_score > best_pred_score:
                # Update best results and remember which parameters produced them
                best_weights = np.copy(weights)
                best_pred_score = pred_score
                best_degree = degree
                best_lambda = lambda_
                best_k = k
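
# The grid search above relies on a `k_fold_cross_validation` helper defined elsewhere
# in the project. The sketch below illustrates the assumed interface (it returns a
# (weights, mean validation accuracy) pair); the fold splitting, the 0/1 prediction
# threshold, the assumption that `method` returns (weights, loss), and the final refit
# on the full data set are illustrative assumptions, not the project's implementation.
def k_fold_cross_validation_sketch(y, tx, k, method, args, seed=1):
    np.random.seed(seed)
    indices = np.random.permutation(len(y))
    folds = np.array_split(indices, k)
    scores = []
    for i in range(k):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        # Assumes method(y, tx, *args) returns (weights, loss)
        weights, _ = method(y[train_idx], tx[train_idx], *args)
        # Assumes 0/1 labels; a -1/+1 encoding would threshold at 0 instead
        predictions = (tx[test_idx] @ weights >= 0.5).astype(int)
        scores.append(np.mean(predictions == y[test_idx]))
    # Refit on the whole data set and report the average validation accuracy
    weights, _ = method(y, tx, *args)
    return weights, np.mean(scores)
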
def transformation_pipeline(x, col_to_index_mapping=col_to_index_mapping, transformation_memory=None):
    # The memory dict is required in order to apply the same transformation
    # to the training and the test data
    training = transformation_memory is None
    if training:
        transformation_memory = {}

    tx = np.copy(x)  # Copy x so the original data is not modified

    # Create a binary indicator column for each column that contains missing values
    if training:
        columns_with_missing_values = np.max((tx == -999), axis=0)
        transformation_memory['columns_with_missing_values'] = columns_with_missing_values
    missing_columns_binary = (tx[:, transformation_memory['columns_with_missing_values']] == -999)\
        .astype(int)

    # Replace the missing-value placeholder (-999) with NaNs
    tx[tx == -999.] = np.nan

    # Calculate mean and standard deviation in order to standardize the data later
    base_standardize_col_idx = [col_to_index_mapping[key] for key in col_to_index_mapping
                                if 'PRI_jet_num' not in key]
    base_standardize_cols = tx[:, base_standardize_col_idx]
    if training:
        mean = np.nanmean(base_standardize_cols, axis=0)
        stddev = np.nanstd(base_standardize_cols, axis=0)
        transformation_memory['base_mean'] = mean
        transformation_memory['base_stddev'] = stddev

    # Standardize the data
    tx[:, base_standardize_col_idx] = (base_standardize_cols - transformation_memory['base_mean']) \
        / transformation_memory['base_stddev']

    # Standardization may shift fields that used to hold the default missing value,
    # which is why the missing-value indicator matrix was computed before applying
    # these transformations (0 = mean after standardization)
    tx[np.isnan(tx)] = 0

    # One-hot encode the categorical PRI_jet_num column and drop one level
    tx, col_to_index_mapping_upd = data_preprocessing.one_hot_transformation(
        tx, 'PRI_jet_num', col_to_index_mapping)
    tx = tx[:, :-1]

    # Augment features using sin and cos
    sins = np.sin(tx)
    coses = np.cos(tx)
    tx = np.concatenate((tx, sins, coses), axis=1)

    # Select the best features (determined using backwards attribute selection)
    first_selection_attr = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 19, 21, 24, 29, 30,
                            32, 33, 34, 35, 39, 40, 41, 43, 44, 46, 48, 49, 51, 56, 57, 58,
                            61, 62, 64, 65, 66, 67, 68, 71, 73, 74, 75, 78, 80, 81, 86, 87,
                            90, 93, 94, 95]
    tx = tx[:, first_selection_attr]
    d = tx.shape[1]

    # Add polynomial degrees 2 and 3 for the selected features
    poly = data_preprocessing.build_poly(tx, list(range(d)), [2, 3])
    if training:
        poly_mean = np.nanmean(poly, axis=0)
        poly_stddev = np.nanstd(poly, axis=0)
        transformation_memory['poly_mean'] = poly_mean
        transformation_memory['poly_stddev'] = poly_stddev

    # Standardize the polynomial features
    poly = (poly - transformation_memory['poly_mean']) / transformation_memory['poly_stddev']

    # Add pairwise feature products, the standardized polynomial features (degrees 2 and 3)
    # and the binary indicator columns for missing values
    tx = np.c_[
        data_preprocessing.build_pairwise_alt(tx, list(range(d))),
        poly,
        missing_columns_binary]

    # Add bias
    tx = data_preprocessing.prepend_bias_column(tx)

    return tx, transformation_memory
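
# Example usage of the pipeline (the variable names x_train / x_test are illustrative):
# the memory returned when transforming the training data is passed back in so the test
# data is transformed with the statistics (means, standard deviations, missing-value
# columns) learned on the training set rather than with its own.
#
#     tx_train, memory = transformation_pipeline(x_train)
#     tx_test, _ = transformation_pipeline(x_test, transformation_memory=memory)
#
# The pipeline also assumes a data_preprocessing.build_poly(tx, columns, degrees) helper.
# A minimal sketch of that assumed interface is given below; it is an illustration, not
# the project's actual implementation.
def build_poly_sketch(tx, columns, degrees):
    # For each requested degree, append the selected columns raised to that power
    return np.concatenate([tx[:, columns] ** deg for deg in degrees], axis=1)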