def run_lasso_on_input(df, target):
    # Fit LassoLarsCV on a 70% sample to obtain a data-driven alpha path.
    X_part, y_part, _ = sample_data_frame_return_x_y_column_name(df, True, target,
                                                                 int(0.7 * df.shape[0]))
    X_part, _ = scale_input_data(X_part)
    print "#######################################"
    print "Starting LARS CV"
    print "#######################################"
    lars_cv = LassoLarsCV(cv=10).fit(X_part, y_part)
    print "#######################################"
    print "Done with LARS CV"
    print "#######################################"

    #alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

    # Refit RandomizedLasso on the full data set, reusing the alpha path found
    # by LassoLarsCV above.
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target,
                                                                             df.shape[0])
    X, _ = scale_input_data(X)
    print "#######################################"
    print "Starting main lasso"
    print "#######################################"
    clf = RandomizedLasso(alpha=lars_cv.alphas_, random_state=12,
                          n_resampling=400, normalize=True).fit(X, y)
    print "#######################################"
    print "Done with main lasso"
    print "#######################################"
    return clf, column_list_for_sampled
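# --------------------------------------------------------------------------
# Usage sketch (assumption, not part of the original module): the snippet
# below only illustrates how run_lasso_on_input is meant to be called. The
# helper name _demo_run_lasso_on_input, the synthetic DataFrame, and its
# column names are made up for illustration; clf.scores_ is the per-feature
# selection frequency exposed by sklearn's RandomizedLasso.
def _demo_run_lasso_on_input():
    # Local imports so the sketch stands alone even if module-level imports differ.
    import numpy as np
    import pandas as pd

    # Small synthetic frame: three candidate features plus a target column.
    df_demo = pd.DataFrame(np.random.randn(200, 4),
                           columns=["feat_a", "feat_b", "feat_c", "target"])
    clf, feature_columns = run_lasso_on_input(df_demo, "target")
    for name, score in zip(feature_columns, clf.scores_):
        print name, score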
def perform_orthogonal_variable_selection(X, y, column_list_for_sampled, non_linear, feature_dictionary):
    X, _ = scale_input_data(X)

    # Train the baseline model on the original (untransformed) features.
    print "defining lasso model"
    clf_baseline = LassoCV(n_alphas=3, cv=3)
    print "fitting lasso model with cross validation library"
    base_line_lasso = cross_validation.cross_val_score(clf_baseline, X, y, cv=3, n_jobs=1)
    print "done fitting lasso library, now computing baseline performance"

    # Baseline performance.
    baseline_performance = np.mean(base_line_lasso)
    print baseline_performance
    print "baseline performance printed above"

    # Number of rows and columns in the main data.
    rows, columns = X.shape

    print "starting iterative orthogonal transformation for each feature"
    for i in range(columns):
        feature_name_current_i = column_list_for_sampled[i]
        print "Currently on ----->> " + feature_name_current_i
        current_column = X[:, i]
        transformed_X = np.zeros((rows, columns))

        if non_linear > 0:
            # Non-linear case: orthogonalize every other column against a
            # non-linear expansion of the current column.
            A = return_non_linear_transformation(current_column, poly=3, log=True,
                                                 square_root=False, exponential=False,
                                                 sin=False, cos=False)
            for j in range(columns):
                if i == j:
                    # Skip the current column; it is removed from the analysis.
                    transformed_X[:, j] = np.zeros((rows))
                else:
                    transformed_X[:, j] = orthogonal_vector_to_subspace(A, X[:, j])
        else:
            # Linear case: orthogonalize every other column against the
            # current column itself.
            for j in range(columns):
                if i == j:
                    # Skip the current column; it is removed from the analysis.
                    transformed_X[:, j] = np.zeros((rows))
                else:
                    transformed_X[:, j] = get_orthogonal_vector(current_column, X[:, j])

        # transformed_X now has the same shape as X, with column i all zeros.
        # Delete that original column before refitting.
        transformed_X = np.delete(transformed_X, i, 1)

        # Refit and score the lasso on the transformed data set.
        lasso_scores_i = cross_validation.cross_val_score(clf_baseline, transformed_X, y,
                                                          cv=3, n_jobs=1)
        current_feature_score = np.mean(lasso_scores_i)

        # The drop relative to the baseline is feature i's importance score.
        change_in_score = baseline_performance - current_feature_score
        feature_dictionary[feature_name_current_i].append(change_in_score)

    return feature_dictionary
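# --------------------------------------------------------------------------
# Minimal sketches (assumption, not the module's real definitions) of the two
# orthogonalization helpers used above. Based on how they are called,
# get_orthogonal_vector(reference, v) is assumed to return the component of v
# orthogonal to a single reference column, and orthogonal_vector_to_subspace(A, v)
# the residual of v after a least-squares projection onto the columns of the
# non-linear expansion A. The _sketch_ names are hypothetical; both rely on the
# module-level numpy import (np) already used throughout this file.
def _sketch_get_orthogonal_vector(reference, v):
    # Remove from v its projection onto the single vector `reference`.
    return v - (np.dot(v, reference) / np.dot(reference, reference)) * reference


def _sketch_orthogonal_vector_to_subspace(A, v):
    # Residual of v after least-squares projection onto the column space of A.
    coefficients = np.linalg.lstsq(A, v)[0]
    return v - A.dot(coefficients)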