# `cv` is sklearn.model_selection.cross_validate, `lm` is sklearn.linear_model;
# get_score_and_model is a project helper defined elsewhere (see sketch below).
from sklearn.model_selection import cross_validate as cv
import sklearn.linear_model as lm


def build_and_eval(X, y, extra=None, scorer='r2', get_max=True,
                   return_models=False, return_optimal=False,
                   score_options=False, omni_seed=8):
    '''
    Taking a (normalized) X and its corresponding y, the function builds a
    multiple-regression model before attempting to regularize with ridge and
    lasso. The function returns a dictionary of the models, keyed by
    regularizer (i.e. 'Lasso', 'Ridge', or 'Normal' [no regularization
    performed]), with the option to return only the best-performing model of
    each regularization type.
    '''
    if score_options:
        # NOTE: the boolean flag shadows the module-level score_options()
        # helper (assumed to print the available scorer strings), so look it
        # up explicitly.
        globals()['score_options']()

    model_holder = {'Normal': [], 'Ridge': [], 'Lasso': []}

    baseline = cv(lm.LinearRegression(fit_intercept=True), X, y, cv=20,
                  scoring=scorer, return_estimator=True)
    model_holder['Normal'] = baseline['estimator']

    if get_max:
        precurser = 'Largest ' + scorer + ': '
    else:
        precurser = 'Smallest ' + scorer + ': '

    if extra is None:
        print('Multiple Regression:')
    else:
        print('Multiple Regression ' + extra + ':')
    baseline_score = (baseline['test_score'].max() if get_max
                      else baseline['test_score'].min())
    print(precurser + str(baseline_score) + '\n')

    # regularize
    reg_vals = {'penalty': list(range(1, 21)), 'Ridge': list(), 'Lasso': list()}
    for penalty in reg_vals['penalty']:
        ridger = cv(lm.Ridge(alpha=penalty, random_state=omni_seed), X, y,
                    scoring=scorer, cv=10, return_estimator=True)
        lasso = cv(lm.Lasso(alpha=penalty, max_iter=50000,
                            random_state=omni_seed), X, y,
                   scoring=scorer, cv=10, return_estimator=True)

        # obtain the min/max score and the corresponding model
        s, c = get_score_and_model(ridger['test_score'], ridger['estimator'],
                                   get_max=get_max)
        reg_vals['Ridge'].append(round(s, 3))
        model_holder['Ridge'].append(c)

        s, c = get_score_and_model(lasso['test_score'], lasso['estimator'],
                                   get_max=get_max)
        reg_vals['Lasso'].append(round(s, 3))
        model_holder['Lasso'].append(c)

    best_alpha = {'Ridge': 0, 'Lasso': 0}
    # use to obtain the best models based on scoring
    for val in ['Ridge', 'Lasso']:
        v = max(reg_vals[val]) if get_max else min(reg_vals[val])
        print(val + ' Regression:')
        best_alpha[val] = reg_vals['penalty'][reg_vals[val].index(v)]
        print(precurser + str(v) + ' for corresponding alpha = '
              + str(best_alpha[val]) + '\n')

    if return_optimal:
        return_models = True
        for val in ['Ridge', 'Lasso']:
            model_holder[val] = [m for m in model_holder[val]
                                 if m.alpha == best_alpha[val]]

    if return_models:
        return model_holder
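# get_score_and_model is not shown in this snippet. A minimal sketch of what it
# might look like, assuming it simply returns the best (or worst) fold score and
# the estimator fitted on that fold, matching how build_and_eval uses it:
import numpy as np


def get_score_and_model(test_scores, estimators, get_max=True):
    # pick the index of the highest (or lowest) cross-validation fold score
    idx = int(np.argmax(test_scores)) if get_max else int(np.argmin(test_scores))
    return test_scores[idx], estimators[idx]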
def show_stats(x, y, classifier, validation, name):
    print(name)
    stats = cv(classifier, x, y, cv=validation)
    score = sum(stats['test_score']) / len(stats['test_score'])
    print('Accuracy :', score)
    print('-----------------------------')
    return stats
def show_stats(lx, ly, classifier, validation, name):
    # Variant that takes lists of feature/label sets and averages the scores.
    print(name)
    scores = []
    for i in range(len(lx)):
        stats = cv(classifier, lx[i], ly[i], cv=validation)
        scores.append(sum(stats['test_score']) / len(stats['test_score']))
    for i, score in enumerate(scores):
        print('monk', i, ':', score)
    print('-----------------------------')
    return sum(scores) / len(scores)
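# A possible call to the list-based variant, assuming `cv` is
# sklearn.model_selection.cross_validate (as above) and using synthetic data
# sets as stand-ins for the real ones:
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

demo_sets = [make_classification(n_samples=200, random_state=s) for s in range(3)]
lx_demo = [X for X, _ in demo_sets]
ly_demo = [y for _, y in demo_sets]

avg_score = show_stats(lx_demo, ly_demo, DecisionTreeClassifier(), 5, 'Decision Tree')
print('average accuracy over all sets:', avg_score)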
def gridsearch_lasso_best(X, y):
    score_best = 0
    param_best = {'alpha': 1}
    for alp in [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]:
        lasso = Lasso(alpha=alp)
        split = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        score = cv(lasso, X, y, cv=split).mean()
        if score > score_best:
            score_best = score
            param_best = {'alpha': alp}
    print(param_best)
    return param_best
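# A possible follow-up, assuming Lasso, ShuffleSplit and `cv`
# (sklearn.model_selection.cross_val_score) are imported as the function
# expects, with synthetic data standing in for the real X, y:
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import ShuffleSplit, cross_val_score as cv

X_demo, y_demo = make_regression(n_samples=200, n_features=20, noise=1.0,
                                 random_state=0)

best = gridsearch_lasso_best(X_demo, y_demo)      # prints and returns the winning alpha
final_lasso = Lasso(**best).fit(X_demo, y_demo)   # refit with the chosen penalty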
def evaluate(solution):
    # `solution` is a boolean/index mask selecting a subset of columns of train_x
    results = cv(RFC(n_estimators=10), train_x[:, solution], train_Y, cv=3)
    return results["test_score"].mean()
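# evaluate() expects RFC (RandomForestClassifier), `cv` (cross_validate) and the
# global train_x / train_Y to already be defined. A self-contained sketch of
# that assumed setup, calling evaluate() with a boolean feature mask:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_validate as cv

# placeholder data standing in for the real train_x / train_Y
train_x, train_Y = make_classification(n_samples=300, n_features=15, random_state=0)

mask = np.zeros(train_x.shape[1], dtype=bool)
mask[[0, 3, 7]] = True            # candidate feature subset
print(evaluate(mask))             # mean cross-validated accuracy for that subset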
                               names=header, engine='python')

# Number of users in current set
print('Number of unique users in current data-set',
      active_time_data.user_id.unique().shape[0])
print('Number of unique articles in current data-set',
      active_time_data.item_id.unique().shape[0])

# SVD allows us to look at our input matrix as a product of three smaller matrices: U, Z and V.
# In short this will help us discover concepts from the original input matrix
# (subsets of users that like subsets of items).
# Note that use of SVD is not strictly restricted to user-item matrices.
# https://www.youtube.com/watch?v=P5mlg91as1c
algorithm = TruncatedSVD()

# Finally we run our cross-validation in n folds, where n is given by the cv parameter.
# Verbose can be set to an integer to control the level of verbosity.
# We pass in our SVD algorithm as the estimator used to fit the data.
# X is the data set that we want to fit.
# Since our estimator (the SVD algorithm) does not score its own fit, we must either
# define our own estimator or simply define how to score the fitting.
# Since we currently rate each user's enjoyment of an article as a binary value (please
# see the rate_article fn in the filter script), we can easily decide our precision and
# recall based on whether or not our prediction exactly matches the binary rating field
# in the test set.
# Thus, the F1 scoring metric seems an intuitive choice for measuring our success, as it
# provides a balanced score based on the two.
cv(estimator=algorithm, X=active_time_data, scoring='f1', cv=5, verbose=True)
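# Note that the call above hands an unsupervised TruncatedSVD and no label column to the
# cross-validation routine, so the 'f1' scorer has nothing to compare against. For
# reference only, this is how cross_validate with scoring='f1' is typically wired: a
# supervised classifier (LogisticRegression here as a stand-in, not this project's
# method) plus an explicit binary target; the 'rating' column name is an assumption.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

features = active_time_data.drop(columns=['rating'])   # assumed binary rating column
target = active_time_data['rating']
results = cross_validate(LogisticRegression(max_iter=1000), features, target,
                         scoring='f1', cv=5)
print(results['test_score'].mean())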
from sklearn.metrics import accuracy_score   # evaluation metric
from sklearn.svm import SVC                  # import SVM classifier

svc_default = SVC()                          # default model
svc_default.fit(X_train, y_train)
pred1 = svc_default.predict(X_test)
print(accuracy_score(y_test, pred1))

svc_1 = SVC(C=0.1, kernel="linear")          # model with defined parameters
svc_1.fit(X_train, y_train)
pred2 = svc_1.predict(X_test)
print(accuracy_score(y_test, pred2))

# Cross-validation score
from sklearn.model_selection import cross_val_score as cv
cv_score = cv(svc_default, X_train, y_train, cv=3)   # CV score for default model
cv2_score = cv(svc_1, X_train, y_train, cv=3)        # CV score for model with predefined parameters

# Grid search for the optimal SVM parameters
from sklearn.model_selection import GridSearchCV     # import grid search function

# define grid values for each SVM parameter
params = [
    {'C': [0.05, 0.04, 0.07], 'kernel': ['linear']},
    {'C': [10, 3], 'gamma': [0.001, 0.2], 'kernel': ['rbf']},
]

# define grid search
grid_svc = GridSearchCV(estimator=svc_default, param_grid=params, cv=3)
grid_svc.fit(X_train, y_train)                        # perform grid search on the data set
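# A short follow-up to the grid search above, using the attributes GridSearchCV
# exposes after fitting (continues directly from grid_svc / X_test / y_test above):
print(grid_svc.best_params_)            # winning C / kernel / gamma combination
print(grid_svc.best_score_)             # mean cross-validated accuracy of that combination

best_svc = grid_svc.best_estimator_     # refit on the full training data by default
pred_best = best_svc.predict(X_test)
print(accuracy_score(y_test, pred_best))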
                                     y, train_size=0.8, random_state=0)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
rfc.score(x_test, y_test)   # score is 0.99

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

from sklearn.model_selection import GridSearchCV
para = [{
    "n_estimators": [30, 40, 50],
    "criterion": ["gini", "entropy"],
}, {
    "n_estimators": [60, 70, 80, 90],
    "criterion": ["gini", "entropy"],
}]
cv = GridSearchCV(estimator=rfc, param_grid=para, cv=15)
c = cv.fit(x_train, y_train)
cv.best_params_
cv.best_score_
y_cv = cv.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_cv)