Example #1
def build_and_eval(X,y, extra = None, scorer = 'r2',get_max = True,
                   return_models = False, return_optimal = False,
                   score_options = False, omni_seed = 8):
    '''
    Taking a (normalized) X and its corresponding y, the function builds a
    multiple-regression model before attempting to regularize with ridge and
    lasso. The function returns a dictionary of the models, keyed by regularizer
    (i.e. 'Lasso', 'Ridge', or 'Normal' [no regularization performed]), with the
    option to return only the best-performing model of each regularization type.
    '''
    if score_options: score_options()  # if a callable is passed here, it is invoked to print the available scoring options
    model_holder = {'Normal':[] ,'Ridge':[], 'Lasso':[]}
    baseline = cv(lm.LinearRegression(fit_intercept = True), X, y, cv = 20,
                      scoring = scorer, return_estimator = True)
    model_holder['Normal'] = baseline['estimator']
    if get_max:
        precurser = 'Largest ' + scorer + ': '
    else:
        precurser = 'Smallest ' + scorer + ': '

    if extra is None:
        print('Multiple Regression:')
    else:
        print('Multiple Regression ' + extra + ':')
    # report the baseline extreme that matches the direction requested by get_max
    baseline_score = baseline['test_score'].max() if get_max else baseline['test_score'].min()
    print(precurser + str(baseline_score) + '\n')
    
    # regularize
    reg_vals = {'penalty':list(range(1,21)), 'Ridge':list(), 'Lasso':list() }
    
    for penalty in reg_vals['penalty']:
        ridger = cv(lm.Ridge(alpha = penalty, random_state = omni_seed), X, y, scoring = scorer,
                    cv = 10, return_estimator = True)
        lasso = cv(lm.Lasso(alpha = penalty, max_iter = 50000, random_state = omni_seed), X, y, scoring = scorer,
                   cv = 10, return_estimator = True)
        
        #obtain the min/max score and the corresponding model
        s,c = get_score_and_model(ridger['test_score'],ridger['estimator'], get_max = get_max)
        reg_vals['Ridge'].append(round(s,3))
        model_holder['Ridge'].append(c)
        
        s,c = get_score_and_model(lasso['test_score'], lasso['estimator'], get_max = get_max)
        reg_vals['Lasso'].append(round(s,3))
        model_holder['Lasso'].append(c)
        
    best_alpha = {'Ridge':0, 'Lasso':0} # use to obtain the best models based on scoring
    for val in ['Ridge', 'Lasso']:
        # pick the best score in the direction requested by get_max
        v = max(reg_vals[val]) if get_max else min(reg_vals[val])
        print(val + ' Regression:')
        best_alpha[val] = reg_vals['penalty'][reg_vals[val].index(v)]
        print(precurser + str(v) + ' for corresponding alpha = ' +
              str(best_alpha[val]) + '\n')
    
    if return_optimal:
        return_models = True
        for val in ['Ridge', 'Lasso']:
            model_holder[val] = [m for m in model_holder[val] if m.alpha == best_alpha[val]]
    
    if return_models:
        return model_holder
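A minimal usage sketch for the function above, assuming `cv` aliases sklearn's `cross_validate`, `lm` aliases `sklearn.linear_model`, and the helper `get_score_and_model` is defined elsewhere in the same file; the toy data below is purely illustrative:

from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

# toy regression data, standardized as the docstring expects
X_raw, y = make_regression(n_samples=200, n_features=10, noise=5.0, random_state=8)
X = StandardScaler().fit_transform(X_raw)

# keep only the best-scoring Ridge/Lasso fits alongside the unregularized baseline models
models = build_and_eval(X, y, extra='(toy data)', scorer='r2', return_optimal=True)
print({k: len(v) for k, v in models.items()})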
Example #2
def show_stats(x, y, classifier, validation, name):
    print(name)
    stats = cv(classifier, x, y, cv=validation)
    score = (sum(stats['test_score']) / len(stats['test_score']))
    print('Accuracy :', score)
    print('-----------------------------')
    return stats
Example #3
File: hw3.py Project: michridan/cs437
def show_stats(lx, ly, classifier, validation, name):
    print(name)
    scores = []
    for i in range(len(lx)):
        stats = cv(classifier, lx[i], ly[i], cv=validation)
        scores.append(sum(stats['test_score']) / len(stats['test_score']))
    # print the mean score for each monk dataset alongside its index
    for i, score in enumerate(scores):
        print('monk', i, ':', score)
    print('-----------------------------')
    return sum(scores) / len(scores)
Example #4
def gridsearch_lasso_best(X, y):
    score_best = 0
    param_best = {'alpha': 1}
    for alp in [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]:
        lasso = Lasso(alpha=alp)
        split = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
        score = cv(lasso, X, y, cv=split).mean()
        if score > score_best:
            score_best = score
            param_best = {'alpha': alp}
        print(param_best)
    return param_best
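For comparison, the same alpha search can be written with sklearn's GridSearchCV, which handles the loop and the best-parameter bookkeeping; a minimal sketch mirroring the grid above (the function name is illustrative):

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, ShuffleSplit

def gridsearch_lasso_best_gscv(X, y):
    # same split strategy and alpha grid as the hand-rolled loop above
    split = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    grid = GridSearchCV(Lasso(),
                        param_grid={'alpha': [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]},
                        cv=split)
    grid.fit(X, y)
    return grid.best_params_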
Example #5
def evaluate(solution):
    # score the feature subset selected by `solution` (a column mask / index array)
    results = cv(RFC(n_estimators=10), train_x[:, solution], train_Y, cv=3)
    return results["test_score"].mean()
Example #6
# (excerpt begins mid-snippet: a pandas read_csv call loads active_time_data,
#  passing names=header and engine='python')

# Number of users in current set
print('Number of unique users in current data-set',
      active_time_data.user_id.unique().shape[0])
print('Number of unique articles in current data-set',
      active_time_data.item_id.unique().shape[0])

# SVD allows us to look at our input matrix as a product of three smaller matrices; U, Z and V.
# In short this will help us discover concepts from the original input matrix,
# (subsets of users that like subsets of items)
# Note that use of SVD is not strictly restricted to user-item matrices
# https://www.youtube.com/watch?v=P5mlg91as1c

algorithm = TruncatedSVD()

# Finally we run our cross validation in n folds, where n is denoted by the cv parameter.
# Verbose can be adjusted by an integer to determine level of verbosity.
# We pass in our SVD algorithm as the estimator used to fit the data.
# X is our data set that we want to fit.
# Since our estimator (the SVD algorithm) does not score predictions on its own, we must either define our own
# estimator, or we can simply define how to score the fit.
# Since we currently rate each user's enjoyment of an article in a binary fashion (please see the rate_article fn in
# the filter script), we can easily compute precision and recall based on whether or not our prediction exactly
# matches the binary rating field in the test set.
# Thus, the F1 scoring metric seems an intuitive choice for measuring our success, as it provides a balanced score
# based on the two.

cv(estimator=algorithm, X=active_time_data, scoring='f1', cv=5, verbose=True)
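As a side note, here is a small self-contained sketch of what TruncatedSVD does to a user-item matrix; the toy ratings below are made up purely for illustration:

import numpy as np
from sklearn.decomposition import TruncatedSVD

# toy user-item matrix: 4 users x 5 articles, binary "liked" ratings
ratings = np.array([[1, 1, 0, 0, 0],
                    [1, 0, 1, 0, 0],
                    [0, 0, 0, 1, 1],
                    [0, 0, 1, 1, 1]])

svd = TruncatedSVD(n_components=2, random_state=0)
user_concepts = svd.fit_transform(ratings)  # each user expressed in terms of 2 latent "concepts"
print(user_concepts.shape)                  # (4, 2)
print(svd.explained_variance_ratio_)        # how much of the matrix each concept explains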
Example #7
from sklearn.metrics import accuracy_score # Evaluation Metric 
from sklearn.svm import SVC # import SVM Classifier
svc_default = SVC() # Default Model
svc_default.fit(X_train,y_train)
pred1= svc_default.predict(X_test)
print(accuracy_score(y_test,pred1))

svc_1 = SVC(C= 0.1,kernel = "linear") # Model with defined parameters
svc_1.fit(X_train,y_train)

pred2= svc_1.predict(X_test)
print(accuracy_score(y_test,pred2))

# Cross Validation Score
from sklearn.model_selection import cross_val_score as cv
cv_score = cv(svc_default, X_train, y_train, cv=3)# CV score for default model

cv2_score = cv(svc_1, X_train, y_train, cv=3)# CV score for model predefined parameters

# Grid search for the optimal SVM parameters
from sklearn.model_selection import GridSearchCV # Import grid search function

# define grid values for each SVM Parameter
params = [
              {'C': [0.05,0.04, 0.07], 'kernel': ['linear']}, 
              {'C': [10,3], 'gamma': [0.001,0.2], 'kernel': ['rbf']}
         ]

# define grid search
grid_svc = GridSearchCV(estimator=svc_default, param_grid=params, cv=3)
grid_svc.fit(X_train, y_train)  # perform grid search on the data set
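After fitting, the selected parameters and the refit model can be read back from the grid-search object; a brief follow-up sketch using the variable names above:

print(grid_svc.best_params_)   # parameter combination with the highest mean CV accuracy
print(grid_svc.best_score_)    # the corresponding mean cross-validated score
best_svc = grid_svc.best_estimator_                       # refit on the full training set by default
print(accuracy_score(y_test, best_svc.predict(X_test)))   # held-out accuracy of the tuned model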
from sklearn.model_selection import train_test_split
# the head of this call was cut off in the excerpt; the feature-matrix name below is assumed
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
rfc.score(x_test, y_test)

# Score is 0.99

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

from sklearn.model_selection import GridSearchCV
para = [{
    "n_estimators": [30, 40, 50],
    "criterion": ["gini", "entropy"]
}, {
    "n_estimators": [60, 70, 80, 90],
    "criterion": ["gini", "entropy"]
}]
# grid search over both parameter grids with 15-fold cross-validation
grid_rfc = GridSearchCV(estimator=rfc, param_grid=para, cv=15)
grid_rfc.fit(x_train, y_train)
grid_rfc.best_params_
grid_rfc.best_score_
y_cv = grid_rfc.predict(x_test)

cm = confusion_matrix(y_test, y_cv)
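A brief follow-up one might add to inspect the result (illustrative only):

print(cm)                     # rows are true classes, columns are predicted classes
print(cm.trace() / cm.sum())  # overall accuracy recovered from the confusion matrix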