Example #1
def classic_rsearch(x, y):
    from scipy.stats import uniform as sp_rand
    from scipy.stats import randint as sp_randint
    lr1 = LR(warm_start=True, penalty='l1', verbose=100, max_iter=5000)
    lr2 = LR(warm_start=True, penalty='l2', verbose=100, max_iter=5000)
    svm = SVM(verbose=True, probability=False, max_iter=5000)
    rf = RF(warm_start=True, verbose=100)

    # random search params
    lr_params = {'C': sp_rand(1, 1e5)}
    rf_params = {'criterion': ['gini', 'entropy'], 'n_estimators': sp_randint(10, 200), 'max_features': ['auto', 'sqrt', 'log2', None]}
    mlp_params = {'hidden_layer_sizes': [(64, 64), (128, 128), (256, 256), (512, 512)], 'alpha': sp_rand(1e-6, 1e-2)}  # note: defined but unused below
    svm_params = {'kernel': ['rbf', 'poly'], 'C': sp_rand(1, 1e5), 'gamma': sp_rand(1e-5, 1)}

    results = {}
    models = []
    lst = [lr1, lr2, svm, rf]
    names = ['LR1', 'LR2', 'SVM', 'RF']  # one name per estimator in lst
    params = [lr_params, lr_params, svm_params, rf_params]
    for idx in range(len(lst)):
        n_iter_search = 60
        start = time.time()
        rsearch = random_search(estimator=lst[idx], param_distributions=params[idx], n_iter=n_iter_search,
                                scoring='roc_auc', fit_params=None, n_jobs=1,
                                iid=True, refit=True, cv=5, verbose=0, random_state=8)
        rsearch.fit(x, y)
        models.append(rsearch)
        results[names[idx]] = rsearch.cv_results_
        print(names[idx] + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidate"
              " parameter settings." % ((time.time() - start), n_iter_search))
    return (results, models)
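A note on the distributions used above: scipy.stats.uniform is parameterized as uniform(loc, scale) and samples over [loc, loc + scale], while randint(low, high) samples integers from [low, high). A minimal sketch:

from scipy.stats import uniform as sp_rand
from scipy.stats import randint as sp_randint

# uniform(loc=1, scale=1e5) draws floats over [1, 100001], not [1, 1e5]
print(sp_rand(1, 1e5).rvs(3))
# randint(10, 200) draws integers from [10, 200)
print(sp_randint(10, 200).rvs(3))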
Example #2
def pick_best_features(df):
    """
    Randomized search to find the best features. TODO refactor
    :param df: input data frame
    :return: dict mapping each output column to its best estimator
    """

    #X = sample_data_random(df, .25)
    X = df[0:int(df.shape[0] * .25)]
    overfit_models = dict()
    for out in outputs:
        print(out)
        pipe_clf = CustomPipeline.get_transforms()

        clf = SGDClassifier(loss='log')

        tuned_parameters = {'alpha': sp_rand()}
        score = 'neg_log_loss'  # the 'log_loss' scorer name changed in scikit-learn 0.18
        tran_x = pipe_clf.fit_transform(X)
        grid = RandomizedSearchCV(clf, tuned_parameters, cv=5, scoring=score)
        grid.fit(tran_x, X[out])
        print(grid.best_estimator_)
        overfit_models[out] = grid.best_estimator_
    return overfit_models
    def set_hyperparameters(self):
        self.p_grid = {
            'C': sp_rand(),
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'max_iter': sp_randint(1, 3000)
        }
def svmClassifier(x, y, folds):
    n_iter_search = 50
    skf = StratifiedKFold(folds)

    clf = SVC(kernel='linear', random_state=7)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)

    # Hyper parameters
    C = sp_rand()

    param_dist = dict(svc__C=C)

    random_search = RandomizedSearchCV(pipe,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=skf,
                                       scoring='roc_auc')
    random_search.fit(x, y)
    tuned_params = random_search.best_params_
    print(tuned_params)

    # Tuned Hyper parameters
    C = tuned_params['svc__C']

    # print('\nSVM(Linear) Best Tuned Model')
    tuned_clf = SVC(kernel='linear', C=C, random_state=7)
    modelEvaluator(tuned_clf, x, y, folds)
Example #5
    def Predict(self, userId):
        param_grid = {'alpha': sp_rand()}
        rsearch = RandomizedSearchCV(estimator=AnalysisappConfig.model,
                                     param_distributions=param_grid,
                                     n_iter=200,
                                     cv=20,
                                     random_state=42)  # model: Lasso

        YOU = my_ratings[my_ratings['userId'] ==
                         userId]  # select the user: YOU is set from userId

        rsearch.fit(YOU[genre_cols], YOU['rating'])  # fit on the genre columns

        #rsearch.best_estimator_.alpha

        intercept = rsearch.best_estimator_.intercept_
        coef = rsearch.best_estimator_.coef_
        """
        you_profile = pd.DataFrame([intercept, *coef],  # 유저 profile 생성. 장르별 계수
                                   index=['intercept', *genre_cols], columns=['score'])
        """
        predictions = rsearch.best_estimator_.predict(genres)
        genres['YOU'] = predictions

        rating_predictions = genres[~genres.index.isin(YOU['movieId']
                                                       )].sort_values(
                                                           'YOU',
                                                           ascending=False)
        rating_predictions = rating_predictions.merge(
            movies[['movieId', 'title']], left_index=True, right_on='movieId')

        return rating_predictions  # predicted ratings: can show the best, the worst, or anything in between
def tune_logistic_regression(X_train, y_train):
    param_grid = {
        'estimator__penalty': ['l1', 'l2'],
        'estimator__class_weight': ['balanced', None],
        'estimator__C': sp_rand(),
    }
    tune_hyper_parameters(LogisticRegression(solver='liblinear'), param_grid, X_train, y_train)
Example #7
def RandTest():
    # prepare a uniform distribution to sample for the alpha parameter
    param_grid = {'alpha': sp_rand()}
    # create and fit a ridge regression model, testing random alpha values
    model = Ridge()
    rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
    rsearch.fit(X, y)
    print(rsearch)
    # summarize the results of the random parameter search
    print(rsearch.best_score_)
    print(rsearch.best_estimator_.alpha)
Example #8
def train_classifier(X_train, y_train, to_tune, classifier):
    # Initialize classifier (the SVR assignment below overrides BayesianRidge).
    clf = BayesianRidge()
    clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    #clf = RandomForestRegressor()
    if classifier:
        clf = classifier
        to_tune = False
    if to_tune:
        # Randomized search: find optimal classifier parameters.
        # (the second param_grid overrides the first; it matches the SVR above)
        param_grid = {'alpha_1': sp_rand(), 'alpha_2': sp_rand()}
        param_grid = {'C': sp_rand(), 'gamma': sp_rand()}
        rsearch = RandomizedSearchCV(estimator=clf, 
                                     param_distributions=param_grid, n_iter=5000)
        rsearch.fit(X_train, y_train)
        # Use tuned classifier.
        clf = rsearch.best_estimator_
          
    # Train the classifier.
    clf.fit(X_train, y_train)
    return clf
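A hypothetical usage sketch (X_train, y_train, and X_test stand in for your own arrays):

# hypothetical usage: tune the default SVR, then reuse the fitted model
model = train_classifier(X_train, y_train, to_tune=True, classifier=None)
predictions = model.predict(X_test)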
Example #9
def r_search(x, y):
    #random search params
    lr_params = {'penalty': ['l1', 'l2'], 'C': sp_rand(1e-5, .1)}
    svm_params = {'kernel': ['rbf', 'linear'], 'C': sp_rand(10, 1e5)}
    rf_params = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': sp_randint(50, 200),
        'bootstrap': [True, False]
    }
    gbc_params = {
        'learning_rate': sp_rand(1e-6, 1e-1),
        'n_estimators': sp_randint(50, 200),
        'loss': ['deviance', 'exponential']
    }

    data = {}
    xs, ys = balanced_subsample(x, y)
    lst = [LR(verbose=1), RF(verbose=1), SVM(verbose=True), GBC(verbose=1)]
    names = ['LR', 'RF', 'SVM', 'GB']
    params = [lr_params, rf_params, svm_params, gbc_params]
    for idx in range(len(lst)):
        n_iter_search = 60
        start = time.time()
        rsearch = random_search(estimator=lst[idx],
                                param_distributions=params[idx],
                                n_iter=n_iter_search,
                                scoring='roc_auc',
                                fit_params=None,
                                n_jobs=1,
                                iid=True,
                                refit=True,
                                cv=5,
                                verbose=0,
                                random_state=8)
        rsearch.fit(xs, ys)
        data[names[idx]] = rsearch.cv_results_
        print(names[idx] + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidate"
              " parameter settings." % ((time.time() - start), n_iter_search))
    return data
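Note that the fit_params and iid constructor arguments no longer exist in recent scikit-learn releases; a minimal sketch of the equivalent call on a current version, assuming the same estimator list and parameter grids:

from sklearn.model_selection import RandomizedSearchCV

# fit parameters now go to fit() itself, and iid has been removed
rsearch = RandomizedSearchCV(estimator=lst[idx],
                             param_distributions=params[idx],
                             n_iter=n_iter_search,
                             scoring='roc_auc',
                             n_jobs=1,
                             refit=True,
                             cv=5,
                             verbose=0,
                             random_state=8)
rsearch.fit(xs, ys)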
Example #10
def r_search(x, y, input_shape):
    #random search params
    mlp_params = {'units': [64, 128, 256, 512], 'rate': sp_rand(.2, .9)}
    lstm_params = {'units': [64, 128, 256, 512], 'rate': sp_rand(.2, .9)}
    cnn_params = {
        'filters': [32, 64, 128, 256, 512],
        'filter_length': [2, 3, 4, 5, 6],
        'pool_size': [2, 3]
    }

    data = {}
    xs, ys = balanced_subsample(x, y)
    lst = [
        mlp_train(input_shape),
        lstm_train(input_shape),
        cnn_train(input_shape)
    ]
    names = ['MLP', 'LSTM', 'CNN']
    params = [mlp_params, lstm_params, cnn_params]
    for idx in range(len(lst)):
        n_iter_search = 60
        start = time.time()
        rsearch = random_search(estimator=lst[idx],
                                param_distributions=params[idx],
                                n_iter=n_iter_search,
                                scoring='roc_auc',
                                fit_params=None,
                                n_jobs=1,
                                iid=True,
                                refit=True,
                                cv=3,
                                verbose=10,
                                random_state=8)
        rsearch.fit(xs, ys)
        data[names[idx]] = rsearch.cv_results_
        print(names[idx] + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidate"
              " parameter settings." % ((time.time() - start), n_iter_search))
    return data
def cross_validate(train_features, targets_train, iters):
	"""
	Runs randomized cross validation using adjustable MultinomialNB params.

	Returns:
		The model that is the most accurate
	"""
	print('starting cross validation')
	param_grid = {'alpha': sp_rand()}
	model = MultinomialNB()
	rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=iters)
	rsearch.fit(train_features, targets_train)
	print('finished cross validation')
	print('best model has a score of {} using alpha={}'.format(rsearch.best_score_, rsearch.best_estimator_.alpha))
	return rsearch.best_estimator_.alpha
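A hypothetical usage sketch (train_features and targets_train stand in for your own arrays):

# hypothetical usage: tune alpha, then refit a final model with it
best_alpha = cross_validate(train_features, targets_train, iters=50)
final_model = MultinomialNB(alpha=best_alpha)
final_model.fit(train_features, targets_train)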
Example #12
def test_random_parameters(X, y):
    """
    Sometimes it is more efficient to randomly select a parameter from the given range,
    estimate the algorithm quality for this parameter and choose the best one.
    """
    # prepare a uniform distribution to sample for the alpha parameter
    param_grid = {'alpha': sp_rand()}
    # create and fit a ridge regression model, testing random alpha values
    model = Ridge()
    rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
    rsearch.fit(X, y)
    print(rsearch)
    # summarize the results of the random parameter search
    print(rsearch.best_score_)
    print(rsearch.best_estimator_.alpha)
Example #13
def optimize_params():
    """
    A harder task is building an effective way to pick the right parameters;
    here we use search to determine them.
    """
    import numpy as np

    from sklearn import datasets
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in 0.20

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    # prepare a range of alpha values to test
    alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
    # create and fit a ridge regression model, testing each alpha
    model = Ridge()
    grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
    # print(dict(alpha=alphas))
    grid.fit(X, y)
    print(grid)

    # summarize the results of the grid search
    print(grid.best_score_)
    print(grid.best_estimator_.alpha)

    # Sometimes it is effective to sample parameters at random from a given
    # range, evaluate the algorithm for each sample, and keep the best one.
    from scipy.stats import uniform as sp_rand
    from sklearn.model_selection import RandomizedSearchCV

    # prepare a uniform distribution to sample for the alpha parameter
    param_grid = {"alpha": sp_rand()}

    # create and fit a ridge regression model, testing random alpha values
    model = Ridge()
    rsearch = RandomizedSearchCV(estimator=model,
                                 param_distributions=param_grid,
                                 n_iter=100)
    rsearch.fit(X, y)
    print(rsearch)

    # summarize the results of the random parameter search
    print(rsearch.best_score_)
    print(rsearch.best_estimator_.alpha)
def contentsbased1(user_id, movie_id, genres_p):
    print('======== predicted ratings for all movies - by genre ===========')
    print('START TIME : ', str(datetime.now())[10:19])
    start = time.time()

    conn = pymysql.connect(host=config('HOST'),
                           port=3306,
                           user=config('USER'),
                           password=config('PASSWORD'),
                           db=config('DB'))
    sql = 'SELECT * FROM wouldyouci.accounts_rating WHERE user_id=%s'
    ratings = pd.read_sql_query(sql, conn, params=[user_id])
    genres = genres_p

    conn.close()

    user_profile = ratings.merge(genres, left_on='movie_id', right_index=True)

    model = Lasso()
    param_grid = {'alpha': sp_rand()}

    research = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_grid,
                                  n_iter=20,
                                  cv=5,
                                  random_state=406)

    research.fit(user_profile[genres.columns], user_profile['score'])
    predictions = research.best_estimator_.predict(genres)
    genres.reset_index()  # note: result is not assigned, so this call has no effect

    genres['predict'] = predictions

    predicted_score = genres.at[movie_id, 'predict']
    print('END TIME : ', str(datetime.now())[10:19])

    end = time.time()
    print('TOTAL TIME : ', end - start)
    print('PREDICTED SCORE : ', predicted_score)
    print()
    return pd.DataFrame.to_json(genres['predict'])
    def get_params(classifier_name):
        params_dt = {
            'max_features': sp_rand(0.1, 0.9),
            'min_samples_leaf': sp_randint(1, 21),
            'min_samples_split': sp_randint(2, 21),
            'criterion': ['entropy', 'gini']
        }

        params_random_forest = {
            'n_estimators': sp_randint(1, 100),
            'bootstrap': [True, False],
            'max_features': sp_rand(0.1, 0.9),
            'min_samples_leaf': sp_randint(1, 21),
            'min_samples_split': sp_randint(2, 21),
            'criterion': ['entropy', 'gini']
        }

        params_adaboost = {
            'base_estimator__max_depth': sp_randint(1, 11),
            'algorithm': ['SAMME', 'SAMME.R'],
            'n_estimators': sp_randint(50, 501),
            'learning_rate': sp_rand(0.01, 2)
        }

        params_gboosting = {
            'learning_rate': sp_rand(0.01, 1),
            'criterion': ['friedman_mse', 'mse'],
            'n_estimators': sp_randint(50, 501),
            'max_depth': sp_randint(1, 11),
            'min_samples_split': sp_randint(2, 21),
            'min_samples_leaf': sp_randint(1, 21),
            'max_features': sp_rand(0.1, 0.9)
        }

        params_log_regression = {
            'penalty': ['l2', 'none'],
            'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
            'C': sp_rand(2**(-5), 2**15)
        }

        parameter_space = {
            'DT': params_dt,
            'RF': params_random_forest,
            'AB': params_adaboost,
            'GB': params_gboosting,
            'LR': params_log_regression
        }
        return parameter_space[classifier_name]
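A hypothetical usage sketch, pairing the 'RF' parameter space with a matching estimator:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# hypothetical usage: X_train, y_train assumed to be defined
search = RandomizedSearchCV(RandomForestClassifier(),
                            param_distributions=get_params('RF'),
                            n_iter=50)
search.fit(X_train, y_train)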
Example #16
    def guessYouLikeIt(self, userId):
        model = Lasso()  # set up the model
        param_grid = {'alpha': sp_rand()}
        rsearch = RandomizedSearchCV(estimator=model,
                                     param_distributions=param_grid,
                                     n_iter=200,
                                     cv=20,
                                     random_state=42)

        YOU = my_ratings[my_ratings['userId'] ==
                         userId]  # select the user: YOU is set from userId

        rsearch.fit(YOU[genre_cols], YOU['rating'])  # fit on the genre columns

        rsearch.best_estimator_.alpha  # note: statement has no effect

        intercept = rsearch.best_estimator_.intercept_
        coef = rsearch.best_estimator_.coef_

        you_profile = pd.DataFrame(
            [intercept, *coef],  # build the user profile: per-genre coefficients
            index=['intercept', *genre_cols],
            columns=['score'])

        predictions = rsearch.best_estimator_.predict(genres)
        genres['YOU'] = predictions

        rating_predictions = genres[~genres.index.isin(YOU['movieId']
                                                       )].sort_values(
                                                           'YOU',
                                                           ascending=False)
        rating_predictions = rating_predictions.merge(
            movies[['movieId', 'title']], left_index=True, right_on='movieId')

        Top5 = rating_predictions.sort_values(by='YOU',
                                              ascending=False)[:5]  # top 5 recommendations
        Worst5 = rating_predictions.sort_values(
            by='YOU', ascending=True)[:5]  # worst 5 (not recommended)
        print(
            Top5)  # predicted ratings: can show the best, the worst, or anything else
def logisticRegression(x, y, folds):
    n_iter_search = 50
    skf = StratifiedKFold(folds)

    clf = LogisticRegression(random_state=7)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)

    # Hyper parameters
    C_range = sp_rand()
    penalty_options = ['l1', 'l2']
    solver = ['liblinear', 'saga']
    max_iter = sp_randint(1, 3000)

    param_dist = dict(logisticregression__C=C_range,
                      logisticregression__penalty=penalty_options,
                      logisticregression__solver=solver,
                      logisticregression__max_iter=max_iter)

    random_search = RandomizedSearchCV(pipe,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=skf,
                                       scoring='roc_auc')
    random_search.fit(x, y)
    tuned_params = random_search.best_params_
    print(tuned_params)

    # Tuned Hyper parameters
    penalty_options = tuned_params['logisticregression__penalty']
    C_range = tuned_params['logisticregression__C']
    solver = tuned_params['logisticregression__solver']
    max_iter = tuned_params['logisticregression__max_iter']

    print('\nLR Best Tuned Model')
    tuned_clf = LogisticRegression(penalty=penalty_options,
                                   C=C_range,
                                   random_state=7,
                                   solver=solver,
                                   max_iter=max_iter)
    modelEvaluator(tuned_clf, x, y, folds)
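The logisticregression__ prefixes work because make_pipeline names each step after its lowercased class; a minimal sketch:

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(preprocessing.StandardScaler(), LogisticRegression())
# step names are derived from the class names, lowercased
print(list(pipe.named_steps))  # ['standardscaler', 'logisticregression']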
Example #18
def contentsbased_onscreen(user_id):
    genres = MoviesConfig.genre_pickle

    ratings = pd.DataFrame(
        list(Rating.objects.filter(user=user_id).values('score', 'movie_id')))

    user_profile = ratings.merge(genres, left_on='movie_id', right_index=True)

    model = Lasso()
    param_grid = {'alpha': sp_rand()}

    research = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_grid,
                                  n_iter=30,
                                  cv=5,
                                  random_state=406)

    research.fit(user_profile[genres.columns], user_profile['score'])

    predictions = research.best_estimator_.predict(genres)
    genres.reset_index()

    genres['predict'] = predictions

    onscreen_id_set = Movie.objects.exclude(onscreens=None).exclude(
        genres=None)
    onscreen_id_set = onscreen_id_set.values_list('id', flat=True)

    score_info = []

    for _id in onscreen_id_set:
        try:
            score_info.append((_id, genres.at[_id, 'predict']))
        except KeyError:
            print(f'{_id} is not in the pickle; the genre pickle needs to be updated.')

    score_info = sorted(score_info, key=lambda x: -x[1])[:10]
    onscreen_id_set = [x for x, y in score_info]

    return onscreen_id_set
Example #19
def main():
    # print(sys.argv[0])
    # print(sys.argv[1])
    # print("cani cani? hi cani?")
    perfumes = pd.read_csv('perfumes-test-re.csv')
    types_dummies = perfumes['type'].str.get_dummies(sep="|")
    types_dummies.to_pickle('types.p')
    my_ratings = pd.read_csv('added-rating.csv')
    types = pd.read_pickle('types.p')
    my_ratings_came = my_ratings.merge(perfumes, on='perfumeId').merge(
        types, left_on='perfumeId', right_index=True)
    my_ratings = my_ratings_came
    user3 = my_ratings_came[my_ratings_came['userId'] == 3]
    type_cols = types.columns
    model = Lasso()
    param_grid = {'alpha': sp_rand()}
    rsearch = RandomizedSearchCV(estimator=model,
                                 param_distributions=param_grid,
                                 n_iter=100,
                                 cv=13,
                                 random_state=42)
    rsearch.fit(user3[type_cols], user3['rating'])
    intercept = rsearch.best_estimator_.intercept_
    coef = rsearch.best_estimator_.coef_
    user3_profile = pd.DataFrame([intercept, *coef],
                                 index=['intercept', *type_cols],
                                 columns=['score'])
    predictions = rsearch.best_estimator_.predict(types)
    types['user3'] = predictions  # add a 'user3' column to the types frame
    rating_predictions = types[~types.index.isin(user3['perfumeId'])].sort_values(
        'user3', ascending=False)  # perfumes user 3 has not rated
    ratings_predictions = rating_predictions.merge(
        perfumes[['perfumeId', 'name']], left_index=True, right_on='perfumeId')
    print(ratings_predictions.head())
    #test = pd.read_csv('test12.csv')
    #print(test)
    # print("data_comp")
    sys.stdout.flush()
def adaBoostClassifier(x, y, folds):
    print("\nAda Boost Classifier -Best Tuned Model")

    clf = AdaBoostClassifier(random_state=7)
    pipe = make_pipeline(preprocessing.StandardScaler(), clf)

    skf = StratifiedKFold(folds)
    n_iter_search = 50
    # Hyper parameters
    n_estimators = sp_randint(1, 300)
    learning_rate = sp_rand()
    algorithm = ['SAMME', 'SAMME.R']

    param_dist = dict(adaboostclassifier__n_estimators=n_estimators,
                      adaboostclassifier__learning_rate=learning_rate,
                      adaboostclassifier__algorithm=algorithm)

    random_search = RandomizedSearchCV(pipe,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=skf,
                                       scoring='roc_auc',
                                       verbose=1)
    random_search.fit(x, y)
    tuned_params = random_search.best_params_
    print(tuned_params)

    # Tuned Hyper parameters
    n_estimators = tuned_params['adaboostclassifier__n_estimators']
    learning_rate = tuned_params['adaboostclassifier__learning_rate']
    algorithm = tuned_params['adaboostclassifier__algorithm']

    print('\nAB Best Tuned Model')
    tuned_clf = AdaBoostClassifier(n_estimators=n_estimators,
                                   learning_rate=learning_rate,
                                   algorithm=algorithm,
                                   random_state=7)
    modelEvaluator(tuned_clf, x, y, folds)
Example #21
def search(examples, fd_train, eg_train, iterations):
    """
    beginnnings of hyperparameter search for svm
    """
    param_grid = {'C': sp_rand()}
    lessons_train = list()
    outcomes_train = list()
    for _ in tnrange(examples):
        cameras_train = eg_train.generate()
        match_id = get_match_id(cameras_train)
        goods, bads = make_good_bad(cameras_train, match_id)
        make_work(fd_train, lessons_train, outcomes_train, goods, 1)
        make_work(fd_train, lessons_train, outcomes_train, bads, 0)
    clf = svm.SVC()
    print('searching')
    start = time.time()
    rsearch = RandomizedSearchCV(
        estimator=clf, param_distributions=param_grid, n_iter=iterations)
    rsearch.fit(lessons_train, outcomes_train)
    end = time.time()
    print('searching took {} seconds'.format(end - start))
    print(rsearch.best_score_)
    print(rsearch.best_estimator_.C)
Example #22
    def tune_parameters(self, texts, classes):
        # Help:
        # http://scikit-learn.org/stable/modules/grid_search.html#tuning-the-hyper-parameters-of-an-estimator
        # http://machinelearningmastery.com/how-to-tune-algorithm-parameters-with-scikit-learn/
        self.logger.debug("Start parameter tuning...")
        model = Pipeline([
            ('vect', self.vectorizer),
            ('clf',
             SGDClassifier(loss='hinge',
                           penalty='l2',
                           alpha=1e-3,
                           n_iter=5,
                           random_state=42,
                           n_jobs=-1)),
        ])

        # Implement more!

        #parameters = {'clf__alpha': (1e-1, 1e-4) }
        #gs_clf = GridSearchCV(model, parameters, n_jobs=-1)
        #gs_clf = gs_clf.fit(texts, classes)

        #param_grid = {'alpha': sp_rand()}
        parameters = {'clf__alpha': sp_rand()}
        # create and fit a ridge regression model, testing random alpha values

        # use all cores!
        gs_clf = RandomizedSearchCV(model, parameters, n_iter=1000, n_jobs=-1)
        gs_clf = gs_clf.fit(texts, classes)

        self.logger.info("Best Score: " + str(gs_clf.best_score_))
        self.logger.info("Best fitting Parameters: ")
        for param_name in sorted(parameters.keys()):
            self.logger.info("%s: %r" %
                             (param_name, gs_clf.best_params_[param_name]))
        self.logger.debug("Finished parameter tuning...")

def report(results, n_top=3):
    """Summarize the top n_top candidates from a search's cv_results_."""
    import numpy as np  # assumed available; used to locate ranked candidates
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")



logistic = linear_model.LogisticRegression(penalty='l2')

# the hyper-parameters include the type of regularization ('l1' or 'l2')
# and the inverse regularization strength C
param_grid = {'penalty': ['l1', 'l2'], 'C': sp_rand()}

num_iter = 2000
rand_search_cv = RandomizedSearchCV(logistic,
                                    param_distributions=param_grid,
                                    n_iter=num_iter)

start = time()
rand_search_cv.fit(x_df, y_df)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), num_iter))
report(rand_search_cv.cv_results_)

logistic.set_params(**rand_search_cv.best_params_)
# note: ** unpacks the best-params dict into keyword arguments
# (e.g. C=0.1231, penalty='l2')
Example #24
            "regression__n_estimators": sp_randint(100, 200),
            "regression__max_features": ["auto", "sqrt", "log2"],
            "regression__loss": ["ls", "lad", "huber", "quantile"],
            "regression__warm_start": [False, True],
        },  # GradientBoostingRegressor
        {
            "regression__n_estimators": sp_randint(50, 150),
            "regression__base_estimator": [DecisionTreeRegressor(), GradientBoostingRegressor(), SVR()],
            "regression__loss": ["linear", "square", "exponential"],
        },  # AdaBoostRegressor
        {"regression__kernel": ["linear", "poly", "rbf", "sigmoid"], "regression__C": sp_randint(1, 100)},  # SVR
    ]
    feature_sel_parameters = [
        dict(
            feat_sel__alpha=["aic", "bic"],
            feat_sel__selection_threshold=sp_rand(0.15, 0.20),
            feat_sel__scaling=sp_rand(0.35, 0.4),
            feat_sel__sample_fraction=sp_rand(0.6, 0.3),
        ),
        dict(feat_sel__k=sp_randint(10, 90)),
        dict(feat_sel__n_estimators=sp_randint(10, 90)),
        dict(feat_sel__n_features_to_select=sp_randint(10, 90), feat_sel__step=sp_randint(1, 10)),
    ]
else:
    raise Exception("seach_method must be grid or randomized!")
models = [
    RandomForestRegressor(random_state=rand_state),
    ExtraTreesRegressor(random_state=rand_state),
    GradientBoostingRegressor(random_state=rand_state),
    AdaBoostRegressor(random_state=rand_state),
    SVR(),
Example #25
    def optimize_params(self):
        params_grid = {"update_learning_rate": sp_rand(0.001, 0.01),
                       "momentum": sp_rand(0.9, 2.0),
                       "epochs": randint(50, 300)}
        rsearch = RandomizedSearchCV(estimator=self.net, param_distributions=params_grid,
                                     n_iter=15, n_jobs=1)
        X, _, y, _ = self.roof_loader.load(max_roofs=100, test_percent=0,
                                           non_roofs=self.non_roofs)
        rsearch.fit(X, y)
        Experiment.report_grid(rsearch.grid_scores_)
Example #26
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in 0.20

# prepare a range of alpha values to test
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)  # X, y assumed to be defined elsewhere
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# prepare a uniform distribution to sample for the alpha parameter
param_grid = {"alpha": sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)
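To inspect more than just the single best candidate, the sampled results can be loaded into a DataFrame; a minimal sketch, assuming pandas is available:

import pandas as pd

# rank every sampled alpha, not just the winner
cv_df = pd.DataFrame(rsearch.cv_results_)
print(cv_df[['param_alpha', 'mean_test_score', 'rank_test_score']]
      .sort_values('rank_test_score').head())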
Example #27
def main():

    np.random.seed(8)
    options = ['d_lstm', 'd_cnn']
    #'/home/andy/Desktop/MIMIC/temp/pretrain/...'
    try:
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_train.pkl',
                  'rb') as f:
            X_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_test.pkl',
                  'rb') as f:
            X_test = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_train.pkl',
                  'rb') as f:
            Y_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_test.pkl',
                  'rb') as f:
            Y_test = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_train.pkl',
                  'rb') as f:
            V_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_test.pkl',
                  'rb') as f:
            V_test = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_train.pkl',
                  'rb') as f:
            t_train = pickle.load(f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_test.pkl',
                  'rb') as f:
            t_test = pickle.load(f)
        SG = gensim.models.Word2Vec.load(
            '/home/andy/Desktop/MIMIC/temp/pretrain/SG')
        print("Training sets loaded.")
    except:  # fall back to rebuilding everything if any cached file is missing
        with open('/home/andy/Desktop/MIMIC/temp/admits.pkl', 'rb') as f:
            admits = pickle.load(f)

        with open('/home/andy/Desktop/MIMIC/temp/d.pkl', 'rb') as f:
            d = pickle.load(f)

        with open('/home/andy/Desktop/MIMIC/temp/lib.pkl', 'rb') as f:
            lib = pickle.load(f)

        with open('/home/andy/Desktop/MIMIC/temp/sentences.pkl', 'rb') as f:
            sentences = pickle.load(f)
        print("Splitting dataset...")
        X_train, X_test, V_train, V_test, t_train, t_test, Y_train, Y_test = get_split(
            admits=admits, sentences=sentences, lib=lib, dz=d)

        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_train.pkl',
                  'wb') as f:
            pickle.dump(X_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/x_test.pkl',
                  'wb') as f:
            pickle.dump(X_test, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_train.pkl',
                  'wb') as f:
            pickle.dump(V_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/v_test.pkl',
                  'wb') as f:
            pickle.dump(V_test, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_train.pkl',
                  'wb') as f:
            pickle.dump(Y_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/y_test.pkl',
                  'wb') as f:
            pickle.dump(Y_test, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_train.pkl',
                  'wb') as f:
            pickle.dump(t_train, f)
        with open('/home/andy/Desktop/MIMIC/temp/pretrain/t_test.pkl',
                  'wb') as f:
            pickle.dump(t_test, f)

        print("Making Dictionary...")
        #V_test = [np.ndarray.tolist(i) for i in V_test]
        #exons = [i[2] for i in sentences if i[2] not in V_test]
        del sentences

        V_train = [np.ndarray.tolist(i) for i in V_train]
        # Do NOT skip the previous step: converting each sentence to a plain
        # Python list is essential, otherwise training takes forever.
        #SG = gensim.models.Word2Vec(sentences = exons, sg = 1, size = 300, window = 10, min_count = int(len(exons)*.001), hs = 1, negative = 0)
        SG = gensim.models.Word2Vec(sentences=V_train,
                                    sg=1,
                                    size=300,
                                    window=10,
                                    hs=1,
                                    negative=0)

        print("...saving dictionary...")
        SG.save("/home/andy/Desktop/MIMIC/temp/pretrain/SG")

    #fixed embedding layers
    weights = SG.wv.syn0
    vocab = dict([(k, v.index) for k, v in SG.wv.vocab.items()])
    w2i, i2w = vocab_index(vocab)
    w_train = [
        list(map(lambda i: w2i[i] if i in w2i.keys() else 0, vv))
        for vv in V_train
    ]

    use_random = True  # renamed to avoid shadowing the stdlib `random` module

    if not use_random:
        Data = []
        preset = {'optimizer': 'Adam', 'learn_rate': .005}
        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = [.0001, .0005, .001, .005, .01]
        #momentum = np.arange(.5, .9, .1).tolist()
        #neurons = [100]
        dropout_W = [0.001, .01, .1, .2, .4]
        dropout_U = [0.001, .01, .1, .2, .4]
        #W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        #U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        init_mode = [
            'uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal',
            'glorot_uniform', 'he_normal', 'he_uniform'
        ]

        for o in options:
            t1 = TIME.time()
            param_grid = dict(learn_rate=learn_rate)
            data = grid_search(x=X_train,
                               y=Y_train,
                               v=V_train,
                               t=t_train,
                               SG=SG,
                               option=o,
                               nb_epoch=20,
                               cv=3,
                               n_jobs=1,
                               param_grid=param_grid,
                               preset=preset)
            with open(
                    "/home/tangfeng/MIMIC/results/randgrid_" + str(o) + ".pkl",
                    'wb') as f:
                pickle.dump(data, f)
            print("Pickle successful!")
            t2 = TIME.time()
            print("Training completed in " + str((t2 - t1) / 3600) + " hours")

            Data += data

    # Data = pd.DataFrame([pd.Series(dd) for dd in Data])
        with open("/home/tangfeng/MIMIC/results/gridsearch.pkl", 'wb') as f:
            pickle.dump(Data, f)
        print("Done.")

    else:
        Data = []

        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = sp_rand(.0001, .01)
        momentum = sp_rand(.5, .9)
        dropout_W = sp_rand(0, .5)
        dropout_U = sp_rand(0, .5)
        #W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        #U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        init_mode = [
            'uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal',
            'glorot_uniform', 'he_normal', 'he_uniform'
        ]

        for o in options:
            preset = {}
            if o == 'lr':
                param_grid = {
                    'C': sp_rand(.0001, 1000),
                    'penalty': ('l1', 'l2')
                }
            elif o == 'svm':
                param_grid = {'C': sp_rand(.0001, 1000)}
            elif o == 'rf':
                param_grid = {
                    'criterion': ['gini', 'entropy'],
                    'n_estimators': sp_randint(10, 50),
                    'bootstrap': [True, False]
                }
            else:
                param_grid = dict(optimizer=optimizer,
                                  learn_rate=learn_rate,
                                  momentum=momentum,
                                  dropout_W=dropout_W,
                                  dropout_U=dropout_U,
                                  init_mode=init_mode)

            if o == 'd_lstm' or o == 'd_cnn':
                trainable = False
            else:
                trainable = True

            t1 = TIME.time()
            data = random_search(x=X_train,
                                 y=Y_train,
                                 v=w_train,
                                 t=t_train,
                                 weights=weights,
                                 option=o,
                                 nb_epoch=16,
                                 cv=3,
                                 n_jobs=1,
                                 param_grid=param_grid,
                                 preset=preset,
                                 n_iter=40,
                                 trainable=trainable)
            t2 = TIME.time()
            with open(
                    "/home/andy/Desktop/MIMIC/results/random_" + str(o) +
                    ".pkl", 'wb') as f:
                pickle.dump(data, f)
            print("Pickle successful!")
            print("Training completed in " + str((t2 - t1) / 3600) + " hours")

            Data += data
        with open("/home/andy/Desktop/MIMIC/results/randomsearch.pkl",
                  'wb') as f:
            pickle.dump(Data, f)
        print("Done.")
        input_shape=(None, num_features),
        hidden_num_units=200,  # number of units in hidden layer #!200-600
        output_nonlinearity=lasagne.nonlinearities.softmax,  # output layer
        output_num_units=num_classes, # 10 target values
        dropout_p=0.2,
        #!dropout 0.2-0.7

        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,#!0.001-0.01
        update_momentum=0.9,#!0.6-0.9

        regression=False,  # flag to indicate we're dealing with regression problem
        max_epochs=500,  # we want to train this many epochs
        verbose=1,


    )

    random_search = RandomizedSearchCV(net1, {'hidden_num_units': sp_randint(200, 600),
                                              "dropout_p": sp_rand(0.2,0.7),
                                              "update_learning_rate": sp_rand(0.001, 0.01),
                                              "update_momentum": sp_rand(0.6, 0.9),
                                               })
    random_search.fit(X, y)
    print(random_search.grid_scores_)

    preds = random_search.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame(preds, index=ids, columns=['target'])
    submission.to_csv('Keras_BTB.csv')
    def set_hyperparameters(self):
        self.p_grid = {'C': sp_rand()}
def Random_classifier(a, b):
    N_range_KNN = sp_randint(1, 31)
    C_LOGSVM = sp_rand()
    n_estimators_RF = sp_randint(10, 10000)
    hidden_layer_sizes_MLP = sp_randint(100, 1000)
    gamma_range_Random_SVM = sp_rand()
    alpha_NB = sp_rand()

    params_Random_KNN = dict(n_neighbors=N_range_KNN)
    params_Random_tree = dict(criterion=criterion_list_TREE,
                              splitter=splitter_TREE,
                              min_samples_split=min_samples_split_TRF,
                              max_features=max_features_TRF)
    params_Random_SGD = dict(loss=loss_list_SGD, penalty=penalty_list_SGD)
    params_Random_LOG = dict(penalty=penalty_LOG,
                             C=C_LOGSVM,
                             solver=solver_LOG,
                             multi_class=multi_class_LOG)
    params_Random_RF = dict(n_estimators=n_estimators_RF,
                            criterion=criterion_RF,
                            min_samples_split=min_samples_split_TRF,
                            max_features=max_features_TRF)
    params_Random_MLP = dict(hidden_layer_sizes=hidden_layer_sizes_MLP,
                             activation=activation_MLP,
                             solver=solver_MLP,
                             learning_rate=learning_rate_MLP)
    params_Random_SVM = dict(gamma=gamma_range_Random_SVM,
                             C=C_LOGSVM,
                             kernel=kernel_list_SVM)
    params_Random_NB = dict(alpha=alpha_NB)
    params_Random_GB = dict(loss=loss_GB,
                            criterion=criterion_GB,
                            min_samples_split=min_samples_split_TRF)
    params_Random_ADA = dict(algorithm=algorithm_ADA)
    params_Random_XG = dict(C=C_XG)
    params_Random_BAG = dict(max_samples=max_samples_BAG,
                             max_features=max_features_TRF)
    params_Random_LDA = dict(solver=solver_LDA)
    params_Random_ET = dict(criterion=criterion_list_TREE,
                            max_features=max_features_TRF,
                            min_samples_split=min_samples_split_TRF)

    Random_KNN = RandomizedSearchCV(KNeighborsClassifier(), params_Random_KNN)
    Random_tree = RandomizedSearchCV(DecisionTreeClassifier(),
                                     params_Random_tree)
    Random_SGD = RandomizedSearchCV(SGDClassifier(), params_Random_SGD)
    Random_LOG = RandomizedSearchCV(LogisticRegression(), params_Random_LOG)
    Random_RF = RandomizedSearchCV(RandomForestClassifier(), params_Random_RF)
    Random_MLP = RandomizedSearchCV(MLPClassifier(), params_Random_MLP)
    Random_SVM = RandomizedSearchCV(SVC(), params_Random_SVM)
    Random_NB = RandomizedSearchCV(BernoulliNB(), params_Random_NB)
    Random_GB = GridSearchCV(GradientBoostingClassifier(), params_Random_GB)
    Random_ADA = GridSearchCV(AdaBoostClassifier(), params_Random_ADA)
    Random_XG = GridSearchCV(XGBClassifier(), params_Random_XG)
    Random_BAG = GridSearchCV(BaggingClassifier(), params_Random_BAG)
    Random_LDA = GridSearchCV(LinearDiscriminantAnalysis(), params_Random_LDA)
    Random_ET = GridSearchCV(ExtraTreesClassifier(), params_Random_ET)

    Randoms = [
        Random_KNN, Random_tree, Random_SGD, Random_LOG, Random_RF, Random_MLP,
        Random_SVM, Random_NB, Random_GB, Random_ADA, Random_XG, Random_BAG,
        Random_LDA, Random_ET
    ]

    list_Random = []
    for ran in Randoms:
        try:
            dict_Random = {}
            ran.fit(a, b)
            dict_Random['Best_Estimator'] = ran.best_estimator_
            dict_Random['Accuracy'] = ran.best_score_
            list_Random.append(dict_Random)
        except Exception:
            pass  # skip any search that fails to fit on this data

    Best_Classifier_Random = max(list_Random, key=lambda x: x['Accuracy'])
    return Best_Classifier_Random
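A hypothetical usage sketch (the feature matrix and labels are assumed to be defined):

# hypothetical usage: run every search and inspect the winner
best = Random_classifier(X_train, y_train)  # X_train, y_train assumed defined
print(best['Best_Estimator'])
print('accuracy: %.3f' % best['Accuracy'])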
Example #31
    'max_depth': 6,
    'gamma': 2.213,
    'learning_rate': 0.273,
    'max_delta_step': 1.444,
    'subsample': 0.847
}
num_round = 75
plst = param.items()
# specify validations set to watch performance
watchlist = [(dcv, 'eval'), (dtrain, 'train')]
bst_search = xgb.XGBClassifier()
clf = RandomizedSearchCV(bst_search, {
    'max_depth': sp_randint(1, 13),
    'learning_rate': sp_rand(0, 1),
    'gamma': sp_rand(0, 3),
    'subsample': sp_rand(0, 1),
    'max_delta_step': sp_rand(0, 3),
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40,
Example #32
def random_search(x,
                  y,
                  v,
                  t,
                  weights,
                  top_words=9444,
                  max_review_length=1000,
                  embedding_length=300,
                  batch_size=128,
                  nb_epoch=16,
                  cv=3,
                  n_jobs=1,
                  option='d_cnn',
                  param_grid=None,
                  preset=None,
                  n_iter=40,
                  trainable=False):
    # avoid mutable default arguments: preset is mutated below
    param_grid = {} if param_grid is None else param_grid
    preset = {} if preset is None else preset
    x = sequence.pad_sequences(x, maxlen=max_review_length)
    v = sequence.pad_sequences(v, maxlen=max_review_length)
    data = []
    x = np.array(x)  #convert to numpy form before splitting
    v = np.array(v)
    y = np.array(y)
    t = np.array(t)

    for sess in range(n_iter):
        print("Session: {0}".format(sess))
        for key, value in param_grid.items():
            try:
                preset.update({key: random.choice(value)})  # list of choices
            except TypeError:
                preset.update({key: value.rvs(1)[0]})  # scipy distribution

        decay = sp_rand(0, 0.00001).rvs(1)[0]
        decay_factors = [[
            math.exp(-1 * decay * elapse.total_seconds() / 3600)
            for elapse in tt
        ] for tt in t]
        decay_factors = sequence.pad_sequences(decay_factors,
                                               maxlen=max_review_length)
        shape = decay_factors.shape
        decay_factors = decay_factors.reshape(shape[0], shape[1], 1)

        if option == 'lstm' or option == 'd_lstm':
            if trainable == True:
                preset.update({
                    'top_words': top_words,
                    'max_length': max_review_length,
                    'embedding_length': embedding_length
                })
            else:
                preset.update(
                    {
                        'top_words': weights.shape[0],
                        'max_length': max_review_length,
                        'embedding_length': weights.shape[1]
                    },
                    trainable=False,
                    weights=weights)
            model = lstm_train(**preset)
            classic = False
        elif option == 'cnn' or option == 'd_cnn':
            if trainable == True:
                preset.update(
                    {
                        'top_words': top_words,
                        'max_length': max_review_length,
                        'embedding_length': embedding_length
                    },
                    trainable=True)
            else:
                preset.update(
                    {
                        'top_words': weights.shape[0],
                        'max_length': max_review_length,
                        'embedding_length': weights.shape[1]
                    },
                    trainable=False,
                    weights=weights)
            model = cnn_train(**preset)
            classic = False
        elif option == 'lr':
            preset.update({'verbose': 1})
            model = LogisticRegression(**preset)
            classic = True
        elif option == 'svm':
            preset.update({'verbose': True})
            model = SVC(**preset)
            classic = True
        elif option == 'rf':
            preset.update({'verbose': 1})
            model = RandomForestClassifier(**preset)
            classic = True

        print(preset)

        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=8)
        cvscore = []
        for train, test in skf.split(x, y):
            x_train, x_test = x[train], x[test]
            y_train, y_test = y[train], y[test]
            t_train, t_test = t[train], t[test]
            v_train, v_test = v[train], v[test]

            if classic == True:
                model.fit(decay_norm(x=v_train, t_stamps=t_train, decay=decay),
                          y_train)
                score = model.score(
                    decay_norm(x=v_test, t_stamps=t_test, decay=decay), y_test)
                print("%s: %.2f%%" % ("accuracy", score * 100))
                cvscore.append(score * 100)
            elif trainable == True:
                model.fit(x_train,
                          y_train,
                          batch_size=batch_size,
                          nb_epoch=nb_epoch,
                          verbose=1)
                score = model.evaluate(x_test,
                                       y_test,
                                       batch_size=batch_size,
                                       verbose=1)
                print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))
                cvscore.append(score[1] * 100)
            else:
                model.fit(x=[v_train, decay_factors],
                          y=y_train,
                          batch_size=batch_size,
                          nb_epoch=nb_epoch,
                          verbose=1)
                score = model.evaluate(x_test,
                                       y_test,
                                       batch_size=batch_size,
                                       verbose=1)
                print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))
                cvscore.append(score[1] * 100)

        temp = {'model': option, 'decay': decay}
        temp.update(preset)
        temp.update({'mean_score': np.mean(cvscore), 'std': np.std(cvscore)})
        data.append(temp)
    return data
    def set_hyperparameters(self):
        self.p_grid = {
            'n_estimators': sp_randint(1, 300),
            'learning_rate': sp_rand(),
            'algorithm': ['SAMME', 'SAMME.R']
        }
Example #36
test_data = df_test_df.values
test_ids = df_test['bidder_id'].map(lambda x: inv_bid_id_dict[x]).values

dtrain = xgb.DMatrix(train_data, label=outcome_train)
dcv = xgb.DMatrix(cv_data, label=outcome_cv)
dtest = xgb.DMatrix(test_data)

param = {'objective':'binary:logistic','eval_metric':'auc','max_depth':6,
         'gamma': 2.213, 'learning_rate':0.273,'max_delta_step': 1.444,
         'subsample': 0.847}
num_round = 75
plst = param.items()
# specify validation sets to watch performance
watchlist = [(dcv, 'eval'), (dtrain, 'train')]
bst_search = xgb.XGBClassifier()
clf = RandomizedSearchCV(bst_search, {'max_depth': sp_randint(1,13), 'learning_rate':sp_rand(0,1),
                                'gamma':sp_rand(0,3), 'subsample':sp_rand(0,1),'max_delta_step':sp_rand(0,3),
                               'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 50, 75, 100, 125, 150,200,]},
                   verbose=1, n_jobs=2, cv = 4, scoring='roc_auc', n_iter = 1000)
clf.fit(train_data, outcome_train)
print('best clf score',clf.best_score_)
print('best params:', clf.best_params_)
bst = xgb.train(plst, dtrain, num_round, watchlist)
# this is prediction
preds = bst.predict(dcv)
pred_test = bst.predict(dtest)
labels = dcv.get_label()
print('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))))

print('{0:<25} {1:>5}'.format('Feature','Importance'))
print("--------------------------------------")
Example #37
print(metrics.confusion_matrix(expected, predicted))


print('=== Algorithm parameter tuning ===')
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)




# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

Example #38
            'regression__n_estimators': sp_randint(50, 150),
            'regression__base_estimator': [DecisionTreeRegressor(),
                                           GradientBoostingRegressor(), SVR()],
            'regression__loss': ['linear', 'square', 'exponential']
        },  # AdaBoostRegressor
        {
            'regression__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'regression__C': sp_randint(1, 100)
        }  # SVR
    ]
    feature_sel_parameters = [
        dict(feat_sel__alpha=['aic', 'bic'],
             feat_sel__selection_threshold=sp_rand(.15, .20),
             feat_sel__scaling=sp_rand(.35, .4),
             feat_sel__sample_fraction=sp_rand(.6, .3)),
        dict(feat_sel__k=sp_randint(10, 90)),
        dict(feat_sel__n_estimators=sp_randint(10, 90)),
        dict(feat_sel__n_features_to_select=sp_randint(10, 90),
             feat_sel__step=sp_randint(1, 10))
    ]
else:
    raise Exception('search_method must be grid or randomized!')
models = [
    RandomForestRegressor(random_state=rand_state),
    ExtraTreesRegressor(random_state=rand_state),
    GradientBoostingRegressor(random_state=rand_state),
    AdaBoostRegressor(random_state=rand_state),
    SVR()