Example #1
def test_main():
    from hyperopt import fmin, tpe, rand, anneal, mix, partial
    from hyperopt.mongoexp import MongoTrials
    trials = MongoTrials('mongo://131.220.7.92/test/jobs', exp_key='test_nnet')

    # note that n_startup_jobs is related to gamma, the fraction of the "good"
    # jobs.  If gamma=.25, the default, then after the startup phase of 20 jobs,
    # 5 are used to build the model.
    from cuvnet.model_selection.test_nnet_objective  import objective
    best = fmin(fn=objective, space=test_build_space(),
                trials=trials,
                algo=partial(mix.suggest,
                        p_suggest=[(.0, rand.suggest),
                                   (1., anneal.suggest),
                                   (0., partial(tpe.suggest,
                                                prior_weight=1.0,  # default is 1.0
                                                n_startup_jobs=20))]),  # default is 20
                max_evals=200)
    print "best: ", best, min(TEST_LOSS)
    import matplotlib.pyplot as plt
    fig, (ax0, ax1, ax2, ax3) = plt.subplots(4, 1)
    c = np.arange(len(TEST_LOSS))
    ax0.scatter(TEST_N_FLT, TEST_LOSS, c=c)
    ax0.set_title("hidden layer size")
    ax1.scatter(np.log(TEST_LR), TEST_LOSS, c=c)
    ax1.set_title("learnrate")
    im = ax2.scatter([{"tanh":0, "rectified_linear":1}[n] for n in TEST_NONLIN], TEST_LOSS, c=c)
    ax2.set_title("tanh/relu")
    im = ax3.scatter(TEST_DROPOUT, TEST_LOSS, c=c)
    ax3.set_title("dropout")
    fig.colorbar(im)
    plt.show()
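The gamma/n_startup_jobs comment above deserves a concrete illustration: gamma is the quantile of trials that TPE treats as "good" when it builds its densities, and n_startup_jobs is how many random trials run before TPE starts modeling. A minimal self-contained sketch with a toy objective (not part of the example above):

from functools import partial
from hyperopt import fmin, tpe, hp, Trials

trials = Trials()
best = fmin(
    fn=lambda x: (x - 3) ** 2,       # toy objective, minimized at x = 3
    space=hp.uniform('x', -10, 10),
    algo=partial(tpe.suggest,
                 gamma=0.25,          # top 25% of trials define the "good" density
                 n_startup_jobs=20),  # pure random search for the first 20 trials
    max_evals=50,
    trials=trials,
)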
Example #2
def slm_visitor_lfw_partial(
    max_n_per_class,
    maybe_test_view2=False, # -- this still takes too much memory
    assume_promising=False,
    foobar_trace_target=None,
    ):
    # -- this curries and re-decorates hpconvnet.lfw.slm_visitor_lfw
    #    so that we can pass it to fmin()
    if max_n_per_class is not None:
        max_n_per_class = int(max_n_per_class)
    return hyperopt.partial(
        hpconvnet.lfw.slm_visitor_lfw,
        max_n_per_class=max_n_per_class,
        maybe_test_view2=maybe_test_view2,
        assume_promising=assume_promising,
        foobar_trace_target=foobar_trace_target,
        )
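The point of the curry is that fmin calls its objective with a single configuration argument, so any fixed settings must be bound beforehand. A stand-in sketch of the same pattern (the visitor below is hypothetical, not hpconvnet's actual signature):

from functools import partial

def visitor(config, max_n_per_class=None, assume_promising=False):
    # Hypothetical stand-in for hpconvnet.lfw.slm_visitor_lfw
    return {'loss': 0.5, 'status': 'ok'}

# Bind the fixed keyword arguments once; fmin then only supplies `config`
objective = partial(visitor, max_n_per_class=300)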
Example #3
def small_random_run():
    # -- This is a smoke test to make sure that a lot of code paths actually
    # run. Some of the jobs will fail, some should succeed, the data will be
    # loaded and some SVMs will be fit etc. Classifier performance is expected
    # to be poor (70% error?), because we're using just 10% of the data and
    # only trying a few random architectures.
    #
    # Expected running time on CPU: ~10 mins

    search_space = hpconvnet.cifar10.build_search_space(
        max_n_features=4500,  # -- smaller than normal
        bagging_fraction=0.5,  # -- normal
        n_unsup=2000,  # -- smaller than normal
        abort_on_rows_larger_than=50 * 1000,  # -- smaller
    )
    trials = Trials()
    hyperopt.fmin(
        fn=hyperopt.partial(hpconvnet.cifar10.uslm_eval, data_fraction=0.1),  # -- smaller than normal
        space=search_space,
        algo=hyperopt.rand.suggest,
        max_evals=10,
        trials=trials,
    )
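After a smoke test like this, the Trials object can confirm that some jobs succeeded and some failed, as the comment predicts. A quick check using the standard Trials document layout:

ok_losses = [t['result']['loss'] for t in trials.trials
             if t['result']['status'] == 'ok']
print('succeeded: %d / %d trials' % (len(ok_losses), len(trials.trials)))
if ok_losses:
    print('best loss so far:', min(ok_losses))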
Example #4
    def ks_best(self, space):
        space_4model = self.f_spacehp(space)
        test4best = fmin(self.ks, space_4model,
                         algo=partial(tpe.suggest, n_startup_jobs=1),
                         max_evals=100, trials=self.trials)
        best_params = self.f_bestparam(test4best, space)

        best_model = self.model(**best_params, random_state=self.random_state)
        best_model.fit(self.data_sets['train_x'], self.data_sets['train_y'])

        def safe_score(score_fn, x_key, y_key):
            # Return None instead of raising when a split is missing or the
            # metric cannot be computed (a bare except would also swallow
            # KeyboardInterrupt, so catch Exception instead)
            try:
                return score_fn(self.data_sets[x_key], self.data_sets[y_key],
                                best_model)
            except Exception:
                return None

        best_results = {
            'train_auc': safe_score(auc_results, 'train_x', 'train_y'),
            'test_auc': safe_score(auc_results, 'test_x', 'test_y'),
            'future_auc': safe_score(auc_results, 'data_future', 'target_future'),
            'train_ks': safe_score(ks_results, 'train_x', 'train_y'),
            'test_ks': safe_score(ks_results, 'test_x', 'test_y'),
            'future_ks': safe_score(ks_results, 'data_future', 'target_future'),
        }

        return best_params, best_model, best_results
Example #5
def tune_hyper_parameter(stock_name, df, config):
    # Use only the last 365 days for tuning
    df = df[-365:]
    bayes_max_evals = config.get('bayer_max_evals', 1000)  # config key kept as-is
    param_grid = config.get('param_grid')

    # Trials object records every evaluation
    bayes_trials = Trials()

    # TPE ("Bayesian") suggestion algorithm
    bayes_algo = tpe.suggest

    fmin_objective = partial(objective, df=df)

    bayes_best = fmin(fn=fmin_objective,
                      space=param_grid,
                      algo=bayes_algo,
                      trials=bayes_trials,
                      max_evals=bayes_max_evals)

    param_file_path = os.path.join(config['model_dir'],
                                   '%s_param.txt' % stock_name)
    with open(param_file_path, 'w') as outfile:
        json.dump(bayes_best, outfile)
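One caveat before trusting that json.dump: fmin may hand back numpy scalar types, which the standard json encoder rejects with a TypeError. Converting to native Python values first is a cheap safeguard (a sketch; bayes_best as above):

def to_plain(d):
    # numpy scalars expose .item(); plain Python values pass through untouched
    return {k: (v.item() if hasattr(v, 'item') else v) for k, v in d.items()}

# json.dump(to_plain(bayes_best), outfile)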
Example #6
def train_model(total_data, bus_data, user_data):
    # Extract the label and drop columns that are not features
    print_to_log('========== Data preprocessing ==========')
    total_data['label'] = total_data['product_amount'].apply(lambda x: 1 if x >= 0 else 0)
    total_data.drop(['customer_id', 'login_name', 'product_amount', 'salary_score'
                     , 'followStart', 'followEnd', 'content', 'post'], axis=1, inplace=True)

    total_data_po = total_data.loc[total_data['label'] == 1]
    total_data_ne = total_data.loc[total_data['label'] == 0]
    # Downsample negatives to a 5:1 negative-to-positive ratio
    samples = total_data_po.shape[0] * 5
    total_data_ne_sample = total_data_ne.sample(samples)
    # Rebuild the training set from the sampled negatives and all positives
    total_data_sample = pd.concat((total_data_ne_sample, total_data_po), axis=0)
    total_data_sample = total_data_sample.sample(frac=1)  # shuffle
    # Categorical variables: drop any whose single most frequent value covers
    # 95% or more of the rows
    remove = set()
    miss_value = []
    print_to_log('Processing categorical variables')
    for k in category_columns:
        if k in total_data_sample.keys():
            counts = total_data_sample[k].value_counts()
            total_num = sum(counts)
            for v in counts:
                if v/total_num>=0.95:
                    print_to_log(k)
                    print_to_log(counts)
                    remove.add(k)
        else:
            miss_value.append(k)
    # Continuous variables: drop any that are constant or have near-zero
    # variance after min-max scaling
    print_to_log('Processing continuous variables')
    for k in continues_columns:
        if k in total_data_sample.keys():
            total_data_sample[k] = total_data_sample[k].astype('float')
            bottom = total_data_sample[k].min()
            top = total_data_sample[k].max()
            if bottom == top:
                remove.add(k)
                print_to_log(k, 0)
            else:
                var = total_data_sample[k].apply(lambda x: (x - bottom) / (top - bottom)).var()
                if var < 0.003:
                    remove.add(k)
                    print_to_log(k, var)
        else:
            miss_value.append(k)
    # Drop variables with a missing rate above 80%
    print_to_log('Processing variables with high missing rates')
    for k in total_data_sample.keys():
        miss = total_data[k].isna().sum() / total_data.shape[0]
        if miss > 0.8:
            remove.add(k)
            print_to_log(k, ':', miss)
    # Drop business-opportunity region fields
    remove.add('placeCode')
    remove.add('businessOperate')
    remove.add('businessStage')
    remove.add('businessStatus')
    # Split the removed variables by their source table

    user_remove = []
    bus_remove = []
    for k in remove:
        if k in user_data.keys():
            user_remove.append(k)
        else:
            bus_remove.append(k)
    total_data_sample.drop(list(remove), axis=1, inplace=True)
    user_data.drop(user_remove, axis=1, inplace=True)
    bus_data.drop(bus_remove, axis=1, inplace=True)
    # Append a cold-start row with default values
    def cold_start_decode(k):
        if k in continues_columns:
            return 0
        elif k in category_columns:
            return 'None'
        elif k == 'customer_id':
            return -1
        else:
            return 'None'
    cold_start = {}
    for k in user_data.keys():
        cold_start[k] = cold_start_decode(k)
    # DataFrame.append returns a new frame rather than mutating in place,
    # so keep the result
    user_data = user_data.append(cold_start, ignore_index=True)
    if len(miss_value) > 0:
        print_to_log('Data fields are missing, please check the database', miss_value)

    # Fill continuous variables with 0 and categorical variables with 'None'
    def fillna(df):
        for k in df.keys():
            if k in continues_columns:
                df[k] = df[k].fillna(0)
            else:
                df[k] = df[k].fillna('None')

    fillna(total_data_sample)
    fillna(user_data)
    fillna(bus_data)
    continues_dict = {}
    category_dict = {}

    def encode_table(df):
        # Record (max, min) for continuous columns and label-encode
        # categorical columns, sharing the dictionaries across tables
        for k in df.keys():
            if k in continues_columns:
                continues_dict[k] = (df[k].max(), df[k].min())
            elif k in category_columns:
                df[k] = df[k].apply(lambda x: str(x).split('.')[0])
                category_dict[k] = list(df[k].unique())
                if 'None' not in category_dict[k]:
                    category_dict[k].append('None')
                df[k] = df[k].apply(lambda x: category_dict[k].index(x))

    encode_table(bus_data)
    encode_table(user_data)

    # Label-encode the categorical variables in the training sample using the
    # same category dictionaries (the commented-out min-max scaling and
    # one-hot-encoding variants were left disabled in the original)
    for k in total_data_sample.keys():
        if k in category_dict.keys():
            total_data_sample[k] = total_data_sample[k].apply(
                lambda x: category_dict[k].index(str(x).split('.')[0]))

    # Build the model and train it
    label = total_data_sample['label']
    total_data_sample.drop('label', axis=1, inplace=True)
    length = total_data_sample.shape[0]

    train_data = total_data_sample[:int(0.8 * length)]
    train_label = label[:int(0.8 * length)]
    test_data = total_data_sample[int(0.8 * length):]
    test_label = label[int(0.8 * length):]
    print_to_log('========== Model training ==========')
    print(train_data.info())
    print(train_data.head())

    def objective(space):
        model = xgboost.XGBClassifier(
            max_depth=int(space['max_depth']),
            n_estimators=int(space['n_estimators']),
            subsample=space['subsample'],
            colsample_bytree=space['colsample_bytree'],
            learning_rate=space['learning_rate'],
            reg_alpha=space['reg_alpha'],
            nthread=4
        )
        model.fit(train_data, train_label)
        score = metrics.f1_score(test_label, model.predict(test_data))
        print_to_log('score: {}'.format(score))
        return {'loss': 1 - score, 'status': STATUS_OK}

    space = {
        'max_depth': hp.quniform('max_depth', 2, 20, 1),
        'n_estimators': hp.quniform('n_estimators', 100, 500, 1),
        'subsample': hp.uniform('subsample', 0.8, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
        'reg_alpha': hp.uniform('reg_alpha', 0.1, 1),
    }

    algo = partial(tpe.suggest, n_startup_jobs=4)
    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=algo,
                max_evals=20,
                trials=trials)
    print_to_log(best)
    model = xgboost.XGBClassifier(
        max_depth=int(best['max_depth']),
        n_estimators=int(best['n_estimators']),
        subsample=best['subsample'],
        colsample_bytree=best['colsample_bytree'],
        learning_rate=best['learning_rate'],
        reg_alpha=best['reg_alpha'],
        nthread=4)
    model.fit(train_data, train_label)
    pred = model.predict(test_data)
    print_to_log('========== Training finished ==========')
    print_to_log('Model scores:')
    print_to_log('recall:', metrics.recall_score(test_label, pred))
    print_to_log('precision:', metrics.precision_score(test_label, pred))
    print_to_log('f1_score:', metrics.f1_score(test_label, pred))
    print_to_log('auc_score:', metrics.roc_auc_score(test_label, pred))
    print_to_log(len(total_data_sample.keys()))

    try:
        with open(model_path + 'remove.pk', 'wb') as f:
            pickle.dump(remove, f)
        with open(model_path + 'continues_dict.pk', 'wb') as f:
            pickle.dump(continues_dict, f)
        with open(model_path + 'category_dict.pk', 'wb') as f:
            pickle.dump(category_dict, f)
        joblib.dump(model, model_path+'model.m')
        user_data.to_csv(data_path + 'employee_feature.csv', encoding='utf-8', index=False, sep=',')
        bus_data.to_csv(data_path + 'business_feature.csv', encoding='utf-8', index=False, sep=',')
        print_to_log('All data saved')
    except Exception as e:
        print_to_log(e, level=4)
        print_to_log('Failed to save files; check the file paths')
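hp.quniform returns floats, which is why the objective above wraps max_depth and n_estimators in int(). The cast can instead live in the search space itself via hyperopt's pyll scope (standard hyperopt API):

from hyperopt import hp
from hyperopt.pyll import scope

space = {
    # scope.int converts the sampled float before the objective ever sees it
    'max_depth': scope.int(hp.quniform('max_depth', 2, 20, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 1)),
}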
Example #7
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

import uuid

from params_select import *
from objective import *
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, partial, rand, space_eval
from loss import weighted_categorical_crossentropy3

if __name__ == '__main__':
    params = {'ma': 5, 'std_window': 20, 'vol_window': 15}
    construct_feature_func = partial(construct_features1,
                                     params=params,
                                     test=False)

    data_set, reverse_func = get_data(
        file_name="E:\market_data/cs_market.csv",
        stks=zz500[:50],
        construct_feature_func=construct_feature_func,
        split_dates=["2016-01-01", "2017-01-01"])
    performance_func = performance_factory(reverse_func,
                                           performance_types=[
                                               'Y0', 'Y', 'returns',
                                               'cum_returns', 'annual_return',
                                               'sharpe_ratio'
                                           ])

    function = "test_weight"
    identity = str(uuid.uuid1())
    namespace = function + '_' + identity
Example #8
    def best_model(self):
        algo = partial(tpe.suggest, n_startup_jobs=1)
        best = fmin(self.GBM, space=self.paras.hyper_opt, algo=algo, max_evals=20)
        print("best", best)
        return best
Example #9
# shape 1 corresponds to (1, 256), 2 to (2, 128), and 4 to (4, 64)
parameter_space = {
    'shape': hp.choice('shape', [1, 2, 4]),
    'units': hp.choice('units', [16, 32, 64, 128, 256, 512]),
    'layers': hp.choice('layers', [1, 2, 3]),
    'dense': hp.choice('dense', [32, 64, 128, 256, 512])
}

trials = Trials()

# number of models that will be built and evaluated using the provided choices
max_evals = 225

algo = partial(
    tpe.suggest,
    n_EI_candidates=1000,
    gamma=0.2,
    n_startup_jobs=int(0.1 * max_evals),
)

fmin(train_network,
     trials=trials,
     space=parameter_space,
     algo=algo,
     max_evals=max_evals,
     show_progressbar=False)
df.to_csv('parameters.csv')
best = get_best()
print('\n-------------------------------------\n')

print(
    'Hyper-parameter space exploration ended. \nRetraining the best again on the full dataset.'
)
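The shape comment at the top of this fragment implies a constant budget of 256 units split across parallel branches. A hypothetical decoder for that convention (illustrative names, not from the project):

# Map the 'shape' choice to (branches, units_per_branch); the total
# stays at 256 as the comment above describes.
SHAPE_MAP = {1: (1, 256), 2: (2, 128), 4: (4, 64)}

def decode_shape(shape):
    return SHAPE_MAP[shape]

assert decode_shape(2) == (2, 128)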
Example #10
    import sys

    if len(sys.argv) > 1:
        model_type = sys.argv[1]
        max_evals = int(sys.argv[2])

    else:
        model_type = 'lgb'
        max_evals = 2

    logger.debug(
        f'Try to search paras base on model:{model_type}, max_evals:{max_evals}'
    )

    from functools import partial
    optimize_fun_ex = partial(optimize_fun, model_type=model_type)

    trials = Trials()
    space = get_search_space(model_type)
    best = fmin(optimize_fun_ex,
                space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)

    #logger.debug(f"Best: {best}")

    att_message = [
        trials.trial_attachments(trial)['message'] for trial in trials.trials
    ]
    for score, para, misc in zip(
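The loop is cut off here, but the structures it reads are standard hyperopt: each entry in trials.trials carries its result and sampled values. A sketch of what such a summary typically looks like (the message attachment is this project's convention; the rest is the stock Trials layout):

for trial, message in zip(trials.trials, att_message):
    loss = trial['result'].get('loss')   # objective value, if the trial succeeded
    vals = trial['misc']['vals']         # sampled hyperparameters
    logger.debug(f'loss={loss}, vals={vals}, message={message}')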
Example #11
        _counter += 1
        trial_id = _counter

        trainer = AsymmetricSelfPlay(model_builder, model_params, env_params,
                                     eval_env_params, args.train_episodes,
                                     args.eval_episodes, args.num_evals,
                                     switch_freq, args.path + f'/{trial_id}',
                                     args.seed, args.processes)
        trainer.run()

        best_win_rate = -max(max(player_wr) for player_wr in trainer.win_rates)

        return best_win_rate

    algo = partial(tpe.suggest,
                   n_startup_jobs=max(0,
                                      args.num_warmup_trials - trials_so_far))

    best_param = fmin(wrapper,
                      hyperparameter_space,
                      algo=algo,
                      max_evals=args.num_trials,
                      trials=trials,
                      rstate=random_state)

    save_run(trials, random_state, args.path)

    loss = [x['result']['loss'] for x in trials.trials]

    print("")
    print("##### Results")
Example #12
    def suggest(self):
        """Return the next parameter suggestion

        >>> import json
        >>> json_str='{"seed":0,"lib":"hyperopt","algo":"tpe","scope":{"x":["uniform",-10,10],"y":["uniform",-10,10]},'
        >>> json_str+='"max_evals":1,'
        >>> json_str+='"results":{"losses":[3.4620,3.192,28.963,19.64,20.458],'
        >>> json_str+='"statuses":["ok","ok","ok","ok","ok"],'
        >>> json_str+='"vals":{"y":[-0.16774,0.3122,-2.416,0.27455,-3.2827],'
        >>> json_str+='"x":[1.857,1.760,4.785,-4.498,2.837]}}}'
        >>> executor=ExecutorFactory.get_executor(json_str)
        >>> reval=json.loads(executor.suggest())

        >>> reval["algo"] == 'tpe'
        True
        >>> reval["scope"]["x"][0] == 4.30378732744839
        True
        >>> reval["scope"]["y"][0] == 0.9762700785464951
        True
        """
        _logger = getLogger(__name__)

        id_qnt = int(self.json_loaded[COMMON_MAXEVALS])
        additional_args = []
        executed_algo = self.json_loaded[COMMON_ALGO]
        if executed_algo == self.HYP_ALGO_TPE:
            algo = tpe.suggest
            id_qnt = 1
            additional_args.append(tpe._default_prior_weight)
            additional_args.append(5)  # n_startup_jobs
        elif executed_algo == self.HYP_ALGO_ANNEAL:
            algo = anneal.suggest
        elif executed_algo == self.HYP_ALGO_RAND:
            algo = rand.suggest
        elif executed_algo == self.HYP_ALGO_MIX:
            algo = partial(mix.suggest,
                           p_suggest=[
                               (.1, rand.suggest),
                               (.2, anneal.suggest),
                               (.7, tpe.suggest),
                           ])
        else:
            _logger.warning('unknown algo defined; falling back to tpe')
            algo = tpe.suggest

        new_ids = self.trials.new_trial_ids(id_qnt)
        args = [new_ids, self.domain, self.trials, self.rand_seed
                ] + additional_args
        rval_docs = algo(*args)

        statuses = []
        vals = {}
        for i in range(len(new_ids)):
            statuses.append(
                rval_docs[i][self.HYP_OUT_RESULT][self.HYP_OUT_STATUS])
            vals = self.merge_dict_valuelist(
                vals, rval_docs[i][self.HYP_OUT_MISC][self.HYP_OUT_VALS])

        results = dict(algo=executed_algo, statuses=statuses, vals=vals)

        return results
Example #13
File: train.py  Project: jmhIcoding/fifty
    def train_model(self):
        if self.data_dir:
            load_dataset(self, self.data_dir)
        elif self.scale_down:

            make_new_dataset(self)
        elif self.scale_up:
            raise SystemExit(
                'Please refer documentation. Requires you to prepare the dataset on your own and then use -d option.'
            )
        else:
            load_dataset(self)
        # train_network takes exactly one argument (the sampled parameters),
        # so everything else is passed through module-level globals
        global percent, block_size, scenario, gpu, output, verbose, new_model, no_of_classes
        percent = self.percent
        block_size = self.block_size
        scenario = self.scenario
        gpu = self.gpus
        output = self.output
        new_model = self.new_model
        if self.scale_down:
            no_of_classes = len(list(open(self.scale_down, 'r')))
        if self.v:
            verbose = 0
        elif self.vv:
            verbose = 1
        elif self.vvv:
            verbose = 2
        parameter_space = {
            'layers': hp.choice('layers', [1, 2, 3]),
            'embed_size': hp.choice('embed_size', [16, 32, 48, 64]),
            'filter': hp.choice('filter', [16, 32, 64, 128]),
            'kernel': hp.choice('kernel', [3, 11, 19, 27, 35]),
            'pool': hp.choice('pool', [2, 4, 6, 8]),
            'dense': hp.choice('dense', [16, 32, 64, 128, 256])
        }

        trials = Trials()

        if self.algo.lower() == 'tpe':
            algo = partial(
                tpe.suggest,
                n_EI_candidates=1000,
                gamma=0.2,
                n_startup_jobs=int(0.1 * self.max_evals),
            )

        elif self.algo.lower() == 'rand':
            algo = rand.suggest
        else:
            print(
                'Warning! The requested hyper-parameter algorithm is not supported. Using TPE.'
            )
            algo = partial(
                tpe.suggest,
                n_EI_candidates=1000,
                gamma=0.2,
                n_startup_jobs=int(0.1 * self.max_evals),
            )

        fmin(train_network,
             trials=trials,
             space=parameter_space,
             algo=algo,
             max_evals=self.max_evals,
             show_progressbar=False)
        df.to_csv(os.path.join(self.output, 'parameters.csv'))
        best = get_best()
        print('\n-------------------------------------\n')
        print(
            'Hyper-parameter space exploration ended. \nRetraining the best again on the full dataset.'
        )
        percent = 1
        train_network(best)
        print('The best model has been retrained and saved as {}.'.format(
            self.new_model))
Example #14
# best : {'gamma': 0.4, 'learning_rate': 0.05740649534056902, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 166, 'subsample': 0.6}
# best param after transform :
# {'gamma': 0.04000000000000001, 'learning_rate': 0.05114812990681138, 'max_depth': 10, 'min_child_weight': 7, 'n_estimators': 316, 'subsample': 0.56}
# rmse of the best xgboost: 6136.126337046346
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
from hyperopt import fmin, tpe, hp, partial
import numpy as np
import joblib  # sklearn.externals.joblib is removed in modern scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, zero_one_loss
from sklearn.metrics import log_loss
import xgboost as xgb
import pandas as pd
from xgboost import plot_importance
from matplotlib import pyplot as plt
totalcount = 50
Devide = "edge"
#----------------------------------------------01----------------------------------------------------------
attribute = pd.read_csv(str(totalcount) + '-mix-two-data-1.csv')
label = pd.read_csv(str(totalcount) + '-mix-two-label.csv')
#print(waferA.info())
#print(waferL.info())
#mapping_type={'Center':0,'Donut':1,'Edge-Loc':2,'Edge-Ring':3,'Loc':4,'Random':5,'Scratch':6,'Near-full':7,'none':8}
label = label['totalLatency']
attribute = attribute.loc[:, ~attribute.columns.str.contains('^Unnamed')]
X = attribute
y = label
# ############################################################
Example #15
def Bayesian_optimize_poly(df_minmax, force):
    '''
    Bayesian optimization tutorial (Chinese): https://github.com/FontTian/hyperopt-doc-zh/wiki/FMin
    :param df_minmax: min-max normalized DataFrame
    :param force: radial or longitudinal force column
    :return: best hyperparameters for the polynomial regression model
    '''
    space = {"param_degree": hp.randint("param_degree", 5, 15)}  # search domain
    # Hold out each of the four most frequent x/d values and the seven most
    # frequent y/d values in turn
    xd_counts = df_minmax['x/d'].value_counts()
    yd_counts = df_minmax['y/d'].value_counts()
    ref_list = list(xd_counts.index[:4]) + list(yd_counts.index[:7])

    def Polynomia_func(argsDict):
        model = Pipeline([
            ('poly', PolynomialFeatures(degree=argsDict["param_degree"])),
            ('linear', LinearRegression())
        ])
        mse_list = []
        for i in range(len(ref_list)):
            # Leave out one x/d (first four refs) or y/d (remaining refs)
            # value as the test fold
            col = 'x/d' if i < 4 else 'y/d'
            df1 = df_minmax[df_minmax[col] != ref_list[i]]
            df2 = df_minmax[df_minmax[col] == ref_list[i]]
            X_data = np.array(df1[['x/d', 'y/d']]).astype(np.float32)
            y_data = np.array(df1[[force]]).flatten().astype(np.float32)
            X_test = np.array(df2[['x/d', 'y/d']]).astype(np.float32)
            y_test = np.array(df2[[force]]).flatten().astype(np.float32)
            model.fit(X_data, y_data)
            y_test_predict = model.predict(X_test)
            mse_list.append(mean_squared_error(y_test, y_test_predict))
        return np.mean(mse_list)

    trials = Trials()
    algo = partial(tpe.suggest, n_startup_jobs=1)
    best = fmin(Polynomia_func, space, algo=algo, max_evals=100, trials=trials)
    return best
Example #16
def Bayesian_optimize_nn(df_minmax, force):
    '''
    :param df_minmax: min-max normalized DataFrame
    :param force: radial or longitudinal force column
    :return: best hyperparameters for the neural-network model
    '''
    space = {
        'units1':
        hp.choice('units1', [16, 64, 128, 320, 512]),
        'units2':
        hp.choice('units2', [16, 64, 128, 320, 512]),
        'units3':
        hp.choice('units3', [16, 64, 128, 320, 512]),
        'lr':
        hp.choice('lr', [0.01, 0.001, 0.0001]),
        'activation':
        hp.choice('activation', ['relu', 'sigmoid', 'tanh', 'linear']),
        'loss':
        hp.choice('loss',
                  [losses.logcosh, losses.mse, losses.mae, losses.mape])
    }
    # Hold out each of the four most frequent x/d values and the seven most
    # frequent y/d values in turn
    xd_counts = df_minmax['x/d'].value_counts()
    yd_counts = df_minmax['y/d'].value_counts()
    ref_list = list(xd_counts.index[:4]) + list(yd_counts.index[:7])

    def experiment(params):
        main_input = Input(shape=(2, ), name='main_input')
        x = Dense(params['units1'],
                  activation=params['activation'])(main_input)
        x = Dense(params['units2'], activation=params['activation'])(x)
        x = Dense(params['units3'], activation=params['activation'])(x)
        output = Dense(1, activation="linear", name="out")(x)
        final_model = Model(inputs=[main_input], outputs=[output])
        opt = Adam(lr=params['lr'])
        final_model.compile(optimizer=opt, loss=params['loss'])

        mse_list = []
        for i in range(len(ref_list)):
            # Leave out one x/d (first four refs) or y/d (remaining refs)
            # value as the validation fold
            col = 'x/d' if i < 4 else 'y/d'
            df1 = df_minmax[df_minmax[col] != ref_list[i]]
            df2 = df_minmax[df_minmax[col] == ref_list[i]]
            X_data = np.array(df1[['x/d', 'y/d']]).astype(np.float32)
            y_data = np.array(df1[[force]]).flatten().astype(np.float32)
            X_test = np.array(df2[['x/d', 'y/d']]).astype(np.float32)
            y_test = np.array(df2[[force]]).flatten().astype(np.float32)
            final_model.fit(X_data,
                            y_data,
                            epochs=30,
                            batch_size=256,
                            verbose=0,
                            validation_data=(X_test, y_test),
                            shuffle=True)
            y_test_predict = final_model.predict(X_test)
            mse_list.append(mean_squared_error(y_test, y_test_predict))

        mse = np.mean(mse_list)
        print('mse', mse)
        return mse

    algo = partial(tpe.suggest, n_startup_jobs=1)
    best = fmin(experiment, space, algo=algo, max_evals=200)
    return best
Example #17
File: LightGBM.py  Project: Asdil/xgblgb
def pipeline(path):
    max_evals = 30
    _, name, _, _ = tool.splitPath(path)
    logger.info(f'Start training site: {name}')
    print(f'Start training site: {name}')

    data = np.load(path)

    try:
        X, Y = data[:, :-1], data[:, -1]
    except Exception:
        logger.info(f'Site: {name} file read error')
        print(f'Site: {name} file read error')
        return 0

    if len(np.unique(Y)) == 1:
        logger.info(f'Site: {name} has only one class label')
        print(f'Site: {name} has only one class label')
        return 0

    tmp = Y.tolist()
    tmp = dict(Counter(tmp))
    if tmp[0] > tmp[1]:
        ma, mi = tmp[0], tmp[1]
    else:
        ma, mi = tmp[1], tmp[0]
    if mi / ma < 0.01:
        logger.info(f'Site: {name} is a low-frequency site')
        print(f'Site: {name} is a low-frequency site')
        return 0

    space = {
        "num_leaves":
        hp.randint("num_leaves", 5),  # integer in [0, 5)
        "max_depth":
        hp.choice("max_depth", [-1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]),
        "learning_rate":
        hp.uniform("learning_rate", 0.001, 2),  # uniform over [0.001, 2]
        "n_estimators":
        hp.randint("n_estimators", 5),  # integer in [0, 5)
        "min_child_weight":
        hp.uniform("min_child_weight", 0.001, 0.01),  # uniform over [0.001, 0.01]
        "min_child_samples":
        hp.randint("min_child_samples", 10),  # integer in [0, 10)
        "subsample":
        hp.randint("subsample", 4),  # integer in [0, 4)
        "colsample_bytree":
        hp.choice("colsample_bytree", [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]),
        "reg_alpha":
        hp.choice("reg_alpha", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1]),
        "reg_lambda":
        hp.choice("reg_lambda", [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]),
        "path":
        hp.choice('path', [path])
    }

    start = time.time()
    algo = partial(tpe.suggest, n_startup_jobs=1)  # suggestion algorithm
    best = fmin(LGB, space, algo=algo,
                max_evals=max_evals)  # max_evals: number of models to try; larger values explore more

    best = RECOVERLGB(best)
    TRAINLGB(X, Y, best, name, save_path + name + '.lgb', logger)
    end = time.time()
    times = end - start
    logger.info(f'Site: {name} took: {times}')
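RECOVERLGB is not shown, but its job is presumably to map fmin's raw output (option indices for hp.choice, raw draws for hp.randint) back into real parameter values. hyperopt ships space_eval for exactly that, so a hand-rolled recovery step can often be replaced with:

from hyperopt import space_eval

# `space` and `best` as above; resolves hp.choice indices back to values
best_params = space_eval(space, best)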
Example #18
    'activation': hp.choice('activation',
                            ['identity', 'logistic', 'tanh', 'relu']),
    'solver': hp.choice('solver', ['lbfgs', 'sgd', 'adam']),
    'batch_size': hp.uniform('batch_size', 1, 50),
    'early_stopping': hp.choice('early_stopping', [True, False]),
}
space_SVM = {
    'C':
    hp.uniform('C', 0.1, 50),
    'kernel':
    hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']),
    'degree':
    hp.uniform('degree', 1, 10),
    'coef0':
    hp.uniform('coef0', 0, 10),
}
space_LR = {
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'C': hp.uniform('C', 0.1, 20),
    'intercept_scaling': hp.randint('intercept_scaling', 100),
    'solver': hp.choice('solver', ['liblinear', 'saga']),
    'warm_start': hp.choice('warm_start', [True, False]),
}
algo = tpe.suggest  # partial() with no extra arguments adds nothing
trials = Trials()
best = fmin(percept, space, algo=algo, max_evals=200, trials=trials)
# print(best)
print(space_eval(space, best))
print(percept(space_eval(space, best)))
print("test")
Example #19
    def ks_fmin(self, space4model):
        test_best = fmin(self.ks, space4model,
                         algo=partial(tpe.suggest, n_startup_jobs=1),
                         max_evals=100, trials=self.trials)
        return test_best
Example #20
def Hyperopt_get_best_parameters(Metrics='roc_auc', evals_num=30):
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, partial
    penalty_list = ['l1', 'l2']
    parameter_space = {
        'C': hp.uniform('C', 0, 1),
        'penalty': hp.choice('penalty', penalty_list),
    }

    def hyperopt_train_test(params):
        clf = LogisticRegression(**params, random_state=123)
        auc = cross_val_score(clf, X_train, y_train, cv=5,
                              scoring=Metrics).mean()  # replace 2
        return auc

    count = 0

    def function(params):
        # `global` would look in the module namespace; the counter lives in
        # the enclosing function, so use nonlocal
        nonlocal count
        auc = hyperopt_train_test(params)
        count = count + 1
        print({'loss': auc, 'status': STATUS_OK, 'count': count})
        return -auc

    def function_model(params):
        nonlocal count
        #    print(params)
        folds = KFold(n_splits=5, shuffle=True, random_state=546789)
        train_preds = np.zeros(X_train.shape[0])
        train_class = np.zeros(X_train.shape[0])
        feats = [
            f for f in X_train.columns if f not in ['Survived', 'PassengerId']
        ]  # the passenger ID column must be dropped as well
        for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train)):
            trn_x, trn_y = X_train[feats].iloc[trn_idx], y_train.iloc[trn_idx]
            val_x, val_y = X_train[feats].iloc[val_idx], y_train.iloc[val_idx]
            clf = LogisticRegression(**params, random_state=123)
            clf.fit(trn_x, trn_y)
            train_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
            train_class[val_idx] = clf.predict(val_x)

            del clf, trn_x, trn_y, val_x, val_y
            gc.collect()
        count = count + 1
        if Metrics == 'roc_auc':
            score = roc_auc_score(y_train, train_preds)
        elif Metrics == 'accuracy':
            score = accuracy_score(y_train, train_class)
        elif Metrics == 'f1':
            score = f1_score(y_train, train_class)
        print("Round %s, %s score: %f" % (str(count), Metrics, score))
        return -score

    algo = partial(tpe.suggest, n_startup_jobs=20)
    trials = Trials()
    # max_evals: number of optimization iterations
    best = fmin(function_model,
                parameter_space,
                algo=algo,
                max_evals=evals_num,
                trials=trials)

    # For hp.choice dimensions, fmin returns the option's index, so map it
    # back to the actual value
    best["penalty"] = penalty_list[best['penalty']]
    print('best:\n', best)

    clf = LogisticRegression(**best, random_state=123)
    hp_score = cross_val_score(
        clf, X_train, y_train, cv=5,
        scoring=Metrics).mean()
    print('Score with Bayesian-optimized parameters:', hp_score)

    clf = LogisticRegression(random_state=123)
    base_score = cross_val_score(clf, X_train, y_train, cv=5,
                                 scoring=Metrics).mean()
    print('Score with default parameters:', base_score)

    return best
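The manual penalty_list[best['penalty']] mapping above is what hyperopt's space_eval does in general: it resolves every hp.choice index in fmin's output back to the underlying value in one call:

from hyperopt import space_eval

# Equivalent to the manual index lookup, but covers every choice at once
best_params = space_eval(parameter_space, best)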
Example #21
    return score(pred, valid_y)


def score(pred, y):
    '''
    Score the final test result; the metric differs by task, so this
    needs to change each time.
    '''

    metric = rmse(y, pred)
    print(metric)
    return metric


if __name__ == '__main__':
    train_x, valid_x, train_y, valid_y = get_train_dataset()

    param_space_reg_skl_lasso = {
        'alpha': hp.loguniform("alpha", numpy.log(0.00001), numpy.log(0.1)),
        'random_state': skl_random_seed,
        "max_evals": lasso_max_evals,
    }

    best = fmin(objective,
                param_space_reg_skl_lasso,
                algo=partial(tpe.suggest, n_startup_jobs=1),
                max_evals=100,
                trials=Trials())
    print(best)
    print(objective(best))
Example #22
def lgbTraining(x_train, y_train, p):
    train_x, valid_x, train_y, valid_y = train_test_split(x_train.values,
                                                          y_train.values,
                                                          test_size=0.3,
                                                          random_state=42)
    train = lgb.Dataset(train_x, train_y)
    valid = lgb.Dataset(valid_x, valid_y, reference=train)

    # Define the hyperopt search space
    space = {
        "max_depth": hp.randint("max_depth", 15),
        "num_trees": hp.randint("num_trees", 20),
        'learning_rate': hp.randint('learning_rate', 20),
        "num_leaves": hp.randint("num_leaves", 10),
        "lambda_l1": hp.randint("lambda_l1", 6)
    }

    def argsDict_transform(argsDict, isPrint=False):
        argsDict["max_depth"] = argsDict["max_depth"] + 10
        argsDict["num_trees"] = argsDict["num_trees"] * 5 + 100
        argsDict["learning_rate"] = argsDict["learning_rate"] * 0.01 + 0.01
        argsDict["num_leaves"] = argsDict["num_leaves"] * 3 + 10
        argsDict["lambda_l1"] = argsDict["lambda_l1"] * 0.1
        if isPrint:
            print(argsDict)
        else:
            pass
        return argsDict

    def lightgbm_factory(argsDict):
        argsDict = argsDict_transform(argsDict)
        params = {
            'nthread': -1,  # number of threads
            'max_depth': argsDict['max_depth'],  # maximum tree depth
            'num_trees': argsDict['num_trees'],  # number of trees
            'learning_rate': argsDict['learning_rate'],  # learning rate
            'num_leaves': argsDict['num_leaves'],  # number of leaves per tree
            'lambda_l1': argsDict["lambda_l1"],  # L1 regularization
            'lambda_l2': 0,  # L2 regularization
            'objective': 'regression',
            'bagging_seed': 100  # random seed (LightGBM's default is 100)
        }
        params['metric'] = ['mae']
        model_lgb = lgb.train(params,
                              train,
                              num_boost_round=20000,
                              valid_sets=[valid],
                              early_stopping_rounds=100)
        return get_transformer_score(model_lgb)

    # Keep only samples whose actual power exceeds 0.03 * p
    valid_y_new = valid_y[valid_y > 0.03 * p]
    valid_y_new_index = np.argwhere(valid_y > 0.03 * p).flatten()

    def get_transformer_score(transformer):
        model = transformer
        prediction = model.predict(valid_x, num_iteration=model.best_iteration)
        prediction_new = prediction[valid_y_new_index]
        return mean_absolute_error(valid_y_new, prediction_new)

    # Run hyperopt auto-tuning
    algo = partial(tpe.suggest, n_startup_jobs=1)
    best = fmin(lightgbm_factory,
                space,
                algo=algo,
                max_evals=100,
                pass_expr_memo_ctrl=None)
    MAE = lightgbm_factory(best) / p

    return MAE, best