Example #1
def exploratory_experiment(df, target, target_type='R'):
    '''
    Func: exploratory machine-learning experiment
    In: DataFrame of feature data
    target --> dependent variable
    target_type --> 'R' continuous (regression); 'C' categorical (classification)
    '''
    if target_type in ('R', 'r'):
        from pycaret.regression import compare_models, setup
    elif target_type in ('C', 'c'):
        from pycaret.classification import compare_models, setup
    else:
        raise ValueError("target_type must be 'R' (regression) or 'C' (classification)")
    setup(data=df, target=target)
    compare_models()
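A minimal usage sketch for this helper (hypothetical toy data; assumes PyCaret is installed and that setup() can run non-interactively in your environment):

import pandas as pd

# Hypothetical toy data: two features and a continuous target.
df = pd.DataFrame({
    'x1': [float(v) for v in range(100)],
    'x2': [v * 0.5 for v in range(100)],
    'price': [v * 2.0 + 1.0 for v in range(100)],
})

# 'R' routes to pycaret.regression; 'C' would route to pycaret.classification.
exploratory_experiment(df, target='price', target_type='R')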
Example #2
def run_pycaret(name, df_train, df_test, acc_func, target):
    pycaret_acc_func_str = 'Accuracy'
    for pycaret_metrics in [
            'Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa', 'MCC',
            'MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE'
    ]:
        if pycaret_metrics.lower() in str(acc_func).lower():
            pycaret_acc_func_str = pycaret_metrics

    import traceback
    task_type = 'classification'
    if pycaret_acc_func_str in ['MAE', 'MSE', 'RMSE', 'R2', 'RMSLE', 'MAPE']:
        task_type = 'regression'
        from pycaret.regression import setup, compare_models, predict_model, blend_models, stack_models, automl, create_model
    else:
        from pycaret.classification import setup, compare_models, predict_model, blend_models, stack_models, automl, create_model

    setup_return = setup(data=df_train, target=target)

    top_models = compare_models(n_select=3,
                                verbose=False,
                                sort=pycaret_acc_func_str,
                                turbo=True,
                                blacklist=['catboost', 'xgboost'])
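    # Note: 'blacklist' is the PyCaret 2.0 parameter name; PyCaret 2.1+ renamed it to 'exclude'.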

    # Ensemble the top models and optimize the resulting model
    blender = blend_models(estimator_list=top_models, verbose=False)
    stacker = stack_models(estimator_list=top_models,
                           meta_model=top_models[0],
                           verbose=False)
    best_model = automl(optimize=pycaret_acc_func_str)

    df_test_dropped = df_test.drop(columns=[target])

    predictions = predict_model(best_model, data=df_test_dropped)

    try:
        accuracy = acc_func(list(predictions['Label']), list(df_test[target]))
    except Exception as e:
        traceback.print_exc()
        print(f'Exception computing accuracy (1): {e}')
        if task_type == 'classification':
            accuracy = acc_func([str(x) for x in list(predictions['Label'])],
                                [str(x) for x in list(df_test[target])])
        elif task_type == 'regression':
            accuracy = acc_func([float(x) for x in list(predictions['Label'])],
                                [float(x) for x in list(df_test[target])])

    return accuracy
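A hypothetical driver for run_pycaret (a sketch, assuming PyCaret 2.0, where compare_models still accepts blacklist and predict_model emits a 'Label' column; the data and the R2Metric wrapper below are invented for illustration):

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

class R2Metric:
    # Callable whose repr contains 'R2', so run_pycaret picks the regression branch.
    def __call__(self, y_pred, y_true):
        return r2_score(y_true, y_pred)

    def __repr__(self):
        return 'R2'

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(300, 3)), columns=['a', 'b', 'c'])
df['y'] = 2.0 * df['a'] + df['b'] - df['c'] + rng.normal(scale=0.1, size=300)
df_train, df_test = train_test_split(df, test_size=0.25, random_state=0)

print(run_pycaret('demo', df_train, df_test, R2Metric(), target='y'))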
Example #3
    def exec(self):

        log.info('[START] {}'.format("exec"))

        # h2o.init()

        try:

            if (platform.system() == 'Windows'):

                # Option settings
                sysOpt = {
                    # Start/end time
                    'srtDate': '2019-01-01',
                    'endDate': '2021-12-31',
                    'isOverWrite': True
                    # , 'isOverWrite': False
                }

                globalVar['inpPath'] = 'E:/DATA'
                globalVar['outPath'] = 'E:/DATA'

            else:

                # Option settings
                sysOpt = {
                    # Start/end time
                    'srtDate': globalVar['srtDate'],
                    'endDate': globalVar['endDate'],
                    # 'isOverWrite': True
                    'isOverWrite': False
                }

            isDlModelInit = False

            inpPosFile = '{}/{}'.format(globalVar['cfgPath'],
                                        'stnInfo/GA_STN_INFO.xlsx')
            posData = pd.read_excel(inpPosFile, engine='openpyxl')
            posDataL1 = posData[['id', 'lat', 'lon']]

            modelDirKeyList = ['AI_2Y']
            # modelDirKeyList = ['AI_1Y6M']
            # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']

            for k, modelDirKey in enumerate(modelDirKeyList):
                log.info("[CHECK] modelDirKey : {}".format(modelDirKey))

                for i, posInfo in posDataL1.iterrows():

                    posId = int(posInfo['id'])
                    posLat = posInfo['lat']
                    posLon = posInfo['lon']

                    if (not re.search('17', str(posId))): continue
                    # if (re.search('17|50|51|58|60|67|72|81|85|87', str(posId))): continue

                    log.info('[CHECK] posId : {}'.format(posId))

                    # break
                    inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                        globalVar['outPath'], 'FOR', serviceName, posId,
                        'final', 'proc', 'for')
                    fileList = sorted(glob.glob(inpFile))

                    # Skip when the input file is missing
                    if fileList is None or len(fileList) < 1:
                        log.error('[ERROR] inpFile : {} / {}'.format(
                            inpFile, 'Please check the input data.'))
                        continue

                    fileInfo = fileList[0]
                    inpData = pd.read_excel(fileInfo, engine='openpyxl')

                    # inpData['CA_TOT'].where(inpData['CA_TOT'] < 0, np.nan)
                    inpData.loc[inpData['CA_TOT'] < 0, 'CA_TOT'] = np.nan
                    inpData.loc[inpData['WS'] < 0, 'WS'] = np.nan
                    inpData.loc[inpData['WD'] < 0, 'WD'] = np.nan
                    inpData.loc[inpData['SWR'] < 0, 'SWR'] = np.nan
                    inpData.loc[inpData['pv'] < 0, 'pv'] = np.nan

                    inpDataL1 = inpData.dropna().reset_index(drop=True)
                    inpDataL1 = inpDataL1.sort_values(by=['dtDateKst'], axis=0)

                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d')].index.to_numpy()[0]
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-11-30', format='%Y-%m-%d')].index.to_numpy()
                    idxInfo = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime(
                            '2021-06-01', format='%Y-%m-%d')].index.to_numpy()
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2022-01-01', format='%Y-%m-%d')].index.to_numpy()

                    if (len(idxInfo) < 1): continue
                    idx = idxInfo[0]

                    # 7 days, 15 days, 1 month, 3 months, 6 months, 2 years
                    if (modelDirKey == 'AI_2Y'):
                        # Full dataset
                        # trainData = inpDataL1

                        # Split data at the 2021 boundary
                        trainData, testData = inpDataL1[0:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    elif (modelDirKey == 'AI_7D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=7)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_15D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=15)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_1M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=1)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_3M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=3)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_6M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=6)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    log.info('[CHECK] len(trainData) : {}'.format(
                        len(trainData)))
                    log.info('[CHECK] len(testData) : {}'.format(
                        len(testData)))
                    log.info('[CHECK] trainData : {} - {}'.format(
                        trainData['dtDateKst'].min(),
                        trainData['dtDateKst'].max()))
                    # log.info('[CHECK] testData : {} - {}'.format(trainData['testData'].min(), trainData['testData'].max()))

                    # trainData['year'] = trainData['dtDateKst'].dt.strftime('%Y').astype('int64')
                    # trainData['month'] = trainData['dtDateKst'].dt.strftime('%m').astype('int64')
                    # trainData['day'] = trainData['dtDateKst'].dt.strftime('%d').astype('int64')
                    # trainData['hour'] = trainData['dtDateKst'].dt.strftime('%H').astype('int64')

                    trainDataL1 = trainData[[
                        'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR',
                        'pv', 'sza', 'aza', 'et'
                    ]]
                    # trainDataL1 = trainData[['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]
                    # trainDataL1.describe()

                    # trainDataL1 = trainDataL1.loc[(trainDataL1['CA_TOT'] == 0)]

                    # CA_TOT = 0 (total cloud amount)

                    # plt.scatter(trainData['dtDateKst'], trainData['CA_TOT'])
                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])
                    # plt.scatter(trainData['pv'], trainData['SWR'])
                    # plt.scatter(trainDataL1['CA_TOT'], trainDataL1['SWR'])
                    # plt.show()

                    # trainDataL1 = trainData[['dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv']]
                    #     # )[['dtDate', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CLD', 'CF', 'SWR', 'pv']]
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CF', 'CLD', 'SWR', 'pv']].dropna()
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'CF', 'CLD', 'SWR', 'pv']]
                    #

                    # import pandas as pd
                    # from autofeat import AutoFeatClassifier
                    # from sklearn.model_selection import train_test_split
                    # from sklearn.datasets import load_breast_cancer
                    # from sklearn.linear_model import LogisticRegression
                    # from sklearn.metrics import accuracy_score, confusion_matrix
                    # #
                    # # load_breast_cancer = load_breast_cancer(as_frame=True)
                    # # X = load_breast_cancer.data
                    # # y = load_breast_cancer.target
                    # # trainData, testData
                    # model = AutoFeatClassifier(verbose=1)
                    # X_train_feature_creation = model.fit_transform(trainData, testData)
                    #
                    # import pandas as pd  # base library
                    # # from prophet import Prophet  # Prophet
                    # from neuralprophet import NeuralProphet  # NeuralProphet
                    # from sklearn.metrics import mean_absolute_error  # MAE evaluation metric
                    # from statistics import mean  # compute the mean
                    # import matplotlib.pyplot as plt  # plotting
                    #
                    # df1_nprophet_model = NeuralProphet(seasonality_mode='multiplicative')
                    # df1_nprophet_model_result = df1_nprophet_model.fit(trainData, freq="H")
                    # trainData['ds'] = trainData['dtDateKst']

                    # **********************************************************************************************************
                    # TEST
                    # ***************

                    # trainData

                    plt.scatter(trainData['dtDateKst'], trainData['pv'])
                    plt.show()

                    from pmdarima import auto_arima
                    import statsmodels.tsa.api as tsa
                    import statsmodels.api as sm
                    # arima_model = auto_arima(y_to_train, seasonal=True, m=7)

                    sxmodel = auto_arima(trainData[['pv']],
                                         exogenous=trainData[['SWR']],
                                         start_p=1,
                                         start_q=1,
                                         test='adf',
                                         max_p=3,
                                         max_q=3,
                                         m=12,
                                         start_P=0,
                                         seasonal=True,
                                         d=None,
                                         D=1,
                                         trace=True,
                                         error_action='ignore',
                                         suppress_warnings=True,
                                         stepwise=True)
                    # Fit model
                    # arima_exog_model = auto_arima(y=trainData['pv'], exogenous=trainData['SWR'], seasonal=True, m=7)
                    # Forecast
                    # y_arima_exog_forecast = arima_exog_model.predict(n_periods=365, exogenous=exog_to_test)

                    trainData.index = trainData['dtDateKst']

                    # import pmdarima as pm
                    # y = pm.datasets.load_wineind()
                    #
                    # from pmdarima.model_selection import train_test_split
                    # import numpy as np
                    # train, test = train_test_split(y, train_size=150)
                    #
                    #
                    # auto_arima = auto_arima(
                    #     y=trainData['pv'].values,
                    #     # X=trainData['dtDatekst', 'law']],
                    #     X=trainData['dtDateKst'].values,
                    #     #                   stepwise=False,
                    #     seasonal=True,
                    #     max_order=5,
                    #     m=12,
                    #     approximation=False,
                    #     information_criterion='aic')
                    #
                    # from darts.models import (
                    #     NaiveSeasonal,
                    #     NaiveDrift,
                    #     Prophet,
                    #     ExponentialSmoothing,
                    #     ARIMA,
                    #     AutoARIMA,
                    #     StandardRegressionModel,
                    #     Theta,
                    #     FFT
                    # )
                    #
                    # from darts import TimeSeries
                    # series = TimeSeries.from_dataframe(trainData, time_col='dtDateKst', value_cols='pv', fill_missing_dates=True, freq='H')
                    #
                    #
                    # # for model in (
                    # #         NaiveSeasonal,
                    # #         NaiveDrift,
                    # #         Prophet,
                    # #         ExponentialSmoothing,
                    # #         ARIMA,
                    # #         AutoARIMA,
                    # #         # StandardRegressionModel, -> requires train_n_points at initialization
                    # #         Theta,
                    # #         FFT
                    # #     )
                    # m = model()
                    # m.fit(trainData)
                    # pred = m.predict(len(val))

                    # **********************************************************************************************************
                    # Machine learning
                    # **********************************************************************************************************
                    # Time series
                    # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63
                    #
                    # saveCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'], serviceName, "trainDataL4")
                    # # trainDataL4.to_csv(saveCsvFile, index=False)
                    # log.info('[CHECK] saveCsvFile : {}'.format(saveCsvFile))
                    #
                    # trainDataL4 = pd.read_csv(saveCsvFile)
                    # trainDataL4.describe()

                    saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'pycaret', 'for', '*')
                    saveMlModelList = sorted(glob.glob(saveMlModel),
                                             reverse=True)

                    # Train only when no saved model exists
                    # if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                    if (len(saveMlModelList) < 1):
                        pyModel = setup(data=trainDataL1,
                                        session_id=123,
                                        silent=True,
                                        target='pv')

                        # AutoML comparison across candidate models
                        modelList = compare_models(sort='RMSE', n_select=3)

                        # Ensemble model
                        blendModel = blend_models(estimator_list=modelList,
                                                  fold=10)

                        # Tune the ensemble
                        tuneModel = tune_model(blendModel,
                                               fold=10,
                                               choose_better=True)

                        # Finalized model
                        fnlModel = finalize_model(tuneModel)

                        # Save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'pycaret', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        save_model(fnlModel, saveModel)

                    # **********************************************************************************************************
                    # Deep learning
                    # **********************************************************************************************************
                    saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'h2o', 'for', '*')
                    saveDlModelList = sorted(glob.glob(saveDlModel),
                                             reverse=True)

                    # Train only when no saved model exists (or overwrite is enabled)
                    if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):

                        if (isDlModelInit == False):
                            h2o.init()
                            isDlModelInit = True

                        # dnModel = H2OAutoML(max_models=20, max_runtime_secs=10000, balance_classes=True, seed=123)
                        # 2022-03-29
                        # dnModel = H2OAutoML(max_models=2, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel = H2OAutoML(max_models=20,
                                            max_runtime_secs=99999,
                                            balance_classes=True,
                                            seed=123)

                        # java.lang.OutOfMemoryError: Java heap space
                        # dnModel = H2OAutoML(max_models=None, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=30, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel.train(x=[
                            'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS',
                            'SWR', 'sza', 'aza', 'et'
                        ],
                                      y='pv',
                                      training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))

                        # trainSet, validSet = np.split(trainDataL1, [int(0.70 * len(trainDataL1))])
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSetL1), validation_frame=h2o.H2OFrame(validSetL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2), validation_frame=h2o.H2OFrame(testData))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSet), validation_frame=h2o.H2OFrame(validSet))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2))

                        # Save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'h2o', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)

                        # h2o.save_model(model=dnModel.get_best_model(), path=os.path.dirname(saveModel), filename=os.path.basename(saveModel), force=True)
                        dnModel.get_best_model().save_mojo(
                            path=os.path.dirname(saveModel),
                            filename=os.path.basename(saveModel),
                            force=True)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
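A hedged scoring sketch for the MOJO saved above (assumes h2o 3.28+, where h2o.import_mojo is available; the path and the single-row frame are hypothetical):

import h2o
import pandas as pd

h2o.init()

# Hypothetical path to a MOJO written by dnModel.get_best_model().save_mojo(...)
mojoPath = 'E:/DATA/AI_2Y/SERVICE-SRV00017-final-h2o-for-20220101.model'
mojoModel = h2o.import_mojo(mojoPath)

# One hypothetical row carrying the feature columns used in training.
newData = pd.DataFrame([{
    'CA_TOT': 0.0, 'HM': 55.0, 'PA': 1013.0, 'TA': 21.0, 'TD': 12.0,
    'WD': 180.0, 'WS': 2.5, 'SWR': 650.0, 'sza': 35.0, 'aza': 170.0, 'et': 900.0,
}])
print(mojoModel.predict(h2o.H2OFrame(newData)).as_data_frame())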
    def exec(self):

        log.info('[START] {}'.format("exec"))

        # import pandas as pd
        # import numpy as np

        h2o.init()

        try:
            if (platform.system() == 'Windows'):

                # Option settings
                sysOpt = {
                    # Start/end time
                    'srtDate': '2021-10-01',
                    'endDate': '2021-11-01',
                    'isOverWrite': True
                    # , 'isOverWrite': False
                }

                globalVar['inpPath'] = 'E:/DATA'
                globalVar['outPath'] = 'E:/DATA'

            else:

                # Option settings
                sysOpt = {
                    # Start/end time
                    'srtDate': globalVar['srtDate'],
                    'endDate': globalVar['endDate'],
                    # 'isOverWrite': True
                    'isOverWrite': False
                }

            inpPosFile = '{}/{}'.format(globalVar['cfgPath'],
                                        'stnInfo/GA_STN_INFO.xlsx')
            posData = pd.read_excel(inpPosFile, engine='openpyxl')
            posDataL1 = posData[['id', 'lat', 'lon']]

            modelDirKeyList = ['AI_2Y']
            # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']
            for k, modelDirKey in enumerate(modelDirKeyList):
                log.info("[CHECK] modelDirKey : {}".format(modelDirKey))

                for i, posInfo in posDataL1.iterrows():
                    posId = int(posInfo['id'])
                    posLat = posInfo['lat']
                    posLon = posInfo['lon']

                    log.info(
                        "[CHECK] posId (posLon, posLat) : {} ({}, {})".format(
                            posId, posLon, posLat))

                    # break
                    inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                        globalVar['outPath'], 'ACT', serviceName, posId,
                        'final', 'proc', 'act')
                    fileList = sorted(glob.glob(inpFile))

                    # Skip when the input file is missing
                    if fileList is None or len(fileList) < 1:
                        log.error('[ERROR] inpFile : {} / {}'.format(
                            inpFile, 'Please check the input data.'))
                        continue

                    fileInfo = fileList[0]
                    inpData = pd.read_excel(fileInfo, engine='openpyxl')

                    inpData.loc[inpData['CA_TOT'] < 0, 'CA_TOT'] = np.nan
                    inpData.loc[inpData['WS'] < 0, 'WS'] = np.nan
                    inpData.loc[inpData['WD'] < 0, 'WD'] = np.nan
                    inpData.loc[inpData['SWR'] < 0, 'SWR'] = np.nan
                    inpData.loc[inpData['pv'] < 0, 'pv'] = np.nan

                    inpDataL1 = inpData.dropna().reset_index(drop=True)

                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d')].index.to_numpy()[0]
                    idxInfo = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime(
                            '2021-10-30', format='%Y-%m-%d')].index.to_numpy()
                    if (len(idxInfo) < 1): continue
                    idx = idxInfo[0]

                    # 7 days, 15 days, 1 month, 3 months, 6 months, 2 years
                    if (modelDirKey == 'AI_2Y'):
                        # Split data at the 2021 boundary
                        # trainData, testData = inpDataL1[0:idx], inpDataL1[idx:len(inpDataL1)]

                        # Full dataset
                        # trainData = inpDataL1

                        # Split data at the 2021 boundary
                        trainData, testData = inpDataL1[0:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    elif (modelDirKey == 'AI_7D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=7)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_15D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=15)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_1M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=1)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_3M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=3)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_6M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=6)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    # Adjusted to the 2021 baseline
                    trainDataL1 = trainData[[
                        'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR',
                        'pv', 'sza', 'aza', 'et'
                    ]]

                    # trainDataL1.describe()

                    # **********************************************************************************************************
                    # Machine learning
                    # **********************************************************************************************************
                    # Time series
                    # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63
                    #
                    # saveCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'], serviceName, "trainDataL4")
                    # # trainDataL4.to_csv(saveCsvFile, index=False)
                    # log.info('[CHECK] saveCsvFile : {}'.format(saveCsvFile))
                    #
                    # trainDataL4 = pd.read_csv(saveCsvFile)
                    # trainDataL4.describe()

                    saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'pycaret', 'act', '*')
                    saveMlModelList = sorted(glob.glob(saveMlModel),
                                             reverse=True)

                    # Train only when no saved model exists (or overwrite is enabled)
                    if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                        pyModel = setup(data=trainDataL1,
                                        session_id=123,
                                        silent=True,
                                        target='pv')

                        # AutoML comparison across candidate models
                        modelList = compare_models(sort='RMSE', n_select=10)

                        # Ensemble model
                        blendModel = blend_models(estimator_list=modelList,
                                                  fold=10)

                        # Tune the ensemble
                        tuneModel = tune_model(blendModel,
                                               fold=2,
                                               choose_better=True)
                        log.info("[CHECK] tuneModel : {}".format(tuneModel))

                        # Finalized model
                        fnlModel = finalize_model(tuneModel)
                        log.info("[CHECK] fnlModel : {}".format(fnlModel))

                        # Save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'pycaret', 'act',
                            datetime.now().strftime("%Y%m%d"))
                        log.info("[CHECK] saveModel : {}".format(saveModel))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        save_model(fnlModel, saveModel)

                    # **********************************************************************************************************
                    # Deep learning
                    # **********************************************************************************************************
                    saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'h2o', 'act', '*')
                    saveDlModelList = sorted(glob.glob(saveDlModel),
                                             reverse=True)

                    # Train only when no saved model exists (or overwrite is enabled)
                    if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):
                        # Limit the number of models
                        # Limit the runtime
                        # dnModel = H2OAutoML(max_models=20, max_runtime_secs=10000, balance_classes=True, seed=123)
                        dnModel = H2OAutoML(max_models=40,
                                            max_runtime_secs=20000,
                                            balance_classes=True,
                                            seed=123)

                        # trainSet, validSet = np.split(trainDataL1, [int(0.70 * len(trainDataL1))])

                        dnModel.train(x=[
                            'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS',
                            'SWR', 'sza', 'aza', 'et'
                        ],
                                      y='pv',
                                      training_frame=h2o.H2OFrame(trainDataL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSetL1), validation_frame=h2o.H2OFrame(validSetL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2), validation_frame=h2o.H2OFrame(testData))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSet), validation_frame=h2o.H2OFrame(validSet))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2))

                        # Save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'h2o', 'act',
                            datetime.now().strftime("%Y%m%d"))
                        log.info("[CHECK] saveModel : {}".format(saveModel))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        h2o.save_model(model=dnModel.get_best_model(),
                                       path=os.path.dirname(saveModel),
                                       filename=os.path.basename(saveModel),
                                       force=True)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e

        finally:
            log.info('[END] {}'.format("exec"))
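The binary model written by h2o.save_model above can be reloaded later with h2o.load_model (a sketch; the path is hypothetical):

import h2o

h2o.init()

# Hypothetical path produced by the h2o.save_model(...) call above.
savedPath = 'E:/DATA/AI_2Y/SERVICE-SRV00017-final-h2o-act-20220101.model'
dlModel = h2o.load_model(savedPath)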
    def exec(self):

        log.info('[START] {}'.format("exec"))

        # h2o.init()
        import pandas as pd

        try:

            if (platform.system() == 'Windows'):

                # Option settings
                sysOpt = {
                    # Start/end time
                    'srtDate': '2019-01-01',
                    'endDate': '2021-12-31',
                    'isOverWrite': True
                    # , 'isOverWrite': False
                }

                globalVar['inpPath'] = 'E:/DATA'
                globalVar['outPath'] = 'E:/DATA'

            else:

                # Option settings
                sysOpt = {
                    # Start/end time
                    'srtDate': globalVar['srtDate'],
                    'endDate': globalVar['endDate'],
                    # 'isOverWrite': True
                    'isOverWrite': False
                }

            isDlModelInit = False

            inpPosFile = '{}/{}'.format(globalVar['cfgPath'],
                                        'stnInfo/GA_STN_INFO.xlsx')
            posData = pd.read_excel(inpPosFile, engine='openpyxl')
            posDataL1 = posData[['id', 'lat', 'lon']]

            modelDirKeyList = ['AI_2Y']
            # modelDirKeyList = ['AI_1Y6M']
            # modelDirKeyList = ['AI_2Y', 'AI_7D', 'AI_15D', 'AI_1M', 'AI_3M', 'AI_6M']

            for k, modelDirKey in enumerate(modelDirKeyList):
                log.info("[CHECK] modelDirKey : {}".format(modelDirKey))

                for i, posInfo in posDataL1.iterrows():

                    posId = int(posInfo['id'])
                    posLat = posInfo['lat']
                    posLon = posInfo['lon']

                    # if (not re.search('51', str(posId))): continue
                    # if (not re.search('17', str(posId))): continue
                    # if (re.search('17|50|51|58|60|67|72|81|85|87', str(posId))): continue

                    log.info('[CHECK] posId : {}'.format(posId))

                    # break
                    inpFile = '{}/{}/{}-SRV{:05d}-{}-{}-{}.xlsx'.format(
                        globalVar['outPath'], 'FOR', serviceName, posId,
                        'final', 'proc', 'for')
                    fileList = sorted(glob.glob(inpFile))

                    # Skip when the input file is missing
                    if fileList is None or len(fileList) < 1:
                        log.error('[ERROR] inpFile : {} / {}'.format(
                            inpFile, 'Please check the input data.'))
                        continue

                    fileInfo = fileList[0]
                    inpData = pd.read_excel(fileInfo, engine='openpyxl')

                    inpData.loc[inpData['CA_TOT'] < 0, 'CA_TOT'] = np.nan
                    inpData.loc[inpData['WS'] < 0, 'WS'] = np.nan
                    inpData.loc[inpData['WD'] < 0, 'WD'] = np.nan
                    inpData.loc[inpData['SWR'] < 0, 'SWR'] = np.nan
                    inpData.loc[inpData['pv'] < 0, 'pv'] = np.nan

                    inpDataL1 = inpData.dropna().reset_index(drop=True)
                    inpDataL1 = inpDataL1.sort_values(by=['dtDateKst'], axis=0)

                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-01-01', format='%Y-%m-%d')].index.to_numpy()[0]
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2021-11-30', format='%Y-%m-%d')].index.to_numpy()
                    idxInfo = inpDataL1.loc[
                        inpDataL1['dtDateKst'] >= pd.to_datetime(
                            '2021-06-01', format='%Y-%m-%d')].index.to_numpy()
                    # idxInfo = inpDataL1.loc[inpDataL1['dtDateKst'] >= pd.to_datetime('2022-01-01', format='%Y-%m-%d')].index.to_numpy()

                    if (len(idxInfo) < 1): continue
                    idx = idxInfo[0]

                    # 7 days, 15 days, 1 month, 3 months, 6 months, 2 years
                    if (modelDirKey == 'AI_2Y'):
                        # Full dataset
                        # trainData = inpDataL1

                        # Split data at the 2021 boundary
                        trainData, testData = inpDataL1[0:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    elif (modelDirKey == 'AI_7D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=7)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_15D'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            timedelta(days=15)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_1M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=1)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_3M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=3)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]
                    elif (modelDirKey == 'AI_6M'):
                        srtIdx = inpDataL1.loc[
                            inpDataL1['dtDateKst'] >=
                            pd.to_datetime('2021-01-01', format='%Y-%m-%d') -
                            relativedelta(months=6)].index.to_numpy()[0]
                        trainData, testData = inpDataL1[srtIdx:idx], inpDataL1[
                            idx:len(inpDataL1)]

                    log.info('[CHECK] len(trainData) : {}'.format(
                        len(trainData)))
                    log.info('[CHECK] len(testData) : {}'.format(
                        len(testData)))
                    log.info('[CHECK] trainData : {} - {}'.format(
                        trainData['dtDateKst'].min(),
                        trainData['dtDateKst'].max()))
                    # log.info('[CHECK] testData : {} - {}'.format(trainData['testData'].min(), trainData['testData'].max()))

                    # trainData['year'] = trainData['dtDateKst'].dt.strftime('%Y').astype('int64')
                    # trainData['month'] = trainData['dtDateKst'].dt.strftime('%m').astype('int64')
                    # trainData['day'] = trainData['dtDateKst'].dt.strftime('%d').astype('int64')

                    # trainDataL1 = trainData[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]
                    # trainDataL1 = trainData[['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv', 'sza', 'aza', 'et']]
                    # trainDataL1.describe()

                    # trainDataL1 = trainDataL1.loc[(trainDataL1['CA_TOT'] == 0)]

                    # CA_TOT = 0 (total cloud amount)
                    # trainData.info()

                    # trainData['dtDateKst'] = pd.to_datetime(trainData['dtDateKst'])
                    # plt.scatter(trainData['dtDateKst'][0], trainData['CA_TOT'][0])
                    # plt.scatter(trainData['dtDate'], trainData['CA_TOT'])
                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])
                    # plt.scatter(trainData['pv'], trainData['SWR'])
                    # plt.scatter(trainDataL1['CA_TOT'], trainDataL1['SWR'])

                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])

                    log.info('[CHECK] min-max : {} - {}'.format(
                        int(trainData['pv'].min()),
                        int(trainData['pv'].max())))

                    mainTitle = '[{:05d}] {}'.format(
                        posId,
                        'Input data (power generation) time series using weather forecast (numerical model) information')
                    saveImg = '{}/{}/{}/{}.png'.format(globalVar['figPath'],
                                                       serviceName,
                                                       modelDirKey, mainTitle)
                    os.makedirs(os.path.dirname(saveImg), exist_ok=True)
                    plt.scatter(trainData['dtDateKst'], trainData['pv'])
                    plt.title('{:05d}'.format(posId))
                    plt.savefig(saveImg, dpi=600, bbox_inches='tight')
                    # plt.scatter(trainData['dtDateKst'], trainData['SWR'])
                    # plt.scatter(trainData['dtDateKst'], trainData['sza'])
                    # plt.scatter(trainData['dtDateKst'], trainData['aza'])
                    plt.show()
                    plt.close()

                    continue

                    # trainData.plot()
                    # plt.show()
                    # plt.close()
                    # trainDataL1 = trainData[['dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'pv']]
                    #     # )[['dtDate', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CLD', 'CF', 'SWR', 'pv']]
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'DSR', 'CF', 'CLD', 'SWR', 'pv']].dropna()
                    #     # )[['dtDate', 'dtDateKst', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'CF', 'CLD', 'SWR', 'pv']]

                    # import pandas as pd
                    # from autofeat import AutoFeatClassifier
                    # from sklearn.model_selection import train_test_split
                    # from sklearn.datasets import load_breast_cancer
                    # from sklearn.linear_model import LogisticRegression
                    # from sklearn.metrics import accuracy_score, confusion_matrix
                    # #
                    # # load_breast_cancer = load_breast_cancer(as_frame=True)
                    # # X = load_breast_cancer.data
                    # # y = load_breast_cancer.target
                    # # trainData, testData
                    # model = AutoFeatClassifier(verbose=1)
                    # X_train_feature_creation = model.fit_transform(trainData, testData)
                    #
                    # import pandas as pd  # base library
                    # # from prophet import Prophet  # Prophet
                    # from neuralprophet import NeuralProphet  # NeuralProphet
                    # from sklearn.metrics import mean_absolute_error  # MAE evaluation metric
                    # from statistics import mean  # compute the mean
                    # import matplotlib.pyplot as plt  # plotting
                    #
                    # df1_nprophet_model = NeuralProphet(seasonality_mode='multiplicative')
                    # df1_nprophet_model_result = df1_nprophet_model.fit(trainData, freq="H")
                    # trainData['ds'] = trainData['dtDateKst']
                    #
                    # import pandas as pd
                    # from pycaret.datasets import get_data
                    # data = get_data('pycaret_downloads')
                    # data['Date'] = pd.to_datetime(data['Date'])
                    # data = data.groupby('Date').sum()
                    # data = data.asfreq('D')
                    # data.head()
                    #
                    # # plot the data
                    # data.plot()
                    # plt.show()
                    #
                    # trainData.drop_duplicates(subset=['dtDateKst'], inplace=True)
                    # trainDataL2 = trainData[['pv']]
                    # trainDataL2.index = trainData['dtDateKst']

                    # import pycaret.classification
                    # from pycaret.time_series import *
                    # from pycaret.internal.pycaret_experiment import TimeSeriesExperiment
                    # pyModel = setup(trainDataL2, fh=7, fold=3, session_id=123)
                    # pyModel = setup(trainData, target = 'Price', fh=7, fold=3, session_id=123)

                    # AutoML comparison across candidate models
                    # modelList = compare_models(sort='RMSE', n_select=3)
                    # modelList = compare_models(sort='RMSE')

                    # tuneModel = stack_models(modelList)

                    # Ensemble model
                    # blendModel = blend_models(estimator_list=modelList, fold=5)

                    # Tune the ensemble
                    # tuneModel = tune_model(modelList, fold=5, choose_better=True)

                    # # Finalized model
                    # fnlModel = finalize_model(tuneModel)
                    #
                    # predict_model(fnlModel, fh=90)
                    #
                    # plot_model(fnlModel, plot='forecast', data_kwargs = { 'fh' : 30 })
                    # # plot_model(modelList[0], plot='forecast', data_kwargs = { 'fh' : 30 })
                    # # plot_model(modelList[0], plot='forecast', data_kwargs = { 'fh' : 30 })
                    # plot_model(fnlModel, plot='insample')

                    # **********************************************************************************************************
                    # Machine learning
                    # **********************************************************************************************************
                    # Time series
                    # https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63
                    #
                    # saveCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'], serviceName, "trainDataL4")
                    # # trainDataL4.to_csv(saveCsvFile, index=False)
                    # log.info('[CHECK] saveCsvFile : {}'.format(saveCsvFile))
                    #
                    # trainDataL4 = pd.read_csv(saveCsvFile)
                    # trainDataL4.describe()

                    saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'pycaret', 'for', '*')
                    saveMlModelList = sorted(glob.glob(saveMlModel),
                                             reverse=True)

                    # Train only when no saved model exists
                    # if (sysOpt['isOverWrite']) or (len(saveMlModelList) < 1):
                    if (len(saveMlModelList) < 1):
                        pyModel = setup(data=trainDataL1,
                                        session_id=123,
                                        silent=True,
                                        target='pv')

                        # AutoML comparison across candidate models
                        modelList = compare_models(sort='RMSE', n_select=3)

                        # Ensemble model
                        blendModel = blend_models(estimator_list=modelList,
                                                  fold=10)

                        # Tune the ensemble
                        tuneModel = tune_model(blendModel,
                                               fold=10,
                                               choose_better=True)

                        # Finalized model
                        fnlModel = finalize_model(tuneModel)

                        # Save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'pycaret', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)
                        save_model(fnlModel, saveModel)

                    # **********************************************************************************************************
                    # Deep learning
                    # **********************************************************************************************************
                    saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                        globalVar['outPath'], modelDirKey, serviceName, posId,
                        'final', 'h2o', 'for', '*')
                    saveDlModelList = sorted(glob.glob(saveDlModel),
                                             reverse=True)

                    # Train only when no saved model exists (or overwrite is enabled)
                    if (sysOpt['isOverWrite']) or (len(saveDlModelList) < 1):

                        if (isDlModelInit == False):
                            h2o.init()
                            isDlModelInit = True

                        # dnModel = H2OAutoML(max_models=20, max_runtime_secs=10000, balance_classes=True, seed=123)
                        # 2022-03-29
                        # dnModel = H2OAutoML(max_models=2, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel = H2OAutoML(max_models=20,
                                            max_runtime_secs=99999,
                                            balance_classes=True,
                                            seed=123)

                        # java.lang.OutOfMemoryError: Java heap space
                        # dnModel = H2OAutoML(max_models=None, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=30, max_runtime_secs=99999, balance_classes=True, seed=123)
                        # dnModel = H2OAutoML(max_models=40, max_runtime_secs=20000, balance_classes=True, seed=123)
                        dnModel.train(x=[
                            'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS',
                            'SWR', 'sza', 'aza', 'et'
                        ],
                                      y='pv',
                                      training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['year', 'month', 'day', 'hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))
                        # dnModel.train(x=['hour', 'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL1))

                        # trainSet, validSet = np.split(trainDataL1, [int(0.70 * len(trainDataL1))])
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSetL1), validation_frame=h2o.H2OFrame(validSetL1))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2), validation_frame=h2o.H2OFrame(testData))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainSet), validation_frame=h2o.H2OFrame(validSet))
                        # aml.train(x=['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'], y='pv', training_frame=h2o.H2OFrame(trainDataL2))

                        # Save the trained model
                        saveModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['outPath'], modelDirKey, serviceName,
                            posId, 'final', 'h2o', 'for',
                            datetime.now().strftime("%Y%m%d"))
                        os.makedirs(os.path.dirname(saveModel), exist_ok=True)

                        # h2o.save_model(model=dnModel.get_best_model(), path=os.path.dirname(saveModel), filename=os.path.basename(saveModel), force=True)
                        dnModel.get_best_model().save_mojo(
                            path=os.path.dirname(saveModel),
                            filename=os.path.basename(saveModel),
                            force=True)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
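A hedged counterpart for reusing the PyCaret model saved via save_model above (a sketch, assuming the PyCaret 2.x regression API; the path is hypothetical, and the one-row frame stands in for real test data with the training feature columns):

import pandas as pd
from pycaret.regression import load_model, predict_model

# save_model(fnlModel, saveModel) writes '<saveModel>.pkl'; load_model takes the same base path.
savedPath = 'E:/DATA/AI_2Y/SERVICE-SRV00017-final-pycaret-for-20220101.model'
fnlModel = load_model(savedPath)

# Hypothetical single-row frame with the training feature columns.
testData = pd.DataFrame([{
    'CA_TOT': 0.0, 'HM': 55.0, 'PA': 1013.0, 'TA': 21.0, 'TD': 12.0,
    'WD': 180.0, 'WS': 2.5, 'SWR': 650.0, 'sza': 35.0, 'aza': 170.0, 'et': 900.0,
}])
predData = predict_model(fnlModel, data=testData)
print(predData['Label'])  # 'Label' holds predictions in PyCaret 2.x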
Example #6
                         'Default Task'].tolist()[0]

    # Describe data
    st.write(
        f'This dataset has {df.shape[0]} samples and {df.shape[1]} features. Target variable is {df_target}.'
    )
    st.dataframe(df.head())

    if df_task in ['NLP / Regression', 'Regression']:

        # Setup PyCaret
        with st.spinner('PyCaret setup is running...'):
            pycset = regression.setup(data=df, target=df_target)

        # Compare models (compare_models returns the best estimator;
        # the scoring grid itself is fetched with pull())
        regression.compare_models()
        st.dataframe(regression.pull())

        # End
        st.success('End of execution!')

    if df_task in ['Classification (Binary)', 'Classification (Multiclass)']:

        # Setup PyCaret
        with st.spinner('PyCaret setup is running...'):
            pycset = classification.setup(data=df, target=df_target)

        # Compare models (the scoring grid is fetched with pull(), as above)
        classification.compare_models()
        st.dataframe(classification.pull())

        # End
        st.success('End of execution!')
Example #7
0
    def exec(self):

        log.info('[START] {}'.format("exec"))

        try:

            import pandas as pd
            # Additional imports this snippet relies on:
            from pycaret.regression import (setup, compare_models, blend_models,
                                            tune_model, finalize_model,
                                            predict_model)
            import h2o
            from h2o.automl import H2OAutoML

            globalVar['inpPath'] = 'E:/DATA/OUTPUT'
            globalVar['outPath'] = 'E:/DATA/OUTPUT'

            inpCsvFile = '{}/{}_{}.csv'.format(globalVar['outPath'],
                                               serviceName, 'TrainData')
            # Save CSV file
            # umDataL10.to_csv(saveCsvFile, index=False)

            data = pd.read_csv(inpCsvFile)

            # test = trainDataL7.dropna().reset_index(drop=True)

            data = data.drop(['ML', 'DL'], axis=1)
            data['dtDateKst'] = pd.to_datetime(data['dtDateKst'])

            # testL1 = test[['CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'PM10', 'SWR', 'sza', 'aza', 'et', 'pv']]
            dataL1 = data[[
                'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza',
                'aza', 'et', 'pv'
            ]]

            pyModel = setup(data=dataL1, target='pv', session_id=123)

            try:

                # Automated ML across candidate models
                modelList = compare_models(sort='RMSE', n_select=3)

                # Ensemble model
                blendModel = blend_models(estimator_list=modelList, fold=2)

                # Tune the ensemble
                tuneModel = tune_model(blendModel, fold=2, choose_better=True)

                # Final trained model
                fnlModel = finalize_model(tuneModel)

            except Exception as e:
                log.error("Exception : {}".format(e))

            # evaluate_model(tuneModel)

            # pred_holdout = predict_model(fnlModel)

            # print(fnlModel)

            # Regression diagnostics plot
            # plot_model(fnlModel, plot='error')
            # plt.show()

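            # NOTE: if the AutoML block above raised, fnlModel is undefined and
            # the assignment below fails with a NameError.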
            mlModel = fnlModel

            # predData = predict_model(fnlModel, data=dataL1)

            # check_metric(dataL1['pv'], dataL1['Label'], metric='RMSE')  # noted result: 24.4427

            # h2o
            h2o.init()
            aml = H2OAutoML(max_models=20,
                            max_runtime_secs=10000,
                            balance_classes=True,
                            seed=1)
            aml.train(x=[
                'CA_TOT', 'HM', 'PA', 'TA', 'TD', 'WD', 'WS', 'SWR', 'sza',
                'aza', 'et'
            ],
                      y='pv',
                      training_frame=h2o.H2OFrame(dataL1),
                      validation_frame=h2o.H2OFrame(dataL1))
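
            # NOTE: balance_classes applies only to classification and is ignored
            # for this regression target; also, the same frame is passed as both
            # training_frame and validation_frame, so the reported validation
            # metrics are not an independent estimate.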

            dlModel = aml.get_best_model()

            dataL2 = data
            dataL3 = predict_model(mlModel,
                                   data=dataL2).rename({'Label': 'ML'},
                                                       axis='columns')
            dataL3['DL'] = dlModel.predict(
                h2o.H2OFrame(dataL2)).as_data_frame()

            anaTimeList = dataL3['anaTime'].unique()

            for j, anaTimeInfo in enumerate(anaTimeList):

                dataL4 = dataL3.loc[dataL3['anaTime'] == anaTimeInfo].dropna(
                ).reset_index(drop=True)

                mainTitle = '[{}] {}'.format(
                    anaTimeInfo, '기상 예보 정보 (수치모델)를 활용한 48시간 예측 시계열')
                saveImg = '{}/{}/{}.png'.format(globalVar['figPath'],
                                                serviceName, mainTitle)
                makeUserTimeSeriesPlot(pd.to_datetime(dataL4['dtDateKst']),
                                       dataL4['ML'], dataL4['DL'],
                                       dataL4['pv'], '예측 (머신러닝)', '예측 (딥러닝)',
                                       '실측 (발전량)', '시간 (시)', '발전량', mainTitle,
                                       saveImg, True)

            mainTitle = '[{}-{}] {}'.format(
                min(anaTimeList), max(anaTimeList),
                '기상 예보 정보 (수치모델)를 활용한 머신러닝 (48시간 예측) 산점도')
            saveImg = '{}/{}/{}.png'.format(globalVar['figPath'], serviceName,
                                            mainTitle)
            makeUserScatterPlot(dataL3['ML'], dataL3['pv'], '머신러닝', '실측',
                                mainTitle, saveImg, 0, 1000, 20, 60, True)

            mainTitle = '[{}-{}] {}'.format(
                min(anaTimeList), max(anaTimeList),
                '기상 예보 정보 (수치모델)를 활용한 딥러닝 (48시간 예측) 산점도')
            saveImg = '{}/{}/{}.png'.format(globalVar['figPath'], serviceName,
                                            mainTitle)
            makeUserScatterPlot(dataL3['DL'], dataL3['pv'], '딥러닝', '실측',
                                mainTitle, saveImg, 0, 1000, 20, 60, True)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
Example #8
0
def regression_model(*, y_col, training_set, normalize, test_size, folds,
                     metric, model_name, testing_set, imbalanced, seed,
                     include_models, normalize_method):
    """
    Build a regression model for prediction.

    Parameters
    ----------
    y_col : str
        the name of the target column.
    training_set : pd.DataFrame
        DataFrame containing the training data.
    normalize : bool
        if True the dataset will be normalized before training.
    test_size : float
        Between [0.0-1.0]. The size of the split for test within the training set.
    folds : int
        number of folds for cross validation.
    metric : str
        the metric used for evaluating the best model.
    model_name : str
        the name to save the model.
    testing_set : pd.DataFrame
        the external dataset for evaluating the best model.
    imbalanced
    seed : int
        random number to initilize the process.
    include_models : List
        a list of models to be included in the process.
    normalize_method : str
        The method used for normalizing the data.

    Returns
    -------
    Final regression model

    """
    if not metric:
        metric = 'RMSE'
    setup = pyreg.setup(target=y_col,
                        data=training_set,
                        normalize=normalize,
                        normalize_method=normalize_method,
                        train_size=1 - test_size,
                        fold=folds,
                        silent=True,
                        session_id=seed)
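    # Note: train_size = 1 - test_size, so PyCaret's internal hold-out split
    # plays the role of the test split here.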
    best_model = pyreg.compare_models(sort=metric, include=include_models)
    pyreg.pull().to_csv(model_name + '_compare_models.tsv',
                        sep='\t',
                        index=False)
    reg_model = pyreg.create_model(best_model)
    reg_tuned_model = pyreg.tune_model(reg_model, optimize=metric)
    pyreg.pull().to_csv(model_name + '_tuned_model.tsv', sep='\t', index=False)
    final_model = pyreg.finalize_model(reg_tuned_model)
    pyreg.plot_model(final_model, save=True)
    pyreg.plot_model(final_model, plot='feature', save=True)
    pyreg.plot_model(final_model, plot='error', save=True)
    pyreg.save_model(final_model, model_name)
    if len(testing_set.index) != 0:
        unseen_predictions = test_regressor(
            model_path=model_name + '.pkl',
            x_set=testing_set.drop(columns=[y_col]),
            y_col=testing_set[y_col],
            output=model_name)
        unseen_predictions.to_csv(model_name + '_external_testing_results.tsv',
                                  sep='\t',
                                  index=True)
    return final_model
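
# Usage sketch (hypothetical DataFrames train_df/test_df; all names below are
# illustrative, not from the source):
# model = regression_model(y_col='pv', training_set=train_df, normalize=True,
#                          test_size=0.2, folds=5, metric='RMSE',
#                          model_name='pv_regressor', testing_set=test_df,
#                          imbalanced=False, seed=123,
#                          include_models=['rf', 'lightgbm'],
#                          normalize_method='zscore')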
Example #9
0
    # (Snippet truncated in the source: the imports and the opening of the
    # setup() call are reconstructed below; `df` and the surrounding function
    # are hypothetical.)
    import numpy as np
    from pycaret.regression import setup, compare_models, create_model, pull

    setup(
        data=df,  # hypothetical name for the elided training DataFrame
        target=y_col,
        numeric_features=x_cols,
        verbose=False,
        remove_multicollinearity=False,
        # multicollinearity_threshold = 0.6,
        ignore_low_variance=False,
        silent=True,
        n_jobs=2)

    # ---- Model selection -------------------------------------------------------------------------

    best_model = compare_models(
        include=[
            'rf', 'lightgbm', 'lasso', 'ridge', 'xgboost', 'en', 'knn', 'mlp',
            'lr', 'dt'
        ],
        sort='R2',
        verbose=True,
        fold=3,
        round=5,
    )
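
    # Optionally persist the comparison grid, mirroring Example #8 (sketch;
    # assumes pull was imported alongside compare_models above):
    # pull().to_csv('compare_models.tsv', sep='\t', index=False)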

    # ---- Model tuning ----------------------------------------------------------------------------

    # Initialize the model with fixed parameters.
    params = {'max_features': 'auto'}
    rgsr = create_model('rf', verbose=False, **params)

    # Tune the model.
    params4tuning = {
        "n_estimators": np.arange(30, 250, 30),
        "min_samples_leaf": [10, 15, 20, 30, 40, 50],