Example #1
0
def make_forecast(df, len_forecast: int):
    """
    Function for making time series forecasting with AutoTS library

    :param df: dataframe to process; long format with 'datetime' and
        'value' columns (passed to AutoTS.fit as date_col/value_col)
    :param len_forecast: forecast length (number of periods to predict)

    :return predicted_values: forecast as a numpy array
    :return model_name: name of the model (always 'AutoTS')
    """

    model = AutoTS(forecast_length=len_forecast,
                   frequency='infer',
                   prediction_interval=0.9,
                   ensemble='all',
                   model_list="superfast",
                   max_generations=15,
                   num_validations=2,
                   validation_method="backwards")

    model = model.fit(df, date_col='datetime', value_col='value')

    prediction = model.predict()
    # point forecasts dataframe
    forecasts_df = prediction.forecast

    predicted_values = np.array(forecasts_df['value'])
    # BUGFIX: previously returned 'AutoTs', contradicting the documented
    # contract that the model name is always 'AutoTS'.
    model_name = 'AutoTS'
    return predicted_values, model_name
Example #2
0
    'runtime_weighting': 0,
    'spl_weighting': 1,
    'contour_weighting': 0,
}

# Configure the AutoTS model search.
# NOTE(review): forecast_length, generations, model_list, transformer_list,
# transformer_max_depth, metric_weighting, n_jobs and verbose are defined
# earlier in the file (outside this fragment).
model = AutoTS(
    forecast_length=forecast_length,
    frequency='infer',  # infer the series frequency from the datetime index
    prediction_interval=0.9,  # 90% prediction interval
    ensemble="simple,horizontal-max",
    constraint=None,
    max_generations=generations,
    num_validations=2,
    validation_method='backwards',
    model_list=model_list,
    transformer_list=transformer_list,
    transformer_max_depth=transformer_max_depth,
    initial_template='General+Random',
    metric_weighting=metric_weighting,
    models_to_validate=0.35,  # validate the top 35% of models
    max_per_model_class=None,
    model_interrupt=True,  # presumably lets an interrupt skip the current model — TODO confirm against AutoTS docs
    n_jobs=n_jobs,
    drop_most_recent=1,  # drop the most recent (possibly incomplete) period
    subset=None,
    verbose=verbose,
)

future_regressor_train, future_regressor_forecast = fake_regressor(
    df,
    dimensions=1,
    forecast_length=forecast_length,
Example #3
0
}

# Configure the AutoTS model search.
# NOTE(review): forecast_length, frequency, prediction_interval, ensemble,
# generations, num_validations, validation_method, model_list,
# transformer_list, transformer_max_depth, metric_weighting, n_jobs,
# drop_most_recent, preclean, verbose, models_mode and random_seed are
# defined earlier in the file (outside this fragment).
model = AutoTS(
    forecast_length=forecast_length,
    frequency=frequency,
    prediction_interval=prediction_interval,
    ensemble=ensemble,
    constraint=None,
    max_generations=generations,
    num_validations=num_validations,
    validation_method=validation_method,
    model_list=model_list,
    transformer_list=transformer_list,
    transformer_max_depth=transformer_max_depth,
    initial_template="Random",
    metric_weighting=metric_weighting,
    models_to_validate=0.35,  # validate the top 35% of models
    max_per_model_class=None,
    model_interrupt="end_generation",  # presumably finishes the generation on interrupt — TODO confirm against AutoTS docs
    n_jobs=n_jobs,
    drop_most_recent=drop_most_recent,
    introduce_na=True,
    preclean=preclean,
    # prefill_na=0,
    # subset=5,
    verbose=verbose,
    models_mode=models_mode,
    random_seed=random_seed,
)

Example #4
0
    def exec(self):
        """Run the end-to-end apartment price analysis pipeline.

        Steps:
          1. Load building-permit (인허가), jeonse/lease (전월세) and actual
             sale-price (실거래가) CSVs and join them by permit address.
          2. Load/train deep-learning (h2o) and machine-learning models for
             sale and jeonse prices via makeDlModel/makeMlModel and predict
             on the merged yearly data.
          3. Per apartment/capacity, build a yearly gap (sale - jeonse)
             series, forecast it `sysOpt['tsModel']['forYear']` years ahead
             with AutoTS, and plot jeonse/sale/gap time series.
          4. Save the combined profit-rate table to an Excel file.

        Raises:
            Exception: re-raised after logging when any step fails.
        """

        try:
            log.info('[START] {}'.format('exec'))

            # ********************************************
            # Option settings
            # ********************************************
            sysOpt = {
                # deep learning
                'dlModel': {
                    # initialization flag
                    'isInit': False
                    # whether to retrain (overwrite) the saved model
                    # , 'isOverWrite': True
                    , 'isOverWrite': False
                }
                # machine learning
                , 'mlModel': {
                    # whether to retrain (overwrite) the saved model
                    # 'isOverWrite': True
                    'isOverWrite': False
                }
                # time series
                , 'tsModel': {
                    # number of years to forecast into the future
                    'forYear': 2
                    # apartment filter: [] means forecast for all apartments
                    , 'aptList': []
                    # , 'aptList': ['미아동부센트레빌']
                    # , 'aptList': ['미아동부센트레빌', '송천센트레빌', '에스케이북한산시티']
                }
            }

            # *****************************************************
            # Building-permit (인허가) data
            # *****************************************************
            lcnsInpFile = '{}/{}/{}'.format(globalVar['inpPath'], serviceName,
                                            '서울특별시 강북구 인허가.csv')
            lcnsFileList = glob.glob(lcnsInpFile)
            if lcnsFileList is None or len(lcnsFileList) < 1:
                log.error('[ERROR] inpFile : {} / {}'.format(
                    lcnsFileList, '입력 자료를 확인해주세요.'))
                raise Exception('[ERROR] inpFile : {} / {}'.format(
                    lcnsFileList, '입력 자료를 확인해주세요.'))

            lcnsData = pd.read_csv(lcnsFileList[0])
            lcnsData.drop(['Unnamed: 0'], axis=1, inplace=True)
            # permit count per address ('주소')
            lcnsDataL1 = lcnsData.groupby(
                ['주소'], as_index=False)['archGbCdNm'].count()

            # *****************************************************
            # Jeonse/monthly-rent (전월세) data
            # *****************************************************
            prvsMntsrInpFile = '{}/{}/{}'.format(
                globalVar['inpPath'], serviceName,
                '서울특별시 강북구 아파트 전월세가_인허가_20111101_20201101.csv')

            prvsMntsrFileList = glob.glob(prvsMntsrInpFile)
            if prvsMntsrFileList is None or len(prvsMntsrFileList) < 1:
                log.error('[ERROR] inpFile : {} / {}'.format(
                    prvsMntsrFileList, '입력 자료를 확인해주세요.'))
                raise Exception('[ERROR] inpFile : {} / {}'.format(
                    prvsMntsrFileList, '입력 자료를 확인해주세요.'))

            prvsMntsrData = pd.read_csv(prvsMntsrFileList[0])
            prvsMntsrData.drop(['Unnamed: 0.1'], axis=1, inplace=True)

            # unique apartment key: complex name + road address
            prvsMntsrData['name'] = prvsMntsrData['단지명'] + '(' + prvsMntsrData['도로명'] + ')'
            # keep jeonse ('전세') contracts only and drop 1st-floor units
            prvsMntsrDataL2 = prvsMntsrData.loc[
                (prvsMntsrData['전월세구분'] == '전세')
                & (prvsMntsrData['층'] != 1)].reset_index(drop=True)

            prvsMntsrDataL2['date'] = pd.to_datetime(prvsMntsrDataL2['계약년월'], format='%Y%m')
            prvsMntsrDataL2['year'] = prvsMntsrDataL2['date'].dt.strftime('%Y').astype('int')
            prvsMntsrDataL2['month'] = prvsMntsrDataL2['date'].dt.strftime('%m').astype('int')
            # deposit arrives as comma-grouped strings ('12,000'); strip commas
            prvsMntsrDataL2['보증금(만원)'] = prvsMntsrDataL2['보증금(만원)'].astype(
                str).str.replace(',', '').astype('float')

            # attach permit counts by permit address
            prvsMntsrDataL3 = pd.merge(
                left=prvsMntsrDataL2,
                right=lcnsDataL1[['archGbCdNm', '주소']],
                left_on=['인허가addr'],
                right_on=['주소'],
                how='left').rename(columns={'archGbCdNm': 'inhuga'})

            # BUGFIX(naming): this was 'prvsMntsrdataL2', differing from
            # 'prvsMntsrDataL2' above only by one letter's case.
            prvsMntsrDataL4 = prvsMntsrDataL3.rename(
                columns={
                    '전용면적(㎡)': 'capacity',
                    '건축년도': 'conYear',
                    '보증금(만원)': 'realBjprice',
                    'archGbCdNm': 'inhuga',
                    '거래금액(만원)': 'realPrice'
                }).sort_values(by=['name', 'capacity', 'year']).reset_index(drop=True)

            # *****************************************************
            # Actual sale-price (실거래가) data
            # *****************************************************
            realPriceInpFile = '{}/{}/{}'.format(
                globalVar['inpPath'], serviceName,
                '서울특별시 강북구 아파트 실거래가_인허가_20111101_20201101.csv')

            realPriceFileList = glob.glob(realPriceInpFile)
            if realPriceFileList is None or len(realPriceFileList) < 1:
                log.error('[ERROR] inpFile : {} / {}'.format(
                    realPriceFileList, '입력 자료를 확인해주세요.'))
                raise Exception('[ERROR] inpFile : {} / {}'.format(
                    realPriceFileList, '입력 자료를 확인해주세요.'))

            realPriceData = pd.read_csv(realPriceFileList[0])
            realPriceData.drop(['Unnamed: 0.1'], axis=1, inplace=True)

            realPriceData['name'] = realPriceData['단지명'] + '(' + realPriceData['도로명'] + ')'
            # drop 1st-floor units
            realPriceDataL2 = realPriceData.loc[(realPriceData['층'] != 1)].reset_index(drop=True)

            realPriceDataL2['date'] = pd.to_datetime(realPriceDataL2['계약년월'], format='%Y%m')
            realPriceDataL2['year'] = realPriceDataL2['date'].dt.strftime('%Y').astype('int')
            realPriceDataL2['month'] = realPriceDataL2['date'].dt.strftime('%m').astype('int')
            realPriceDataL2['거래금액(만원)'] = realPriceDataL2['거래금액(만원)'].astype(
                str).str.replace(',', '').astype('float')

            realPriceDataL3 = pd.merge(
                left=realPriceDataL2,
                right=lcnsDataL1[['archGbCdNm', '주소']],
                left_on=['인허가addr'],
                right_on=['주소'],
                how='left').rename(columns={'archGbCdNm': 'inhuga'})

            # BUGFIX(naming): was 'realPricedataL2' (case-confusable with
            # 'realPriceDataL2' above).
            realPriceDataL4 = realPriceDataL3.rename(
                columns={
                    '전용면적(㎡)': 'capacity',
                    '건축년도': 'conYear',
                    '보증금(만원)': 'realBjprice',
                    'archGbCdNm': 'inhuga',
                    '거래금액(만원)': 'realPrice'
                }).sort_values(by=['name', 'capacity', 'year']).reset_index(drop=True)

            # *****************************************************
            # Merge yearly mean jeonse and sale prices
            # *****************************************************
            grpCol = ['name', 'conYear', 'capacity', 'lat', 'lon', 'year', 'inhuga']
            prvsMntsrDataL5 = prvsMntsrDataL4.groupby(grpCol, as_index=False)['realBjprice'].mean()
            realPriceDataL5 = realPriceDataL4.groupby(grpCol, as_index=False)['realPrice'].mean()

            data = pd.merge(left=prvsMntsrDataL5,
                            right=realPriceDataL5,
                            left_on=grpCol,
                            right_on=grpCol,
                            how='outer')

            # **********************************************************************************************************
            # Deep learning: sale price
            # **********************************************************************************************************
            inpData = realPriceDataL4
            xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
            yCol = 'realPrice'
            modelKey = 'realPrice'

            # load (or train) the DL sale-price model
            result = makeDlModel(sysOpt['dlModel'], xCol, yCol, inpData, modelKey)
            log.info('[CHECK] result : {}'.format(result))

            # predict sale prices with the DL model
            realPriceDlModel = result['dlModel']
            data['realPriceDL'] = realPriceDlModel.predict(
                h2o.H2OFrame(data)).as_data_frame()

            mainTitle = '강북구 아파트 매매가 예측 결과 (딥러닝)'
            saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
            makeUserScatterPlot(data['realPriceDL'], data['realPrice'], '예측',
                                '실측', mainTitle, saveImg, 0, 140000, 2000,
                                10000, True)

            # **********************************************************************************************************
            # Deep learning: jeonse price
            # **********************************************************************************************************
            inpData = prvsMntsrDataL4
            xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
            yCol = 'realBjprice'
            modelKey = 'realBjPrice'

            # load (or train) the DL jeonse-price model
            result = makeDlModel(sysOpt['dlModel'], xCol, yCol, inpData, modelKey)
            log.info('[CHECK] result : {}'.format(result))

            # predict jeonse prices with the DL model
            realBjPriceDlModel = result['dlModel']
            data['realBjPriceDL'] = realBjPriceDlModel.predict(
                h2o.H2OFrame(data)).as_data_frame()

            mainTitle = '강북구 아파트 전세가 예측 결과 (딥러닝)'
            saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
            makeUserScatterPlot(data['realBjPriceDL'], data['realBjprice'],
                                '예측', '실측', mainTitle, saveImg, 0, 140000,
                                2000, 10000, True)

            # **********************************************************************************************************
            # Machine learning: sale price
            # **********************************************************************************************************
            inpData = realPriceDataL4
            xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
            yCol = 'realPrice'
            modelKey = 'realPrice'

            # load (or train) the ML sale-price model
            result = makeMlModel(sysOpt['mlModel'], xCol, yCol, inpData, modelKey)
            log.info('[CHECK] result : {}'.format(result))

            # predict sale prices with the ML model
            realPriceMlModel = result['mlModel']
            data['realPriceML'] = predict_model(realPriceMlModel, data=data)['Label']

            mainTitle = '강북구 아파트 매매가 예측 결과 (머신러닝)'
            saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
            makeUserScatterPlot(data['realPriceML'], data['realPrice'], '예측',
                                '실측', mainTitle, saveImg, 0, 140000, 2000,
                                10000, True)

            # **********************************************************************************************************
            # Machine learning: jeonse price
            # **********************************************************************************************************
            inpData = prvsMntsrDataL4
            xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
            yCol = 'realBjprice'
            modelKey = 'realBjPrice'

            # load (or train) the ML jeonse-price model
            result = makeMlModel(sysOpt['mlModel'], xCol, yCol, inpData, modelKey)
            log.info('[CHECK] result : {}'.format(result))

            # predict jeonse prices with the ML model
            realBjPriceMlModel = result['mlModel']
            data['realBjPriceML'] = predict_model(realBjPriceMlModel, data=data)['Label']

            mainTitle = '강북구 아파트 전세가 예측 결과 (머신러닝)'
            saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
            makeUserScatterPlot(data['realBjPriceML'], data['realBjprice'],
                                '예측', '실측', mainTitle, saveImg, 0, 140000,
                                2000, 10000, True)

            # **********************************************************************************************************
            # Time series: jeonse-gap investment
            # **********************************************************************************************************
            nameList = data['name'].unique()
            searchAptList = sysOpt['tsModel']['aptList']

            fnlData = pd.DataFrame()
            for i, nameInfo in enumerate(nameList):

                # Apartment search filter:
                #   aptList == []          -> forecast returns for all apartments
                #   aptList == ['미아동부센트레빌'] -> only matching apartments
                isSearch = True if (len(searchAptList) < 1) else False
                for ii, aptInfo in enumerate(searchAptList):
                    if (aptInfo in nameInfo):
                        isSearch = True
                        break

                if (isSearch == False): continue

                log.info('[CHECK] isSearch : {} / nameInfo : {}'.format(isSearch, nameInfo))

                selData = data.loc[(data['name'] == nameInfo)].reset_index(drop=True)
                if (len(selData) < 1): continue

                capList = selData['capacity'].unique()
                for j, capInfo in enumerate(capList):
                    selDataL1 = selData.loc[(selData['capacity'] == capInfo)].reset_index(drop=True)
                    if (len(selDataL1) < 1): continue
                    selInfoFirst = selDataL1.loc[0]

                    log.info('[CHECK] nameInfo : {} / capInfo : {} / cnt : {}'.
                             format(nameInfo, capInfo, len(selDataL1)))

                    # yearly date axis spanning the observed years
                    srtDate = pd.to_datetime(selDataL1['year'].min(), format='%Y')
                    endDate = pd.to_datetime(selDataL1['year'].max(), format='%Y')
                    dtDateList = pd.date_range(start=srtDate, end=endDate,
                                               freq=pd.DateOffset(years=1))

                    dataL2 = pd.DataFrame()
                    for k, dtDateInfo in enumerate(dtDateList):
                        iYear = int(dtDateInfo.strftime('%Y'))

                        selInfo = selDataL1.loc[(selDataL1['year'] == iYear)].reset_index(drop=True)

                        dictInfo = {
                            'name': [nameInfo],
                            'capacity': [capInfo],
                            'date': [dtDateInfo],
                            'year': [iYear],
                            'lat': [selInfoFirst['lat']],
                            'lon': [selInfoFirst['lon']],
                            'inhuga': [selInfoFirst['inhuga']],
                            'conYear': [selInfoFirst['conYear']]
                        }

                        # for years without observations, fall back to model predictions
                        dictDtl = {
                            'realPrice': [
                                np.nan if (len(selInfo) < 1) else selInfo['realPrice'][0]
                            ],
                            'realBjprice': [
                                np.nan if (len(selInfo) < 1) else selInfo['realBjprice'][0]
                            ],
                            'realPriceDL': [
                                realPriceDlModel.predict(
                                    h2o.H2OFrame(pd.DataFrame.from_dict(dictInfo))
                                ).as_data_frame()['predict'][0]
                                if (len(selInfo) < 1) else selInfo['realPriceDL'][0]
                            ],
                            'realBjPriceDL': [
                                realBjPriceDlModel.predict(
                                    h2o.H2OFrame(pd.DataFrame.from_dict(dictInfo))
                                ).as_data_frame()['predict'][0]
                                if (len(selInfo) < 1) else selInfo['realBjPriceDL'][0]
                            ],
                            'realPriceML': [
                                predict_model(realPriceMlModel,
                                              data=pd.DataFrame.from_dict(dictInfo))['Label'][0]
                                if (len(selInfo) < 1) else selInfo['realPriceML'][0]
                            ],
                            'realBjPriceML': [
                                predict_model(realBjPriceMlModel,
                                              data=pd.DataFrame.from_dict(dictInfo))['Label'][0]
                                if (len(selInfo) < 1) else selInfo['realBjPriceML'][0]
                            ]
                        }

                        # BUGFIX: was named 'dict', shadowing the builtin
                        rowDict = {**dictInfo, **dictDtl}
                        dataL2 = pd.concat(
                            [dataL2, pd.DataFrame.from_dict(rowDict)],
                            ignore_index=True)

                    if (len(dataL2.dropna()) < 1): continue

                    # gap = sale price - jeonse price (actual / ML / DL)
                    dataL2['gapReal'] = dataL2['realPrice'] - dataL2['realBjprice']
                    dataL2['gapML'] = dataL2['realPriceML'] - dataL2['realBjPriceML']
                    dataL2['gapDL'] = dataL2['realPriceDL'] - dataL2['realBjPriceDL']

                    # jeonse-price time series plot
                    mainTitle = '[{}, {}] 아파트 전세가 시계열'.format(nameInfo, capInfo)
                    saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
                    makeUserTimeSeriesPlot(
                        dataL2['date'], dataL2['realBjPriceML'],
                        dataL2['realBjPriceDL'], dataL2['realBjprice'],
                        '예측 (머신러닝)', '예측 (딥러닝)', '실측', '날짜 [연도]', '전세가 [만원]',
                        mainTitle, saveImg, False)

                    # sale-price time series plot
                    mainTitle = '[{}, {}] 아파트 매매가 시계열'.format(nameInfo, capInfo)
                    saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
                    makeUserTimeSeriesPlot(
                        dataL2['date'], dataL2['realPriceML'],
                        dataL2['realPriceDL'], dataL2['realPrice'],
                        '예측 (머신러닝)', '예측 (딥러닝)', '실측', '날짜 [연도]', '매매가 [만원]',
                        mainTitle, saveImg, False)

                    # forecast the gap series `forYear` years ahead
                    # BUGFIX: tsForData was previously undefined (NameError) or
                    # stale from an earlier iteration whenever AutoTS failed;
                    # initialize per iteration and fall back to dataL2 below.
                    tsForData = None
                    try:
                        tsModel = AutoTS(
                            forecast_length=sysOpt['tsModel']['forYear'],
                            frequency='infer',
                            ensemble='all',
                            model_list='superfast',
                            transformer_list='superfast')
                        tsDlModel = tsModel.fit(dataL2,
                                                date_col='date',
                                                value_col='gapDL',
                                                id_col=None)
                        tsDlFor = tsDlModel.predict().forecast
                        tsDlFor['date'] = tsDlFor.index

                        tsMlModel = tsModel.fit(dataL2,
                                                date_col='date',
                                                value_col='gapML',
                                                id_col=None)
                        tsMlFor = tsMlModel.predict().forecast
                        tsMlFor['date'] = tsMlFor.index

                        tsForData = tsDlFor.merge(tsMlFor,
                                                  left_on=['date'],
                                                  right_on=['date'],
                                                  how='inner')
                        tsForData['name'] = nameInfo
                        tsForData['capacity'] = capInfo
                        # NOTE(review): iYear is the last year of the loop
                        # above, not the forecast year — confirm intent
                        tsForData['year'] = iYear
                        tsForData['lat'] = selInfoFirst['lat']
                        tsForData['lon'] = selInfoFirst['lon']
                        tsForData['inhuga'] = selInfoFirst['inhuga']
                        tsForData['conYear'] = selInfoFirst['conYear']
                    except Exception as e:
                        log.error('Exception : {}'.format(e))

                    if tsForData is not None:
                        # append the forecast rows to the observed series
                        mrgCol = ['name', 'capacity', 'year', 'lat', 'lon',
                                  'inhuga', 'conYear', 'date', 'gapDL', 'gapML']
                        dataL3 = dataL2.merge(tsForData,
                                              left_on=mrgCol,
                                              right_on=mrgCol,
                                              how='outer')
                    else:
                        # forecast failed: continue with observed data only
                        dataL3 = dataL2

                    # gap-investment time series plot
                    mainTitle = '[{}, {}] 아파트 갭투자 시계열'.format(nameInfo, capInfo)
                    saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, mainTitle)
                    makeUserTimeSeriesPlot(dataL3['date'], dataL3['gapML'],
                                           dataL3['gapDL'], dataL3['gapReal'],
                                           '예측 (머신러닝)', '예측 (딥러닝)', '실측',
                                           '날짜 [연도]', '갭 투자 [만원]', mainTitle,
                                           saveImg, False)

                    # +++++++++++++++++++++++++++++++++++++++++++++++++++++
                    # Profit-rate table
                    # +++++++++++++++++++++++++++++++++++++++++++++++++++++
                    resData = dataL3[['gapReal', 'gapML', 'gapDL']]

                    # year-over-year absolute profit
                    resDiffData = resData.diff(periods=1).rename(
                        columns={
                            'gapReal': 'gapDiffReal',
                            'gapML': 'gapDiffML',
                            'gapDL': 'gapDiffDL'
                        },
                        inplace=False)

                    # year-over-year profit rate (fraction; scaled to % below)
                    resPctData = resData.pct_change(periods=1).rename(
                        columns={
                            'gapReal': 'gapPctReal',
                            'gapML': 'gapPctML',
                            'gapDL': 'gapPctDL'
                        },
                        inplace=False)

                    resDataL2 = pd.concat([dataL3, resDiffData, resPctData * 100], axis=1)
                    # BUGFIX: the keys 'realBjpriceDL'/'realBjpriceML' did not
                    # match the actual column case ('realBjPriceDL'/
                    # 'realBjPriceML'), so those renames silently did nothing.
                    resDataL3 = resDataL2.sort_values(
                        by=['date'], ascending=False).rename(
                            columns={
                                'name': '아파트(도로명)',
                                'capacity': '면적',
                                'construction_year': '건축연도',
                                'year': '연도',
                                'date': '날짜',
                                'lat': '위도',
                                'lon': '경도',
                                'inhuga': '인허가',
                                'conYear': '건축년도',
                                'realPrice': '매매가',
                                'realBjprice': '전세가',
                                'realPriceDL': '예측 딥러닝 매매가',
                                'realBjPriceDL': '예측 딥러닝 전세가',
                                'realPriceML': '예측 머신러닝 매매가',
                                'realBjPriceML': '예측 머신러닝 전세가',
                                'gapReal': '실측 갭투자',
                                'gapML': '예측 머신러닝 갭투자',
                                'gapDL': '예측 딥러닝 갭투자',
                                'gapDiffReal': '실측 수익금',
                                'gapDiffDL': '예측 딥러닝 수익금',
                                'gapDiffML': '예측 머신러닝 수익금',
                                'gapPctReal': '실측 수익률',
                                'gapPctDL': '예측 딥러닝 수익률',
                                'gapPctML': '예측 머신러닝 수익률'
                            })

                    fnlData = pd.concat([fnlData, resDataL3], ignore_index=True)

            saveFile = '{}/{}_{}_{}.xlsx'.format(
                globalVar['outPath'], serviceName, '수익률 테이블',
                datetime.now().strftime('%Y%m%d'))
            os.makedirs(os.path.dirname(saveFile), exist_ok=True)
            fnlData.to_excel(saveFile, index=False)
            log.info('[CHECK] saveFile : {}'.format(saveFile))

        except Exception as e:
            log.error('Exception : {}'.format(e))
            raise e
        finally:
            log.info('[END] {}'.format('exec'))
Example #5
0
File: test.py  Project: nsankar/AutoTS
    'runtime_weighting': 0,
    'spl_weighting': 1,
    'contour_weighting': 0,
}


# Configure the AutoTS model search.
# NOTE(review): forecast_length, generations, model_list, metric_weighting
# and n_jobs are defined earlier in the file (outside this fragment).
model = AutoTS(
    forecast_length=forecast_length,
    frequency='infer',  # infer the series frequency from the datetime index
    prediction_interval=0.9,  # 90% prediction interval
    ensemble=None,  # no ensembling
    constraint=2,  # presumably bounds forecasts relative to training data — TODO confirm against AutoTS docs
    max_generations=generations,
    num_validations=2,
    validation_method='backwards',
    model_list=model_list,
    initial_template='General+Random',
    metric_weighting=metric_weighting,
    models_to_validate=0.1,  # validate the top 10% of models
    max_per_model_class=None,
    model_interrupt=True,
    n_jobs=n_jobs,
    drop_most_recent=0,  # keep all periods, including the most recent
    verbose=1,
)


future_regressor_train, future_regressor_forecast = fake_regressor(
    df_long,
    dimensions=1,
    forecast_length=forecast_length,
Example #6
0
    def test_all_default_models(self):
        """Run one fast generation over the full default model list.

        Verifies that every model in ``default_model_list`` appears in the
        initial template and produced at least one successful run, that the
        forecast frame has the expected shape and date index (accounting for
        ``drop_most_recent=1``), that 'backwards' validation splits train and
        test cleanly, that a horizontal ensemble was assembled, and that all
        recorded metrics fall within their valid ranges.
        """
        forecast_length = 8
        long = False
        df = load_daily(long=long).drop(columns=['US.Total.Covid.Tests'],
                                        errors='ignore')
        # to make it faster, keep only the first two series
        df = df[df.columns[0:2]]
        n_jobs = 'auto'
        verbose = -1
        validation_method = "backwards"
        generations = 1
        num_validations = 1
        models_to_validate = 0.10  # must be a decimal percent for this test

        model_list = "default"

        transformer_list = "fast"  # ["SinTrend", "MinMaxScaler"]
        transformer_max_depth = 1

        model = AutoTS(
            forecast_length=forecast_length,
            frequency='infer',
            prediction_interval=0.9,
            ensemble=["horizontal-max"],
            constraint=None,
            max_generations=generations,
            num_validations=num_validations,
            validation_method=validation_method,
            model_list=model_list,
            transformer_list=transformer_list,
            transformer_max_depth=transformer_max_depth,
            initial_template='Random',
            models_to_validate=models_to_validate,
            max_per_model_class=None,
            n_jobs=n_jobs,
            model_interrupt=True,
            drop_most_recent=1,
            verbose=verbose,
        )
        model = model.fit(
            df,
            date_col='datetime' if long else None,
            value_col='value' if long else None,
            id_col='series_id' if long else None,
        )
        prediction = model.predict(verbose=0)
        forecasts_df = prediction.forecast
        initial_results = model.results()
        validation_results = model.results("validation")

        # validated_count = (validation_results['Runs'] == (num_validations + 1)).sum()
        validated_count = (validation_results['Runs'] > 1).sum()

        # so these account for DROP MOST RECENT = 1
        expected_idx = pd.date_range(start=df.index[-2],
                                     periods=forecast_length + 1,
                                     freq='D')[1:]
        expected_val1 = pd.date_range(end=df.index[-(forecast_length + 2)],
                                      periods=forecast_length,
                                      freq='D')

        template_dict = json.loads(model.best_model['ModelParameters'].iloc[0])
        best_model_result = validation_results[validation_results['ID'] ==
                                               model.best_model['ID'].iloc[0]]

        # a model "succeeded" if it logged at least one non-null mae
        check_fails = initial_results.groupby("Model")["mae"].count() > 0

        # check that all models had at least 1 success
        self.assertEqual(
            set(initial_results['Model'].unique().tolist()) - {'Ensemble'},
            set(default_model_list),
            msg="Not all models used in initial template.")
        self.assertTrue(
            check_fails.all(),
            msg=
            f"These models failed: {check_fails[~check_fails].index.tolist()}. It is more likely a package install problem than a code problem"
        )
        # check general model setup
        self.assertGreaterEqual(validated_count, model.models_to_validate)
        self.assertGreater(model.models_to_validate,
                           (initial_results['ValidationRound'] == 0).sum() *
                           models_to_validate - 2)
        self.assertFalse(model.best_model.empty)
        # check the generated forecasts look right
        self.assertEqual(forecasts_df.shape[0], forecast_length)
        self.assertEqual(forecasts_df.shape[1], df.shape[1])
        self.assertFalse(forecasts_df.isna().any().any())
        self.assertEqual(forecast_length, len(forecasts_df.index))
        self.assertTrue(
            (expected_idx == pd.DatetimeIndex(forecasts_df.index)).all())
        # these next two could potentially fail if any inputs have a strong trend
        self.assertTrue((forecasts_df.mean() <= (df.max()) + df.std()).all())
        self.assertTrue((forecasts_df.mean() >= (df.min()) - df.std()).all())
        # check all the checks work
        self.assertEqual(model.ensemble_check, 1)
        self.assertFalse(model.weighted)
        self.assertFalse(model.subset_flag)
        # assess 'backwards' validation
        val_1 = model.validation_test_indexes[1]
        self.assertEqual(len(model.validation_test_indexes),
                         num_validations + 1)
        self.assertTrue(
            val_1.intersection(model.validation_train_indexes[1]).empty)
        self.assertEqual(model.validation_train_indexes[1].shape[0],
                         df.shape[0] -
                         (forecast_length * 2 + 1))  # +1 via drop most recent
        self.assertTrue((val_1 == expected_val1).all())
        # assess Horizontal Ensembling
        self.assertTrue('horizontal' in template_dict['model_name'].lower())
        self.assertEqual(len(template_dict['series'].keys()), df.shape[1])
        self.assertEqual(len(set(template_dict['series'].values())),
                         template_dict['model_count'])
        self.assertEqual(len(template_dict['models'].keys()),
                         template_dict['model_count'])
        # test that actually the best model (or nearly) was chosen
        self.assertGreater(validation_results['Score'].quantile(0.05),
                           best_model_result['Score'].iloc[0])
        # test metrics: Score strictly positive, error metrics non-negative,
        # and bounded metrics (contour, containment) within [0, 1]
        self.assertGreater(initial_results['Score'].min(), 0)
        self.assertGreaterEqual(initial_results['mae'].min(), 0)
        self.assertGreaterEqual(initial_results['smape'].min(), 0)
        self.assertGreaterEqual(initial_results['rmse'].min(), 0)
        self.assertGreaterEqual(initial_results['contour'].min(), 0)
        self.assertGreaterEqual(initial_results['containment'].min(), 0)
        self.assertGreaterEqual(initial_results['TotalRuntimeSeconds'].min(), 0)
        self.assertGreaterEqual(initial_results['spl'].min(), 0)
        self.assertLessEqual(initial_results['contour'].min(), 1)
        self.assertLessEqual(initial_results['containment'].min(), 1)
示例#7
0
    def test_autots(self):
        """End-to-end AutoTS run with naive models and a 4D future regressor.

        Exercises fit/predict with horizontal ensembling, 'backwards'
        validation over two folds, drop_most_recent=1, no_negatives=True,
        and back_forecast(), then asserts on forecast shape/index,
        validation splits, ensemble template structure, and model selection.
        """
        forecast_length = 8
        long = False
        df = load_daily(long=long).drop(columns=['US.Total.Covid.Tests'],
                                        errors='ignore')
        n_jobs = 'auto'
        verbose = 0
        validation_method = "backwards"
        generations = 1
        num_validations = 2
        models_to_validate = 0.35  # must be a decimal percent for this test

        # only cheap naive baselines, to keep the test fast
        model_list = [
            'ZeroesNaive',
            'LastValueNaive',
            'AverageValueNaive',
            'SeasonalNaive',
        ]

        transformer_list = "fast"  # ["SinTrend", "MinMaxScaler"]
        transformer_max_depth = 3

        # relative weights folding individual metrics into the ranking Score
        metric_weighting = {
            'smape_weighting': 3,
            'mae_weighting': 1,
            'rmse_weighting': 1,
            'containment_weighting': 0,
            'runtime_weighting': 0,
            'spl_weighting': 1,
            'contour_weighting': 1,
        }

        model = AutoTS(
            forecast_length=forecast_length,
            frequency='infer',
            prediction_interval=0.9,
            ensemble=["horizontal-max", "horizontal-min"],
            constraint=None,
            max_generations=generations,
            num_validations=num_validations,
            validation_method=validation_method,
            model_list=model_list,
            transformer_list=transformer_list,
            transformer_max_depth=transformer_max_depth,
            initial_template='General+Random',
            metric_weighting=metric_weighting,
            models_to_validate=models_to_validate,
            max_per_model_class=None,
            model_interrupt=False,
            no_negatives=True,
            subset=100,
            n_jobs=n_jobs,
            drop_most_recent=1,
            verbose=verbose,
        )
        # synthetic 4-dimensional covariate: (train values, future values)
        future_regressor_train2d, future_regressor_forecast2d = fake_regressor(
            df,
            dimensions=4,
            forecast_length=forecast_length,
            date_col='datetime' if long else None,
            value_col='value' if long else None,
            id_col='series_id' if long else None,
            drop_most_recent=model.drop_most_recent,
            aggfunc=model.aggfunc,
            verbose=model.verbose,
        )
        model = model.fit(
            df,
            future_regressor=future_regressor_train2d,
            date_col='datetime' if long else None,
            value_col='value' if long else None,
            id_col='series_id' if long else None,
        )
        prediction = model.predict(
            future_regressor=future_regressor_forecast2d, verbose=0)
        forecasts_df = prediction.forecast
        initial_results = model.results()
        validation_results = model.results("validation")
        back_forecast = model.back_forecast(n_splits=2, verbose=0).forecast
        # validated_count = (validation_results['Runs'] == (num_validations + 1)).sum()

        # so these account for DROP MOST RECENT = 1
        expected_idx = pd.date_range(start=df.index[-2],
                                     periods=forecast_length + 1,
                                     freq='D')[1:]
        expected_val1 = pd.date_range(end=df.index[-(forecast_length + 2)],
                                      periods=forecast_length,
                                      freq='D')
        expected_val2 = pd.date_range(end=df.index[-(forecast_length * 2 + 2)],
                                      periods=forecast_length,
                                      freq='D')

        template_dict = json.loads(model.best_model['ModelParameters'].iloc[0])
        best_model_result = validation_results[validation_results['ID'] ==
                                               model.best_model['ID'].iloc[0]]

        # check there were few failed models in this simple setup (fancier models are expected to fail sometimes!)
        # NOTE(review): message says 'superfast' but model_list here is the
        # naive list above — consider updating the message text.
        self.assertGreater(
            initial_results['Exceptions'].isnull().mean(), 0.95,
            "Too many 'superfast' models failed. This can occur by random chance, try running again."
        )
        # check general model setup
        # self.assertEqual(validated_count, model.models_to_validate)
        self.assertGreater(model.models_to_validate,
                           (initial_results['ValidationRound'] == 0).sum() *
                           models_to_validate - 2)
        self.assertEqual(
            set(initial_results['Model'].unique().tolist()) - {'Ensemble'},
            set(model.model_list))
        self.assertFalse(model.best_model.empty)
        # check the generated forecasts look right
        self.assertEqual(forecasts_df.shape[0], forecast_length)
        self.assertEqual(forecasts_df.shape[1], df.shape[1])
        self.assertFalse(forecasts_df.isna().any().any())
        # no_negatives=True must hold for every forecast value
        self.assertTrue((forecasts_df >= 0).all().all())
        self.assertEqual(forecast_length, len(forecasts_df.index))
        self.assertTrue(
            (expected_idx == pd.DatetimeIndex(forecasts_df.index)).all())
        # these next two could potentially fail if any inputs have a strong trend
        self.assertTrue((forecasts_df.mean() <= (df.max()) + df.std()).all())
        self.assertTrue((forecasts_df.mean() >= (df.min()) - df.std()).all())
        # check all the checks work
        self.assertEqual(model.ensemble_check, 1)
        self.assertFalse(model.weighted)
        self.assertFalse(model.used_regressor_check)
        self.assertFalse(model.subset_flag)
        # assess 'backwards' validation: folds must not overlap their train sets
        self.assertEqual(len(model.validation_test_indexes),
                         num_validations + 1)
        self.assertTrue(model.validation_test_indexes[1].intersection(
            model.validation_train_indexes[1]).empty)
        self.assertTrue(model.validation_test_indexes[2].intersection(
            model.validation_train_indexes[2]).empty)
        self.assertEqual(model.validation_train_indexes[1].shape[0],
                         df.shape[0] -
                         (forecast_length * 2 + 1))  # +1 via drop most recent
        self.assertTrue(
            (model.validation_test_indexes[1] == expected_val1).all())
        self.assertTrue(
            (model.validation_test_indexes[2] == expected_val2).all())
        # assess Horizontal Ensembling
        self.assertTrue('horizontal' in template_dict['model_name'].lower())
        self.assertEqual(len(template_dict['series'].keys()), df.shape[1])
        self.assertEqual(len(set(template_dict['series'].values())),
                         template_dict['model_count'])
        self.assertEqual(len(template_dict['models'].keys()),
                         template_dict['model_count'])
        # test that actually the best model (or nearly) was chosen
        self.assertGreater(validation_results['Score'].quantile(0.05),
                           best_model_result['Score'].iloc[0])
        # test back_forecast
        self.assertTrue(
            (back_forecast.index == model.df_wide_numeric.index).all(),
            msg="Back forecasting failed to have equivalent index to train.")
示例#8
0
# model_list = ['AverageValueNaive', 'LastValueNaive', 'ZeroesNaive']
# model_list = ['WindowRegression', 'SeasonalNaive']

# Relative weights used to fold the individual accuracy metrics into the
# single 'Score' ranking candidate models; a weight of 0 drops that metric
# from the ranking entirely.
metric_weighting = {'smape_weighting': 2, 'mae_weighting': 1,
                    'rmse_weighting': 2, 'containment_weighting': 0,
                    'runtime_weighting': 0, 'spl_weighting': 1,
                    'contour_weighting': 0
                    }


# Genetic search: one generation, 'backwards' validation over two folds,
# several ensemble flavors requested at once.
# NOTE(review): constraint=2 — presumably bounds forecasts relative to the
# spread of the training history; confirm against the AutoTS documentation.
model = AutoTS(forecast_length=forecast_length, frequency='infer',
               prediction_interval=0.9,
               ensemble='simple,distance,probabilistic-max,horizontal-max',
               constraint=2,
               max_generations=1, num_validations=2,
               validation_method='backwards',
               model_list=model_list, initial_template='General+Random',
               metric_weighting=metric_weighting, models_to_validate=0.1,
               max_per_model_class=None,
               model_interrupt=True,
               drop_most_recent=0, verbose=1)


# Synthetic covariates for exercising regressor support: a 1-dimensional and
# a 4-dimensional regressor, each returned as (training values, future values
# aligned to the forecast horizon).
future_regressor_train, future_regressor_forecast = fake_regressor(
    df_long, dimensions=1, forecast_length=forecast_length,
    date_col='datetime', value_col='value', id_col='series_id')
future_regressor_train2d, future_regressor_forecast2d = fake_regressor(
    df_long, dimensions=4, forecast_length=forecast_length,
    date_col='datetime', value_col='value', id_col='series_id')

# model = model.import_results('test.pickle')
示例#9
0
}

# Assemble the AutoTS searcher from the settings defined earlier in the
# script.  Commented-out keyword arguments document commonly-toggled options.
model = AutoTS(
    forecast_length=forecast_length,
    frequency=frequency,
    prediction_interval=prediction_interval,
    ensemble=ensemble,
    model_list=model_list,
    transformer_list=transformer_list,
    transformer_max_depth=transformer_max_depth,
    max_generations=gens,
    metric_weighting=metric_weighting,
    initial_template='random',
    # NOTE(review): aggfunc="sum" — presumably how duplicate timestamps are
    # aggregated when shaping the data; confirm against the AutoTS docs.
    aggfunc="sum",
    models_to_validate=models_to_validate,
    model_interrupt=True,
    num_validations=num_validations,
    validation_method=validation_method,
    constraint=None,
    drop_most_recent=
    drop_most_recent,  # if newest data is incomplete, also remember to increase forecast_length
    preclean=preclean,
    models_mode=models_mode,
    # no_negatives=True,
    # subset=100,
    # prefill_na=0,
    # remove_leading_zeroes=True,
    n_jobs=n_jobs,
    verbose=1,
)

if not initial_training: