def make_forecast(df, len_forecast: int):
    """
    Function for making time series forecasting with AutoTS library

    :param df: dataframe to process; must contain 'datetime' and 'value' columns
    :param len_forecast: forecast length (number of future steps to predict)
    :return predicted_values: forecast as a numpy array of point predictions
    :return model_name: name of the model (always 'AutoTS')
    """
    model = AutoTS(forecast_length=len_forecast,
                   frequency='infer',
                   prediction_interval=0.9,
                   ensemble='all',
                   model_list="superfast",
                   max_generations=15,
                   num_validations=2,
                   validation_method="backwards")
    model = model.fit(df, date_col='datetime', value_col='value')
    prediction = model.predict()

    # point forecasts dataframe
    forecasts_df = prediction.forecast
    predicted_values = np.array(forecasts_df['value'])

    # BUG FIX: was 'AutoTs', contradicting the documented contract above
    # (":return model_name: ... always 'AutoTS'")
    model_name = 'AutoTS'
    return predicted_values, model_name
'runtime_weighting': 0, 'spl_weighting': 1, 'contour_weighting': 0, } model = AutoTS( forecast_length=forecast_length, frequency='infer', prediction_interval=0.9, ensemble="simple,horizontal-max", constraint=None, max_generations=generations, num_validations=2, validation_method='backwards', model_list=model_list, transformer_list=transformer_list, transformer_max_depth=transformer_max_depth, initial_template='General+Random', metric_weighting=metric_weighting, models_to_validate=0.35, max_per_model_class=None, model_interrupt=True, n_jobs=n_jobs, drop_most_recent=1, subset=None, verbose=verbose, ) future_regressor_train, future_regressor_forecast = fake_regressor( df, dimensions=1, forecast_length=forecast_length,
} model = AutoTS( forecast_length=forecast_length, frequency=frequency, prediction_interval=prediction_interval, ensemble=ensemble, constraint=None, max_generations=generations, num_validations=num_validations, validation_method=validation_method, model_list=model_list, transformer_list=transformer_list, transformer_max_depth=transformer_max_depth, initial_template="Random", metric_weighting=metric_weighting, models_to_validate=0.35, max_per_model_class=None, model_interrupt="end_generation", n_jobs=n_jobs, drop_most_recent=drop_most_recent, introduce_na=True, preclean=preclean, # prefill_na=0, # subset=5, verbose=verbose, models_mode=models_mode, random_seed=random_seed, )
def exec(self):
    """
    Run the full apartment gap-investment analysis pipeline.

    Visible steps in this method:
      1. Load building-permit ("인허가") CSV and count permits per address.
      2. Load jeonse/monthly-rent ("전월세") records, keep jeonse contracts on
         floors != 1, join the permit counts, and normalize columns.
      3. Load actual sale-price ("실거래가") records with the same filtering/join.
      4. Merge yearly mean jeonse deposit and sale price per apartment/area.
      5. Predict sale and jeonse prices with a deep-learning model (h2o) and a
         machine-learning model (pycaret's predict_model); save scatter plots.
      6. Per apartment and floor area, build a yearly gap (sale - jeonse)
         series, forecast it with AutoTS, plot the series, and accumulate a
         profit-rate table.
      7. Save the accumulated table to an Excel file.

    Uses module-level helpers/globals visible here: log, globalVar, serviceName,
    makeDlModel, makeMlModel, makeUserScatterPlot, makeUserTimeSeriesPlot.
    Raises Exception when a required input CSV is missing; any exception is
    logged and re-raised.
    """
    try:
        log.info('[START] {}'.format('exec'))

        # breakpoint()

        # ********************************************
        # Option settings
        # ********************************************
        sysOpt = {
            # Deep learning
            'dlModel': {
                # initialization flag
                'isInit': False
                # whether to retrain/overwrite the saved model
                # , 'isOverWrite': True
                , 'isOverWrite': False
            }
            # Machine learning
            , 'mlModel': {
                # whether to retrain/overwrite the saved model
                # 'isOverWrite': True
                'isOverWrite': False
            }
            # Time series
            , 'tsModel': {
                # number of years to forecast into the future
                'forYear': 2
                # apartment filter; empty list means "search all apartments"
                , 'aptList': []
                # , 'aptList': ['미아동부센트레빌']
                # , 'aptList': ['미아동부센트레빌', '송천센트레빌', '에스케이북한산시티']
            }
        }

        # *****************************************************
        # Building-permit ("인허가") data
        # *****************************************************
        lcnsInpFile = '{}/{}/{}'.format(globalVar['inpPath'], serviceName, '서울특별시 강북구 인허가.csv')
        lcnsFileList = glob.glob(lcnsInpFile)
        if lcnsFileList is None or len(lcnsFileList) < 1:
            log.error('[ERROR] inpFile : {} / {}'.format(
                lcnsFileList, '입력 자료를 확인해주세요.'))
            raise Exception('[ERROR] inpFile : {} / {}'.format(
                lcnsFileList, '입력 자료를 확인해주세요.'))

        lcnsData = pd.read_csv(lcnsFileList[0])
        # drop the CSV's saved index column
        lcnsData.drop(['Unnamed: 0'], axis=1, inplace=True)
        # permit count per address ('주소')
        lcnsDataL1 = lcnsData.groupby(
            ['주소'], as_index=False)['archGbCdNm'].count()

        # *****************************************************
        # Jeonse / monthly-rent ("전월세") data
        # *****************************************************
        prvsMntsrInpFile = '{}/{}/{}'.format(
            globalVar['inpPath'], serviceName,
            '서울특별시 강북구 아파트 전월세가_인허가_20111101_20201101.csv')
        prvsMntsrFileList = glob.glob(prvsMntsrInpFile)
        if prvsMntsrFileList is None or len(prvsMntsrFileList) < 1:
            log.error('[ERROR] inpFile : {} / {}'.format(
                prvsMntsrFileList, '입력 자료를 확인해주세요.'))
            raise Exception('[ERROR] inpFile : {} / {}'.format(
                prvsMntsrFileList, '입력 자료를 확인해주세요.'))

        prvsMntsrFileInfo = prvsMntsrFileList[0]
        prvsMntsrData = pd.read_csv(prvsMntsrFileInfo)
        prvsMntsrData.drop(['Unnamed: 0.1'], axis=1, inplace=True)
        # unique apartment key: complex name + '(' + road name + ')'
        prvsMntsrData['name'] = prvsMntsrData['단지명'] + '(' + prvsMntsrData[
            '도로명'] + ')'
        # keep jeonse ('전세') contracts only and exclude first-floor units
        prvsMntsrDataL2 = prvsMntsrData.loc[
            (prvsMntsrData['전월세구분'] == '전세') &
            (prvsMntsrData['층'] != 1)].reset_index(drop=True)
        # prvsMntsrDataL2['계약년도'] = prvsMntsrDataL2['계약년월'].astype(str).str.slice(0, 4)
        prvsMntsrDataL2['date'] = pd.to_datetime(prvsMntsrDataL2['계약년월'],
                                                 format='%Y%m')
        prvsMntsrDataL2['year'] = prvsMntsrDataL2['date'].dt.strftime(
            "%Y").astype('int')
        prvsMntsrDataL2['month'] = prvsMntsrDataL2['date'].dt.strftime(
            "%m").astype('int')
        # deposit amount: strip thousands separators, convert to float
        prvsMntsrDataL2['보증금(만원)'] = prvsMntsrDataL2['보증금(만원)'].astype(
            str).str.replace(',', '').astype('float')
        # left-join permit counts by licensing address
        prvsMntsrDataL3 = pd.merge(
            left=prvsMntsrDataL2
            # , right=lcnsData[['archGbCdNm', '주소', 'lat', 'lon']]
            , right=lcnsDataL1[['archGbCdNm', '주소']],
            left_on=['인허가addr'],
            right_on=['주소'],
            how='left').rename(columns={'archGbCdNm': 'inhuga'})
        # NOTE(review): variable name below differs only in case (lowercase 'd')
        # from prvsMntsrDataL2 above — easy to confuse; confirm intended.
        prvsMntsrdataL2 = prvsMntsrDataL3.rename(
            columns={
                '전용면적(㎡)': 'capacity',
                '건축년도': 'conYear',
                '보증금(만원)': 'realBjprice',
                'archGbCdNm': 'inhuga',
                '거래금액(만원)': 'realPrice'
            }).sort_values(by=['name', 'capacity', 'year']).reset_index(
                drop=True)

        # *****************************************************
        # Actual sale-price ("실거래가") data
        # *****************************************************
        realPriceInpFile = '{}/{}/{}'.format(
            globalVar['inpPath'], serviceName,
            '서울특별시 강북구 아파트 실거래가_인허가_20111101_20201101.csv')
        realPriceFileList = glob.glob(realPriceInpFile)
        if realPriceFileList is None or len(realPriceFileList) < 1:
            log.error('[ERROR] inpFile : {} / {}'.format(
                realPriceFileList, '입력 자료를 확인해주세요.'))
            raise Exception('[ERROR] inpFile : {} / {}'.format(
                realPriceFileList, '입력 자료를 확인해주세요.'))

        realPriceFileInfo = realPriceFileList[0]
        realPriceData = pd.read_csv(realPriceFileInfo)
        realPriceData.drop(['Unnamed: 0.1'], axis=1, inplace=True)
        realPriceData['name'] = realPriceData['단지명'] + '(' + realPriceData[
            '도로명'] + ')'
        # realPriceDataL1 = realPriceData[['name','전용면적(㎡)','거래금액(만원)','층','건축년도','lat','lon','계약년월','인허가addr']]
        # exclude first-floor units
        realPriceDataL2 = realPriceData.loc[(realPriceData['층'] !=
                                             1)].reset_index(drop=True)
        realPriceDataL2['date'] = pd.to_datetime(realPriceDataL2['계약년월'],
                                                 format='%Y%m')
        realPriceDataL2['year'] = realPriceDataL2['date'].dt.strftime(
            "%Y").astype('int')
        realPriceDataL2['month'] = realPriceDataL2['date'].dt.strftime(
            "%m").astype('int')
        # sale amount: strip thousands separators, convert to float
        realPriceDataL2['거래금액(만원)'] = realPriceDataL2['거래금액(만원)'].astype(
            str).str.replace(',', '').astype('float')
        realPriceDataL3 = pd.merge(
            left=realPriceDataL2,
            right=lcnsDataL1[['archGbCdNm', '주소']],
            left_on=['인허가addr'],
            right_on=['주소'],
            how='left').rename(columns={'archGbCdNm': 'inhuga'})
        # NOTE(review): same case-only naming hazard as prvsMntsrdataL2.
        realPricedataL2 = realPriceDataL3.rename(
            columns={
                '전용면적(㎡)': 'capacity',
                '건축년도': 'conYear',
                '보증금(만원)': 'realBjprice',
                'archGbCdNm': 'inhuga',
                '거래금액(만원)': 'realPrice'
            }).sort_values(by=['name', 'capacity', 'year']).reset_index(
                drop=True)

        # *****************************************************
        # Data integration
        # *****************************************************
        # yearly mean jeonse deposit per apartment/area
        prvsMntsrDataL5 = prvsMntsrdataL2.groupby(
            [
                'name', 'conYear', 'capacity', 'lat', 'lon', 'year',
                'inhuga'
            ],
            as_index=False)['realBjprice'].mean()
        # yearly mean sale price per apartment/area
        realPriceDataL5 = realPricedataL2.groupby(
            [
                'name', 'conYear', 'capacity', 'lat', 'lon', 'year',
                'inhuga'
            ],
            as_index=False)['realPrice'].mean()
        # outer merge keeps apartment-years present in either dataset
        data = pd.merge(left=prvsMntsrDataL5,
                        right=realPriceDataL5,
                        left_on=[
                            'name', 'conYear', 'capacity', 'lat', 'lon',
                            'year', 'inhuga'
                        ],
                        right_on=[
                            'name', 'conYear', 'capacity', 'lat', 'lon',
                            'year', 'inhuga'
                        ],
                        how='outer')

        # **********************************************************************************************************
        # Deep-learning sale-price model
        # **********************************************************************************************************
        inpData = realPricedataL2
        # inpData ㅊ= realPriceDataL5
        # xCol = ['times', 'capacity', 'construction_year', 'lat', 'lon', 'realBjprice', 'inhuga']
        # xCol = ['times', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'conYear', 'capacity', 'lat', 'lon']
        # xCol = ['name', 'year', 'conYear', 'capacity', 'lat', 'lon']
        # xCol = ['year', 'capacity', 'lat', 'lon', 'inhuga']
        xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'month', 'capacity', 'lat', 'lon', 'inhuga']
        yCol = 'realPrice'
        modelKey = 'realPrice'

        # load (or train) the deep-learning sale-price model
        result = makeDlModel(sysOpt['dlModel'], xCol, yCol, inpData,
                             modelKey)
        log.info('[CHECK] result : {}'.format(result))

        # predict sale price with the deep-learning model
        realPriceDlModel = result['dlModel']
        data['realPriceDL'] = realPriceDlModel.predict(
            h2o.H2OFrame(data)).as_data_frame()

        mainTitle = '강북구 아파트 매매가 예측 결과 (딥러닝)'
        saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
                                        mainTitle)
        makeUserScatterPlot(data['realPriceDL'], data['realPrice'], '예측',
                            '실측', mainTitle, saveImg, 0, 140000, 2000,
                            10000, True)

        # **********************************************************************************************************
        # Deep-learning jeonse-price model
        # **********************************************************************************************************
        inpData = prvsMntsrdataL2
        # inpData.info()
        # xCol = ['times', 'capacity', 'construction_year', 'lat', 'lon', 'realPrice', 'inhuga']
        # xCol = ['times', 'capacity', 'lat', 'lon', 'inhuga']
        xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
        yCol = 'realBjprice'
        modelKey = 'realBjPrice'

        # load (or train) the deep-learning jeonse-price model
        result = makeDlModel(sysOpt['dlModel'], xCol, yCol, inpData,
                             modelKey)
        log.info('[CHECK] result : {}'.format(result))

        # predict jeonse price with the deep-learning model
        realBjPriceDlModel = result['dlModel']
        data['realBjPriceDL'] = realBjPriceDlModel.predict(
            h2o.H2OFrame(data)).as_data_frame()

        mainTitle = '강북구 아파트 전세가 예측 결과 (딥러닝)'
        saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
                                        mainTitle)
        makeUserScatterPlot(data['realBjPriceDL'], data['realBjprice'],
                            '예측', '실측', mainTitle, saveImg, 0, 140000,
                            2000, 10000, True)

        # **********************************************************************************************************
        # Machine-learning sale-price model
        # **********************************************************************************************************
        inpData = realPricedataL2
        # xCol = ['times', 'capacity', 'construction_year', 'lat', 'lon', 'realBjprice', 'inhuga']
        # xCol = ['times', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'capacity', 'lat', 'lon', 'inhuga']
        xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'month', 'capacity', 'lat', 'lon', 'inhuga']
        yCol = 'realPrice'
        modelKey = 'realPrice'

        # load (or train) the machine-learning sale-price model
        result = makeMlModel(sysOpt['mlModel'], xCol, yCol, inpData,
                             modelKey)
        log.info('[CHECK] result : {}'.format(result))

        # predict sale price with the machine-learning model
        realPriceMlModel = result['mlModel']
        data['realPriceML'] = predict_model(realPriceMlModel,
                                            data=data)['Label']

        mainTitle = '강북구 아파트 매매가 예측 결과 (머신러닝)'
        saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
                                        mainTitle)
        makeUserScatterPlot(data['realPriceML'], data['realPrice'], '예측',
                            '실측', mainTitle, saveImg, 0, 140000, 2000,
                            10000, True)

        # **********************************************************************************************************
        # Machine-learning jeonse-price model
        # **********************************************************************************************************
        inpData = prvsMntsrdataL2
        # xCol = ['times', 'capacity', 'construction_year', 'lat', 'lon', 'realPrice', 'inhuga']
        # xCol = ['times', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
        # xCol = ['year', 'month', 'capacity', 'lat', 'lon', 'inhuga']
        xCol = ['year', 'conYear', 'capacity', 'lat', 'lon', 'inhuga']
        yCol = 'realBjprice'
        modelKey = 'realBjPrice'

        # load (or train) the machine-learning jeonse-price model
        result = makeMlModel(sysOpt['mlModel'], xCol, yCol, inpData,
                             modelKey)
        log.info('[CHECK] result : {}'.format(result))

        # predict jeonse price with the machine-learning model
        realBjPriceMlModel = result['mlModel']
        data['realBjPriceML'] = predict_model(realBjPriceMlModel,
                                              data=data)['Label']

        mainTitle = '강북구 아파트 전세가 예측 결과 (머신러닝)'
        saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
                                        mainTitle)
        makeUserScatterPlot(data['realBjPriceML'], data['realBjprice'],
                            '예측', '실측', mainTitle, saveImg, 0, 140000,
                            2000, 10000, True)

        # **********************************************************************************************************
        # Time-series gap-investment analysis
        # **********************************************************************************************************
        nameList = data['name'].unique()
        searchAptList = sysOpt['tsModel']['aptList']

        fnlData = pd.DataFrame()
        for i, nameInfo in enumerate(nameList):

            # Apartment search module
            # sysOpt['tsModel']['aptList'] = [] : predict returns for all apartments
            # sysOpt['tsModel']['aptList'] = ['미아동부센트레빌'] : predict returns for that apartment only
            isSearch = True if (len(searchAptList) < 1) else False
            for ii, aptInfo in enumerate(searchAptList):
                if (aptInfo in nameInfo):
                    isSearch = True
                    break

            if (isSearch == False): continue
            log.info('[CHECK] isSearch : {} / nameInfo : {}'.format(
                isSearch, nameInfo))

            selData = data.loc[(data['name'] == nameInfo)].reset_index(
                drop=True)
            if (len(selData) < 1): continue

            capList = selData['capacity'].unique()

            # capInfo = capList[0]
            for j, capInfo in enumerate(capList):
                selDataL1 = selData.loc[(
                    selData['capacity'] == capInfo)].reset_index(drop=True)
                if (len(selDataL1) < 1): continue

                # first row supplies the time-invariant attributes
                # (lat/lon/permit count/year built) for synthetic rows below
                selInfoFirst = selDataL1.loc[0]

                log.info('[CHECK] nameInfo : {} / capInfo : {} / cnt : {}'.
                         format(nameInfo, capInfo, len(selDataL1)))

                srtDate = pd.to_datetime(selDataL1['year'].min(),
                                         format='%Y')
                endDate = pd.to_datetime(selDataL1['year'].max(),
                                         format='%Y')
                # one timestamp per observed year
                dtDateList = pd.date_range(start=srtDate,
                                           end=endDate,
                                           freq=pd.DateOffset(years=1))

                dataL2 = pd.DataFrame()
                # dtDateInfo = dtDateList[0]
                for k, dtDateInfo in enumerate(dtDateList):
                    iYear = int(dtDateInfo.strftime('%Y'))

                    selInfo = selDataL1.loc[
                        (selDataL1['year'] == iYear)
                        # & (selDataL1['month'] == iMonth)
                    ].reset_index(drop=True)

                    dictInfo = {
                        'name': [nameInfo],
                        'capacity': [capInfo],
                        'date': [dtDateInfo],
                        'year': [iYear],
                        'lat': [selInfoFirst['lat']],
                        'lon': [selInfoFirst['lon']],
                        'inhuga': [selInfoFirst['inhuga']],
                        'conYear': [selInfoFirst['conYear']]
                    }

                    # observed values where a record exists for this year;
                    # otherwise model predictions fill the gap
                    dictDtl = {
                        'realPrice': [
                            np.nan if (len(selInfo) < 1) else
                            selInfo['realPrice'][0]
                        ],
                        'realBjprice': [
                            np.nan if (len(selInfo) < 1) else
                            selInfo['realBjprice'][0]
                        ],
                        'realPriceDL': [
                            realPriceDlModel.predict(
                                h2o.H2OFrame(
                                    pd.DataFrame.from_dict(dictInfo))
                            ).as_data_frame()['predict'][0] if
                            (len(selInfo) < 1) else
                            selInfo['realPriceDL'][0]
                        ],
                        'realBjPriceDL': [
                            realBjPriceDlModel.predict(
                                h2o.H2OFrame(
                                    pd.DataFrame.from_dict(dictInfo))
                            ).as_data_frame()['predict'][0] if
                            (len(selInfo) < 1) else
                            selInfo['realBjPriceDL'][0]
                        ],
                        'realPriceML': [
                            predict_model(realPriceMlModel,
                                          data=pd.DataFrame.from_dict(
                                              dictInfo))['Label'][0] if
                            (len(selInfo) < 1) else
                            selInfo['realPriceML'][0]
                        ],
                        'realBjPriceML': [
                            predict_model(realBjPriceMlModel,
                                          data=pd.DataFrame.from_dict(
                                              dictInfo))['Label'][0] if
                            (len(selInfo) < 1) else
                            selInfo['realBjPriceML'][0]
                        ]
                    }

                    # NOTE(review): 'dict' shadows the builtin inside this loop
                    dict = {**dictInfo, **dictDtl}

                    # dataL1 = pd.concat([dataL1, pd.DataFrame.from_dict(dictInfo)], ignore_index=True)
                    dataL2 = pd.concat(
                        [dataL2, pd.DataFrame.from_dict(dict)],
                        ignore_index=True)

                if (len(dataL2.dropna()) < 1): continue

                # gap = sale price - jeonse price (observed / ML / DL)
                dataL2['gapReal'] = dataL2['realPrice'] - dataL2[
                    'realBjprice']
                dataL2['gapML'] = dataL2['realPriceML'] - dataL2[
                    'realBjPriceML']
                dataL2['gapDL'] = dataL2['realPriceDL'] - dataL2[
                    'realBjPriceDL']

                # apartment jeonse-price time series plot
                mainTitle = '[{}, {}] 아파트 전세가 시계열'.format(
                    nameInfo, capInfo)
                saveImg = '{}/{}_{}.png'.format(globalVar['figPath'],
                                                serviceName, mainTitle)
                makeUserTimeSeriesPlot(
                    dataL2['date'], dataL2['realBjPriceML'],
                    dataL2['realBjPriceDL'], dataL2['realBjprice'],
                    '예측 (머신러닝)', '예측 (딥러닝)', '실측', '날짜 [연도]', '전세가 [만원]',
                    mainTitle, saveImg, False)

                # apartment sale-price time series plot
                mainTitle = '[{}, {}] 아파트 매매가 시계열'.format(
                    nameInfo, capInfo)
                saveImg = '{}/{}_{}.png'.format(globalVar['figPath'],
                                                serviceName, mainTitle)
                makeUserTimeSeriesPlot(
                    dataL2['date'], dataL2['realPriceML'],
                    dataL2['realPriceDL'], dataL2['realPrice'],
                    '예측 (머신러닝)', '예측 (딥러닝)', '실측', '날짜 [연도]', '매매가 [만원]',
                    mainTitle, saveImg, False)

                # forecast the gap series 2 years ahead with AutoTS
                try:
                    tsModel = AutoTS(
                        forecast_length=sysOpt['tsModel']['forYear'],
                        frequency='infer',
                        ensemble='all',
                        model_list='superfast',
                        transformer_list='superfast')

                    tsDlModel = tsModel.fit(dataL2,
                                            date_col='date',
                                            value_col='gapDL',
                                            id_col=None)
                    tsDlFor = tsDlModel.predict().forecast
                    tsDlFor['date'] = tsDlFor.index
                    # tsDlFor.reset_index(drop=True, inplace=True)

                    tsMlModel = tsModel.fit(dataL2,
                                            date_col='date',
                                            value_col='gapML',
                                            id_col=None)
                    tsMlFor = tsMlModel.predict().forecast
                    tsMlFor['date'] = tsMlFor.index
                    # tsMlFor.reset_index(drop=True, inplace=True)

                    tsForData = tsDlFor.merge(tsMlFor,
                                              left_on=['date'],
                                              right_on=['date'],
                                              how='inner')
                    tsForData['name'] = nameInfo
                    tsForData['capacity'] = capInfo
                    # NOTE(review): iYear carries the last value from the year
                    # loop above, so forecast rows are tagged with the final
                    # historical year — confirm intended.
                    tsForData['year'] = iYear
                    tsForData['lat'] = selInfoFirst['lat']
                    tsForData['lon'] = selInfoFirst['lon']
                    tsForData['inhuga'] = selInfoFirst['inhuga']
                    tsForData['conYear'] = selInfoFirst['conYear']
                except Exception as e:
                    log.error('Exception : {}'.format(e))

                # NOTE(review): if the AutoTS block above raised, tsForData is
                # stale (previous iteration) or undefined here — it is only
                # assigned inside the try; verify desired failure behavior.
                dataL3 = dataL2.merge(tsForData,
                                      left_on=[
                                          'name', 'capacity', 'year',
                                          'lat', 'lon', 'inhuga',
                                          'conYear', 'date', 'gapDL',
                                          'gapML'
                                      ],
                                      right_on=[
                                          'name', 'capacity', 'year',
                                          'lat', 'lon', 'inhuga',
                                          'conYear', 'date', 'gapDL',
                                          'gapML'
                                      ],
                                      how='outer')

                # apartment gap-investment time series plot
                mainTitle = '[{}, {}] 아파트 갭투자 시계열'.format(
                    nameInfo, capInfo)
                saveImg = '{}/{}_{}.png'.format(globalVar['figPath'],
                                                serviceName, mainTitle)
                makeUserTimeSeriesPlot(dataL3['date'], dataL3['gapML'],
                                       dataL3['gapDL'], dataL3['gapReal'],
                                       '예측 (머신러닝)', '예측 (딥러닝)', '실측',
                                       '날짜 [연도]', '갭 투자 [만원]', mainTitle,
                                       saveImg, False)

                # +++++++++++++++++++++++++++++++++++++++++++++++++++++
                # Profit-rate table
                # +++++++++++++++++++++++++++++++++++++++++++++++++++++
                resData = dataL3[['gapReal', 'gapML', 'gapDL']]
                # year-over-year absolute profit
                resDiffData = resData.diff(periods=1).rename(columns={
                    'gapReal': 'gapDiffReal',
                    'gapML': 'gapDiffML',
                    'gapDL': 'gapDiffDL'
                }, inplace=False)
                # year-over-year return (scaled to percent below)
                resPctData = resData.pct_change(periods=1).rename(
                    columns={
                        'gapReal': 'gapPctReal',
                        'gapML': 'gapPctML',
                        'gapDL': 'gapPctDL'
                    },
                    inplace=False)

                resDataL2 = pd.concat(
                    [dataL3, resDiffData, resPctData * 100], axis=1)

                # NOTE(review): rename keys 'realBjpriceDL'/'realBjpriceML' do
                # not match the actual columns 'realBjPriceDL'/'realBjPriceML'
                # (capital 'P'), so those two renames are no-ops — confirm.
                resDataL3 = resDataL2.sort_values(
                    by=['date'], ascending=False).rename(
                        columns={
                            'name': '아파트(도로명)',
                            'capacity': '면적',
                            'construction_year': '건축연도',
                            'year': '연도',
                            'date': '날짜',
                            'lat': '위도',
                            'lon': '경도',
                            'inhuga': '인허가',
                            'conYear': '건축년도',
                            'realPrice': '매매가',
                            'realBjprice': '전세가',
                            'realPriceDL': '예측 딥러닝 매매가',
                            'realBjpriceDL': '예측 딥러닝 전세가',
                            'realPriceML': '예측 머신러닝 매매가',
                            'realBjpriceML': '예측 머신러닝 전세가',
                            'gapReal': '실측 갭투자',
                            'gapML': '예측 머신러닝 갭투자',
                            'gapDL': '예측 딥러닝 갭투자',
                            'gapDiffReal': '실측 수익금',
                            'gapDiffDL': '예측 딥러닝 수익금',
                            'gapDiffML': '예측 머신러닝 수익금',
                            'gapPctReal': '실측 수익률',
                            'gapPctDL': '예측 딥러닝 수익률',
                            'gapPctML': '예측 머신러닝 수익률'
                        })

                fnlData = pd.concat([fnlData, resDataL3],
                                    ignore_index=True)

        # persist the accumulated profit-rate table
        saveFile = '{}/{}_{}_{}.xlsx'.format(
            globalVar['outPath'], serviceName, '수익률 테이블',
            datetime.now().strftime('%Y%m%d'))
        os.makedirs(os.path.dirname(saveFile), exist_ok=True)
        fnlData.to_excel(saveFile, index=False)
        log.info('[CHECK] saveFile : {}'.format(saveFile))

    except Exception as e:
        log.error('Exception : {}'.format(e))
        raise e

    finally:
        log.info('[END] {}'.format('exec'))
'runtime_weighting': 0, 'spl_weighting': 1, 'contour_weighting': 0, } model = AutoTS( forecast_length=forecast_length, frequency='infer', prediction_interval=0.9, ensemble=None, constraint=2, max_generations=generations, num_validations=2, validation_method='backwards', model_list=model_list, initial_template='General+Random', metric_weighting=metric_weighting, models_to_validate=0.1, max_per_model_class=None, model_interrupt=True, n_jobs=n_jobs, drop_most_recent=0, verbose=1, ) future_regressor_train, future_regressor_forecast = fake_regressor( df_long, dimensions=1, forecast_length=forecast_length,
def test_all_default_models(self):
    """End-to-end smoke test of the 'default' model list on a small daily dataset.

    Fits an AutoTS search with one generation and one validation, then checks:
    every default model ran at least once, forecast shape/index/values are
    sane, horizontal ensembling metadata is consistent, 'backwards' validation
    indexes are correct, and all metric columns are within expected bounds.
    """
    forecast_length = 8
    long = False
    df = load_daily(long=long).drop(columns=['US.Total.Covid.Tests'],
                                    errors='ignore')
    # to make it faster
    df = df[df.columns[0:2]]
    n_jobs = 'auto'
    verbose = -1
    validation_method = "backwards"
    generations = 1
    num_validations = 1
    models_to_validate = 0.10  # must be a decimal percent for this test
    model_list = "default"
    transformer_list = "fast"  # ["SinTrend", "MinMaxScaler"]
    transformer_max_depth = 1

    model = AutoTS(
        forecast_length=forecast_length,
        frequency='infer',
        prediction_interval=0.9,
        ensemble=["horizontal-max"],
        constraint=None,
        max_generations=generations,
        num_validations=num_validations,
        validation_method=validation_method,
        model_list=model_list,
        transformer_list=transformer_list,
        transformer_max_depth=transformer_max_depth,
        initial_template='Random',
        models_to_validate=models_to_validate,
        max_per_model_class=None,
        n_jobs=n_jobs,
        model_interrupt=True,
        drop_most_recent=1,
        verbose=verbose,
    )
    # long=False here, so the wide-format path is exercised (all cols None)
    model = model.fit(
        df,
        date_col='datetime' if long else None,
        value_col='value' if long else None,
        id_col='series_id' if long else None,
    )
    prediction = model.predict(verbose=0)
    forecasts_df = prediction.forecast
    initial_results = model.results()
    validation_results = model.results("validation")
    # validated_count = (validation_results['Runs'] == (num_validations + 1)).sum()
    validated_count = (validation_results['Runs'] > 1).sum()

    # so these account for DROP MOST RECENT = 1
    expected_idx = pd.date_range(start=df.index[-2],
                                 periods=forecast_length + 1,
                                 freq='D')[1:]
    expected_val1 = pd.date_range(end=df.index[-(forecast_length + 2)],
                                  periods=forecast_length,
                                  freq='D')

    template_dict = json.loads(model.best_model['ModelParameters'].iloc[0])
    best_model_result = validation_results[validation_results['ID'] ==
                                           model.best_model['ID'].iloc[0]]
    check_fails = initial_results.groupby("Model")["mae"].count() > 0

    # check that all models had at least 1 success
    self.assertEqual(set(initial_results['Model'].unique().tolist()) -
                     {'Ensemble'},
                     set(default_model_list),
                     msg="Not all models used in initial template.")
    self.assertTrue(
        check_fails.all(),
        msg=
        f"These models failed: {check_fails[~check_fails].index.tolist()}. It is more likely a package install problem than a code problem"
    )
    # check general model setup
    self.assertGreaterEqual(validated_count, model.models_to_validate)
    self.assertGreater(model.models_to_validate,
                       (initial_results['ValidationRound'] == 0).sum() *
                       models_to_validate - 2)
    self.assertFalse(model.best_model.empty)
    # check the generated forecasts look right
    self.assertEqual(forecasts_df.shape[0], forecast_length)
    self.assertEqual(forecasts_df.shape[1], df.shape[1])
    self.assertFalse(forecasts_df.isna().any().any())
    self.assertEqual(forecast_length, len(forecasts_df.index))
    self.assertTrue(
        (expected_idx == pd.DatetimeIndex(forecasts_df.index)).all())
    # these next two could potentiall fail if any inputs have a strong trend
    self.assertTrue((forecasts_df.mean() <= (df.max()) + df.std()).all())
    self.assertTrue((forecasts_df.mean() >= (df.min()) - df.std()).all())
    # check all the checks work
    self.assertEqual(model.ensemble_check, 1)
    self.assertFalse(model.weighted)
    self.assertFalse(model.subset_flag)
    # assess 'backwards' validation
    val_1 = model.validation_test_indexes[1]
    self.assertEqual(len(model.validation_test_indexes),
                     num_validations + 1)
    self.assertTrue(
        val_1.intersection(model.validation_train_indexes[1]).empty)
    self.assertEqual(model.validation_train_indexes[1].shape[0],
                     df.shape[0] -
                     (forecast_length * 2 + 1))  # +1 via drop most recent
    self.assertTrue((val_1 == expected_val1).all())
    # assess Horizontal Ensembling
    self.assertTrue('horizontal' in template_dict['model_name'].lower())
    self.assertEqual(len(template_dict['series'].keys()), df.shape[1])
    self.assertEqual(len(set(template_dict['series'].values())),
                     template_dict['model_count'])
    self.assertEqual(len(template_dict['models'].keys()),
                     template_dict['model_count'])
    # test that actually the best model (or nearly) was chosen
    self.assertGreater(validation_results['Score'].quantile(0.05),
                       best_model_result['Score'].iloc[0])
    # test metrics
    self.assertTrue(initial_results['Score'].min() > 0)
    self.assertTrue(initial_results['mae'].min() >= 0)
    self.assertTrue(initial_results['smape'].min() >= 0)
    self.assertTrue(initial_results['rmse'].min() >= 0)
    self.assertTrue(initial_results['contour'].min() >= 0)
    self.assertTrue(initial_results['containment'].min() >= 0)
    self.assertTrue(initial_results['TotalRuntimeSeconds'].min() >= 0)
    self.assertTrue(initial_results['spl'].min() >= 0)
    self.assertTrue(initial_results['contour'].min() <= 1)
    self.assertTrue(initial_results['containment'].min() <= 1)
def test_autots(self):
    """End-to-end test of AutoTS with naive models and a 4-D future regressor.

    Fits a small search (four naive models, horizontal-max/min ensembling,
    no_negatives=True) with a fake future regressor, then checks: few model
    failures, forecast shape/index/non-negativity, 'backwards' validation
    indexes across two validations, horizontal ensemble metadata, best-model
    selection, and back_forecast index alignment.
    """
    forecast_length = 8
    long = False
    df = load_daily(long=long).drop(columns=['US.Total.Covid.Tests'],
                                    errors='ignore')
    n_jobs = 'auto'
    verbose = 0
    validation_method = "backwards"
    generations = 1
    num_validations = 2
    models_to_validate = 0.35  # must be a decimal percent for this test
    model_list = [
        'ZeroesNaive',
        'LastValueNaive',
        'AverageValueNaive',
        'SeasonalNaive',
    ]
    transformer_list = "fast"  # ["SinTrend", "MinMaxScaler"]
    transformer_max_depth = 3
    metric_weighting = {
        'smape_weighting': 3,
        'mae_weighting': 1,
        'rmse_weighting': 1,
        'containment_weighting': 0,
        'runtime_weighting': 0,
        'spl_weighting': 1,
        'contour_weighting': 1,
    }

    model = AutoTS(
        forecast_length=forecast_length,
        frequency='infer',
        prediction_interval=0.9,
        ensemble=["horizontal-max", "horizontal-min"],
        constraint=None,
        max_generations=generations,
        num_validations=num_validations,
        validation_method=validation_method,
        model_list=model_list,
        transformer_list=transformer_list,
        transformer_max_depth=transformer_max_depth,
        initial_template='General+Random',
        metric_weighting=metric_weighting,
        models_to_validate=models_to_validate,
        max_per_model_class=None,
        model_interrupt=False,
        no_negatives=True,
        subset=100,
        n_jobs=n_jobs,
        drop_most_recent=1,
        verbose=verbose,
    )
    # build a 4-dimensional fake regressor aligned with the model's settings
    future_regressor_train2d, future_regressor_forecast2d = fake_regressor(
        df,
        dimensions=4,
        forecast_length=forecast_length,
        date_col='datetime' if long else None,
        value_col='value' if long else None,
        id_col='series_id' if long else None,
        drop_most_recent=model.drop_most_recent,
        aggfunc=model.aggfunc,
        verbose=model.verbose,
    )
    model = model.fit(
        df,
        future_regressor=future_regressor_train2d,
        date_col='datetime' if long else None,
        value_col='value' if long else None,
        id_col='series_id' if long else None,
    )
    prediction = model.predict(
        future_regressor=future_regressor_forecast2d, verbose=0)
    forecasts_df = prediction.forecast
    initial_results = model.results()
    validation_results = model.results("validation")
    back_forecast = model.back_forecast(n_splits=2, verbose=0).forecast
    # validated_count = (validation_results['Runs'] == (num_validations + 1)).sum()

    # so these account for DROP MOST RECENT = 1
    expected_idx = pd.date_range(start=df.index[-2],
                                 periods=forecast_length + 1,
                                 freq='D')[1:]
    expected_val1 = pd.date_range(end=df.index[-(forecast_length + 2)],
                                  periods=forecast_length,
                                  freq='D')
    expected_val2 = pd.date_range(end=df.index[-(forecast_length * 2 + 2)],
                                  periods=forecast_length,
                                  freq='D')

    template_dict = json.loads(model.best_model['ModelParameters'].iloc[0])
    best_model_result = validation_results[validation_results['ID'] ==
                                           model.best_model['ID'].iloc[0]]

    # check there were few failed models in this simple setup (fancier models are expected to fail sometimes!)
    self.assertGreater(
        initial_results['Exceptions'].isnull().mean(), 0.95,
        "Too many 'superfast' models failed. This can occur by random chance, try running again."
    )
    # check general model setup
    # self.assertEqual(validated_count, model.models_to_validate)
    self.assertGreater(model.models_to_validate,
                       (initial_results['ValidationRound'] == 0).sum() *
                       models_to_validate - 2)
    self.assertEqual(
        set(initial_results['Model'].unique().tolist()) - {'Ensemble'},
        set(model.model_list))
    self.assertFalse(model.best_model.empty)
    # check the generated forecasts look right
    self.assertEqual(forecasts_df.shape[0], forecast_length)
    self.assertEqual(forecasts_df.shape[1], df.shape[1])
    self.assertFalse(forecasts_df.isna().any().any())
    self.assertTrue((forecasts_df >= 0).all().all())
    self.assertEqual(forecast_length, len(forecasts_df.index))
    self.assertTrue(
        (expected_idx == pd.DatetimeIndex(forecasts_df.index)).all())
    # these next two could potentiall fail if any inputs have a strong trend
    self.assertTrue((forecasts_df.mean() <= (df.max()) + df.std()).all())
    self.assertTrue((forecasts_df.mean() >= (df.min()) - df.std()).all())
    # check all the checks work
    self.assertEqual(model.ensemble_check, 1)
    self.assertFalse(model.weighted)
    self.assertFalse(model.used_regressor_check)
    self.assertFalse(model.subset_flag)
    # assess 'backwards' validation
    self.assertEqual(len(model.validation_test_indexes),
                     num_validations + 1)
    self.assertTrue(model.validation_test_indexes[1].intersection(
        model.validation_train_indexes[1]).empty)
    self.assertTrue(model.validation_test_indexes[2].intersection(
        model.validation_train_indexes[2]).empty)
    self.assertEqual(model.validation_train_indexes[1].shape[0],
                     df.shape[0] -
                     (forecast_length * 2 + 1))  # +1 via drop most recent
    self.assertTrue(
        (model.validation_test_indexes[1] == expected_val1).all())
    self.assertTrue(
        (model.validation_test_indexes[2] == expected_val2).all())
    # assess Horizontal Ensembling
    self.assertTrue('horizontal' in template_dict['model_name'].lower())
    self.assertEqual(len(template_dict['series'].keys()), df.shape[1])
    self.assertEqual(len(set(template_dict['series'].values())),
                     template_dict['model_count'])
    self.assertEqual(len(template_dict['models'].keys()),
                     template_dict['model_count'])
    # test that actually the best model (or nearly) was chosen
    self.assertGreater(validation_results['Score'].quantile(0.05),
                       best_model_result['Score'].iloc[0])
    # test back_forecast
    self.assertTrue(
        (back_forecast.index == model.df_wide_numeric.index).all(),
        msg="Back forecasting failed to have equivalent index to train.")
# model_list = ['AverageValueNaive', 'LastValueNaive', 'ZeroesNaive'] # model_list = ['WindowRegression', 'SeasonalNaive'] metric_weighting = {'smape_weighting': 2, 'mae_weighting': 1, 'rmse_weighting': 2, 'containment_weighting': 0, 'runtime_weighting': 0, 'spl_weighting': 1, 'contour_weighting': 0 } model = AutoTS(forecast_length=forecast_length, frequency='infer', prediction_interval=0.9, ensemble='simple,distance,probabilistic-max,horizontal-max', constraint=2, max_generations=1, num_validations=2, validation_method='backwards', model_list=model_list, initial_template='General+Random', metric_weighting=metric_weighting, models_to_validate=0.1, max_per_model_class=None, model_interrupt=True, drop_most_recent=0, verbose=1) future_regressor_train, future_regressor_forecast = fake_regressor( df_long, dimensions=1, forecast_length=forecast_length, date_col='datetime', value_col='value', id_col='series_id') future_regressor_train2d, future_regressor_forecast2d = fake_regressor( df_long, dimensions=4, forecast_length=forecast_length, date_col='datetime', value_col='value', id_col='series_id') # model = model.import_results('test.pickle')
} model = AutoTS( forecast_length=forecast_length, frequency=frequency, prediction_interval=prediction_interval, ensemble=ensemble, model_list=model_list, transformer_list=transformer_list, transformer_max_depth=transformer_max_depth, max_generations=gens, metric_weighting=metric_weighting, initial_template='random', aggfunc="sum", models_to_validate=models_to_validate, model_interrupt=True, num_validations=num_validations, validation_method=validation_method, constraint=None, drop_most_recent= drop_most_recent, # if newest data is incomplete, also remember to increase forecast_length preclean=preclean, models_mode=models_mode, # no_negatives=True, # subset=100, # prefill_na=0, # remove_leading_zeroes=True, n_jobs=n_jobs, verbose=1, ) if not initial_training: