def msm_airport_xy(icao, metar_dir, msm_dir, save_dir):
    import re
    import pandas as pd
    import skynet.nwp2d as npd
    import skynet.datasets as skyds

    # Read METAR
    with open('%s/head.txt' % metar_dir, 'r') as f:
        header = f.read()
    header = header.split(sep=',')

    data15 = pd.read_csv('%s/2015/%s.txt' % (metar_dir, icao), sep=',')
    data16 = pd.read_csv('%s/2016/%s.txt' % (metar_dir, icao), sep=',')
    data17 = pd.read_csv('%s/2017/%s.txt' % (metar_dir, icao), sep=',', names=header)

    metar_data = pd.concat([data15, data16, data17])
    metar_data = npd.NWPFrame(metar_data)
    metar_data.strtime_to_datetime('date', '%Y%m%d%H%M%S', inplace=True)
    metar_data.datetime_to_strtime('date', '%Y-%m-%d %H:%M', inplace=True)
    metar_data.drop_duplicates('date', inplace=True)
    metar_data.index = metar_data['date'].values

    metar_keys = ['date', 'visibility', 'str_cloud']
    metar_data = metar_data[metar_keys]
    metar_data['visibility_rank'] = skyds.to_visrank(metar_data['visibility'])

    # Read MSM
    msm_data = pd.read_csv('%s/%s.csv' % (msm_dir, icao))
    msm_data.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
    msm_data.index = msm_data['date'].values
    msm_data.sort_index(inplace=True)

    fets = skyds.get_init_features()
    target = skyds.get_init_target()

    X = npd.NWPFrame(pd.concat([msm_data[fets], metar_data[target]], axis=1))
    X.dropna(inplace=True)
    X.strtime_to_datetime('date', '%Y-%m-%d %H:%M', inplace=True)
    X.datetime_to_strtime('date', '%Y%m%d%H%M', inplace=True)
    X = X[fets + target]

    # Use everything except 2017 for training
    date = [d for d in X.index if not re.match('2017', d)]
    train = npd.NWPFrame(X.loc[date])
    train['date'] = train.index
    df_date = train.split_strcol(
        'date', ['year', 'month', 'day', 'hour', 'min'],
        pattern=r'[-\s:]')[['year', 'month', 'day', 'hour', 'min']]
    train = pd.concat([df_date, train], axis=1)
    train.drop('date', axis=1, inplace=True)

    train.to_csv('%s/%s.csv' % (save_dir, icao), index=False)
def eval_one_forecast(metar: pd.DataFrame, pred: pd.DataFrame, save_dir):
    icao = metar['ICAO'][0]

    metar.index = metar['date']
    metar.sort_index(inplace=True)
    metar.drop_duplicates('date', inplace=True)

    pred = npd.NWPFrame(pred)
    pred_date_cols = ['HEAD:YEAR', 'MON', 'DAY', 'HOUR']
    for key in pred_date_cols:
        if key == 'HEAD:YEAR':
            pred[key] = pred[key].astype(str).str.pad(4, fillchar='0')
        else:
            pred[key] = pred[key].astype(str).str.pad(2, fillchar='0')
    pred.merge_strcol(pred_date_cols, 'date', inplace=True)
    pred.strtime_to_datetime('date', '%Y%m%d%H', inplace=True)
    pred.datetime_to_strtime('date', '%Y-%m-%d %H:%M', inplace=True)
    pred.index = pred['date']
    pred.sort_index(inplace=True)
    pred.drop_duplicates('date', inplace=True)

    vis = pd.concat([metar, pred], axis=1)
    vis = vis[['visibility', 'SKYNET-VIS']]
    vis.dropna(inplace=True)

    os.makedirs('%s/time_series' % save_dir, exist_ok=True)
    vis.to_html('%s/time_series/%s.html' % (save_dir, icao))

    vis_level = skyds.get_init_vis_level()
    steps = list(vis_level.values())
    cfm = conf_mat(vis['visibility'], vis['SKYNET-VIS'], steps)

    os.makedirs('%s/confusion_matrix' % save_dir, exist_ok=True)
    cfm.to_html('%s/confusion_matrix/%s.html' % (save_dir, icao))
def test1():
    import pickle
    import pandas as pd
    import skynet.nwp2d as npd
    import skynet.datasets as skyds

    # -- Prepare test data
    test = pickle.load(
        open('/Users/makino/PycharmProjects/SkyCC/data/skynet/test_%s.pkl' % icao, 'rb'))
    test['date'] = test['date'].astype(int).astype(str)
    test = npd.NWPFrame(test)
    test.strtime_to_datetime('date', '%Y%m%d%H%M', inplace=True)
    test.datetime_to_strtime('date', '%Y-%m-%d %H:%M', inplace=True)
    df_date = test.split_strcol(
        'date', ['year', 'month', 'day', 'hour', 'min'],
        r'[-\s:]')[['month', 'day', 'hour', 'min']].astype(int)
    test = pd.concat([df_date, test], axis=1)

    keys = skyds.get_init_features() + skyds.get_init_target()
    test = test[keys]

    X_test = test.iloc[:, :-1]
    y_test = test.iloc[:, -1]

    # icao, start_month, end_month and StandardScaler are expected to be
    # defined at module level
    X_test = X_test[(X_test['month'] == start_month) | (X_test['month'] == end_month)]
    y_test = y_test.loc[X_test.index]

    ss = StandardScaler()
    X_test = ss.fit_transform(X_test)
    y_test = y_test.values
def confidence_factor(x, n_class):
    import numpy as np
    import skynet.nwp2d as npd

    mv = np.zeros((len(x), n_class))
    idx = np.arange(len(x))
    for i in range(x.shape[1]):
        mv[idx, x[:, i].astype(int)] += 1

    confac = npd.NWPFrame(mv)

    return confac
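# Illustrative usage sketch (not part of the original module; the input values
# below are made up).  Given an ensemble prediction matrix of shape
# (n_samples, n_estimators), confidence_factor simply counts how many
# estimators voted for each class per sample.
def _confidence_factor_example():
    import numpy as np
    preds = np.array([[0, 0, 1],
                      [2, 2, 2]])
    confac = confidence_factor(preds, n_class=3)
    # confac.values == [[2., 1., 0.],   # sample 0: two votes for class 0, one for class 1
    #                   [0., 0., 3.]]   # sample 1: all three votes for class 2
    return confac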
def set_visibility_area_forecast(icao):
    import pickle
    import skynet.nwp2d as npd
    from skynet import DATA_DIR

    af = pickle.load(open(DATA_DIR + "/skynet/metar.before.msm/test_%s.pkl" % icao, "rb"))
    af = af[["date", "VIS"]].rename(columns={"VIS": "visibility"})
    af["visibility_rank"] = convert_visibility_rank(af["visibility"].values)
    af = npd.NWPFrame(af)
    af.index = af.strtime_to_datetime('date', fmt='%Y%m%d%H%M')

    return af
def set_visibility_metar(icao):
    import skynet.nwp2d as npd
    from skynet import DATA_DIR

    metar = pd.read_csv(DATA_DIR + "/metar/airport_vis/metar_%s.csv" % icao)
    metar = npd.NWPFrame(metar)
    metar['visibility_rank'] = convert_visibility_rank(metar['visibility'].values)

    date = metar.strtime_to_datetime('date', fmt='%Y%m%d%H%M')
    metar.index = date

    return metar
def set_visibility_human_edit(icao):
    import skynet.nwp2d as npd
    from skynet import DATA_DIR

    he = pd.read_csv(
        "%s/after/%s.csv" % (DATA_DIR, icao),
        names=["a", "b", "date", "c", "VIS_after", "d", "e", "f", "g", "h", "i"]
    )
    he = npd.NWPFrame(he[["date", "VIS_after"]])
    he.drop_duplicates("date", keep="first", inplace=True)

    vr = convert_visibility_rank(he["VIS_after"].values)
    he["visibility_rank"] = vr
    he.index = he.strtime_to_datetime('date', fmt='%Y%m%d%H%M')
    he.columns = ["date", "visibility", "visibility_rank"]

    return he
def Vis_Pred(model, contxt, lclid, test_dir, input_dir, fit_dir, pred_dir, errfile):
    import os
    import sys
    import copy
    import csv
    import pickle
    import pandas as pd
    import skynet.nwp2d as npd
    import skynet.datasets as skyds
    from sklearn.preprocessing import StandardScaler
    from pathlib import Path

    myname = sys.argv[0]
    print(model)

    csv_test = '%s/%s-%s.csv' % (test_dir, contxt, lclid)
    csv_input = '%s/%s-%s.vis.csv' % (input_dir, contxt, lclid)
    fitfile = '%s/%s-%s.vis.pkl' % (fit_dir, contxt, lclid)
    predfile = '%s/%s-%s.vis.csv' % (pred_dir, contxt, lclid)
    conffile = '%s/confidence_factor/%s-%s.vis.csv' % (pred_dir, contxt, lclid)

    if not os.path.exists(csv_test):
        print("{:s}: [Error] {:s} is not found !".format(myname, csv_test))
        if not os.path.exists(errfile):
            Path(errfile).touch()
        return

    X = pd.read_csv(csv_test)
    X = npd.NWPFrame(X)

    # --- Reading Fitting File & Input File (If Not Existing -> -9999.)
    if not os.path.exists(fitfile) or not os.path.exists(csv_input):
        print("{:s}: [Checked] {:s} or {:s} is not found !".format(
            myname, fitfile, csv_input))

        PRED = []
        for k in range(len(X)):
            pred = [-9999.]
            PRED = PRED + pred

        # - Output(all -9999.)
        outdata = X[['HEAD:YEAR', 'MON', 'DAY', 'HOUR']]
        outdata['SKYNET-VIS'] = PRED
        outdata.to_csv(
            predfile,
            columns=['HEAD:YEAR', 'MON', 'DAY', 'HOUR', 'SKYNET-VIS'],
            index=False, header=True)

        # - Output(num of train -> 0)
        f = open(predfile, 'a')
        csv.writer(f, lineterminator='\n').writerow(['FOOT:TRAIN_NUM', 0])
        f.close()

        return

    df_date = X[['HEAD:YEAR', 'MON', 'DAY', 'HOUR']]

    date_keys = ['HEAD:YEAR', 'MON', 'DAY', 'HOUR', 'MIN']
    X['MIN'] = [0] * len(X)
    for key in date_keys:
        if not key == 'HEAD:YEAR':
            X[key] = ['%02d' % int(d) for d in X[key]]
    X.merge_strcol(date_keys, 'date', inplace=True)
    X.drop(date_keys, axis=1, inplace=True)
    # print(X)

    wni_code = skyds.get_init_features('wni')
    X = X[wni_code]

    long_code = skyds.get_init_features('long')
    X.columns = long_code

    vt = len(X)
    pool = skyds.read_csv(csv_input)[long_code]
    sppool = skyds.convert.split_time_series(pool, date=pool["date"].values,
                                             level="month", period=2, index_date=True)

    month_key_info = get_month_key(X['date'][0], period=2)
    X = pd.concat([X, sppool[month_key_info[1]]])

    ss = StandardScaler()
    X = npd.NWPFrame(ss.fit_transform(X), columns=X.keys())
    X = X.iloc[:vt]

    clfs = pickle.load(open(fitfile, 'rb'))[month_key_info[1]]

    p, c = predict(X, clfs, W[lclid][month_key_info[0]], smooth=False, confidence=True)
    vis_pred = adapt_visibility(p)

    vis = npd.NWPFrame(copy.deepcopy(df_date))
    vis['SKYNET-VIS'] = vis_pred
    # vis.rename(columns={'HEAD:YEAR': 'YEAR'}, inplace=True)

    c = pd.concat([copy.deepcopy(df_date), c], axis=1)
    # c.rename(columns={'HEAD:YEAR': 'YEAR'}, inplace=True)

    print(os.path.dirname(predfile))
    vis.to_csv(predfile, index=False)
    c.to_csv(conffile, index=False)
def edit_rate_06_23():
    before_dir = '%s/before' % DATA_DIR
    after_dir = '%s/after' % DATA_DIR
    save_dir = '%s/evaluate/edit_rate' % DATA_DIR
    os.makedirs(save_dir, exist_ok=True)

    before_airports = os.listdir(before_dir)
    before_airports = {
        icao[:4] for icao in before_airports if re.match(r'^[A-Z]', icao)
    }
    after_airports = os.listdir(after_dir)
    after_airports = {
        icao[:4] for icao in after_airports if re.match(r'^[A-Z]', icao)
    }

    airports_list = list(before_airports & after_airports)
    airports_list.sort()
    # airports_series = pd.Series(airports_list, name='ICAO')
    # airports_series.to_csv('airport_list.csv', index=False)

    df_edit_all = pd.DataFrame()
    for icao in airports_list:
        print(icao)

        df_before = npd.NWPFrame(
            pd.read_csv('%s/%s.txt' % (before_dir, icao), sep=','))
        df_before.strtime_to_datetime(date_key='date', fmt='%Y%m%d%H%M', inplace=True)
        df_before.index = df_before['date'].values

        # Expand each 12Z base time into its hourly validity times
        before_bt = [bt for bt in df_before['date'] if bt.hour == 12]
        vt_list = []
        for bt in before_bt:
            vt = [bt + datetime.timedelta(hours=t) for t in range(18)]
            vt_list += vt

        df_before_06_23 = df_before.loc[vt_list]
        df_before_06_23 = npd.NWPFrame(df_before_06_23)
        df_before_06_23.dropna(inplace=True)
        df_before_06_23.datetime_to_strtime(date_key='date', fmt='%Y-%m-%d %H:%M', inplace=True)
        df_before_06_23.index = df_before_06_23['date'].values

        h_after = [
            'ICAO', 'BASE', 'VALID', 'precipitation', 'visibility', 'ceiling',
            'temperature', 'wind speed', 'wind direction', 'WX_after', 'u4'
        ]
        df_after = npd.NWPFrame(
            pd.read_csv('%s/%s.csv' % (after_dir, icao), names=h_after))
        df_after.strtime_to_datetime(date_key='VALID', fmt='%Y%m%d%H%M', inplace=True)
        df_after.index = df_after['VALID'].values

        df_after_06_23 = npd.NWPFrame(df_after[[
            'BASE', 'VALID', 'visibility', 'ceiling', 'wind speed',
            'wind direction', 'WX_after'
        ]])
        df_after_06_23.strtime_to_datetime(date_key='BASE', fmt='%Y%m%d%H%M', inplace=True)

        after_bt = list(df_after_06_23.drop_duplicates('BASE')['BASE'].values)
        vt_list = []
        for bt in after_bt:
            vt = [bt + np.timedelta64(t, 'h') for t in range(6, 24)]
            vt_list += vt

        # If none of the expected validity times are present, record an
        # all-zero edit rate for this airport and move on
        idx_check = True
        for v in vt_list:
            if v in df_after_06_23.index:
                idx_check = False
                break
        if idx_check:
            df_edit = pd.DataFrame(
                [[icao, len(df_after_06_23), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
                columns=[
                    'ICAO', 'All', 'Vis edit', 'VIS edit rate', 'CIG edit',
                    'CIG edit rate', 'WNDSPD edit', 'WNDSPD edit rate',
                    'WDIR edit', 'WDIR edit rate', 'WX edit', 'WX edit rate'
                ])
            df_edit.to_csv('%s/edit_rate_%s.csv' % (save_dir, icao), index=False)
            df_edit_all = pd.concat([df_edit_all, df_edit])
            continue

        df_after_06_23 = df_after_06_23.loc[vt_list]
        df_after_06_23 = npd.NWPFrame(df_after_06_23)
        df_after_06_23.dropna(inplace=True)
        df_after_06_23.datetime_to_strtime(date_key='VALID', fmt='%Y-%m-%d %H:%M', inplace=True)
        df_after_06_23.drop_duplicates('VALID', inplace=True)
        df_after_06_23.index = df_after_06_23['VALID'].values
        # print(df_after[['BASE', 'visibility', 'ceiling', 'wind speed', 'wind direction', 'WX_after']])

        vis = pd.concat([df_before_06_23, df_after_06_23], axis=1)
        vis = vis[[
            'ICAO', 'date', 'VIS', 'visibility', 'CLING', 'ceiling', 'WNDSPD',
            'wind speed', 'WNDDIR', 'wind direction', 'WX_after'
        ]]
        '''
        if len(df_before_06_23) > len(df_after_06_23):
            vis_index = df_after_06_23.index
        else:
            vis_index = df_before_06_23.index
        vis = vis.loc[vis_index]
        '''
        vis.rename(columns={
            'VIS': 'VIS_before',
            'visibility': 'VIS_after',
            'CLING': 'CIG_before',
            'ceiling': 'CIG_after',
            'WNDSPD': 'WNDSPD_before',
            'wind speed': 'WNDSPD_after',
            'WNDDIR': 'WNDDIR_before',
            'wind direction': 'WNDDIR_after'
        }, inplace=True)
        # vis.reset_index(drop=True, inplace=True)

        # Snap raw "before" visibilities onto the reporting steps
        vis_range = [
            0, 25, 75, 125, 175, 225, 275, 325, 375, 450, 550, 625, 675, 725,
            775, 850, 950, 1050, 1150, 1250, 1350, 1450, 1550, 1650, 1750,
            1900, 2200, 2700, 3100, 3600, 4400, 4900, 5500, 6500, 7500, 8500,
            9500, 10000
        ]
        vis_values = [
            0, 50, 100, 150, 200, 250, 300, 350, 400, 500, 600, 650, 700, 750,
            800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800,
            2000, 2400, 3000, 3200, 4000, 4800, 5000, 6000, 7000, 8000, 9000,
            9999
        ]

        i = 0
        while True:
            idx = np.where((vis['VIS_before'] > vis_range[i])
                           & (vis['VIS_before'] <= vis_range[i + 1]))[0]
            idx = vis.index[idx]
            vis.loc[idx, 'VIS_before'] = vis_values[i]
            i += 1
            if i == len(vis_values):
                break

        # Metres -> feet, then snap ceilings onto the reporting steps
        vis[['CIG_before', 'CIG_after']] *= 1 / 0.3048
        cig_range = [
            0, 15, 40, 75, 125, 175, 225, 275, 325, 375, 450, 550, 650, 750,
            850, 950, 1050, 1150, 1250, 1350, 1450, 1550, 1650, 1750, 1850,
            1950, 2050, 2150, 2250, 2350, 2450, 2550, 2650, 2750, 2850, 2950,
            3250, 3750, 4500, 100000
        ]
        cig_values = [
            0, 30, 50, 100, 150, 200, 250, 300, 350, 400, 500, 600, 700, 800,
            900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900,
            2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000,
            3500, 4000, ''
        ]

        i = 0
        while True:
            idx = np.where((vis['CIG_before'] > cig_range[i])
                           & (vis['CIG_before'] <= cig_range[i + 1]))[0]
            idx = vis.index[idx]
            vis.loc[idx, 'CIG_before'] = cig_values[i]
            i += 1
            if i == len(cig_values):
                break

        i = 0
        while True:
            idx = np.where((vis['CIG_after'] > cig_range[i])
                           & (vis['CIG_after'] <= cig_range[i + 1]))[0]
            idx = vis.index[idx]
            vis.loc[idx, 'CIG_after'] = cig_values[i]
            i += 1
            if i == len(cig_values):
                break

        # m/s -> knots
        vis['WNDSPD_before'] *= 1 / 0.514444
        vis = vis.round({'WNDSPD_before': 0})
        vis = vis.round({'WNDDIR_before': -1})

        # Rebuild the "before" weather string from the telop probabilities
        wx_list = ['', '', 'RA', 'SNRA', 'SN', 'SNRA', '', '', '']
        wx_telop = []
        for idx in vis.index:
            if idx in df_before_06_23.index:
                wx_prob = df_before_06_23.loc[idx, [
                    'WX_telop_100', 'WX_telop_200', 'WX_telop_300',
                    'WX_telop_340', 'WX_telop_400', 'WX_telop_430',
                    'WX_telop_500', 'WX_telop_600', 'WX_telop_610'
                ]]

                prc = df_before_06_23.loc[idx, 'PRCRIN_1HOUR_TOTAL']
                if (prc >= 5.) and (prc < 10.):
                    wx = ''
                elif prc >= 10.:
                    wx = '+'
                else:
                    wx = '-'

                tnd = df_before_06_23.loc[idx, 'TNDSTM_prob']
                if tnd >= 50.:
                    wx += 'TS'

                wx += wx_list[int(np.argmax(wx_prob.values))]

                v = df_before_06_23.loc[idx, 'VIS']
                tmpr = df_before_06_23.loc[idx, 'AIRTMP']
                # Freezing fog takes precedence over plain fog
                if (v < 1000.) and (tmpr < 0.):
                    wx += ' FZFG'
                elif v < 1000.:
                    wx += ' FG'
                elif (v >= 1000.) and (v <= 5000.):
                    wx += ' BR'

                wx_telop.append(wx)

        wx_telop = pd.DataFrame(wx_telop, index=df_before_06_23.index, columns=['WX_before'])

        vis = pd.concat([vis, wx_telop], axis=1)
        vis['WX_after'] = vis['WX_after'].str.replace(' ', '')
        vis['WX_after'] = vis['WX_after'].str.replace('_', '-')
        vis.dropna(inplace=True)

        vis_edit = np.where(vis['VIS_before'] != vis['VIS_after'], '*', '')
        cig_edit = np.where(vis['CIG_before'] != vis['CIG_after'], '*', '')
        wspd_edit = np.where(vis['WNDSPD_before'] != vis['WNDSPD_after'], '*', '')
        wdir_edit = np.where(vis['WNDDIR_before'] != vis['WNDDIR_after'], '*', '')
        wx_edit = np.where(vis['WX_before'] != vis['WX_after'], '*', '')

        vis_edit_rate = calc_edit_rate(vis_edit)
        cig_edit_rate = calc_edit_rate(cig_edit)
        wspd_edit_rate = calc_edit_rate(wspd_edit)
        wdir_edit_rate = calc_edit_rate(wdir_edit)
        wx_edit_rate = calc_edit_rate(wx_edit)

        vis['VIS edit'] = vis_edit
        vis['CIG edit'] = cig_edit
        vis['WNDSPD edit'] = wspd_edit
        vis['WDIR edit'] = wdir_edit
        vis['WX edit'] = wx_edit

        # Extend here if the time series itself is needed
        columns = [
            'ICAO', 'date', 'VIS_before', 'VIS_after', 'VIS edit',
            'CIG_before', 'CIG_after', 'CIG edit',
            'WNDSPD_before', 'WNDSPD_after', 'WNDSPD edit',
            'WNDDIR_before', 'WNDDIR_after', 'WDIR edit',
            'WX_before', 'WX_after', 'WX edit'
        ]
        vis = vis[columns]

        df_edit = pd.DataFrame([[
            icao, len(vis_edit),
            len(vis_edit[vis_edit == '*']), vis_edit_rate,
            len(cig_edit[cig_edit == '*']), cig_edit_rate,
            len(wspd_edit[wspd_edit == '*']), wspd_edit_rate,
            len(wdir_edit[wdir_edit == '*']), wdir_edit_rate,
            len(wx_edit[wx_edit == '*']), wx_edit_rate
        ]], columns=[
            'ICAO', 'All', 'Vis edit', 'VIS edit rate', 'CIG edit',
            'CIG edit rate', 'WNDSPD edit', 'WNDSPD edit rate', 'WDIR edit',
            'WDIR edit rate', 'WX edit', 'WX edit rate'
        ])
        df_edit.to_csv('%s/edit_rate_%s_06_23.csv' % (save_dir, icao), index=False)

        df_edit_all = pd.concat([df_edit_all, df_edit])

    df_edit_all = df_edit_all.round(3)
    df_edit_all.to_csv('%s/edit_rate_all_06_23.csv' % save_dir, index=False)
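# Illustrative, self-contained sketch (not part of the original module) of the
# step-snapping done by the while-loops in edit_rate_06_23 above: values in
# (range_[i], range_[i + 1]] are replaced by values_[i].  np.digitize with
# right=True gives the same mapping in a single call.  The abridged lists and
# sample values below are made up for demonstration.
def _snap_to_steps_example():
    import numpy as np
    range_ = [0, 25, 75, 125, 175]   # abridged stand-in for vis_range
    values_ = [0, 50, 100, 150]      # abridged stand-in for vis_values
    v = np.asarray([10., 130., 175.])
    snapped = np.asarray(values_)[np.digitize(v, range_[1:], right=True)]
    # snapped == [0, 150, 150]
    return snapped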
def msm_airport_ft0(icaos):
    import re
    import glob
    import gc
    import pickle
    import pygrib
    import skynet.nwp2d as npd
    from skynet import MSM_INFO, MSM_DATA_DIR

    latlon = npd.msm.get_airport_latlon(icaos)
    sf_latlon_idx = npd.msm.latlon_to_indices(latlon, layer='surface')
    up_latlon_idx = npd.msm.latlon_to_indices(latlon, layer='upper')

    tagid_list = [
        tagid for tagid in MSM_INFO.keys() if re.match(r'4002200', tagid)
    ]
    tagid_list.sort()

    df_airports = {icao: npd.NWPFrame() for icao in icaos}
    for icao in icaos:
        for tagid in tagid_list:
            meta = MSM_INFO[tagid]
            layer = meta['layer']
            path = '%s/%s/bt%s/vt%s%s' % (
                MSM_DATA_DIR, layer, meta['base time'],
                meta['first validity time'], meta['last validity time'])
            path_list = glob.glob('%s/201*' % path)
            path_list.sort()

            for p in path_list:
                print(p)
                msm_files = glob.glob('%s/201*' % p)
                msm_files.sort()

                for f in msm_files:
                    grbs = pygrib.open(f)

                    if layer == 'surface':
                        grb = grbs.select()[0]
                        if grb is None:
                            continue
                        date = grb.validDate.strftime("%Y-%m-%d %H:%M")
                        param = grb.parameterName
                        lat = sf_latlon_idx[icao][0]
                        lon = sf_latlon_idx[icao][1]
                        df_airports[icao].loc[date, param] = grb.values[lat, lon]
                        del grb
                        gc.collect()

                    if layer == 'upper':
                        grb = grbs.select()[0]
                        if grb is None:
                            continue
                        date = grb.validDate.strftime("%Y-%m-%d %H:%M")
                        param = grb.parameterName[:4] + str(grb.level)
                        lat = up_latlon_idx[icao][0]
                        lon = up_latlon_idx[icao][1]
                        df_airports[icao].loc[date, param] = grb.values[lat, lon]
                        del grb
                        gc.collect()

                    grbs.close()

        df_airports[icao].to_csv(
            '/Users/makino/PycharmProjects/SkyCC/data/msm_airport/%s.csv' % icao)

    pickle.dump(
        df_airports,
        open('/Users/makino/PycharmProjects/SkyCC/data/all_airport.pkl', 'wb'))
def main():
    import skynet.nwp2d as npd
    import skynet.datasets as skyds
    from skynet import DATA_DIR, USER_DIR

    os.makedirs(os.getcwd() + "/confusion_matrix", exist_ok=True)

    icao = 'RJFK'
    '''
    'RJOT', 'RJAA', 'RJSC', 'RJSI', 'RJSK', 'RJSM', 'RJSN', 'RJSS',
    'RJTT', 'ROAH', 'RJOC', 'RJOO',
    # 'RJBB', 'RJCC', 'RJCH', 'RJFF', 'RJFK', 'RJGG', 'RJNK', 'RJOA',
    '''

    data_dir = '%s/ARC-common/fit_input/JMA_MSM/vis' % DATA_DIR
    model_dir = '%s/ARC-common/fit_output/JMA_MSM/vis' % DATA_DIR
    model_name = 'GLOBAL_METAR-%s.vis' % icao
    data_name = 'GLOBAL_METAR-%s.vis' % icao
    month_keys = ['month:1-2', 'month:3-4', 'month:5-6',
                  'month:7-8', 'month:9-10', 'month:11-12']

    X = npd.NWPFrame(pd.read_csv(
        '/Users/makino/PycharmProjects/SkyCC/data/skynet/test_%s.csv' % icao, sep=','))

    # Preprocessing
    # X = preprocessing(X)
    # print(msm_data)

    # Split the data by period
    spX = skyds.convert.split_time_series(X, X['month'], date_fmt='%m')

    # metar
    metar = set_visibility_metar(icao)
    # metar = sync_values(base=metar, x=X[["visibility_rank"]])
    spmetar = skyds.convert.split_time_series(
        metar, metar["date"], date_fmt='%Y%m%d%H%M'
    )

    # area_forecast
    af = set_visibility_area_forecast(icao)
    spaf = skyds.convert.split_time_series(
        af, date=af["date"], date_fmt='%Y%m%d%H%M'
    )

    # human edit
    he = set_visibility_human_edit(icao)
    sphe = skyds.convert.split_time_series(
        he, date=he["date"], date_fmt='%Y%m%d%H%M'
    )

    # Prepare the models
    '''
    clfs = {}
    model_dir = '%s/PycharmProjects/SkyCC/trained_models' % USER_DIR
    for i_term, key in enumerate(spX):
        clfs[key] = [
            pickle.load(
                open("%s/%s/forest/%s/rf%03d.pkl" % (model_dir, icao, key, i), "rb"))
            for i in range(N_CLF[i_term])
        ]

    clfs = {}
    for i_term, key in enumerate(spX):
        os.makedirs('%s/%s/stacking' % (model_dir, key), exist_ok=True)
        clfs[key] = pickle.load(
            open('%s/%s.pkl' % (model_dir, model_name), 'rb'))
    '''
    clfs = pickle.load(open('%s/%s.pkl' % (model_dir, model_name), 'rb'))

    # Parameters
    confidence_list = [10, 20, 30, 40, 50, 60, 70, 80, 90]
    confusion_matrix_threshold = [0, 1, 2, 3, 4, 5, 6, 7, 8]

    score = pd.DataFrame()
    for t_num, threshold in enumerate(confidence_list):
        # Predict per period (with confidence factors)
        sppred = predict_by_period(spX, clfs, icao, smooth=False, confidence=True)

        # Convert the index of X to date
        # X.index = X.strtime_to_datetime('date', fmt='%Y-%m-%d %H:%M')

        # Check which points were edited
        all_samples = 0
        for key in sphe:
            idx_edit = extract_different_index(sphe[key]["visibility_rank"],
                                               spaf[key]["visibility_rank"])
            edit = np.array(["" for _ in range(len(sphe[key]))])
            edit[idx_edit] = "*"
            sphe[key]["edit"] = edit
            all_samples += len(idx_edit)

        # vis table per period
        spvis = {}
        drop_list = [
            "metar_visibility", "metar_visibility_rank",
            "human_visibility", "human_visibility_rank",
            "skynet_visibility", "skynet_visibility_rank", 'tmp'
        ]
        for key in sppred:
            vis = make_vis_table(metar=spmetar[key], he=sphe[key], ml=sppred[key])
            vis["skynet"] = np.round(vis["skynet_visibility_rank"]).astype(int)
            vis["metar"] = vis["metar_visibility_rank"].astype(int)
            vis["human"] = vis["human_visibility_rank"].astype(int)

            import matplotlib.pyplot as plt
            plt.figure()
            plt.plot(vis['metar_visibility_rank'].values)
            plt.plot(vis['skynet_visibility_rank'].values)
            plt.show()

            vis = vis.rename(columns={"edit": "tmp"})
            vis["edit"] = vis["tmp"]
            vis = vis.drop(drop_list, axis=1)
            spvis[key] = vis
            spvis[key].insert(0, 'date', spX[key]['date'].values)

        # Drop predictions whose confidence factor falls below the threshold
        samples = 0
        for key in spvis:
            os.makedirs(os.getcwd() + "/confidence_factor/%s/%s" % (key, icao),
                        exist_ok=True)
            confidence_map = spvis[key].loc[:, range(9)].values
            idx = confidence_map.argmax(axis=1)
            c_max = np.array([c[i] for i, c in zip(idx, confidence_map)])
            spvis[key] = spvis[key].iloc[c_max >= threshold]
            spvis[key].to_html(
                os.getcwd() + "/confidence_factor/%s/%s/%s_%s.html" % (key, icao, icao, threshold))

            edit = spvis[key]["edit"]
            samples += len([e for e in edit if e == "*"])

        print("all sample :", all_samples)
        print("samples :", samples)
        print("samples / all samples = %0.3f" % (samples / all_samples))
        print()

        # Confusion matrices per period
        for key in sppred:
            idx = spvis[key].index
            sppred[key] = sppred[key].loc[idx]

        cfm_he, cfm_ml, cfm_heml = make_conf_mat_by_period(
            metar=spmetar, he=sphe, ml=sppred,
            threats=confusion_matrix_threshold)

        cfm_he1y = 0
        cfm_ml1y = 0
        for key in cfm_he:
            cfm_he1y += cfm_he[key]
            cfm_ml1y += cfm_ml[key]

        os.makedirs(
            os.getcwd() + "/confusion_matrix/%dx%d/metar_vs_human/%s" % (
                len(confusion_matrix_threshold), len(confusion_matrix_threshold), icao),
            exist_ok=True
        )
        os.makedirs(
            os.getcwd() + "/confusion_matrix/%dx%d/metar_vs_ml/%s" % (
                len(confusion_matrix_threshold), len(confusion_matrix_threshold), icao),
            exist_ok=True
        )
        cfm_he1y.to_html(
            os.getcwd() + "/confusion_matrix/%dx%d/metar_vs_human/%s/%s_%s.html" % (
                len(confusion_matrix_threshold), len(confusion_matrix_threshold),
                icao, icao, threshold),
        )
        cfm_ml1y.to_html(
            os.getcwd() + "/confusion_matrix/%dx%d/metar_vs_ml/%s/%s_%s.html" % (
                len(confusion_matrix_threshold), len(confusion_matrix_threshold),
                icao, icao, threshold),
        )

        print(cfm_he1y)
        print()
        print(cfm_ml1y)
        print()

        mat = cfm_ml1y.values
        rs = mat[0, 0] / (mat[0, 0] + mat[0, 1])
        ps = mat[0, 0] / (mat[0, 0] + mat[1, 0])
        f1 = 2 * rs * ps / (rs + ps)
        print("total: f1 = %0.3f, recall = %0.3f, precision = %0.3f" % (f1, rs, ps))
        print()

        score = score.append([
            [
                threshold,
                # all_samples,
                # samples,
                # samples / all_samples,
                f1,
                rs,
                ps
            ]
        ])

    score.columns = ["confidence",
                     # "number of edit",
                     # "edit reduction", "%",
                     "f1", "recall", "precision"]
    score = score.round(3)
    # score["%"] *= 100
    print(score)

    os.makedirs(os.getcwd() + "/score", exist_ok=True)
    score.to_html(os.getcwd() + "/score/%s.html" % icao, index=False)
def main():
    import skynet.nwp2d as npd
    import skynet.datasets as skyds
    from skynet import DATA_DIR
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import f1_score

    icao = "RJAA"

    train_data_dir = '%s/MSM/airport.process' % DATA_DIR
    test_data_dir = '%s/skynet' % DATA_DIR

    train = skyds.read_csv('%s/%s.csv' % (train_data_dir, icao))
    test = skyds.read_pkl('%s/test_%s.pkl' % (test_data_dir, icao))

    test['date'] = test['date'].astype(int).astype(str)
    test = npd.NWPFrame(test)
    test.strtime_to_datetime('date', '%Y%m%d%H%M', inplace=True)
    test.datetime_to_strtime('date', '%Y-%m-%d %H:%M', inplace=True)
    df_date = test.split_strcol(
        'date', ['year', 'month', 'day', 'hour', 'min'], r'[-\s:]'
    )[['month', 'day', 'hour', 'min']].astype(int)
    test = pd.concat([df_date, test], axis=1)

    fs = skyds.get_init_features()
    target = skyds.get_init_target()

    train = train[fs + target]
    test = test[fs + target]

    train = train[(train['month'] == 1) | (train['month'] == 2)]
    test = test[(test['month'] == 1) | (test['month'] == 2)]

    X = train.iloc[:, :-1]
    y = train.iloc[:, -1]

    ss = StandardScaler()
    X = ss.fit_transform(X)
    y = y.values

    X, y = skyds.convert.balanced(X, y)

    spX, spy = skyds.convert.split_blocks(X, y, n_folds=5)
    print(spX)
    # Note: the split below overwrites the result of split_blocks above
    spX, spy = preprocess.split(X, y, n_folds=5)

    X = pd.concat([spX[n] for n in spX if n != 0]).reset_index(drop=True)
    y = pd.concat([spy[n] for n in spy if n != 0]).reset_index(drop=True)
    X_test = spX[0].reset_index(drop=True)
    y_test = spy[0].reset_index(drop=True)

    from sklearn.ensemble import RandomForestClassifier
    clf1 = RandomForestClassifier(max_features=2)
    clf2 = SkySVM()
    meta = LogisticRegression()

    # Fit
    # (Note) the data is not balanced here
    sta = SkyStacking((clf1, clf2), meta)
    sta.fit(X, y)
    p = sta.predict(X_test)

    clf1.fit(X.values, y.values[:, 0])
    print(np.array(X.keys())[np.argsort(clf1.feature_importances_)[::-1]])
    p_rf = clf1.predict(X_test.values)

    # Stacking with mlxtend
    sc = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta)
    sc.fit(X.values, y.values[:, 0])
    p_sc = sc.predict(X_test.values)

    y_test = np.where(y_test.values[:, 0] > 1, 0, 1)
    p = np.where(p > 1, 0, 1)
    p_rf = np.where(p_rf > 1, 0, 1)
    p_sc = np.where(p_sc > 1, 0, 1)

    f1 = f1_score(y_true=y_test, y_pred=p)
    print("stacking", f1)

    f1_rf = f1_score(y_true=y_test, y_pred=p_rf)
    print("random forest", f1_rf)

    f1_sc = f1_score(y_true=y_test, y_pred=p_sc)
    print("stacked classifier", f1_sc)