def main():
    """Debug harness that repeatedly runs ``package.function``.

    Takes two optional command-line arguments: ``package`` (the module that
    contains the function) and ``function`` (the callable to debug).  Any
    argument that is not given on the command line is asked for
    interactively; an empty answer aborts.  After every run the user may
    press Enter to run the function again or type anything to stop.

    Exceptions raised while resolving or running the function are logged
    (with traceback) and do not terminate the loop.
    """
    # Local import keeps this block self-contained.
    import importlib

    parser = argparse.ArgumentParser(
        description='用于调试package.function。'
                    '每次执行完,都可以选择继续执行,或者终止执行')
    # nargs='?' makes the positionals optional; without it argparse rejects
    # a missing argument and the interactive prompts below are unreachable.
    parser.add_argument('package', nargs='?', default=None,
                        help='需要调试的函数所在的包')
    parser.add_argument('function', nargs='?', default=None,
                        help='需要调试的函数')
    args = parser.parse_args()

    package = args.package
    func = args.function
    if package is None:
        package = input('请输入包名: ')
        if not package:
            return
    if func is None:
        func = input('请输入函数名: ')
        if not func:
            return

    # Import the module that holds the function under debug.  importlib is
    # used instead of exec('import ...'): it does not depend on exec()
    # writing into the function's locals and never executes the
    # command-line text as code.
    module = importlib.import_module(package)

    # Initialise the data management object.
    Repo()

    while True:
        try:
            # Resolve the (possibly dotted) attribute path on every run,
            # mirroring the original "package.function()" evaluation.
            target = module
            for attr in func.split('.'):
                target = getattr(target, attr)
            target()
        except Exception as msg:
            logging.warning(msg, exc_info=True)
        answer = input('\n按回车重新执行%s.%s(),任意字符终止执行.\n'
                       % (package, func))
        if answer:
            break
def get_predict_feature():
    """Build the feature table for the windows that have to be predicted.

    For every (tollgate_id, direction) pair in a fixed list, every listed
    20-minute start time and every day from 2016-10-18 to 2016-10-24, one
    row is created.  ``max_vol``/``min_vol``/``avg_vol`` are copied from the
    ``2016-10-17`` row of the ``features.trivial_feature_v2`` table for the
    same lane and time of day.  ``last2h`` comes from a left merge with
    ``features.last_2hour`` (phase ``'pred'``).

    Returns:
        DataFrame with columns ``tollgate_id, direction, time, hour,
        max_vol, min_vol, avg_vol, last2h, is_work``.

    Raises:
        IndexError: if the training table has no ``2016-10-17`` row for one
            of the requested lane/time combinations.
    """
    r = Repo()
    lst2 = r(features.last_2hour, name='pred', phase='pred')
    tvl = r(features.trivial_feature_v2, name='train')
    enum_cap = [(1, True), (1, False), (2, False), (3, True), (3, False)]
    time_int = ['08:00:00', '08:20:00', '08:40:00',
                '09:00:00', '09:20:00', '09:40:00',
                '17:00:00', '17:20:00', '17:40:00',
                '18:00:00', '18:20:00', '18:40:00']
    dates = ['2016-10-%d' % k for k in range(18, 25)]
    keys = ['tollgate_id', 'direction', 'time']

    # Collect plain dicts and build the frame once; appending to a
    # DataFrame inside the loop copies the whole frame on every iteration.
    rows = []
    for tid, drc in enum_cap:
        for t in time_int:
            # The statistics depend only on lane and time of day, so look
            # them up once instead of once per date.
            index = ((tvl.tollgate_id == tid) & (tvl.direction == drc)
                     & (tvl.time == '2016-10-17 %s' % t))
            ref = tvl[index]
            max_vol = ref.max_vol.values[0]
            min_vol = ref.min_vol.values[0]
            avg_vol = ref.avg_vol.values[0]
            for d in dates:
                rows.append({
                    'tollgate_id': tid,
                    'direction': drc,
                    'time': '%s %s' % (d, t),
                    'max_vol': max_vol,
                    'min_vol': min_vol,
                    'avg_vol': avg_vol,
                })
    res = pd.DataFrame(rows, columns=keys + ['max_vol', 'min_vol', 'avg_vol'])

    f = res.merge(lst2, on=keys, how='left')
    # Hour of day rounded to the nearest hour, wrapped into 0..23.
    f['hour'] = f.time.apply(
        lambda t: round((int(t[11:13]) * 60 + int(t[14:16])) / 60) % 24)
    f['is_work'] = f.time.apply(features.is_work_day)
    f = f.reindex(columns=['tollgate_id', 'direction', 'time', 'hour',
                           'max_vol', 'min_vol', 'avg_vol', 'last2h',
                           'is_work'])
    return f
def volume_grouped(phase='train'):
    """Count volume records per tollgate, direction and 20-minute window.

    Args:
        phase: ``'train'`` reads the training CSV for 2016-09-19 to
            2016-10-17; any other value reads the phase-1 test CSV for
            2016-10-18 to 2016-10-24.

    Returns:
        DataFrame with columns ``tollgate_id, direction, time, volume``
        where ``time`` is the start of the 20-minute window and ``volume``
        is the number of records that fall into it.
    """
    def round_time(t):
        # Floor the minutes to a multiple of 20 and zero the seconds.
        m = int(t[14:16]) // 20 * 20
        return '%s:%02d:00' % (t[:13], m)

    r = Repo()
    path = '../../../dataset/training/volume(table 6)_training.csv'
    dt_start = '2016-09-19'
    dt_end = '2016-10-17'
    # The original compared against the misspelt 'trian', which is always
    # unequal, so the training phase silently loaded the test data.
    if phase != 'train':
        path = '../../../dataset/testing_phase1/volume(table 6)_test1.csv'
        dt_start = '2016-10-18'
        dt_end = '2016-10-24'
    df = r(dataproc.prep_volume, name=phase, path=path,
           dt_start=dt_start, dt_end=dt_end)
    # Keep only the first three columns.
    # NOTE(review): assumed to be tollgate_id, direction, time - confirm
    # against dataproc.prep_volume.
    df = df.iloc[:, [0, 1, 2]].copy()
    df['time'] = df.time.apply(round_time)
    df = (df.groupby(['tollgate_id', 'direction', 'time'])
            .size()
            .reset_index(name='volume'))
    # TODO: smooth holiday outliers.
    return df
def trivial_feature():
    """Max, min and average volume of the earlier days for each time slot.

    For every lane (tollgate_id, direction) and every time of day, the rows
    are walked in chronological order.  Each row after the first yields one
    output record whose ``max_vol``/``min_vol`` are the extremes of all
    earlier rows in the group and whose ``avg_vol`` is the sum of the
    earlier volumes divided by the number of days elapsed since the first
    row of the group.  Groups with fewer than two rows are skipped.

    Returns:
        DataFrame with columns ``tollgate_id, direction, time, hour,
        max_vol, min_vol, avg_vol, volume``.
    """
    r = Repo()
    df = r(volume_grouped, name='train')
    df = df.sort_values('time', ascending=True)
    columns = ['tollgate_id', 'direction', 'time', 'hour',
               'max_vol', 'min_vol', 'avg_vol', 'volume']
    enum_cap = [(1, True), (1, False), (2, False), (3, True), (3, False)]
    one_day = np.timedelta64(1, 'D')

    # Rows are gathered in a list and turned into a frame once; appending
    # to a DataFrame per row copies the whole frame each time.
    rows = []
    for tid, dirc in enum_cap:
        tdf = df[(df.tollgate_id == tid) & (df.direction == dirc)]
        tdf = tdf.drop(['tollgate_id', 'direction'], axis='columns')
        tdf['date'], tdf['time'] = tdf.time.str[:10], tdf.time.str[11:]
        for tm, sdf in tdf.groupby('time'):
            if sdf.shape[0] < 2:
                continue
            dates = sdf['date'].values
            volumes = sdf['volume'].values
            # Hour of day rounded to the nearest hour; constant per group.
            hour = round((int(tm[:2]) * 60 + int(tm[3:5])) / 60) % 24
            st_date = np.datetime64(dates[0])
            sm = min_vol = max_vol = volumes[0]
            for nw_date, t in zip(dates[1:], volumes[1:]):
                span = int((np.datetime64(nw_date) - st_date) / one_day)
                if span <= 0:
                    # A non-positive span means the rows are not in
                    # strictly increasing date order; give up on the group.
                    logging.error('时间未排序!')
                    break
                rows.append({
                    'tollgate_id': tid,
                    'direction': dirc,
                    'time': '%s %s' % (nw_date, tm),
                    'hour': hour,
                    'max_vol': max_vol,
                    'min_vol': min_vol,
                    'avg_vol': sm / span,
                    'volume': t,
                })
                # Fold the current row into the running statistics only
                # after it has been emitted, so it never sees itself.
                sm += t
                min_vol = min(min_vol, t)
                max_vol = max(max_vol, t)
    return pd.DataFrame(rows, columns=columns)
def last_2hour(phase='train'):
    """Volume observed two hours before each time window.

    The grouped volume table is shifted forward by two hours, so a row
    stamped ``T`` carries the volume counted at ``T - 2h``.

    Args:
        phase: forwarded to ``volume_grouped``.  For ``'train'`` the
            shifted table is left-merged onto the original
            (tollgate_id, direction, time) keys, so every original window
            is kept and windows without data two hours earlier get 0.  For
            any other phase the shifted table is returned directly.

    Returns:
        DataFrame with columns ``tollgate_id, direction, time, last2h``.
    """
    r = Repo()
    df = r(volume_grouped, name=phase, phase=phase)
    keys = ['tollgate_id', 'direction', 'time']
    is_train = phase == 'train'
    if is_train:
        # Remember the unshifted keys before the timestamps are moved.
        df_dp = df.drop('volume', axis='columns')

    # Vectorised +2h shift; assign() returns a new frame so the object
    # handed out by Repo is not modified in place.
    shifted = pd.to_datetime(df.time) + pd.Timedelta(hours=2)
    df = df.assign(time=shifted.dt.strftime('%Y-%m-%d %H:%M:%S'))

    if is_train:
        df = df_dp.merge(df, on=keys, how='left')
    df = df.fillna(0)
    # rename(columns=...) relabels the column; rename_axis with a mapping
    # no longer does so in current pandas.
    df = df.rename(columns={'volume': 'last2h'})
    return df
def main(params=None, debug=False):
    """Train, validate and apply a k-nearest-neighbours volume model.

    Steps: build the training table (``trivial_feature_v2`` merged with
    ``last_2hour``), split it with ``dataproc.select_test``, run 3-fold
    cross-validation scored by ``dataproc.official_loss``, fit on the
    training split, report MAPE on the test split, then predict the target
    windows from ``dataproc.get_predict_feature`` and write them with
    ``dataproc.formatResult``.

    Args:
        params: currently unused; the model is a default
            ``KNeighborsRegressor``.  (Kept for callers that pass
            hyper-parameters, as the earlier XGBoost variant did.)
        debug: when true, return right after cross-validation.
    """
    keys = ['tollgate_id', 'direction', 'time']
    r = Repo()
    df1 = r(features.trivial_feature_v2, name='train')
    df1['is_work'] = df1.time.apply(features.is_work_day)
    df2 = r(features.last_2hour, name='train')
    df = r(
        lambda x, y: x.merge(y, on=keys, how='left').reindex(columns=[
            'tollgate_id', 'direction', 'time', 'hour', 'max_vol',
            'min_vol', 'avg_vol', 'last2h', 'is_work', 'volume'
        ]),
        'train', df1, df2)

    # NOTE(review): select_test is given the row count and 10% of it and
    # returns (test_index, train_index) - presumably a hold-out split.
    test_index, train_index = dataproc.select_test(
        df.shape[0], round(df.shape[0] * 0.1))
    train_feature = df.loc[train_index, 'hour':'is_work'].reset_index(drop=True)
    train_label = df.loc[train_index, 'volume'].reset_index(drop=True)

    model = KNeighborsRegressor()
    logging.info('开始交叉验证...')
    scores = cross_val_score(
        model,
        train_feature,
        train_label,
        cv=KFold(n_splits=3, shuffle=False),
        scoring=dataproc.official_loss)
    logging.info('交叉验证结果: %s.', scores)
    if debug:
        return

    logging.info('用所有训练数据训练模型...')
    model.fit(train_feature, train_label)
    logging.info('模型训练完毕.')

    logging.info('开始测试...')
    test_feature = df.loc[test_index, 'hour':'is_work'].reset_index(drop=True)
    test_label = df.loc[test_index, 'volume'].reset_index(drop=True)
    y = model.predict(test_feature)
    test_mape = dataproc.MAPE(y, test_label)
    logging.info('测试结果: %f.', test_mape)

    res_feature = dataproc.get_predict_feature()
    res_y = model.predict(res_feature.loc[:, 'hour':'is_work'])
    # Copy so that adding the prediction column does not write to a slice
    # of res_feature.
    res = res_feature.loc[:, 'tollgate_id':'time'].copy()
    res['volume'] = res_y
    dataproc.formatResult(res, name='knn_%f' % test_mape)
def trivial_feature_v2():
    """Lane-level features: global max, min and mean volume per time slot.

    The statistics are computed over all rows of the training table that
    share the same tollgate, direction and time of day, and are then
    attached to every row of that table.

    Returns:
        DataFrame with columns ``tollgate_id, direction, time, hour,
        max_vol, min_vol, avg_vol, volume``.
    """
    r = Repo()
    keys = ['tollgate_id', 'direction', 'tmp_time']
    df = r(volume_grouped, phase='train', name='train')
    # Time of day ("HH:MM:SS") as a temporary grouping key.  assign()
    # returns a new frame, so the object handed out by Repo is untouched.
    df = df.assign(tmp_time=df.time.str[11:])

    gp = (df.groupby(keys)
            .agg({'volume': ['max', 'min', 'mean']})
            .reset_index())
    # Flatten the two-level column index produced by agg().
    gp.columns = keys + ['max_vol', 'min_vol', 'avg_vol']

    # The same frame is merged with its own statistics; loading the table
    # a second time would yield identical data.
    df = df.merge(gp, on=keys, how='left')
    df = df.drop('tmp_time', axis='columns')
    # Hour of day rounded to the nearest hour, wrapped into 0..23.
    df['hour'] = df.time.apply(
        lambda t: round((int(t[11:13]) * 60 + int(t[14:16])) / 60) % 24)
    df = df.reindex(columns=['tollgate_id', 'direction', 'time', 'hour',
                             'max_vol', 'min_vol', 'avg_vol', 'volume'])
    return df
def formatResult(df, name='basic'):
    """Convert predictions to the submission layout and save them.

    Each ``time`` value (``YYYY-MM-DD HH:MM:SS``) becomes a half-open
    20-minute window string ``[start,end)``.  The result has the columns
    ``tollgate_id, time_window, direction, volume``, with ``tollgate_id``
    and ``direction`` cast to int, sorted by tollgate, direction and
    window, and is handed to ``Repo().saveResult``.

    Args:
        df: frame with ``tollgate_id``, ``direction``, ``time`` and
            ``volume`` columns.  It is not modified.
        name: result name passed to ``Repo().saveResult``.
    """
    # Local import keeps this block self-contained.
    from datetime import datetime, timedelta

    def time_win(dt):
        # Real date arithmetic so that the window end rolls over correctly
        # at hour and day boundaries (e.g. 23:40 -> next day 00:00).
        # Seconds of the end are always written as 00.
        start = datetime.strptime(dt[:16], '%Y-%m-%d %H:%M')
        end = start + timedelta(minutes=20)
        return '[%s,%s)' % (dt, end.strftime('%Y-%m-%d %H:%M:00'))

    out = df.assign(time_window=df.time.apply(time_win))
    out = out.reindex(columns=['tollgate_id', 'time_window', 'direction',
                               'volume'])
    out['direction'] = out.direction.astype(int)
    out['tollgate_id'] = out.tollgate_id.astype(int)
    out = out.sort_values(['tollgate_id', 'direction', 'time_window'])
    Repo().saveResult(out, name)
def main():
    """Debug harness that repeatedly runs ``package.function``.

    Takes two command-line arguments: ``package`` (the module that contains
    the function) and ``function`` (the callable to debug).  After every
    run the user may press Enter to run the function again or type
    anything to stop.  Exceptions raised by the function are logged (with
    traceback) and do not terminate the loop.

    Raises:
        ImportError: if ``package`` cannot be imported.
        AttributeError: if ``package`` has no attribute ``function``.
    """
    # Local import keeps this block self-contained.
    import importlib

    parser = argparse.ArgumentParser(
        description='用于调试package.function。'
                    '每次执行完,都可以选择继续执行,或者终止执行')
    parser.add_argument('package', help='需要调试的函数所在的包')
    parser.add_argument('function', help='需要调试的函数')
    args = parser.parse_args()

    # Import the function under debug.  eval() only accepts expressions,
    # so evaluating a "from ... import ..." statement always raised
    # SyntaxError; resolve the callable programmatically instead.
    target = getattr(importlib.import_module(args.package), args.function)

    # Initialise the data management object.
    Repo()

    while True:
        try:
            target()
        except Exception as msg:
            logging.warning(msg, exc_info=True)
        answer = input('\n按回车重新执行%s.%s(),任意字符终止执行.\n'
                       % (args.package, args.function))
        if answer:
            break
def main():
    """Run ``average`` through a ``Repo`` and save the output as 'baseline'."""
    repo = Repo()
    baseline = repo(average)
    repo.saveResult(baseline, name='baseline')