def extract_feature(start_dt, end_dt, pair, candle_types, df_all=None):
    features_all = None
    for candle_type in candle_types:
        params = get_params(candle_type)
        if df_all is not None:
            logger.debug('candle type: {}'.format(candle_type))
            candles = df_all[candle_type]
            candles.index = candles.timestamp.map(lambda x: datetime.fromtimestamp(x / 1000))
            # Compute features for each timestamp (in parallel)
            args = [(candles[(d - timedelta(minutes=130) <= candles.index) & (candles.index <= d)], params, candle_type, d)
                    for d in datetimerange(str2dt(start_dt), str2dt(end_dt) + timedelta(minutes=1))]
            tmp_features = multi_process(args)
            # Keep only the rows for the requested timestamps
            features = None
            dts = [d for d in datetimerange(str2dt(start_dt), str2dt(end_dt) + timedelta(minutes=1))]
            for dt, tmp_feature in zip(dts, tmp_features):
                feature = tmp_feature[tmp_feature.index == dt]
                if features is None:
                    features = feature
                else:
                    features = pd.concat([features, feature])
            del tmp_features
            gc.collect()
        else:
            # Extend the window back by 130 minutes so rolling features have enough history
            start_dt_ext = (datetime.strptime(start_dt, '%Y-%m-%d %H:%M:%S') - timedelta(minutes=130)).strftime('%Y-%m-%d %H:%M:%S')
            candles = API().get_candles(pair, candle_type=candle_type, start_dt=start_dt_ext, end_dt=end_dt)
            candles = pd.DataFrame(candles, columns=['open', 'high', 'low', 'close', 'volume', 'timestamp'])
            candles.index = candles.timestamp.map(lambda x: datetime.fromtimestamp(x / 1000))
            candles.to_csv('candles_{}_{}.csv'.format(end_dt, candle_type))
            features = _extract_feature(candles, params, candle_type, end_dt)
            features.to_csv('features_{}_{}.csv'.format(end_dt, candle_type))
            '''
            features = features.loc[(start_dt <= features.index) & (features.index <= end_dt)]
            '''
        features.columns = [c + '_' + candle_type for c in features.columns]
        if features_all is None:
            features_all = features
        else:
            features_all = pd.concat([features_all, features], axis=1)
            features_all = features_all.fillna(method='ffill')
        del features
        gc.collect()
    return features_all
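# The helpers str2dt / dt2str / format_dt / datetimerange / multi_process are used
# throughout but not defined in this excerpt. A minimal sketch of what they are
# assumed to do (names are the project's, bodies and signatures are assumptions
# inferred from the call sites above, not the actual implementation):
from datetime import datetime, timedelta
from multiprocessing import Pool


def str2dt(s, fmt='%Y-%m-%d %H:%M:%S'):
    # Parse 'YYYY-mm-dd HH:MM:SS' into a datetime.
    return datetime.strptime(s, fmt)


def dt2str(dt, fmt='%Y-%m-%d %H:%M:%S'):
    # Format a datetime back to string; sub-second precision is dropped.
    return dt.strftime(fmt)


def format_dt(s, fmt_from, fmt_to):
    # Re-render a datetime string from one format into another.
    return datetime.strptime(s, fmt_from).strftime(fmt_to)


def datetimerange(start, end, step=timedelta(minutes=1)):
    # Yield datetimes in [start, end) at a fixed step; extract_feature iterates
    # minute by minute, so the default step is one minute.
    cur = start
    while cur < end:
        yield cur
        cur += step


def multi_process(args, processes=4):
    # Fan the per-timestamp feature extraction out over a process pool. Assumes
    # _extract_feature(candles, params, candle_type, dt), matching the tuples
    # built in extract_feature above.
    with Pool(processes) as pool:
        return pool.starmap(_extract_feature, args)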
def save_depth(pair='xrp_jpy', db=0):
    pool = redis.ConnectionPool(host='localhost', port=6379, db=db)
    conn = redis.StrictRedis(connection_pool=pool)
    depth = API().get_depth(pair)
    # Round-trip through a string to drop sub-second precision from the timestamp
    dt = str2dt(format_dt(dt2str(datetime.now()), '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S'))
    conn.set(dt2str(dt), json.dumps(depth))
    logger.debug('save depth at timestamp: {}'.format(dt))
def save_transactions(pair='xrp_jpy', db=1):
    pool = redis.ConnectionPool(host='localhost', port=6379, db=db)
    conn = redis.StrictRedis(connection_pool=pool)
    end_dt = str2dt(format_dt(dt2str(datetime.now()), '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S'))
    start_dt = end_dt - timedelta(seconds=5)
    #logger.debug('get transactions from {} to {}'.format(start_dt, end_dt))
    transactions = API().get_transactions(pair, start_dt, end_dt)
    conn.set(dt2str(end_dt), json.dumps(transactions))
    logger.debug('save transactions at timestamp: {}'.format(end_dt))
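# save_depth / save_transactions each capture a single snapshot, and
# save_transactions looks back exactly 5 seconds, so the collector presumably runs
# on a 5-second cadence. A minimal scheduling sketch (the loop itself is an
# assumption; the project may drive this with cron or a scheduler library instead):
import time


def collect_forever(interval_sec=5):
    while True:
        save_depth(pair='xrp_jpy', db=0)
        save_transactions(pair='xrp_jpy', db=1)
        time.sleep(interval_sec)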
def worker():
    pair = 'xrp_jpy'
    try:
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        conn = redis.StrictRedis(connection_pool=pool)
        end_dt = str2dt(format_dt(dt2str(datetime.now()), '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S'))
        start_dt = end_dt - timedelta(seconds=5)
        logger.debug('get transactions from {} to {}'.format(start_dt, end_dt))
        transactions = API().get_transactions(pair, start_dt, end_dt)  # fetched but not persisted here
        depth = API().get_depth(pair)
        conn.set(depth['timestamp'], json.dumps(depth))
        logger.debug('save depth at timestamp: {}'.format(depth['timestamp']))
    except Exception:
        logger.error(traceback.format_exc())
        logger.debug('process reboot')
        # Replace the current process with a fresh interpreter to recover from any error
        os.execv(sys.executable, [sys.executable] + ['collect/collect_depth.py'])
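# The API class (get_candles / get_depth / get_transactions) is not shown in this
# excerpt. The surface assumed by the code above, as a sketch with signatures
# inferred from the call sites (not from the actual implementation):
class API:
    def get_candles(self, pair, candle_type, start_dt, end_dt):
        # Returns rows of (open, high, low, close, volume, timestamp-in-ms),
        # per the DataFrame construction in extract_feature.
        ...

    def get_depth(self, pair):
        # Returns a JSON-serializable order-book dict including a 'timestamp' key.
        ...

    def get_transactions(self, pair, start_dt, end_dt):
        # Returns JSON-serializable trades executed between start_dt and end_dt.
        ...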
def main():
    is_create_data = False
    is_feature_extraction = False
    is_train = False
    pair = 'xrp_jpy'
    start_dt = '2019-01-01 00:00:00'
    end_dt = '2019-02-02 17:59:59'
    train_test_split = '2019-02-01 00:00:00'
    threshold = 0.1
    candle_types = ['1min', '5min']
    #candle_types = ['1min']

    # Fetch the data
    candles_all = {}
    prefix = '{}_{}'.format(format_dt(start_dt, '%Y-%m-%d %H:%M:%S', '%Y%m%d%H%M'),
                            format_dt(end_dt, '%Y-%m-%d %H:%M:%S', '%Y%m%d%H%M'))
    if is_create_data:
        start_dt_create_data = dt2str(str2dt(start_dt) - timedelta(minutes=130))
        for candle_type in candle_types:
            candles_all[candle_type] = create_dataset_api(start_dt_create_data, end_dt, pair, candle_type)
            with open('ml/input/{}_candles_api_{}.pkl'.format(prefix, candle_type), 'wb') as f:
                pickle.dump(candles_all[candle_type], f)
    else:
        for candle_type in candle_types:
            with open('ml/input/{}_candles_api_{}.pkl'.format(prefix, candle_type), 'rb') as f:
                candles_all[candle_type] = pickle.load(f)

    if is_feature_extraction:
        features = extract_feature(start_dt, end_dt, 'xrp_jpy', candle_types, df_all=candles_all)
        features.to_pickle('ml/input/{}_features.pkl'.format(prefix))
    else:
        with open('ml/input/{}_features.pkl'.format(prefix), 'rb') as f:
            features = pickle.load(f)

    # Build the target labels
    target = create_target(candles_all['1min'], start_dt, end_dt)
    data = pd.concat([features, target], axis=1)
    data.columns = features.columns.tolist() + ['target']
    data = data.loc[~data.target.isnull()]
    #data_pos = data[data['target'] == 0]
    #data_neg = data[data['target'] == 1]
    #data = pd.concat([data_pos.sample(n=data_neg.shape[0], random_state=0), data_neg])

    train_mask = data.index < train_test_split
    test_mask = data.index >= train_test_split
    X_train = data.loc[train_mask, [c for c in data.columns if c != 'target']]
    y_train = data.loc[train_mask, ['target']]
    X_test = data.loc[test_mask, [c for c in data.columns if c != 'target']]
    y_test = data.loc[test_mask, ['target']]
    X_test.to_csv('X_test.csv')

    if is_train:
        clf = train(X_train, y_train, prefix)
        with open('ml/model/{}_clf_binary.pkl'.format(prefix), 'wb') as f:
            pickle.dump(clf, f)
    else:
        #with open('ml/model/{}_clf_binary.pkl'.format(prefix), 'rb') as f:
        with open('ml/model/201901010000_201902021759_clf_binary.pkl', 'rb') as f:
            clf = pickle.load(f)

    y_label = clf.predict(X_train)
    y_pred = clf.predict_proba(X_train)[:, 1]
    auc = metrics.roc_auc_score(y_train, y_pred)
    confusion_mat = confusion_matrix(y_train, y_label)
    # sklearn's confusion_matrix ravels as (tn, fp, fn, tp)
    tn, fp, fn, tp = confusion_mat.ravel()
    accuracy = (tp + tn) / (tp + fn + fp + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    logger.debug('Train: (auc, accuracy, precision, recall) = ({}, {}, {}, {})'.format(auc, accuracy, precision, recall))
    norm_mean = y_pred.mean()
    norm_std = y_pred.std()

    y_label = clf.predict(X_test)
    y_pred = clf.predict_proba(X_test)[:, 1]
    auc = metrics.roc_auc_score(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_label)
    tn, fp, fn, tp = confusion_mat.ravel()
    accuracy = (tp + tn) / (tp + fn + fp + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    logger.debug('Test: (auc, accuracy, precision, recall) = ({}, {}, {}, {})'.format(auc, accuracy, precision, recall))

    X_test[[c for c in X_test.columns if '1min' in c]].to_csv('X_test_1min.csv')
    X_test[[c for c in X_test.columns if '5min' in c]].to_csv('X_test_5min.csv')
    print(pd.Series(y_pred[:-1], index=X_test.index[1:]))

    # Relationship between the investment indicator (predicted probability) and returns
    #norm_mean = 0.4981990534953142
    #norm_std = 0.28677379837073347
    z_scores = (y_pred - norm_mean) / norm_std
    logger.debug('mean: {}, std: {}'.format(norm_mean, norm_std))
    #spread = 0.062
    spread = 0
    # Note: 'buy' is the price the counterparty buys at, i.e. the price we sell at;
    # 'sell' is the price the counterparty sells at, i.e. the price we buy at
    prices = pd.DataFrame({'buy': ((100 - spread) / (100 + spread)) * X_test['close_1min'],
                           'sell': X_test['close_1min']})
    profits = -X_test['close_1min'].diff(-1)
    profits.iloc[-1] = 0  # fill the NaN that the shifted diff leaves at the last row

    # Correlation between the z-score and realized profit
    logger.debug(np.corrcoef(z_scores, profits))
    #pd.concat([pd.concat([profits, y_test], axis=1), pd.Series(y_pred, index=y_test.index)], axis=1).to_csv('test.csv')

    # Simulate the asset trajectory
    dts = prices.index
    import itertools
    threshold_uppers = [0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5]
    threshold_lowers = [-0.1, -0.3, -0.5, -0.7, -0.9, -1.1, -1.3, -1.5]
    for threshold_upper, threshold_lower in itertools.product(threshold_uppers, threshold_lowers):
        # In live trading the computed indicator is one minute stale, so shift it by one minute
        assets = get_assets_change(z_scores[:-1], prices[1:], dts[1:], threshold_upper, threshold_lower)
        print('{} {} {}'.format(threshold_upper, threshold_lower, assets[-1]))

    # Visualization
    fig, ax = plt.subplots(3, 1, figsize=(12, 18))
    ax[0].scatter(z_scores, profits, alpha=0.3)
    ax[0].set_title('predict probability vs profit after 1min')
    ax[0].set_xlabel('predict probability')
    ax[0].set_ylabel('profit after 1min')
    ax[1].hist(z_scores)
    ax[1].set_title('predict probability')
    ax[2].plot(dts[1:], assets)
    ax[2].set_title('assets')
    #plt.show()
    pd.Series(y_pred).to_csv('y_pred.csv')
    X_test.to_csv('X_test.csv')
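# Two helpers used above are worth sketching here.
#
# (1) The AUC/accuracy/precision/recall block in main() is written out twice, once
# for train and once for test. A helper (the name log_metrics is mine, not the
# project's) keeps the two in sync and makes sklearn's (tn, fp, fn, tp) ravel
# order explicit:
from sklearn import metrics
from sklearn.metrics import confusion_matrix


def log_metrics(split_name, clf, X, y):
    y_label = clf.predict(X)
    y_pred = clf.predict_proba(X)[:, 1]
    auc = metrics.roc_auc_score(y, y_pred)
    tn, fp, fn, tp = confusion_matrix(y, y_label).ravel()
    accuracy = (tp + tn) / (tp + fn + fp + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    logger.debug('{}: (auc, accuracy, precision, recall) = ({}, {}, {}, {})'.format(
        split_name, auc, accuracy, precision, recall))
    return y_pred


# (2) get_assets_change is called in main() but not defined in this excerpt. A
# minimal backtest sketch under assumed semantics: open a long position when the
# z-score rises past threshold_upper, close it when the z-score falls past
# threshold_lower, trading at the 'sell'/'buy' prices defined above. The project's
# actual logic may differ.
def get_assets_change(z_scores, prices, dts, threshold_upper, threshold_lower, initial_asset=1.0):
    # dts is accepted to mirror the call site; this sketch does not need it.
    assets = []
    asset = initial_asset
    position = 0.0  # units of the coin currently held
    for z, (_, price) in zip(z_scores, prices.iterrows()):
        if z >= threshold_upper and position == 0.0:
            position = asset / price['sell']  # buy at the price the counterparty sells at
            asset = 0.0
        elif z <= threshold_lower and position > 0.0:
            asset = position * price['buy']   # sell at the price the counterparty buys at
            position = 0.0
        assets.append(asset + position * price['buy'])  # mark to market each step
    return assets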
            dt = datetime.strptime(file, '%Y%m%d%H%M%S')
        else:
            dt = datetime.strptime(file.replace(prefix, ''), '%Y%m%d%H%M%S')
        if (start_dt <= dt) and (dt <= end_dt):
            print('read file: {}'.format(file))
            df = pd.read_csv(os.path.join(dirpath, file), header=None)
            if ret is None:
                ret = df
            else:
                ret = pd.concat([ret, df])
    return ret


if __name__ == '__main__':
    start_dt = str2dt('2019-04-06 19:00:00')
    end_dt = str2dt('2019-04-06 21:00:00')

    # ticker, ohlcv
    executions = read_csvs('collect/executions', start_dt, end_dt, pattern='execution.*', prefix='execution.')
    executions.columns = ['timestamp', 'datetime_utc_bitmex', 'side', 'price', 'amount', 'datetime', 'datetime_jst_bitmex']
    executions = executions.drop(['datetime', 'datetime_utc_bitmex', 'datetime_jst_bitmex'], axis=1)
    executions['side'] = executions['side'].str.replace(' ', '')  # TODO: remove later
    ticker = generate_ticker(executions)
    ohlcv = generate_ohlcv(executions, candle_type='1s')
    active_ohlcv_1m = generate_active_ohlcv(ohlcv, candle_type='1m')
    active_ohlcv_5m = generate_active_ohlcv(ohlcv, candle_type='5m')
    active_ohlcv_1h = generate_active_ohlcv(ohlcv, candle_type='1h')

    # orderbook
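# generate_ticker / generate_ohlcv / generate_active_ohlcv are not defined in this
# excerpt. A minimal sketch of OHLCV generation from the executions frame above,
# assuming 'timestamp' is in milliseconds (an assumption carried over from the
# candle code earlier, which divides timestamps by 1000):
import pandas as pd


def generate_ohlcv_sketch(executions, candle_type='1s'):
    df = executions.copy()
    df.index = pd.to_datetime(df['timestamp'], unit='ms')
    # Resample trade prices into open/high/low/close bars and sum traded amounts
    ohlcv = df['price'].resample(candle_type).ohlc()
    ohlcv['volume'] = df['amount'].resample(candle_type).sum()
    return ohlcv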