Example #1
def extract_feature(start_dt, end_dt, pair, candle_types, df_all=None):
    features_all = None
    for candle_type in candle_types:
        params = get_params(candle_type)
        if df_all is not None:
            logger.debug('candle type: {}'.format(candle_type))
            candles = df_all[candle_type]
            candles.index = candles.timestamp.map(lambda x: datetime.fromtimestamp(x / 1000))
            # Compute features for each timestamp (parallel processing)
            args = [(candles[(d - timedelta(minutes=130) <= candles.index) & (candles.index <= d)],
                     params, candle_type, d)
                    for d in datetimerange(str2dt(start_dt), str2dt(end_dt) + timedelta(minutes=1))]
            tmp_features = multi_process(args)

            # Keep only the required timestamps
            features = None
            dts = list(datetimerange(str2dt(start_dt), str2dt(end_dt) + timedelta(minutes=1)))
            for dt, tmp_feature in zip(dts, tmp_features):
                feature = tmp_feature[tmp_feature.index == dt]
                if features is None:
                    features = feature
                else:
                    features = pd.concat([features, feature])

            del tmp_features
            gc.collect()
        else:
            start_dt_ext = dt2str(str2dt(start_dt) - timedelta(minutes=130))
            candles = API().get_candles(pair, candle_type=candle_type, start_dt=start_dt_ext, end_dt=end_dt)
            candles = pd.DataFrame(candles, columns=['open', 'high', 'low', 'close', 'volume', 'timestamp'])
            candles.index = candles.timestamp.map(lambda x: datetime.fromtimestamp(x / 1000))
            candles.to_csv('candles_{}_{}.csv'.format(end_dt, candle_type))
            features = _extract_feature(candles, params, candle_type, end_dt)
            features.to_csv('features_{}_{}.csv'.format(end_dt, candle_type))

        # features = features.loc[(start_dt <= features.index) & (features.index <= end_dt)]
        features.columns = [c + '_' + candle_type for c in features.columns]

        if features_all is None:
            features_all = features
        else:
            features_all = pd.concat([features_all, features], axis=1)
            features_all = features_all.ffill()

        del features
        gc.collect()
    return features_all
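The snippet relies on project helpers that are not shown: str2dt/dt2str convert between '%Y-%m-%d %H:%M:%S' strings and datetime objects, and datetimerange appears to yield one datetime per minute over a half-open interval. A minimal sketch of what those helpers could look like under that assumption (signatures inferred from usage, not taken from the repository):

from datetime import datetime, timedelta

def str2dt(s):
    # Parse a 'YYYY-mm-dd HH:MM:SS' string into a datetime.
    return datetime.strptime(s, '%Y-%m-%d %H:%M:%S')

def dt2str(dt):
    # Inverse of str2dt.
    return dt.strftime('%Y-%m-%d %H:%M:%S')

def datetimerange(start, end, step=timedelta(minutes=1)):
    # Yield datetimes from start (inclusive) to end (exclusive) at a fixed step.
    current = start
    while current < end:
        yield current
        current += step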
Example #2
def save_depth(pair='xrp_jpy', db=0):
    pool = redis.ConnectionPool(host='localhost', port=6379, db=db)
    conn = redis.StrictRedis(connection_pool=pool)

    depth = API().get_depth(pair)
    dt = str2dt(dt2str(datetime.now()))  # round-trip truncates to whole seconds
    conn.set(dt2str(dt), json.dumps(depth))
    logger.debug('save depth at timestamp: {}'.format(dt))
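save_depth stores one JSON order-book snapshot per call under a second-resolution timestamp key, so keys collide if it runs more than once per second. A hedged sketch of a caller polling on a fixed interval (the loop is illustrative, not from the repository):

import time

if __name__ == '__main__':
    while True:
        save_depth(pair='xrp_jpy', db=0)
        time.sleep(1)  # one snapshot per second; faster polling would overwrite keys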
Example #3
def save_transactions(pair='xrp_jpy', db=1):
    pool = redis.ConnectionPool(host='localhost', port=6379, db=db)
    conn = redis.StrictRedis(connection_pool=pool)

    end_dt = str2dt(dt2str(datetime.now()))  # round-trip truncates to whole seconds
    start_dt = end_dt - timedelta(seconds=5)
    #logger.debug('get transactions from {} to {}'.format(start_dt, end_dt))
    transactions = API().get_transactions(pair, start_dt, end_dt)
    conn.set(dt2str(end_dt), json.dumps(transactions))
    logger.debug('save transactions at timestamp: {}'.format(end_dt))
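Reading a snapshot back is symmetric: keys are formatted timestamps, values JSON blobs. A minimal sketch, assuming keys were written by save_transactions above (load_transactions itself is a hypothetical name):

import json
import redis

def load_transactions(key, db=1):
    # Fetch one stored snapshot by its timestamp key; None if absent.
    pool = redis.ConnectionPool(host='localhost', port=6379, db=db)
    conn = redis.StrictRedis(connection_pool=pool)
    raw = conn.get(key)
    return json.loads(raw) if raw is not None else None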
Example #4
def worker():
    pair = 'xrp_jpy'
    try:
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        conn = redis.StrictRedis(connection_pool=pool)

        end_dt = str2dt(dt2str(datetime.now()))  # round-trip truncates to whole seconds
        start_dt = end_dt - timedelta(seconds=5)
        logger.debug('get transactions from {} to {}'.format(start_dt, end_dt))
        transactions = API().get_transactions(pair, start_dt, end_dt)

        depth = API().get_depth(pair)
        conn.set(depth['timestamp'], json.dumps(depth))
        logger.debug('save depth at timestamp: {}'.format(depth['timestamp']))
    except Exception:
        logger.error(traceback.format_exc())
        logger.debug('process reboot')
        os.execv(sys.executable, [sys.executable] + ['collect/collect_depth.py'])
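The except branch replaces the crashed process with a fresh interpreter via os.execv instead of trying to recover in-process. The same idiom in isolation (run_forever is an illustrative name, not from the repository):

import os
import sys
import traceback

def run_forever(task, script_path):
    # Run task() in a loop; on any exception, log it and replace this
    # process image with a freshly started copy of the script.
    try:
        while True:
            task()
    except Exception:
        traceback.print_exc()
        os.execv(sys.executable, [sys.executable, script_path])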
Example #5
def main():
    is_create_data = False
    is_feature_extraction = False
    is_train = False
    pair = 'xrp_jpy'
    start_dt = '2019-01-01 00:00:00'
    end_dt = '2019-02-02 17:59:59'
    train_test_split = '2019-02-01 00:00:00'
    threshold = 0.1

    candle_types = ['1min', '5min']
    #candle_types = ['1min']

    # Fetch the data
    candles_all = {}
    prefix = '{}_{}'.format(format_dt(start_dt, '%Y-%m-%d %H:%M:%S', '%Y%m%d%H%M'), format_dt(end_dt, '%Y-%m-%d %H:%M:%S', '%Y%m%d%H%M'))
    if is_create_data:
        start_dt_create_data = dt2str(str2dt(start_dt) - timedelta(minutes=130))
        for candle_type in candle_types:
            candles_all[candle_type] = create_dataset_api(start_dt_create_data, end_dt, pair, candle_type)
            with open('ml/input/{}_candles_api_{}.pkl'.format(prefix, candle_type), 'wb') as f:
                pickle.dump(candles_all[candle_type], f)
    else:
        for candle_type in candle_types:
            with open('ml/input/{}_candles_api_{}.pkl'.format(prefix, candle_type), 'rb') as f:
                candles_all[candle_type] = pickle.load(f)

    if is_feature_extraction:
        features = extract_feature(start_dt, end_dt, pair, candle_types, df_all=candles_all)
        features.to_pickle('ml/input/{}_features.pkl'.format(prefix))
    else:
        with open('ml/input/{}_features.pkl'.format(prefix), 'rb') as f:
            features = pickle.load(f)

    # Build the target labels
    target = create_target(candles_all['1min'], start_dt, end_dt)
    data = pd.concat([features, target], axis=1)
    data.columns = features.columns.tolist() + ['target']
    data = data.loc[~data.target.isnull()]
    #data_pos = data[data['target'] == 0]
    #data_neg = data[data['target'] == 1]
    #data = pd.concat([data_pos.sample(n=data_neg.shape[0], random_state=0), data_neg])

    train_mask = data.index < train_test_split
    test_mask = data.index >= train_test_split
    X_train = data.loc[train_mask, [c for c in data.columns if c != 'target']]
    y_train = data.loc[train_mask, ['target']]
    X_test = data.loc[test_mask, [c for c in data.columns if c != 'target']]
    y_test = data.loc[test_mask, ['target']]
    X_test.to_csv('X_test.csv')

    if is_train:
        clf = train(X_train, y_train, prefix)
        with open('ml/model/{}_clf_binary.pkl'.format(prefix), 'wb') as f:
            pickle.dump(clf, f)
    else:
        #with open('ml/model/{}_clf_binary.pkl'.format(prefix), 'rb') as f:
        with open('ml/model/201901010000_201902021759_clf_binary.pkl', 'rb') as f:
            clf = pickle.load(f)

    y_label = clf.predict(X_train)
    y_pred = clf.predict_proba(X_train)[:, 1]
    auc = metrics.roc_auc_score(y_train, y_pred)
    confusion_mat = confusion_matrix(y_train, y_label)
    tn, fp, fn, tp = confusion_mat.ravel()  # sklearn's ravel order is tn, fp, fn, tp
    accuracy = (tp + tn) / (tp + fn + fp + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    logger.debug('Train: (auc, accuracy, precision, recall) = ({}, {}, {}, {})'.format(auc, accuracy, precision, recall))

    norm_mean = y_pred.mean()
    norm_std = y_pred.std()

    y_label = clf.predict(X_test)
    y_pred = clf.predict_proba(X_test)[:, 1]
    auc = metrics.roc_auc_score(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_label)
    tn, fp, fn, tp = confusion_mat.ravel()  # sklearn's ravel order is tn, fp, fn, tp
    accuracy = (tp + tn) / (tp + fn + fp + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    logger.debug('Test: (auc, accuracy, precision, recall) = ({}, {}, {}, {})'.format(auc, accuracy, precision, recall))
    X_test[[c for c in X_test.columns if '1min' in c]].to_csv('X_test_1min.csv')
    X_test[[c for c in X_test.columns if '5min' in c]].to_csv('X_test_5min.csv')
    print(pd.Series(y_pred[:-1], index=X_test.index[1:]))

    # Relationship between the trading signal (predicted probability) and returns

    #norm_mean = 0.4981990534953142
    #norm_std = 0.28677379837073347
    z_scores = (y_pred - norm_mean) / norm_std
    logger.debug('mean: {}, std: {}'.format(norm_mean, norm_std))

    # Correlation between the z_score and realized profit
    #spread = 0.062
    spread = 0
    # Note: 'buy' is the price the counterparty buys at, i.e. the price we sell at;
    # 'sell' is the price the counterparty sells at, i.e. the price we buy at
    prices = pd.DataFrame({'buy': ((100 - spread) / (100 + spread)) * X_test['close_1min'], 'sell': X_test['close_1min']})
    profits = -X_test['close_1min'].diff(-1)
    profits.iloc[-1] = 0  # fill the final NaN left by the shift
    logger.debug(np.corrcoef(z_scores, profits))

    #pd.concat([pd.concat([profits, y_test], axis=1), pd.Series(y_pred, index=y_test.index)], axis=1).to_csv('test.csv')

    # Compute the asset trajectory
    dts = prices.index
    import itertools
    threshold_uppers = [0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5]
    threshold_lowers = [-0.1, -0.3, -0.5, -0.7, -0.9, -1.1, -1.3, -1.5]

    for threshold_upper, threshold_lower in itertools.product(threshold_uppers, threshold_lowers):
        # In live trading the computed signal is one minute old, so shift everything by one minute
        assets = get_assets_change(z_scores[:-1], prices[1:], dts[1:], threshold_upper, threshold_lower)
        print('{} {} {}'.format(threshold_upper, threshold_lower, assets[-1]))

        # Visualization
        fig, ax = plt.subplots(3, 1, figsize=(12, 18))
        ax[0].scatter(z_scores, profits, alpha=0.3)
        ax[0].set_title('prediction z-score vs profit after 1min')
        ax[0].set_xlabel('prediction z-score')
        ax[0].set_ylabel('profit after 1min')
        ax[1].hist(z_scores)
        ax[1].set_title('prediction z-score')
        ax[2].plot(dts[1:], assets)
        ax[2].set_title('assets')
        #plt.show()

    pd.Series(y_pred).to_csv('y_pred.csv')
    X_test.to_csv('X_test.csv')
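get_assets_change is not included in these examples; the call site passes the z-score signal, buy/sell prices, timestamps, and two thresholds, and indexes the result like a per-timestamp asset series. One plausible long-only reading of that interface, as a hedged sketch rather than the repository's implementation:

def get_assets_change(z_scores, prices, dts, threshold_upper, threshold_lower, cash=1000.0):
    # Buy when the signal crosses above threshold_upper, sell when it drops
    # below threshold_lower; mark the open position to the 'buy' price.
    assets, position = [], 0.0
    for z, (_, row) in zip(z_scores, prices.iterrows()):
        if z > threshold_upper and position == 0.0:
            position = cash / row['sell']  # we buy at the counterparty's sell price
            cash = 0.0
        elif z < threshold_lower and position > 0.0:
            cash = position * row['buy']   # we sell at the counterparty's buy price
            position = 0.0
        assets.append(cash + position * row['buy'])
    return assets  # dts is kept in the signature only for alignment with the caller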
Example #6
def read_csvs(dirpath, start_dt, end_dt, pattern=None, prefix=None):
    # Signature and loop head inferred from the call in __main__ below (assumes os, re,
    # and datetime are imported); file names encode a '%Y%m%d%H%M%S' timestamp.
    ret = None
    for file in sorted(os.listdir(dirpath)):
        if pattern is not None and not re.match(pattern, file):
            continue
        if prefix is None:
            dt = datetime.strptime(file, '%Y%m%d%H%M%S')
        else:
            dt = datetime.strptime(file.replace(prefix, ''), '%Y%m%d%H%M%S')

        if (start_dt <= dt) and (dt <= end_dt):
            print('read file: {}'.format(file))
            df = pd.read_csv(os.path.join(dirpath, file), header=None)

            if ret is None:
                ret = df
            else:
                ret = pd.concat([ret, df])
    return ret
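The __main__ block below also calls generate_ticker, generate_ohlcv, and generate_active_ohlcv, none of which are included in this excerpt. For orientation, a sketch of what an OHLCV resampler over the executions frame could look like (assumed behavior, inferred from the column names):

import pandas as pd

def generate_ohlcv(executions, candle_type='1s'):
    # Resample raw executions (timestamp in ms, price, amount) into OHLCV bars.
    ex = executions.copy()
    ex.index = pd.to_datetime(ex['timestamp'], unit='ms')
    ohlcv = ex['price'].resample(candle_type).ohlc()
    ohlcv['volume'] = ex['amount'].resample(candle_type).sum()
    return ohlcv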

if __name__ == '__main__':
    start_dt = str2dt('2019-04-06 19:00:00')
    end_dt = str2dt('2019-04-06 21:00:00')

    # ticker, ohlcv
    executions = read_csvs('collect/executions', start_dt, end_dt, pattern='execution.*', prefix='execution.')
    executions.columns = ['timestamp', 'datetime_utc_bitmex', 'side', 'price', 'amount', 'datetime',
                          'datetime_jst_bitmex']
    executions = executions.drop(['datetime', 'datetime_utc_bitmex', 'datetime_jst_bitmex'], axis=1)
    executions['side'] = executions['side'].str.replace(' ', '')  # TODO: remove later
    ticker = generate_ticker(executions)
    ohlcv = generate_ohlcv(executions, candle_type='1s')
    active_ohlcv_1m = generate_active_ohlcv(ohlcv, candle_type='1m')
    active_ohlcv_5m = generate_active_ohlcv(ohlcv, candle_type='5m')
    active_ohlcv_1h = generate_active_ohlcv(ohlcv, candle_type='1h')

    # orderbook