예제 #1
0
def get_tick_source_data():
    '''
    Debug helper: fetch the ``bidVolume1`` tick field for a fixed security
    and date, then print the returned columns and the mean of that field.

    Nothing is returned; the only outputs are the two ``print`` calls.
    '''
    # Query parameters are collected first so the API call itself stays short.
    query = {
        'securityID': u"300634.XSHE",
        'date': '20191122',
        'startSecOffset': "",
        'endSecOffset': "",
        'field': u"bidVolume1",
        'pandas': "1",
    }
    ticks = DataAPI.MktTicksHistOneDayGet(**query)

    print(ticks.columns)
    bid_volume = ticks['bidVolume1']
    print(bid_volume.mean())
def get_features(security_id=u"300634.XSHE", date='20191122'):
    '''
    Build tick-sequence samples and bar-return labels for one security/day.

    One day of tick data and the bars requested with ``unit="5"`` are
    downloaded through ``DataAPI``. Tick-level features are derived, the raw
    identifier and order-book columns are dropped, and consecutive ticks are
    grouped into one sample each time the minute of a tick is a multiple of 5
    and the next tick belongs to a different minute.

    Args:
        security_id: identifier of the form ``"<ticker>.<exchangeCD>"``; it is
            split on ``'.'`` to obtain the two arguments of the bar query.
        date: trade date, passed unchanged to both API calls.

    Returns:
        ``(train_x, train_y)``. ``train_x[i]`` is a list of tick feature rows
        (rows of ``df.values``) and ``train_y[i]`` is a one-element list with
        the return of the matching bar, i.e. ``(vwap / previous vwap - 1) *
        100``. At most 40 samples are produced.

    Raises:
        IndexError: if the tick frame or the bar frame has no rows.
        ValueError: if one of the columns that is expected to be dropped is
            not present in the tick frame.
    '''
    ticker, exchange_cd = security_id.split('.')
    df = DataAPI.MktTicksHistOneDayGet(securityID=security_id,
                                       date=date,
                                       startSecOffset="",
                                       endSecOffset="",
                                       field=u"",
                                       pandas="1")
    df_min = DataAPI.SHSZBarHistOneDayGet(tradeDate=date,
                                          exchangeCD=exchange_cd,
                                          ticker=ticker,
                                          unit="5",
                                          startTime=u"",
                                          endTime=u"",
                                          field=u"",
                                          pandas="1")

    # NOTE(review): despite the name, this is the last entry of the 'value'
    # column (not 'volume'), yet it is used below as the denominator of
    # 'volumeRatio'. Kept as-is; confirm which column was intended.
    total_vol = list(df['value'])[-1]

    # tick feature calculation
    # 'dataMin' is the tick timestamp truncated to "HH:MM"; it doubles as the
    # lookup key into the bar returns further down.
    df['dataMin'] = [
        '{0}:{1}'.format(*item.split(':')[:2]) for item in df['dataTime']
    ]
    df['avgPrice'] = df['value'] / df['volume']
    df['amplitude'] = (df['highPrice'] - df['lowPrice']) / df['lastPrice']
    df['spread'] = df['askPrice1'] - df['bidPrice1']
    df['openDiff'] = (df['openPrice'] -
                      df['prevClosePrice']) / df['prevClosePrice']
    df['trackError'] = (df['lastPrice'] - df['avgPrice']) / df['avgPrice']
    df['askTrackError1'] = (df['askPrice1'] - df['avgPrice']) / df['avgPrice']
    df['bidTrackError1'] = (df['bidPrice1'] - df['avgPrice']) / df['avgPrice']
    df['totalAskVolume'] = (df['askVolume1'] + df['askVolume2'] +
                            df['askVolume3'] + df['askVolume4'] +
                            df['askVolume5'])
    df['totalBidVolume'] = (df['bidVolume1'] + df['bidVolume2'] +
                            df['bidVolume3'] + df['bidVolume4'] +
                            df['bidVolume5'])
    df['volumeImbalance1'] = (df['askVolume1'] - df['bidVolume1']) / (
        df['askVolume1'] + df['bidVolume1'])
    df['volumeImbalanceTotal'] = (
        df['totalAskVolume'] - df['totalBidVolume']) / (df['totalAskVolume'] +
                                                        df['totalBidVolume'])
    df['volumePerDeal'] = df['volume'] / df['deal']
    df['volumeRatio'] = df['volume'] / total_vol

    # Bar-level return in percent against the previous bar. The first bar is
    # compared with itself, so its return is exactly 0.
    min_vwap = list(df_min['vwap'])
    prev_vwap = [min_vwap[0]] + min_vwap[:-1]
    df_min['ret'] = (df_min['vwap'] / prev_vwap - 1) * 100
    dict_min_ret = dict(zip(df_min['barTime'], df_min['ret']))

    # TODO add features of tick and min
    # Identifier columns and the raw five-level order book are not used as
    # model inputs. list.remove() is kept on purpose: it raises ValueError
    # when an expected column is missing instead of silently continuing.
    dropped_cols = [
        'dataDate', 'exchangeCD', 'ticker', 'dataTime', 'dataMin', 'shortNM',
        'currencyCD'
    ]
    for prefix in ('askPrice', 'askVolume', 'bidPrice', 'bidVolume'):
        dropped_cols.extend(
            '{0}{1}'.format(prefix, level) for level in range(1, 6))
    columns = list(df.columns)
    for name in dropped_cols:
        columns.remove(name)

    df.sort_values(by='dataTime', ascending=True, inplace=True)
    data_min = list(df['dataMin'])
    df = df[columns]
    print(df.columns)
    print(df.shape)
    rows = list(df.values)
    print(rows[:3])

    train_x = []
    train_y = []
    _start = 0
    n_row = len(rows)
    total_row = 0
    for idx, bar_key in enumerate(data_min):
        hh, mm = bar_key.split(':')
        hour, minute = int(hh), int(mm)
        # Stop at the first tick stamped 14:57.
        if hour == 14 and minute == 57:
            break
        # Ticks stamped 09:00-09:30 only move the window start forward.
        if hour == 9 and minute <= 30:
            _start = idx
            continue
        if total_row >= 40:
            break

        # NOTE(review): the label checks below are truthiness checks, so a
        # bar whose return is exactly 0 (including the first bar) is skipped
        # just like a bar that has no entry, while a NaN return is accepted.
        if idx == n_row - 1:
            label = dict_min_ret.get(bar_key)
            if label:
                train_y.append([label])
                # NOTE(review): this slice stops before the last tick, unlike
                # the boundary branch below which includes the current tick.
                # Possibly an off-by-one; behaviour kept unchanged.
                train_x.append(list(rows[_start:idx]))
                total_row += 1
            _start = idx
        else:
            _, next_mm = data_min[idx + 1].split(':')
            # A sample ends on the last tick of a minute divisible by 5.
            if minute % 5 == 0 and int(next_mm) != minute:
                label = dict_min_ret.get(bar_key)
                if label:
                    train_y.append([label])
                    train_x.append(list(rows[_start:idx + 1]))
                    total_row += 1
                _start = idx + 1
    return train_x, train_y
예제 #3
0
def get_features_by_date(security_id=u"300634.XSHE", date='20191122', min_unit="1", tick=False, df_min=None,
                         df_bc_min=None, win_len=20):
    '''
    Compute tick-level or minute-level features for one security and date.

    Example call:
    -get tick level features:
        get_features_by_date(security_id=u"ticker.mkt", date='yyyymmdd', min_unit="1", tick=True)
    -get min level features:
        get_features_by_date(security_id=u"ticker.mkt", date='yyyymmdd', min_unit="1", tick=False, df_min=xx,df_bc_min=xx)

    Args:
        security_id: security identifier passed to the tick data API.
        date: trade date; any '-' characters are removed before the API call.
        min_unit: not used by this function (the bar query that consumed it
            is no longer performed here; bars are supplied through df_min).
        tick: when truthy, return the tick frame right after the tick
            features have been computed.
        df_min: minute-bar frame; required when tick is falsy. It must
            provide 'totalValue', 'totalVolume' and 'barTime'. It is modified
            in place ('vwap' and 'bcClosePrice' are added, and it is passed
            to _cal_min_features).
        df_bc_min: frame whose 'closePrice' column is copied into df_min as
            'bcClosePrice'; required when tick is falsy.
        win_len: forwarded to _cal_min_features.

    Returns:
        The tick DataFrame when tick is truthy. Otherwise a new minute-level
        DataFrame restricted to the bar times present in both inputs, with
        REMOVE_MIN_COLS dropped, +/-inf turned into NaN, all-NaN columns
        removed and every row containing a NaN removed.

    Raises:
        IndexError: if the tick API returns no rows.
        TypeError: if tick is falsy and df_min or df_bc_min is None.
    '''
    logger.info('Start processing sec id:{0} for  date:{1}'.format(security_id, date))
    df = DataAPI.MktTicksHistOneDayGet(securityID=security_id, date=date.replace('-', ''), startSecOffset="",
                                       endSecOffset="",
                                       field=u"", pandas="1")

    datatimes = list(df['dataTime'])
    if not datatimes:
        # An empty tick frame used to fail with an incidental IndexError from
        # indexing the last element of an empty list; the exception type is
        # kept so existing callers behave the same, but the message now says
        # what went wrong.
        raise IndexError('No tick data for sec id:{0} date:{1}'.format(security_id, date))

    # Tick timestamps truncated to "HH:MM" so ticks can be matched to bars.
    data_min = ['{0}:{1}'.format(*item.split(':')[:2]) for item in datatimes]

    # tick feature calculation
    df['barTime'] = data_min
    _cal_tick_features(df)
    if tick:
        return df

    if df_min is None or df_bc_min is None:
        # Fail before touching the caller's frame instead of with an opaque
        # "'NoneType' object is not subscriptable".
        raise TypeError('df_min and df_bc_min are required when tick=False')

    df_min['vwap'] = df_min['totalValue'] / df_min['totalVolume']
    df_min['bcClosePrice'] = df_bc_min['closePrice']

    # calculate min features accumulate from tick level
    df_agg = _cal_min_features_by_ticks(df)

    # calculate min level features
    _cal_min_features(df_min, win_len)

    # Keep only the bar times present in both the minute bars and the ticks,
    # sorted identically, so the positional concat below lines the rows up.
    common_min_lst = set(df_min['barTime']).intersection(data_min)
    df_min = df_min[df_min['barTime'].isin(common_min_lst)].sort_values(by='barTime', ascending=True)
    df_agg = df_agg[df_agg['barTime'].isin(common_min_lst)].sort_values(by='barTime', ascending=True)

    df_min = df_min.reset_index()
    df_agg = df_agg.reset_index()
    df_min = pd.concat([df_min, df_agg], axis=1, ignore_index=False)

    df_min['volumePerDeal'] = df_min['totalVolume'] / df_min['deal_sum']
    df_min['valuePerDeal'] = df_min['totalValue'] / df_min['deal_sum']

    df_min = df_min.drop(REMOVE_MIN_COLS, axis=1)
    # Divisions by zero above produce +/-inf; treat them as missing values.
    df_min = df_min.replace([np.inf, -np.inf], np.nan)

    col_before_drop = set(df_min.columns)
    # drop the columns that are all None
    df_min.dropna(axis=1, how='all', inplace=True)
    # drop the rows that contain None
    df_min.dropna(axis=0, how='any', inplace=True)
    dropped_cols = col_before_drop - set(df_min.columns)
    if dropped_cols:
        logger.info('Drop the empty columns:{0}'.format(dropped_cols))

    # A former branch that built (train_x, train_y, columns) sequences used to
    # follow here under "if not tick: ... else: ...". It could never run,
    # because a truthy `tick` has already returned above, so it was removed.
    return df_min