Пример #1
0
def plot_two_hist(px, column, freq1, freq2):
    """Draw side-by-side histograms of one column at two frequencies.

    :param px: pandas data frame holding the moving columns
    :param column: raw column name passed to utils.get_moving_column_name
    :param freq1: frequency used for the left-hand histogram
    :param freq2: frequency used for the right-hand histogram
    :return: None; the plots are drawn on the current matplotlib figure
    """
    # Resolve both column names up front, then plot them in a 1x2 grid.
    moving_names = [
        utils.get_moving_column_name(column, freq, 0)
        for freq in (freq1, freq2)
    ]
    for position, name in enumerate(moving_names, start=1):
        plt.subplot(1, 2, position)
        px[name].hist(bins=100)
        plt.xlabel(name)
Пример #2
0
def xy_corr(px, second_list, x_raw_column, y_raw_column='tick_move', winsorize_option=None):
    """Correlate moving x columns against moving y columns.

    For every entry ``s`` of ``second_list`` an x column named by
    ``utils.get_moving_column_name(x_raw_column, s, 0)`` and a y column named
    by ``utils.get_moving_column_name(y_raw_column, 0, s)`` is used.

    :param px: pandas data frame containing all of those columns
    :param second_list: list of window lengths
    :param x_raw_column: raw name of the x column
    :param y_raw_column: raw name of the y column
    :param winsorize_option: None, or a dictionary with keys 'x_prob',
        'x_bound', 'y_prob' and 'y_bound' forwarded to utils.winsorize
    :return: pandas data frame of correlations, y columns as rows and
        x columns as columns
    """
    # Work on a copy so winsorizing never touches the caller's frame.
    frame = px.copy()
    x_names = [
        utils.get_moving_column_name(x_raw_column, sec, 0)
        for sec in second_list
    ]
    y_names = [
        utils.get_moving_column_name(y_raw_column, 0, sec)
        for sec in second_list
    ]

    if winsorize_option is not None:
        for name in x_names:
            frame[name] = utils.winsorize(frame[name],
                                          winsorize_option['x_prob'],
                                          winsorize_option['x_bound'])
        for name in y_names:
            frame[name] = utils.winsorize(frame[name],
                                          winsorize_option['y_prob'],
                                          winsorize_option['y_bound'])

    # Full correlation matrix first, then keep only the y-by-x sub-block.
    full_corr = frame[x_names + y_names].corr()
    return full_corr.loc[y_names, x_names]
Пример #3
0
def scatter_plot(px, x_column, x_backward, x_forward, y_column, y_backward, y_forward):
    """Scatter one moving column against another with a fitted regression line.

    Prints the fitted coefficient and the R-square, then shows the figure.

    :param px: pandas data frame containing both moving columns
    :param x_column: raw name of the x column
    :param x_backward: second argument of utils.get_moving_column_name for x
    :param x_forward: third argument of utils.get_moving_column_name for x
    :param y_column: raw name of the y column
    :param y_backward: second argument of utils.get_moving_column_name for y
    :param y_forward: third argument of utils.get_moving_column_name for y
    :return: None
    """
    x_name = utils.get_moving_column_name(x_column, x_backward, x_forward)
    y_name = utils.get_moving_column_name(y_column, y_backward, y_forward)

    # Rows with a missing value in either column are excluded from the fit.
    sample = px[[x_name, y_name]].dropna()
    features = sample[[x_name]].values  # 2-D, one column
    target = sample[y_name].values

    model = linear_model.LinearRegression()
    model.fit(features, target)
    r_square = model.score(features, target)
    print('Coefficients: \n', model.coef_)
    print('R-square: %f' % r_square)

    plt.scatter(features, target, marker='o', s=0.1)
    plt.plot(features, model.predict(features), color='red', linewidth=1)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.show()
Пример #4
0
def reg(px, freq_oir, freq_ofi, freq_xreturn, freq_yreturn, show_plot=True, show_inference=True):
    """Regress a tick_move column on order imbalance, order flow and tick_move.

    The regressors are the moving columns of 'order_imbalance_ratio',
    'order_flow_imbalance' and 'tick_move' built with ``(freq, 0)``; the
    response is the 'tick_move' moving column built with ``(0, freq_yreturn)``.

    :param px: pandas data frame containing the four moving columns
    :param freq_oir: window of the order imbalance ratio regressor
    :param freq_ofi: window of the order flow imbalance regressor
    :param freq_xreturn: window of the tick_move regressor
    :param freq_yreturn: window of the tick_move response
    :param show_plot: if True, draw fitted-vs-observed, residual histogram
        and residual QQ plot, then show them
    :param show_inference: if True, print an OLS summary (with constant)
    :return: dictionary with keys 'r-square', 'beta' and 'residuals', where
        residuals are fitted minus observed values
    """
    oir_column_name = utils.get_moving_column_name('order_imbalance_ratio', freq_oir, 0)
    ofi_column_name = utils.get_moving_column_name('order_flow_imbalance', freq_ofi, 0)
    xreturn_column_name = utils.get_moving_column_name('tick_move', freq_xreturn, 0)
    yreturn_column_name = utils.get_moving_column_name('tick_move', 0, freq_yreturn)
    regressor_names = [oir_column_name, ofi_column_name, xreturn_column_name]

    regr_data = px[regressor_names + [yreturn_column_name]].dropna()
    # Only the order flow imbalance column is winsorized; the two tick_move
    # columns are used as-is.
    regr_data[ofi_column_name] = winsorize(regr_data[ofi_column_name], (0.005, 0.005))

    x = regr_data[regressor_names].values
    y = regr_data[yreturn_column_name].values
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    yhat = regr.predict(x)
    resids = yhat - y

    if show_plot:
        # fitted vs observed, with the identity line as reference
        plt.figure(1)
        plt.scatter(yhat, y, marker='o', s=0.1)
        plt.plot(yhat, yhat, color='red', linewidth=1)
        plt.xlabel('Fitted ' + yreturn_column_name)
        plt.ylabel('Observed ' + yreturn_column_name)
        # residual histogram
        plt.figure(2)
        plt.hist(resids, bins=40)
        plt.title('Histogram of residuals')
        # residual qq plot
        plt.figure(3)
        stats.probplot(resids, dist="norm", plot=pylab)
        plt.title('QQ plot of residuals')
        # Show once, after every figure exists: calling show() right after
        # figure 1 left figures 2 and 3 undisplayed on blocking backends.
        plt.show()

    if show_inference:
        x2 = sm.add_constant(x)
        est2 = sm.OLS(y, x2).fit()
        print(est2.summary())

    return {'r-square': regr.score(x, y), 'beta': regr.coef_, 'residuals': resids}
Пример #5
0
def backtest(px, config):
    """Run a rolling backtest, refitting the model for every date.

    For each date after the first ``config['training_period']`` dates, the
    preceding ``training_period`` dates are used to select features and fit
    a model, which then predicts the response on that date.

    :param px: pandas data frame with a ``date`` column, the market data
        columns listed below, the feature columns and the response column
    :param config: dictionary, config parameters; reads 'response_column',
        'holding_period' and 'training_period' here and is passed on to
        select_feature and fit
    :return: tuple ``(btdf, fitting_stats)``; btdf stacks the per-date rows
        with an added 'alpha' prediction column, fitting_stats has one row of
        fit and prediction statistics per backtested date
    """
    logger.info('Start backtesting')
    dates = sorted(set(px.date))
    y_name = utils.get_moving_column_name(config['response_column'], 0,
                                          config['holding_period'])
    columns = [
        'dt', 'date', 'time', 'price', 'qty', 'volume', 'open_interest', 'b1',
        'b1_size', 's1', 's1_size', 'mid', 'second'
    ]
    stats_columns = [
        'date', 'rsq', 'beta', 'tstat', 'mse', 'pred_rsq', 'pred_mse'
    ]
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call; collect the pieces and assemble them once at the end.
    daily_frames = []
    daily_stats = []
    for i in range(config['training_period'], len(dates)):
        date = dates[i]
        logger.info('Backtesting on %s', date)
        logger.debug('Selecting feature')
        train = px[(px.date >= dates[i - config['training_period']])
                   & (px.date < date)].copy()
        features = select_feature(train, config)
        logger.debug('Fitting model')
        model, fit_stats = fit(train, features, config)
        fit_stats['date'] = date
        logger.debug('Predicting future return')
        px_i = px.loc[px.date == date, columns + features + [y_name]].copy()
        x_new = px_i[features]
        # Missing feature values are replaced by that day's column median.
        x_new = x_new.fillna(x_new.median())
        y_new = px_i[y_name].values
        alpha = model.predict(X=x_new)
        px_i['alpha'] = alpha
        # Stored under 'pred_rsq', but this is the correlation between the
        # prediction and the realised response, not its square.
        pred_rsq = pd.DataFrame({
            'alpha': alpha,
            'y_new': y_new
        }).corr().iloc[0, 1]
        pred_resid = alpha - y_new
        fit_stats['pred_rsq'] = pred_rsq
        fit_stats['pred_mse'] = np.nanmean(pred_resid**2)
        daily_stats.append(fit_stats)
        daily_frames.append(px_i)

    # pd.concat raises on an empty list, so keep the empty-frame result.
    btdf = pd.concat(daily_frames) if daily_frames else pd.DataFrame()
    fitting_stats = pd.DataFrame(daily_stats)
    # Declared columns first, then any extra keys returned by fit.
    extra_columns = [
        name for name in fitting_stats.columns if name not in stats_columns
    ]
    fitting_stats = fitting_stats.reindex(columns=stats_columns + extra_columns)
    logger.info('Finish backtesting')
    return btdf, fitting_stats
Пример #6
0
def fit(train, features, config):
    """Fit a no-intercept linear model of the response on the features.

    Rows with any missing feature or response value are dropped, then every
    feature and the response are winsorized before fitting.

    :param train: pandas data frame, must contain columns in features and
        the response column
    :param features: list of column names
    :param config: dictionary, config parameters
    :return: tuple of the fitted sklearn model and a dictionary with keys
        'rsq', 'beta', 'tstat', 'mse', 'df_1' and 'df_2'
    """
    response = utils.get_moving_column_name(config['response_column'], 0,
                                            config['holding_period'])
    sample = train[features + [response]].dropna()

    # data processing: winsorize settings are keyed by the raw feature name
    feature_prob = config['feature_winsorize_prob']
    feature_bound = config['feature_winsorize_bound']
    for name in features:
        raw_name = utils.get_raw_column_name(name)
        sample[name] = utils.winsorize(sample[name], feature_prob[raw_name],
                                       feature_bound[raw_name])
    sample[response] = utils.winsorize(sample[response],
                                       config['response_winsorize_prob'],
                                       config['response_winsorize_bound'])

    design = sample[features].values
    target = sample[response].values
    model = linear_model.LinearRegression(fit_intercept=False)
    model.fit(design, target)

    n_obs = len(target)
    # fit_intercept is a bool, so this adds one parameter only when set.
    n_params = len(features) + model.fit_intercept
    residuals = model.predict(design) - target
    mse = np.sum(residuals**2) / (n_obs - n_params)
    # Coefficient covariance is mse * (X'X)^-1; its diagonal gives the
    # squared standard errors.
    coef_cov = mse * np.linalg.inv(np.dot(design.T, design))
    std_err = np.sqrt(np.diagonal(coef_cov))

    summary = {
        'rsq': model.score(design, target),
        'beta': model.coef_,
        'tstat': model.coef_ / std_err,
        'mse': mse,
        'df_1': n_params - 1,
        'df_2': n_obs - n_params
    }
    return model, summary
Пример #7
0
def select_feature(train, config):
    """Select features to fit model

    For every raw feature in ``config['feature_column']`` the moving column
    (one per entry of ``config['feature_freq']``) with the highest
    correlation to the response column is chosen.

    :param train: pandas data frame
    :param config: dictionary, config parameters
    :return: list of strings, column names
    """
    y_column = utils.get_moving_column_name(config['response_column'], 0,
                                            config['holding_period'])
    selected_features = []
    for feature in config['feature_column']:
        logger.debug('Computing correlation of %s and %s', feature,
                     config['response_column'])
        winsorize_option = {
            'x_prob': config['feature_winsorize_prob'][feature],
            'x_bound': config['feature_winsorize_bound'][feature],
            'y_prob': config['response_winsorize_prob'],
            'y_bound': config['response_winsorize_bound']
        }
        corr_mat = signal.xy_corr(train, config['feature_freq'], feature,
                                  config['response_column'], winsorize_option)
        # Row of correlations between the response and each moving column.
        correlation = corr_mat.loc[y_column]
        # idxmax returns the column label; Series.argmax returns an integer
        # position in current pandas, which is not a usable column name.
        selected_features.append(correlation.idxmax())
    return selected_features
Пример #8
0
# tick_move against tick_move
signal.plot_two_scatter(
    px, 'tick_move', 'tick_move',
    5, 0, 0, 5,
    60, 0, 0, 60)

# signal by signal
for x_name, y_name, first_window, second_window in (
        ('order_imbalance_ratio', 'tick_move', 1, 5),
        ('order_flow_imbalance', 'tick_move', 60, 300),
        ('order_flow_imbalance', 'order_imbalance_ratio', 60, 300),
):
    signal.plot_two_scatter(px, x_name, y_name,
                            first_window, 0, first_window, 0,
                            second_window, 0, second_window, 0)

# correlations
# ------------

second_list = [1, 2, 5, 10, 20, 30, 60, 120, 180, 300]

# For every window keep only rows whose tick_move column is at most 10, first
# for the (0, sec) column and then for the (sec, 0) column; rows where
# tick_move_1_0 is NaN are always kept.
# NOTE(review): only an upper bound is applied, and the NaN test always uses
# tick_move_1_0 rather than the column being filtered - confirm intended.
for sec in second_list:
    for window in ((0, sec), (sec, 0)):
        move = px[utils.get_moving_column_name('tick_move', *window)]
        px = px[(move <= 10) | np.isnan(px.tick_move_1_0)]

oir_corr = signal.xy_corr(px, second_list, 'order_imbalance_ratio')
ofi_corr = signal.xy_corr(px, second_list, 'order_flow_imbalance')
autocorr = signal.xy_corr(px, second_list, 'tick_move')

# Persist the three correlation tables under research_path.
for table, filename in ((oir_corr, 'oir_corr.csv'),
                        (ofi_corr, 'ofi_corr.csv'),
                        (autocorr, 'autocorr.csv')):
    table.to_csv(os.path.join(research_path, filename))

oir_ofi = signal.xx_corr(
    px, second_list, 'order_imbalance_ratio', 'order_flow_imbalance')
oir_return = signal.xx_corr(
    px, second_list, 'order_imbalance_ratio', 'tick_move')
Пример #9
0
def xx_corr(px, second_list, column_name, row_name):
    """Cross-correlate the moving columns of two raw columns.

    Both sets of moving columns are named by
    ``utils.get_moving_column_name(raw_name, s, 0)`` for every ``s`` in
    ``second_list``.

    :param px: pandas data frame containing all of those columns
    :param second_list: list of window lengths
    :param column_name: raw column whose moving columns form the result columns
    :param row_name: raw column whose moving columns form the result rows
    :return: pandas data frame of correlations, rows by columns
    """
    def moving_names(raw_name):
        return [
            utils.get_moving_column_name(raw_name, sec, 0)
            for sec in second_list
        ]

    cols = moving_names(column_name)
    rows = moving_names(row_name)
    # Correlate everything at once, then slice out the rows-by-cols block.
    return px[cols + rows].corr().loc[rows, cols]