def demo3():
    """Fit a random forest on fractionally differentiated AAL prices.

    Pipeline, as visible in the calls below:
      1. load the 'AAL' series with ``get_tick``;
      2. build features with ``frac_diff_FFD(..., 0.5)``;
      3. sample events with ``cusum_filter`` using twice the value returned by
         ``get_daily_vol`` as threshold;
      4. label them with ``get_3barriers`` and use its ``'t1_type'`` column
         as the classification target;
      5. fit a ``RandomForestClassifier`` and print its predictions.

    The classifier is evaluated on the very rows it was trained on, so the
    printed predictions are in-sample only.
    """
    prices = get_tick('AAL')
    frac_features = frac_diff_FFD(prices.to_frame(), 0.5)
    daily_vol = get_daily_vol(prices)

    # Event sampling uses a threshold of 2 * vol; labelling uses vol itself.
    event_idx = cusum_filter(prices, 2 * daily_vol)
    vertical_barrier = get_t1(prices, event_idx, num_days=5)
    labelled = get_3barriers(
        prices,
        event_idx,
        ptsl=2,
        trgt=daily_vol,
        min_ret=0,
        num_threads=12,
        t1=vertical_barrier,
        side=None,
    ).dropna()
    print(labelled)

    # Keep only the labelled events that still have complete features.
    usable = frac_features.loc[labelled.index].dropna()
    x_matrix = usable.values
    targets = labelled['t1_type'].loc[usable.index].values

    model = RandomForestClassifier()
    model.fit(x_matrix, targets)
    # In-sample prediction: overfitting is not detectable from this output.
    print(model.predict(x_matrix))
def demo3():
    """Train a classifier on (side, price) features and size bets from it.

    Steps, as visible in the calls below:
      1. load the 'AAL' series and sample events with ``cusum_filter``;
      2. obtain a side from ``macd_side`` and label events with
         ``get_3barriers`` (``ptsl=[1, 2]``), dropping incomplete rows;
      3. derive the target from the ``'bin'`` column of ``get_bins``;
      4. fit a ``RandomForestClassifier`` on two columns: the event side and
         the price at the event timestamp;
      5. pass the second column of ``predict_proba`` to ``get_signal``.

    The probabilities are predicted on the training rows themselves, so they
    are in-sample. The return value of ``get_signal`` is discarded.
    """
    close = get_tick('AAL')
    vol = get_daily_vol(close)
    sampled_idx = cusum_filter(close, vol)
    t1 = get_t1(close, sampled_idx, num_days=7)
    side = macd_side(close)
    events = get_3barriers(close, t_events=sampled_idx, trgt=vol,
                           ptsl=[1, 2], t1=t1, side=side)
    events = events.dropna()
    bins = get_bins(events, close)

    # Feature matrix with two columns: action (side) and price.
    features = np.hstack([
        events['side'].values[:, np.newaxis],
        close.loc[events.index].values[:, np.newaxis],
    ])
    # Supervised answer; per the original author, bin == 1 when the return
    # was positive (depends on get_bins -- TODO confirm).
    labels = bins['bin'].values

    clf = RandomForestClassifier()
    clf.fit(features, labels)

    # Column 1 of predict_proba is the probability of the second entry of
    # clf.classes_. Slicing the array avoids a Python-level loop whose loop
    # variable used to share its name with the feature matrix.
    predicted_probs = clf.predict_proba(features)[:, 1]

    # NOTE(review): the trailing positional arguments (2, 12) are presumably
    # the number of classes and the worker count -- confirm against
    # get_signal's signature.
    get_signal(events.drop(columns=['side']), 0.2, predicted_probs,
               events['side'], 2, 12)
def demo_44():
    """Compute sample weights for AAL events and print their time decay.

    Steps, as visible in the calls below:
      1. sample events with ``cusum_filter`` and label them with
         ``get_3barriers`` (one-day ``t1``, ``ptsl=1``);
      2. run ``get_num_co_events`` through ``mp_pandas_obj``, keep the last
         value for any duplicated index entry, and align the result to the
         price index with missing entries set to 0;
      3. run ``get_sample_tw`` through ``mp_pandas_obj`` with those counts;
      4. feed the result to ``get_time_decay(last_w=0.1, is_exp=True)`` and
         print the head of its output.
    """
    prices = get_tick('AAL')
    daily_vol = get_daily_vol(prices)
    event_idx = cusum_filter(prices, daily_vol)
    vertical_barrier = get_t1(prices, event_idx, num_days=1)
    events = get_3barriers(
        prices,
        t_events=event_idx,
        trgt=daily_vol,
        ptsl=1,
        t1=vertical_barrier,
    )
    print(events.head())

    workers = 24  # same worker count for both parallel jobs

    co_events = mp_pandas_obj(
        get_num_co_events,
        ('molecule', events.index),
        workers,
        close_idx=prices.index,
        t1=events['t1'],
    )
    # Drop repeated index labels (keeping the last occurrence), then expand
    # to the full price index, filling the gaps with zero.
    is_repeat = co_events.index.duplicated(keep='last')
    co_events = co_events.loc[~is_repeat].reindex(prices.index).fillna(0)

    weights = mp_pandas_obj(
        get_sample_tw,
        ('molecule', events.index),
        workers,
        t1=events['t1'],
        num_co_events=co_events,
    )
    decayed = get_time_decay(weights, last_w=0.1, is_exp=True)
    print(decayed.head())
def demo2():
    """Compare ``cv_score`` results with/without embargo and sample weights.

    Loads the Google data set, labels CUSUM-sampled events with
    ``get_3barriers`` and uses every non-"Date" column as a feature and the
    ``'t1_type'`` column as the target. Four configurations are evaluated,
    all with ``purging=False``:

      1. embargo 1%, no sample weights;
      2. no embargo, no sample weights;
      3. embargo 1%, sample weights from ``get_sample_tw``;
      4. no embargo, sample weights from ``get_sample_tw``.

    For each configuration ``cv_score`` is run 10 times; the mean and the
    variance of the 10 per-run mean scores are printed on one line.
    """
    df = get_google_all()
    df.index = pd.DatetimeIndex(df['Date'].values)
    close = df["Close"]

    vol = get_daily_vol(close)
    sampled_idx = cusum_filter(close, vol)
    t1 = get_t1(close, sampled_idx, num_days=1)
    events = get_3barriers(close, t_events=sampled_idx, trgt=vol, ptsl=1,
                           t1=t1, side=None)

    features = df.drop(columns=["Date"]).dropna().loc[events.index]
    label = events['t1_type'].loc[features.index]
    t1_ = t1.loc[features.index]
    clf = RandomForestClassifier()

    def _report_cv(n_repeats=10, **cv_kwargs):
        """Run cv_score n_repeats times and print mean/variance of the means.

        Only the keywords given in ``cv_kwargs`` are forwarded, so an omitted
        ``sample_weight`` falls back to cv_score's own default.
        """
        run_means = [
            np.mean(cv_score(clf, features, label, t1=t1_, purging=False,
                             **cv_kwargs))
            for _ in range(n_repeats)
        ]
        print(np.mean(run_means), np.var(run_means))

    # No purge, with embargo.
    _report_cv(pct_embargo=0.01)
    # No purge, without embargo.
    _report_cv(pct_embargo=0.)

    n_co_events = get_num_co_events(close.index, t1, events.index)
    sample_weight = get_sample_tw(t1, n_co_events, events.index)

    # No purge, with embargo, sample weights added.
    _report_cv(sample_weight=sample_weight, pct_embargo=0.01)
    # No purge, without embargo, sample weights added.
    _report_cv(sample_weight=sample_weight, pct_embargo=0.)
def demo_42():
    """Print average uniqueness and a sequential bootstrap draw for AAL.

    Events are sampled with ``cusum_filter`` and labelled with
    ``get_3barriers`` (five-day ``t1``, ``ptsl=1``). Their ``'t1'`` column
    and the price index are passed to ``get_ind_matrix``; the resulting
    matrix feeds both ``get_avg_uniq`` and ``seq_bootstrap``, whose outputs
    are printed.
    """
    prices = get_tick('AAL')
    daily_vol = get_daily_vol(prices)
    event_idx = cusum_filter(prices, daily_vol)
    vertical_barrier = get_t1(prices, event_idx, num_days=5)

    events = get_3barriers(
        prices,
        t_events=event_idx,
        trgt=daily_vol,
        ptsl=1,
        t1=vertical_barrier,
    )
    print(events.head())

    indicator = get_ind_matrix(prices.index, events['t1'])
    print(get_avg_uniq(indicator).head())
    print(seq_bootstrap(indicator))
def demo():
    """Plot the number of concurrent events against volatility and returns.

    Events for 'AAL' are sampled with ``cusum_filter`` and labelled with
    ``get_3barriers``; ``get_num_co_events`` is then run through
    ``mp_pandas_obj`` with a single worker. Two figures are written under
    ``PNG_PATH``:

      * ``num_co_events.png`` -- line plot of the co-event counts (left axis,
        red) and of the ``get_daily_vol`` output (right axis, blue);
      * ``num_co_events_scatter.png`` -- scatter of the co-event counts
        (left axis, red) and of the price percentage changes (right axis,
        blue).
    """
    prices = get_tick('AAL')
    daily_vol = get_daily_vol(prices)
    event_idx = cusum_filter(prices, daily_vol)
    vertical_barrier = get_t1(prices, event_idx, num_days=5)
    events = get_3barriers(
        prices,
        t_events=event_idx,
        trgt=daily_vol,
        ptsl=1,
        t1=vertical_barrier,
    )
    print(events.head())

    co_events = mp_pandas_obj(
        get_num_co_events,
        ('molecule', events.index),
        1,
        close_idx=prices.index,
        t1=events['t1'],
    )

    # --- Figure 1: co-events vs. volatility, as lines on twin y-axes ---
    line_fig, left_ax = plt.subplots(figsize=(16, 8))
    left_ax.set_xlabel('time (s)')
    left_ax.set_ylabel('num_co_events', color='red')
    left_ax.plot(co_events, color='red')
    left_ax.tick_params(axis='y', labelcolor='red')

    # Second y-axis sharing the x-axis; the x-label is already set above.
    right_ax = left_ax.twinx()
    right_ax.set_ylabel('volatility', color='blue')
    right_ax.plot(daily_vol, color='blue')
    right_ax.tick_params(axis='y', labelcolor='blue')

    # Without this the right-hand y-label is slightly clipped.
    line_fig.tight_layout()
    plt.savefig(PNG_PATH + "num_co_events.png")
    plt.close()

    # --- Figure 2: co-events vs. returns, as scatter on twin y-axes ---
    _, left_ax = plt.subplots(figsize=(16, 8))
    left_ax.set_xlabel('time')
    left_ax.set_ylabel('num_co_events', color='red')
    left_ax.scatter(co_events.index, co_events.values, color='red')

    right_ax = left_ax.twinx()
    returns = prices.pct_change().dropna()
    right_ax.set_ylabel('return', color='blue')
    right_ax.scatter(returns.index, returns.values, color='blue')

    plt.savefig(PNG_PATH + "num_co_events_scatter.png")
    plt.close()
def demo():
    """Compare 10-fold cross-validation with and without shuffling.

    Loads the Google data set, prints the head of ``get_embargo_times`` for
    a 1% embargo, labels CUSUM-sampled events with ``get_3barriers`` and
    uses every non-"Date" column as a feature and ``'t1_type'`` as target.

    A fresh ``RandomForestClassifier`` is cross-validated 10 times with
    ``KFold(shuffle=False)`` and 10 times with ``KFold(shuffle=True)``; for
    each setting the mean and variance over all fold scores are printed.
    """
    frame = get_google_all()
    frame.index = pd.DatetimeIndex(frame['Date'].values)
    prices = frame["Close"]

    print(get_embargo_times(prices.index, pct_embargo=0.01).head())

    daily_vol = get_daily_vol(prices)
    event_idx = cusum_filter(prices, daily_vol)
    vertical_barrier = get_t1(prices, event_idx, num_days=1)
    events = get_3barriers(
        prices,
        t_events=event_idx,
        trgt=daily_vol,
        ptsl=1,
        t1=vertical_barrier,
        side=None,
    )
    print(events.head())

    features = frame.drop(columns=["Date"]).dropna().loc[events.index]
    label = events['t1_type'].loc[features.index]

    # Without shuffling: folds are contiguous blocks of rows.
    ordered_scores = [
        cross_val_score(
            RandomForestClassifier(),
            features,
            label,
            cv=KFold(n_splits=10, shuffle=False),
        )
        for _ in range(10)
    ]
    print(np.mean(ordered_scores), np.var(ordered_scores))

    # With shuffling before the rows are split into folds.
    # Shuffling introduces data leakage because neighbouring samples are
    # similar: after a uniform shuffle the training folds contain
    # information that overlaps with the test fold.
    shuffled_scores = [
        cross_val_score(
            RandomForestClassifier(),
            features,
            label,
            cv=KFold(n_splits=10, shuffle=True),
        )
        for _ in range(10)
    ]
    print(np.mean(shuffled_scores), np.var(shuffled_scores))