def transform(self, X):
    """Build lag features and, outside production mode, the aligned targets."""
    fe = make_features(X, self.target_col, self.moving_averages)
    if self.production:
        # Production: build only the most recent Tx-step window for prediction.
        X = series_to_predict_matrix(fe, n_in=self.Tx, dropnan=True)
        return self._reshape(X)
    else:
        # Training: frame the series into (X, y) pairs with Tx inputs and Ty targets.
        X, y = data_to_supervised(fe, self.Tx, self.Ty)
        return self._reshape(X), y
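# The helpers used above (make_features, series_to_predict_matrix, data_to_supervised)
# are project functions whose bodies are not shown in this section. As a rough
# illustration of the framing that data_to_supervised performs, a minimal
# sliding-window sketch might look like this (an assumption, not the project's
# actual implementation):
def _example_frame_supervised(values, tx, ty):
    """Frame a 1-D sequence into (X, y) windows: tx lag steps in, ty future steps out."""
    import numpy as np
    values = np.asarray(values, dtype=float)
    n_samples = len(values) - tx - ty + 1
    X = np.stack([values[i:i + tx] for i in range(n_samples)])
    y = np.stack([values[i + tx:i + tx + ty] for i in range(n_samples)])
    return X, y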
def transform(self, X):
    """Apply the continuous wavelet transform to Tx-step windows of the target series."""
    fe = make_single_feature(X, self.target_col)
    if self.production:
        # Production: build only the most recent prediction window, then transform it.
        X = series_to_predict_matrix(fe.target.tolist(), n_in=self.Tx, dropnan=True)
        X = continuous_wavelet_transform(X, N=self.N, wavelet=self.wavelet)
        return X
    else:
        # Training: frame (X, y) pairs first, then transform the feature windows.
        X, y = data_to_supervised(input_df=fe, Tx=self.Tx, Ty=self.Ty)
        X = continuous_wavelet_transform(X, N=self.N, wavelet=self.wavelet)
        return X, y
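# Illustrative sketch only: continuous_wavelet_transform above is a project helper
# that is not shown here. One common way to compute CWT coefficients for a single
# lag window uses PyWavelets (assumptions: pywt is installed, 'morl' wavelet, N scales):
def _example_cwt(window, n_scales, wavelet='morl'):
    """Return CWT coefficients of shape (n_scales, len(window)) for one lag window."""
    import numpy as np
    import pywt
    scales = np.arange(1, n_scales + 1)
    coeffs, _freqs = pywt.cwt(np.asarray(window, dtype=float), scales, wavelet)
    return coeffs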
def transform(self, X):
    """Apply the discrete wavelet transform to Tx-step windows of the target series."""
    fe = make_single_feature(X, self.target_col)
    if self.production:
        X = series_to_predict_matrix(fe['target'], n_in=self.Tx, dropnan=True)
        X = discrete_wavelet_transform(X, wavelet=self.wavelet)
        return self._reshape(X)
    else:
        X, y = data_to_supervised(input_df=fe[['target']], Tx=self.Tx, Ty=self.Ty)
        X = discrete_wavelet_transform(X, wavelet=self.wavelet)
        return self._reshape(X), y
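# Illustrative sketch only: discrete_wavelet_transform above is a project helper that
# is not shown here. A typical single-level Haar smoothing with PyWavelets reconstructs
# the signal from the approximation coefficients alone (assumption: pywt is installed):
def _example_dwt_smooth(window, wavelet='haar'):
    """Return a smoothed copy of `window` with the detail (high-frequency) coefficients dropped."""
    import numpy as np
    import pywt
    approx, _detail = pywt.dwt(np.asarray(window, dtype=float), wavelet)
    return pywt.idwt(approx, None, wavelet)  # None is treated as all-zero detail coefficients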
def main():
    print('Making features from raw data...')
    data_dir = join(get_project_path(), 'data', 'raw')
    output_dir = join(get_project_path(), 'data', 'processed')
    makedirs(output_dir, exist_ok=True)

    coins = ['BTC', 'ETH']
    TARGET = 'close'
    Tx = 72
    Ty = 1
    TEST_SIZE = 0.05

    for SYM in coins:
        raw_data_path = join(data_dir, SYM + '.csv')
        print('Featurizing raw {} data from {}...'.format(SYM, raw_data_path))
        raw_df = pd.read_csv(raw_data_path, index_col=0)
        feature_df = make_features(
            raw_df,
            target_col=TARGET,
            keep_cols=['close', 'high', 'low', 'volumeto', 'volumefrom'],
            ma_lags=[6, 12, 24, 48],
            ma_cols=['close', 'volumefrom', 'volumeto'])
        X, y = data_to_supervised(feature_df, target_ix=-1, Tx=Tx, Ty=Ty)
        num_features = int(X.shape[1] / Tx)
        X = make_3d(X, tx=Tx, num_channels=num_features)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, shuffle=False)
        np.save(arr=X_train, file=join(output_dir, 'X_train_{}'.format(SYM)))
        np.save(arr=X_test, file=join(output_dir, 'X_test_{}'.format(SYM)))
        np.save(arr=y_train, file=join(output_dir, 'y_train_{}'.format(SYM)))
        np.save(arr=y_test, file=join(output_dir, 'y_test_{}'.format(SYM)))
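# Usage sketch: np.save appends the '.npy' extension when it is missing from `file`,
# so the arrays written by main() load back as below (assumes main() has already run
# and that np, join and get_project_path are imported at the top of this module):
def _example_load_processed(sym='BTC'):
    processed_dir = join(get_project_path(), 'data', 'processed')
    X_train = np.load(join(processed_dir, 'X_train_{}.npy'.format(sym)))
    y_train = np.load(join(processed_dir, 'y_train_{}.npy'.format(sym)))
    return X_train, y_train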
plt.title('target')
# plt.figure(); plt.plot(df.filter(regex='v(t|f)')); plt.title('v(t|f)')
plt.show()

# In[7]:

num_features = arr.shape[1] - pc['ty']
p('Number of Unique Features:', num_features)
p('Number of Hours per Sample:', pc['tx'])
p('Total Features per Sample:', pc['tx'] * num_features)

# In[8]:

X, y = data_to_supervised(
    input_df=pd.DataFrame(data=arr, columns=ct.get_feature_names()),
    target_ix=-1,
    Tx=pc['tx'],
    Ty=pc['ty'])
p(X.head(2))
p(y.head(5))

# In[9]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=pc['test_fraction'], shuffle=False)
p('Train shape: ', X_train.shape)
p('Test shape: ', X_test.shape)

# In[10]:

fig, ax = plt.subplots(1, figsize=(10, 5))
Tx = 72
Ty = 1
TEST_SIZE = 0.05
data_path = join(get_project_path(), 'data', 'raw', SYM + '.csv')

# In[3]:

data = pd.read_csv(data_path, index_col=0)
data.head()

# In[4]:

"""
Get percent change feature and target data.
"""
df = make_single_feature(input_df=data, target_col='close')
X, y = data_to_supervised(input_df=df[['target']], Tx=Tx, Ty=Ty)
p(X.shape, y.shape)
X.head()

# In[5]:

"""
Confirm data reshape and target/feature creation was done correctly.
"""
y_values_except_last = np.squeeze(y.iloc[:-1].values)
t_minus_1_x_values_except_first = X.iloc[1:, -1].values
# Element-wise comparison: each target should equal the most recent lag of the next sample.
np.allclose(y_values_except_last, t_minus_1_x_values_except_first)

# In[6]:

"""
For comparing different transformations
smooth_data_train = pd.DataFrame(smooth_arr_train, columns=ct.get_feature_names())
smooth_arr_test = ct.fit_transform(feature_data_test)
smooth_data_test = pd.DataFrame(smooth_arr_test, columns=ct.get_feature_names())
smooth_data_train.plot()
plt.show()

# In[5]:

"""
Make time-series data.
"""
X_train, y_train = data_to_supervised(input_df=smooth_data_train, target_ix=-1, Tx=Tx, Ty=Ty)
X_test, y_test = data_to_supervised(input_df=smooth_data_test, target_ix=-1, Tx=Tx, Ty=Ty)
p(X_train.head())
p(y_train.head())

"""
Reshape the data into 3d array.
"""
X_train = make_3d(X_train, tx=Tx,
                  num_channels=len(list(feature_data_train.columns)) + 1)
X_test = make_3d(X_test, tx=Tx,
                 num_channels=len(list(feature_data_test.columns)) + 1)
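# Illustrative sketch only: make_3d is a project helper that is not shown here. The
# general idea is to reshape each flattened supervised row of length Tx * num_channels
# into a (Tx, num_channels) window, giving a (samples, Tx, num_channels) array
# (assumption: the real helper may order the lag/channel axes differently):
def _example_make_3d(flat_frames, tx, num_channels):
    """Reshape (samples, tx * num_channels) frames into a (samples, tx, num_channels) array."""
    import numpy as np
    arr = np.asarray(flat_frames, dtype=float)
    return arr.reshape(arr.shape[0], tx, num_channels)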
SYM = 'BTC'
TARGET = 'close'
Tx = 72
Ty = 1
TEST_SIZE = 0.05
data_path = os.path.join(get_project_path(), 'data', 'raw', SYM + '.csv')
data = pd.read_csv(data_path, index_col=0)
data.head()

# In[3]:

"""
Get percent change feature and target data.
"""
df = make_features(input_df=data, target_col='close', moving_average_lags=[])
X, y = data_to_supervised(input_df=df, Tx=Tx, Ty=Ty)
p(X.shape, y.shape)
X.head()

# In[4]:

"""
Confirm data reshape and target/feature creation was done correctly.
"""
y_values_except_last = np.squeeze(y.iloc[:-1].values)
t_minus_1_x_values_except_first = X.iloc[1:, -1].values
# Element-wise comparison: each target should equal the most recent lag of the next sample.
np.allclose(y_values_except_last, t_minus_1_x_values_except_first)

# In[5]:

"""
For comparing different transformations
sample_ix = 100
sample_n = 100
sample = train_smooth.iloc[sample_ix:sample_ix + sample_n]

fig, ax = plt.subplots(figsize=(12, 7))
plt.plot(sample['orig__pct_change__close'], label='raw')
plt.plot(sample['haar_smooth__pct_change__close'], label='smoothed')
plt.title('DWT Haar Smoothing')
plt.legend()
plt.show()

# In[7]:

"""
Create time-series samples.
"""
X_train, y_train = data_to_supervised(train_smooth, target_ix=-1, Tx=Tx, Ty=Ty)
X_test, y_test = data_to_supervised(test_smooth, target_ix=-1, Tx=Tx, Ty=Ty)
X_train = make_3d(X_train, tx=Tx, num_channels=2)
X_test = make_3d(X_test, tx=Tx, num_channels=2)

# In[8]:

"""
Save data.
"""
output_dir = join(get_project_path(), 'data', 'processed')
np.save(arr=X_train, file=join(output_dir, 'X_train_smooth_{}.npy'.format(SYM)))
np.save(arr=X_test, file=join(output_dir, 'X_test_smooth_{}.npy'.format(SYM)))
np.save(arr=y_train, file=join(output_dir, 'y_train_smooth_{}.npy'.format(SYM)))
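# Sanity-check sketch before relying on the saved arrays: print the sample dimensions
# (the exact axis order depends on the project's make_3d helper, which is not shown here).
print('X_train shape:', X_train.shape, 'y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape, 'y_test shape:', y_test.shape)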