def test_translate_currency_real(self, ticker):
    """Translated values should stay within 10% of the reported `*usd` columns.

    Loads real quarterly data for *ticker*, runs the static currency
    translation and checks the relative error against the pre-computed
    usd columns (NaN entries are ignored).
    """
    cols = [
        'equity', 'eps', 'revenue', 'netinccmn',
        'cashneq', 'debt', 'ebit', 'ebitda',
    ]
    data_loader = SF1Data(config['sf1_data_path'])
    quarterly_df = data_loader.load_quarterly_data(ticker, 10)
    trans_df = SF1Data.translate_currency(quarterly_df, cols)
    for col in cols:
        usd = trans_df['{}usd'.format(col)]
        rel_err = np.abs((usd - trans_df[col]).values / usd.values)
        rel_err = rel_err[~np.isnan(rel_err)]
        assert rel_err.max() < 0.1
def test_calculate(self, tickers):
    """QuarterlyTarget must return the target column shifted by quarter_shift.

    Checks shift=0 on a deduplicated frame and on the full frame, then
    checks positive and negative shifts against a pandas groupby-shift.
    """
    data_loader = SF1Data(config['sf1_data_path'])
    quarterly_df = data_loader.load_quarterly_data(tickers, quarter_count=None)

    # shift=0: the target is just the quarter's own value.
    target = QuarterlyTarget('marketcap', quarter_shift=0)
    info_df = quarterly_df.drop_duplicates('ticker', keep='first')[
        ['ticker', 'date', 'marketcap']]
    y = target.calculate(data_loader, info_df[['ticker', 'date']])
    assert type(y) == pd.DataFrame
    assert 'y' in y.columns
    np.testing.assert_array_equal(y['y'].values, info_df['marketcap'].values)

    info_df = quarterly_df[['ticker', 'date', 'marketcap']]
    y = target.calculate(data_loader, info_df)
    np.testing.assert_array_equal(y['y'].values, info_df['marketcap'].values)

    # Non-zero shifts must agree with a per-ticker pandas shift.
    for shift in (1, -3):
        target = QuarterlyTarget('marketcap', quarter_shift=shift)
        info_df = quarterly_df[['ticker', 'date', 'marketcap']]
        y = target.calculate(data_loader, info_df)
        expected = info_df.groupby('ticker')['marketcap'] \
            .shift(shift).astype('float').values
        np.testing.assert_array_equal(y['y'].values, expected)
def test_load_base_data(self):
    """load_base_data returns a non-empty DataFrame whose ticker column has no NaNs."""
    loader = SF1Data(config['sf1_data_path'])
    base_df = loader.load_base_data()
    assert type(base_df) == pd.DataFrame
    assert len(base_df) > 0
    assert 'ticker' in base_df.columns
    # max() over a boolean null-mask is True iff at least one ticker is missing.
    assert base_df['ticker'].isnull().max() == False
def test_calculate(self, tickers):
    """QuarterlyDiffTarget(norm=False) is the raw difference with the next row's value.

    shift(-1) picks the following row within each ticker group — presumably
    the previous quarter, since rows appear to be ordered newest-first
    (see the ordering assertions in the loader tests).
    """
    loader = SF1Data(config['sf1_data_path'])
    quarterly_df = loader.load_quarterly_data(tickers, quarter_count=None)

    target = QuarterlyDiffTarget('marketcap', norm=False)
    info_df = quarterly_df[['ticker', 'date', 'marketcap']]
    y = target.calculate(loader, info_df)

    assert type(y) == pd.DataFrame
    assert 'y' in y.columns
    assert len(y) == len(info_df)

    curr = info_df['marketcap'].astype('float')
    prev = info_df.groupby('ticker')['marketcap'].shift(-1).astype('float')
    np.testing.assert_array_equal(y['y'].values, (curr - prev).values)
def test_translate_currency_synthetic(self, cnt):
    """translate_currency must recover `*usd` within 10% on generated data.

    Builds two columns with a known per-row rate (one exact, one noisy),
    randomly blanks part of 'ebitusd', and checks the translation's
    relative error. The RNG call order is fixed by the seed.
    """
    np.random.seed(0)
    rates = np.array(range(1, cnt + 1))

    df = pd.DataFrame()
    df['debtusd'] = np.random.uniform(-1e5, 1e5, cnt)
    df['debt'] = df['debtusd'] * rates
    df['ebitusd'] = np.random.uniform(-10, 10, cnt)
    noise = np.random.uniform(-0.1, 0.1, cnt)
    df['ebit'] = df['ebitusd'] * (rates + noise)

    # Blank a random fraction of the usd column to exercise NaN handling.
    del_proba = np.random.uniform(0, 0.3)
    drop_mask = np.random.choice([True, False], cnt,
                                 p=[del_proba, 1 - del_proba])
    df.loc[drop_mask, 'ebitusd'] = None

    trans_df = SF1Data.translate_currency(df, ['debt', 'ebit'])
    for col in ['debt', 'ebit']:
        usd = trans_df['{}usd'.format(col)]
        rel_err = np.abs((usd - trans_df[col]).values / usd.values)
        rel_err = rel_err[~np.isnan(rel_err)]
        assert rel_err.max() < 0.1
def test_load_quarterly_data(self, tickers, quarter_count, dimension):
    """Quarterly data is date-ordered per ticker, capped at quarter_count rows,
    and filtered to the requested dimension."""
    loader = SF1Data(config['sf1_data_path'])
    df = loader.load_quarterly_data(tickers, quarter_count, dimension)
    assert type(df) == pd.DataFrame
    assert 'ticker' in df.columns
    assert 'date' in df.columns

    # Rows must already come out newest-first within every ticker:
    # sorting by date must reproduce the original (reversed) row order.
    df['date_'] = df['date'].astype(np.datetime64)
    df['def_order'] = range(len(df))[::-1]
    by_date = df.sort_values(['ticker', 'date_'],
                             ascending=False)['date'].values
    by_position = df.sort_values(['ticker', 'def_order'],
                                 ascending=False)['date'].values
    np.testing.assert_array_equal(by_date, by_position)

    # No ticker may exceed the requested number of quarters.
    for group_size in df.groupby('ticker').size():
        assert group_size <= quarter_count

    # min() of the boolean comparison: every row has the requested dimension.
    assert (df['dimension'] == dimension).min()
def test_load_daily_data(self, tickers, back_days):
    """Daily data is date-ordered per ticker, has no gaps over two weeks,
    and contains exactly back_days rows per ticker when a limit is given."""
    loader = SF1Data(config['sf1_data_path'])
    df = loader.load_daily_data(tickers, back_days=back_days)
    assert type(df) == pd.DataFrame
    assert 'ticker' in df.columns
    assert 'date' in df.columns

    # Rows must already come out newest-first within every ticker:
    # sorting by date must reproduce the original (reversed) row order.
    df['date_'] = df['date'].astype(np.datetime64)
    df['def_order'] = range(len(df))[::-1]
    by_date = df.sort_values(['ticker', 'date_'],
                             ascending=False)['date'].values
    by_position = df.sort_values(['ticker', 'def_order'],
                                 ascending=False)['date'].values
    np.testing.assert_array_equal(by_date, by_position)

    # Consecutive observations inside a ticker may be at most 14 days apart.
    gaps = df.groupby('ticker')['date_'].shift(1) - df['date_']
    assert (gaps.dropna() <= np.timedelta64(14, 'D')).min()

    if back_days is not None:
        for group_size in df.groupby('ticker').size():
            assert group_size == back_days
"sgna", "ncfx", "divyield", "currentratio", "netinccmn" ] if __name__ == '__main__': parser = argparse.ArgumentParser() arg = parser.add_argument arg('--config_path', type=str) args = parser.parse_args() config = load_json(args.config_path) data_loader = SF1Data(config['sf1_data_path']) tickers_df = data_loader.load_base_data( currency=CURRENCY, scalemarketcap=SCALE_MARKETCAP) ticker_list = tickers_df['ticker'].unique().tolist() fc1 = QuarterlyFeatures( columns=QUARTER_COLUMNS, quarter_counts=QUARTER_COUNTS, max_back_quarter=MAX_BACK_QUARTER) fc2 = BaseCompanyFeatures(cat_columns=CAT_COLUMNS) fc3 = QuarterlyDiffFeatures( columns=QUARTER_COLUMNS, compare_quarter_idxs=COMPARE_QUARTER_IDXS,
import pytest import pandas as pd import numpy as np from ml_investment.data import SF1Data from ml_investment.features import calc_series_stats, QuarterlyFeatures, BaseCompanyFeatures,\ QuarterlyDiffFeatures, FeatureMerger, \ DailyAggQuarterFeatures from ml_investment.utils import load_json, int_hash_of_str from synthetic_data import GeneratedData config = load_json('config.json') loaders = [GeneratedData()] if config['sf1_data_path'] is not None: loaders.append(SF1Data(config['sf1_data_path'])) @pytest.mark.parametrize( ["series", "norm", "expected"], [([10, 0, 1], False, {'_mean': 3.6666666666666665, '_median': 1.0, '_max': 10.0, '_min': 0.0, '_std': 4.4969125210773475}), ([10, -30, 1, 4, 15.2], False, {'_mean': 0.039999999999999855, '_median': 4.0, '_max': 15.2, '_min': -30.0, '_std': 15.798936673080249}), ([1], False,