Example #1
    def _write_syms_to_years(cls, self, df_hist):
        """Take combined df and write to correct year, for each sym."""
        years = df_hist['date'].dt.year.unique()
        syms = df_hist['symbol'].unique()

        df_hist_idx = df_hist.copy()
        df_hist_idx['year'] = df_hist['date'].dt.year

        df_hist_idx = df_hist_idx.set_index(['symbol', 'year'])

        bpath = Path(baseDir().path, 'StockEOD')

        for sym in tqdm(syms):
            try:
                for yr in years:
                    df_mod = (df_hist_idx.loc[sym, yr]
                              .reset_index(level='symbol')
                              .reset_index(drop=True)
                              .copy())
                    yr_path = bpath.joinpath(str(yr), sym.lower()[0], f"_{sym}.parquet")

                    if yr_path.exists():
                        df_old = pd.read_parquet(yr_path)
                        df_all = pd.concat([df_old, df_mod])
                        df_all = df_all.drop_duplicates(subset=['date']).reset_index(drop=True)
                        write_to_parquet(df_all, yr_path)
                    else:
                        write_to_parquet(df_mod.reset_index(drop=True), yr_path)
            except Exception as e:
                print(sym)
                print(str(e))
                print()
Example #2
    def _combine_all(cls, self):
        """Combine all local files into a combined df."""
        bpath = Path(baseDir().path, 'historical/each_sym_all')
        sym_path_list = list(bpath.glob('**/*.parquet'))
        sym_path_list = ([f for f in sym_path_list
                          if 'info' not in str(f)
                          if 'combined_all' not in str(f)])

        sym_list = []

        for fpath in tqdm(sym_path_list):
            try:
                sym_list.append(pd.read_parquet(fpath))
            except Exception as e:
                print(str(e))

        f_suf = f"_{getDate.query('iex_eod')}.parquet"
        # bpath already ends in 'each_sym_all', so only append 'combined_all'
        path_to_write = bpath.joinpath('combined_all', f_suf)

        df_all = pd.concat(sym_list)
        df_all.columns = [col.lower() for col in df_all.columns]
        (df_all.rename(columns={'open': 'fOpen',
                                'high': 'fHigh', 'low': 'fLow',
                                'close': 'fClose',
                                'volume': 'fVolume'}, inplace=True))
        self.df_all = df_all
        write_to_parquet(df_all, path_to_write)
Example #3
def add_perc_change_columns(df_prices=False, df_cleaned=False, refresh=False):
    """Use historical 2021 data from iex in prep to merge."""
    perc_path = Path(baseDir().path, 'StockEOD/combined', "_2021_yprices_percs.parquet")
    # If perc path exists, return that file instead of running whole analysis
    if perc_path.exists() and not refresh:
        df_y = pd.read_parquet(perc_path)
        max_date = df_y.index.get_level_values('date').max()
        print(f"Most recent date for historical data is: {max_date}")
        return df_y

    df_yprices = df_prices[df_prices['date'] >= df_cleaned['date'].min()].copy()
    cols_to_keep = (['date', 'symbol', 'fOpen', 'fClose', 'fHigh', 'fLow',
                     'fVolume', 'change', 'changePercent', 'changeOverTime',
                     'marketChangeOverTime'])

    df_y = df_yprices[cols_to_keep].copy()
    path = Path(baseDir().path, 'StockEOD/combined', "_2021_yprices.parquet")
    df_yprices.to_parquet(path)

    df_y = (df_y.dropna(subset=['date', 'symbol'])
                .drop_duplicates(subset=['date', 'symbol'])
                .set_index(['date', 'symbol'])
                .sort_index(level=['date', 'symbol']))

    df_y['fRange'] = (df_y['fHigh'] - df_y['fLow']).round(2)
    syms = df_y.index.get_level_values('symbol').unique().tolist()

    cols_cperc_change = ['c_perc1', 'c_perc2', 'c_perc3', 'c_perc5', 'c_perc7']
    cols_operc_change = ['o_perc1', 'o_perc2', 'o_perc3', 'o_perc5', 'o_perc7']
    all_perc_cols = cols_cperc_change + cols_operc_change
    # initialize as float so the assignments below keep the dtype
    df_y[all_perc_cols] = 0.0

    for sym in tqdm(syms):
        df_sub = df_y[df_y.index.get_level_values('symbol') == sym].copy()
        for col in all_perc_cols:
            # a negative period looks forward; negating the result gives the
            # percent move from today to n days ahead
            df_sub[col] = -df_sub['fClose'].pct_change(periods=-int(col[-1]))
        df_y.loc[df_sub.index, all_perc_cols] = df_sub[all_perc_cols]

    df_y = dataTypes(df_y, parquet=True).df
    df_y.to_parquet(perc_path)

    return df_y
Example #4
def analyze_iex_ytd():
    """Analyze iex historical data for this year."""
    df_prices_get = serverAPI('stock_close_prices').df
    df_prices = df_prices_get.copy()
    df_prices['date'] = pd.to_datetime(df_prices['date'], unit='ms')

    dt_max = df_prices['date'].max().date()
    path = Path(baseDir().path, 'StockEOD/combined', f"_{dt_max}.parquet")
    df_prices.to_parquet(path)

    df_2021 = df_prices[df_prices['date'].dt.year >= 2021].copy()
    return df_2021
Example #5
def get_company_meta_data():
    """Get company meta data, save locally, from IEX."""
    all_symbols = serverAPI('all_symbols').df
    # keep common stock ('cs') and ADRs ('ad')
    all_cs = all_symbols[all_symbols['type'].isin(['cs', 'ad'])]
    sym_list = all_cs['symbol'].unique().tolist()

    bpath = Path(baseDir().path, 'company_stats/meta')

    for sym in tqdm(sym_list):
        try:
            ud = urlData(f"/stock/{sym}/company")
            fpath_suf = f"{sym.lower()[0]}/_{sym}.parquet"
            fpath = bpath.joinpath(fpath_suf)
            write_to_parquet(ud.df, fpath)
        except Exception as e:
            print(f"Company meta stats error: {type(e)} {str(e)}")
Example #6
    def _write_to_local(cls, self, data):
        """Write to local dataframes."""
        syms = data.columns.get_level_values(0).unique()
        bpath = Path(baseDir().path, 'historical/each_sym_all')

        for sym in tqdm(syms):
            df_sym = (data.loc[:, data.columns.get_level_values(0) == sym]
                          .droplevel(0, axis='columns')
                          .reset_index().copy())
            df_sym.insert(0, 'symbol', sym)
            df_sym.columns = [col.lower() for col in df_sym.columns]
            # columns are lowercase at this point, so rename by lowercase keys
            (df_sym.rename(columns={'open': 'fOpen',
                                    'high': 'fHigh', 'low': 'fLow',
                                    'close': 'fClose',
                                    'volume': 'fVolume'}, inplace=True))

            sym_ea_path = bpath.joinpath(sym.lower()[0], f"_{sym}.parquet")
            write_to_parquet(df_sym, sym_ea_path)
Example #7
def treasuries_clean_write():
    """Clean, and store daily treasury data locally."""
    tz = serverAPI('treasuries').df

    tz['time_test'] = pd.to_datetime(tz['time'], unit='ms', errors='coerce')
    tz_mod = tz.dropna(subset=['time_test'])
    tz_mod = tz_mod.drop(columns=['time']).rename(columns={'time_test': 'time'})
    tz = tz[~tz.index.isin(tz_mod.index)].drop(columns=['time_test']).copy()
    tz = pd.concat([tz, tz_mod])
    # Yahoo tickers: ^IRX=13-week bill, ^FVX=5-year, ^TNX=10-year, ^TYX=30-year
    col_dict = ({'^IRX': 'ThreeM', '^FVX': 'FiveY',
                 '^TNX': 'TenY', '^TYX': 'ThirtyY'})
    tz.rename(columns=col_dict, inplace=True)
    tz['time'] = pd.to_datetime(tz['time'])
    tz['date'] = pd.to_datetime(tz['time'].dt.date)
    tz = tz.sort_values(by=['date'])

    # one row per day; numeric_only avoids averaging the datetime column
    tz_daily = tz.groupby(by=['date']).mean(numeric_only=True)
    path_to_write = Path(baseDir().path, 'economic_data/tz_daily.parquet')
    write_to_parquet(tz_daily, path_to_write)

    return tz_daily
Example #8
    def _find_missing_hist_symbols(cls, self):
        """Finding all missing symbols from max historical."""
        bpath = Path(baseDir().path, 'historical/each_sym_all')
        self.bpath = bpath
        info_path = bpath.joinpath('info', 'info.parquet')
        
        if info_path.exists():
            sym_df = pd.read_parquet(info_path)
        else:
            df_stats = get_symbol_stats()
            symbols = df_stats['symbol'].dropna().unique().tolist()
            sym_df = pd.DataFrame(symbols, columns=['symbol'])

        sym_path_list = list(bpath.glob('**/*.parquet'))
        sym_list = [str(f).split('_')[-1].split('.')[0] for f in sym_path_list]

        # 1 flags symbols with no local parquet file (i.e. actually missing)
        sym_df['missing'] = np.where(sym_df['symbol'].isin(sym_list), 0, 1)

        write_to_parquet(sym_df, info_path)

        sym_missing = sym_df[sym_df['missing'] == 1]
        sym_start = sym_missing['symbol'].tolist()[0:1000]

        return sym_df, sym_start
Example #9
"""
Get a list of all directories, create in new dir data_test
"""
# %% codecell
##############################
import glob
import os

from multiuse.help_class import baseDir
# %% codecell
##############################
data_dir = f"{baseDir().path}/*"

glob.glob(data_dir)

list(os.walk(baseDir().path))

# %% codecell
##############################
Example #10
pd.DataFrame.chained_isin = chained_isin

pd.set_option('display.max_columns', 50)
# %% codecell

# df_all = read_clean_combined_all(local=True)

# %% codecell

fpath = Path(baseDir().path, 'ml_data/fib_analysis/df_all_temp.parquet')
df_all = pd.read_parquet(fpath)
df_all = add_gap_col(df_all)

# %% codecell

df_all_cols = df_all.columns
cols_to_round = ([
    'fOpen', 'fLow', 'fClose', 'fHighMax', 'prev_close', 'rsi', 'vol_avg_2m',
    'fCP5', 'sma_50', 'sma_200'
])
df_all[cols_to_round] = df_all[cols_to_round].astype(np.float64).round(2)
df_all.reset_index(drop=True, inplace=True)

# Pattern to screen for (sketched below):
# 1. Period of little movement for 2+ weeks.
# 2. Period of major up movement.
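
# A rough sketch of screening for that two-step pattern, assuming df_all
# carries a per-day percent-change column like fChangeP; the thresholds and
# window length below are illustrative assumptions, not tested parameters.
def flag_flat_then_breakout(df_sym, flat_days=10, flat_thresh=0.01,
                            move_thresh=0.05):
    """True where ~2 weeks of quiet trading precede a major up move."""
    # rolling std of daily percent change over the quiet window
    quiet = df_sym['fChangeP'].rolling(flat_days).std() < flat_thresh
    # large single-day up move right after the quiet stretch
    breakout = df_sym['fChangeP'] > move_thresh
    return quiet.shift(1, fill_value=False) & breakout

# signals = df_all.groupby('symbol', group_keys=False).apply(flag_flat_then_breakout)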
Example #11
import os
from pathlib import Path

from tqdm import tqdm

import pandas as pd
import numpy as np
import talib
from talib import abstract
from multiuse.help_class import baseDir, dataTypes, getDate

# %% codecell
# Display max 100 columns
pd.set_option('display.max_columns', 100)
# Display max 100 rows
pd.set_option('display.max_rows', 100)

# %% codecell
path = Path(baseDir().path, 'historical', '2021')

price_cols = ['fOpen', 'fHigh', 'fLow', 'fClose']
cols_to_read = ['fVolume'] + price_cols

df_list = [pd.read_parquet(fpath) for fpath in path.glob('**/*.parquet')
           if os.path.getsize(fpath) > 0]
df = pd.concat(df_list)
# %% codecell
df = df.set_index(['symbol', 'date'])
df_sub = df[cols_to_read].copy()

combined_fpath = Path(baseDir().path, 'historical', 'combined', 'sub.parquet')
combined_fpath.resolve()

df_sub = dataTypes(df_sub).df
Example #12
# Count non-finite values (inf/NaN) per column; TypeError skips non-numeric cols
for col in yoptions_all.columns:
    try:
        inf = yoptions_all[~np.isfinite(yoptions_all[col])].shape[0]
        if inf > 0:
            print(f"{col} {inf}")
    except TypeError:
        pass

# %% codecell
# ref_data = serverAPI('cboe_symref').df
# min_ref_data = ref_data[['Underlying', 'side', 'expirationDate', 'sym_suf']].copy()
# min_ref_data['contractSymbol'] = min_ref_data.apply(lambda row: f"{row['Underlying']}{row['sym_suf']}", axis=1)
# deriv_all = pd.merge(yoptions_all, min_ref_data, on=['contractSymbol'], how='left')

# %% codecell
path_to_write = Path(baseDir().path, 'derivatives/temp_dump/yderivs_comb.parquet')
# write_to_parquet(deriv_all, path_to_write)
# %% codecell
df_all = pd.read_parquet(path_to_write)
# %% codecell
df_all['sym_suf'].isna().sum()
# For simplicity's sake, lets only work with cleaned data
df_mod = df_all.dropna(subset=['sym_suf'])
path_to_write = Path(baseDir().path, 'derivatives/temp_dump/yderivs_nonan.parquet')
# write_to_parquet(df_mod, path_to_write)
# %% codecell

# %% codecell
# reload the cleaned (no-NaN) frame written above
path_to_write = Path(baseDir().path, 'derivatives/temp_dump/yderivs_nonan.parquet')
df_mod = pd.read_parquet(path_to_write)
# %% codecell
Example #13
# %% codecell

df_test.loc['OCGN'].nlargest(5, 'corr')

df_test

# %% codecell

# There's the question of correlating raw percentage returns,
# or applying a logarithmic transform first to flatten the noise.
# I'm guessing the latter is probably the better idea.

# `logprice` assumed to be the log of a price series, e.g. np.log(df['fClose'])
scaled_price = (logprice - np.mean(logprice)) / np.sqrt(np.var(logprice))
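
# As a self-contained version of the line above, assuming a hypothetical wide
# DataFrame df_prices of close prices (one column per symbol, indexed by date):
def scale_log_prices(df_prices):
    """Standardize log prices: (log(p) - mean) / std, per symbol column."""
    logp = np.log(df_prices)
    # ddof=0 matches np.sqrt(np.var(...)) in the formula above
    return (logp - logp.mean()) / logp.std(ddof=0)

# scaled = scale_log_prices(df_prices)
# scaled.corr()  # pairwise correlation across symbols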

# %% codecell
fpath = Path(baseDir().path, 'ref_data', 'peer_list', '_peers.parquet')
df_peers = pd.read_parquet(fpath)

all_syms = serverAPI('all_symbols').df
df_peers = pd.merge(df_peers,
                    all_syms[['symbol', 'type']],
                    on='symbol',
                    how='left')
# `mask` here is the custom helper from multiuse.pd_funcs patched onto pd.DataFrame
df_peers = (df_peers.mask('corr', .95, lesser=True)
                    .mask('corr', -.95, greater=True))

# %% codecell
df_peers_idx = df_peers.set_index(['key', 'type'])

df_peers
Example #14
df['reportDate'] = pd.to_datetime(df['rptDate'].str[-13:-3], format='%Y-%m-%d')

cols_to_drop = [col for col in df.columns if 'strike' in col or 'rpt' in col]

df_sub = df.drop(columns=cols_to_drop).copy()
df_sub['expDate'] = pd.to_datetime(df_sub['expDate'], unit='ms')

# %% codecell

df_sym_sub = df_sub['Underlying']
cols_to_rename = {'reportDate': 'date', 'Underlying': 'symbol'}
df_sub.rename(columns=cols_to_rename, inplace=True)

# Read historical data collected from IEX
combined_fpath = Path(baseDir().path, 'historical', 'combined', 'sub.parquet')
df_hist = pd.read_parquet(combined_fpath)
# Only include values after 2020
df_hist = df_hist[df_hist.index.get_level_values('date') > '2020']
df_use = df_hist[df_hist.index.get_level_values('symbol').isin(
    df_sub['symbol'].tolist())].copy()
df_use['range'] = df_use['fHigh'] - df_use['fLow']

# Create percentage change columns for the following days
periods = [1, 2, 3, 5, 10]
periods_to_cols = [f"pc_{p}" for p in periods]
df_use.sort_index(level='symbol', inplace=True)
# Cycle through percentage change columns; a negative period looks forward,
# so -pct_change(-p) is the percent move from today to p days ahead, rounded to 0 dp
for p, col in zip(periods, periods_to_cols):
    df_use[col] = (-df_use['fClose'].pct_change(periods=-p) * 100).round(0)
Example #15
        url_1 = 'https://api.stocktwits.com/api/2/streams'
        url_2 = f'/symbol/{symbol}.json'
        url = f"{url_1}{url_2}"

        try:
            get = s.get(url)
        except ConnectionError:
            break

        if get.status_code == 200:

            df = pd.DataFrame(get.json()['messages'])
            df = clean_st_messages(df)

            path = Path(baseDir().path, 'all_symbol_data', f"{symbol}",
                        'daily', f"_{dt}.parquet")
            if path.exists():
                df_old = pd.read_parquet(path)
                df_all = pd.concat([df_old, df])
                df_all = df_all.drop_duplicates(subset=['id'])
            else:
                df_all = df.copy()
            df_all = df_all.dropna().reset_index(drop=True)
            try:
                write_to_parquet(df_all, path)
                syms_collected.append(symbol)
            except Exception as e:
                print(f"Could not write symbol {symbol} to parquet: {str(e)}")
        elif get.status_code == 404:
            if get.json()['errors'][0]['message']:
Example #16
import yfinance as yf

from multiuse.help_class import baseDir, getDate, write_to_parquet, dataTypes, check_nan
from multiuse.path_helpers import get_most_recent_fpath
from multiuse.pd_funcs import mask, chained_isin

from studies.add_study_cols import add_gap_col, calc_rsi, make_moving_averages, add_fChangeP_col, add_fHighMax_col
importlib.reload(sys.modules['studies.add_study_cols'])
from studies.add_study_cols import add_gap_col, calc_rsi, make_moving_averages, add_fChangeP_col, add_fHighMax_col

from api import serverAPI
# %% codecell
pd.DataFrame.mask = mask
pd.DataFrame.chained_isin = chained_isin

dump_path = Path(baseDir().path, 'dump', 'df_all_cleaned_max.parquet')

df_all = pd.read_parquet(dump_path).copy()
# %% codecell
path = Path(baseDir().path, 'dump', 'refact_fib_data.parquet')
# write_to_parquet(df_all, path)
# %% codecell

df_all = pd.read_parquet(path).copy()
Example #17
# Someone bought/sold 800 calls at $7 strike for RIG 2022

# We probably want the last 50 holidays, the next 50 holidays, to run every 6 months
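
# A sketch for the holiday note above. The /ref-data/us/dates/holiday/...
# endpoints are an assumption based on IEX's reference API, and the cache
# path is hypothetical; rerun roughly every 6 months per the note.
def get_market_holidays():
    """Pull the previous and next 50 US market holidays and cache locally."""
    df_last = urlData('/ref-data/us/dates/holiday/last/50').df
    df_next = urlData('/ref-data/us/dates/holiday/next/50').df
    df_holidays = pd.concat([df_last, df_next]).reset_index(drop=True)
    path = Path(baseDir().path, 'ref_data/holidays.parquet')
    write_to_parquet(df_holidays, path)
    return df_holidays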

# %% codecell
##################################
redo_otc_syms = serverAPI('redo', val='otc_ref_data')
otc_syms = serverAPI('otc_syms').df

all_syms = serverAPI('all_symbols').df
all_syms = df_create_bins(all_syms)

all_syms.dtypes

base_dir = baseDir().path


new_syms = urlData('/ref-data/symbols')
new_syms_df = new_syms.df.copy(deep=True)
new_syms_df['type'].value_counts()

all_syms['type'].value_counts()

otc_syms = urlData('/ref-data/otc/symbols').df
otc_df = otc_syms.copy(deep=True)

all_syms['bins'].value_counts()

#  pd.qcut(df['ext price'], q=4)
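
# For reference, binning like the df_create_bins call above could use pd.qcut;
# this is a sketch (the real df_create_bins isn't shown in these examples, and
# the column name is a placeholder):
def add_quantile_bins(df, col='volume', q=4):
    """Label each row with its quantile bucket for `col`."""
    df = df.copy()
    df['bins'] = pd.qcut(df[col], q=q, labels=False, duplicates='drop')
    return df

# all_syms = add_quantile_bins(all_syms)
# all_syms['bins'].value_counts()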
Example #18
# %% codecell
serverAPI('redo', val='GetMissingDates')
# %% codecell
serverAPI('redo', val='warrants')
# %% codecell
serverAPI('redo', val='get_missing_hist_from_yf')
# %% codecell
serverAPI('redo', val='CboeIntraday')
# %% codecell
serverAPI('redo', val='combine_all_cboe_symref')
# %% codecell

mf_url = '/ref-data/mutual-funds/symbols'
mf_syms = urlData(mf_url).df

path = Path(baseDir().path, 'tickers', 'mfund_symbols.parquet')
mf_syms.info()

# %% codecell

yall_today = serverAPI('yoptions_daily').df
# yall_all = serverAPI('yoptions_all').df
yall_dd = dd.from_pandas(yall_today, npartitions=1)

cboe_symref = serverAPI('cboe_symref_all').df
cboe_dd = dd.from_pandas(cboe_symref, npartitions=1)

cboe_dd['OSI Symbol'] = cboe_dd['OSI Symbol'].str.replace(' ', '')
# expirationDate arrives numeric; cast through str so the yymmdd format parses
cboe_dd['expirationDate'] = cboe_dd['expirationDate'].astype('int64').astype(str)
cboe_dd['expirationDate'] = dd.to_datetime(cboe_dd['expirationDate'], format='%y%m%d')