def build_correlations(start_date='2019-01-01', end_date=None, num_stocks=1000):
    """Compute pairwise stock correlations for a date window and persist them.

    Pulls cleaned price data via ``clean_and_format``, correlates it with
    ``correlate``, then writes two tables to Postgres: a window-specific
    ``correlations_<id>`` table and the shared ``coef_variation`` table.

    :param start_date: window start, 'YYYY-MM-DD'.
    :param end_date: window end, 'YYYY-MM-DD'; defaults to today's date,
        resolved at call time.
    :param num_stocks: number of top-volume stocks to include.
    """
    # Bug fix: the old default (datetime.now() in the signature) was evaluated
    # once at import time, so a long-running process kept using the import-day
    # date. Resolve "today" at call time instead.
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')
    engine = get_pg_engine()
    df, stdevs = clean_and_format(start_date, end_date, num_stocks)
    df = correlate(df)
    # Window identifier, e.g. 2019_01_01_2020_01_01_1000; doubles as table suffix.
    a_id = f"{start_date.replace('-','_')}_{end_date.replace('-','_')}_{num_stocks}"
    df['id'] = a_id
    df.to_sql('correlations_' + a_id, engine, if_exists='replace')
    stdevs.to_sql('coef_variation', engine, if_exists='replace')
def get_symbols(exchange='LON', full_refresh=False):
    """Fetch stock names and symbols for an exchange from the IEX API
    and write them to the ``symbols`` table.

    :param exchange: IEX exchange code (e.g. 'LON' for London).
    :param full_refresh: if True, drop the existing ``symbols`` table first
        so the append below effectively rebuilds it.
    """
    engine = get_pg_engine()
    if full_refresh:
        engine.execute("DROP TABLE IF EXISTS symbols")
    # IEX_ROOT / IEX_TOKEN come from the environment; no validation here, so a
    # missing variable produces a malformed URL rather than a clear error.
    url = f"{os.getenv('IEX_ROOT')}/stable/ref-data/exchange/{exchange}/symbols?token={os.getenv('IEX_TOKEN')}"
    response = requests.get(url)
    if response.ok:
        df = pd.DataFrame.from_dict(response.json())
        logger.info(f"Found {len(df)} stock symbols, writing to DB")
        df.set_index('symbol').to_sql('symbols', engine, if_exists='append')
    else:
        # Bug fix: a failed API call was logged at INFO level, making failures
        # easy to miss; log it as an error instead.
        logger.error(f"API call not successful, error: {response.text}")
def get_nonexisting(shuffle=True, types=('cs',)):
    """Return symbols that exist in ``symbols`` but have no row yet in either
    ``price`` or ``errors`` — i.e. the symbols still to be fetched.

    Also creates the ``price`` and ``errors`` tables if they do not exist.

    :param shuffle: if True, return the symbols in random order.
    :param types: IEX security types to include (default: common stock).
        Changed from a list to a tuple default so the default is immutable.
    :return: list of symbol strings.
    """
    engine = get_pg_engine()
    engine.execute("CREATE TABLE IF NOT EXISTS price (timestamp date, open double precision,high double precision, low double precision, close double precision, volume double precision, symbol text)")
    engine.execute("CREATE TABLE IF NOT EXISTS errors (symbol text, error text, datetime date)")
    # NOTE(review): `types` values are interpolated into the SQL; fine for
    # internal callers, but a parameterized query would be safer.
    query = f"""
    with existing as (
    SELECT distinct symbol from (select symbol from price union select symbol from errors) o
    )
    select symbol from symbols
    where symbol not in (select * from existing)
    and type in ({','.join([f"'{t}'" for t in types])})
    """
    df = pd.read_sql(query, engine)
    if shuffle:
        res = df.sample(frac=1)['symbol'].tolist()
    else:
        # Bug fix: both branches previously called df.sample(frac=1), so
        # shuffle=False still shuffled; preserve DB order instead.
        res = df['symbol'].tolist()
    return res
def clean_and_format(start_date='2019-01-01', end_date=datetime.now().strftime('%Y-%m-%d'), num_stocks=1000, use_pct_change=True):
    """Load daily close prices from Postgres and shape them for correlation.

    Picks the ``num_stocks`` symbols with the highest traded volume in the
    window, filters out stocks with extreme coefficients of variation or
    heavy missingness, forward-fills remaining gaps, and optionally converts
    prices to daily percentage changes.

    NOTE(review): ``end_date``'s default is evaluated once at import time, so
    a long-lived process keeps the import-day date — consider a ``None``
    default resolved inside the function.

    :param start_date: window start, 'YYYY-MM-DD'.
    :param end_date: window end, 'YYYY-MM-DD'.
    :param num_stocks: how many top-volume symbols to pull (must be > 20).
    :param use_pct_change: if True, return daily pct change instead of prices.
    :return: ``(price_data, stdevs)`` — wide dates-by-symbols price matrix and
        the per-symbol coefficient-of-variation frame (unfiltered).
    """
    # NOTE(review): asserts vanish under `python -O`; these input checks would
    # be safer as explicit ValueError raises.
    assert num_stocks > 20, 'To build an interesting analysis, make sure the number of stocks to use is at least 20'
    start_date_fmt = datetime.strptime(start_date, '%Y-%m-%d')
    end_date_fmt = datetime.strptime(end_date, '%Y-%m-%d')
    diff = end_date_fmt - start_date_fmt
    # Require at least a 12-week window for meaningful correlations.
    assert int(
        diff.days / 7
    ) > 12, 'For more meaningful correlations increase the window between the start_date and end_date (at least 12 weeks)'
    engine = get_pg_engine()
    # Top `num_stocks` symbols by total traded volume in the window.
    # NOTE(review): dates are interpolated into the SQL string — acceptable
    # for trusted callers, but parameterized queries would be safer.
    stocks = pd.read_sql(
        f'select symbol, sum(volume) as volume \
        from price \
        where timestamp between \'{start_date}\' and \'{end_date}\' \
        group by symbol \
        order by sum(volume) desc \
        limit {num_stocks}', engine)
    relevant_stocks = ','.join(
        [f"'{stock}'" for stock in stocks['symbol'].tolist()])
    price_data = pd.read_sql(
        f"select timestamp, symbol, close as price\
        from price\
        where \"timestamp\" between \'{start_date}\' and \'{end_date}\'\
        and symbol in ({relevant_stocks})", engine)
    # Coefficient of variation (std / mean) per symbol: used both as a filter
    # below and returned as an artifact for the caller to persist.
    stdevs = price_data.groupby('symbol')['price'].apply(lambda x: np.std(
        x) / x.mean()).to_frame().rename(columns={'price': 'var_coef'})
    # Keep only the middle 90% of coefficients of variation (drop <5th and
    # >95th percentile).
    keep = stdevs[(stdevs < stdevs.quantile(0.95))
                  & (stdevs > stdevs.quantile(0.05))].index
    print(
        f"Dropping stocks with very high or very low coefficients of variation\n\
    Keeping {len(keep)} out of {len(price_data['symbol'].unique())}")
    price_data = price_data[price_data['symbol'].isin(keep)]
    # Wide format: one row per date, one column per symbol.
    price_data = price_data.pivot(index='timestamp',
                                  columns='symbol',
                                  values='price')
    # Drop symbols with more than 65% of dates missing.
    drop_stocks = price_data.isna().apply('mean').sort_values(ascending=False).reset_index()\
        .rename(columns={0:'nas'}).query('nas > 0.65')
    print(
        f"Will drop {len(drop_stocks)} stocks from the total list, dropping high missing values"
    )
    price_data = price_data.loc[:, ~price_data.columns.
                                isin(drop_stocks['symbol'])]
    print(f"Now cleaning dates")
    # Forward-fill gaps. NOTE(review): `method='ffill'` is deprecated in
    # recent pandas; `price_data.ffill()` is the modern spelling.
    price_data = price_data.fillna(method='ffill')
    # After ffill, remaining NaNs can only be at the start of the period;
    # drop symbols still missing more than 10% of dates.
    drop_stocks = price_data.isna().apply('mean').reset_index().rename(
        columns={
            0: 'nas'
        }).query('nas > 0.1')
    print(
        f"Will additionally drop {len(drop_stocks)} due to high missingness at start of period"
    )
    if len(drop_stocks) > 0:
        price_data = price_data.loc[:, ~price_data.columns.
                                    isin(drop_stocks['symbol'])]
    # Drop every date that still has at least one missing value (the `> 0`
    # threshold removes dates with ANY NaN, not just "high" missingness).
    drop_dates = price_data.isna().apply('mean', axis=1).to_frame().assign(
        drop=lambda df: df[0] > 0).query('drop').index
    price_data = price_data[~price_data.index.isin(drop_dates)]
    if use_pct_change:
        price_data = price_data.pct_change(fill_method='ffill')
        # First row is all-NaN after pct_change; drop it.
        price_data = price_data.iloc[1:]
    return price_data, stdevs
import requests import pandas as pd import os from sqlalchemy.exc import ProgrammingError import time import psycopg2 from utils import clean_columns, get_pg_engine, read_table import io import logging from tqdm import tqdm from datetime import datetime import sys engine = get_pg_engine() logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(f"logs/price.log"), logging.StreamHandler(sys.stdout) ] ) logger=logging.getLogger() class RateLimitExceededException(Exception): def __init__(self,msg=None): if msg is None: msg = "API Rate Limit Exceeded" def fetch_price(symbol):