Пример #1
0
def build_correlations(start_date='2019-01-01',
                       end_date=None,
                       num_stocks=1000):
    """Build a correlation table for a date window and write it to the database.

    The price data is loaded and cleaned with ``clean_and_format``, passed
    through ``correlate``, tagged with an ``id`` column and written to a
    table named ``correlations_<id>``. The coefficient-of-variation frame
    returned by ``clean_and_format`` is written to ``coef_variation``.
    Both tables are replaced if they already exist.

    Args:
        start_date: Window start as a ``'YYYY-MM-DD'`` string.
        end_date: Window end as a ``'YYYY-MM-DD'`` string. ``None`` (the
            default) means today's date, resolved at call time.
        num_stocks: Number of stocks forwarded to ``clean_and_format``.
    """
    if end_date is None:
        # Resolve "today" on every call. A `datetime.now()` default in the
        # signature is evaluated only once, when the function is defined,
        # and would go stale in a long-running process.
        end_date = datetime.now().strftime('%Y-%m-%d')

    engine = get_pg_engine()
    df, stdevs = clean_and_format(start_date, end_date, num_stocks)
    df = correlate(df)

    # Identifier derived from the run parameters, e.g. 2019_01_01_2020_01_01_1000;
    # it is stored in the frame and used as the table-name suffix.
    a_id = f"{start_date.replace('-','_')}_{end_date.replace('-','_')}_{num_stocks}"
    df['id'] = a_id

    df.to_sql('correlations_' + a_id, engine, if_exists='replace')
    stdevs.to_sql('coef_variation', engine, if_exists='replace')
Пример #2
0
def get_symbols(exchange='LON', full_refresh=False):
    """
    Fetch the symbol reference data of an exchange from the IEX API and
    append it to the ``symbols`` table.

    The endpoint root and the API token are read from the ``IEX_ROOT`` and
    ``IEX_TOKEN`` environment variables.

    Args:
        exchange: Exchange code inserted into the request path.
        full_refresh: When true, the ``symbols`` table is dropped before the
            new rows are written.

    Nothing is returned; a non-OK response is logged and otherwise ignored.
    """
    engine = get_pg_engine()
    if full_refresh:
        engine.execute("DROP TABLE IF EXISTS symbols")

    url = f"{os.getenv('IEX_ROOT')}/stable/ref-data/exchange/{exchange}/symbols?token={os.getenv('IEX_TOKEN')}"
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=30)
    if response.ok:
        df = pd.DataFrame.from_dict(response.json())
        logger.info(f"Found {len(df)} stock symbols, writing to DB")
        df.set_index('symbol').to_sql('symbols', engine, if_exists='append')
    else:
        # A failed call is an error condition, so log it at ERROR level.
        logger.error(f"API call not successful, error: {response.text}")
Пример #3
0
def get_nonexisting(shuffle = True, types=('cs',)):
    """Return the symbols that have neither price rows nor a recorded error.

    The ``price`` and ``errors`` tables are created if they do not exist.
    The result is every ``symbol`` in the ``symbols`` table whose ``type``
    is one of ``types`` and which does not appear in ``price`` or ``errors``.

    Args:
        shuffle: When true the symbols are returned in random order;
            otherwise they are returned in the order the query yields them.
        types: Iterable of symbol type codes to include.

    Returns:
        A list of symbol strings (empty when ``types`` is empty).
    """
    engine = get_pg_engine()
    engine.execute("CREATE TABLE IF NOT EXISTS price (timestamp date, open double precision,high double precision, low double precision, close double precision, volume double precision, symbol text)")
    engine.execute("CREATE TABLE IF NOT EXISTS errors (symbol text, error text, datetime date)")

    types = list(types)
    if not types:
        # "type in ()" is not valid SQL, and no symbol can match an empty set.
        return []

    # NOTE: the type codes are interpolated into the SQL text as-is, so only
    # trusted values should be passed in.
    type_list = ','.join(f"'{t}'" for t in types)
    query = f"""
    with existing as (
    SELECT distinct symbol from (select symbol from price union select symbol from errors) o
    )
    select symbol from symbols where symbol not in (select * from existing) and type in ({type_list})
    """
    df = pd.read_sql(query, engine)
    if shuffle:
        # sample(frac=1) returns all rows in random order.
        df = df.sample(frac=1)
    return df['symbol'].tolist()
Пример #4
0
def clean_and_format(start_date='2019-01-01',
                     end_date=None,
                     num_stocks=1000,
                     use_pct_change=True):
    """Load closing prices for the most traded stocks and clean them.

    Steps:
      1. Pick the ``num_stocks`` symbols with the largest summed volume in
         the window and load their closing prices from the ``price`` table.
      2. Compute a coefficient of variation per symbol and keep only the
         symbols strictly between the 5% and 95% quantiles.
      3. Pivot to one row per timestamp and one column per symbol.
      4. Drop symbols with more than 65% missing values, forward-fill, then
         drop symbols that still have more than 10% missing values.
      5. Drop every date that still has any missing value.
      6. Optionally convert prices to period-over-period percentage changes.

    Args:
        start_date: Window start as a ``'YYYY-MM-DD'`` string.
        end_date: Window end as a ``'YYYY-MM-DD'`` string. ``None`` (the
            default) means today's date, resolved at call time.
        num_stocks: How many of the highest-volume symbols to load.
        use_pct_change: When true, return percentage changes (the first row,
            which has no predecessor, is removed) instead of raw prices.

    Returns:
        A tuple ``(price_data, stdevs)``: the cleaned wide frame, and a frame
        indexed by symbol with a ``var_coef`` column covering every loaded
        symbol (including those filtered out of ``price_data``).

    Raises:
        AssertionError: If ``num_stocks`` or the date window is too small.
        ValueError: If a date string does not match ``'%Y-%m-%d'``.
    """
    if end_date is None:
        # Resolve "today" on every call. A `datetime.now()` default in the
        # signature is evaluated only once, when the function is defined.
        end_date = datetime.now().strftime('%Y-%m-%d')

    # NOTE(review): the check is strict (> 20) although the message says
    # "at least 20"; the same holds for the 12-week check below.
    assert num_stocks > 20, 'To build an interesting analysis, make sure the number of stocks to use is at least 20'

    # Parsing also validates the date strings before they reach the SQL text.
    start_date_fmt = datetime.strptime(start_date, '%Y-%m-%d')
    end_date_fmt = datetime.strptime(end_date, '%Y-%m-%d')
    diff = end_date_fmt - start_date_fmt
    assert int(
        diff.days / 7
    ) > 12, 'For more meaningful correlations increase the window between the start_date and end_date (at least 12 weeks)'

    engine = get_pg_engine()

    # Symbols ranked by total traded volume inside the window.
    stocks = pd.read_sql(
        f'select symbol, sum(volume) as volume \
                       from price \
                       where timestamp between \'{start_date}\' and \'{end_date}\' \
                       group by symbol \
                       order by sum(volume) desc \
                       limit {num_stocks}', engine)

    relevant_stocks = ','.join(
        [f"'{stock}'" for stock in stocks['symbol'].tolist()])

    price_data = pd.read_sql(
        f"select timestamp, symbol, close as price\
                            from price\
                            where \"timestamp\" between \'{start_date}\' and \'{end_date}\'\
                            and symbol in ({relevant_stocks})", engine)

    # Calculating coefficient of variation to keep only stocks with more price movement
    stdevs = price_data.groupby('symbol')['price'].apply(lambda x: np.std(
        x) / x.mean()).to_frame().rename(columns={'price': 'var_coef'})
    # Filter on the Series, not the DataFrame: masking a DataFrame with a
    # boolean DataFrame only replaces values with NaN and keeps every row,
    # so its index would still contain all symbols.
    var_coef = stdevs['var_coef']
    keep = var_coef[(var_coef < var_coef.quantile(0.95))
                    & (var_coef > var_coef.quantile(0.05))].index
    print(
        f"Dropping stocks with very high or very low coefficients of variation\n\
        Keeping {len(keep)} out of {len(price_data['symbol'].unique())}")

    price_data = price_data[price_data['symbol'].isin(keep)]

    # Long -> wide: one row per timestamp, one column per symbol.
    price_data = price_data.pivot(index='timestamp',
                                  columns='symbol',
                                  values='price')
    # Share of missing values per symbol; symbols above 65% are dropped.
    drop_stocks = price_data.isna().apply('mean').sort_values(ascending=False).reset_index()\
    .rename(columns={0:'nas'}).query('nas > 0.65')
    print(
        f"Will drop {len(drop_stocks)} stocks from the total list, dropping high missing values"
    )

    price_data = price_data.loc[:, ~price_data.columns.
                                isin(drop_stocks['symbol'])]

    print(f"Now cleaning dates")
    # Equivalent to fillna(method='ffill'), which is deprecated in pandas.
    price_data = price_data.ffill()

    # Forward-filling cannot fill leading gaps, so what is still missing
    # sits at the start of each column.
    drop_stocks = price_data.isna().apply('mean').reset_index().rename(
        columns={
            0: 'nas'
        }).query('nas > 0.1')
    print(
        f"Will additionally drop {len(drop_stocks)} due to high missingness at start of period"
    )
    if len(drop_stocks) > 0:
        price_data = price_data.loc[:, ~price_data.columns.
                                    isin(drop_stocks['symbol'])]

    # Drop dates that still have any missing value (threshold is > 0)
    drop_dates = price_data.isna().apply('mean', axis=1).to_frame().assign(
        drop=lambda df: df[0] > 0).query('drop').index
    price_data = price_data[~price_data.index.isin(drop_dates)]

    if use_pct_change:
        # No NaN is left at this point, so no fill is needed; fill_method=None
        # gives the same result as 'ffill' without the deprecated argument.
        price_data = price_data.pct_change(fill_method=None)
        # The first row has no previous value to compare against.
        price_data = price_data.iloc[1:]

    return price_data, stdevs
Пример #5
0
import requests
import pandas as pd
import os
from sqlalchemy.exc import ProgrammingError
import time
import psycopg2
from utils import clean_columns, get_pg_engine, read_table
import io
import logging
from tqdm import tqdm
from datetime import datetime
import sys


# Module-wide database engine obtained from the project helper.
engine = get_pg_engine()

# Root logging setup: INFO and above goes to logs/price.log and to stdout.
logging.basicConfig(
    handlers=[
        logging.FileHandler("logs/price.log"),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The root logger, used by the functions in this module.
logger = logging.getLogger()


class RateLimitExceededException(Exception):
    """Signals that an API rate limit was exceeded.

    Args:
        msg: Optional message; defaults to ``"API Rate Limit Exceeded"``.
    """

    def __init__(self, msg=None):
        if msg is None:
            msg = "API Rate Limit Exceeded"
        # Hand the message to Exception so it shows up in str() and .args;
        # without this call the default message is silently discarded.
        super().__init__(msg)

def fetch_price(symbol):