class ListManager():
    """Keep exchange listings and their Yahoo price/event history up to date.

    The class relies on module-level names defined elsewhere in this file
    (``Session``, ``Listings``, ``EventHistory``, ``PriceHistory``, ``LOGGER``,
    ``yqd``, ``dparser``, ``create_url_request`` ...).
    """

    def __init__(self, url="http://www.tsx.com/resource/en/571"):
        """Open a database session and remember where the listing file lives.

        url: an ``http(s)`` address or a local path of the listings workbook.
        """
        self.today = datetime.today().date()
        self.session = Session()
        self.url = url

    def get_quotes(self):
        """Load the listings workbook and insert its rows into ``Listings``.

        Nothing is inserted when the date found in the workbook's column
        headers is not newer than the most recent ``Listings.updatedate``.
        """
        recent_date, = self.session.query(
            func.max(Listings.updatedate)).first()

        read_kwargs = {"skiprows": 5, "header": 1, "keep_default_na": False}
        if self.url.startswith("http"):
            # Keep self.url a string so this method can be called again, and
            # make sure the HTTP response is closed once the sheet is parsed.
            req = create_url_request(self.url)
            with urllib.request.urlopen(req) as resp:
                sheet = pd.read_excel(resp, **read_kwargs)
        else:
            sheet = pd.read_excel(self.url, **read_kwargs)

        sheet.fillna('', inplace=True)
        sheet.rename(columns=self.cleanse_str, inplace=True)

        file_date = self.find_date_in_list(list(sheet.columns.values))
        if recent_date is not None and file_date <= recent_date:
            LOGGER.info("Already up to date")
            return
        recent_date = file_date

        xlsx_dict = sheet.to_dict(orient="records")
        if not xlsx_dict:
            LOGGER.info("Listing file contains no rows")
            return

        # Database column names, in the same order as the workbook column
        # prefixes below.
        row_names = [
            "ticker",
            "exchange",
            "name",
            "sector",
            "osshares",
            "dateoflisting",
            "listingtype",
            "volume",
            "value",
        ]
        base_wanted_excel_names = [
            "Root Ticker",
            "Exchange",
            "Name",
            "Sector",
            "O/S",
            "Date of TSX Listing",
            "Listing Type",
            "Volume YTD",
            "Value (C$)",
        ]

        # Workbook headers carry extra text, so match them by prefix and take
        # the first column that fits each wanted name.
        all_excel_names = tuple(xlsx_dict[0].keys())
        wanted_excel_names = []
        for bxn in base_wanted_excel_names:
            for xn in all_excel_names:
                if xn.startswith(bxn):
                    wanted_excel_names.append(xn)
                    break
        assert (len(base_wanted_excel_names) == len(wanted_excel_names) ==
                len(row_names))

        value_dics = []
        for row in xlsx_dict:
            value_dic = {"updatedate": recent_date}
            for excel_name, row_name in zip(wanted_excel_names, row_names):
                val = row[excel_name]
                if val == '':
                    # Blank cell: store NULL. This must be checked before the
                    # date parsing below, which cannot handle ''.
                    val = None
                elif row_name == "dateoflisting":
                    # assume YYYYMMDD
                    val = datetime.strptime(str(val), "%Y%m%d")
                value_dic[row_name] = val
            value_dics.append(value_dic)

        self.session.execute(insert(Listings).values(value_dics))
        self.session.commit()

    def get_historic_events(self):
        """Fetch dividends and splits from Yahoo for every TSX listing.

        For each ticker only the period after the latest stored
        ``EventHistory.updatedate`` (or from the listing date when nothing is
        stored) is requested. Rows that hit ``event_history_pkey`` are ignored.
        """
        exchange = "TSX"
        # Materialise the listings first: the loop commits as it goes.
        listings = self.session.query(
            Listings.ticker, Listings.dateoflisting).filter(
                Listings.exchange == exchange).all()
        total_listings = len(listings)

        for counter, (ticker, listdate) in enumerate(listings, start=1):
            lastdate, = self.session.query(
                func.max(EventHistory.updatedate)).filter(
                    EventHistory.exchange == exchange,
                    EventHistory.ticker == ticker).first()
            startdate = (listdate if lastdate is None
                         else lastdate + timedelta(days=1))

            rows = []
            # A listing without a listing date and without stored events has
            # no start date to query from; treat it like "nothing to fetch".
            if startdate is not None and startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                dividend_dict = self.ticker_history(
                    startdate, self.today, yahoo_ticker, info='dividend')
                split_dict = self.ticker_history(
                    startdate, self.today, yahoo_ticker, info='split')
                # Positional rows: exchange, ticker, date, action, value and
                # the date of this update.
                rows.extend(
                    [exchange, ticker, row["date"], "DIVIDEND",
                     row["dividends"], self.today]
                    for row in dividend_dict)
                rows.extend(
                    [exchange, ticker, row["date"], "SPLIT",
                     row["stock_splits"], self.today]
                    for row in split_dict)

            if rows:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))
                stmt = insert(EventHistory).values(
                    rows).on_conflict_do_nothing(
                        constraint='event_history_pkey')
                self.session.execute(stmt)
                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))

    def get_historic_prices(self):
        """Fetch daily quotes from Yahoo for every TSX listing.

        For each ticker only the period after the latest stored
        ``PriceHistory.date`` (or from the listing date when nothing is
        stored) is requested. Existing rows are overwritten on conflict with
        ``price_history_pkey``.
        """
        exchange = "TSX"
        listings = self.session.query(
            Listings.ticker, Listings.dateoflisting).filter(
                Listings.exchange == exchange).all()
        total_listings = len(listings)

        # Count from 1 so progress messages match get_historic_events.
        for counter, (ticker, listdate) in enumerate(listings, start=1):
            lastdate, = self.session.query(func.max(PriceHistory.date)).filter(
                PriceHistory.exchange == exchange,
                PriceHistory.ticker == ticker).first()
            startdate = (listdate if lastdate is None
                         else lastdate + timedelta(days=1))

            his_dict = []
            if startdate is not None and startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                start_dic = {"exchange": exchange, "ticker": ticker}
                his_dict = self.ticker_history(startdate,
                                               self.today,
                                               yahoo_ticker,
                                               info="quote",
                                               start_dic=start_dic)

            if his_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))
                for d in his_dict:
                    stmt = insert(PriceHistory).values(
                        d).on_conflict_do_update(
                            constraint='price_history_pkey', set_=d)
                    self.session.execute(stmt)
                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))

    def cleanse_str(self, raw_str):
        """Turn newlines into spaces and collapse the double spaces that leaves.

        A single pass is made, so runs of three or more spaces are shortened
        but not fully collapsed.
        """
        return raw_str.replace('\n', ' ').replace("  ", ' ')

    def find_date_in_list(self, strings):
        """Return the first date found in ``strings`` that differs from today.

        Falls back to today's date when no string yields such a date.
        """
        cur_date = self.today  # default = current date
        for s in strings:
            try:
                temp_date = dparser.parse(s, fuzzy=True).date()
            except (ValueError, OverflowError):
                # Not a date (or out of range) - try the next string.
                continue
            if cur_date != temp_date:
                cur_date = temp_date
                break
        return cur_date

    def convert_yahoo_element(self, element):
        """Convert one CSV cell from Yahoo into a Python value.

        Tried in order: float, ``YYYY-MM-DD`` datetime, the literal ``null``
        (-> None) and an ``a/b`` ratio (-> float). Anything else is logged
        and returned as None.
        """
        try:
            return float(element)
        except ValueError:
            pass
        try:
            return datetime.strptime(element, "%Y-%m-%d")
        except ValueError:
            pass
        if element == 'null':
            return None
        if '/' in element:
            try:
                a, b = element.split('/')
                return float(a) / float(b)
            except (ValueError, ZeroDivisionError):
                # Malformed ratio or zero denominator: report it below
                # instead of aborting the whole download.
                pass
        LOGGER.info("Unable to convert {}".format(element))
        return None

    def ticker_history(self, start, end, ticker, info='quote', start_dic=None):
        """Return the Yahoo history of ``ticker`` between ``start`` and ``end``.

        start, end: objects with ``strftime`` (dates) bounding the request.
        info: 'quote', 'dividend' or 'split'.
        start_dic: optional key/values copied into every returned row.

        Returns a list of dicts keyed by the lower-cased, underscore-joined
        CSV column titles; an empty list when the request fails or returns
        nothing.
        """
        start_str = start.strftime('%Y%m%d')
        end_str = end.strftime('%Y%m%d')
        try:
            data = yqd.load_yahoo_quote(ticker, start_str, end_str, info=info)
        except (HTTPError, URLError, gaierror):
            LOGGER.info("Yahoo request failed. Blocked?")
            return []
        if not data:
            return []

        base_row = dict(start_dic) if start_dic else {}
        titles = tuple(t.replace(' ', '_').lower() for t in data[0].split(','))
        history = []
        # The first line holds the titles; the last line is skipped as well
        # (presumably a trailing empty line - TODO confirm against yqd).
        for row in data[1:-1]:
            history_row = dict(base_row)
            for element, title in zip(row.split(','), titles):
                history_row[title] = self.convert_yahoo_element(element)
            history.append(history_row)
        return history

    def clean_exit(self):
        """Close the database session."""
        self.session.close()
class MorningStarScaper():
    """Download key ratios and income statements from Morningstar CSV exports.

    The class relies on module-level names defined elsewhere in this file
    (``Session``, ``Listings``, ``MorningStarKeyStatistics``,
    ``MorningStarFinancials``, ``insert``, ``or_``, ``LOGGER`` ...).
    """

    def __init__(self):
        """Open a database session and build the CSV-title lookup tables."""
        self.session = Session()
        self.today = datetime.today().date()
        self.ttm_string = self.most_recent_quarter()
        self.headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        }
        # Morningstar exchange code -> exchange name used in the database.
        self.exchange_map = {
            "XTSE": "TSX",
        }
        # Columns whose cells are 'YYYY-MM' periods (or 'TTM'), not numbers.
        self.year_month_cols = {
            "fiscal_year", "margin_date", "profitability_date"}
        # (lower-case title prefix, database column). Order matters:
        # find_column returns the FIRST prefix that matches, so longer
        # prefixes must precede the shorter ones they contain.
        # NOTE(review): "fh accured liabilities" looks like a misspelling of
        # the title it is meant to match - confirm against a real export.
        self.column_key_map = (
            ("revenue", "revenue"),
            ("gross margin", "gross_margin"),
            ("operating income", "operating_income"),
            ("operating margin", "operating_margin"),
            ("net income", "net_income"),
            ("earnings per share", "eps"),
            ("dividends", "dividends"),
            ("payout ratio", "payout_ratio"),
            ("shares", "num_shares"),
            ("book value per", "book_value_ps"),
            ("operating cash flow", "operating_cash_flow"),
            ("cap spending", "cap_spending"),
            ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
            ("cf free cash flow/sales", "free_cash_flow_sales"),
            ("cf free cash flow/net", "free_cash_flow_net_income"),
            ("free cash flow per share", "free_cash_flow_ps"),
            ("free cash flow", "free_cash_flow"),
            ("working capital", "working_captial"),
            ("pro margins %", "margin_date"),
            ("pro revenue", "revenue_per_sales"),
            ("pro cogs", "revenue_per_cogs"),
            ("pro gross margin", "sales_gross_margin"),
            ("pro sg&a", "margin_sga"),
            ("pro r&d", "margin_rd"),
            ("pro other", "margin_other"),
            ("pro operating margin", "margin_operating"),
            ("pro net int inc", "margin_net_income"),
            ("pro ebt margin", "margin_ebt"),
            ("pro profitability", "profitability_date"),
            ("pro tax rate", "tax_rate"),
            ("pro net margin", "net_margin_perc"),
            ("pro asset turnover", "asset_turnover"),
            ("pro return on assets", "ro_assets"),
            ("pro financial lever", "financial_leverage"),
            ("pro return on equity", "ro_equity"),
            ("pro return on invested capital", "ro_invested_captial"),
            ("pro interest coverage", "interest_coverage"),
            ("r% year over year", "revenue_perc_yoy"),
            ("r% 3-year", "revenue_perc_3y"),
            ("r% 5-year", "revenue_perc_5y"),
            ("r% 10-year", "revenue_perc_10y"),
            ("oi% year over year", "operating_income_yoy"),
            ("oi% 3-year", "operating_income_3y"),
            ("oi% 5-year", "operating_income_5y"),
            ("oi% 10-year", "operating_income_10y"),
            ("ni% year over year", "net_income_yoy"),
            ("ni% 3-year", "net_income_3y"),
            ("ni% 5-year", "net_income_5y"),
            ("ni% 10-year", "net_income_10y"),
            ("eps% year over year", "eps_yoy"),
            ("eps% 3-year", "eps_3y"),
            ("eps% 5-year", "eps_5y"),
            ("eps% 10-year", "eps_10y"),
            ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
            ("cf cap ex", "cap_expense_perc_sales"),
            ("fh cash & short", "cash_short_term"),
            ("fh accounts receivable", "accounts_receivable"),
            ("fh inventory", "inventory"),
            ("fh other current assets", "other_cur_assets"),
            ("fh total current assets", "total_cur_assets"),
            ("fh net pp&e", "net_ppe"),
            ("fh intangibles", "intangibles"),
            ("fh other long-term assets", "other_long_term_assets"),
            ("fh accounts payable", "accounts_payable"),
            ("fh short-term debt", "short_term_debt"),
            ("fh taxes payable", "taxes_payable"),
            ("fh accured liabilities", "accured_liabilities"),
            ("fh other short-term liabilities", "short_term_liabilities"),
            ("fh long-term debt", "long_term_debt"),
            ("fh total liabilities & equity", "total_liabilities_equity"),
            ("fh total liabilities", "total_liabilities"),
            ("fh total stockholder", "total_stockholder"),
            ("fh current ratio", "current_ratio"),
            ("fh quick ratio", "quick_ratio"),
            ("fh debt/equity", "debt_equity"),
            ("er receivables turnover", "receivables_turnover"),
            ("er inventory turnover", "inventory_turnover"),
            ("er fixed assets turnover", "fixed_assets_turnover"),
        )
        self.column_financials_map = (
            ("fiscal year", "fiscal_year"),
            ("revenue", "revenue"),
            ("cost of revenue", "revenue_cost"),
            ("gross profit", "gross_profit"),
            ("sales, general and administrative", "sales_expense"),
            ("other operating", "operating_expense"),
            ("other assets", "other_assets"),
            ("operating income", "operating_income"),
            ("interest expense", "intrest_expense"),
            ("total operating expense", "total_costs"),
            ("total costs and expenses", "total_costs"),
            ("preferred dividend", "preferred_dividend"),
            ("income before", "income_before_taxes"),
            ("provision for", "provision_taxes"),
            ("net income from continuing op", "net_income_continuing_ops"),
            ("net income from discontinuing ops",
             "net_income_discontinuing_ops"),
            ("net income available to common shareholders",
             "net_income_common"),
            ("net income", "net_income"),
            ("eps basic", "eps_basic"),
            ("eps diluted", "eps_diluted"),
            ("waso basic", "waso_basic"),
            ("waso diluted", "waso_diluted"),
            ("ebitda", "ebitda"),
        )
        # Section titles (single-cell CSV rows) -> prefix that parse_csv puts
        # in front of the row titles that follow.
        self.special_key_titles = (
            ("key ratios -> profitability", "pro "),
            ("key ratios -> growth", "gro "),
            ("key ratios -> cash flow", "cf "),
            ("key ratios -> financial health", "fh "),
            ("key ratios -> efficiency ratios", "er "),
            ("revenue %", "r% "),
            ("operating income %", "oi% "),
            ("net income %", "ni% "),
            ("eps %", "eps% "),
        )
        self.special_financials_titles = (
            ("earnings per share", "eps "),
            ("weighted average shares outstanding", "waso "),
        )
        # Deletes double quotes and thousands separators from cells.
        self.translation_table = dict.fromkeys(map(ord, '",'), None)

    def most_recent_quarter(self):
        """Return the first day of the current calendar quarter (ISO string)."""
        quarter = (self.today.month - 1) // 3
        month = quarter * 3 + 1
        return datetime(year=self.today.year, month=month,
                        day=1).date().isoformat()

    def find_column(self, col, mapper, subtitle=''):
        """Map a CSV row title onto a name using prefix pairs in ``mapper``.

        The title is lower-cased and stripped of double quotes. The first
        prefix matching ``subtitle + title`` wins; failing that, the LAST
        prefix matching the bare title is returned, or '' when none match.
        """
        col = col.lower().replace('"', '')
        wst = subtitle + col
        alt = ''
        for k, v in mapper:
            if wst.startswith(k):
                return v
            elif col.startswith(k):
                alt = v
        return alt

    def convert_numerical(self, n):
        """Return ``n`` as int, else float, else unchanged."""
        try:
            return int(n)
        except ValueError:
            try:
                return float(n)
            except ValueError:
                return n

    def get_title_multiplier(self, title):
        """Return the unit factor implied by a title ending in Ths/Mil/Bil."""
        for suffix, factor in (("Ths", 10**3), ("Mil", 10**6), ("Bil", 10**9)):
            if title.endswith(suffix):
                return factor
        return 1

    def parse_csv(self,
                  csv_r,
                  num_cols,
                  special_titles,
                  column_map,
                  start_dic=None):
        """Pivot a Morningstar CSV into one dict per data column.

        csv_r: iterator of rows (lists of strings); the first row is skipped.
        num_cols: number of value columns to read after the title column.
        special_titles: prefix pairs for single-cell section rows.
        column_map: prefix pairs mapping row titles to database columns.
        start_dic: optional key/values copied into every returned dict.

        Returns a list of ``num_cols`` dicts, or an empty list when no row
        title could be mapped. Empty or missing cells become None.
        """
        base = dict(start_dic) if start_dic else {}
        subhead = ''
        next(csv_r, None)  # skip headers; tolerate an empty stream
        return_dics = []
        for cols in csv_r:
            row_cols = len(cols)
            if row_cols == 0:
                continue
            if row_cols == 1:
                # Section title: remember its prefix for the rows below.
                subhead = self.find_column(cols[0], special_titles)
                continue

            db_col = self.find_column(cols[0], column_map, subtitle=subhead)
            if not db_col:
                continue
            multi = self.get_title_multiplier(cols[0])
            if not return_dics:
                return_dics = [base.copy() for _ in range(num_cols)]
            is_period = db_col in self.year_month_cols
            for i in range(num_cols):
                # Rows shorter than expected are treated as having empty cells.
                raw = cols[i + 1] if i + 1 < row_cols else ''
                cell = raw.translate(self.translation_table)
                if not cell:
                    val = None
                elif is_period:
                    val = (self.ttm_string if cell == 'TTM' else
                           datetime.strptime(cell, '%Y-%m'))
                else:
                    val = self.convert_numerical(cell)
                    # Only scale real numbers; multiplying a string would
                    # repeat it.
                    if not isinstance(val, str):
                        val = val * multi
                return_dics[i][db_col] = val
        return return_dics

    def get_key_stats(self, ticker, db_exchange="TSX"):
        """Download the key ratios of ``ticker`` and upsert them.

        Returns 0 when data was stored and 1 when Morningstar sent an empty
        response.
        """
        url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&"
               "culture=en-CA&region=CAN&order=asc&r={}").format(
                   ticker, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        with urllib.request.urlopen(req) as resp:
            if resp.headers['content-length'] == '0':
                LOGGER.info("Skipping {}".format(ticker))
                return 1
            LOGGER.info("Getting key statistics for {}... ".format(ticker))
            # The reader pulls from the response lazily, so parse before the
            # connection is closed.
            csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))
            return_dics = self.parse_csv(csv_r,
                                         10,
                                         self.special_key_titles,
                                         self.column_key_map,
                                         start_dic={
                                             "ticker": ticker,
                                             "exchange": db_exchange,
                                             "update_date": self.today
                                         })
        for d in return_dics:
            stmt = insert(MorningStarKeyStatistics).values(
                d).on_conflict_do_update(constraint='ms_key_statistics_pkey',
                                         set_=d)
            self.session.execute(stmt)
        self.session.commit()
        LOGGER.info("Done")
        return 0

    def get_financial(self, ticker, period_name, exchange="XTSE"):
        """Download the income statement of ``ticker`` and upsert it.

        period_name: "quarter" for quarterly data; anything else is annual.
        exchange: Morningstar exchange code; must be in ``exchange_map``.

        Returns 0 when data was stored and 1 when Morningstar sent an empty
        response. Raises ValueError for an unsupported exchange.
        """
        # this converts the morning star exchange name to our database name
        if exchange not in self.exchange_map:
            raise ValueError("Exchange unsupported {}".format(exchange))
        db_exchange = self.exchange_map[exchange]

        is_quarter = period_name == "quarter"
        period = 3 if is_quarter else 12
        url = (
            "http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t="
            "{}:{}&region=can&culture=en-US&cur=&reportType=is&period={}&"
            "dataType=A&order=desc&columnYear=5&curYearPart=1st5year&"
            "rounding=1&view=raw&r={}&denominatorView=raw&number=1").format(
                exchange, ticker, period, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        with urllib.request.urlopen(req) as resp:
            if resp.headers['content-length'] == '0':
                LOGGER.info("Skipping {}".format(ticker))
                return 1
            LOGGER.info("Getting {} financial data for {}... ".format(
                period_name, ticker))
            # skip last column if not quarter view (removes TTM)
            num_cols = 6 if is_quarter else 5
            csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))
            return_dics = self.parse_csv(csv_r,
                                         num_cols,
                                         self.special_financials_titles,
                                         self.column_financials_map,
                                         start_dic={
                                             "ticker": ticker,
                                             "exchange": db_exchange,
                                             "period": period,
                                             "update_date": self.today
                                         })
        for d in return_dics:
            stmt = insert(MorningStarFinancials).values(
                d).on_conflict_do_update(constraint='fiscal_year_unique',
                                         set_=d)
            self.session.execute(stmt)
        self.session.commit()
        LOGGER.info("Done")
        return 0

    def fetch_all(self, db_exchange):
        """Refresh Morningstar data for the listings of ``db_exchange``.

        Only listings whose ``onms`` flag is true or not yet set are visited;
        the flag is then updated to record whether any download found data.
        """
        # ``is_(None)`` renders SQL ``IS NULL``; a Python ``is None`` test on
        # the column is evaluated immediately and is always False.
        tickers = [
            ticker for ticker, in self.session.query(Listings.ticker).filter(
                Listings.exchange == db_exchange,
                or_(Listings.onms == True,  # noqa: E712 - SQL expression
                    Listings.onms.is_(None)))
        ]
        for ticker in tickers:
            # Each call returns 1 when Morningstar had nothing for the ticker.
            found1 = self.get_key_stats(ticker)
            found2 = self.get_financial(ticker, "quarter")
            found3 = self.get_financial(ticker, "annual")
            # on Morningstar if the statistics or the financial data exist
            on_morningstar = not (found1 and found2 and found3)
            self.session.query(Listings).filter(
                Listings.exchange == db_exchange,
                Listings.ticker == ticker).update(
                    {Listings.onms: on_morningstar})
            self.session.commit()

    def clean_exit(self):
        """Close the database session."""
        self.session.close()
class YahooApiScraper():
    """Download quote statistics from the Yahoo Finance CSV endpoint.

    The class relies on module-level names defined elsewhere in this file
    (``Session``, ``Listings``, ``YahooKeyStatistics``, ``insert``, ``or_``,
    ``LOGGER``, ``dp`` ...).
    """

    def __init__(self):
        """Build the flag/column tables and open a database session."""
        # This is a reverse engineering of the Yahoo Finance REST API
        # Information off: http://www.jarloo.com/yahoo_finance/
        # Yahoo field flag -> database column. The order of this mapping
        # defines both the flags sent in the URL and how CSV cells are paired
        # with columns.
        self.y_to_db_map = {
            'n': 'name',
            'y': 'dividend_yield',
            'd': 'dividend_ps',
            'r': 'pe',
            'r1': 'dividend_pay_date',
            'q': 'ex_dividend_date',
            'o': 'open',
            'c1': 'change',
            'p2': 'perc_change',
            'd1': 'last_trade_date',
            'd2': 'trade_date',
            'c3': 'commission',
            'g': 'day_low',
            'h': 'day_high',
            'p': 'previous_close',
            't8': 'year_target',
            'm5': 'change_mv_avg_200',
            'm6': 'perc_change_mv_avg_200',
            'm7': 'change_mv_avg_50',
            'm8': 'perc_change_mv_avg_50',
            'm3': 'mv_avg_50',
            'm4': 'mv_avg_200',
            'w1': 'day_value_change',
            'g1': 'holding_gain_perc',
            'g3': 'annualized_gain',
            'g4': 'holdings_gain',
            'k': 'high_52_week',
            'j': 'low_52_week',
            'j5': 'change_52_week_low',
            'k4': 'change_52_week_high',
            'j6': 'perc_change_52_week_low',
            'k5': 'perc_change_52_week_high',
            'j1': 'market_cap',
            'f6': 'float_shares',
            'x': 'stock_exchange',
            's1': 'shares_owned',
            'j2': 'shares_outstanding',
            'n4': 'notes',
            'i': 'more_info',
            'v': 'volume',
            'a2': 'avg_daily_volume',
            'e': 'eps',
            'e7': 'eps_year_estimate',
            'e8': 'eps_next_year_estimate',
            'e9': 'eps_next_q_estimate',
            'b4': 'book',
            'j4': 'ebitda',
            'p5': 'price_sale',
            'p6': 'price_book',
            'r5': 'peg',
            'r6': 'price_eps_estimate_year',
            'r7': 'price_eps_estimate_next_year',
            's7': 'short_ratio',
            's6': 'revenue',
            'v1': 'holdings_val',
            'l2': 'high_limit',
            'l3': 'low_limit',
            'a': 'ask',
            'b': 'bid',
        }
        self.convert_dict = {
            'K': 10 ** 3,
            'M': 10 ** 6,
            'B': 10 ** 9,
            'T': 10 ** 12,
        }
        # A number followed by a magnitude letter, e.g. "1.5B".
        self.condensed_pat = re.compile(r"([+-]?\d*\.?\d+)([kmbtKMBT])")
        self.url_flags = tuple(self.y_to_db_map.keys())
        self.url_str_flags = "".join(self.url_flags)
        self.db_entries = tuple(self.y_to_db_map.values())
        self.float_pat = re.compile(r"[+-]?(\d*[\.])?\d+$")
        self.today = datetime.today().date()
        self.base_url = "http://finance.yahoo.com/d/quotes.csv"
        # Sometimes websites are friendlier to iOS devices :)
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
        }
        self.session = Session()

    def handle_csv_string(self, string):
        """Convert one CSV cell into a Python value.

        'N/A' -> None; percentages -> float; digits -> int; condensed numbers
        such as '1.5B' -> int; decimals -> float; parseable dates -> ISO date
        string. Anything else is returned unchanged.
        """
        if string == 'N/A':
            return None
        if '%' in string:
            try:
                return float(string.strip('%'))
            except ValueError:
                # Text that merely contains a percent sign.
                return string
        if string.isdigit():
            return int(string)
        # The whole cell must be a condensed number; a prefix match would
        # turn text such as "3M Company" into 3000000.
        condensed = self.condensed_pat.fullmatch(string)
        if condensed is not None:
            factor = self.convert_dict[condensed.group(2).upper()]
            return int(float(condensed.group(1)) * factor)
        if self.float_pat.match(string) is not None:
            return float(string)
        try:
            return dp.parse(string).date().isoformat()
        except (ValueError, OverflowError):
            return string

    def chunks(self, l, n):
        """Split ``l`` into consecutive slices of at most ``n`` items."""
        return [l[i:i + n] for i in range(0, len(l), n)]

    def create_url(self, tickers):
        """Return the quotes URL requesting every known flag for ``tickers``."""
        ticker_str = "+".join(tickers)
        return "{}?s={}&f={}".format(self.base_url, ticker_str,
                                     self.url_str_flags)

    def handle_url(self, tickers, url, exchange):
        """Fetch ``url`` and store one statistics row per ticker.

        tickers: the tickers requested by ``url``, in request order; CSV rows
        are paired with them positionally.
        """
        LOGGER.info("Starting to add url: {} ...".format(url))
        req = urllib.request.Request(url, headers=self.headers)
        # Read everything up front so the connection can be closed before the
        # database work starts.
        with urllib.request.urlopen(req) as resp:
            rows = list(csv.reader(codecs.iterdecode(resp, 'utf-8')))

        for row, ticker in zip(rows, tickers):
            assert len(row) == len(self.url_flags)
            db_dic = {
                db_col: self.handle_csv_string(cell)
                for cell, db_col in zip(row, self.db_entries)
            }
            onyahoo = any(v is not None for v in db_dic.values())
            # NOTE(review): fetch_all passes tickers with the '.TO' suffix
            # appended; confirm Listings.ticker is stored in the same form,
            # otherwise this update matches no rows.
            self.session.query(Listings).filter(
                Listings.exchange == exchange,
                Listings.ticker == ticker).update({Listings.onyahoo: onyahoo})
            if not onyahoo:  # not found, skip
                LOGGER.error(
                    "Failed to find quote for {} skipping".format(ticker))
                continue

            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            # first() rather than scalar(): several identical rows must not
            # raise MultipleResultsFound.
            exists = self.session.query(YahooKeyStatistics).filter_by(
                **db_dic).first() is not None
            if exists:
                LOGGER.info(
                    "Skipping {} due to prior existence".format(ticker))
                continue

            db_dic["update_date"] = self.today
            # Annoyingly enough, sqlalchemy doesn't allow PostgreSQL bulk
            # inserts when checking constraints, RIP performance
            stmt = insert(YahooKeyStatistics).values(
                db_dic).on_conflict_do_nothing(
                    constraint='yahoo_key_statistics_pkey')
            self.session.execute(stmt)
        self.session.commit()
        LOGGER.info("Done url.")

    def fetch_all(self, exchange):
        """Refresh Yahoo statistics for the listings of ``exchange``.

        Only listings whose ``onyahoo`` flag is true or not yet set are
        requested, 200 tickers per URL and at most one URL per second.
        """
        extension = '.TO'
        # ``is_(None)`` renders SQL ``IS NULL``; a Python ``is None`` test on
        # the column is evaluated immediately and is always False.
        query = self.session.query(Listings.ticker).filter(
            Listings.exchange == exchange,
            or_(Listings.onyahoo == True,  # noqa: E712 - SQL expression
                Listings.onyahoo.is_(None)))
        tickers = tuple(x.ticker + extension for x in query)
        ticker_groups = self.chunks(tickers, 200)
        LOGGER.info("Fetching/Updating {} urls.".format(len(ticker_groups)))
        for ticker_group in ticker_groups:
            url = self.create_url(ticker_group)
            self.handle_url(ticker_group, url, exchange)
            sleep(1)  # limit requests to 1/s

    def clean_exit(self):
        """Close the database session."""
        self.session.close()