class Features():
    """Feature extraction over the ms_financials table.

    Builds the list of feature columns (everything except the identity /
    bookkeeping columns) once at construction time.
    """

    def __init__(self, normalize=True):
        self.db = Database()
        # Columns that identify a row rather than describe the company.
        self.not_feature_cols = {'ticker', 'exchange', 'update_date',
                                 'fiscal_year', 'period'}
        all_cols = self.db.get_col_names("ms_financials")
        self.ms_financials_cols = [c for c in all_cols
                                   if c not in self.not_feature_cols]
        self.ms_financials_cols_str = ','.join(self.ms_financials_cols)

    def fetch(self, ticker, lower_date, upper_date, period='3', where=""):
        """Return feature rows for `ticker` between the two fiscal years,
        oldest first. `period` is the reporting period ('3' = quarterly).
        """
        cond = ("ticker = %s AND period = %s AND fiscal_year >= %s "
                "AND fiscal_year <= %s ORDER BY fiscal_year ASC")
        return self.db.select(self.ms_financials_cols_str,
                              "ms_financials",
                              where=cond,
                              vals=[ticker, period, lower_date, upper_date])
class Features():
    """Minimal Features variant (appears to be an earlier/WIP revision).

    Fix: ``fetch`` computed the result set but never returned it, so every
    call yielded ``None``; it now returns the fetched rows.
    """

    def __init__(self, normalize=True):
        # NOTE(review): `normalize` is accepted but unused here - confirm intent.
        self.db = Database()
        self.ms_financials_cols = self.db.get_col_names("ms_financials")

    def fetch(self, ticker):
        """Return the first row of ms_financials.

        NOTE(review): `ticker` is currently ignored and the query is a
        fixed LIMIT 1 probe - looks like debug scaffolding; confirm.
        """
        self.db.cursor.execute("SELECT * from ms_financials LIMIT 1")
        res = self.db.cursor.fetchall()
        return res
class ReturnCalcuator():
    """Computes holding-period returns from the price_history table.

    Fixes over the previous revision:
      * ``next_business_day`` always ordered ``DESC``; with ``before=False``
        (``date >= target``) that returned the *latest* row in the table,
        not the next trading day after the target. It now orders ``ASC``
        for the forward search.
      * ``calculate_return`` no longer crashes when no trading row exists
        on either side of a bound - it returns None, matching its existing
        "cannot compute" contract.

    NOTE(review): class name keeps the original (misspelled) identifier
    because external callers reference it.
    """

    def __init__(self):
        self.db = Database()

    def next_business_day(self, ticker, date, before=True, fields="date"):
        """Return `fields` of the trading day closest to `date`.

        before=True  -> latest row with date <= `date` (search backwards)
        before=False -> earliest row with date >= `date` (search forwards)
        Returns None when no such row exists.
        """
        cmp, order = ("<=", "DESC") if before else (">=", "ASC")
        return self.db.select(
            fields,
            "price_history",
            where="ticker = %s AND date {} %s ORDER BY date {} LIMIT 1".format(
                cmp, order),
            vals=[ticker, date],
            fetch='one')

    def calculate_return(self, ticker, start_date, end_date):
        """Return the fractional return between the trading days nearest
        `start_date` and `end_date`, or None when it cannot be computed
        (no price history, start before the first listed date, or a
        missing/zero starting close).
        """
        min_date = self.db.select("MIN(date)", "price_history",
                                  where="ticker = %s", vals=[ticker],
                                  fetch='one', unroll=True)
        max_date = self.db.select("MAX(date)", "price_history",
                                  where="ticker = %s", vals=[ticker],
                                  fetch='one', unroll=True)
        if min_date is None or max_date is None or start_date < min_date:
            return None
        # Snap both bounds to real trading days. If end_date runs past the
        # data, fall back to the last available close (before=True).
        start_row = self.next_business_day(
            ticker, start_date, fields='date, "adj close"',
            before=(start_date >= min_date))
        end_row = self.next_business_day(
            ticker, end_date, fields='date, "adj close"',
            before=(end_date > max_date))
        if start_row is None or end_row is None:
            return None
        start_date, start_close = start_row
        end_date, end_close = end_row
        if not start_close:  # avoid ZeroDivisionError on bad data
            return None
        return end_close / start_close - 1
def get_html(urlQ, callback, xpath_hooks):
    """Consume URLs from `urlQ`, render each with a headless webkit session,
    and pass the page source to `callback(db, url, html)`.

    `xpath_hooks` is a list of XPath expressions used to decide when the
    page has finished loading (e.g. ["//div[@data-test='whatever']"]).

    Fix: the webkit server and database handle are now released in a
    ``finally`` block, so an unexpected exception inside the loop can no
    longer leak the child process or the DB connection.
    """
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)
    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    sess.set_attribute("auto_load_images", False)

    # Page is "ready" once any one of the hook elements exists.
    valid_page_func = lambda: any(
        sess.at_xpath(xpath) for xpath in xpath_hooks)

    db = Database()
    try:
        while not urlQ.empty():
            url = urlQ.get()
            sess.visit(url)
            try:
                sess.wait_for(valid_page_func, interval=1, timeout=15)
            except dryscrape.mixins.WaitTimeoutError:
                LOGGER.error("Timeout so skipping", url)
                continue
            response = sess.body()
            callback(db, url, response)
            sess.reset()
    finally:
        # Always reclaim the webkit child process and the DB connection.
        svr.kill()
        db.destroy()
def __init__(self):
    """Set up helpers and empty training/goal sample buffers."""
    # Parent bot machinery first.
    BaseBot.__init__(self)
    # Data access and modelling helpers.
    self.db = Database()
    self.ff = Features()
    self.rc = ReturnCalcuator()
    self.lr = LinearRegression()
    # Fraction of the samples used for training; the rest is the goal set.
    self.training_perc = 0.8
    self.training_tickers, self.training_target = [], []
    self.goal_tickers, self.goal_target = [], []
def __init__(self):
    """Build the Yahoo-label -> DB-column map and the parsing state.

    Fix over the previous revision: every regex pattern is now a raw
    string; the originals contained backslash escapes (digit class,
    parentheses) that are invalid escape sequences in plain strings and
    raise SyntaxWarning on modern CPython.
    """
    # Yahoo statistics label -> key_statistics column name.
    self.y_to_db_map = {
        'Forward P/E': 'forward_pe', 'Return on Equity': 'ro_equity',
        'Current Ratio': 'current_ratio', 'Total Debt': 'total_debt',
        'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
        'Last Split Date': 'last_split_date',
        'Market Cap (intraday)': 'market_cap', 'EBITDA': 'ebitda',
        'Shares Short': 'shares_short',
        '50-Day Moving Average': 'fifty_day_moving_avg',
        '52 Week High': 'fifty_two_week_high',
        'Quarterly Earnings Growth': 'q_earnings_growth',
        'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
        'Beta': 'beta', 'Payout Ratio': 'payout_ratio',
        'Avg Vol (3 month)': 'avg_vol_3_month',
        'Enterprise Value': 'enterprise_value',
        '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
        'Enterprise Value/Revenue': 'enterprise_value_revenue',
        'Trailing P/E': 'trailing_pe', 'Total Cash': 'total_cash',
        'Operating Cash Flow': 'operating_cash_flow',
        'Price/Book': 'price_book', 'Fiscal Year Ends': 'fiscal_year_ends',
        'Total Debt/Equity': 'total_debt_equity',
        'Dividend Date': 'dividend_date',
        'Most Recent Quarter': 'most_recent_q',
        'Operating Margin': 'operating_margin',
        'Ex-Dividend Date': 'exdividend_date',
        '% Held by Institutions': 'perc_held_by_institutions',
        'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
        '200-Day Moving Average': 'two_hundred_day_moving_avg',
        '52 Week Low': 'fifty_two_week_low',
        'Avg Vol (10 day)': 'avg_vol_10_day',
        'Last Split Factor (new per old)': 'last_split_factor',
        '% Held by Insiders': 'perc_held_by_insiders',
        'Revenue Per Share': 'revenue_per_share',
        'Short Ratio': 'short_ratio',
        'Shares Short (prior month)': 'shares_short_prior_month',
        'Short % of Float': 'short_perc_float',
        'Profit Margin': 'profit_margin', 'Return on Assets': 'ro_assets',
        'Price/Sales': 'price_sales', 'Gross Profit': 'gross_profit',
        'Book Value Per Share': 'book_value_per_share',
        'Levered Free Cash Flow': 'levered_free_cash_flow',
        'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
        'Diluted EPS': 'diluted_eps',
        'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
        'Shares Outstanding': 'shares_outstanding', 'Revenue': 'revenue',
        'Float': 'float',
        'Net Income Avi to Common': 'net_income_avi_common',
        'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
        '52-Week Change': 'fifty_two_week_change',
        'Quarterly Revenue Growth': 'q_revenue_growth',
        'Total Cash Per Share': 'total_cash_ps'
    }
    # Suffix multipliers for condensed numbers such as "1.2B".
    self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
    # e.g. "123.4M": sign, digits, optional decimals, magnitude letter.
    self.condensed_pat = re.compile(r"([+-]?\d+\.?\d*)([kmbtKMBT])")
    self.float_pat = re.compile(r"[+-]?\d+\.\d+")
    # A parenthesised qualifier, e.g. " (5Y monthly)".
    self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
    self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
    self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
    # Period keywords that may trail a statistics label.
    self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
    self.today = datetime.today().date()
    # NOTE(review): naive datetime while self.today is a date - confirm
    # how downstream code compares them.
    self.default_fye = datetime(self.today.year, 12, 31)
    self.db = Database()
class YahooScraper():
    """Scrapes Yahoo Finance key-statistics pages into the key_statistics
    table.

    Fix over the previous revision: every regex pattern is now a raw
    string; the originals contained backslash escapes that are invalid
    escape sequences in plain strings (SyntaxWarning on modern CPython).
    """

    def __init__(self):
        # Yahoo statistics label -> key_statistics column name.
        self.y_to_db_map = {
            'Forward P/E': 'forward_pe', 'Return on Equity': 'ro_equity',
            'Current Ratio': 'current_ratio', 'Total Debt': 'total_debt',
            'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
            'Last Split Date': 'last_split_date',
            'Market Cap (intraday)': 'market_cap', 'EBITDA': 'ebitda',
            'Shares Short': 'shares_short',
            '50-Day Moving Average': 'fifty_day_moving_avg',
            '52 Week High': 'fifty_two_week_high',
            'Quarterly Earnings Growth': 'q_earnings_growth',
            'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
            'Beta': 'beta', 'Payout Ratio': 'payout_ratio',
            'Avg Vol (3 month)': 'avg_vol_3_month',
            'Enterprise Value': 'enterprise_value',
            '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
            'Enterprise Value/Revenue': 'enterprise_value_revenue',
            'Trailing P/E': 'trailing_pe', 'Total Cash': 'total_cash',
            'Operating Cash Flow': 'operating_cash_flow',
            'Price/Book': 'price_book',
            'Fiscal Year Ends': 'fiscal_year_ends',
            'Total Debt/Equity': 'total_debt_equity',
            'Dividend Date': 'dividend_date',
            'Most Recent Quarter': 'most_recent_q',
            'Operating Margin': 'operating_margin',
            'Ex-Dividend Date': 'exdividend_date',
            '% Held by Institutions': 'perc_held_by_institutions',
            'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
            '200-Day Moving Average': 'two_hundred_day_moving_avg',
            '52 Week Low': 'fifty_two_week_low',
            'Avg Vol (10 day)': 'avg_vol_10_day',
            'Last Split Factor (new per old)': 'last_split_factor',
            '% Held by Insiders': 'perc_held_by_insiders',
            'Revenue Per Share': 'revenue_per_share',
            'Short Ratio': 'short_ratio',
            'Shares Short (prior month)': 'shares_short_prior_month',
            'Short % of Float': 'short_perc_float',
            'Profit Margin': 'profit_margin',
            'Return on Assets': 'ro_assets', 'Price/Sales': 'price_sales',
            'Gross Profit': 'gross_profit',
            'Book Value Per Share': 'book_value_per_share',
            'Levered Free Cash Flow': 'levered_free_cash_flow',
            'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
            'Diluted EPS': 'diluted_eps',
            'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
            'Shares Outstanding': 'shares_outstanding',
            'Revenue': 'revenue', 'Float': 'float',
            'Net Income Avi to Common': 'net_income_avi_common',
            'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
            '52-Week Change': 'fifty_two_week_change',
            'Quarterly Revenue Growth': 'q_revenue_growth',
            'Total Cash Per Share': 'total_cash_ps'
        }
        # Suffix multipliers for condensed numbers such as "1.2B".
        self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
        self.condensed_pat = re.compile(r"([+-]?\d+\.?\d*)([kmbtKMBT])")
        self.float_pat = re.compile(r"[+-]?\d+\.\d+")
        self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
        self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
        self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
        self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
        self.today = datetime.today().date()
        # NOTE(review): naive datetime while self.today is a date - confirm.
        self.default_fye = datetime(self.today.year, 12, 31)
        self.db = Database()

    def s2n(self, string):
        """Condensed string ("1.2B") -> int. Assumes the pattern matches."""
        reg = self.condensed_pat.search(string)
        return int(
            float(reg.group(1)) * self.convert_dict[reg.group(2).upper()])

    def s2p(self, string):
        """Percentage string ("12.5%") -> float (12.5, i.e. not /100)."""
        return float(string.strip('%'))

    def s2r(self, string):
        """Ratio string ("3:2") -> float quotient."""
        split = string.split(':')
        return float(split[0]) / float(split[1])

    def parse_numeric(self, string):
        """Best-effort conversion of a Yahoo table cell to a number or an
        ISO date string; returns None when nothing parses.
        """
        try:
            if '%' in string:
                return self.s2p(string)
            elif self.condensed_pat.match(string) is not None:
                return self.s2n(string)
            elif self.float_pat.match(string) is not None:
                return float(string)
            elif string.isdigit():
                return int(string)
            elif ':' in string:
                return self.s2r(string)
            else:
                return dp.parse(string).date().isoformat()
        except ValueError:
            return None

    def cleanse_str(self, string):
        """Drop thousands separators and parenthesised qualifiers."""
        return self.parenthese_pat.sub('', string.replace(',', '')).strip(':')

    def dic_parse(self, db, url, html):
        """Parse one key-statistics page and insert a row into
        key_statistics (callback for JSPageScraper).
        """

        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"
        # Yahoo renders an 'unknown-quote' div when the symbol is unknown.
        on_yahoo = soup.find('div',
                             attrs={'data-test': 'unknown-quote'}) is None
        db.update("listings", ["onyahoo"], [on_yahoo],
                  "exchange=%s AND ticker=%s", [exchange, ticker])
        if not on_yahoo:  # if quote not found, exit
            LOGGER.error("Failed to find quote for", url, "skipping")
            return
        div_test = soup.find('div', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for", url, "skipping")
            return
        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                # Value cell may or may not wrap its text in a span.
                val = innerHtml(td_list[1]) if td_list[1].find(
                    'span') is None else innerHtml(td_list[1].find('span'))
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)
        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            col_names, vals = list(db_dic.keys()), list(db_dic.values())
            where = db.create_conditional_string(col_names)
            if db.exists("key_statistics", where, vals):
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                col_names.append("update_date")
                vals.append(self.today)
                db.insert_into("key_statistics", col_names, vals,
                               multiple=False)
            LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))

    def fetch_all(self, exchange):
        """Scrape key statistics for every listing on `exchange` that is
        on Yahoo (or not yet checked).
        """
        listings = self.db.select(
            "ticker",
            "listings",
            where="exchange = %s AND (onyahoo=TRUE OR onyahoo IS NULL)",
            vals=[exchange],
            unroll=True)
        # NOTE(review): the Yahoo suffix is hard-coded for the TSX; other
        # exchanges would need a different extension - confirm.
        extension = '.TO'
        urls = [
            "https://ca.finance.yahoo.com/quote/{}{}/key-statistics".format(
                ticker, extension) for ticker in listings
        ]
        xpath_hooks = [
            "//div[@data-test='qsp-statistics']",
            "//div[@data-test='unknown-quote']"
        ]
        jsps = JSPageScraper(self.dic_parse, xpath_hooks, "key_statistics")
        jsps.go(urls)

    def clean_exit(self):
        """Release the database connection."""
        self.db.destroy()
def __init__(self):
    """Open the shared database connection for this instance."""
    # NOTE(review): no matching cleanup here; presumably the owner calls
    # a clean_exit()/destroy() elsewhere - confirm.
    self.db = Database()
def __init__(self, cache=True, url="http://www.tsx.com/resource/en/571"):
    """Remember today's date, open the DB and choose the cache directory.

    With cache=True, fetched listing sheets are cached under
    ./cache/TSX/listings; an empty cache_path disables caching.
    """
    self.today = datetime.today().date()
    self.db = Database()
    self.url = url
    if cache:
        self.cache_path = os.path.join(os.getcwd(), "cache", "TSX",
                                       "listings")
    else:
        self.cache_path = ""
class ListManager():
    """Maintains the TSX listings table and the Yahoo-sourced price/event
    history tables (doc-only update: comments added, code unchanged).
    """

    def __init__(self, cache=True, url="http://www.tsx.com/resource/en/571"):
        self.today = datetime.today().date()
        self.db = Database()
        # Source of the TSX listings spreadsheet (URL or, presumably, a
        # local path - confirm).
        self.url = url
        # Empty string disables the local JSON cache.
        self.cache_path = os.path.join(os.getcwd(), "cache", "TSX",
                                       "listings") if cache else ""

    def write_cache(self, date, sheet):
        """Serialise `sheet` to cache/TSX/listings/TSX-YYYY-MM-DD.json."""
        os.makedirs(self.cache_path, exist_ok=True)
        json_name = date.strftime('TSX-%Y-%m-%d.json')
        full_path = os.path.join(self.cache_path, json_name)
        sheet.to_json(full_path, orient="records")
        LOGGER.info("Wrote file to {}".format(full_path))

    def get_quotes(self):
        """Download the TSX listings spreadsheet and insert its rows into
        the listings table, skipping the work when the sheet's embedded
        date is not newer than what the database already holds.
        """
        recent_date = self.db.select("MAX(updatedate)", "listings",
                                     fetch="one", unroll=True)
        if self.url.startswith("http"):
            req = create_url_request(self.url)
            self.url = urllib.request.urlopen(req)
        # Header rows of the sheet are boilerplate; real header is row 6.
        sheet = pd.read_excel(self.url, skiprows=5, header=1,
                              keep_default_na=False)
        sheet.fillna('', inplace=True)
        sheet.rename(columns=self.cleanse_str, inplace=True)
        # The sheet embeds its publication date in a column header.
        file_date = self.find_date_in_list(list(sheet.columns.values))
        if recent_date is None or (file_date > recent_date):
            xlsx_dict = sheet.to_dict(orient="records")
            recent_date = file_date
            if self.cache_path:
                self.write_cache(recent_date, sheet)
        else:
            LOGGER.info("Already up to date")
            return
        row_names = [
            "updatedate", "ticker", "exchange", "name", "sector", "osshares",
            "dateoflisting", "listingtype", "volume", "value",
        ]
        all_excel_names = tuple(xlsx_dict[0].keys())
        # Column headers are matched by prefix since the sheet appends
        # dates/qualifiers to them.
        base_wanted_excel_names = [
            "Root Ticker", "Exchange", "Name", "Sector", "O/S",
            "Date of TSX Listing", "Listing Type", "Volume YTD", "Value (C$)",
        ]
        # Parallel to base_wanted_excel_names: how each column is coerced.
        types = [
            "str", "str", "str", "str", "int", "date", "str", "int", "int",
        ]
        wanted_excel_names = []
        for bxn in base_wanted_excel_names:
            for xn in all_excel_names:
                if xn.startswith(bxn):
                    wanted_excel_names.append(xn)
                    break
        num_rows = len(wanted_excel_names)
        table_name = "listings"
        values = []
        for row in xlsx_dict:
            value_list = [recent_date]
            for i in range(num_rows):
                excel_name = wanted_excel_names[i]
                val = row[excel_name]
                if types[i] == "date":
                    val = datetime.strptime(str(val),
                                            "%Y%m%d")  # assume YYYYMMDD
                value_list.append(val)
            values.append(value_list)
        self.db.insert_into(table_name, row_names, values)

    def get_historic_events(self):
        """Gets all the historical events (splits/dividends) from yahoo,
        updating only the new entries based on the date of the last fetch.
        """
        exchange = "TSX"
        listings = self.db.select("ticker, dateoflisting", "listings",
                                  where="exchange = %s", vals=[exchange])
        dict_fields = ["index", "action", "value"]
        fields = ["exchange", "ticker", "date", "action", "value"]
        total_listings = len(listings)
        for counter, (ticker, listdate) in enumerate(listings):
            # Resume one day after the newest stored event for this ticker.
            lastdate = self.db.select("MAX(date)", "event_history",
                                      fetch="one",
                                      where="exchange = %s AND ticker = %s",
                                      vals=[exchange, ticker], unroll=True)
            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)
            event_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                event_dict = self.ticker_event_history(startdate, self.today,
                                                       yahoo_ticker)
            if event_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))
                rows = [[exchange, ticker] + [row[k] for k in dict_fields]
                        for row in event_dict]
                self.db.insert_into("event_history", fields, rows)
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))

    def get_historic_prices(self):
        """Gets all the historical prices from yahoo, updating only the
        new entries based on the date of the last fetch.
        """
        exchange = "TSX"
        listings = self.db.select("ticker, dateoflisting", "listings",
                                  where="exchange = %s", vals=[exchange])
        dict_fields = ["Adj Close", "High", "Close", "Open", "Low", "Date"]
        fields = ["exchange", "ticker"] + [x.lower() for x in dict_fields]
        total_listings = len(listings)
        for counter, (ticker, listdate) in enumerate(listings):
            lastdate = self.db.select("MAX(date)", "price_history",
                                      fetch="one",
                                      where="exchange = %s AND ticker = %s",
                                      vals=[exchange, ticker], unroll=True)
            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)
            his_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                his_dict = self.ticker_price_history(startdate, self.today,
                                                     yahoo_ticker)
            if his_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))
                rows = [[exchange, ticker] + [row[k] for k in dict_fields]
                        for row in his_dict]
                self.db.insert_into("price_history", fields, rows)
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))

    def cleanse_str(self, raw_str):
        """Normalise a column header for matching."""
        # NOTE(review): the second replace's needle renders as a plain
        # space here; it may originally have been a non-breaking space
        # lost in transcription - confirm against the repo history.
        return raw_str.replace('\n', ' ').replace(" ", ' ')

    def find_date_in_list(self, strings):
        """Returns the first date that occurs in a list of strings, or
        the current date if none are detected.
        """
        cur_date = self.today  # default = cur. date
        for s in strings:
            try:
                temp_date = dparser.parse(s, fuzzy=True).date()
            except ValueError:
                continue
            if cur_date != temp_date:
                cur_date = temp_date
                break
        return cur_date

    def ticker_price_history(self, start, end, ticker):
        """Gets and returns the historic prices for a given ticker
        between the time period provided. Inclusive.
        """
        try:
            dr = DataReader(ticker, 'yahoo', start, end).reset_index()
        except OSError:
            return []  # if the page cannot be reached for some reason
        return dr.to_dict(orient="records")

    def ticker_event_history(self, start, end, ticker):
        """Same as ticker_price_history but for split/dividend actions."""
        try:
            dr = DataReader(ticker, 'yahoo-actions', start, end).reset_index()
        except OSError:
            return []  # if the page cannot be reached for some reason
        return dr.to_dict(orient="records")

    def clean_exit(self):
        """Release the database connection."""
        self.db.destroy()
def __init__(self, normalize=True):
    """Cache the ms_financials feature column names.

    Drops identity/bookkeeping columns so only numeric features remain.
    """
    self.db = Database()
    self.not_feature_cols = {'ticker', 'exchange', 'update_date',
                             'fiscal_year', 'period'}
    feature_cols = []
    for col in self.db.get_col_names("ms_financials"):
        if col not in self.not_feature_cols:
            feature_cols.append(col)
    self.ms_financials_cols = feature_cols
    self.ms_financials_cols_str = ','.join(feature_cols)
def __init__(self):
    """Build the CSV-title -> DB-column maps and request state for the
    Morning Star scraper (doc-only update: comments added, code unchanged).
    """
    self.db = Database()
    self.today = datetime.today().date()
    # ISO date used as the stand-in date for "TTM" columns.
    self.ttm_string = self.most_recent_quarter()
    self.headers = {
        'User-Agent':
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    }
    # Morning Star exchange code -> our database exchange name.
    self.exchange_map = {
        "XTSE": "TSX",
    }
    # Columns that hold %Y-%m values rather than plain numbers.
    self.year_month_cols = set(
        {"fiscal_year", "margin_date", "profitability_date"})
    # (CSV row-title prefix -> ms_key_statistics column). Prefixes such
    # as "pro ", "cf ", "fh ", "er ", "r% " are prepended by the section
    # subtitles declared in special_key_titles below. Tuple order matters:
    # presumably the first prefix match wins (see find_column) - confirm.
    self.column_key_map = tuple((
        ("revenue", "revenue"), ("gross margin", "gross_margin"),
        ("operating income", "operating_income"),
        ("operating margin", "operating_margin"),
        ("net income", "net_income"), ("earnings per share", "eps"),
        ("dividends", "dividends"), ("payout ratio", "payout_ratio"),
        ("shares", "num_shares"), ("book value per", "book_value_ps"),
        ("operating cash flow", "operating_cash_flow"),
        ("cap spending", "cap_spending"),
        ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
        ("cf free cash flow/sales", "free_cash_flow_sales"),
        ("cf free cash flow/net", "free_cash_flow_net_income"),
        ("free cash flow per share", "free_cash_flow_ps"),
        ("free cash flow", "free_cash_flow"),
        ("working capital", "working_captial"),
        ("pro margins %", "margin_date"),
        ("pro revenue", "revenue_per_sales"),
        ("pro cogs", "revenue_per_cogs"),
        ("pro gross margin", "sales_gross_margin"),
        ("pro sg&a", "margin_sga"), ("pro r&d", "margin_rd"),
        ("pro other", "margin_other"),
        ("pro operating margin", "margin_operating"),
        ("pro net int inc", "margin_net_income"),
        ("pro ebt margin", "margin_ebt"),
        ("pro profitability", "profitability_date"),
        ("pro tax rate", "tax_rate"),
        ("pro net margin", "net_margin_perc"),
        ("pro asset turnover", "asset_turnover"),
        ("pro return on assets", "ro_assets"),
        ("pro financial lever", "financial_leverage"),
        ("pro return on equity", "ro_equity"),
        ("pro return on invested capital", "ro_invested_captial"),
        ("pro interest coverage", "interest_coverage"),
        ("r% year over year", "revenue_perc_yoy"),
        ("r% 3-year", "revenue_perc_3y"),
        ("r% 5-year", "revenue_perc_5y"),
        ("r% 10-year", "revenue_perc_10y"),
        ("oi% year over year", "operating_income_yoy"),
        ("oi% 3-year", "operating_income_3y"),
        ("oi% 5-year", "operating_income_5y"),
        ("oi% 10-year", "operating_income_10y"),
        ("ni% year over year", "net_income_yoy"),
        ("ni% 3-year", "net_income_3y"),
        ("ni% 5-year", "net_income_5y"),
        ("ni% 10-year", "net_income_10y"),
        ("eps% year over year", "eps_yoy"), ("eps% 3-year", "eps_3y"),
        ("eps% 5-year", "eps_5y"), ("eps% 10-year", "eps_10y"),
        ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
        ("cf cap ex", "cap_expense_perc_sales"),
        ("fh cash & short", "cash_short_term"),
        ("fh accounts receivable", "accounts_receivable"),
        ("fh inventory", "inventory"),
        ("fh other current assets", "other_cur_assets"),
        ("fh total current assets", "total_cur_assets"),
        ("fh net pp&e", "net_ppe"), ("fh intangibles", "intangibles"),
        ("fh other long-term assets", "other_long_term_assets"),
        ("fh accounts payable", "accounts_payable"),
        ("fh short-term debt", "short_term_debt"),
        ("fh taxes payable", "taxes_payable"),
        ("fh accured liabilities", "accured_liabilities"),
        ("fh other short-term liabilities", "short_term_liabilities"),
        ("fh long-term debt", "long_term_debt"),
        ("fh total liabilities & equity", "total_liabilities_equity"),
        ("fh total liabilities", "total_liabilities"),
        ("fh total stockholder", "total_stockholder"),
        ("fh current ratio", "current_ratio"),
        ("fh quick ratio", "quick_ratio"),
        ("fh debt/equity", "debt_equity"),
        ("er receivables turnover", "receivables_turnover"),
        ("er inventory turnover", "inventory_turnover"),
        ("er fixed assets turnover", "fixed_assets_turnover"),
    ))
    # (CSV row-title prefix -> ms_financials column) for the income
    # statement export.
    self.column_financials_map = tuple((
        ("fiscal year", "fiscal_year"), ("revenue", "revenue"),
        ("cost of revenue", "revenue_cost"),
        ("gross profit", "gross_profit"),
        ("sales, general and administrative", "sales_expense"),
        ("other operating", "operating_expense"),
        ("other assets", "other_assets"),
        ("operating income", "operating_income"),
        ("interest expense", "intrest_expense"),
        ("total operating expense", "total_costs"),
        ("total costs and expenses", "total_costs"),
        ("preferred dividend", "preferred_dividend"),
        ("income before", "income_before_taxes"),
        ("provision for", "provision_taxes"),
        ("net income from continuing op", "net_income_continuing_ops"),
        ("net income from discontinuing ops", "net_income_discontinuing_ops"),
        ("net income available to common shareholders", "net_income_common"),
        ("net income", "net_income"), ("eps basic", "eps_basic"),
        ("eps diluted", "eps_diluted"), ("waso basic", "waso_basic"),
        ("waso diluted", "waso_diluted"), ("ebitda", "ebitda"),
    ))
    # Section headers in the key-statistics CSV -> prefix applied to the
    # rows that follow them.
    self.special_key_titles = tuple((
        ("key ratios -> profitability", "pro "),
        ("key ratios -> growth", "gro "),
        ("key ratios -> cash flow", "cf "),
        ("key ratios -> financial health", "fh "),
        ("key ratios -> efficiency ratios", "er "),
        ("revenue %", "r% "),
        ("operating income %", "oi% "),
        ("net income %", "ni% "),
        ("eps %", "eps% "),
    ))
    # Section headers in the financials CSV -> row prefix.
    self.special_financials_titles = tuple((
        ("earnings per share", "eps "),
        ("weighted average shares outstanding", "waso "),
    ))
    # str.translate table that strips quotes and thousands separators.
    self.translation_table = dict.fromkeys(map(ord, '",'), None)
class MorningStarScaper():
    """Scrapes Morning Star key-statistics and income-statement CSV
    exports into ms_key_statistics / ms_financials.

    Fixes over the previous revision:
      * ``fetch_all`` called an undefined global ``mss`` and the
        misspelled ``get_financia`` - both now go through ``self``.
      * both request URLs contained a mojibake registered-trademark sign
        where ``&reg`` of ``&region=`` had been decoded as an HTML
        entity; the ``&region=`` query parameter is restored.
      * ``parse_csv`` aliased and mutated its mutable default
        ``extra_cols`` list, growing it across calls; it now copies.
      * ``LOGGER.info(..., end='')`` passed a ``print``-only keyword.

    NOTE(review): class name keeps the original (misspelled) identifier
    because external code may reference it.
    """

    def __init__(self):
        self.db = Database()
        self.today = datetime.today().date()
        # ISO date used as the stand-in date for "TTM" columns.
        self.ttm_string = self.most_recent_quarter()
        self.headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        }
        # Morning Star exchange code -> our database exchange name.
        self.exchange_map = {
            "XTSE": "TSX",
        }
        # Columns that hold %Y-%m values rather than plain numbers.
        self.year_month_cols = {"fiscal_year", "margin_date",
                                "profitability_date"}
        # (CSV row-title prefix -> ms_key_statistics column). Order
        # matters: find_column returns the first prefix match.
        self.column_key_map = tuple((
            ("revenue", "revenue"), ("gross margin", "gross_margin"),
            ("operating income", "operating_income"),
            ("operating margin", "operating_margin"),
            ("net income", "net_income"), ("earnings per share", "eps"),
            ("dividends", "dividends"), ("payout ratio", "payout_ratio"),
            ("shares", "num_shares"), ("book value per", "book_value_ps"),
            ("operating cash flow", "operating_cash_flow"),
            ("cap spending", "cap_spending"),
            ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
            ("cf free cash flow/sales", "free_cash_flow_sales"),
            ("cf free cash flow/net", "free_cash_flow_net_income"),
            ("free cash flow per share", "free_cash_flow_ps"),
            ("free cash flow", "free_cash_flow"),
            ("working capital", "working_captial"),
            ("pro margins %", "margin_date"),
            ("pro revenue", "revenue_per_sales"),
            ("pro cogs", "revenue_per_cogs"),
            ("pro gross margin", "sales_gross_margin"),
            ("pro sg&a", "margin_sga"), ("pro r&d", "margin_rd"),
            ("pro other", "margin_other"),
            ("pro operating margin", "margin_operating"),
            ("pro net int inc", "margin_net_income"),
            ("pro ebt margin", "margin_ebt"),
            ("pro profitability", "profitability_date"),
            ("pro tax rate", "tax_rate"),
            ("pro net margin", "net_margin_perc"),
            ("pro asset turnover", "asset_turnover"),
            ("pro return on assets", "ro_assets"),
            ("pro financial lever", "financial_leverage"),
            ("pro return on equity", "ro_equity"),
            ("pro return on invested capital", "ro_invested_captial"),
            ("pro interest coverage", "interest_coverage"),
            ("r% year over year", "revenue_perc_yoy"),
            ("r% 3-year", "revenue_perc_3y"),
            ("r% 5-year", "revenue_perc_5y"),
            ("r% 10-year", "revenue_perc_10y"),
            ("oi% year over year", "operating_income_yoy"),
            ("oi% 3-year", "operating_income_3y"),
            ("oi% 5-year", "operating_income_5y"),
            ("oi% 10-year", "operating_income_10y"),
            ("ni% year over year", "net_income_yoy"),
            ("ni% 3-year", "net_income_3y"),
            ("ni% 5-year", "net_income_5y"),
            ("ni% 10-year", "net_income_10y"),
            ("eps% year over year", "eps_yoy"), ("eps% 3-year", "eps_3y"),
            ("eps% 5-year", "eps_5y"), ("eps% 10-year", "eps_10y"),
            ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
            ("cf cap ex", "cap_expense_perc_sales"),
            ("fh cash & short", "cash_short_term"),
            ("fh accounts receivable", "accounts_receivable"),
            ("fh inventory", "inventory"),
            ("fh other current assets", "other_cur_assets"),
            ("fh total current assets", "total_cur_assets"),
            ("fh net pp&e", "net_ppe"), ("fh intangibles", "intangibles"),
            ("fh other long-term assets", "other_long_term_assets"),
            ("fh accounts payable", "accounts_payable"),
            ("fh short-term debt", "short_term_debt"),
            ("fh taxes payable", "taxes_payable"),
            ("fh accured liabilities", "accured_liabilities"),
            ("fh other short-term liabilities", "short_term_liabilities"),
            ("fh long-term debt", "long_term_debt"),
            ("fh total liabilities & equity", "total_liabilities_equity"),
            ("fh total liabilities", "total_liabilities"),
            ("fh total stockholder", "total_stockholder"),
            ("fh current ratio", "current_ratio"),
            ("fh quick ratio", "quick_ratio"),
            ("fh debt/equity", "debt_equity"),
            ("er receivables turnover", "receivables_turnover"),
            ("er inventory turnover", "inventory_turnover"),
            ("er fixed assets turnover", "fixed_assets_turnover"),
        ))
        # (CSV row-title prefix -> ms_financials column).
        self.column_financials_map = tuple((
            ("fiscal year", "fiscal_year"), ("revenue", "revenue"),
            ("cost of revenue", "revenue_cost"),
            ("gross profit", "gross_profit"),
            ("sales, general and administrative", "sales_expense"),
            ("other operating", "operating_expense"),
            ("other assets", "other_assets"),
            ("operating income", "operating_income"),
            ("interest expense", "intrest_expense"),
            ("total operating expense", "total_costs"),
            ("total costs and expenses", "total_costs"),
            ("preferred dividend", "preferred_dividend"),
            ("income before", "income_before_taxes"),
            ("provision for", "provision_taxes"),
            ("net income from continuing op", "net_income_continuing_ops"),
            ("net income from discontinuing ops",
             "net_income_discontinuing_ops"),
            ("net income available to common shareholders",
             "net_income_common"),
            ("net income", "net_income"), ("eps basic", "eps_basic"),
            ("eps diluted", "eps_diluted"), ("waso basic", "waso_basic"),
            ("waso diluted", "waso_diluted"), ("ebitda", "ebitda"),
        ))
        # Section headers -> prefix applied to the rows that follow them.
        self.special_key_titles = tuple((
            ("key ratios -> profitability", "pro "),
            ("key ratios -> growth", "gro "),
            ("key ratios -> cash flow", "cf "),
            ("key ratios -> financial health", "fh "),
            ("key ratios -> efficiency ratios", "er "),
            ("revenue %", "r% "),
            ("operating income %", "oi% "),
            ("net income %", "ni% "),
            ("eps %", "eps% "),
        ))
        self.special_financials_titles = tuple((
            ("earnings per share", "eps "),
            ("weighted average shares outstanding", "waso "),
        ))
        # str.translate table stripping quotes and thousands separators.
        self.translation_table = dict.fromkeys(map(ord, '",'), None)

    def most_recent_quarter(self):
        """Return the ISO date of the first day of the current quarter."""
        quarter = (self.today.month - 1) // 3
        month = quarter * 3 + 1
        return datetime(year=self.today.year, month=month,
                        day=1).date().isoformat()

    def find_column(self, col, mapper, subtitle=''):
        """Map a CSV row title to a DB column name by prefix.

        Prefers the first entry matching subtitle+title; otherwise falls
        back to the last entry matching the bare title. Returns '' when
        nothing matches.
        """
        col = col.lower().replace('"', '')
        wst = subtitle + col
        alt = ''
        for k, v in mapper:
            if wst.startswith(k):
                return v
            elif col.startswith(k):
                alt = v
        return alt

    def convert_numerical(self, n):
        """Return int(n) or float(n) when possible, else n unchanged."""
        try:
            return int(n)
        except ValueError:
            try:
                return float(n)
            except ValueError:
                return n

    def get_title_multiplier(self, title):
        """Return the scale implied by a row-title suffix (Ths/Mil/Bil)."""
        multipliers = ["Ths", "Mil", "Bil"]
        factors = [10**3, 10**6, 10**9]
        for i, multi in enumerate(multipliers):
            if title.endswith(multi):
                return factors[i]
            # fall through: no suffix means no scaling
        return 1

    def parse_csv(self, csv_r, num_cols, special_titles, column_map,
                  extra_cols=[], extra_vals=[]):
        """Parse a Morning Star CSV into (col_names, rows).

        Returns `num_cols` rows, each seeded with `extra_vals`.
        Bug fix: `extra_cols` used to be aliased and appended to, so the
        shared default list grew across calls; it is copied now.
        """
        subhead = ''
        next(csv_r)  # skip header
        col_names = list(extra_cols)  # copy: never mutate caller's/default list
        vals = [list(extra_vals) for _ in range(num_cols)]  # Don't do [[]] * n
        for cols in csv_r:
            row_cols = len(cols)
            if row_cols == 0:
                continue
            elif row_cols == 1:
                # A lone cell is a section header; it prefixes later rows.
                subhead = self.find_column(cols[0], special_titles)
            else:
                db_col = self.find_column(cols[0], column_map,
                                          subtitle=subhead)
                if db_col:
                    multi = self.get_title_multiplier(cols[0])
                    col_names.append(db_col)
                    for i in range(num_cols):
                        cell = cols[i + 1].translate(self.translation_table)
                        val = self.convert_numerical(
                            cell) * multi if cell else None
                        if db_col in self.year_month_cols:
                            val = self.ttm_string if val == 'TTM' else datetime.strptime(
                                val, '%Y-%m')
                        vals[i].append(val)
        return col_names, vals

    def get_key_stats(self, ticker, db_exchange="TSX"):
        """Fetch the key-statistics CSV for `ticker` and insert it into
        ms_key_statistics. Returns 0 on success, 1 when no data exists.
        """
        # "&region=" restored (was mangled into a (R) entity).
        url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&"
               "culture=en-CA&region=CAN&order=asc&r={}").format(
                   ticker, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        # NOTE(review): csv_r is always truthy; the content-length check
        # is the real signal - confirm.
        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            LOGGER.info("Getting key statistics for {}... ".format(ticker))
        else:
            LOGGER.info("Skipping", ticker)
            return 1
        col_names, vals = self.parse_csv(
            csv_r, 10, self.special_key_titles, self.column_key_map,
            extra_cols=["ticker", "exchange", "update_date"],
            extra_vals=[ticker, db_exchange, self.today])
        self.db.insert_into("ms_key_statistics", col_names, vals,
                            unique_conflict=True)
        LOGGER.info("Done")
        return 0

    def get_financial(self, ticker, period_name, exchange="XTSE"):
        """Fetch an income-statement CSV and insert it into ms_financials.

        period_name: "quarter" or "annual"
        exchange: Morning Star exchange code, e.g. XTSE (TSX)
        Returns 0 on success, 1 when no data exists.
        Raises ValueError for unsupported exchanges.
        """
        # This converts the Morning Star exchange name to our database name.
        if exchange in self.exchange_map:
            db_exchange = self.exchange_map[exchange]
        else:
            raise ValueError("Exchange unsupported {}".format(exchange))
        period = 3 if period_name == "quarter" else 12
        # "&region=" restored (was mangled into a (R) entity).
        url = (
            "http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t="
            "{}:{}&region=can&culture=en-US&cur=&reportType=is&period={}&"
            "dataType=A&order=desc&columnYear=5&curYearPart=1st5year&"
            "rounding=1&view=raw&r={}&denominatorView=raw&number=1").format(
                exchange, ticker, period, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            # `end=''` removed: that is a print() kwarg, not a logger one.
            LOGGER.info("Getting {} financial data for {}... ".format(
                period_name, ticker))
        else:
            LOGGER.info("Skipping", ticker)
            return 1
        # Skip last column if not quarter view (removes TTM).
        num_cols = 6 if period_name == "quarter" else 5
        col_names, vals = self.parse_csv(
            csv_r, num_cols, self.special_financials_titles,
            self.column_financials_map,
            extra_cols=["ticker", "exchange", "period", "update_date"],
            extra_vals=[ticker, db_exchange, period, self.today])
        self.db.insert_into("ms_financials", col_names, vals,
                            unique_conflict=True)
        LOGGER.info("Done")
        return 0

    def fetch_all(self, db_exchange):
        """Fetch key stats plus quarterly and annual financials for every
        listing on `db_exchange` that is on Morning Star (or unchecked).
        """
        listings = self.db.select(
            "ticker",
            "listings",
            where="exchange = %s AND (onms=TRUE OR onms IS NULL)",
            vals=[db_exchange],
            unroll=True)
        for ticker in listings:
            found1 = self.get_key_stats(ticker)  # was: undefined global `mss`
            found2 = self.get_financial(ticker, "quarter")
            found3 = self.get_financial(ticker, "annual")  # was: get_financia
            # 1 means "missing"; listing stays flagged unless all fetches
            # came back empty.
            on_morningstar = not (found1 and found2 and found3)
            self.db.update("listings", ["onms"], [on_morningstar],
                           "exchange=%s AND ticker=%s", [db_exchange, ticker])

    def clean_exit(self):
        """Release the database connection."""
        self.db.destroy()
def __init__(self, normalize=True):
    """Open the database and cache the ms_financials column names."""
    # NOTE(review): `normalize` is accepted but unused here - confirm intent.
    self.db = Database()
    self.ms_financials_cols = self.db.get_col_names("ms_financials")