def get_html(urlQ, callback, xpath_hooks):
    """Render each URL from the queue with a headless webkit browser and hand
    the page source to ``callback``.

    Args:
        urlQ: queue of URLs to fetch; drained until empty.
        callback: invoked as ``callback(session, url, page_source)`` for each
            successfully rendered page (``session`` is a fresh DB session).
        xpath_hooks: list of XPath expressions used to decide when the page is
            loaded; the page counts as ready as soon as any one of them
            matches (e.g. ``["//div[@data-test='whatever']"]``).
    """
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)
    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    # Images are never needed for scraping; skipping them speeds up loads.
    sess.set_attribute("auto_load_images", False)

    def valid_page_func():
        # Page is "loaded" once any hook XPath resolves.
        return any(sess.at_xpath(xpath) for xpath in xpath_hooks)

    session = Session()
    try:
        while not urlQ.empty():
            url = urlQ.get()
            try:
                sess.visit(url)
            except webkit_server.InvalidResponseError:
                LOGGER.error(
                    "Got invalid response from something? Skipping {}".format(url))
                continue
            try:
                sess.wait_for(valid_page_func, interval=1, timeout=15)
            except dryscrape.mixins.WaitTimeoutError:
                LOGGER.error("Timeout so skipping {}".format(url))
                continue
            response = sess.body()
            callback(session, url, response)
            sess.reset()
    finally:
        # Previously only ran on the happy path: an exception in the callback
        # leaked the webkit server process and the DB session. Always clean up.
        svr.kill()
        session.close()
def __init__(self):
    """Build the Yahoo Finance CSV API flag tables and parsing regexes.

    This is a reverse engineering of the Yahoo Finance REST API.
    Information off: http://www.jarloo.com/yahoo_finance/
    """
    # Yahoo single/two-letter query flag -> our database column name.
    # NOTE(review): the original literal listed 'r': 'pe' twice; the second
    # occurrence was a no-op duplicate key and has been removed.
    self.y_to_db_map = {
        'n': 'name', 'y': 'dividend_yield', 'd': 'dividend_ps', 'r': 'pe',
        'r1': 'dividend_pay_date', 'q': 'ex_dividend_date', 'o': 'open',
        'c1': 'change', 'p2': 'perc_change', 'd1': 'last_trade_date',
        'd2': 'trade_date', 'c3': 'commission', 'g': 'day_low',
        'h': 'day_high', 'p': 'previous_close', 't8': 'year_target',
        'm5': 'change_mv_avg_200', 'm6': 'perc_change_mv_avg_200',
        'm7': 'change_mv_avg_50', 'm8': 'perc_change_mv_avg_50',
        'm3': 'mv_avg_50', 'm4': 'mv_avg_200', 'w1': 'day_value_change',
        'g1': 'holding_gain_perc', 'g3': 'annualized_gain',
        'g4': 'holdings_gain', 'k': 'high_52_week', 'j': 'low_52_week',
        'j5': 'change_52_week_low', 'k4': 'change_52_week_high',
        'j6': 'perc_change_52_week_low', 'k5': 'perc_change_52_week_high',
        'j1': 'market_cap', 'f6': 'float_shares', 'x': 'stock_exchange',
        's1': 'shares_owned', 'j2': 'shares_outstanding', 'n4': 'notes',
        'i': 'more_info', 'v': 'volume', 'a2': 'avg_daily_volume',
        'e': 'eps', 'e7': 'eps_year_estimate', 'e8': 'eps_next_year_estimate',
        'e9': 'eps_next_q_estimate', 'b4': 'book', 'j4': 'ebitda',
        'p5': 'price_sale', 'p6': 'price_book', 'r5': 'peg',
        'r6': 'price_eps_estimate_year', 'r7': 'price_eps_estimate_next_year',
        's7': 'short_ratio', 's6': 'revenue', 'v1': 'holdings_val',
        'l2': 'high_limit', 'l3': 'low_limit', 'a': 'ask', 'b': 'bid'
    }
    # Suffix multipliers for condensed magnitudes such as "1.5M".
    self.convert_dict = {'K': 10 ** 3, 'M': 10 ** 6, 'B': 10 ** 9, 'T': 10 ** 12}
    # Raw strings: "\d" in a plain string is an invalid escape in Python 3.
    self.condensed_pat = re.compile(r"([+-]?\d*\.?\d+)([kmbtKMBT])")
    self.url_flags = tuple(self.y_to_db_map.keys())
    # Concatenated flags form the f= parameter of the quotes.csv request.
    self.url_str_flags = "".join(self.url_flags)
    self.db_entries = tuple(self.y_to_db_map.values())
    self.float_pat = re.compile(r"[+-]?(\d*[\.])?\d+$")
    self.today = datetime.today().date()
    self.base_url = "http://finance.yahoo.com/d/quotes.csv"
    # Sometimes websites are friendlier to iOS devices :)
    self.headers = {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
    }
    self.session = Session()
def __init__(self):
    """Set up the DB session, date bookkeeping, and the Morning Star CSV
    header -> database column lookup tables.

    NOTE(review): misspelled values such as 'working_captial',
    'accured_liabilities' and 'intrest_expense' are existing database column
    names and are preserved on purpose.
    """
    self.session = Session()
    self.today = datetime.today().date()
    # ISO date used in place of the "TTM" (trailing-twelve-month) column.
    self.ttm_string = self.most_recent_quarter()
    self.headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    }
    # Morning Star exchange code -> our database exchange name.
    self.exchange_map = {
        "XTSE": "TSX",
    }
    # Columns whose cells contain a year-month or "TTM" marker, not a number.
    # (Plain set literal; the old set({...}) wrapper was redundant.)
    self.year_month_cols = {"fiscal_year", "margin_date", "profitability_date"}
    # (csv row-title prefix, db column). Matching uses startswith(), so order
    # matters: more specific prefixes must precede their shorter variants
    # (e.g. "free cash flow per share" before "free cash flow").
    self.column_key_map = (
        ("revenue", "revenue"),
        ("gross margin", "gross_margin"),
        ("operating income", "operating_income"),
        ("operating margin", "operating_margin"),
        ("net income", "net_income"),
        ("earnings per share", "eps"),
        ("dividends", "dividends"),
        ("payout ratio", "payout_ratio"),
        ("shares", "num_shares"),
        ("book value per", "book_value_ps"),
        ("operating cash flow", "operating_cash_flow"),
        ("cap spending", "cap_spending"),
        ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
        ("cf free cash flow/sales", "free_cash_flow_sales"),
        ("cf free cash flow/net", "free_cash_flow_net_income"),
        ("free cash flow per share", "free_cash_flow_ps"),
        ("free cash flow", "free_cash_flow"),
        ("working capital", "working_captial"),
        ("pro margins %", "margin_date"),
        ("pro revenue", "revenue_per_sales"),
        ("pro cogs", "revenue_per_cogs"),
        ("pro gross margin", "sales_gross_margin"),
        ("pro sg&a", "margin_sga"),
        ("pro r&d", "margin_rd"),
        ("pro other", "margin_other"),
        ("pro operating margin", "margin_operating"),
        ("pro net int inc", "margin_net_income"),
        ("pro ebt margin", "margin_ebt"),
        ("pro profitability", "profitability_date"),
        ("pro tax rate", "tax_rate"),
        ("pro net margin", "net_margin_perc"),
        ("pro asset turnover", "asset_turnover"),
        ("pro return on assets", "ro_assets"),
        ("pro financial lever", "financial_leverage"),
        ("pro return on equity", "ro_equity"),
        ("pro return on invested capital", "ro_invested_captial"),
        ("pro interest coverage", "interest_coverage"),
        ("r% year over year", "revenue_perc_yoy"),
        ("r% 3-year", "revenue_perc_3y"),
        ("r% 5-year", "revenue_perc_5y"),
        ("r% 10-year", "revenue_perc_10y"),
        ("oi% year over year", "operating_income_yoy"),
        ("oi% 3-year", "operating_income_3y"),
        ("oi% 5-year", "operating_income_5y"),
        ("oi% 10-year", "operating_income_10y"),
        ("ni% year over year", "net_income_yoy"),
        ("ni% 3-year", "net_income_3y"),
        ("ni% 5-year", "net_income_5y"),
        ("ni% 10-year", "net_income_10y"),
        ("eps% year over year", "eps_yoy"),
        ("eps% 3-year", "eps_3y"),
        ("eps% 5-year", "eps_5y"),
        ("eps% 10-year", "eps_10y"),
        ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
        ("cf cap ex", "cap_expense_perc_sales"),
        ("fh cash & short", "cash_short_term"),
        ("fh accounts receivable", "accounts_receivable"),
        ("fh inventory", "inventory"),
        ("fh other current assets", "other_cur_assets"),
        ("fh total current assets", "total_cur_assets"),
        ("fh net pp&e", "net_ppe"),
        ("fh intangibles", "intangibles"),
        ("fh other long-term assets", "other_long_term_assets"),
        ("fh accounts payable", "accounts_payable"),
        ("fh short-term debt", "short_term_debt"),
        ("fh taxes payable", "taxes_payable"),
        ("fh accured liabilities", "accured_liabilities"),
        ("fh other short-term liabilities", "short_term_liabilities"),
        ("fh long-term debt", "long_term_debt"),
        ("fh total liabilities & equity", "total_liabilities_equity"),
        ("fh total liabilities", "total_liabilities"),
        ("fh total stockholder", "total_stockholder"),
        ("fh current ratio", "current_ratio"),
        ("fh quick ratio", "quick_ratio"),
        ("fh debt/equity", "debt_equity"),
        ("er receivables turnover", "receivables_turnover"),
        ("er inventory turnover", "inventory_turnover"),
        ("er fixed assets turnover", "fixed_assets_turnover"),
    )
    # Same prefix-matching scheme for the income-statement CSV.
    self.column_financials_map = (
        ("fiscal year", "fiscal_year"),
        ("revenue", "revenue"),
        ("cost of revenue", "revenue_cost"),
        ("gross profit", "gross_profit"),
        ("sales, general and administrative", "sales_expense"),
        ("other operating", "operating_expense"),
        ("other assets", "other_assets"),
        ("operating income", "operating_income"),
        ("interest expense", "intrest_expense"),
        ("total operating expense", "total_costs"),
        ("total costs and expenses", "total_costs"),
        ("preferred dividend", "preferred_dividend"),
        ("income before", "income_before_taxes"),
        ("provision for", "provision_taxes"),
        ("net income from continuing op", "net_income_continuing_ops"),
        ("net income from discontinuing ops", "net_income_discontinuing_ops"),
        ("net income available to common shareholders", "net_income_common"),
        ("net income", "net_income"),
        ("eps basic", "eps_basic"),
        ("eps diluted", "eps_diluted"),
        ("waso basic", "waso_basic"),
        ("waso diluted", "waso_diluted"),
        ("ebitda", "ebitda"),
    )
    # Section headers in the key-ratios CSV; the short code is prefixed onto
    # following row titles so they can be disambiguated above.
    self.special_key_titles = (
        ("key ratios -> profitability", "pro "),
        ("key ratios -> growth", "gro "),
        ("key ratios -> cash flow", "cf "),
        ("key ratios -> financial health", "fh "),
        ("key ratios -> efficiency ratios", "er "),
        ("revenue %", "r% "),
        ("operating income %", "oi% "),
        ("net income %", "ni% "),
        ("eps %", "eps% "),
    )
    self.special_financials_titles = (
        ("earnings per share", "eps "),
        ("weighted average shares outstanding", "waso "),
    )
    # str.translate table that strips quotes and thousands separators.
    self.translation_table = dict.fromkeys(map(ord, '",'), None)
class MorningStarScaper():
    """Scrapes key statistics and financial statements from the Morning Star
    CSV-export endpoints and upserts them into the database.

    NOTE(review): misspelled values such as 'working_captial',
    'accured_liabilities' and 'intrest_expense' are existing database column
    names and are preserved on purpose.
    """

    def __init__(self):
        self.session = Session()
        self.today = datetime.today().date()
        # ISO date used in place of the "TTM" (trailing-twelve-month) column.
        self.ttm_string = self.most_recent_quarter()
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        }
        # Morning Star exchange code -> our database exchange name.
        self.exchange_map = {
            "XTSE": "TSX",
        }
        # Columns whose cells contain a year-month or "TTM" marker.
        self.year_month_cols = {"fiscal_year", "margin_date", "profitability_date"}
        # (csv row-title prefix, db column). Matching uses startswith(), so
        # order matters: specific prefixes must precede shorter variants
        # (e.g. "free cash flow per share" before "free cash flow").
        self.column_key_map = (
            ("revenue", "revenue"),
            ("gross margin", "gross_margin"),
            ("operating income", "operating_income"),
            ("operating margin", "operating_margin"),
            ("net income", "net_income"),
            ("earnings per share", "eps"),
            ("dividends", "dividends"),
            ("payout ratio", "payout_ratio"),
            ("shares", "num_shares"),
            ("book value per", "book_value_ps"),
            ("operating cash flow", "operating_cash_flow"),
            ("cap spending", "cap_spending"),
            ("cf free cash flow growth", "free_cash_flow_growth_yoy"),
            ("cf free cash flow/sales", "free_cash_flow_sales"),
            ("cf free cash flow/net", "free_cash_flow_net_income"),
            ("free cash flow per share", "free_cash_flow_ps"),
            ("free cash flow", "free_cash_flow"),
            ("working capital", "working_captial"),
            ("pro margins %", "margin_date"),
            ("pro revenue", "revenue_per_sales"),
            ("pro cogs", "revenue_per_cogs"),
            ("pro gross margin", "sales_gross_margin"),
            ("pro sg&a", "margin_sga"),
            ("pro r&d", "margin_rd"),
            ("pro other", "margin_other"),
            ("pro operating margin", "margin_operating"),
            ("pro net int inc", "margin_net_income"),
            ("pro ebt margin", "margin_ebt"),
            ("pro profitability", "profitability_date"),
            ("pro tax rate", "tax_rate"),
            ("pro net margin", "net_margin_perc"),
            ("pro asset turnover", "asset_turnover"),
            ("pro return on assets", "ro_assets"),
            ("pro financial lever", "financial_leverage"),
            ("pro return on equity", "ro_equity"),
            ("pro return on invested capital", "ro_invested_captial"),
            ("pro interest coverage", "interest_coverage"),
            ("r% year over year", "revenue_perc_yoy"),
            ("r% 3-year", "revenue_perc_3y"),
            ("r% 5-year", "revenue_perc_5y"),
            ("r% 10-year", "revenue_perc_10y"),
            ("oi% year over year", "operating_income_yoy"),
            ("oi% 3-year", "operating_income_3y"),
            ("oi% 5-year", "operating_income_5y"),
            ("oi% 10-year", "operating_income_10y"),
            ("ni% year over year", "net_income_yoy"),
            ("ni% 3-year", "net_income_3y"),
            ("ni% 5-year", "net_income_5y"),
            ("ni% 10-year", "net_income_10y"),
            ("eps% year over year", "eps_yoy"),
            ("eps% 3-year", "eps_3y"),
            ("eps% 5-year", "eps_5y"),
            ("eps% 10-year", "eps_10y"),
            ("cf operating cash flow", "cash_flow_operating_growth_yoy"),
            ("cf cap ex", "cap_expense_perc_sales"),
            ("fh cash & short", "cash_short_term"),
            ("fh accounts receivable", "accounts_receivable"),
            ("fh inventory", "inventory"),
            ("fh other current assets", "other_cur_assets"),
            ("fh total current assets", "total_cur_assets"),
            ("fh net pp&e", "net_ppe"),
            ("fh intangibles", "intangibles"),
            ("fh other long-term assets", "other_long_term_assets"),
            ("fh accounts payable", "accounts_payable"),
            ("fh short-term debt", "short_term_debt"),
            ("fh taxes payable", "taxes_payable"),
            ("fh accured liabilities", "accured_liabilities"),
            ("fh other short-term liabilities", "short_term_liabilities"),
            ("fh long-term debt", "long_term_debt"),
            ("fh total liabilities & equity", "total_liabilities_equity"),
            ("fh total liabilities", "total_liabilities"),
            ("fh total stockholder", "total_stockholder"),
            ("fh current ratio", "current_ratio"),
            ("fh quick ratio", "quick_ratio"),
            ("fh debt/equity", "debt_equity"),
            ("er receivables turnover", "receivables_turnover"),
            ("er inventory turnover", "inventory_turnover"),
            ("er fixed assets turnover", "fixed_assets_turnover"),
        )
        self.column_financials_map = (
            ("fiscal year", "fiscal_year"),
            ("revenue", "revenue"),
            ("cost of revenue", "revenue_cost"),
            ("gross profit", "gross_profit"),
            ("sales, general and administrative", "sales_expense"),
            ("other operating", "operating_expense"),
            ("other assets", "other_assets"),
            ("operating income", "operating_income"),
            ("interest expense", "intrest_expense"),
            ("total operating expense", "total_costs"),
            ("total costs and expenses", "total_costs"),
            ("preferred dividend", "preferred_dividend"),
            ("income before", "income_before_taxes"),
            ("provision for", "provision_taxes"),
            ("net income from continuing op", "net_income_continuing_ops"),
            ("net income from discontinuing ops", "net_income_discontinuing_ops"),
            ("net income available to common shareholders", "net_income_common"),
            ("net income", "net_income"),
            ("eps basic", "eps_basic"),
            ("eps diluted", "eps_diluted"),
            ("waso basic", "waso_basic"),
            ("waso diluted", "waso_diluted"),
            ("ebitda", "ebitda"),
        )
        # Section headers; the short code is prefixed onto following row
        # titles so they can be disambiguated in the maps above.
        self.special_key_titles = (
            ("key ratios -> profitability", "pro "),
            ("key ratios -> growth", "gro "),
            ("key ratios -> cash flow", "cf "),
            ("key ratios -> financial health", "fh "),
            ("key ratios -> efficiency ratios", "er "),
            ("revenue %", "r% "),
            ("operating income %", "oi% "),
            ("net income %", "ni% "),
            ("eps %", "eps% "),
        )
        self.special_financials_titles = (
            ("earnings per share", "eps "),
            ("weighted average shares outstanding", "waso "),
        )
        # str.translate table that strips quotes and thousands separators.
        self.translation_table = dict.fromkeys(map(ord, '",'), None)

    def most_recent_quarter(self):
        """Return the ISO date of the first day of the current quarter."""
        # (Removed unused local `day`.)
        quarter = (self.today.month - 1) // 3
        year = self.today.year
        month = quarter * 3 + 1
        return datetime(year=year, month=month, day=1).date().isoformat()

    def find_column(self, col, mapper, subtitle=''):
        """Map a CSV row title to a db column via startswith() matching.

        Args:
            col: raw CSV row title.
            mapper: ordered (prefix, db_column) pairs.
            subtitle: section code prepended before matching (e.g. "fh ").

        Returns the db column name, or '' when nothing matches. A match with
        the subtitle prefix wins immediately; otherwise the LAST bare-title
        match is kept as a fallback.
        """
        col = col.lower().replace('"', '')
        wst = subtitle + col
        alt = ''
        for k, v in mapper:
            if wst.startswith(k):
                return v
            elif col.startswith(k):
                alt = v
        return alt

    def convert_numerical(self, n):
        """Coerce a string to int, then float; return it unchanged if neither."""
        try:
            return int(n)
        except ValueError:
            try:
                return float(n)
            except ValueError:
                return n

    def get_title_multiplier(self, title):
        """Return the scale factor implied by a row title's suffix
        ("Ths"/"Mil"/"Bil"), defaulting to 1."""
        multipliers = ["Ths", "Mil", "Bil"]
        factors = [10**3, 10**6, 10**9]
        for i, multi in enumerate(multipliers):
            if title.endswith(multi):
                return factors[i]
        return 1

    def parse_csv(self, csv_r, num_cols, special_titles, column_map,
                  start_dic=None):
        """Parse a Morning Star CSV into a list of db row dicts.

        Args:
            csv_r: csv.reader over the response body (header row is skipped).
            num_cols: number of data columns (periods) per row.
            special_titles: section-header map (see special_*_titles).
            column_map: (prefix, db_column) pairs (see column_*_map).
            start_dic: template dict copied into every result row.
                (Was a mutable default argument; now created per call.)

        Returns a list of num_cols dicts, one per period.
        """
        if start_dic is None:
            start_dic = {}
        subhead = ''
        next(csv_r)  # skip headers
        return_dics = []
        for cols in csv_r:
            row_cols = len(cols)
            if row_cols == 0:
                continue
            elif row_cols == 1:
                # Single-cell rows are section headers; remember the prefix.
                subhead = self.find_column(cols[0], special_titles)
            else:
                db_col = self.find_column(cols[0], column_map, subtitle=subhead)
                if db_col:
                    multi = self.get_title_multiplier(cols[0])
                    if len(return_dics) == 0:
                        return_dics = [
                            start_dic.copy() for _ in range(num_cols)
                        ]
                    for i in range(num_cols):
                        cell = cols[i + 1].translate(self.translation_table)
                        val = self.convert_numerical(
                            cell) * multi if cell else None
                        if db_col in self.year_month_cols:
                            val = self.ttm_string if val == 'TTM' else datetime.strptime(
                                val, '%Y-%m')
                        return_dics[i][db_col] = val
        return return_dics

    def get_key_stats(self, ticker, db_exchange="TSX"):
        """Fetch key statistics for `ticker` from Morning Star and upsert
        them. Returns 0 on success, 1 when the ticker is not on Morning Star.
        """
        # BUGFIX(review): the query string previously contained "®ion=CAN" —
        # the "&reg" of "&region" had been rendered as the ® character,
        # corrupting the region parameter.
        url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&"
               "culture=en-CA&region=CAN&order=asc&r={}").format(
                   ticker, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        # An empty body (content-length 0) means the ticker is unknown.
        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            LOGGER.info("Getting key statistics for {}... ".format(ticker))
        else:
            LOGGER.info("Skipping {}".format(ticker))
            return 1
        return_dics = self.parse_csv(csv_r, 10, self.special_key_titles,
                                     self.column_key_map,
                                     start_dic={
                                         "ticker": ticker,
                                         "exchange": db_exchange,
                                         "update_date": self.today
                                     })
        for d in return_dics:
            stmt = insert(MorningStarKeyStatistics).values(
                d).on_conflict_do_update(constraint='ms_key_statistics_pkey',
                                         set_=d)
            self.session.execute(stmt)
        self.session.commit()
        LOGGER.info("Done")
        return 0

    def get_financial(self, ticker, period_name, exchange="XTSE"):
        """Fetch yearly or quarterly income-statement data from Morning Star.

        Args:
            ticker: exchange-local ticker symbol.
            period_name: "quarter" or "annual".
            exchange: Morning Star exchange code (e.g. XTSE for the TSX).

        Returns 0 on success, 1 when the ticker is not on Morning Star.
        Raises ValueError for an unsupported exchange.
        """
        # Convert the Morning Star exchange name to our database name.
        if exchange in self.exchange_map:
            db_exchange = self.exchange_map[exchange]
        else:
            raise ValueError("Exchange unsupported {}".format(exchange))
        period = 3 if period_name == "quarter" else 12
        # BUGFIX(review): same "®ion" mojibake as in get_key_stats.
        url = (
            "http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t="
            "{}:{}&region=can&culture=en-US&cur=&reportType=is&period={}&"
            "dataType=A&order=desc&columnYear=5&curYearPart=1st5year&"
            "rounding=1&view=raw&r={}&denominatorView=raw&number=1").format(
                exchange, ticker, period, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))
        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            LOGGER.info("Getting {} financial data for {}... ".format(
                period_name, ticker))
        else:
            LOGGER.info("Skipping {}".format(ticker))
            return 1
        num_cols = 6 if period_name == "quarter" else 5
        # skip last column if not quarter view (removes TTM)
        return_dics = self.parse_csv(csv_r, num_cols,
                                     self.special_financials_titles,
                                     self.column_financials_map,
                                     start_dic={
                                         "ticker": ticker,
                                         "exchange": db_exchange,
                                         "period": period,
                                         "update_date": self.today
                                     })
        for d in return_dics:
            stmt = insert(MorningStarFinancials).values(
                d).on_conflict_do_update(constraint='fiscal_year_unique',
                                         set_=d)
            self.session.execute(stmt)
        self.session.commit()
        LOGGER.info("Done")
        return 0

    def fetch_all(self, db_exchange):
        """Refresh key stats and financials for every listing on the exchange
        that is on Morning Star or has never been checked."""
        # BUGFIX(review): `Listings.onms is None` was a Python identity test
        # (always False), so never-checked listings were silently excluded;
        # use SQLAlchemy's .is_(None) to emit "IS NULL".
        q = self.session.query(Listings).filter(
            Listings.exchange == db_exchange,
            or_(Listings.onms == True, Listings.onms.is_(None)))
        for listing in q:
            ticker = listing.ticker
            # BUGFIX(review): these were called on a stray global `mss`
            # instead of self.
            found1 = self.get_key_stats(ticker)
            found2 = self.get_financial(ticker, "quarter")
            found3 = self.get_financial(ticker, "annual")
            # The getters return 1 on "not found"; the listing is on Morning
            # Star if any of the three fetches succeeded.
            on_morningstar = not (found1 and found2 and found3)
            self.session.query(Listings).filter(
                Listings.exchange == db_exchange,
                Listings.ticker == ticker).update(
                    {Listings.onms: on_morningstar})

    def clean_exit(self):
        """Release the database session."""
        self.session.close()
imp = imp.fit(train_data) train_ticker_names = np.array(train_ticker_names, dtype=np.str) train_data = imp.transform(train_data) train_targets = np.array(train_targets, dtype=np.float) test_ticker_names = np.array(test_ticker_names, dtype=np.str) test_data = imp.transform(np.array(test_data, dtype=np.float)) test_targets = np.array(test_targets, dtype=np.float) if not os.path.exists(self.dir_path): os.makedirs(self.dir_path) LOGGER.info("Saving file at: {}".format(self.file_path)) np.savez(self.file_path, train_data=train_data, train_targets=train_targets, train_ticker_names=train_ticker_names, test_data=test_data, test_targets=test_targets, test_ticker_names=test_ticker_names) if __name__ == "__main__": from sa.database import Session sess = Session() fc = FeatureHelper(sess) fc.generate_and_save_feature_data(independent=False) fc.screen_and_save_feature_data()
def __init__(self):
    """Create the helper, binding a fresh database session for its lifetime."""
    self.sess = Session()
def __init__(self, url="http://www.tsx.com/resource/en/571"):
    """Record the source spreadsheet URL and set up scraping state.

    Args:
        url: location of the TSX-published listings spreadsheet.
    """
    self.url = url
    self.today = datetime.today().date()
    self.session = Session()
class ListManager():
    """Maintains the Listings table from the TSX spreadsheet and backfills
    price/event history from Yahoo Finance."""

    def __init__(self, url="http://www.tsx.com/resource/en/571"):
        self.today = datetime.today().date()
        self.session = Session()
        self.url = url

    def get_quotes(self):
        """ This function gets the tickers and various other random
        information from the TSX website from a hardcoded file and inserts
        it into the database """
        recent_date, = self.session.query(func.max(
            Listings.updatedate)).first()
        if self.url.startswith("http"):
            req = create_url_request(self.url)
            self.url = urllib.request.urlopen(req)
        sheet = pd.read_excel(self.url,
                              skiprows=5,
                              header=1,
                              keep_default_na=False)
        sheet.fillna('', inplace=True)
        sheet.rename(columns=self.cleanse_str, inplace=True)
        file_date = self.find_date_in_list(list(sheet.columns.values))
        if recent_date is None or (file_date > recent_date):
            xlsx_dict = sheet.to_dict(orient="records")
            recent_date = file_date
        else:
            LOGGER.info("Already up to date")
            return
        row_names = [
            "ticker", "exchange", "name", "sector", "osshares",
            "dateoflisting", "listingtype", "volume", "value",
        ]
        all_excel_names = tuple(xlsx_dict[0].keys())
        base_wanted_excel_names = [
            "Root Ticker", "Exchange", "Name", "Sector", "O/S",
            "Date of TSX Listing", "Listing Type", "Volume YTD", "Value (C$)",
        ]
        # Match spreadsheet headers by prefix; headers often carry a date or
        # footnote suffix.
        wanted_excel_names = []
        for bxn in base_wanted_excel_names:
            for xn in all_excel_names:
                if xn.startswith(bxn):
                    wanted_excel_names.append(xn)
                    break
        assert (len(base_wanted_excel_names) == len(wanted_excel_names) ==
                len(row_names))
        value_dics = []
        for row in xlsx_dict:
            value_dic = {"updatedate": recent_date}
            for excel_name, row_name in zip(wanted_excel_names, row_names):
                val = row[excel_name]
                # BUGFIX(review): check for the empty cell BEFORE parsing the
                # listing date — strptime('') raised ValueError previously.
                if val == '':
                    val = None
                elif row_name == "dateoflisting":
                    val = datetime.strptime(str(val), "%Y%m%d")  # assume YYYYMMDD
                value_dic[row_name] = val
            value_dics.append(value_dic)
        self.session.execute(insert(Listings).values(value_dics))
        self.session.commit()

    def get_historic_events(self):
        """ Gets all the historical events from yahoo, updating only the new
        entries based on the date of the last fetch. """
        exchange = "TSX"
        listings = self.session.query(
            Listings.ticker,
            Listings.dateoflisting).filter(Listings.exchange == exchange)
        total_listings = listings.count()
        for counter, (ticker, listdate) in enumerate(listings):
            lastdate, = self.session.query(func.max(
                EventHistory.updatedate)).filter(
                    EventHistory.exchange == exchange,
                    EventHistory.ticker == ticker).first()
            # Resume the day after the last recorded event, or from listing.
            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)
            rows = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                dividend_dict = self.ticker_history(startdate,
                                                    self.today,
                                                    yahoo_ticker,
                                                    info='dividend')
                split_dict = self.ticker_history(startdate,
                                                 self.today,
                                                 yahoo_ticker,
                                                 info='split')
                for row in dividend_dict:
                    rows.append([
                        exchange, ticker, row["date"], "DIVIDEND",
                        row["dividends"], self.today
                    ])
                for row in split_dict:
                    rows.append([
                        exchange, ticker, row["date"], "SPLIT",
                        row["stock_splits"], self.today
                    ])
            if rows:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter + 1, total_listings, ticker, startdate,
                    self.today))
                stmt = insert(EventHistory).values(
                    rows).on_conflict_do_nothing(
                        constraint='event_history_pkey')
                self.session.execute(stmt)
                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter + 1, total_listings, ticker))

    def get_historic_prices(self):
        """ Gets all the historical prices from yahoo, updating only the new
        entries based on the date of the last fetch. """
        exchange = "TSX"
        listings = list(
            self.session.query(
                Listings.ticker,
                Listings.dateoflisting).filter(Listings.exchange == exchange))
        total_listings = len(listings)
        for counter, (ticker, listdate) in enumerate(listings):
            lastdate, = self.session.query(func.max(PriceHistory.date)).filter(
                PriceHistory.exchange == exchange,
                PriceHistory.ticker == ticker).first()
            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)
            his_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                start_dic = {"exchange": exchange, "ticker": ticker}
                his_dict = self.ticker_history(startdate,
                                               self.today,
                                               yahoo_ticker,
                                               info="quote",
                                               start_dic=start_dic)
            if his_dict:
                # Consistency fix: log 1-based progress like
                # get_historic_events (was `counter`, i.e. 0-based).
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter + 1, total_listings, ticker, startdate,
                    self.today))
                for d in his_dict:
                    stmt = insert(PriceHistory).values(
                        d).on_conflict_do_update(
                            constraint='price_history_pkey', set_=d)
                    self.session.execute(stmt)
                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter + 1, total_listings, ticker))

    def cleanse_str(self, raw_str):
        # NOTE(review): the second replace looks like a no-op (space→space);
        # possibly a mangled non-breaking-space cleanup — confirm against the
        # original source.
        return raw_str.replace('\n', ' ').replace(" ", ' ')

    def find_date_in_list(self, strings):
        """ Returns the first date that occurs in a list of string or the
        current date if none are detected. """
        cur_date = self.today  # default = current date
        for s in strings:
            try:
                temp_date = dparser.parse(s, fuzzy=True).date()
            except ValueError:
                continue
            if cur_date != temp_date:
                cur_date = temp_date
                break
        return cur_date

    def convert_yahoo_element(self, element):
        """Coerce a Yahoo CSV cell to float, datetime, or a ratio (a/b);
        'null' becomes None; anything unparseable is logged and dropped."""
        converted = None
        try:
            converted = float(element)
        except ValueError:
            try:
                converted = datetime.strptime(element, "%Y-%m-%d")
            except ValueError:
                if element == 'null':
                    converted = None
                elif '/' in element:
                    try:
                        a, b = element.split('/')
                        converted = float(a) / float(b)
                    except ValueError:
                        LOGGER.info("Unable to convert {}".format(element))
                else:
                    LOGGER.info("Unable to convert {}".format(element))
        return converted

    def ticker_history(self, start, end, ticker, info='quote',
                       start_dic=None):
        """ Gets and returns the historic prices for a given ticker for
        between the time period provided. Inclusive.

        start_dic is a template dict copied into every row (was a mutable
        default argument; now created per call).
        """
        if start_dic is None:
            start_dic = {}
        start_str = start.strftime('%Y%m%d')
        end_str = end.strftime('%Y%m%d')
        # info = 'quote', 'dividend', 'split'
        try:
            data = yqd.load_yahoo_quote(ticker, start_str, end_str, info=info)
        except (HTTPError, URLError, gaierror):
            LOGGER.info("Yahoo request failed. Blocked?")
            return []
        titles = tuple(t.replace(' ', '_').lower() for t in data[0].split(','))
        history = []
        # NOTE(review): data[1:-1] drops the final element — presumably a
        # trailing empty line from the feed; confirm against yqd's output.
        for row in data[1:-1]:
            history_row = dict(start_dic)
            for element, title in zip(row.split(','), titles):
                history_row[title] = self.convert_yahoo_element(element)
            history.append(history_row)
        return history

    def clean_exit(self):
        """Release the database session."""
        self.session.close()
def __init__(self):
    """Build the Yahoo key-statistics page title -> db column map and the
    regexes used to parse scraped values."""
    # Page row title -> our database column name.
    self.y_to_db_map = {
        'Forward P/E': 'forward_pe',
        'Return on Equity': 'ro_equity',
        'Current Ratio': 'current_ratio',
        'Total Debt': 'total_debt',
        'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
        'Last Split Date': 'last_split_date',
        'Market Cap (intraday)': 'market_cap',
        'EBITDA': 'ebitda',
        'Shares Short': 'shares_short',
        '50-Day Moving Average': 'fifty_day_moving_avg',
        '52 Week High': 'fifty_two_week_high',
        'Quarterly Earnings Growth': 'q_earnings_growth',
        'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
        'Beta': 'beta',
        'Payout Ratio': 'payout_ratio',
        'Avg Vol (3 month)': 'avg_vol_3_month',
        'Enterprise Value': 'enterprise_value',
        '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
        'Enterprise Value/Revenue': 'enterprise_value_revenue',
        'Trailing P/E': 'trailing_pe',
        'Total Cash': 'total_cash',
        'Operating Cash Flow': 'operating_cash_flow',
        'Price/Book': 'price_book',
        'Fiscal Year Ends': 'fiscal_year_ends',
        'Total Debt/Equity': 'total_debt_equity',
        'Dividend Date': 'dividend_date',
        'Most Recent Quarter': 'most_recent_q',
        'Operating Margin': 'operating_margin',
        'Ex-Dividend Date': 'exdividend_date',
        '% Held by Institutions': 'perc_held_by_institutions',
        'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
        '200-Day Moving Average': 'two_hundred_day_moving_avg',
        '52 Week Low': 'fifty_two_week_low',
        'Avg Vol (10 day)': 'avg_vol_10_day',
        'Last Split Factor (new per old)': 'last_split_factor',
        '% Held by Insiders': 'perc_held_by_insiders',
        'Revenue Per Share': 'revenue_per_share',
        'Short Ratio': 'short_ratio',
        'Shares Short (prior month)': 'shares_short_prior_month',
        'Short % of Float': 'short_perc_float',
        'Profit Margin': 'profit_margin',
        'Return on Assets': 'ro_assets',
        'Price/Sales': 'price_sales',
        'Gross Profit': 'gross_profit',
        'Book Value Per Share': 'book_value_per_share',
        'Levered Free Cash Flow': 'levered_free_cash_flow',
        'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
        'Diluted EPS': 'diluted_eps',
        'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
        'Shares Outstanding': 'shares_outstanding',
        'Revenue': 'revenue',
        'Float': 'float',
        'Net Income Avi to Common': 'net_income_avi_common',
        'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
        '52-Week Change': 'fifty_two_week_change',
        'Quarterly Revenue Growth': 'q_revenue_growth',
        'Total Cash Per Share': 'total_cash_ps'
    }
    # Suffix multipliers for condensed magnitudes such as "1.5M".
    self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
    # Raw strings: "\d" in a plain string is an invalid escape in Python 3.
    self.condensed_pat = re.compile(r"([+-]?\d*[\.]?\d+)([kmbtKMBT])$")
    self.float_pat = re.compile(r"[+-]?\d*[\.]\d+$")
    self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
    self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
    self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
    # Period qualifiers that appear after row titles. (Plain set literal;
    # the old set({...}) wrapper was redundant.)
    self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
    self.today = datetime.today().date()
    self.default_fye = datetime(self.today.year, 12, 31)
    self.session = Session()
class YahooScraper():
    """Scrapes the Yahoo Finance key-statistics pages (rendered via the
    JS page scraper) and upserts the values into the database."""

    def __init__(self):
        # Page row title -> our database column name.
        self.y_to_db_map = {
            'Forward P/E': 'forward_pe',
            'Return on Equity': 'ro_equity',
            'Current Ratio': 'current_ratio',
            'Total Debt': 'total_debt',
            'Forward Annual Dividend Rate': 'forward_annual_dividend_rate',
            'Last Split Date': 'last_split_date',
            'Market Cap (intraday)': 'market_cap',
            'EBITDA': 'ebitda',
            'Shares Short': 'shares_short',
            '50-Day Moving Average': 'fifty_day_moving_avg',
            '52 Week High': 'fifty_two_week_high',
            'Quarterly Earnings Growth': 'q_earnings_growth',
            'Forward Annual Dividend Yield': 'forward_annual_dividend_yield',
            'Beta': 'beta',
            'Payout Ratio': 'payout_ratio',
            'Avg Vol (3 month)': 'avg_vol_3_month',
            'Enterprise Value': 'enterprise_value',
            '5 Year Average Dividend Yield': 'five_year_avg_dividend_yield',
            'Enterprise Value/Revenue': 'enterprise_value_revenue',
            'Trailing P/E': 'trailing_pe',
            'Total Cash': 'total_cash',
            'Operating Cash Flow': 'operating_cash_flow',
            'Price/Book': 'price_book',
            'Fiscal Year Ends': 'fiscal_year_ends',
            'Total Debt/Equity': 'total_debt_equity',
            'Dividend Date': 'dividend_date',
            'Most Recent Quarter': 'most_recent_q',
            'Operating Margin': 'operating_margin',
            'Ex-Dividend Date': 'exdividend_date',
            '% Held by Institutions': 'perc_held_by_institutions',
            'Trailing Annual Dividend Yield': 'trailing_annual_dividend_yield',
            '200-Day Moving Average': 'two_hundred_day_moving_avg',
            '52 Week Low': 'fifty_two_week_low',
            'Avg Vol (10 day)': 'avg_vol_10_day',
            'Last Split Factor (new per old)': 'last_split_factor',
            '% Held by Insiders': 'perc_held_by_insiders',
            'Revenue Per Share': 'revenue_per_share',
            'Short Ratio': 'short_ratio',
            'Shares Short (prior month)': 'shares_short_prior_month',
            'Short % of Float': 'short_perc_float',
            'Profit Margin': 'profit_margin',
            'Return on Assets': 'ro_assets',
            'Price/Sales': 'price_sales',
            'Gross Profit': 'gross_profit',
            'Book Value Per Share': 'book_value_per_share',
            'Levered Free Cash Flow': 'levered_free_cash_flow',
            'Trailing Annual Dividend Rate': 'trailing_annual_dividend_rate',
            'Diluted EPS': 'diluted_eps',
            'PEG Ratio (5 yr expected)': 'peg_ratio_5yr',
            'Shares Outstanding': 'shares_outstanding',
            'Revenue': 'revenue',
            'Float': 'float',
            'Net Income Avi to Common': 'net_income_avi_common',
            'Enterprise Value/EBITDA': 'enterprise_value_ebitda',
            '52-Week Change': 'fifty_two_week_change',
            'Quarterly Revenue Growth': 'q_revenue_growth',
            'Total Cash Per Share': 'total_cash_ps'
        }
        # Suffix multipliers for condensed magnitudes such as "1.5M".
        self.convert_dict = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}
        # Raw strings: "\d" in a plain string is an invalid escape in Py3.
        self.condensed_pat = re.compile(r"([+-]?\d*[\.]?\d+)([kmbtKMBT])$")
        self.float_pat = re.compile(r"[+-]?\d*[\.]\d+$")
        self.parenthese_pat = re.compile(r" *\(([^)]*)\)")
        self.date_line_pat = re.compile(r"\(as of (\d+.*\d+)\)")
        self.url_ticker_pat = re.compile(r".*/quote/(.*)\.(.*)/key-statistics")
        # Period qualifiers that appear after row titles.
        self.keywords = {"mrq", "ttm", "yoy", "lfy", "fye"}
        self.today = datetime.today().date()
        self.default_fye = datetime(self.today.year, 12, 31)
        self.session = Session()

    def s2n(self, string):
        """Expand a condensed magnitude ("1.5M") to an int."""
        reg = self.condensed_pat.search(string)
        return int(
            float(reg.group(1)) * self.convert_dict[reg.group(2).upper()])

    @staticmethod
    def s2p(string):
        """Percentage string ("12.5%") -> float."""
        return float(string.strip('%'))

    @staticmethod
    def s2r(string):
        """Ratio string ("3:2") -> float."""
        split = string.split(':')
        return float(split[0]) / float(split[1])

    def parse_numeric(self, string):
        """Best-effort conversion of a scraped cell: percent, condensed
        magnitude, float, int, ratio, or an ISO date; None on failure."""
        try:
            if '%' in string:
                return self.s2p(string)
            elif self.condensed_pat.match(string) is not None:
                return self.s2n(string)
            elif self.float_pat.match(string) is not None:
                return float(string)
            elif string.isdigit():
                return int(string)
            elif ':' in string:
                return self.s2r(string)
            else:
                return dp.parse(string).date().isoformat()
        except ValueError:
            return None

    def cleanse_str(self, string):
        """Strip commas, any parenthesised suffix, and stray colons."""
        return self.parenthese_pat.sub('', string.replace(',', '')).strip(':')

    def dic_parse(self, session, url, html):
        """Parse one rendered key-statistics page and upsert its values.

        Designed as the callback for the JS page scraper: receives the DB
        session, the page URL, and the rendered HTML.
        """
        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"
        # Yahoo shows a "lookup" page when the symbol is unknown.
        on_yahoo = soup.find('section',
                             attrs={'data-test': 'lookup-page'}) is None
        session.query(Listings).filter(Listings.exchange == exchange,
                                       Listings.ticker == ticker).update(
                                           {Listings.onyahoo: on_yahoo})
        if not on_yahoo:  # if quote not found, exit
            LOGGER.error("Failed to find quote for {} skipping".format(url))
            return
        div_test = soup.find('section', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for {} skipping".format(url))
            return
        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                val = innerHtml(td_list[1]) if td_list[1].find(
                    'span') is None else innerHtml(td_list[1].find('span'))
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)
        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            exists = session.query(KeyStatistics).filter_by(
                **db_dic).scalar() is not None
            if exists:
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                db_dic["update_date"] = self.today
                stmt = insert(KeyStatistics).values(
                    db_dic).on_conflict_do_nothing(
                        constraint='key_statistics_pkey', )
                session.execute(stmt)
                session.commit()
                LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))

    def fetch_all(self, exchange):
        """Queue every listing that is on Yahoo — or has never been checked —
        through the JS page scraper."""
        # BUGFIX(review): `Listings.onyahoo is None` was a Python identity
        # test (always False), so never-checked listings were excluded; use
        # SQLAlchemy's .is_(None) to emit "IS NULL".
        q = self.session.query(Listings).filter(
            Listings.exchange == exchange,
            or_(Listings.onyahoo == True, Listings.onyahoo.is_(None)))
        extension = '.TO'
        urls = [
            "https://ca.finance.yahoo.com/quote/{}{}/key-statistics".format(
                l.ticker, extension) for l in q
        ]
        # Either section signals the page finished rendering (data or lookup).
        xpath_hooks = [
            "//section[@data-test='qsp-statistics']",
            "//section[@data-test='lookup-page']"
        ]
        LOGGER.info("Fetching/Updating {} urls.".format(len(urls)))
        jsps = JSPageScraper(self.dic_parse, xpath_hooks, "key_statistics")
        jsps.go(urls)

    def clean_exit(self):
        """Release the database session."""
        self.session.close()
class YahooApiScraper():
    """Fetches quote data in bulk from the (reverse-engineered) Yahoo
    Finance CSV REST API and stores it in the DB.

    Flag reference: http://www.jarloo.com/yahoo_finance/
    """

    def __init__(self):
        # This is a reverse engineering of the Yahoo Finance REST API
        # Information off: http://www.jarloo.com/yahoo_finance/
        # Maps API format flags to DB column names; insertion order
        # determines both the URL flag string and the CSV column order.
        # NOTE: the original literal listed 'r': 'pe' twice; the duplicate
        # was dropped (identical key/value, so the dict is unchanged).
        self.y_to_db_map = {'n': 'name', 'y': 'dividend_yield',
                            'd': 'dividend_ps', 'r': 'pe',
                            'r1': 'dividend_pay_date',
                            'q': 'ex_dividend_date', 'o': 'open',
                            'c1': 'change', 'p2': 'perc_change',
                            'd1': 'last_trade_date', 'd2': 'trade_date',
                            'c3': 'commission', 'g': 'day_low',
                            'h': 'day_high', 'p': 'previous_close',
                            't8': 'year_target',
                            'm5': 'change_mv_avg_200',
                            'm6': 'perc_change_mv_avg_200',
                            'm7': 'change_mv_avg_50',
                            'm8': 'perc_change_mv_avg_50',
                            'm3': 'mv_avg_50', 'm4': 'mv_avg_200',
                            'w1': 'day_value_change',
                            'g1': 'holding_gain_perc',
                            'g3': 'annualized_gain', 'g4': 'holdings_gain',
                            'k': 'high_52_week', 'j': 'low_52_week',
                            'j5': 'change_52_week_low',
                            'k4': 'change_52_week_high',
                            'j6': 'perc_change_52_week_low',
                            'k5': 'perc_change_52_week_high',
                            'j1': 'market_cap', 'f6': 'float_shares',
                            'x': 'stock_exchange', 's1': 'shares_owned',
                            'j2': 'shares_outstanding', 'n4': 'notes',
                            'i': 'more_info', 'v': 'volume',
                            'a2': 'avg_daily_volume', 'e': 'eps',
                            'e7': 'eps_year_estimate',
                            'e8': 'eps_next_year_estimate',
                            'e9': 'eps_next_q_estimate', 'b4': 'book',
                            'j4': 'ebitda', 'p5': 'price_sale',
                            'p6': 'price_book', 'r5': 'peg',
                            'r6': 'price_eps_estimate_year',
                            'r7': 'price_eps_estimate_next_year',
                            's7': 'short_ratio', 's6': 'revenue',
                            'v1': 'holdings_val', 'l2': 'high_limit',
                            'l3': 'low_limit', 'a': 'ask', 'b': 'bid'}
        # Multipliers for condensed magnitude suffixes ("1.5M" -> 1.5e6).
        self.convert_dict = {'K': 10 ** 3, 'M': 10 ** 6,
                             'B': 10 ** 9, 'T': 10 ** 12}
        # Raw string (regex escapes, not string escapes); anchored with '$'
        # for consistency with the key-statistics scraper's pattern so a
        # trailing-garbage cell is not silently treated as a number.
        self.condensed_pat = re.compile(r"([+-]?\d*\.?\d+)([kmbtKMBT])$")
        self.url_flags = tuple(self.y_to_db_map.keys())
        self.url_str_flags = "".join(self.url_flags)
        self.db_entries = tuple(self.y_to_db_map.values())
        self.float_pat = re.compile(r"[+-]?(\d*[\.])?\d+$")
        self.today = datetime.today().date()
        self.base_url = "http://finance.yahoo.com/d/quotes.csv"
        # Sometimes websites are friendlier to iOS devices :)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25"
        }
        self.session = Session()

    def handle_csv_string(self, string):
        """Convert one CSV cell into a DB-ready value.

        Returns None for 'N/A'; a float for percentages and decimals; an
        int for plain digits and condensed numbers ("1.5M"); an ISO date
        string for parseable dates; otherwise the string unchanged.
        """
        if string == 'N/A':
            return None
        elif '%' in string:
            return float(string.strip('%'))
        elif string.isdigit():
            return int(string)
        elif self.condensed_pat.match(string) is not None:
            reg = self.condensed_pat.search(string)
            return int(float(reg.group(1)) *
                       self.convert_dict[reg.group(2).upper()])
        elif self.float_pat.match(string) is not None:
            return float(string)
        else:
            try:
                return dp.parse(string).date().isoformat()
            except ValueError:
                # Not a date either; keep the raw text.
                return string

    def chunks(self, l, n):
        """Split sequence `l` into consecutive slices of length `n`
        (last slice may be shorter)."""
        return [l[i:i + n] for i in range(0, len(l), n)]

    def create_url(self, tickers):
        """Build one API url requesting every flag for `tickers`."""
        ticker_str = "+".join(tickers)
        url = "{}?s={}&f={}".format(self.base_url, ticker_str,
                                    self.url_str_flags)
        return url

    def handle_url(self, tickers, url, exchange):
        """
        Fetches the url and inserts the data into the appropriate cols
        in the DB.

        tickers  -- sequence of ticker symbols, in the same order as the
                    CSV rows the API returns
        exchange -- exchange code used for the Listings lookup

        Raises ValueError if a CSV row has an unexpected column count.
        """
        LOGGER.info("Starting to add url: {} ...".format(url))
        req = urllib.request.Request(url, headers=self.headers)
        # BUGFIX: the response was never closed; the with-statement
        # guarantees the socket is released even if parsing fails.
        with urllib.request.urlopen(req) as resp:
            rows = list(csv.reader(codecs.iterdecode(resp, 'utf-8')))

        for row, ticker in zip(rows, tickers):
            # Was a bare assert, which is stripped under `python -O`.
            if len(row) != len(self.url_flags):
                raise ValueError(
                    "Expected {} columns, got {} for {}".format(
                        len(self.url_flags), len(row), ticker))

            db_dic = {db_col: self.handle_csv_string(cell)
                      for cell, db_col in zip(row, self.db_entries)}

            # All-None values means Yahoo does not know this ticker.
            onyahoo = any(v is not None for v in db_dic.values())
            self.session.query(Listings).filter(
                Listings.exchange == exchange,
                Listings.ticker == ticker).update(
                {Listings.onyahoo: onyahoo})

            if not onyahoo:  # not found, skip
                LOGGER.error(
                    "Failed to find quote for {} skipping".format(ticker))
                continue

            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            exists = self.session.query(YahooKeyStatistics).filter_by(
                **db_dic).scalar() is not None
            if exists:
                LOGGER.info(
                    "Skipping {} due to prior existence".format(ticker))
                continue
            db_dic["update_date"] = self.today

            # Annoyingly enough, sqlalchemy doesn't allow PostgreSQL bulk
            # inserts when checking constraints, RIP performance
            stmt = insert(YahooKeyStatistics).values(
                db_dic).on_conflict_do_nothing(
                constraint='yahoo_key_statistics_pkey',
            )
            self.session.execute(stmt)
        self.session.commit()
        LOGGER.info("Done url.")

    def fetch_all(self, exchange):
        """Fetch quotes for every listing on `exchange` that is on Yahoo
        or has never been checked, in batches of 200 tickers."""
        extension = '.TO'
        # BUGFIX: `Listings.onyahoo is None` was a Python identity test on
        # the Column object (always False), silently excluding NULL rows.
        # SQL NULL comparison needs `.is_(None)`.
        tickers = tuple(
            x.ticker + extension
            for x in self.session.query(Listings.ticker).filter(
                Listings.exchange == exchange,
                or_(Listings.onyahoo == True, Listings.onyahoo.is_(None))))

        ticker_groups = self.chunks(tickers, 200)
        LOGGER.info("Fetching/Updating {} urls.".format(len(ticker_groups)))
        for ticker_group in ticker_groups:
            url = self.create_url(ticker_group)
            self.handle_url(ticker_group, url, exchange)
            sleep(1)  # limit requests to 1/s

    def clean_exit(self):
        """Release the DB session held by this scraper."""
        self.session.close()