# Imports assumed by these excerpts; project-specific names (Listings,
# KeyStatistics, YahooKeyStatistics, Session, LOGGER, and the self.* helpers)
# are presumed to be defined or imported elsewhere in the repo.
import codecs
import csv
import urllib.request

import dryscrape
import dryscrape.driver.webkit
import webkit_server
from bs4 import BeautifulSoup
from sqlalchemy.dialects.postgresql import insert


def dic_parse(self, session, url, html):
    def innerHtml(ele):
        return ele.decode_contents(formatter="html")

    soup = BeautifulSoup(html, "lxml")
    ticker = self.url_ticker_pat.search(url).group(1)
    exchange = "TSX"

    # Yahoo serves a 'lookup-page' section when the symbol is unknown, so its
    # absence means the quote exists; record that flag on the listing.
    on_yahoo = soup.find('section', attrs={'data-test': 'lookup-page'}) is None
    session.query(Listings).filter(Listings.exchange == exchange,
                                   Listings.ticker == ticker).update(
        {Listings.onyahoo: on_yahoo})

    if not on_yahoo:  # if quote not found, exit
        LOGGER.error("Failed to find quote for {} skipping".format(url))
        return

    div_test = soup.find('section', attrs={'data-test': 'qsp-statistics'})
    if div_test is None:
        LOGGER.error("Unknown error for {} skipping".format(url))
        return

    # Walk every statistics table row, mapping Yahoo's row titles to DB
    # column names via y_to_db_map.
    db_dic = {}
    for table in div_test.find_all('table'):
        for row in table.find_all('tr'):
            td_list = row.find_all('td')
            title = innerHtml(td_list[0].find('span'))
            if td_list[1].find('span') is None:
                val = innerHtml(td_list[1])
            else:
                val = innerHtml(td_list[1].find('span'))
            if title in self.y_to_db_map:
                db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

    if db_dic:
        db_dic["ticker"] = ticker
        db_dic["exchange"] = exchange
        exists = session.query(KeyStatistics).filter_by(
            **db_dic).scalar() is not None
        if exists:
            LOGGER.info("Skipping {} due to prior existence".format(url))
        else:
            db_dic["update_date"] = self.today
            stmt = insert(KeyStatistics).values(db_dic).on_conflict_do_nothing(
                constraint='key_statistics_pkey',
            )
            session.execute(stmt)
            session.commit()
            LOGGER.info("Done parsing {}".format(url))
    else:
        LOGGER.info("Skipping {}".format(url))
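# For reference, a sketch of the statement the upsert above compiles to on
# PostgreSQL (column list abbreviated; the real one comes from db_dic):
#
#   INSERT INTO key_statistics (ticker, exchange, update_date, ...)
#   VALUES (%(ticker)s, %(exchange)s, %(update_date)s, ...)
#   ON CONFLICT ON CONSTRAINT key_statistics_pkey DO NOTHING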
# An alternative version of dic_parse that talks to the database through a
# thin wrapper (db) instead of a SQLAlchemy session.
def dic_parse(self, db, url, html):
    def innerHtml(ele):
        return ele.decode_contents(formatter="html")

    soup = BeautifulSoup(html, "lxml")
    ticker = self.url_ticker_pat.search(url).group(1)
    exchange = "TSX"

    # Here the "quote not found" marker is a div with data-test='unknown-quote'.
    on_yahoo = soup.find('div', attrs={'data-test': 'unknown-quote'}) is None
    db.update("listings", ["onyahoo"], [on_yahoo],
              "exchange=%s AND ticker=%s", [exchange, ticker])

    if not on_yahoo:  # if quote not found, exit
        LOGGER.error("Failed to find quote for {} skipping".format(url))
        return

    div_test = soup.find('div', attrs={'data-test': 'qsp-statistics'})
    if div_test is None:
        LOGGER.error("Unknown error for {} skipping".format(url))
        return

    db_dic = {}
    for table in div_test.find_all('table'):
        for row in table.find_all('tr'):
            td_list = row.find_all('td')
            title = innerHtml(td_list[0].find('span'))
            if td_list[1].find('span') is None:
                val = innerHtml(td_list[1])
            else:
                val = innerHtml(td_list[1].find('span'))
            if title in self.y_to_db_map:
                db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

    if db_dic:
        db_dic["ticker"] = ticker
        db_dic["exchange"] = exchange
        col_names, vals = list(db_dic.keys()), list(db_dic.values())
        where = db.create_conditional_string(col_names)
        if db.exists("key_statistics", where, vals):
            LOGGER.info("Skipping {} due to prior existence".format(url))
        else:
            col_names.append("update_date")
            vals.append(self.today)
            db.insert_into("key_statistics", col_names, vals, multiple=False)
            LOGGER.info("Done parsing {}".format(url))
    else:
        LOGGER.info("Skipping {}".format(url))
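# A guess at what db.create_conditional_string produces, judging from the
# "exchange=%s AND ticker=%s" literal passed to db.update above; this helper
# is an illustrative assumption, not the actual DB wrapper code.
def _assumed_create_conditional_string(col_names):
    # e.g. ["ticker", "exchange"] -> "ticker=%s AND exchange=%s"
    return " AND ".join("{}=%s".format(col) for col in col_names)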
def get_html(urlQ, callback, xpath_hooks):
    """
    Takes URLs from the URL queue (urlQ) and calls a callback that will
    handle the page source. xpath_hooks is a list of XPath expressions used
    to determine when a page has finished loading
    (e.g. ["//div[@data-test='whatever']"]); see the docs for more details.
    """
    # One webkit_server/dryscrape session is created per call and reused for
    # every URL drained from the queue.
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)
    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    sess.set_attribute("auto_load_images", False)

    def valid_page_func():
        # The page counts as loaded once any of the hook elements appears.
        return any(sess.at_xpath(xpath) for xpath in xpath_hooks)

    session = Session()

    while not urlQ.empty():
        url = urlQ.get()
        try:
            sess.visit(url)
        except webkit_server.InvalidResponseError:
            LOGGER.error("Got invalid response from something? Skipping {}".format(url))
            continue
        try:
            sess.wait_for(valid_page_func, interval=1, timeout=15)
        except dryscrape.mixins.WaitTimeoutError:
            LOGGER.error("Timeout so skipping {}".format(url))
            continue

        response = sess.body()
        callback(session, url, response)
        sess.reset()

    svr.kill()
    session.close()
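# A minimal wiring sketch (assumed, not from the source): dic_parse matches
# the callback(session, url, html) signature get_html invokes, so a bound
# method can be passed straight in. Both hooks mirror sections dic_parse
# actually checks for, so the wait ends on either a statistics page or a
# "symbol not found" page. The URL and `scraper` object are illustrative.
def example_scrape(scraper):
    import queue
    urlQ = queue.Queue()
    urlQ.put("https://finance.yahoo.com/quote/RY.TO/key-statistics")  # hypothetical URL
    get_html(urlQ, scraper.dic_parse,
             ["//section[@data-test='qsp-statistics']",
              "//section[@data-test='lookup-page']"])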
def handle_url(self, tickers, url, exchange):
    """ Fetches the url and inserts the data into the appropriate cols in the DB. """
    LOGGER.info("Starting to add url: {} ...".format(url))
    req = urllib.request.Request(url, headers=self.headers)
    resp = urllib.request.urlopen(req)
    csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

    # Each CSV row corresponds positionally to one requested ticker.
    for row, ticker in zip(csv_r, tickers):
        assert len(row) == len(self.url_flags)
        db_dic = {db_col: self.handle_csv_string(cell)
                  for cell, db_col in zip(row, self.db_entries)}

        # If every parsed value is None, treat the symbol as missing from Yahoo.
        onyahoo = any(v is not None for v in db_dic.values())
        self.session.query(Listings).filter(
            Listings.exchange == exchange,
            Listings.ticker == ticker).update({Listings.onyahoo: onyahoo})

        if not onyahoo:  # not found, skip
            LOGGER.error("Failed to find quote for {} skipping".format(ticker))
            continue

        db_dic["ticker"] = ticker
        db_dic["exchange"] = exchange
        exists = self.session.query(YahooKeyStatistics).filter_by(
            **db_dic).scalar() is not None
        if exists:
            LOGGER.info("Skipping {} due to prior existence".format(ticker))
            continue

        db_dic["update_date"] = self.today
        # Annoyingly enough, sqlalchemy doesn't allow PostgreSQL bulk inserts
        # when checking constraints, RIP performance
        stmt = insert(YahooKeyStatistics).values(db_dic).on_conflict_do_nothing(
            constraint='yahoo_key_statistics_pkey',
        )
        self.session.execute(stmt)
        self.session.commit()

    LOGGER.info("Done url.")
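# A hedged usage sketch (assumed, not from the source): the quotes.csv
# endpoint and the f= flag string follow the legacy Yahoo Finance CSV API,
# which this method appears to consume. `fetcher` is an instance of the
# class defining handle_url, and the tickers are illustrative.
def example_handle_url(fetcher):
    tickers = ["RY.TO", "TD.TO"]  # hypothetical tickers
    url = ("http://download.finance.yahoo.com/d/quotes.csv?s="
           + "+".join(tickers) + "&f=" + "".join(fetcher.url_flags))
    fetcher.handle_url(tickers, url, "TSX")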