# Drains subHtmlUrlQueue: visits each unseen page, harvests anchor hrefs,
# and routes them into songUrlQueue (song pages) or back into
# subHtmlUrlQueue (further pages to crawl). Relies on module-level state:
# the two deques, the visitedUrl dict, logger, and the link helpers.
def __crawl(driver):
    while True:
        while len(subHtmlUrlQueue) > 0:
            url = subHtmlUrlQueue.popleft()
            if url in visitedUrl:
                continue
            try:
                visitedUrl[url] = 1
                logger.info("visit page %s" % url)
                driver.get(url)
                # Wait up to 20s for the page to render at least one
                # <script> tag before switching into the content frame.
                WebDriverWait(driver, 20).until(
                    lambda x: x.find_elements_by_tag_name("script"))
                driver.switch_to.frame("contentFrame")
                # Collect all anchor hrefs and split them into song links
                # and further pages to crawl.
                atags = driver.find_elements_by_tag_name("a")
                hrefs = [a.get_attribute("href") for a in atags]
                links = filter_link(hrefs)
                song_links = get_song_link(links)
                logger.debug("get song_links %s" % len(song_links))
                notsong_links = get_nonsong_link(links)
                songUrlQueue.extend(song_links)
                subHtmlUrlQueue.extend(notsong_links)
            except Exception:
                logger.error("", exc_info=True)
                continue
        # Page queue drained: idle briefly before polling again.
        logger.info("empty page queue")
        time.sleep(1)
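# --- Hedged sketch of the module-level scaffolding __crawl assumes. The
# names match those used above; the helper bodies are illustrative guesses
# (based on the "?id=" URLs parsed in crawl_songs below), not the original
# implementations. ---
from collections import deque
import logging
import time

from selenium.webdriver.support.ui import WebDriverWait

logger = logging.getLogger(__name__)

subHtmlUrlQueue = deque()   # pages still to crawl
songUrlQueue = deque()      # song pages found so far
visitedUrl = {}             # url -> 1, used as a visited set

def filter_link(hrefs):
    # Illustrative guess: drop empty hrefs and deduplicate.
    return list({h for h in hrefs if h})

def get_song_link(links):
    # Illustrative guess: song pages carry "/song?id=" in the URL.
    return [l for l in links if "/song?id=" in l]

def get_nonsong_link(links):
    return [l for l in links if "/song?id=" not in l]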
from traceback import format_exc
from typing import Any, List, Tuple

def fetch_yahoo_responses() -> List[Tuple]:
    # Collect (isin, yahoo_ticker) tuples for every statement model whose
    # financials are stale, then deduplicate across the three lists.
    tickers: List[List] = []
    for model in [IncomeStatement, BalanceSheetStatement, CashFlowStatement]:
        tickers.append(
            fetch_isins_not_updated_financials(model))  # type: ignore
    tickers_unique: List[Tuple] = union_of_list_elements(*tickers)
    logger.info('Fetching financials for %s stocks' % len(tickers_unique))

    responses: List[Tuple[Any, ...]] = []
    for ticker_tuple in tickers_unique:
        # Skip malformed entries; a valid tuple is (isin, yahoo_ticker).
        if len(ticker_tuple) != 2:
            continue
        isin, yahoo_ticker = ticker_tuple
        try:
            response = fetch_yahoo_data(
                yahoo_ticker,
                'balanceSheetHistory,incomeStatementHistory,cashflowStatementHistory'
            )
            logger.info('Succeeded getting ticker, isin: %s, %s'
                        % (yahoo_ticker, isin))
        except Exception:
            logger.error('Something went wrong getting ticker, isin: %s, %s'
                         % (yahoo_ticker, isin))
            logger.error(format_exc())
            continue
        responses.append((response, isin))
    return responses
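# --- Hedged sketch of the union_of_list_elements helper used above: one
# plausible implementation that merges several lists of tuples into a
# single duplicate-free list, preserving first-seen order. The original
# may differ. ---
from typing import List, Tuple

def union_of_list_elements(*lists: List[Tuple]) -> List[Tuple]:
    seen = set()
    union: List[Tuple] = []
    for lst in lists:
        for item in lst:
            if item not in seen:
                seen.add(item)
                union.append(item)
    return union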
from contextlib import contextmanager

# Transactional scope around a series of ORM operations: commit on
# success, roll back (and log) on error, always close. Creating the
# session outside the try block avoids a NameError in finally if the
# Session factory itself raises. Note: the exception is logged and
# swallowed rather than re-raised.
@contextmanager
def session_scope():
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        logger.error("", exc_info=True)
        session.rollback()
    finally:
        session.close()
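# --- Usage sketch: session_scope is meant for "with" blocks; changes are
# committed on success and rolled back on error. SomeModel is a
# placeholder model name, not part of the original code. ---
with session_scope() as session:
    session.add(SomeModel(id=1))  # placeholder model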
def load_data(data: List[Base]) -> None:
    if len(data) == 0:
        logger.info('No data to load')
        return
    session = Session()
    for idx, record in enumerate(data):
        try:
            # merge() inserts the record or updates the existing row
            # with the same primary key.
            session.merge(record)
        except Exception:
            logger.info('Something went wrong: %s' % record)
            logger.error(format_exc())
            continue
        logger.debug(record)
        # Commit in chunks of 100 records to bound transaction size.
        if idx > 0 and idx % 100 == 0:
            session.commit()
            logger.info('Chunked commit at %s records' % idx)
    # Final commit for the remainder of the last chunk.
    session.commit()
    logger.info('Final commit at %s records' % len(data))
    session.close()
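# --- Hedged usage sketch: fetch the Yahoo responses, map them to ORM
# records, and bulk-load them. parse_responses_to_records is a
# hypothetical helper named here for illustration only. ---
responses = fetch_yahoo_responses()
records = parse_responses_to_records(responses)  # hypothetical parser
load_data(records)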
# Drains songUrlQueue: for each unseen song page, extracts the song id,
# skips ids already stored, scrapes the song info and persists it.
def crawl_songs():
    while True:
        while len(songUrlQueue) > 0:
            song_link = songUrlQueue.popleft()
            if song_link in visitedUrl:
                continue
            logger.info("visit song %s" % song_link)
            try:
                visitedUrl[song_link] = 1
                # Song id is everything after the first "=" in the URL,
                # e.g. ".../song?id=123" -> "123".
                sid = song_link[song_link.find("=") + 1:]
                if songService.is_existed(sid):
                    logger.info("%s already exists" % sid)
                    continue
                info = extract.getSongInfo(song_link, song_driver)
                if not info:
                    continue
                songService.add(info)
            except Exception:
                logger.error("", exc_info=True)
                continue
        # Song queue drained: idle briefly before polling again.
        logger.info("empty song queue")
        time.sleep(1)
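# --- Hedged sketch: one way to run the page crawler and the song crawler
# side by side. The original wiring is not shown; daemon threads and two
# separate WebDriver instances are assumptions. deque.popleft/extend are
# atomic, so the two loops can share the queues safely. ---
import threading
from selenium import webdriver

page_driver = webdriver.Chrome()
song_driver = webdriver.Chrome()  # used by crawl_songs via module scope

threading.Thread(target=__crawl, args=(page_driver,), daemon=True).start()
threading.Thread(target=crawl_songs, daemon=True).start()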
# Calls the wrapped function realf, logging any exception with its
# traceback and returning None instead of propagating it.
def execute(*args, **kws):
    try:
        return realf(*args, **kws)
    except Exception as e:
        logger.error(repr(e), exc_info=True)
        return None
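# --- Hedged sketch: execute reads like the inner function of an
# exception-swallowing decorator that supplies realf via closure. This
# reconstruction of the outer decorator is an assumption, including the
# name swallow_exceptions and the example below. ---
import functools
import logging

logger = logging.getLogger(__name__)

def swallow_exceptions(realf):
    @functools.wraps(realf)
    def execute(*args, **kws):
        try:
            return realf(*args, **kws)
        except Exception as e:
            logger.error(repr(e), exc_info=True)
            return None
    return execute

@swallow_exceptions
def parse_int(s):
    return int(s)

# parse_int("42") -> 42; parse_int("oops") -> None (error is logged)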