def load_upcoming_events(self, ts=None):
    """Load youtube upcoming events that still need detail parsing.

    Selects rows from ``crawled_lives`` that were list-crawled within the
    last two days and whose ``online_ts`` has not been filled in yet.

    Args:
        ts: unused; kept for interface compatibility with callers.

    Returns:
        List of row dicts (``id``, ``web_url``), or ``None`` if the
        query failed (the error is logged, not raised).
    """
    logger.info('Load youtube upcoming events for detail parsing (online_ts and cover)')
    cnx = mysql.connector.connect(**db)
    cursor = cnx.cursor(dictionary=True, buffered=True)
    select_sql = (
        "SELECT id, web_url from crawled_lives "
        " where list_crawler_ts > %(crawl_query_ts)s AND "
        " online_ts is NULL AND"
        " site = 'youtube'"
    )
    today = datetime.datetime.now()
    param = {
        "crawl_query_ts": get_query_ts(today - datetime.timedelta(days=2)),
        # not referenced by the SQL above; extra dict keys are ignored by
        # the connector, kept for symmetry with the livestream loader
        "online_query_ts": get_query_ts(today),
    }
    try:
        cursor.execute(select_sql, param)
        logger.info(cursor.statement)
        all_upcoming_events = list(cursor.fetchall())
        logger.info('Load %d youtube upcoming events from DB for detail parsing.',
                    len(all_upcoming_events))
        return all_upcoming_events
    # was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt propagate
    except Exception:
        logger.error('error! SQL=%s', cursor.statement, exc_info=True)
    finally:
        cnx.close()
def _load_existing_upcoming_events(self, referer_url, ts=None):
    """Load existing livestream events for the given referer, for filtering.

    Args:
        referer_url: referer value to match in ``crawled_lives``.
        ts: unused; kept for interface compatibility with callers.

    Returns:
        Set of ``web_url`` strings already present in the DB, or ``None``
        if the query failed (the error is logged, not raised).
    """
    logger.info("Load existing upcoming events with referer_url %s for filtering.", referer_url)
    cnx = mysql.connector.connect(**db)
    cursor = cnx.cursor(dictionary=True, buffered=True)
    # NOTE: the online_ts disjunction must be parenthesized — SQL AND binds
    # tighter than OR, so without parens the condition grouped as
    # (list_crawler_ts > X AND online_ts IS NULL)
    #   OR (online_ts > Y AND site = 'livestream' AND referer = Z),
    # letting the NULL arm bypass the site/referer filters entirely.
    select_sql = (
        "SELECT web_url from crawled_lives "
        " where list_crawler_ts > %(crawl_query_ts)s AND "
        " (online_ts is NULL or online_ts > %(online_query_ts)s) AND"
        " site = 'livestream' AND "
        " referer = %(referer_url)s"
    )
    today = datetime.datetime.now()
    param = {
        "crawl_query_ts": get_query_ts(today - datetime.timedelta(days=2)),
        "online_query_ts": get_query_ts(today),
        "referer_url": referer_url,
    }
    try:
        cursor.execute(select_sql, param)
        all_upcoming_events = cursor.fetchall()
        logger.info(
            "Load %d existing upcoming events from DB with referer_url %s.",
            len(all_upcoming_events), referer_url
        )
        return {event["web_url"] for event in all_upcoming_events}
    # was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt propagate
    except Exception:
        logger.error("Error while executing SQL=%s", cursor.statement, exc_info=True)
    finally:
        cnx.close()
def update_db(self, live_events):
    """Write parsed detail fields (online_ts, cover, avatar, ...) back to the DB.

    For each event dict, downloads the cover and avatar images (when a URL
    is present) and updates its ``crawled_lives`` row keyed by ``web_url``.
    All updates are committed in a single transaction; if any event fails,
    nothing is committed (the connection is closed without commit).

    Args:
        live_events: iterable of event dicts; each must contain at least
            ``web_url``, ``online_ts``, ``cover_url``, ``avatar_url``,
            ``category``, ``description`` and ``more_info``.
    """
    logger.info('Update youtube upcoming events with (online_ts and cover)')
    cnx = mysql.connector.connect(**db)
    cursor = cnx.cursor(buffered=True)
    update_sql = (
        "UPDATE crawled_lives "
        " SET "
        " detail_crawler=%(detail_crawler)s, "
        " detail_crawler_ts=%(detail_crawler_ts)s, "
        " online_ts=%(online_ts)s, "
        " cover_url=%(cover_url)s, "
        " cover=%(cover)s, "
        " owner_avatar = %(avatar)s, "
        " category = %(category)s, "
        " description = %(description)s, "
        " more_info = %(more_info)s "
        " WHERE web_url = %(web_url)s "
    )
    # fields identical for every row in this batch
    to_update = {
        "detail_crawler": "youtube_detail",
        "detail_crawler_ts": get_query_ts(),
    }
    try:
        for event in live_events:
            event.update(to_update)
            # fetch binary image payloads only when a URL was parsed
            event['cover'] = fetch_image(event['cover_url']) if event['cover_url'] else None
            event['avatar'] = fetch_image(event['avatar_url']) if event['avatar_url'] else None
            cursor.execute(update_sql, event)
            # lazy %-args: formatting is skipped unless DEBUG is enabled
            logger.debug('event(%s) is updated.', event['web_url'])
        cnx.commit()
    # was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt propagate
    except Exception:
        logger.error("error! SQL=%s", cursor.statement, exc_info=True)
    finally:
        cnx.close()
def _parse_event_detail(self, event_list): """ Parse upcoming event detail. Livestream event detail page has two templates. """ sub_browser = webdriver.Firefox() valid_event_list = [] for event in event_list: try: sub_browser.get(event["web_url"]) if not self._is_region_available(sub_browser): logger.info("web_url %s not region available", event["web_url"]) continue if self._required_login(sub_browser): logger.info("web_url %s require login", event["web_url"]) continue if self._required_password(sub_browser): logger.info("web_url %s require password", event["web_url"]) continue if not self._is_sepecial_page(sub_browser): logger.info("web_url %s belong to old template", event["web_url"]) WebDriverWait(sub_browser, 5).until( EC.presence_of_element_located((By.CSS_SELECTOR, "div#event-meta")) ) sub_event = getattr(sub_browser, ED_EXTRACTOR["sub_event"][0])(ED_EXTRACTOR["sub_event"][1]) category_elem = getattr(sub_event, ED_EXTRACTOR["sub_event_category"][0])( ED_EXTRACTOR["sub_event_category"][1] ) category = category_elem.text sub_owner_elem = getattr(sub_browser, ED_EXTRACTOR["sub_event_owner"][0])( ED_EXTRACTOR["sub_event_owner"][1] ) avatar_url = sub_owner_elem.get_attribute("src") cover_elem = getattr(sub_browser, ED_EXTRACTOR["sub_event_cover"][0])( ED_EXTRACTOR["sub_event_cover"][1] ) cover_url = cover_elem.get_attribute("src") number_events_elem = getattr(sub_browser, ED_EXTRACTOR["sub_event_owner_events"][0])( ED_EXTRACTOR["sub_event_owner_events"][1] ) number_events = number_events_elem.text number_followers_elem = getattr(sub_browser, ED_EXTRACTOR["sub_event_owner_follower"][0])( ED_EXTRACTOR["sub_event_owner_follower"][1] ) number_followers = number_followers_elem.text else: logger.info("web_url %s belong to new template", event["web_url"]) info = sub_browser.find_element_by_xpath('//a[@ng-if="enable_drawers_embed"]') info.click() sub_browser.implicitly_wait(5) WebDriverWait(sub_browser, 5).until( EC.presence_of_element_located( ( By.XPATH, 
'//div[@class="event_date_category"]/a[@class="event_category ng-binding ng-scope"]', ) ) ) category_elem = sub_browser.find_element_by_xpath( '//div[@class="event_date_category"]/a[@class="event_category ng-binding ng-scope"]' ) category = category_elem.text.strip() sub_owner_elem = sub_browser.find_element_by_xpath('//a[@class="owner_avatar"]/img') avatar_url = sub_owner_elem.get_attribute("src") events_followers_elem = sub_browser.find_elements_by_xpath( '//div[@class="account_details"]/pluralize-with-html/span' ) number_events = events_followers_elem[0].text number_followers = events_followers_elem[1].text cover_elem = sub_browser.find_element_by_xpath('//div[@class="event_poster_wrapper ng-scope"]') cover_url = cover_elem.get_attribute("style") m = re.search(r'"//(img.*)"', cover_url) if m: cover_url = r"http://" + m.group(1) else: cover_url = None event.update( { "cover_url": cover_url, "owner_avatar_url": avatar_url, "site": "livestream", "category": category, "list_crawler": "livestream_live", "detail_crawler": "livestream_live", "detail_crawler_ts": get_query_ts(), "more_info": json.dumps( { "events": int(re.sub("[, ]", "", number_events)), "followers": int(re.sub("[, ]", "", number_followers)), } ), } ) logger.info("Successfully parse event %s with url %s", event["title"], event["web_url"]) valid_event_list.append(event) except Exception, e: logger.error(e.message, exc_info=True)