def _article_processor(self, article):
    """Fetch a single article page, parse it with the source-specific parser,
    and merge the parsed content back into ``article``."""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6',
        'Connection': 'keep-alive',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    try:
        log.debug('Processing {}'.format(article.url))
        article_html_maps = url_helper.get_url_sync(article.url, headers=headers)
        log.debug('Sleeping {}'.format(self.pause_btw_crawl))
        time.sleep(self.pause_btw_crawl)
        # Dispatch to the article parser configured for this article's source.
        content_article = self.src_pipeline[article.src]['article_parser'](
            article=article,
            article_html=article_html_maps[article.url])
        # Keep the fields already on ``article``; fill in what the parser found.
        article.left_merge(content_article)
    except Exception as e:
        log.error('Error processing news {}'.format(article.url))
        log.debug('Error msg {}'.format(e))
    return article
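
# A minimal sketch of the semantics ``Article.left_merge`` is assumed to have above:
# keep any field already populated from the index page and fill the rest from the
# freshly parsed article. The helper name and field list are illustrative only,
# not the actual Article implementation.
def _left_merge_sketch(index_article, parsed_article,
                       fields=('title', 'section', 'as_of_dt', 'src', 'text')):
    for field in fields:
        if not getattr(index_article, field, None):
            setattr(index_article, field, getattr(parsed_article, field, None))
    return index_article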
def test_qq_parser():
    import url_helper
    url = r'http://news.qq.com/a/20171125/008910.htm'
    contents = url_helper.get_url_sync(url)
    parsed = qq(contents[url])
    print(parsed)
def test_xh_parser():
    import url_helper
    url = r'http://news.xinhuanet.com/politics/2017-11/29/c_1122032030.htm'
    contents = url_helper.get_url_sync(url)
    parsed = xinhua(contents[url])
    print(parsed)
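
# Both tests above assume a parser with the signature ``parser(article_html) -> dict``.
# A minimal sketch of such a parser, assuming BeautifulSoup is available; the tag
# lookups and the 'example' source name are placeholders, not the real qq/xinhua
# page structure. The returned keys mirror what the workers consume downstream
# (title, section, as_of_dt, src, text).
def example_parser(article_html):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(article_html, 'html.parser')
    title_tag = soup.find('h1')
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
    return {
        'title': title_tag.get_text(strip=True) if title_tag else '',
        'section': '',
        'as_of_dt': None,
        'src': 'example',
        'text': '\n'.join(paragraphs),
    }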
def process_news_worker(article):
    """Fetch, parse, and persist one article, pacing requests between crawls."""
    try:
        log.debug('Processing {}'.format(article.url))
        article_html_maps = url_helper.get_url_sync(article.url)
        parsed_article_dict = parsers.parser_master(
            parser_name=article.src,
            article_html=article_html_maps[article.url],
            url=article.url)
        if bool(parsed_article_dict) is True:
            # Only overwrite fields the parser actually filled in.
            if bool(parsed_article_dict['title']):
                article.title = parsed_article_dict['title']
            if bool(parsed_article_dict['section']):
                article.section = parsed_article_dict['section']
            if bool(parsed_article_dict['as_of_dt']):
                article.as_of_dt = parsed_article_dict['as_of_dt']
            if bool(parsed_article_dict['src']):
                article.src = parsed_article_dict['src']
            if bool(parsed_article_dict['text']):
                article.text = parsed_article_dict['text']
        else:
            article.parser_ready = False
        to_db(article)
        log.debug('Sleeping {}'.format(configs.pause_btw_crawl))
        time.sleep(configs.pause_btw_crawl)
    except Exception as e:
        log.error('Error processing news {}'.format(article.url))
        log.debug('Error msg {}'.format(e))
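
# The per-field copy above could be collapsed into a small helper. This is a
# refactoring sketch with a hypothetical name, not part of the existing module;
# it applies only the keys the parser actually populated.
def _apply_parsed_fields(article, parsed_article_dict,
                         fields=('title', 'section', 'as_of_dt', 'src', 'text')):
    for field in fields:
        if parsed_article_dict.get(field):
            setattr(article, field, parsed_article_dict[field])
    return article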
def ix_parser_hexun(self, as_of_dt):
    """Crawl the hexun roll index for ``as_of_dt`` and return a list of Article stubs."""
    src_name = 'hexun'
    dt_fmt = r'%Y-%m-%d'
    articles_per_pg = 30
    url_template = r'http://roll.hexun.com/roolNews_listRool.action?type=all&ids=100,101,103,125,105,124,162,194,108,122,121,119,107,116,114,115,182,120,169,170,177,180,118,190,200,155,130,117,153,106&date={date}&page={page}'
    headers = {
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6',
        'Connection': 'keep-alive',
        # 'Cookie': '__jsluid=55133f1037eb29153a2290b27dc5d112; UM_distinctid=1608c824e45225-0187e4f3081221-16386656-fa000-1608c824e4639e; HexunTrack=SID=20171225151038013bf64005c70424b62990fd27a22ba0eb7&CITY=81&TOWN=0; __utma=194262068.122689198.1514185838.1514185838.1514185838.1; __utmc=194262068; __utmz=194262068.1514185838.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); vjuids=2c8772259.1608c828e36.0.577adf14db4dd; vjlast=1514185855.1514185855.30; hxck_sq_common=LoginStateCookie=; ASL=17525,0000p,3d5d7863; ADVC=35cb70ac656a72; ADVS=35cb70ac656a72; cn_1263247791_dplus=%7B%22distinct_id%22%3A%20%221608c824e45225-0187e4f3081221-16386656-fa000-1608c824e4639e%22%2C%22sp%22%3A%20%7B%22userFirstDate%22%3A%20%2220171225%22%2C%22userID%22%3A%20%22%22%2C%22userName%22%3A%20%22%22%2C%22userType%22%3A%20%22nologinuser%22%2C%22userLoginDate%22%3A%20%2220171225%22%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201514186147%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201514186147%7D%2C%22initial_view_time%22%3A%20%221514181081%22%2C%22initial_referrer%22%3A%20%22http%3A%2F%2Fnews.hexun.com%2F%22%2C%22initial_referrer_domain%22%3A%20%22news.hexun.com%22%7D; CNZZDATA1262910278=584070923-1514181788-http%253A%252F%252Fnews.hexun.com%252F%7C1514181788; __utmb=194262068.3.10.1514185838',
        'DNT': '1',
        'Host': 'roll.hexun.com',
        'Referer': 'http://roll.hexun.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    current_page = 1
    num_pages = 1
    articles = []
    log.debug('Getting {} index page for date {}'.format(src_name, as_of_dt))
    while current_page <= num_pages:
        url = url_template.format(date=as_of_dt.strftime(dt_fmt), page=current_page)
        ix_page = url_helper.get_url_sync(url, headers=headers)[url]
        time.sleep(self.pause_btw_crawl)
        try:
            ix_parsed = demjson.decode(ix_page)
            num_rcd = int(ix_parsed['sum'])
            num_pages = math.ceil(num_rcd / articles_per_pg)  # comment out for debug
            rcds = ix_parsed['list']
            for rcd in rcds:
                # Skip links to PDF documents; only HTML articles are parseable.
                if not rcd['titleLink'].endswith('PDF'):
                    articles.append(
                        Article(url=rcd['titleLink'],
                                title=rcd['title'],
                                section=rcd['columnName'],
                                as_of_dt=as_of_dt,
                                crawl_ts=dt.datetime.utcnow(),
                                src=src_name))
        except Exception as e:
            log.error('Failed to parse {} index page {} for date {}'.format(
                src_name, current_page, as_of_dt))
            log.debug('Error msg {}'.format(e))
        finally:
            current_page += 1
    return articles
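
# For reference, the index parser above expects demjson to decode the roll endpoint
# into roughly this shape. The field names come from the code; the values below are
# made-up placeholders, not real hexun data.
_HEXUN_IX_SAMPLE = {
    'sum': '65',  # total record count across all pages; drives the page count
    'list': [
        {
            'titleLink': 'http://news.hexun.com/2017-12-25/000000000.html',
            'title': 'placeholder headline',
            'columnName': 'placeholder section',
        },
    ],
}
# With articles_per_pg = 30, a 'sum' of 65 yields math.ceil(65 / 30) == 3 pages,
# so the while-loop fetches pages 1 through 3 before stopping.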
def process_news_worker(article, parser_config):
    """
    :param article: (article object) article stub to enrich and persist
    :param parser_config: (dict) mapping of URL netloc to the parser name to use
    :return:
    """
    try:
        log.debug('Processing {}'.format(article.url))
        article_html_maps = url_helper.get_url_sync(article.url)
        net_loc = urllib.parse.urlparse(article.url).netloc
        if net_loc in parser_config.keys():
            parsed_article_dict = parsers.parser_master(
                parser_config[net_loc],
                article_html_maps[article.url],
                article.url)
            if bool(parsed_article_dict) is True:
                # Only overwrite fields the parser actually filled in.
                if bool(parsed_article_dict['title']):
                    article.title = parsed_article_dict['title']
                if bool(parsed_article_dict['section']):
                    article.section = parsed_article_dict['section']
                if bool(parsed_article_dict['as_of_dt']):
                    article.as_of_dt = parsed_article_dict['as_of_dt']
                if bool(parsed_article_dict['src']):
                    article.src = parsed_article_dict['src']
                if bool(parsed_article_dict['text']):
                    article.text = parsed_article_dict['text']
            else:
                article.parser_ready = False
        else:
            log.debug('Parser not ready for {}. Only RSS content parsed'.format(net_loc))
            article.parser_ready = False
        article.crawl_ts = dt.datetime.utcnow()
        to_db(article)
        log.debug('Sleeping {}'.format(5))
        time.sleep(5)
    except Exception as e:
        log.error('Error processing news {}'.format(article.url))
        log.debug('Error msg {}'.format(e))
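
# ``parser_config`` is assumed to map a URL netloc to the parser name that
# ``parsers.parser_master`` dispatches on (the positional slot called
# ``parser_name`` in the earlier process_news_worker). A hypothetical example,
# not taken from the real configuration:
EXAMPLE_PARSER_CONFIG = {
    'news.qq.com': 'qq',
    'news.xinhuanet.com': 'xinhua',
}
# process_news_worker(article, EXAMPLE_PARSER_CONFIG) would then route a
# news.qq.com article to the qq parser and mark articles from unknown hosts
# as parser_ready = False, keeping only their RSS-derived fields.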