Example No. 1
def _article_processor(self, article):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6',
        'Connection': 'keep-alive',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    try:
        log.debug('Processing {}'.format(article.url))
        article_html_maps = url_helper.get_url_sync(article.url, headers=headers)
        log.debug('Sleeping {}'.format(self.pause_btw_crawl))
        time.sleep(self.pause_btw_crawl)
        content_article = self.src_pipeline[article.src]['article_parser'](
            article=article, article_html=article_html_maps[article.url])
        article.left_merge(content_article)
    except Exception as e:
        log.error('Error processing news {}'.format(article.url))
        log.debug('Error msg {}'.format(e))
    return article
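All of these examples fetch pages with url_helper.get_url_sync(url) and then index the result by URL. That helper is not shown in this listing; below is a minimal sketch of what it might look like, assuming it just wraps a synchronous HTTP GET and returns a {url: body} dict (inferred from usage, not from the original module):

import requests


def get_url_sync(url, headers=None, timeout=10):
    # Sketch only: fetch one URL synchronously and key the result by URL,
    # matching how the examples do article_html_maps[article.url].
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return {url: resp.text}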
Example No. 2
def test_qq_parser():
    import url_helper
    url = r'http://news.qq.com/a/20171125/008910.htm'
    contents = url_helper.get_url_sync(url)

    parsed = qq(contents[url])
    print(parsed)
Example No. 3
def test_xh_parser():
    import url_helper
    url = r'http://news.xinhuanet.com/politics/2017-11/29/c_1122032030.htm'
    contents = url_helper.get_url_sync(url)

    parsed = xinhua(contents[url])
    print(parsed)
Example No. 4
def process_news_worker(article):
    try:
        log.debug('Processing {}'.format(article.url))
        article_html_maps = url_helper.get_url_sync(article.url)
        parsed_article_dict = parsers.parser_master(
            parser_name=article.src,
            article_html=article_html_maps[article.url],
            url=article.url)
        if bool(parsed_article_dict) is True:
            if bool(parsed_article_dict['title']):
                article.title = parsed_article_dict['title']
            if bool(parsed_article_dict['section']):
                article.section = parsed_article_dict['section']
            if bool(parsed_article_dict['as_of_dt']):
                article.as_of_dt = parsed_article_dict['as_of_dt']
            if bool(parsed_article_dict['src']):
                article.src = parsed_article_dict['src']
            if bool(parsed_article_dict['text']):
                article.text = parsed_article_dict['text']
        else:
            article.parser_ready = False
        to_db(article)
        log.debug('Sleeping {}'.format(configs.pause_btw_crawl))
        time.sleep(configs.pause_btw_crawl)
    except Exception as e:
        log.error('Error processing news {}'.format(article.url))
        log.debug('Error msg {}'.format(e))
Example No. 5
def ix_parser_hexun(self, as_of_dt):
    src_name = 'hexun'
    dt_fmt = r'%Y-%m-%d'
    articles_per_pg = 30
    url_template = r'http://roll.hexun.com/roolNews_listRool.action?type=all&ids=100,101,103,125,105,124,162,194,108,122,121,119,107,116,114,115,182,120,169,170,177,180,118,190,200,155,130,117,153,106&date={date}&page={page}'
    headers = {
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6',
        'Connection': 'keep-alive',
         #     'Cookie': '__jsluid=55133f1037eb29153a2290b27dc5d112; UM_distinctid=1608c824e45225-0187e4f3081221-16386656-fa000-1608c824e4639e; HexunTrack=SID=20171225151038013bf64005c70424b62990fd27a22ba0eb7&CITY=81&TOWN=0; __utma=194262068.122689198.1514185838.1514185838.1514185838.1; __utmc=194262068; __utmz=194262068.1514185838.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); vjuids=2c8772259.1608c828e36.0.577adf14db4dd; vjlast=1514185855.1514185855.30; hxck_sq_common=LoginStateCookie=; ASL=17525,0000p,3d5d7863; ADVC=35cb70ac656a72; ADVS=35cb70ac656a72; cn_1263247791_dplus=%7B%22distinct_id%22%3A%20%221608c824e45225-0187e4f3081221-16386656-fa000-1608c824e4639e%22%2C%22sp%22%3A%20%7B%22userFirstDate%22%3A%20%2220171225%22%2C%22userID%22%3A%20%22%22%2C%22userName%22%3A%20%22%22%2C%22userType%22%3A%20%22nologinuser%22%2C%22userLoginDate%22%3A%20%2220171225%22%2C%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201514186147%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201514186147%7D%2C%22initial_view_time%22%3A%20%221514181081%22%2C%22initial_referrer%22%3A%20%22http%3A%2F%2Fnews.hexun.com%2F%22%2C%22initial_referrer_domain%22%3A%20%22news.hexun.com%22%7D; CNZZDATA1262910278=584070923-1514181788-http%253A%252F%252Fnews.hexun.com%252F%7C1514181788; __utmb=194262068.3.10.1514185838',
        'DNT': '1',
        'Host': 'roll.hexun.com',
        'Referer': 'http://roll.hexun.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }
    current_page = 1
    num_pages = 1
    articles = []
    log.debug('Getting {} index page for date {}'.format(src_name, as_of_dt))
    while current_page <= num_pages:
        url = url_template.format(date=as_of_dt.strftime(dt_fmt), page=current_page)
        ix_page = url_helper.get_url_sync(url, headers=headers)[url]
        time.sleep(self.pause_btw_crawl)
        try:
            ix_parsed = demjson.decode(ix_page)
            num_rcd = int(ix_parsed['sum'])
            num_pages = math.ceil(num_rcd / articles_per_pg)  # comment out for debug
            rcds = ix_parsed['list']
            for rcd in rcds:
                if not rcd['titleLink'].endswith('PDF'):
                    articles.append(
                        Article(url=rcd['titleLink'],
                                title=rcd['title'],
                                section=rcd['columnName'],
                                as_of_dt=as_of_dt,
                                crawl_ts=dt.datetime.utcnow(),
                                src=src_name))
        except Exception as e:
            log.error('Failed to parse {} index page {} for date {}'.format(
                src_name, current_page, as_of_dt))
            log.debug('Error msg {}'.format(e))
        finally:
            current_page += 1
    return articles
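The Article object constructed above (and mutated in the other examples) is not defined in this listing. Judging by the fields the examples touch, it could be modeled roughly as the data class below; the real class, including left_merge, may well differ.

import datetime as dt
from dataclasses import dataclass
from typing import Optional


@dataclass
class Article:
    # Fields inferred from the examples in this listing; illustrative only.
    url: str
    title: str = ''
    section: str = ''
    as_of_dt: Optional[dt.date] = None
    crawl_ts: Optional[dt.datetime] = None
    src: str = ''
    text: str = ''
    parser_ready: bool = True

    def left_merge(self, other):
        # Hypothetical helper: keep existing values, fill empty fields from other.
        for name in ('title', 'section', 'as_of_dt', 'src', 'text'):
            if not getattr(self, name) and getattr(other, name):
                setattr(self, name, getattr(other, name))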
Example No. 6
def process_news_worker(article, parser_config):
    """
    :param article: (article object)
    :return:
    """
    try:
        log.debug('Processing {}'.format(article.url))
        article_html_maps = url_helper.get_url_sync(article.url)
        net_loc = urllib.parse.urlparse(article.url).netloc
        if net_loc in parser_config.keys():
            parsed_article_dict = parsers.parser_master(
                parser_config[net_loc], article_html_maps[article.url],
                article.url)
            if bool(parsed_article_dict) is True:
                if bool(parsed_article_dict['title']):
                    article.title = parsed_article_dict['title']
                if bool(parsed_article_dict['section']):
                    article.section = parsed_article_dict['section']
                if bool(parsed_article_dict['as_of_dt']):
                    article.as_of_dt = parsed_article_dict['as_of_dt']
                if bool(parsed_article_dict['src']):
                    article.src = parsed_article_dict['src']
                if bool(parsed_article_dict['text']):
                    article.text = parsed_article_dict['text']
            else:
                article.parser_ready = False
        else:
            log.debug(
                'Parser not ready for {}. Only RSS content parsed'.format(
                    net_loc))
            article.parser_ready = False
        article.crawl_ts = dt.datetime.utcnow()
        to_db(article)
        log.debug('Sleeping {}'.format(5))
        time.sleep(5)
    except Exception as e:
        log.error('Error processing news {}'.format(article.url))
        log.debug('Error msg {}'.format(e))
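In this last example, parser_config maps a URL's network location to the parser name that should handle it. A tiny usage sketch under that assumption; the keys and parser names below are illustrative, taken from the test URLs earlier in this listing:

parser_config = {
    'news.qq.com': 'qq',             # see test_qq_parser above
    'news.xinhuanet.com': 'xinhua',  # see test_xh_parser above
}
# process_news_worker(article, parser_config)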