Example #1
def get_feed(rss_config):
    """Fetch the configured RSS feeds and parse them into a list of Article records."""
    log.debug('Start getting rss feed')
    rss_urls = list(rss_config.keys())
    rss_feeds = url_helper.get_urls_async(rss_urls,
                                          configs.rss_ix_stop_btw_batch,
                                          configs.rss_ix_sim_req)
    feed = []
    for rss_url, rss_contents in rss_feeds.items():
        log.debug('Parsing index {}'.format(rss_url))
        try:
            rss_feed = fp.parse(rss_contents)
            rss_entries = rss_feed['entries']
            tmp_section = rss_config[rss_url]['section']
            for rss_entry in rss_entries:

                tmp_url = rss_entry['links'][0]['href']
                tmp_title = rss_entry['title']
                if 'published_parsed' in rss_entry:
                    as_of_dt = dt.datetime(*rss_entry['published_parsed'][:6])
                else:
                    as_of_dt = None
                feed.append(
                    Article(url=tmp_url,
                            title=tmp_title,
                            section=tmp_section,
                            as_of_dt=as_of_dt))

        except Exception as e:
            log.error('Error parsing index {}\n{}'.format(rss_url, e))
    return feed
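
A minimal usage sketch for get_feed. The feed URLs and section labels below are hypothetical, the module-level log, url_helper, and configs objects are assumed to exist, and Article is assumed to expose its constructor arguments as attributes:

sample_rss_config = {
    # Hypothetical feeds: each URL maps to the section label attached to its articles.
    'https://example.com/rss/world.xml': {'section': 'World'},
    'https://example.com/rss/technology.xml': {'section': 'Technology'},
}

articles = get_feed(sample_rss_config)
for article in articles:
    print(article.section, article.as_of_dt, article.url)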
Example #2
def get_ix_pg(as_of_dt):
    """Fetch the People's Daily (paper.people.com.cn) front-page index for as_of_dt and return its article links."""
    log.debug('Getting index page for date {}'.format(as_of_dt))
    # dt_fmt = '%Y%m'
    path_template = r'http://paper.people.com.cn/rmrb/html/{}/{}/nbs.D110000renmrb_01.htm'
    path = path_template.format(as_of_dt.strftime(r'%Y-%m'),
                                as_of_dt.strftime(r'%d'))
    ix_page = url_helper.get_url(path)
    soup = bs(ix_page, 'html.parser')

    # get sub-section URLs
    subsection_div = soup.find('div', id='pageList').find_all('a')
    sub_sections_url_map = {}
    for i in subsection_div:
        if i['href'].endswith('htm'):
            sub_section_url = urllib.parse.urljoin(path, i['href'])
            sub_section_name = i.text
            sub_sections_url_map[sub_section_url] = sub_section_name

    # get article URLs
    sub_sections_html_map = url_helper.get_urls_async(
        list(sub_sections_url_map.keys()), True, configs.aapl_sim_req)
    article_links = []
    for sub_section_url, sub_section_html in sub_sections_html_map.items():
        sub_soup = bs(sub_section_html, 'html.parser')
        article_urls_a = sub_soup.find('div', id='titleList').find_all('a')
        for i in article_urls_a:
            if i['href'].endswith('htm'):
                tmp_url = urllib.parse.urljoin(sub_section_url, i['href'])
                tmp_name = sub_sections_url_map[sub_section_url]
                article_links.append({
                    'url': tmp_url,
                    'title': None,
                    'section': tmp_name
                })
    return article_links
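
A minimal call sketch, assuming network access and the module-level url_helper and configs helpers; the date is arbitrary:

import datetime as dt

# Fetch the front-page index for one day and inspect the discovered links.
links = get_ix_pg(dt.datetime(2020, 1, 1))
for link in links:
    print(link['section'], link['url'])  # 'title' is still None at this stage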
Example #3
def process_news(as_of_dt):
    """Fetch the day's index page, download and parse every article, store the results, and return them."""
    log.info('Start processing news for {}'.format(as_of_dt))
    daily_paper = None
    try:
        daily_paper = get_ix_pg(as_of_dt)  # TODO: add retry when getting the index
        article_links = [i['url'] for i in daily_paper]
        article_html_maps = url_helper.get_urls_async(article_links, True,
                                                      configs.aapl_sim_req)
        for article in daily_paper:
            article['contents'] = parse_article(
                article_html_maps[article['url']])  # TODO: add retry when getting contents
        db_helper.to_db(as_of_dt, configs.b81daily_coll, daily_paper)
        log.info('Finish processing news for {}'.format(as_of_dt))
    except Exception as e:
        log.error('Failed processing news for {}\n{}'.format(as_of_dt, e))
    return daily_paper
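
The inline TODO comments note that retries around the index and article fetches are still missing; a minimal retry-wrapper sketch (with_retry is hypothetical and not part of the original module):

import time

def with_retry(fn, attempts=3, wait_sec=5):
    # Call fn() up to `attempts` times, sleeping between failures;
    # re-raise the last exception if every attempt fails.
    for attempt in range(1, attempts + 1):
        try:
            return fn()
        except Exception:
            if attempt == attempts:
                raise
            time.sleep(wait_sec)

# e.g. daily_paper = with_retry(lambda: get_ix_pg(as_of_dt))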
Example #4
def get_ix_pg(as_of_dt):
    """Fetch the PLA Daily (www.81.cn) front-page index for as_of_dt and return its article links."""
    log.debug('Getting index page for date {}'.format(as_of_dt))
    # dt_fmt = '%Y%m'

    path_template = r'http://www.81.cn/jfjbmap/content/{}/{}/node_2.htm'
    path = path_template.format(as_of_dt.strftime(r'%Y-%m'),
                                as_of_dt.strftime(r'%d'))
    ix_page = url_helper.get_url(path)
    soup = bs(ix_page, 'html.parser')

    # get sub-section URLs
    subsection_div = soup.find('div',
                               class_='col-md-4-10 channel-list').find_all('a')
    sub_sections_url_map = {}
    for i in subsection_div:
        if i['href'].endswith('htm'):
            sub_section_url = urllib.parse.urljoin(path, i['href'])
            sub_section_name = i.text
            sub_sections_url_map[sub_section_url] = sub_section_name

    # get article URLs
    sub_sections_html_map = url_helper.get_urls_async(
        list(sub_sections_url_map.keys()), True, configs.aapl_sim_req)
    article_links = []
    for sub_section_url, sub_section_html in sub_sections_html_map.items():
        sub_soup = bs(sub_section_html, 'html.parser')
        article_urls_a = sub_soup.find(
            'div', class_='newslist-item current').find_all('a')
        for i in article_urls_a:
            if i['href'].endswith('htm'):
                tmp_url = urllib.parse.urljoin(sub_section_url, i['href'])
                tmp_name = sub_sections_url_map[sub_section_url]
                tmp_title = i.text
                article_links.append({
                    'url': tmp_url,
                    'title': tmp_title,
                    'section': tmp_name
                })
    return article_links
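
As in Example #2, a minimal call sketch with an arbitrary date; unlike the People's Daily index, these links already carry titles:

import datetime as dt

links = get_ix_pg(dt.datetime(2020, 1, 1))
for link in links:
    print(link['section'], link['title'], link['url'])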