def get_feed(rss_config):
    log.debug('Start getting rss feed')
    rss_urls = list(rss_config.keys())
    rss_feeds = url_helper.get_urls_async(rss_urls,
                                          configs.rss_ix_stop_btw_batch,
                                          configs.rss_ix_sim_req)
    feed = []
    for rss_url, rss_contents in rss_feeds.items():
        log.debug('Parsing index {}'.format(rss_url))
        try:
            rss_feed = fp.parse(rss_contents)
            rss_entries = rss_feed['entries']
            tmp_section = rss_config[rss_url]['section']
            for rss_entry in rss_entries:
                tmp_url = rss_entry['links'][0]['href']
                tmp_title = rss_entry['title']
                if 'published_parsed' in rss_entry:
                    as_of_dt = dt.datetime(*rss_entry['published_parsed'][:6])
                else:
                    as_of_dt = None
                feed.append(
                    Article(url=tmp_url,
                            title=tmp_title,
                            section=tmp_section,
                            as_of_dt=as_of_dt))
        except Exception as e:
            # exceptions in Python 3 have no .message attribute; log the exception itself
            log.error('Error parsing index {} \n {}'.format(rss_url, e))
    return feed
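
# A minimal usage sketch for get_feed, assuming rss_config maps each feed URL to a
# dict carrying at least a 'section' label; the URL below is a placeholder, not one
# of the project's configured feeds.
def _example_get_feed():
    sample_rss_config = {
        'http://example.com/rss.xml': {'section': 'world'},
    }
    articles = get_feed(sample_rss_config)
    for article in articles:
        print(article.url, article.title, article.section, article.as_of_dt)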
def get_ix_pg(as_of_dt):
    log.debug('Getting index page for date {}'.format(as_of_dt))
    path_template = 'http://paper.people.com.cn/rmrb/html/{}/{}/nbs.D110000renmrb_01.htm'
    path = path_template.format(as_of_dt.strftime('%Y-%m'),
                                as_of_dt.strftime('%d'))
    ix_page = url_helper.get_url(path)
    soup = bs(ix_page, 'html.parser')

    # get sub-section urls
    subsection_div = soup.find('div', id='pageList').find_all('a')
    sub_sections_url_map = {}
    for i in subsection_div:
        if i['href'].endswith('htm'):
            sub_section_url = urllib.parse.urljoin(path, i['href'])
            sub_section_name = i.text
            sub_sections_url_map[sub_section_url] = sub_section_name

    # get article urls
    sub_sections_html_map = url_helper.get_urls_async(
        list(sub_sections_url_map.keys()), True, configs.aapl_sim_req)
    article_links = []
    for sub_section_url, sub_section_html in sub_sections_html_map.items():
        sub_soup = bs(sub_section_html, 'html.parser')
        article_urls_a = sub_soup.find('div', id='titleList').find_all('a')
        for i in article_urls_a:
            if i['href'].endswith('htm'):
                tmp_url = urllib.parse.urljoin(sub_section_url, i['href'])
                tmp_name = sub_sections_url_map[sub_section_url]
                article_links.append({
                    'url': tmp_url,
                    'title': None,
                    'section': tmp_name
                })
    return article_links
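
# A minimal calling sketch, assuming dt is the datetime module alias used above.
# For example, dt.date(2020, 1, 2) resolves to
# http://paper.people.com.cn/rmrb/html/2020-01/02/nbs.D110000renmrb_01.htm, and each
# returned item is a dict with 'url', 'title' and 'section' keys.
def _example_get_ix_pg():
    links = get_ix_pg(dt.date(2020, 1, 2))
    for link in links:
        print(link['section'], link['url'])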
def process_news(as_of_dt):
    log.info('Start processing news for {}'.format(as_of_dt))
    daily_paper = None
    try:
        daily_paper = get_ix_pg(as_of_dt)  # TODO: add retry for getting index
        article_links = [i['url'] for i in daily_paper]
        article_html_maps = url_helper.get_urls_async(article_links, True,
                                                      configs.aapl_sim_req)
        for article in daily_paper:
            # TODO: add retry for getting contents
            article['contents'] = parse_article(article_html_maps[article['url']])
        db_helper.to_db(as_of_dt, configs.b81daily_coll, daily_paper)
        log.info('Finish processing news for {}'.format(as_of_dt))
    except Exception as e:
        # avoid a bare except so the failure reason is logged and KeyboardInterrupt is not swallowed
        log.error('Failed processing news for {}: {}'.format(as_of_dt, e))
    return daily_paper
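
# The TODOs above ask for retries around the index and content fetches. A generic
# wrapper along these lines (a hypothetical helper, not part of url_helper or
# db_helper) could wrap get_ix_pg or parse_article without changing their signatures.
import time  # needed only for the sleep between retries


def _retry(func, *args, attempts=3, wait_sec=2, **kwargs):
    """Call func, retrying up to `attempts` times with `wait_sec` seconds between tries."""
    for attempt in range(1, attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            log.warning('Attempt {}/{} of {} failed: {}'.format(
                attempt, attempts, getattr(func, '__name__', func), e))
            if attempt == attempts:
                raise
            time.sleep(wait_sec)

# Example: daily_paper = _retry(get_ix_pg, as_of_dt, attempts=3)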
def get_ix_pg(as_of_dt):
    log.debug('Getting index page for date {}'.format(as_of_dt))
    path_template = 'http://www.81.cn/jfjbmap/content/{}/{}/node_2.htm'
    path = path_template.format(as_of_dt.strftime('%Y-%m'),
                                as_of_dt.strftime('%d'))
    ix_page = url_helper.get_url(path)
    soup = bs(ix_page, 'html.parser')

    # get sub-section urls
    subsection_div = soup.find('div',
                               class_='col-md-4-10 channel-list').find_all('a')
    sub_sections_url_map = {}
    for i in subsection_div:
        if i['href'].endswith('htm'):
            sub_section_url = urllib.parse.urljoin(path, i['href'])
            sub_section_name = i.text
            sub_sections_url_map[sub_section_url] = sub_section_name

    # get article urls
    sub_sections_html_map = url_helper.get_urls_async(
        list(sub_sections_url_map.keys()), True, configs.aapl_sim_req)
    article_links = []
    for sub_section_url, sub_section_html in sub_sections_html_map.items():
        sub_soup = bs(sub_section_html, 'html.parser')
        article_urls_a = sub_soup.find(
            'div', class_='newslist-item current').find_all('a')
        for i in article_urls_a:
            if i['href'].endswith('htm'):
                tmp_url = urllib.parse.urljoin(sub_section_url, i['href'])
                tmp_name = sub_sections_url_map[sub_section_url]
                tmp_title = i.text
                article_links.append({
                    'url': tmp_url,
                    'title': tmp_title,
                    'section': tmp_name
                })
    return article_links
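
# Both index scrapers above follow the same pattern (front page -> sub-section links
# -> article links), differing only in URL template and selectors. A parametrized
# sketch like the one below (hypothetical, not in the current codebase; the
# find_sections/find_articles callables are illustrative parameters) could replace
# the duplicated bodies if the two modules are ever merged.
def _scrape_index(as_of_dt, path_template, find_sections, find_articles,
                  keep_title=True):
    path = path_template.format(as_of_dt.strftime('%Y-%m'),
                                as_of_dt.strftime('%d'))
    soup = bs(url_helper.get_url(path), 'html.parser')
    sections = {urllib.parse.urljoin(path, a['href']): a.text
                for a in find_sections(soup) if a['href'].endswith('htm')}
    html_map = url_helper.get_urls_async(list(sections.keys()), True,
                                         configs.aapl_sim_req)
    links = []
    for section_url, section_html in html_map.items():
        for a in find_articles(bs(section_html, 'html.parser')):
            if a['href'].endswith('htm'):
                links.append({
                    'url': urllib.parse.urljoin(section_url, a['href']),
                    'title': a.text if keep_title else None,
                    'section': sections[section_url],
                })
    return links

# e.g. _scrape_index(as_of_dt, 'http://www.81.cn/jfjbmap/content/{}/{}/node_2.htm',
#                    lambda s: s.find('div', class_='col-md-4-10 channel-list').find_all('a'),
#                    lambda s: s.find('div', class_='newslist-item current').find_all('a'))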