def _get_news_url_list_by_pat_and_date(pat, date):
    """Walk the article list pages newest-first, collecting URLs dated `date` (YYYYMMDD)."""
    list_ = []
    tag = False
    html = urlopen_and_read(pat % '').decode('gbk', 'ignore')
    # The first page embeds the total page count as a JavaScript variable.
    page = int(re.search('var maxPage = (\\d+);', html).group(1))
    soup = Soup(html)
    while True:
        for div in soup.find_all('div', class_='article'):
            url = div.find_all('a')[1]['href']
            # Characters 21-28 of the URL carry the publication date (YYYYMMDD).
            if url[21:29] > date:
                continue
            elif url[21:29] == date:
                if '?' in url:
                    url = url[:url.find('?')]
                if url not in list_:
                    list_.append(url)
            else:
                # Listings are newest-first, so an older date means we are done.
                tag = True
                break
        page -= 1
        if tag:
            break
        soup = Soup(urlopen_and_read((pat % '_%u') % page).decode('gbk', 'ignore'))
    return list_
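# Minimal usage sketch (hypothetical values): `pat` is a list-page URL template with
# one '%s' slot for the page suffix, and `date` is the YYYYMMDD string to filter on.
# pat = 'http://roll.example.com/news%s.shtml'  # hypothetical template
# urls = _get_news_url_list_by_pat_and_date(pat, '20150101')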
def _work_item(news_module_str, store_module, news_module, type_, url, logger):
    if DEBUG:
        print(url)
    try:
        try_ = TRY_TIME
        while True:
            try:
                html = urlopen_and_read(url).decode(news_module.NEWS_CHARSET, 'ignore')
                news = news_module.match_news(html, url)
                if not news:
                    return
                break
            except Exception:
                # Retry the download/parse a few times before giving up.
                if try_:
                    try_ -= 1
                    continue
                else:
                    raise
        # Strip characters that are illegal in file names.
        file_name = re.sub(NAME_PAT, '', news.title)
        if not type_:
            try:
                type_ = news_module.get_type(html)
            except Exception:
                type_ = 'temp'
        store_args = news_module_str, type_, file_name
        if not DEBUG:
            store_module.store_news(news, store_args)
        comment_url_args = news.comment_url_args
        web.news.crawl_comments(news_module,
                                _match_and_store_comments(news_module_str, news_module, store_module),
                                store_args, comment_url_args)
    except Exception as e:
        logger.error("\"%s\" happened on '%s' '%s' work_item" % (e, news_module_str, url))
        if DEBUG:
            raise
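# Hedged sketch of how _work_item is typically driven: walk a module's URL lists and
# hand every URL over. The module name 'sohu' here is an assumption; the logger name
# matches the one used by crawl_comments below.
# logger = logging.getLogger('crawlerLog')
# for type_, urls in news_module.get_news_url_list().items():
#     for url in urls:
#         _work_item('sohu', store_module, news_module, type_, url, logger)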
def match_news(html, url):
    soup = Soup(html)
    # The numeric news id is embedded in the article URL as '/n<digits>'.
    sid = re.search('/n(\\d+)', url).group(1)
    # Changyan is the comment service used by Sohu; ask it for the topic id.
    url2 = 'http://changyan.sohu.com/node/html?client_id=cyqemw6s1&topicsid=%s' % sid
    # The endpoint returns JSON (needs `import json`); json.loads is safer than eval here.
    topic_id = json.loads(urlopen_and_read(url2).decode('utf-8', 'ignore'))['listData']['topic_id']
    comment_url_args = (topic_id,)
    title = soup.title.text
    main_content = soup.find('div', {'itemprop': 'articleBody'})
    if not main_content:
        return
    news_image = main_content.img['src'] if main_content.img else None
    # Join the non-empty paragraphs of the article body.
    content = '\n'.join(temp.strip()
                        for temp in (item.get_text() for item in main_content.find_all('p'))
                        if not re.match('\\s*$', temp))
    source = soup.find('span', {'itemprop': 'name'}).text
    source_url = soup.find('span', {'itemprop': 'isBasedOnUrl'}).text
    date = soup.find('div', {'itemprop': 'datePublished'}).get_text()
    date = time.mktime(time.strptime(date, "%Y-%m-%d %H:%M:%S"))
    return News(url, comment_url_args, title, content, source, date,
                source_url, news_image=news_image)
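# Example (hedged): hand match_news a fetched article page; it returns a News object,
# or None when the article body can't be located. `article_url` is hypothetical.
# html = urlopen_and_read(article_url).decode(NEWS_CHARSET, 'ignore')
# news = match_news(html, article_url)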
def get_news_url_list(date_=None):
    if not date_:
        # Default to yesterday's date.
        date_ = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 3600))
    else:
        date_ = time.strftime('%Y-%m-%d', time.strptime(date_, "%Y%m%d"))
    data = urlopen_and_read("http://news.163.com/special/0001220O/news_json.js").decode("gbk", 'ignore')
    # Cut the object literal out of the JS assignment; ast.literal_eval (needs
    # `import ast`) accepts the same literals eval would, without executing code.
    data = ast.literal_eval(data[data.find('{'):data.rfind('}') + 1])['news']
    dict_ = {}
    for newsData in data:
        try:
            type_ = DICT[newsData[0]['c']]
        except Exception:
            continue
        # setdefault avoids clobbering an earlier group of the same type.
        dict_.setdefault(type_, [])
        for newsDataItem in newsData:
            # 'p' holds the publish time; items within a group are sorted newest-first.
            if newsDataItem['p'][0:10] > date_:
                continue
            elif newsDataItem['p'][0:10] == date_:
                url = newsDataItem['l']
                if '?' in url:
                    url = url[:url.find('?')]
                if url not in dict_[type_]:
                    dict_[type_].append(url)
            else:
                break
    return dict_
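# Example: fetch yesterday's URL lists, or pass an explicit '%Y%m%d' date.
# by_type = get_news_url_list()            # defaults to yesterday
# by_type = get_news_url_list('20150101')  # explicit date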
def get_news_url_list(date_=None):
    # Build the [00:00:00, 23:59:59] timestamp window for the target day
    # (mktime normalizes the day-1 underflow at month starts).
    if not date_:
        time_ = time.localtime()
        time1 = time.mktime((time_[0], time_[1], time_[2] - 1, 0, 0, 0, 0, 0, 0))
        time2 = time.mktime((time_[0], time_[1], time_[2] - 1, 23, 59, 59, 0, 0, 0))
    else:
        time_ = time.strptime(date_, "%Y%m%d")
        time1 = time.mktime((time_[0], time_[1], time_[2], 0, 0, 0, 0, 0, 0))
        time2 = time.mktime((time_[0], time_[1], time_[2], 23, 59, 59, 0, 0, 0))
    _YESTODAY_URL_PAT = ('http://news.baidu.com/ns?bt=%u&et=%u&tn=newstitledy&rn=50&q6=%s'
                         % (time1, time2, web_) + '&pn=%u')
    list_ = []
    pn = 0
    while True:
        web = _YESTODAY_URL_PAT % pn
        html = urlopen_and_read(web).decode('utf-8')
        soup = Soup(html)
        for div in soup.find_all('div', class_='result'):
            web = div.a['href']
            if re.match(url_pat, web):
                if '?' in web:
                    web = web[:web.find('?')]
                if web not in list_:
                    list_.append(web)
        # '下一页' is the "next page" link text; stop when it disappears.
        if not re.search('下一页', html):
            break
        pn += 50
    return {'': list_}
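# This variant leans on two module-level constants that don't appear in the snippet;
# plausible (hypothetical) values would look like:
# web_ = 'news.example.com'                  # the q6= site query sent to Baidu News
# url_pat = 'http://news\\.example\\.com/'   # regex that accepted result links must match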
def get_news_url_list(date=None):
    if not date:
        # Default to yesterday's date.
        date = time.strftime('%Y%m%d', time.localtime(time.time() - 24 * 3600))
    dict_ = {}
    for type_, type_id in DICT_.items():
        page = 1
        while True:
            url_ = URL_PAT % (type_id, date, page)
            try:
                soup = Soup(urlopen_and_read(url_).decode('utf-8'))
                url_list = [item['href']
                            for item in soup.find('div', class_='newsList').ul.find_all('a')]
            except Exception:
                # No news list on this page: we have run past the last page.
                break
            dict_.setdefault(type_, []).extend(url_list)
            page += 1
    return dict_
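# Hedged guess at the module constants this variant relies on; values are hypothetical.
# DICT_ = {'society': 'shxw'}                          # type name -> site section id
# URL_PAT = 'http://news.example.com/%s/%s_%u.shtml'   # (section, YYYYMMDD, page)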
def get_news_url_list(date_=None):
    if not date_:
        # Default to yesterday's date.
        date_ = time.strftime('%Y%m%d', time.localtime(time.time() - 24 * 3600))
    dict_ = {}
    for type_, url in NEWS_URL_DICT.items():
        page = 1
        tag = False
        while True:
            data = urlopen_and_read(url % page).decode("utf-8", 'ignore')
            # The endpoint returns JSON (needs `import json`).
            data = json.loads(data)['result']['data']
            for item in data:
                date = time.strftime('%Y%m%d', time.localtime(float(item['createtime'])))
                # Items are newest-first: skip newer, collect matches, stop at older.
                if date > date_:
                    continue
                elif date == date_:
                    dict_.setdefault(type_, [])
                    url_ = item['url'].replace('\\', '')
                    # Skip video pages; strip query strings; de-duplicate.
                    if 'video' not in url_:
                        if '?' in url_:
                            url_ = url_[:url_.find('?')]
                        if url_ not in dict_[type_]:
                            dict_[type_].append(url_)
                else:
                    tag = True
                    break
            if tag:
                break
            page += 1
    return dict_
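# Hedged example of the assumed NEWS_URL_DICT shape: type name -> paginated JSON
# endpoint with one page-number slot; the value below is hypothetical.
# NEWS_URL_DICT = {'society': 'http://api.example.com/feed?page=%u'}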
def do_item(item):
    conn = get_conn()
    cur = conn.cursor()
    # Re-fetch the page and recover a clean title.
    title = Soup(urlopen_and_read(item[2]).decode(NEWS_CHARSET, 'ignore')).title.text
    # Strip the site suffix ('_新闻_腾讯网', i.e. '_News_Tencent') and illegal name characters.
    title = re.sub('(_新闻)_腾讯网', '', title)
    title = re.sub(NAME_PAT, '', title)
    sql = SQL_PAT % (title, item[0])
    cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()
    print(title)
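# Example (hedged): `item` is a DB row whose first field is the record id and whose
# third field is the article URL, matching the item[0]/item[2] accesses above.
# do_item((42, None, 'http://news.qq.com/a/20150101/000001.htm'))  # hypothetical row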
def crawl_comments(module, todo, store_args, args):
    logger = logging.getLogger('crawlerLog')
    retry = _RETRY_TIME
    try_ = _TRY_TIME
    # Modules with TYPE set paginate by page number; the rest chain pages by a "next id".
    if module.TYPE:
        page = 1
    else:
        id = None
    while True:
        if module.TYPE:
            url = module.get_comment_page_url(page, args)
        else:
            url = module.get_comment_page_url(id, args)
        try:
            # get_comment_page_url may return (url, post_data) for POST endpoints.
            if isinstance(url, tuple):
                url, data = url
                html = urlopen_and_read(url, data).decode(module.COMMENT_CHARSET, 'ignore')
            else:
                html = urlopen_and_read(url).decode(module.COMMENT_CHARSET, 'ignore')
            # A successful fetch resets the failure counters.
            if retry != _RETRY_TIME:
                retry = _RETRY_TIME
            if try_ != _TRY_TIME and module.TYPE:
                try_ = _TRY_TIME
        except Exception:
            if retry:
                retry -= 1
                continue
            logger.error("'%s' was not accessible" % url)
            if module.TYPE and try_:
                # Skip ahead one page before giving up on the whole thread.
                try_ -= 1
                page += 1
                continue
            else:
                logger.error("'%s' was not accessible and it's not the first failure" % url)
                break
        try:
            data = module.get_comment_source_list(html)
        except Exception:
            break
        if not data:
            break
        list_ = list(data)
        if not list_:
            if module.TYPE and try_:
                try_ -= 1
                page += 1
                continue
            else:
                logger.error("The data from '%s' couldn't be found and it's not the first failure" % url)
                break
        elif module.TYPE:
            try_ = _TRY_TIME
        todo(list_, store_args)
        if module.TYPE:
            page += 1
        else:
            if not module.has_next(html):
                break
            id = module.get_next_id(html)
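# Hedged sketch of the duck-typed `module` interface crawl_comments expects, inferred
# from the attribute accesses above; any concrete values are assumptions.
# class CommentModule:
#     TYPE = True                 # True: paginate by page number; False: chain by next-id
#     COMMENT_CHARSET = 'utf-8'
#     def get_comment_page_url(self, page_or_id, args): ...  # may return (url, post_data)
#     def get_comment_source_list(self, html): ...
#     def has_next(self, html): ...        # only used when TYPE is falsy
#     def get_next_id(self, html): ...     # only used when TYPE is falsy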