def pic_detail_spider(self, url):
    """Crawl a picture-news list page and collect article details.

    Parses the list at *url*, keeps entries newer than
    ``self.start_timestamp`` (sets ``self.flag = 1`` and stops at the
    first older entry, since the list is assumed newest-first), then
    fetches each detail page, extracts title/body/images from the
    embedded JS payload, uploads images to Aliyun OSS, and appends one
    dict per article to ``self.article_data_list``.
    """
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = []
    now_year = str(datetime.now().year)
    for data in soup.select(".picList div"):
        # Entries either carry "(MM/DD hh:mm)" (current year implied)
        # or a full "YYYY-MM-DD hh:mm" timestamp.
        if '(' in data.span.string:
            date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
        else:
            date_time = data.span.string + ':00'
        pub_timestamp = string_transform_timestamp(date_time)
        if pub_timestamp < self.start_timestamp:
            self.flag = 1
            break
        news_detail_list.append(data.p.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception:
            # One failed fetch should not abort the whole crawl.
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        tmp_dict['title'] = get_tag_html(news_soup, 'h1')
        # Article text and images live inside inline JS, so scan the raw
        # HTML line by line instead of the parsed tree.
        content_list = news_body.split('\n')
        artile_list = []
        img_list = []
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        artile = ''
        for em in content_list:
            if '{title:' in em:
                em = em.replace("{title:'", "")
                em = em.replace("',", "")
                artile_list.append(em.strip())
            if 'big_img: ' in em:
                em = em.replace("big_img: '", "")
                em = em.replace("',", "")
                img_title = ''
                # Upload the image to Aliyun OSS; keep only successful uploads.
                status, msg, img_url = upload_img_to_oss2(em.strip())
                if status:
                    artile += img_tag.format(img_url=img_url, img_title=img_title)
                    img_list.append([img_title, img_url])
        # De-duplicate paragraphs while PRESERVING order. The original
        # iterated set(artile_list), whose arbitrary iteration order
        # scrambled the article's paragraphs.
        seen = set()
        for a_content in artile_list:
            if a_content not in seen:
                seen.add(a_content)
                artile += a_content
    tmp_dict['artile'] = artile
    tmp_dict['img_list'] = img_list
    tmp_dict['source'] = news
    tmp_dict['pic_mode'] = 1
    self.article_data_list.append(tmp_dict)
def __init__(self, start_time):
    """Initialize the Sohu entertainment crawler.

    :param start_time: earliest publish time (string) to crawl;
        converted once to a timestamp for comparisons.
    """
    # Articles older than this timestamp are skipped.
    self.start_timestamp = string_transform_timestamp(start_time)
    # Becomes 1 once an article older than start_timestamp is seen.
    self.flag = 0
    # Accumulates one dict per crawled article.
    self.article_data_list = []
    # Paginated news-list URL templates ({page} filled in by the caller).
    self.url_list = [
        'http://yule.sohu.com/tv{page}.shtml',
        'http://yule.sohu.com/movie{page}.shtml',
        'http://music.yule.sohu.com/news{page}.shtml',
    ]
    # Picture-gallery category page(s).
    self.pic_url_list = [
        'http://pic.yule.sohu.com/cate-911401.shtml',
    ]
def __init__(self, start_time):
    """Initialize the ifeng entertainment crawler.

    :param start_time: earliest publish time (string) to crawl;
        converted once to a timestamp for comparisons.
    """
    # Articles older than this timestamp are skipped.
    self.start_timestamp = string_transform_timestamp(start_time)
    # Becomes 1 once an article older than start_timestamp is seen.
    self.flag = 0
    # Accumulates one dict per crawled article.
    self.article_data_list = []
    # Paginated news-list URL templates ({page} filled in by the caller).
    self.url_list = [
        'http://ent.ifeng.com/listpage/3/{page}/list.shtml',
        'http://ent.ifeng.com/listpage/6/{page}/list.shtml',
        'http://ent.ifeng.com/listpage/1370/{page}/list.shtml',
        'http://ent.ifeng.com/listpage/30741/{page}/list.shtml',
    ]
    # Picture-gallery list templates.
    self.pic_url_list = [
        'http://yue.ifeng.com/pagelist/21897/{page}/list.shtml',
        'http://ent.ifeng.com/listpage/39788/{page}/list.shtml',
    ]
def detail_spider(self, url):
    """Crawl an ifeng news-list page and collect article details.

    Keeps entries newer than ``self.start_timestamp`` (sets
    ``self.flag = 1`` and stops at the first older entry — the list is
    assumed newest-first), fetches each detail page, extracts title,
    images and text, uploads images to Aliyun OSS, and appends one dict
    per article to ``self.article_data_list``.

    Fix: removed the stray Python-2 ``print traceback.format_exc()``
    debug statements that duplicated the ``logger.debug`` calls.
    """
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = []
    for data in soup.select(".box_txt"):
        pub_timestamp = string_transform_timestamp(data.span.string + ':00')
        if pub_timestamp < self.start_timestamp:
            self.flag = 1
            break
        news_detail_list.append(data.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception:
            # One failed fetch should not abort the whole crawl.
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        tmp_dict['title'] = get_tag_html(news_soup, 'h1')
        artile = ''
        # Collect images (caption + OSS URL).
        img_list = []
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        for data in news_soup.select("#main_content"):
            # NOTE(review): assumes #main_content contains a <span>
            # caption and a <p><img> — confirm against live pages.
            img_title = data.span.string if data.span.string else ''
            try:
                img_url = data.p.img['src']
            except Exception:
                logger.debug(traceback.format_exc())
                continue
            # Upload the image to Aliyun OSS; keep only successful uploads.
            status, msg, img_url = upload_img_to_oss2(img_url)
            if status:
                img_list.append([img_title, img_url])
                artile += img_tag.format(img_url=img_url, img_title=img_title)
        # Extract the article text.
        for a in news_soup.select("#main_content p"):
            for string in a.strings:
                artile += u'<p>' + string.strip() + u'</p>'
        tmp_dict['artile'] = artile
        tmp_dict['img_list'] = img_list
        tmp_dict['source'] = news
        tmp_dict['pic_mode'] = 1
        self.article_data_list.append(tmp_dict)
def __init__(self, start_time):
    """Initialize the Sina entertainment crawler.

    :param start_time: earliest publish time (string) to crawl;
        converted once to a timestamp for comparisons.
    """
    # Articles older than this timestamp are skipped.
    self.start_timestamp = string_transform_timestamp(start_time)
    # Becomes 1 once an article older than start_timestamp is seen.
    self.flag = 0
    # Accumulates one dict per crawled article.
    self.article_data_list = []
    # Sina feed API templates ({page} filled in by the caller).
    self.url_list = [
        'http://feed.mix.sina.com.cn/api/roll/get?pageid=107&lid=1244&num=30&versionNumber=1.2.8&page={page}&encode=utf-8&callback=feedCardJsonpCallback',
        'http://feed.mix.sina.com.cn/api/roll/get?pageid=51&lid=740&num=30&versionNumber=1.2.8&page={page}&encode=utf-8&callback=feedCardJsonpCallback',
        'http://feed.mix.sina.com.cn/api/roll/get?pageid=105&lid=1217&num=30&versionNumber=1.2.8&page={page}&encode=utf-8&callback=feedCardJsonpCallback',
        'http://feed.mix.sina.com.cn/api/roll/get?pageid=50&lid=697&num=30&versionNumber=1.2.8&page={page}&encode=utf-8&callback=feedCardJsonpCallback',
    ]
    # Picture-gallery category page(s).
    self.pic_url_list = [
        'http://pic.yule.sohu.com/cate-911401.shtml',
    ]
def pic_main(self):
    """Crawl every picture-gallery page and persist results to MySQL.

    For each gallery newer than ``self.start_timestamp``, extracts the
    title, caption text and full-size images (uploaded to Aliyun OSS),
    appends a dict to ``self.article_data_list``, and finally writes
    the accumulated articles via ``insert_news_to_mysql``.
    """
    for url in self.pic_url_list:
        try:
            content = self.get_content(url)
        except Exception:
            # Skip an unreachable list page; keep crawling the rest.
            logger.debug(traceback.format_exc())
            continue
        soup = BeautifulSoup(content)
        for data in soup.select("#item-list a"):
            tmp_dict = dict()
            news_url = data['href']
            try:
                news_body = self.get_content(news_url)
            except Exception:
                logger.debug(traceback.format_exc())
                continue
            news_soup = BeautifulSoup(news_body)
            title = get_tag_html(news_soup, '#contentE h2')
            pub_time = get_tag_html(news_soup, '[class~=timt]').replace(u'日期:', '').strip()
            pub_timestamp = string_transform_timestamp(pub_time + ' 00:00:00')
            if pub_timestamp < self.start_timestamp:
                # Galleries are newest-first: stop scanning this page.
                self.flag = 1
                break
            tmp_dict['title'] = title
            # The gallery caption serves as the article body.
            tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
            img_list = []
            for img in news_soup.select("#picPlayerTab img"):
                img_title = img['alt']
                # NOTE(review): replaces every 'st' in the URL, not just a
                # thumbnail suffix — confirm intended.
                img_url = img['src'].replace('st', '')
                # Upload to Aliyun OSS; keep only successful uploads.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
            tmp_dict['img_list'] = img_list
            tmp_dict['source'] = news_url
            self.article_data_list.append(tmp_dict)
    insert_news_to_mysql(self.article_data_list)
def detail_spider(self, url):
    """Crawl a Sohu news-list page and collect article details.

    Handles two detail-page layouts: regular articles (text plus inline
    images, ``pic_mode=0``) and picture galleries (URLs containing
    'pic', ``pic_mode=1``). Entries older than ``self.start_timestamp``
    stop the list scan (``self.flag = 1``). Results are appended to
    ``self.article_data_list``.

    Fixes: removed the leftover ``print news`` debug statement; the
    regular-article branch now uses ``img.get('alt')`` so a missing
    ``alt`` attribute no longer raises KeyError (matching the gallery
    branch).
    """
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = []
    now_year = str(datetime.now().year)
    for data in soup.select("[class~=f14list] li"):
        if data.span:
            # Span holds "(MM/DD hh:mm)"; the current year is implied.
            date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
            date_timestamp = string_transform_timestamp(date_time)
            if date_timestamp < self.start_timestamp:
                self.flag = 1
                break
        news_detail_list.append(data.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception:
            # One failed fetch should not abort the whole crawl.
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        if 'pic' not in news:
            # Regular article layout.
            tmp_dict['title'] = get_tag_html(news_soup, 'h1')
            img_list = []
            img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
            artile = ''
            for img in news_soup.select("#contentText img"):
                img_title = img.get('alt') or ''
                img_url = img['src']
                # Upload the image to Aliyun OSS; keep only successful uploads.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
                    artile += img_tag.format(img_url=img_url, img_title=img_title)
            # Extract article text, skipping inline video-player scripts.
            for a in news_soup.select("#contentText p"):
                for string in a.strings:
                    if '_tvId' not in string:
                        artile += u'<p>' + string.strip() + u'</p>'
            artile = artile.replace(u'搜狐娱乐讯 ', '')
            tmp_dict['artile'] = artile
            tmp_dict['img_list'] = img_list
            tmp_dict['pic_mode'] = 0
        else:
            # Picture-gallery layout.
            tmp_dict['title'] = get_tag_html(news_soup, '[class~=ttl]')
            # Gallery caption serves as the article body.
            tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
            img_list = []
            for img in news_soup.select("#picPlayerTab img"):
                img_title = img.get('alt') or ''
                # NOTE(review): replaces every 'st' in the URL, not just a
                # thumbnail suffix — confirm intended.
                img_url = img['src'].replace('st', '')
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
            tmp_dict['img_list'] = img_list
            tmp_dict['pic_mode'] = 1
        tmp_dict['source'] = news
        self.article_data_list.append(tmp_dict)