示例#1
0
    def pic_detail_spider(self, url):
        """Scrape a picture-gallery listing page and collect article data.

        Fetches ``url``, walks the ``.picList div`` entries and stops once an
        entry is older than ``self.start_timestamp`` (setting ``self.flag``).
        Each remaining detail page is downloaded; its title, gallery images
        (re-hosted via OSS) and captions are assembled into a dict that is
        appended to ``self.article_data_list``.

        :param url: listing-page URL to scrape.
        """
        content = self.get_content(url)
        soup = BeautifulSoup(content)
        news_detail_list = list()
        now_year = str(datetime.now().year)
        for data in soup.select(".picList div"):
            # Dates containing '(' carry no year (e.g. "(MM/DD hh:mm)"):
            # prepend the current year; otherwise the string is already full.
            if '(' in data.span.string:
                date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
            else:
                date_time = data.span.string + ':00'
            pub_timestamp = string_transform_timestamp(date_time)
            # Listing is assumed newest-first: the first too-old entry ends
            # the scan -- TODO confirm ordering against the live page.
            if pub_timestamp < self.start_timestamp:
                self.flag = 1
                break
            news_detail_list.append(data.p.a['href'])
        for news in news_detail_list:
            tmp_dict = dict()
            try:
                news_body = self.get_content(news)
            except Exception as e:
                print traceback.format_exc()
                logger.debug(traceback.format_exc())
                continue
            news_soup = BeautifulSoup(news_body)
            title = get_tag_html(news_soup, 'h1')
            tmp_dict['title'] = title
            # Extract article content: the gallery data lives in an inline
            # JS structure, so the raw HTML is parsed line by line.
            content_list = news_body.split('\n')
            artile_list = list()
            img_list = list()
            img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
            artile = ''
            for em in content_list:
                if '{title:' in em:
                    em = em.replace("{title:'", "")
                    em = em.replace("',", "")
                    artile_list.append(em.strip())
                if 'big_img: ' in em:
                    em = em.replace("big_img: '", "")
                    em = em.replace("',", "")
                    img_title = ''
                    # Upload the image to Aliyun OSS
                    status, msg, img_url = upload_img_to_oss2(em.strip())
                    if status:
                        artile += img_tag.format(img_url=img_url, img_title=img_title)
                        img_list.append([img_title, img_url])

            # set() de-duplicates the captions; NOTE(review): it also drops
            # their original order -- confirm that is acceptable.
            for a_content in set(artile_list):
                artile += a_content
            tmp_dict['artile'] = artile
            tmp_dict['img_list'] = img_list
            tmp_dict['source'] = news
            tmp_dict['pic_mode'] = 1
            self.article_data_list.append(tmp_dict)
示例#2
0
 def __init__(self, start_time):
     """Initialize crawl state and the Sohu listing / gallery URL templates.

     :param start_time: time string; articles published before it are skipped.
     """
     self.start_timestamp = string_transform_timestamp(start_time)
     # Set to 1 once a listing entry older than the cutoff is seen.
     self.flag = 0
     # Collected article dicts, filled by the spider methods.
     self.article_data_list = []
     self.url_list = [
         'http://yule.sohu.com/tv{page}.shtml',
         'http://yule.sohu.com/movie{page}.shtml',
         'http://music.yule.sohu.com/news{page}.shtml',
     ]
     self.pic_url_list = [
         'http://pic.yule.sohu.com/cate-911401.shtml'
     ]
示例#3
0
 def __init__(self, start_time):
     """Initialize crawl state and the ifeng listing / gallery URL templates.

     :param start_time: time string; articles published before it are skipped.
     """
     self.start_timestamp = string_transform_timestamp(start_time)
     # Set to 1 once a listing entry older than the cutoff is seen.
     self.flag = 0
     # Collected article dicts, filled by the spider methods.
     self.article_data_list = []
     self.url_list = [
         'http://ent.ifeng.com/listpage/3/{page}/list.shtml',
         'http://ent.ifeng.com/listpage/6/{page}/list.shtml',
         'http://ent.ifeng.com/listpage/1370/{page}/list.shtml',
         'http://ent.ifeng.com/listpage/30741/{page}/list.shtml',
     ]
     self.pic_url_list = [
         'http://yue.ifeng.com/pagelist/21897/{page}/list.shtml',
         'http://ent.ifeng.com/listpage/39788/{page}/list.shtml'
     ]
示例#4
0
    def detail_spider(self, url):
        """Scrape a news listing page and collect full article data.

        Walks the ``.box_txt`` entries of ``url``; the first entry older
        than ``self.start_timestamp`` stops the scan and sets ``self.flag``.
        Each collected detail page is parsed for title, inline images
        (re-hosted via OSS) and paragraph text; one dict per article is
        appended to ``self.article_data_list``.

        :param url: listing-page URL to scrape.
        """
        content = self.get_content(url)

        soup = BeautifulSoup(content)
        news_detail_list = list()
        for data in soup.select(".box_txt"):
            pub_timestamp = string_transform_timestamp(data.span.string + ':00')
            # Listing is assumed newest-first: the first too-old entry ends
            # the scan -- TODO confirm ordering against the live page.
            if pub_timestamp < self.start_timestamp:
                self.flag = 1
                break
            news_detail_list.append(data.a['href'])
        for news in news_detail_list:
            tmp_dict = dict()
            try:
                news_body = self.get_content(news)
            except Exception as e:
                print traceback.format_exc()
                logger.debug(traceback.format_exc())
                continue
            news_soup = BeautifulSoup(news_body)
            title = get_tag_html(news_soup, 'h1')
            tmp_dict['title'] = title
            # Extract the article content
            artile = ''
            # Extract the images
            img_list = list()
            img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
            for data in news_soup.select("#main_content"):
                img_title = data.span.string if data.span.string else ''
                # Not every article has an image; skip (and log) those that
                # lack the expected <p><img> structure.
                try:
                    img_url = data.p.img['src']
                except Exception as e:
                    print traceback.format_exc()
                    logger.debug(traceback.format_exc())
                    continue
                # Upload the image to Aliyun OSS
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
                    artile += img_tag.format(img_url=img_url, img_title=img_title)
            for a in news_soup.select("#main_content p"):
                for string in a.strings:
                    artile += u'<p>' + string.strip() + u'</p>'
            tmp_dict['artile'] = artile
            tmp_dict['img_list'] = img_list
            tmp_dict['source'] = news
            tmp_dict['pic_mode'] = 1
            self.article_data_list.append(tmp_dict)
示例#5
0
 def __init__(self, start_time):
     """Initialize crawl state: Sina roll-feed JSONP endpoints plus a gallery page.

     :param start_time: time string; articles published before it are skipped.
     """
     self.start_timestamp = string_transform_timestamp(start_time)
     # Set to 1 once a listing entry older than the cutoff is seen.
     self.flag = 0
     # Collected article dicts, filled by the spider methods.
     self.article_data_list = []
     self.url_list = [
         'http://feed.mix.sina.com.cn/api/roll/get?pageid=107&lid=1244&num=30'
         '&versionNumber=1.2.8&page={page}&encode=utf-8&callback=feedCardJsonpCallback',
         'http://feed.mix.sina.com.cn/api/roll/get?pageid=51&lid=740'
         '&num=30&versionNumber=1.2.8&page={page}&encode=utf-8&callback=feedCardJsonpCallback',
         'http://feed.mix.sina.com.cn/api/roll/get?pageid=105&lid=1217&num=30&versionNumber=1.2.8&page={page}'
         '&encode=utf-8&callback=feedCardJsonpCallback',
         'http://feed.mix.sina.com.cn/api/roll/get?pageid=50&lid=697&num=30&versionNumber=1.2.8&page={page}'
         '&encode=utf-8&callback=feedCardJsonpCallback'
     ]
     self.pic_url_list = [
         'http://pic.yule.sohu.com/cate-911401.shtml'
     ]
示例#6
0
 def pic_main(self):
     """Crawl every configured gallery URL and persist the results.

     For each URL in ``self.pic_url_list``, walks the ``#item-list a``
     entries, stops at the first entry older than ``self.start_timestamp``
     (setting ``self.flag``), extracts title/caption/images from each
     detail page, and finally writes ``self.article_data_list`` to MySQL.
     """
     for url in self.pic_url_list:
         try:
             content = self.get_content(url)
         except Exception as e:
             logger.debug(traceback.format_exc())
             continue
         soup = BeautifulSoup(content)
         for data in soup.select("#item-list a"):
             tmp_dict = dict()
             news_url = data['href']
             try:
                 news_body = self.get_content(news_url)
             except Exception as e:
                 logger.debug(traceback.format_exc())
                 continue
             news_soup = BeautifulSoup(news_body)
             title = get_tag_html(news_soup, '#contentE h2')
             pub_time = get_tag_html(news_soup, '[class~=timt]')
             # Strip the "日期:" ("date:") label from the timestamp text.
             pub_time = pub_time.replace(u'日期:', '').strip()
             pub_timmestamp = string_transform_timestamp(pub_time + ' 00:00:00')
             # NOTE(review): this break exits only the inner (per-page)
             # loop; the next URL in pic_url_list is still crawled.
             if pub_timmestamp < self.start_timestamp:
                 self.flag = 1
                 break
             tmp_dict['title'] = title
             # Extract the article content
             tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
             # Extract the images
             img_list = list()
             for img in news_soup.select("#picPlayerTab img"):
                 img_title = img['alt']
                 # Dropping 'st' from the URL presumably switches a
                 # thumbnail to the full-size image -- TODO confirm.
                 img_url = img['src'].replace('st', '')
                 # Upload the image to Aliyun OSS
                 status, msg, img_url = upload_img_to_oss2(img_url)
                 if status:
                     img_list.append([img_title, img_url])
             tmp_dict['img_list'] = img_list
             tmp_dict['source'] = news_url
             self.article_data_list.append(tmp_dict)
     insert_news_to_mysql(self.article_data_list)
示例#7
0
 def detail_spider(self, url):
     """Scrape a listing page, handling text articles and photo galleries.

     Walks the ``[class~=f14list] li`` entries of ``url``; the first entry
     older than ``self.start_timestamp`` stops the scan and sets
     ``self.flag``. Detail URLs containing 'pic' are parsed as galleries
     (``pic_mode`` 1), all others as plain articles (``pic_mode`` 0); one
     dict per article is appended to ``self.article_data_list``.

     :param url: listing-page URL to scrape.
     """
     content = self.get_content(url)
     soup = BeautifulSoup(content)
     news_detail_list = list()
     now_year = str(datetime.now().year)
     for data in soup.select("[class~=f14list] li"):
         if data.span:
             # Listing dates carry no year (e.g. "(MM/DD hh:mm)"):
             # prepend the current year before converting.
             date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
             date_timestamp = string_transform_timestamp(date_time)
             # Listing is assumed newest-first: the first too-old entry
             # ends the scan -- TODO confirm ordering.
             if date_timestamp < self.start_timestamp:
                 self.flag = 1
                 break
             news_detail_list.append(data.a['href'])
     for news in news_detail_list:
         tmp_dict = dict()
         try:
             news_body = self.get_content(news)
         except Exception as e:
             logger.debug(traceback.format_exc())
             continue
         news_soup = BeautifulSoup(news_body)
         # Plain text article (URL without 'pic').
         if 'pic' not in news:
             print news
             title = get_tag_html(news_soup, 'h1')
             tmp_dict['title'] = title
             # Extract the images
             img_list = list()
             img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
             artile = ''
             for img in news_soup.select("#contentText img"):
                 img_title = img['alt']
                 img_url = img['src']
                 # Upload the image to Aliyun OSS
                 status, msg, img_url = upload_img_to_oss2(img_url)
                 if status:
                     img_list.append([img_title, img_url])
                     artile += img_tag.format(img_url=img_url, img_title=img_title)
             # Extract the article content
             for a in news_soup.select("#contentText p"):
                 for string in a.strings:
                     # Skip embedded video-player script fragments.
                     if '_tvId' not in string:
                         artile += u'<p>' + string.strip() + u'</p>'
             # Strip the "Sohu Entertainment reports" byline prefix.
             artile = artile.replace(u'搜狐娱乐讯 ', '')
             tmp_dict['artile'] = artile
             tmp_dict['img_list'] = img_list
             tmp_dict['pic_mode'] = 0
         # Photo-gallery article (URL containing 'pic').
         else:
             title = get_tag_html(news_soup, '[class~=ttl]')
             tmp_dict['title'] = title
             # Extract the article content
             tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
             # Extract the images
             img_list = list()
             for img in news_soup.select("#picPlayerTab img"):
                 img_title = img.get('alt') if img.get('alt') else ''
                 # Dropping 'st' from the URL presumably switches a
                 # thumbnail to the full-size image -- TODO confirm.
                 img_url = img['src'].replace('st', '')
                 # Upload the image to Aliyun OSS
                 status, msg, img_url = upload_img_to_oss2(img_url)
                 if status:
                     img_list.append([img_title, img_url])
             tmp_dict['img_list'] = img_list
             tmp_dict['pic_mode'] = 1
         tmp_dict['source'] = news
         self.article_data_list.append(tmp_dict)