Example No. 1
    def parse_article(self, response):
        logger.info("{} Url {}".format(get_function_name(), response.url))

        scripts = response.xpath('//script/text()').extract()
        for script in scripts:
            script = script.lstrip()
            # 1. Find the script that holds the page data and extract its content field
            if script.startswith('var BASE_DATA'):
                script = script.replace("\n", "")
                script = re.sub(self.match_space, "", script)
                start_tag = "content:'"
                start_idx = script.find(start_tag)
                end_idx = script.find("',", start_idx)
                if start_idx != -1 and end_idx != -1:
                    script = script[start_idx + len(start_tag):end_idx]
                    # 2. Unescape the content and extract its text
                    data = html.unescape(script)
                    soup = BeautifulSoup(data, 'lxml')
                    texts = "\r\n".join([p.text for p in soup.select('p')])
                    if len(texts.strip()) == 0:
                        continue
                    # 3. Record the text as an item
                    item = ToutiaoItem()
                    item['url'] = response.url
                    item['field'] = time.strftime(
                        "%Y%m%d") + os.sep + response.meta['field']
                    item['title'] = time.strftime("%Y%m%d_%H%M%S_") + str(
                        random.randint(0, 1000))
                    item['content'] = texts
                    print('dump item:   ', item['url'])
                    yield item
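The three numbered steps above (find the BASE_DATA script, slice out the content field, unescape and strip the HTML) can also be expressed with a single regular expression. A minimal sketch under the same assumptions about the content:'...' layout; the helper name is illustrative, not part of the example:

import html
import re

def extract_content(script_text):
    # Pull the content:'...' field out of the BASE_DATA script and unescape it.
    m = re.search(r"content:\s*'(.*?)',", script_text, re.S)
    return html.unescape(m.group(1)) if m else None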
Example No. 2
 def parse(self, response):
     item = ToutiaoItem()
     list_selector = response.xpath("//div[@class='wcommonFeed']/ul/li")
     for li in list_selector:
         try:
             # Title
             title = li.xpath(".//a[@class='link title']/text()").extract()
             # Strip spaces
             title = title[0].strip(" ")
             # Source
             source = li.xpath(
                 ".//a[@class='lbtn source']/text()").extract()
             # Strip the dot and the full-width space
             source = source[0].strip("⋅").strip(" ")
             # Comment count
             comment = li.xpath(".//a[@class='lbtn comment']/text()")
             # Strip the trailing text and whitespace
             comment = comment.re("(.*?)评论")[0]
             comment = "".join(comment.split())  #去除空格:&nbsp
             item["title"] = title  #标题
             item["source"] = source  #来源
             item["comment"] = comment  #评论数
             yield item
         except:
             continue
Example No. 3
    def second(self, response):
        item = ToutiaoItem()
        # print(response.url)
        # Article title
        title_re = re.compile('<title>(.*?)</title>', re.S)
        article_title = re.findall(title_re, response.text)
        if article_title:
            article_title = ''.join(article_title)
        # Article content
        content_re = re.compile(",\n      content: \'(.*?)',\n      groupId",
                                re.S)
        article_content = re.findall(content_re, response.text)
        if article_content:
            article_content = article_content[0]
            article_content = article_content.replace('&lt;', '<').replace(
                '&gt;', '>').replace('&#x3D;', '=').replace('&quot;', '"')
            # image_re = re.compile('')
            # doc = pq(article_content)
            # article_content = doc('p').text()
            # https://www.cnblogs.com/lei0213/p/7676254.html
            # print(article_content)
            # with open('detail.txt', 'a', encoding='utf-8') as f:
            #     f.write(article_content+'\n')
            item['article_url'] = response.url
            item['article_title'] = article_title
            item['article_content'] = article_content

            yield item
Example No. 4
    def parse(self, response):
        # Convert the JSON response into Python objects
        data = json.loads(response.text)['data']
        for c in data:
            item = ToutiaoItem()
            each = c['comment']
            # User name
            item['name'] = each['user_name']
            # Comment text
            item['content'] = each['text']
            # Number of likes
            item['digg_count'] = each['digg_count']
            # Number of replies
            item['reply_count'] = each['reply_count']

            yield item

        self.page += 20
        yield scrapy.Request(self.url + str(self.page), callback=self.parse)
Example No. 5
 def parse(self, response):
     # self.log(response.text)
     title = response.css('.article-title::text').extract_first()
     date = response.css('.article-sub span+ span').extract_first()
     content = ''.join(response.css('.article-content p::text').extract())
     abstract = content[:140]
     if title is not None and date is not None and abstract is not None:
         title = title.strip()
         date = date.strip()
         date = self.reg_filter.sub('', date)
         abstract = abstract.strip()
         if title != '' and date != '' and abstract != '':
             item = ToutiaoItem()
             item["title"] = title
             item["abstract"] = abstract
             item["url"] = response.url
             item["date"] = date
             item["content"] = content
             item["mediaName"] = self.mediaName
             item["keyword"] = response.meta["keyword"]
             yield item
Example No. 6
    def parse(self, response):
        result = json.loads(response.text)

        print('开始')
        headers = self.get_headers()

        if result.get('data'):
            item = ToutiaoItem()
            list = []
            for content in result.get('data'):

                get = content

                re_compile = re.compile('"Abstract":"(.*?)","abstract"')
                match = re.findall(re_compile, str(get))
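                # Keep only entries whose abstract mentions "壁纸" (wallpaper)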
                if match[0].find('壁纸') != -1:
                    re_compile = re.compile(
                        '.*?"url_list":.{"url":"(http://sf\d-ttcdn-tos.pstatp.com/img/pgc-image/.*?~400x400_c5.webp)"},{"url":"'
                    )
                    findall = re.findall(re_compile, str(get))
                    # 1. Download the images one at a time
                    # if len(findall)<=4:
                    #     for url in findall:
                    #         item['name']=match[0]
                    #         item['img_url'] =url
                    #         yield item
                    #     2. Download all of an article's images in one item
                    if len(findall) <= 4:
                        item['name'] = match[0]
                        item['img_url'] = findall
                        yield item
                # list.append(findall)
            # item['img_url'] = list
            #     item['img_url'] = findall
            #     yield item
            offset = result.get('offset')
            yield Request(url=self.url.format(offset=offset),
                          callback=self.parse,
                          headers=headers,
                          dont_filter=True)
Example No. 7
 def parse(self, response):
     # Parse the JSON response
     data = json.loads(response.text)['data']
     for i in data:
         # Create one Item instance per comment
         item = ToutiaoItem()
         comment = i['comment']
         item['Num'] = self.Num
         item['text'] = comment['text']
         item['name'] = comment['user_name']
         item['like'] = comment['digg_count']
         item['reply'] = comment['reply_count']
         # Print the fields for a quick check
         print(u'\n')
         print(u'序号:', item['Num'])
         print(u'评论:', item['text'])
         print(u'名字:', item['name'])
         print(u'点赞:', item['like'])
         print(u'回复:', item['reply'])
         # Num is kept as a spider-level counter to make progress easy to track
         self.Num += 1
         yield item
Example No. 8
 def parse(self, response):
     # It was unclear whether this response is the HtmlResponse returned by the middleware -- see the next lines
     # Testing shows the HtmlResponse returned by the downloader middleware is indeed passed into this parse() callback
     # Earlier tests kept failing because a wrong xpath in the middleware meant no HtmlResponse was returned, only print('error message') ran
     # i.e. the middleware returned None, the request was not intercepted, and response was still the original page source; that page loads its feed via AJAX, so the elements could never be found
     selector_list = response.xpath('//div[@class="wcommonFeed"]/ul/li')
     # print(selector_list,'*'*100)
     # print(len(response.text),'*'*100)
     for li in selector_list:
         try:
             item = ToutiaoItem()
             item["title"] = li.xpath(
                 './/a[@class="link title"]/text()').extract()[0].strip()
             item["source"] = li.xpath(
                 './/a[@class="lbtn source"]/text()').extract()[0].strip(
                     '·').strip()  # this dot is hard to strip; try copying the exact character from the page source
             comment = li.xpath(
                 './/a[@class="lbtn comment"]/text()').extract()[0].strip()
             item["comment"] = re.findall('(\d+?)评论', comment)[0]
             yield item
         except:
             continue  # on any error, skip this item and move on to the next one
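The comments in this example describe a downloader middleware that renders the AJAX page and hands an HtmlResponse to parse(). A minimal sketch of such a middleware, assuming Selenium with Chrome; the class name and driver setup are illustrative, not taken from the example:

from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumMiddleware(object):
    def __init__(self):
        # A real browser session, so the AJAX-loaded feed is present in the DOM.
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        # Returning an HtmlResponse here short-circuits the normal download;
        # returning None would hand the spider the original, non-rendered page.
        return HtmlResponse(url=request.url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)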
Example No. 9
    def parse(self, response):
        item = ToutiaoItem()
        list_selectors = response.xpath("//div[@class='wcommonFeed']/ul/li")
        for li in list_selectors:
            try:
                title = li.xpath(".//a[@class='link title']/text()").extract()
                title = title[0].strip(" ")
                print("title:", title)
                source = li.xpath(".//a[@class='lbtn source']/text()").extract()
                source = source[0].strip("⋅").strip(" ")
                print("source:", source)
                comment = li.xpath(".//a[@class='lbtn comment']/text()")
                comment = comment.re("(.*?)评论")[0]
                comment = "".join(comment.split())
                print("comment:", comment)

                item['title'] = title
                item['source'] = source
                item['comment'] = comment
                yield item
            except:
                continue
Example No. 10
 def parse(self, response):
     js = json.loads(response.body.decode('utf-8'))
     data = js['data']
     # cookie = response.headers.getlist('Set-Cookie')[0].split(';')[0]
     # print(cookie)
     for d in data:
         image_list = d.get('image_list')
         if image_list:
             for item in image_list:
                 toutiao_item = ToutiaoItem()
                 url = item.get('url')
                 new_image_url = url.replace('list', 'large').replace(
                     '/190x124', '')
                 pic_id = hashlib.md5(
                     new_image_url.encode("utf8")).hexdigest()
                 result = collection.find_one({"pic_id": pic_id})
                 if (result):
                     print("continue")
                     continue
                 else:
                     toutiao_item["pic_id"] = pic_id
                     toutiao_item["pic_link"] = new_image_url
                     toutiao_item["pic_desc"] = self.getPicDesc()
                     yield toutiao_item
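The example above avoids re-downloading images by hashing the normalized URL and looking the hash up in MongoDB before yielding. A minimal sketch of that check, assuming a pymongo collection like the `collection` used above; the helper name is illustrative:

import hashlib

def is_new_image(collection, image_url):
    # The md5 of the normalized URL serves as the unique picture id.
    pic_id = hashlib.md5(image_url.encode("utf8")).hexdigest()
    # Only ids that are not already stored should be yielded and saved.
    return collection.find_one({"pic_id": pic_id}) is None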
Example No. 11
    def parse(self, response):
        item = ToutiaoItem()
        list_selector = response.xpath("//div[@class='single-mode-rbox-inner']")
        for div in list_selector:
            try:
                # Title
                title = div.xpath("./div[@class='title-box']/a/text()").extract()
                # Strip spaces from the title
                title = title[0].strip(" ")
                # Source
                source = div.xpath("./div[@class='footer-bar']/div[1]/a[2]/text()").extract()
                # Strip the dot and the full-width space
                source = source[1].strip("·").strip(" ")
                # Comment count
                comment = div.xpath("./div[@class='footer-bar']/div[1]/a[3]/text()").extract()
                comment = comment[1]
                item["title"] = title
                item["source"] = source
                item["comment"] = comment

                print("item:" + item)
                yield item
            except:
                continue
Example No. 12
    def parse1(self, response):
        items = ToutiaoItem()

        # Category
        items['class_id'] = '1'
        # Article ID
        items['article_id'] = str(response.meta['article_id'])
        # Title
        items['title'] = response.xpath(
            '//*[@id="app"]/div[1]/div/div[1]/div/div[1]/h2').xpath(
                'string(.)').extract_first()
        # Author
        items['writer'] = response.xpath(
            '//*[@id="app"]/div[1]/div/div[1]/div/div[2]/a').xpath(
                'string(.)').extract_first()
        # Publish time
        items['publish_time'] = str(response.meta['publish_time'])

        # Read count
        items['read_number'] = str(response.meta['read_number'])
        # Summary
        items['summary'] = str(response.meta['summary'])
        # Content
        article = ''  # default, so the checks below never hit an unbound name
        if response.css('div.js-article-detail p').extract():
            article = response.css('div.js-article-detail p').extract()
            article = ''.join(article)

        # Keywords
        try:
            tags = response.xpath('//div[@class="tags"]/a').xpath(
                'string(.)').extract()
            items['tags'] = ','.join(tags)
        except:
            pass
        time.sleep(1)

        # Image download
        # Create a folder named after the date
        file_day = items['publish_time'].split('/')[0]
        try:
            os.mkdir('img/tt_img/' + file_day)
        except:
            pass
        time.sleep(1)
        # Walk all the date folders under the img directory
        path = r'E:\安培斯通\amber-spider\\toutiao\img\\tt_img'
        # path = r'/home/amber-spider/toutiao/img/tt_img'
        date_files = os.listdir(path)
        for file in date_files:
            fi = os.path.join(path, file)
            fil = os.path.join(path, file).split('/')[-1]

            if fil == file_day:
                try:
                    os.mkdir(fi + '/' + items['article_id'])
                except:
                    pass
                if response.css(
                        'div.js-article-detail p img::attr(src)').extract():
                    img_url = response.css(
                        'div.js-article-detail p img::attr(src)').extract()
                time.sleep(1)
                try:
                    for i in img_url:

                        # Take the URL of the original full-size image
                        url = i.split('_')[0]

                        request.urlretrieve(
                            url, fi + '/' + items['article_id'] + '/' +
                            url.split("/")[-1] + '.jpg')
                        time.sleep(1)
                except:
                    pass

            # Build the OSS upload path
                oss_url = 'https://capital-future-imgs.oss-cn-beijing.aliyuncs.com/tt_img/' + fil + '/' + items[
                    'article_id'] + '/'
                if 'https://img.jinse.com/' in article and '_image3.png' in article:
                    items['article'] = article.replace(
                        'https://img.jinse.com/', oss_url).replace(
                            '_image3.png',
                            '.jpg?x-oss-process=image/resize,l_500')
                elif 'https://img.jinse.com/' in article and '_watermarknone.png' in article:
                    items['article'] = article.replace(
                        'https://img.jinse.com/', oss_url).replace(
                            '_watermarknone.png',
                            '.jpg?x-oss-process=image/resize,l_500')
                else:
                    items['article'] = article
            # Image address inside the content
                if 'https://capital-future-imgs.oss-cn-beijing.aliyuncs.com/' in items[
                        'article']:
                    oss_url1 = items['article'].split('src=')[1].split(
                        '.jpg')[0].replace(
                            '"', '') + '.jpg?x-oss-process=image/resize,l_500'
                    items['oss_url1'] = oss_url1
                else:
                    items[
                        'oss_url1'] = 'https://capital-future-imgs.oss-cn-beijing.aliyuncs.com/images/new_default.png?x-oss-process=image/resize,l_500'

                yield self._clean_str(items)

    # Stop the crawler once articles are too old
    # Publish time of the crawled article
        array_time1 = time.strptime(items['publish_time'], "%Y-%m-%d/%H:%M:%S")
        crawl_time = time.mktime(array_time1)
        # Get the current time
        now = datetime.datetime.now()
        # Current time minus 5 days (1 day = 86400 seconds)
        sched_timer = str(
            datetime.datetime(now.year, now.month, now.day, now.hour,
                              now.minute, now.second) -
            datetime.timedelta(seconds=86400 * 5))
        array_time = time.strptime(sched_timer, "%Y-%m-%d %H:%M:%S")
        now_time = time.mktime(array_time)
        if crawl_time < now_time:
            self.crawler.stop()
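The strftime/strptime/mktime round-trip at the end only implements a "published more than five days ago" cutoff. A minimal equivalent sketch using datetime directly, assuming publish_time keeps the same "%Y-%m-%d/%H:%M:%S" format; the helper name is illustrative:

from datetime import datetime, timedelta

def is_older_than(publish_time, days=5):
    # publish_time looks like "2019-01-01/12:00:00" in this spider.
    published = datetime.strptime(publish_time, "%Y-%m-%d/%H:%M:%S")
    return published < datetime.now() - timedelta(days=days)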
Example No. 13
    def sub_nav(self, response):
        page = Selector(response)

        # print(response.text)
        # URLs of all the sub-channel tabs
        sub_nav_tips1 = page.xpath(
            '//div[@class="channel"]/ul/li/a/@href').extract()
        del sub_nav_tips1[:2], sub_nav_tips1[-1], sub_nav_tips1[1]
        sub_nav_tips2 = page.xpath(
            '//div[@class="channel-more-layer"]/ul/li/a/@href').extract()
        sub_nav_tips = sub_nav_tips1 + sub_nav_tips2
        # print(sub_nav_tips)
        # Names of the sub-channel tabs
        sub_names1 = page.xpath(
            '//div[@class="channel"]/ul/li/a/span/text()').extract()
        del sub_names1[:2], sub_names1[-1], sub_names1[1]
        sub_names2 = page.xpath(
            '//div[@class="channel-more-layer"]/ul/li/a/span/text()').extract(
            )
        sub_names = sub_names1 + sub_names2
        # print(sub_names)
        # Visit each sub-channel in turn
        for i in range(0, len(sub_nav_tips)):
            items = []
            # Open the sub-channel page in the browser
            self.brower.get('https://www.toutiao.com' + sub_nav_tips[i])
            # Timestamp in seconds
            now = round(time.time())
            # Get the encrypted _signature by running the page's TAC.sign() in the browser
            signature = self.brower.execute_script('return TAC.sign(' +
                                                   str(now) + ')')
            # print(signature)
            # Collect cookies from the browser session
            cookie = self.brower.get_cookies()
            cookie = [item['name'] + "=" + item['value'] for item in cookie]
            cookiestr = '; '.join(item for item in cookie)
            # print(cookiestr)

            header1 = {
                'Host': 'www.toutiao.com',
                'User-Agent':
                '"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"',
                # 'Referer': 'https://www.toutiao.com/ch/news_hot/',
                "Cookie": cookiestr
            }

            send_data = {
                'category': sub_nav_tips[i][4:-1],
                'utm_source': 'toutiao',
                'widen': '1',
                'max_behot_time': now,
                '_signature': signature
            }
            # Build the AJAX URL
            url = self.ajax_url_base + urlencode(send_data)
            # print(url)
            html = requests.get(url, headers=header1, verify=False)
            # Parse the returned JSON data
            json_datas = json.loads(html.text)['data']
            # print(json_datas)
            for json_data in json_datas:
                item = ToutiaoItem()
                # print(type(json_data))
                item['title'] = json_data['title']
                # Some fields may be missing
                try:
                    item[
                        'source_url'] = 'https://www.toutiao.com/a' + json_data[
                            'source_url'][7:]
                except:
                    item['source_url'] = ''
                try:
                    item['abstract'] = json_data['abstract']
                except:
                    item['abstract'] = ''
                try:
                    item['source'] = json_data['source']
                except:
                    item['source'] = ''
                try:
                    item['tag'] = json_data['tag']
                except:
                    item['tag'] = ''
                try:
                    item['chinese_tag'] = json_data['chinese_tag']
                except:
                    item['chinese_tag'] = '无标签类别'
                item['news_class'] = sub_names[i]
                yield item
        self.brower.quit()
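The core of this example is the signing step: a real browser session supplies the cookies, and the page's own TAC.sign() JavaScript produces _signature for the AJAX call. A condensed sketch of just that step, assuming a Selenium browser object like self.brower above; the helper name is illustrative:

import time

def signed_params(browser, category):
    now = round(time.time())
    # Ask the page's own JS for the signature of the current timestamp.
    signature = browser.execute_script('return TAC.sign({})'.format(now))
    # Serialize the browser cookies for a plain requests call.
    cookiestr = '; '.join('{}={}'.format(c['name'], c['value'])
                          for c in browser.get_cookies())
    params = {
        'category': category,
        'utm_source': 'toutiao',
        'widen': '1',
        'max_behot_time': now,
        '_signature': signature,
    }
    return params, cookiestr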