Example No. 1
    def deal_content_from_news(self, response):
        # print response.body
        # data_TCPI holds [title, content, publish_time, img_urls]
        data_TCPI = gather_all_funtion.get_result_you_need(response)
        content = data_TCPI[1]
        title = data_TCPI[0]
        publish_time = data_TCPI[2]
        img_urls = data_TCPI[3]

        print content
        print title
        print publish_time
        print img_urls
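Every example below indexes positionally into the tuple returned by get_result_you_need. A minimal sketch that names those positions instead (the [title, content, publish_time, img_urls] order is inferred from the indices used in the examples; PageData and get_page_data are illustrative names, not from the source):

from collections import namedtuple

# Field order inferred from the indices used in the examples.
PageData = namedtuple('PageData',
                      ['title', 'content', 'publish_time', 'img_urls'])

def get_page_data(response):
    # gather_all_funtion is the module the spiders above already import
    return PageData(*gather_all_funtion.get_result_you_need(response))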
Example No. 2
    def deal_content2(self, response):
        # print response.url
        # Save_org_file(plantform='sohu', date_time=response.meta['publish_time'], urlOruid=response.url,
        #               newsidOrtid=response.meta['id'], datatype='news', full_data=response.body)
        # Save_zip(plantform='sohu', date_time=response.meta['publish_time'], urlOruid=response.url,
        #          newsidOrtid=response.meta['id'], datatype='news')

        data_TCPI = gather_all_funtion.get_result_you_need(response)
        content = data_TCPI[1]
        # publish_time=data_TCPI[2]
        img_urls = data_TCPI[3]
        # time_format = '%Y-%m-%d'
        # spider_time = time.strftime(time_format, time.localtime())
        # publish_time=time.strftime(time_format,time.localtime(float(response.meta['publish_time'])))

        # print response.body
        data = response.meta['data']
        data['content'] = content
        data['reply_nodes'] = []
        data['img_urls'] = img_urls

        Re_find_comment_id = re.compile(r'cms_id: \'.*?\'')
        try:
            comment_id = Re_find_comment_id.findall(response.body)
            print content
            print '\n'
            print data_TCPI[0]
            comment_id_find_by_re = comment_id[0]
            comment_id_find_by_re = comment_id_find_by_re.split("\'")[1]
            #https://apiv2.sohu.com/api/comment/list?page_size=10&topic_id=3500748995&page_no=2
            url_to_comments = 'https://apiv2.sohu.com/api/comment/list?page_size=10&topic_id=' + str(
                comment_id_find_by_re) + '&page_no=2'
            yield scrapy.Request(url=url_to_comments,
                                 headers=response.headers,
                                 meta={
                                     'plant_form': 'None',
                                     'data': data,
                                     'download_timeout': 3,
                                     'isIndex_request': True
                                 })
        except Exception as e:
            print e
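The try block above pulls the quoted cms_id out with findall() plus split(). A minimal sketch of the same extraction with a capturing group, returning None instead of raising when the id is absent (extract_cms_id is a hypothetical helper, not from the source):

import re

RE_CMS_ID = re.compile(r"cms_id: '(.*?)'")

def extract_cms_id(body):
    # capture the quoted value directly; None means no cms_id on the page
    match = RE_CMS_ID.search(body)
    return match.group(1) if match else None

# usage against the same comment API as above:
# topic_id = extract_cms_id(response.body)
# if topic_id:
#     url = ('https://apiv2.sohu.com/api/comment/list'
#            '?page_size=10&topic_id=%s&page_no=2' % topic_id)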
Example No. 3
    def deal_content(self, response):  # Taihainet articles can have a next page; the code for that is not written yet
        # print response
        data = gather_all_funtion.get_result_you_need(response)
        # for element in gather_all_funtion
        content = data[1]
        # imglist=data[1]
        # publish_user=response.xpath('/html/body/div[3]/span[2]/span/a')  # the poster is not captured; write a separate module for it later
        time_format = '%Y-%m-%d'
        spider_time = time.strftime(time_format, time.localtime())
        publish_time = data[2]
        img_urls = data[3]

        if len(publish_time.split(':')) == 2:
            publish_time += ':00'
        else:
            print publish_time
            publish_time = '2211-11-11 11:11:11'

        data = {}
        data['title'] = response.meta['data']['title']
        data['id'] = response.meta['data']['id']
        data['url'] = response.meta['data']['url']
        data['spider_time'] = spider_time
        data['img_urls'] = img_urls
        data['publish_time'] = publish_time
        # data['publish_user']=publish_user
        data['content'] = content

        print '\n\n'
        persionalSetting.Save_org_file('taihainet',
                                       date_time=data['publish_time'],
                                       urlOruid=data['url'],
                                       newsidOrtid=data['id'],
                                       datatype='news',
                                       full_data=response.body)
        persionalSetting.Save_result(plantform='taihainet',
                                     date_time=data['publish_time'],
                                     urlOruid=data['url'],
                                     newsidOrtid=data['id'],
                                     datatype='news',
                                     full_data={'data': data})
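Example 3's timestamp handling (pad a 'YYYY-MM-DD HH:MM' value with seconds, otherwise fall back to a sentinel) could live in a small helper. A minimal sketch, with pad_publish_time as an illustrative name:

def pad_publish_time(publish_time):
    # two ':'-separated fields means 'YYYY-MM-DD HH:MM'; append the seconds
    if len(publish_time.split(':')) == 2:
        return publish_time + ':00'
    # sentinel used by the example for timestamps it cannot parse
    return '2211-11-11 11:11:11'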
Example No. 4
    def deal_content(self, response):
        if response.status == 404:
            return  # is bailing out on a 404 like this reliable?

        if response.request.cookies:
            cookies = response.request.cookies
        else:
            cookies = {}
        headers = response.request.headers
        if 'Set-Cookie' in response.headers.keys():  # Set-Cookie arrives on the response, not the request
            print response.headers['Set-Cookie']
            for headers_key in response.headers.keys():
                if 'Set-Cookie' in headers_key:
                    set_cookie = response.headers[headers_key]
                    cookies_name = set_cookie.split(';')[0].split('=')
                    cookies[cookies_name[0]] = cookies_name[1]
                else:
                    headers[headers_key] = response.headers[headers_key]

        data_TCPI = gather_all_funtion.get_result_you_need(response)
        title = data_TCPI[0]
        content = data_TCPI[1]
        img_urls = data_TCPI[3]
        time_format = '%Y-%m-%d'
        spider_time = time.strftime(time_format, time.localtime())

        # try:
        #     publish_time=response.xpath('/html/body/div[4]/div[2]/p/span[4]')
        #     print publish_time
        # except Exception as e:
        #     print e
        #     print 'time wrong'
        publish_time = data_TCPI[2]

        id = str(response.url.split('/')[-1])
        data = {
            'url': response.url,
            'content': content,
            'title': title,
            'publish_time': publish_time,
            'img_urls': img_urls,
            'id': id,
            'spider_time': spider_time,
            'reply_node': []
        }
        # cmt_url='http://changyan.sohu.com/api/2/topic/comments?client_id=cyrHnxhFx&page_size=30&topic_id=630645353&page_no=1'
        Re_find_sid = re.compile(r'sid="\d*?"')
        sid = Re_find_sid.findall(response.body)  # find the sid that keys the comment thread
        print sid
        try:
            sidnum = sid[0].split('"')[1]

            cmt_url_with_out_num = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cyrHnxhFx&page_size=30&hot_size=5&topic_source_id='
            cmt_url_to_visit = cmt_url_with_out_num + sidnum

            yield scrapy.Request(url=cmt_url_to_visit,
                                 headers=headers,
                                 cookies=cookies,
                                 meta=data)
            # Save_result(plantform='chengdu',date_time=publish_time,urlOruid=response.url,newsidOrtid=id,datatype='news',full_data=data)
            print data  # the same regex on an identical page can still fail to find a sid
        except Exception as e:
            print e
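The cookie-merging block above reappears almost verbatim in Examples 5 and 6. A minimal sketch of that logic factored into one helper (merge_cookies_and_headers is an illustrative name; str.partition is used so a '=' inside the cookie value does not truncate it):

def merge_cookies_and_headers(response):
    # start from the cookies and headers of the request that produced
    # this response, as the examples do
    cookies = response.request.cookies or {}
    headers = response.request.headers
    for key in response.headers.keys():
        if 'Set-Cookie' in key:
            # keep only the 'name=value' pair before the first ';'
            name, _, value = response.headers[key].split(';')[0].partition('=')
            cookies[name] = value
        else:
            headers[key] = response.headers[key]
    return cookies, headers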
Example No. 5
    def deal_content(self, response):
        if not response.meta['isNextPage']:
            data_TCPI = gather_all_funtion.get_result_you_need(response)
            print data_TCPI
            content = data_TCPI[1]
            data = response.meta
            # data['content']=content
            data['data']['content'] = content
            # publish_time may be relative: '刚刚' (just now), 'N小时前'
            # (N hours ago) or 'N分钟前' (N minutes ago)
            publish_time = response.meta['data']['publish_time']
            if publish_time == u'刚刚':
                publish_time = time.time()
            elif u'小时前' in publish_time:
                time_pass = int(publish_time.replace(u'小时前', '')) * 60 * 60
                publish_time = time.time() - time_pass
            elif u'分钟前' in publish_time:
                time_pass = int(publish_time.replace(u'分钟前', '')) * 60
                publish_time = time.time() - time_pass
            elif '-' in publish_time and len(publish_time) == 5:
                publish_time = '2017-' + publish_time
            response.meta['data']['publish_time'] = publish_time
        else:
            data_TCPI = gather_all_funtion.get_result_you_need(response)
            content1 = data_TCPI[1]
            response.meta['data']['content'] += content1
        # body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5
        next_page_selector = response.css(
            'body > div.scrollBox.mt10 > div.article > div.mb10.mt5.fs14 > a.page-next.ml5'
        )
        if next_page_selector:
            # .get('href') returns the first <a> tag's HTML; 'href' is only the default
            next_page_html = next_page_selector.get('href')
            if next_page_html and 'href' in next_page_html and len(
                    next_page_html.split('"')[1]) > 3:
                # e.g. <a href="/v/1000010001000802_2.html" style="color: #069700;" class="page-next ml5">下一页</a> ("next page")
                if response.request.cookies:
                    cookies = response.request.cookies
                else:
                    cookies = {}
                headers = response.request.headers
                if 'Set-Cookie' in response.headers.keys():
                    print response.headers['Set-Cookie']
                    for headers_key in response.headers.keys():
                        if 'Set-Cookie' in headers_key:
                            set_cookie = response.headers[headers_key]
                            cookies_name = set_cookie.split(';')[0].split('=')
                            cookies[cookies_name[0]] = cookies_name[1]
                        else:
                            headers[headers_key] = response.headers[headers_key]
                next_page_url = next_page_html.split('"')[1]
                next_url = 'http://m.xilu.com' + next_page_url
                print next_url
                response.meta['isNextPage'] = True
                yield scrapy.Request(url=next_url,
                                     headers=headers,
                                     meta=response.meta,
                                     cookies=cookies,
                                     priority=2)
            else:
                # If there is no next page to follow, fetch the comments
                # instead; this is the same request as the else branch
                # below, which also goes to the comments URL.
                cmt_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cysYw3AKM&page_size=30&hot_size=10&topic_source_id='
                this_page_id = response.url.split('/')[-1].split('.')[0]
                cmt_url = cmt_url_without_id + this_page_id
                yield scrapy.Request(url=cmt_url,
                                     headers=response.headers,
                                     meta=response.meta,
                                     priority=2)
        else:
            cmt_url_without_id = 'http://changyan.sohu.com/api/3/topic/liteload?&client_id=cysYw3AKM&page_size=30&hot_size=10&topic_source_id='
            this_page_id = response.url.split('/')[-1].split('.')[0]
            cmt_url = cmt_url_without_id + this_page_id
            yield scrapy.Request(url=cmt_url,
                                 headers=response.headers,
                                 meta=response.meta,
                                 priority=2)
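The relative-timestamp branch in Example 5 ('刚刚', 'N小时前', 'N分钟前') is self-contained enough to extract. A minimal sketch (normalize_publish_time is an illustrative name; the hard-coded '2017-' year prefix mirrors the example):

# -*- coding: utf-8 -*-
import time

def normalize_publish_time(publish_time):
    if publish_time == u'刚刚':              # "just now"
        return time.time()
    if u'小时前' in publish_time:            # "N hours ago"
        return time.time() - int(publish_time.replace(u'小时前', '')) * 3600
    if u'分钟前' in publish_time:            # "N minutes ago"
        return time.time() - int(publish_time.replace(u'分钟前', '')) * 60
    if '-' in publish_time and len(publish_time) == 5:
        return '2017-' + publish_time       # bare 'MM-DD'; year hard-coded as in the example
    return publish_time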
Example No. 6
    def deal_content(self, response):
        if '<div riot-tag="player"></div>' in response.body and '<div riot-tag="abstract"></div>' in response.body and '<div riot-tag="hotvideo"></div>' in response.body:
            return  # these riot tags mean the page is a video page
        if 'toutiao' not in response.url:
            return
        # TODO: add a picture-page check here, because boards without
        # pictures can also hand a picture module to this callback
        Re_find_pattern1 = re.compile(r'\bvar gallery =.*?\]\}')
        Re_find_pattern1 = re.compile(r'\bvar gallery =.*?\]\}')

        if response.request.cookies:
            cookies = response.request.cookies
        else:
            cookies = {}
        headers = response.request.headers
        if 'Set-Cookie' in response.headers.keys():  # Set-Cookie arrives on the response, not the request
            print response.headers['Set-Cookie']
            for headers_key in response.headers.keys():
                if 'Set-Cookie' in headers_key:
                    set_cookie = response.headers[headers_key]
                    cookies_name = set_cookie.split(';')[0].split('=')
                    cookies[cookies_name[0]] = cookies_name[1]
                else:
                    headers[headers_key] = response.headers[headers_key]
        thismeta = response.meta
        print response.headers
        # a separate xpathget Redis store designed for the /i and /a page variants
        # Re_find_urlwithnum=re.compile(r'toutiao.com/\d')
        # if 'toutiao.com/i' in response.url:
        #     response.meta['plant_form']='toutiao_i'
        # elif 'toutiao.com/a' in response.url:
        #     response.meta['plant_form']='toutiao_a'
        # elif Re_find_urlwithnum.findall(response.url):
        #     response.meta['plant_form']='toutiao'

        # ---------------------------- content-handling module added on 7-19

        # added code that recognises the picture (gallery) module
        img_urls = []
        content = ''
        pictureinfo = Re_find_pattern1.findall(response.body)
        if pictureinfo:  # gallery pages take this branch, which also takes load off the xpath path below
            picture_data = pictureinfo[0]
            picture_data_json_original = picture_data.split('=')[1]
            datajson = json.loads(picture_data_json_original)
            for picture_info in datajson['sub_images']:
                img_url_in_for = picture_info['url']
                img_urls.append(img_url_in_for)
            for content_info in datajson['sub_abstracts']:
                content += content_info
            title = datajson['sub_titles'][0]
            # Re_find_publish_time=re.compile(r'')
            Re_find_time = re.compile(r'publish_time:.*?\,')
            publish_time = Re_find_time.findall(
                response.body)[0].split("'")[1].replace('/', '-')
        else:
            data_TCPI = get_result_you_need(response)
            content = data_TCPI[1]
            img_urls = data_TCPI[3]
            publish_time = data_TCPI[2]
        # title=response.xpath('//*[@id="article-main"]/h1').extract()
        # content=''
        # for i in response.xpath('//*[@id="article-main"]/div[2]/div/p[2]').extract():
        #     content+=i
        # publish_time=response.xpath('//*[@id="article-main"]/div[1]/span[@class="time"]').extract()

        if response.meta['special_key'] == 'is_picture':
            Re_find_content_in_html = re.compile(
                r'\bgallery: .*?siblingList\b')
            Re_find_content_in_html.findall(response.body)  # NOTE: match result is not used
        if len(content) < 10:
            Re_find_content_in_html = re.compile(
                r'\bgallery: .*?siblingList\b')  # NOTE: compiled but never applied

        # response.meta['plant_form']='toutiao'

        data = {
            'id': thismeta['id'],
            'url': thismeta['url'],
            'reply_count': thismeta['reply_count'],
            'title': thismeta['title'],
            'publish_user': thismeta['publish_user'],
            'spider_time': thismeta['spider_time'],
            'publish_user_photo': thismeta['publish_user_photo'],
            'content': content,
            'img_urls': img_urls,
            'video_urls': [],
            'publish_time': publish_time,
            'reply_nodes': [],
        }

        #http://www.toutiao.com/api/comment/list/?group_id=6438917736949612802&item_id=6438920814917059074&offset=5&count=15
        if '.toutiao.com' in response.url:
            print response.body
            xpath_data = response.xpath('//div/article/div[1]/h1')  # NOTE: unused
            print response

            # everything below gathers the ids needed to follow on to the matching comments
            Re_content_item_id = re.compile(r'item_id: \'.*?\'')
            Re_content_qid = re.compile(r'qid : \".*?\"')
            # print response.body  # the plain html document
            item_id_re = Re_content_item_id.findall(response.body)
            print item_id_re
            if not item_id_re:
                qid_re = Re_content_qid.findall(response.body)
                print qid_re[0].split('"')[1]  # print the qid found in the page body
                # yield scrapy.Request()
            else:
                print item_id_re[0].split("'")[1]
                thisurl = response.url.split('com/a')
                nexturl = 'http://www.toutiao.com/api/comment/list/?group_id=' + thisurl[
                    1].replace('/', '') + '&item_id=' + str(
                        item_id_re[0].split("'")[1]) + '&offset=0&count=20'
                yield scrapy.Request(url=nexturl,
                                     cookies=cookies,
                                     headers=headers,
                                     meta={
                                         'data': data,
                                         'plant_form': 'toutiao',
                                         'isIndex_request': False
                                     })
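Example 6 slices the inline `var gallery = {...}` JSON out of the body with split('='), which truncates if the JSON itself contains a '='. A minimal sketch of the same extraction with a capturing group (parse_gallery is an illustrative name; the sub_images, sub_abstracts, and sub_titles keys are taken from the example):

import json
import re

RE_GALLERY = re.compile(r'\bvar gallery = (.*?\]\})')

def parse_gallery(body):
    match = RE_GALLERY.search(body)
    if not match:
        return None     # not a gallery page
    gallery = json.loads(match.group(1))
    return {
        'img_urls': [sub['url'] for sub in gallery['sub_images']],
        'content': ''.join(gallery['sub_abstracts']),
        'title': gallery['sub_titles'][0],
    }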