Example #1
    def parseNum(self, response):

        # Declare the item
        infoItem = userInfoItem()

        # Store the crawled uid and the crawl timestamp
        infoItem['uid'] = response.meta['uid']
        infoItem['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        # Scrape the follower count, following count and post count
        sel = Selector(response)
        infoItem['weiboNum'] = int(sel.xpath('//div[@class="tip2"]/span/text()').extract_first()[3: -1])
        infoItem['fansNum'] = int(sel.xpath('//div[@class="tip2"]/a[1]/text()').extract_first()[3: -1])
        infoItem['conNum'] = int(sel.xpath('//div[@class="tip2"]/a[2]/text()').extract_first()[3: -1])

        # Get the link to the detail page and follow it
        url = response.meta['urlInfo']
        request = Request(url, meta={'infoItem': infoItem}, callback=self.parseInfo)
        request.cookies = random.choice(self.COOKIEPOOL)

        '''
        # A proxy could also be set here, but there is no validation mechanism
        if self.IPPOOL.__len__() != 0 :
            proxy = random.choice(self.IPPOOL)
            request.meta['proxy'] = proxy
        '''

        yield request
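
The callback above fills a userInfoItem with the fields uid, crawlTime, weiboNum, fansNum and conNum. The item class itself is not shown in this example, so the definition below is only a minimal sketch inferred from those field names (the real project may declare more fields):

    import scrapy

    class userInfoItem(scrapy.Item):
        # Hypothetical item definition, inferred from the fields assigned in parseNum
        uid = scrapy.Field()        # user id carried over from response.meta
        crawlTime = scrapy.Field()  # timestamp of when the profile was crawled
        weiboNum = scrapy.Field()   # number of posts
        fansNum = scrapy.Field()    # number of followers
        conNum = scrapy.Field()     # number of accounts the user follows
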
Example #2
    def process_request(self, request: Request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Set the request headers
        # scrapy.Request
        # request.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        request.headers['User-Agent'] = ua.get()

        request.headers['Sec-Fetch-Mode'] = 'cors'
        request.headers['Sec-Fetch-Site'] = 'none'
        request.headers['Sec-Fetch-User'] = '******'
        request.headers['Upgrade-Insecure-Requests'] = 1

        # Note: cookies must be a dict
        request.cookies = cookies.get()

        # To set a proxy for the request: request.meta['proxy'] = 'http://ip:port'

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None
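
The ua.get() and cookies.get() calls above come from project helpers that are not included in the example. A minimal sketch of what the user-agent helper might look like, assuming it simply draws a random entry from an in-memory pool (the module name and pool contents are assumptions):

    # ua.py -- hypothetical helper module assumed by the middleware above
    import random

    _USER_AGENT_POOL = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        # ... more user-agent strings
    ]

    def get():
        # Pick a random user-agent string for the next request
        return random.choice(_USER_AGENT_POOL)

A cookies.get() helper would presumably work the same way over a list of cookie dicts, since the middleware assigns its return value directly to request.cookies.
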
Example #3
    def process_request(self, request: Request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Set the request headers
        request.headers['User-Agent'] = user_agents.get_ua()

        # Set the cookies
        request.cookies = cookies.get_cookie()

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None
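
Downloader middlewares like the two above only take effect once they are registered in the project settings. A minimal sketch of that registration (the module path, class name and priority value are placeholders, not taken from the original projects):

    # settings.py -- register the downloader middleware so process_request is called
    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.CustomHeadersMiddleware': 543,
    }
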
Example #4
    def start_requests(self):

        # Initialize the IP pool (deprecated)
        '''
        print u'Initializing IPPOOL...'
        ipUrl = 'http://www.xicidaili.com/nn/'
        request = Request(ipUrl, callback=self.parseIP, dont_filter = True)
        yield request
        '''


        # Read the request interval from settings
        request_interval = REQUEST_INTREVAL

        # Read the input file location from settings
        inputfile_location = INPUTFILE_LOCATION

        # Counter for how many user records have been read
        d = 0

        # Used to control when IPPOOL is refreshed
        #ipflushCount = 0

        print u'Initializing the cookie pool...'
        print u'Expected size of the cookie pool: ', len(cookieList)

        # Initialize the cookie pool
        cookiepool = []

        # cookies.cookieList holds the raw, unprocessed cookies; parse them here and store the result in the local COOKIEPOOL
        for string in cookieList:
            # Each cookie in cookies.cookieList is a string; convert it into dict form
            single_cookie = {}
            # Split the string using Python's string methods
            for str_spl in str(string).split(';'):
                # After splitting the whole string, break each piece into key: value form, i.e. a JSON-like mapping
                cookie_spl = str_spl.split("=")
                # strip() removes the whitespace around both ends of each token
                key = cookie_spl[0].strip()
                value = cookie_spl[1].strip()
                # The ALF and SSOLoginState attributes must be ints, otherwise an error is raised
                if key == 'ALF' or key == 'SSOLoginState':
                    value = int(value)
                single_cookie[key] = value
            cookiepool.append(single_cookie)
            print u'Added a new cookie to the cookie pool...'



        # After parsing, copy the structured cookies into COOKIEPOOL
        self.COOKIEPOOL = cookiepool
        print u'Cookie pool initialized, size: ', len(self.COOKIEPOOL)
        time.sleep(1.5)



        # Print some startup logging
        print '\n------------------------------------\n'
        print u'Current request rate: one request every ', request_interval, u' seconds\n'
        print u'Current input file location: ', inputfile_location, '\n'
        print '------------------------------------\n'
        print u'Starting the crawl...'
        time.sleep(1.5)


        # Read the uids from the input file
        f = open(inputfile_location)
        for id in f.readlines():

            # The counter d just tracks how many user ids have been read, starting from 1
            d = d + 1

            # Throttle the request rate (upper limit of 4 per second)
            time.sleep(request_interval)

            # The counter controlling IPPOOL increments by one each iteration and is cleared whenever an IPPOOL refresh request is issued
            # ipflushCount = ipflushCount + 1


            # First crawl proxy IPs and put them into IPPOOL, refreshing the pool every 20 records (deprecated)
            '''
            if (ipflushCount == 20):
                # First check whether the randomly chosen proxy from IPPOOL is still valid
                #while self.IPPOOL.__len__() != 0 :
                proxy = random.choice(self.IPPOOL)

                    try :
                        protocol = (str(proxy).split(":"))[0]
                        proxies = {protocol: proxy}
                        # If this proxy can be reached normally, break out of the loop
                        if requests.get('http://www.xicidaili.com/nn/', proxies=proxies, timeout=2).status_code == 200:
                            request.meta['proxy'] = proxy
                            break
                        else :
                            self.IPPOOL.remove(proxy)
                            print u'Proxy ', proxy, u' is invalid and has been removed; current IPPOOL size: ', self.IPPOOL.__len__()
                    except:
                        # If the proxy fails, remove it and pick another anonymous proxy at random
                        self.IPPOOL.remove(proxy)
                        print u'Proxy ', proxy, u' is invalid and has been removed; current IPPOOL size: ', self.IPPOOL.__len__()
                        continue
            '''

            # Kick off the request that crawls proxy IPs from xicidaili (deprecated)
            '''
                request = Request(ipUrl, callback=self.parseIP, dont_filter=True)
                request.meta['proxy'] = proxy
                ipflushCount = 0
                yield request
            '''
            #<end : if>




            # This URL is the user's WAP homepage, from which the post, following and follower counts can be scraped
            urlNum = "https://weibo.cn/u/%d" % int(id)

            # This URL is the user's WAP profile page, from which the rest of the user's info can be scraped
            urlInfo = 'https://weibo.cn/%d/info' % int(id)

            # If IPPOOL is not empty, take an anonymous proxy IP from it
            # First check whether the randomly chosen proxy from IPPOOL is still valid
            '''
            while self.IPPOOL.__len__() > 0 :
                proxy = random.choice(self.IPPOOL)
                try :
                    protocol = 'https' if 'https' in proxy else 'http'
                    proxies = {protocol: proxy}
                    if requests.get('http://www.baidu.com/', proxies=proxies, timeout=2).status_code == 200 :
                        request.meta['proxy'] = proxy
                        print request.meta['proxy']
                        break
                    else :
                        self.IPPOOL.remove(proxy)
                        print u'Proxy ', proxy, u' is invalid and has been removed; current IPPOOL size: ', self.IPPOOL.__len__()
                except:
                    # If the proxy fails, remove it and pick another anonymous proxy at random
                    self.IPPOOL.remove(proxy)
                    print u'Proxy ', proxy, u' is invalid and has been removed; current IPPOOL size: ', self.IPPOOL.__len__()
                    continue
            '''

            # Build the request with the target URL and the meta info to carry along
            request = Request(urlNum, meta={'uid': id, 'urlInfo': urlInfo}, callback=self.parseNum)

            # Attach a random cookie from the pool
            request.cookies = random.choice(self.COOKIEPOOL)
            #print request.cookies

            # Attach a proxy IP (deprecated)
            '''
            if self.IPPOOL.__len__() > 0 :
                request.meta['proxy'] = random.choice(self.IPPOOL)
            '''

            # Log progress and start crawling
            print u'Reading user uid #', d, u': ', id

            yield request
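
The cookie-pool initialization in start_requests converts raw 'key=value; key=value' cookie strings into dicts inline. The same parsing logic, pulled out into a standalone helper for readability (the function name and the example values below are placeholders, not part of the original project):

    def parse_cookie_string(raw):
        # Turn a raw 'key=value; key=value' cookie string into a dict,
        # casting ALF and SSOLoginState to int as the spider above requires
        cookie = {}
        for part in raw.split(';'):
            key, _, value = part.partition('=')
            key, value = key.strip(), value.strip()
            if key in ('ALF', 'SSOLoginState'):
                value = int(value)
            cookie[key] = value
        return cookie

    # Example with placeholder values:
    # parse_cookie_string('SSOLoginState=1; ALF=1565800000; SUB=xxxx')
    # -> {'SSOLoginState': 1, 'ALF': 1565800000, 'SUB': 'xxxx'}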