Example #1
    def parse(self, response):
        # self.count = self.count + 1
        # print('this is page number', self.count)
        print('currently crawling page ' + response.request.url.strip('*/'))
        print('current pool size', len(self.url_pool))
        titles = response.xpath('//a/@href').extract()

        # page URL with surrounding '*' and '/' characters stripped; used as the base for joins
        basic_url = response.request.url.strip('*/')

        # Join each extracted href into a full URL
        for url in titles:
            # print(url)
            item = sudaMainItem()
            # absolute URL that already carries an http(s) scheme
            matchFullUrl = re.match(r'^(http|https)://([\w.]+/?)\S*', url,
                                    re.M | re.I)
            # root-relative href starting with '/'
            matchRelateUrl = re.match(r'^/([\w.]?/?)\S*', url, re.M | re.I)
            # relative href without a leading '/'
            matchRelateUrl2 = re.match(r'^[^/]([\w.]?/?)\S*', url, re.M | re.I)

            if url:
                if matchFullUrl:
                    true_url = url
                    # print('absolute url', true_url)
                elif matchRelateUrl:
                    true_url = basic_url + url
                    # print('joined url 1', true_url)
                elif matchRelateUrl2:
                    true_url = basic_url + '/' + url
                    # print('joined url 2', true_url)
                else:
                    true_url = url
                    # print('unprocessed and unmatched', true_url)
                if self.judge_suda(true_url):
                    item['father'] = basic_url
                    item['url'] = true_url
                    self.url_pool.add(true_url)
                    yield item
                # if true_url not in self.url_pool:
                #     item['distinct_url'] = true_url

                # else:
                #     item['distinct_url'] = 'duplicate'
                # yield item
        # url_list = self.getDistinctUrls()
        # print(url_list)
        # snapshot the pool so URLs added while parsing do not change the set being iterated
        url_pool_copy = copy.deepcopy(self.url_pool)
        # url_pool_copy = list(self.url_pool)

        for next_url in url_pool_copy:
            # print('this is element number', index)
            if 'http://' in next_url or 'https://' in next_url:
                yield scrapy.Request(next_url, self.parse, dont_filter=False)
            else:
                yield scrapy.Request('http://' + next_url,
                                     self.parse,
                                     dont_filter=False)
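The parse() method above belongs to a Scrapy spider and relies on names defined elsewhere in the project: the sudaMainItem item class, the url_pool set, and the judge_suda filter. A minimal sketch of that surrounding skeleton, assuming those definitions (the spider name, start URL, and body of judge_suda are illustrative, not taken from the original code):

import copy
import re

import scrapy
from scrapy import Field, Item


class sudaMainItem(Item):
    # item emitted for every accepted link
    father = Field()  # URL of the page the link was found on
    url = Field()     # the normalized link itself


class SudaSpider(scrapy.Spider):
    name = 'suda_main'                          # hypothetical spider name
    start_urls = ['http://www.suda.edu.cn/']    # hypothetical start page

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_pool = set()  # de-duplication pool of URLs seen so far

    def judge_suda(self, url):
        # illustrative filter: keep links on the target domain that have not been seen yet
        return 'suda.edu.cn' in url and url not in self.url_pool

    # parse() from Example #1 (and parsePage() from Example #2) go here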
Example #2
    def parsePage(self, response):
        # self.count = self.count + 1
        # print('this is page number', self.count)
        print('currently crawling page ' + response.request.url.strip('*/'))
        # throttle: pause for a random 0-4 seconds between pages
        randomdelay = random.randint(0, 4)
        time.sleep(randomdelay)
        print("### random delay: %s s ###" % (randomdelay))
        # print('current pool size', len(self.url_pool))
        titles = response.xpath('//a/@href').extract()

        basic_url = response.request.url.strip('*/')

        # Join each extracted href into a full URL
        for url in titles:
            # print(url)
            item = sudaMainItem()
            matchFullUrl = re.match(r'^(http|https)://([\w.]+/?)\S*', url,
                                    re.M | re.I)
            # matchRelateUrl = re.match(r'^/([\w.]?/?)\S*', url, re.M | re.I)
            # matchRelateUrl2 = re.match(r'^[^/]([\w.]?/?)\S*', url, re.M | re.I)
            matchUselessUrl = re.match(r'^#([\w.]?/?)\S*', url, re.M | re.I)  # in-page anchor (fragment-only href)
            # matchParams = re.match(r'^\?([\w.]?/?)\S*', url, re.M | re.I)
            if url:
                if matchFullUrl:
                    true_url = url
                    # print('absolute url', true_url)
                elif matchUselessUrl:
                    true_url = basic_url
                else:
                    true_url = urljoin(basic_url, url)
                    # print('unprocessed and unmatched', true_url)
                if self.judge_suda(true_url):
                    item['father'] = basic_url
                    item['url'] = true_url
                    self.url_pool.add(true_url)
                    yield item
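Example #2 drops the hand-written concatenation of Example #1 in favour of urljoin (from urllib.parse), which resolves root-relative, page-relative, and already-absolute hrefs against the current page URL; it also uses random and time for the throttling delay. A small standalone illustration of how urljoin covers the cases the regexes in Example #1 distinguished (the URLs below are made up for demonstration):

from urllib.parse import urljoin

base = 'http://www.suda.edu.cn/info/index.html'     # hypothetical page URL
print(urljoin(base, 'http://other.edu.cn/a.html'))  # absolute href kept as-is
print(urljoin(base, '/news/list.html'))             # -> http://www.suda.edu.cn/news/list.html
print(urljoin(base, 'detail.html'))                 # -> http://www.suda.edu.cn/info/detail.html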