Exemplo n.º 1
0
    def parse(self, response):
        """Scrape one page and queue the request for the preceding page index.

        For a 200 response, the first ``<h2>`` text found under
        ``div[@class="news_box"]`` is emitted as a ``KouziCrawlerItem``
        (nothing is emitted when that text is absent).  For every response,
        whatever its status, ``self.index`` is lowered by one while it is
        above 1, and a request for ``self.url + str(self.index)`` is yielded
        with this method as the callback.

        NOTE(review): once ``self.index`` is 1 the same URL is requested again
        on every call; presumably Scrapy's duplicate filter is what ends the
        crawl -- confirm.
        """
        if response.status == 200:
            title = response.xpath(
                '//div[@class="news_box"]/h2/text()').extract_first()
            if title is not None:
                record = KouziCrawlerItem()
                record['app_name'] = title
                # The link uses the index as it stands before the decrement
                # performed further down.
                record['app_link'] = self.url + str(self.index)
                record['kouzi_type'] = 'web'
                record['kouzi_name'] = '分销平台'
                record['kouzi_link'] = 'http://zx3.kchlx.com/'
                yield record

        # Success and failure share the same follow-up: step down one page
        # (never below 1) and fetch it with this same callback.
        if self.index > 1:
            self.index -= 1
        following_url = self.url + str(self.index)
        yield scrapy.Request(following_url, callback=self.parse)
Exemplo n.º 2
0
    def parse(self, response):
        """Yield one item per listing row, then follow the "next" page link.

        Each ``tr.filter-tr`` row under ``tbody#daikuan_list`` provides a
        detail href (expected to start with ``kouzi-<digits>``) and a display
        name.  Rows that lack either value, or whose href does not match the
        expected pattern, are skipped so that one malformed row cannot abort
        the rest of the page or the pagination request.

        Yields:
            ``KouziCrawlerItem`` objects for the valid rows, followed by a
            ``scrapy.Request`` for the next page when a ``nxt`` link exists.
        """
        kouzi_name = '嘉合骏'
        kouzi_url = 'https://www.jiahejun.com/'
        kouzi_type = 'web'
        app_list = response.xpath(
            '//tbody[@id="daikuan_list"]//tr[@class="filter-tr"]')
        for item in app_list:
            kouzi_detail = item.xpath(
                './td[@class="action"]//a/@href').extract_first()
            name = item.xpath(
                './td[@class="logo"]//span/text()').extract_first()
            if kouzi_detail is None or name is None:
                continue
            # re.match returns None when the href has no numeric id; calling
            # .group() on that would raise and end the whole generator.
            id_match = re.match(r"kouzi-(\d+).html", kouzi_detail)
            if id_match is None:
                continue
            kouzi_id = id_match.group(1)

            app_item = KouziCrawlerItem()
            app_item['app_name'] = name.strip()
            app_item[
                'app_link'] = '{}plugin.php?id=cc_daikuan:kz&kouziid={}'.format(
                    kouzi_url, kouzi_id)
            app_item['kouzi_type'] = kouzi_type
            app_item['kouzi_name'] = kouzi_name
            app_item['kouzi_link'] = '{}{}'.format(kouzi_url, kouzi_detail)
            print(app_item)
            yield app_item

        next_link = response.xpath(
            '//tbody[@id="daikuan_list"]//tr//div[@class="page"]//a[@class="nxt"]/@href'
        ).extract_first()
        if next_link:
            yield scrapy.Request(kouzi_url + next_link, callback=self.parse)
Exemplo n.º 3
0
    def parse(self, response):
        """Scrape one result page and queue the pagination pages not yet seen.

        The page id is taken from the text after the last ``=`` in
        ``response.url`` and recorded in ``self.crawed_page_ids``.  Every
        ``li`` of the ``search-result-list`` becomes a ``KouziCrawlerItem``;
        entries without an ``href`` are skipped because their link cannot be
        built.  Each non-active pagination href is then requested unless its
        page id is already recorded.

        Yields:
            ``KouziCrawlerItem`` objects, then ``scrapy.Request`` objects for
            unseen pages.
        """
        page_id = int(response.url.split('=')[-1])
        print('crawed page_id : {}'.format(page_id))
        self.crawed_page_ids.add(page_id)

        app_list = response.xpath('//ul[@class="search-result-list"]/li')
        kouzi_name = '小七钱包'
        kouzi_link = response.url
        kouzi_type = 'web'
        for item in app_list:
            href = item.xpath('./a/@href').extract_first()
            if href is None:
                # Concatenating None to the domain would raise TypeError and
                # stop the generator before pagination is handled.
                continue
            app_item = KouziCrawlerItem()
            app_item['app_name'] = item.xpath(
                './a/div[@class="result-text"]/h2/text()').extract_first()
            app_item['app_link'] = self.app_domain + href
            app_item['kouzi_type'] = kouzi_type
            app_item['kouzi_name'] = kouzi_name
            app_item['kouzi_link'] = kouzi_link
            yield app_item

        pages = response.xpath(
            '//ul[@class="pagination"]//li[not (contains(@class, "active"))]//a/@href'
        ).extract()
        for url in pages:
            next_page = self.domain + url
            try:
                new_page_id = int(next_page.split('=')[-1])
            except ValueError:
                # Pagination hrefs that do not end in a numeric id (e.g. "#")
                # are ignored instead of aborting the callback.
                continue
            if new_page_id not in self.crawed_page_ids:
                yield scrapy.Request(next_page, callback=self.parse)
Exemplo n.º 4
0
 def parse_item(self, response):
     """Turn the JSON body's ``list`` entries into ``KouziCrawlerItem`` objects.

     The response body is parsed as JSON; each entry of its ``list`` value
     supplies ``name`` and ``links``, while the three ``kouzi_*`` fields are
     constants for this source.

     Yields:
         One ``KouziCrawlerItem`` per entry, in the order the entries appear.
     """
     constant_fields = (
         ('kouzi_type', 'web'),
         ('kouzi_name', '大东与小丹'),
         ('kouzi_link', 'http://adong.ren'),
     )
     payload = json.loads(response.body)
     for card in payload['list']:
         record = KouziCrawlerItem()
         record['app_name'] = card['name']
         record['app_link'] = card['links']
         for field, value in constant_fields:
             record[field] = value
         yield record
Exemplo n.º 5
0
 def parse(self, response):
     """Yield one item per entry of the first result's ``card_link_list``.

     The response body is parsed as JSON and only ``result[0]`` is read.
     Each entry contributes ``platform_name`` and ``ios_url``; the
     ``kouzi_*`` fields are constants for this source.

     Yields:
         One ``KouziCrawlerItem`` per entry.
     """
     source_name, source_link, source_type = (
         '口子大神', 'https://www.wuyouxinyong.com/daikuan', 'web')
     platforms = json.loads(response.body)['result'][0]['card_link_list']
     for platform in platforms:
         record = KouziCrawlerItem()
         record['app_name'] = platform['platform_name']
         record['app_link'] = platform['ios_url']
         record['kouzi_type'] = source_type
         record['kouzi_name'] = source_name
         record['kouzi_link'] = source_link
         yield record
Exemplo n.º 6
0
 def parse_item(self, response):
     """Yield one item per ``li`` of the ``cpDl2`` list that has a title.

     The name comes from the ``h3`` text inside the entry's link and the
     app link from the link's ``href``.  Entries without an ``h3`` text are
     skipped; calling ``.strip()`` on the missing value would otherwise
     raise and discard the remaining entries.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.
     """
     app_list = response.xpath('//dl[@class="cpDl2"]/dd/ul//li')
     kouzi_name = '有鱼汇'
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath('./a//dd//h3/text()').extract_first()
         if name is None:
             continue
         app_item = KouziCrawlerItem()
         app_item['app_name'] = name.strip()
         app_item['app_link'] = item.xpath('./a/@href').extract_first()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield app_item
Exemplo n.º 7
0
 def parse_item(self, response):
     """Yield one item per ``span`` under ``p#mrtj`` that contains link text.

     The item name is the part of the link text before the first ``-``,
     stripped of surrounding whitespace.  ``kouzi_name`` is the page
     ``<title>`` text (passed through as-is, possibly ``None``).  Spans
     whose link has no text are skipped instead of raising on
     ``None.split``.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.
     """
     app_list = response.xpath('//div[@id="content"]/p[@id="mrtj"]//span')
     kouzi_name = response.xpath('//title/text()').extract_first()
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath('./a/text()').extract_first()
         if name is None:
             continue
         app_item = KouziCrawlerItem()
         # Only the text before the first dash is used as the name.
         app_item['app_name'] = name.split('-')[0].strip()
         app_item['app_link'] = item.xpath('./a/@href').extract_first()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield app_item
Exemplo n.º 8
0
 def parse_item(self, response):
     """Yield one item per link inside ``section.nr`` that carries a name.

     The name is read from the ``p.p1`` text inside the link's ``list_a3``
     block and the app link from the link's own ``href``.  Links without
     that text are skipped; ``.strip()`` on the missing value would
     otherwise raise and end the generator early.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.
     """
     app_list = response.xpath('//section[@class="nr"]//a')
     kouzi_name = '乐乐家'
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath(
             './/div[@class="list_a3"]//p[@class="p1"]/text()'
         ).extract_first()
         if name is None:
             continue
         app_item = KouziCrawlerItem()
         app_item['app_name'] = name.strip()
         app_item['app_link'] = item.xpath('./@href').extract_first()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield app_item
Exemplo n.º 9
0
 def parse_item(self, response):
     """Yield one item per ``li`` of the ``list_content`` list that has a title.

     The name is the ``h2`` text inside the entry's ``div.rt`` and the app
     link is the ``href`` of the entry's link.  Entries without an ``h2``
     text are skipped so a single incomplete entry does not raise on
     ``None.strip()`` and drop the rest of the page.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.
     """
     app_list = response.xpath(
         '//div[@class="list_content"]//ul[@class="clearfix"]/li')
     kouzi_name = '借亦有道'
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath(
             './a//div[@class="rt"]//h2/text()').extract_first()
         if name is None:
             continue
         app_item = KouziCrawlerItem()
         app_item['app_name'] = name.strip()
         app_item['app_link'] = item.xpath('./a/@href').extract_first()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield app_item
Exemplo n.º 10
0
 def parse(self, response):
     """Yield one item per ``ul``/``li`` element that contains ``div`` text.

     The selector ``//ul//li`` matches every list entry on the page, so
     entries without any ``div`` text are expected; they are skipped rather
     than raising on ``None.strip()``.  The app link is the ``href`` of the
     entry's direct ``a`` child (may be ``None``).

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.  Each item is also printed.
     """
     app_list = response.xpath('//ul//li')
     kouzi_name = '十一钱包'
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath('.//div/text()').extract_first()
         if name is None:
             continue
         app_link = item.xpath('./a/@href').extract_first()
         app_item = KouziCrawlerItem()
         app_item['app_name'] = name.strip()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         app_item['app_link'] = app_link
         print(app_item)
         yield app_item
Exemplo n.º 11
0
 def parse_item(self, response):
     """Yield one item per entry of the active loan list's ``item-list``.

     The name is the text of the entry's ``div.name`` and the app link is
     the ``href`` of its direct ``a`` child.  ``kouzi_name`` is the page
     ``<title>`` text (passed through as-is, possibly ``None``).  Entries
     without a name text are skipped instead of raising on
     ``None.strip()``.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.
     """
     app_list = response.xpath(
         '//ul[@class="loan-list"]/li[@class="active"]//ul[@class="item-list"]//li'
     )
     kouzi_name = response.xpath('//title/text()').extract_first()
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath('.//div[@class="name"]/text()').extract_first()
         if name is None:
             continue
         app_item = KouziCrawlerItem()
         app_item['app_name'] = name.strip()
         app_item['app_link'] = item.xpath('./a/@href').extract_first()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield app_item
Exemplo n.º 12
0
 def parse(self, response):
     """Yield one item per text-bearing link in the content paragraphs.

     Every ``a`` under ``div#content/p`` is inspected; links with no direct
     text node are ignored.  The stripped text becomes the name and the
     link's ``href`` the app link.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.
     """
     anchors = response.xpath('//div[@id="content"]/p//a')
     page_url = response.url
     for anchor in anchors:
         record = KouziCrawlerItem()
         label = anchor.xpath('./text()').extract_first()
         if label is not None:
             record['app_name'] = label.strip()
             record['app_link'] = anchor.xpath('./@href').extract_first()
             record['kouzi_type'] = 'web'
             record['kouzi_name'] = '老哥汇'
             record['kouzi_link'] = page_url
             yield record
Exemplo n.º 13
0
    def parse_item(self, response):
        """Yield one item per entry of the JSON payload's ``data.data`` list.

        The response text is parsed as JSON.  Each entry contributes
        ``names`` and ``links``; the source is tagged with type ``app`` and
        the URL of the response.

        Yields:
            One ``KouziCrawlerItem`` per entry.
        """
        entries = json.loads(response.text)['data']['data']
        page_url = response.url
        for entry in entries:
            record = KouziCrawlerItem()
            record['app_name'] = entry['names']
            record['app_link'] = entry['links']
            record['kouzi_type'] = 'app'
            record['kouzi_name'] = '乐乐家APP'
            record['kouzi_link'] = page_url
            yield record
Exemplo n.º 14
0
 def parse_item(self, response):
     """Request the detail page of every usable link under ``p#mrtj``.

     For each link, a partially filled ``KouziCrawlerItem`` (name and the
     ``kouzi_*`` fields) is attached to a request for
     ``self.start_urls[0] + href`` via ``meta['app_item']``; the request's
     callback is ``self.parse_app_link``.  The name is the link text before
     the first ``-``.  Links missing either their text or their ``href``
     are skipped, since both are needed and either would otherwise raise.

     Yields:
         ``scrapy.Request`` objects, one per usable link.
     """
     app_list = response.xpath('//div[@id="content"]/p[@id="mrtj"]//a')
     kouzi_name = '超级卡汇'
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath('./text()').extract_first()
         app_detail_link = item.xpath('./@href').extract_first()
         if name is None or app_detail_link is None:
             continue
         app_item = KouziCrawlerItem()
         # Only the text before the first dash is used as the name.
         app_item['app_name'] = name.split('-')[0].strip()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield scrapy.Request(self.start_urls[0] + app_detail_link,
                              meta={'app_item': app_item},
                              callback=self.parse_app_link)
Exemplo n.º 15
0
 def parse(self, response):
     """Yield one item per link in ``div.rows`` that has a name and an href.

     The name is the ``span`` text inside the link's ``div.des`` and the
     app link is ``self.start_urls[0]`` followed by the link's ``href``.
     Links lacking the name are skipped; links lacking an ``href`` are
     skipped as well, because concatenating ``None`` would raise and end
     the generator.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the page being parsed.
     """
     app_list = response.xpath('//div[@class="rows"]/a')
     kouzi_name = '点金易推'
     kouzi_link = response.url
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath(
             './/div[@class="des"]/span/text()').extract_first()
         href = item.xpath('./@href').extract_first()
         if name is None or href is None:
             continue
         app_item = KouziCrawlerItem()
         app_item['app_name'] = name.strip()
         app_item['app_link'] = self.start_urls[0] + href
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield app_item
Exemplo n.º 16
0
 def parse_item(self, response):
     """Yield one item per titled ``li`` in the HTML carried by the JSON body.

     The response body is parsed as JSON and its ``data`` value is treated
     as an HTML fragment.  Each ``li`` whose link holds an ``h1`` text
     becomes an item; entries without that text are ignored.  The app link
     is the ``href`` of the entry's direct ``a`` child.

     Yields:
         ``KouziCrawlerItem`` objects tagged with the fixed source fields.
     """
     html_fragment = json.loads(response.body)['data']
     entries = scrapy.Selector(text=html_fragment).xpath('//li')
     for entry in entries:
         record = KouziCrawlerItem()
         title = entry.xpath('./a/h1/text()').extract_first()
         if title is not None:
             record['app_name'] = title.strip()
             record['kouzi_type'] = 'web'
             record['kouzi_name'] = '分销平台'
             record['kouzi_link'] = 'http://www.kaxsd.cn/'
             record['app_link'] = entry.xpath('./a/@href').extract_first()
             yield record
Exemplo n.º 17
0
 def parse(self, response):
     """Request the full app listing of every series linked from the page.

     For each ``li`` under ``div#content``, the ``series_id`` is read from
     the entry's ``href`` and used to build the ``share/more`` data URL.
     A ``KouziCrawlerItem`` carrying the ``kouzi_*`` fields travels with
     the request in ``meta['app_item']``; the callback is
     ``self.parse_app_link``.  Entries with no ``href`` or no
     ``series_id`` in it are skipped instead of raising.

     Yields:
         ``scrapy.Request`` objects, one per usable entry.
     """
     app_list = response.xpath('//div[@id="content"]//ul/li')
     kouzi_name = '卡农联盟'
     url = 'https://d.kanongquan.net'
     kouzi_type = 'web'
     for item in app_list:
         app_link = item.xpath('./a/@href').extract_first()
         if app_link is None:
             continue
         # re.match returns None when the href carries no series_id.
         id_match = re.match(r".*?series_id=(\d+)", app_link)
         if id_match is None:
             continue
         kouzi_id = id_match.group(1)

         app_item = KouziCrawlerItem()
         app_item['kouzi_link'] = url + app_link
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_data_url = 'https://d.kanongquan.net/index.php?r=share%2Fmore&page=1&page_size=1000&series_id={}&class_id='.format(
             kouzi_id)
         yield scrapy.Request(app_data_url,
                              meta={'app_item': app_item},
                              callback=self.parse_app_link)
Exemplo n.º 18
0
 def parse(self, response):
     """Request the detail page of every named entry in the loan list.

     Each ``div.loan-list`` inside a ``loan-wrap`` container provides a
     name (``h2`` text of its link) and a detail ``href``.  A
     ``KouziCrawlerItem`` with the name and the ``kouzi_*`` fields is
     passed along in ``meta['app_item']``; the callback is
     ``self.parse_app_link``.  Entries without a name are skipped, as are
     entries without an ``href``, since a request cannot be built from
     ``None``.

     NOTE(review): the href is used as the request URL unchanged, so it is
     presumably absolute on this site -- confirm.

     Yields:
         ``scrapy.Request`` objects, one per usable entry.
     """
     app_list = response.xpath(
         '//div[contains(@class,"loan-wrap")]/div[@class="loan-list"]')
     kouzi_name = '我爱卡'
     kouzi_link = 'https://www.51credit.com/loan/'
     kouzi_type = 'web'
     for item in app_list:
         name = item.xpath('./a/h2/text()').extract_first()
         app_link = item.xpath('./a/@href').extract_first()
         if name is None or app_link is None:
             continue
         app_item = KouziCrawlerItem()
         app_item['app_name'] = name.strip()
         app_item['kouzi_type'] = kouzi_type
         app_item['kouzi_name'] = kouzi_name
         app_item['kouzi_link'] = kouzi_link
         yield scrapy.Request(app_link,
                              meta={'app_item': app_item},
                              callback=self.parse_app_link)
Exemplo n.º 19
0
 def parse_item(self, response):
     """Yield one item per text-bearing link in the ``indexbox`` list.

     The raw body is first decoded as UTF-8 and put back into the response
     before any selector runs -- presumably to work around a wrongly
     detected page encoding; confirm.  ``kouzi_name`` is the page
     ``<title>`` text.  Entries whose link has no text are ignored.

     Yields:
         ``KouziCrawlerItem`` objects with ``kouzi_link`` set to the URL of
         the response.
     """
     decoded_body = response.body.decode('utf-8')
     response = response.replace(body=decoded_body)
     entries = response.xpath(
         '//div[contains(@class,"indexbox")]/ul[@class="clearfix"]//li')
     page_title = response.xpath('//title/text()').extract_first()
     page_url = response.url
     for entry in entries:
         record = KouziCrawlerItem()
         label = entry.xpath('./a/text()').extract_first()
         if label is not None:
             record['app_name'] = label.strip()
             record['app_link'] = entry.xpath('./a/@href').extract_first()
             record['kouzi_type'] = 'web'
             record['kouzi_name'] = page_title
             record['kouzi_link'] = page_url
             yield record