Example #1
 def parse_good_list(self, response):
     """
     Parse the product list page under a third-level category
     """
     for g_item in response.css('div#plist li.gl-item div.j-sku-item'):
         item = {
             GIF.SKUID: g_item.xpath('./@data-sku').extract_first(),
             GIF.NAME: g_item.css('div.p-name a em::text').extract_first(),
             GIF.URL:
             g_item.css('div.p-name a::attr(href)').extract_first(),
             GIF.CATEGORY: response.meta[GIF.CATEGORY]
         }
         item[GIF.URL] = urlparse.urljoin(response.url, item[GIF.URL])
         req = Request(item[GIF.URL], callback=self.parse_good_brand)
         req.meta['item'] = item
         yield req
     # parse the next page of this product category
     next_page_url = response.css(
         '#J_bottomPage span.p-num a.pn-next::attr(href)').extract_first()
     if next_page_url:
         req = Request(url=urlparse.urljoin(response.url, next_page_url),
                       callback=self.parse_good_list,
                       dont_filter=True)
         req.meta[GIF.CATEGORY] = response.meta[GIF.CATEGORY]
         yield req
Example #2
File: csbk.py  Project: zxbzxb180/spider
 def start_requests(self):  # generate the URLs for the spider to crawl
     urls = "http://www.qiushibaike.com/text/"
     yield Request(url=urls, callback=self.parse)
     for i in range(1, 6):
         url = "http://www.qiushibaike.com/text/page/" + str(
             i) + "/?s=4964137"
         yield Request(url=url, callback=self.parse)
 def parse_good_list(self, response):
     """
     Parse the product list page under a third-level category
     """
     for g_item in response.css('div#plist li.gl-item div.j-sku-item'):
         item = {
             GIF.SKUID: g_item.xpath('./@data-sku').extract_first(),
             GIF.NAME:
             ''.join(g_item.css('div.p-name a em::text').extract()),
             GIF.URL:
             g_item.css('div.p-name a::attr(href)').extract_first(),
             GIF.CATEGORY: response.meta[GIF.CATEGORY]
         }
         # check whether the SKU is sold directly by JD (self-operated); skip it if not
         if not self.is_sku_self_supported(item[GIF.SKUID]):
             continue
         item[GIF.URL] = urlparse.urljoin(response.url, item[GIF.URL])
         req = Request(item[GIF.URL],
                       callback=self.parse_good_brand,
                       errback=self.errback_http)
         req.meta['item'] = item
         # req.meta['dont_redirect'] = True
         yield req
     # parse the next page of this product category
     next_page_url = response.css(
         '#J_bottomPage span.p-num a.pn-next::attr(href)').extract_first()
     if next_page_url:
         req = Request(url=urlparse.urljoin(response.url, next_page_url),
                       callback=self.parse_good_list,
                       errback=self.errback_http,
                       dont_filter=True)
         req.meta[GIF.CATEGORY] = response.meta[GIF.CATEGORY]
         yield req
 def parse_good_price(self, response):
     """
     Parse the price of each product
     """
     try:
         pt_index = response.meta['pt_index']
         if response.meta['is_jd_api']:
             price = self.jdPriceTool.get_price_from_response(response)
         else:
             price = self.priceToolList[pt_index].get_price_from_response(
                 response)
         item = response.meta['item']
         item[GIF.PRICE] = price
         item[GIF.UPDATE_TIME] = datetime.utcnow()
         item[GIF.SOURCE] = self.source
         good_item = GItem(item)
         yield good_item
     except Exception as e:
         # returned data format: [{"id":"J_4426168","p":"23.90","m":"32.01","op":"32.00"}]
         logging.error(u"Failed to parse price, URL: " + response.url)
         logging.error(e.message)
         logging.error(traceback.format_exc())
         if response.meta['is_jd_api']:
             raise CloseSpider(u'Failed to parse price, response data: ' + response.body)
         else:
             # fall back to querying the JD price API
             item = response.meta['item']
             req = Request(url=self.jdPriceTool.get_price_url(
                 item[GIF.SKUID]),
                           callback=self.parse_good_price)
             req.meta['item'] = item
             req.meta['is_jd_api'] = True
             yield req
Example #5
 def parse_nearbyHotel(self, response):
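     # retry the same URL when the response status indicates an error or an anti-crawl block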
     if response.status in (403, 500, 10060, 503):
         yield Request(response.url,
                       callback=self.parse_nearbyHotel,
                       dont_filter=True)
         return
     item = UpItem()
     item['coll'] = 'nearbyhotels'
     item['resID'] = int(re.search(r'd\d+', response.url).group(0)[1:])
     hotels = response.xpath(
         '//a[@class="property_title prominent "]/text()').extract()
     hrefs = response.xpath(
         '//a[@class="property_title prominent "]/@href').extract()
     ids = []
     for href in hrefs:
         ids.append(int(re.search(r'd\d+', href).group(0)[1:]))
     diss = response.xpath('//b/text()').extract()
     hotel_list = []
     length = len(ids)
     for i in range(0, length):
         hotelitem = NearbyItem()
         hotelitem['ID'] = ids[i]
         hotelitem['name'] = hotels[i]
         hotelitem['dis'] = float(diss[i][0:-2])
         hotel_list.append(hotelitem)
     item['list'] = hotel_list
     print(item)
     yield item
     next_urls = response.xpath(
         '//a[@class="nav next taLnk ui_button primary"]/@href').extract()
     if next_urls:
         next_url = self.base + next_urls[0]
         yield Request(next_url, self.parse_nearbyHotel)
Example #6
    def parse(self, response):
        company_selector = Selector(response)
        company_iterator = company_selector.xpath(
            r'//ul[@class="list-search"]/li')
        for eachcompany in company_iterator:
            companyitem = QiyeqianzhanItem()
            compony_name_1 = eachcompany.xpath(
                r'div[@class="tit"]/a/text()[1]').extract()
            compony_name_2 = eachcompany.xpath(
                r'div[@class="tit"]/a/em/text()').extract()
            compony_name_3 = eachcompany.xpath(
                r'div[@class="tit"]/a/text()[2]').extract()
            compony_url = eachcompany.xpath(
                r'div[@class="tit"]/a/@href').extract()
            name_parts = []
            if compony_name_1:
                name_parts.append(compony_name_1[0].strip())
            if compony_name_2:
                name_parts.append(compony_name_2[0].strip())
                if compony_name_3:
                    name_parts.append(compony_name_3[0].strip())
            if name_parts:
                companyitem['compony_name'] = ''.join(name_parts)

            companyitem["id"] = str(self.i)
            self.i += 1
            # yield companyitem
            if compony_url:
                companyitem['compony_url'] = self.parent_url + compony_url[0].strip()
                # print the detail-page URL for testing
                print(self.parent_url + compony_url[0].strip())
                time.sleep(2)
                yield Request(self.parent_url + compony_url[0].strip(),
                              meta={"item": companyitem},
                              callback=self.parse_article_content,
                              cookies=self.cookies2)

        #nextlink = response.xpath(r'//div[@class="page-list"]/a[contains(text(),"下一页")]/@href').extract()
        nextlink = response.xpath(
            r'//div[@class="page-list"]/a[@class="next"]/@href').extract()

        if nextlink:
            Nextlink = nextlink[0].strip()
            request = Request(self.parent_url + Nextlink,
                              callback=self.parse,
                              cookies=self.cookies)
            time.sleep(2)
            yield request
        else:
            print('Next-page link is empty')
Example #7
    def start_requests(self):
        for url in self.list_of_start_urls:
            if not url.startswith('http'):
                url = 'http://www.' + url

            request = Request(url)
            request.meta['orig_domain'] = urlparse(url).netloc
            yield request
Example #8
 def start_requests(self):
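     # self.journee may be unset, in which case the AttributeError handler below falls back to the full calendar page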
     try:
         req = Request(
             'http://www.lfp.fr/competitionPluginCalendrierResultat/changeCalendrierHomeJournee?c=ligue1&js=%s&id=0' %
             self.journee,
             dont_filter=True)
     except AttributeError:
         req = Request('http://www.lfp.fr/ligue1/calendrier_resultat', dont_filter=True)
     return [req]
Example #9
 def start_requests(self):
     tags = BOOK_CATEGORY.keys()
     for tag in tags:
         url = 'https://book.douban.com/tag/' + tag
         request = Request(url=url,
                           callback=self.parse,
                           cookies={'bid': random.choice(self.bids)})
         request.meta['real_tag'] = tag
         yield request
Example #10
 def start_requests(self):
     url = "https://www.apartments.com/"
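     # 'input' is assumed to be a spider argument giving the path to a JSON file of area queries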
     input_file_dir = getattr(self, 'input', None)
     with open(input_file_dir) as data_file:
         data = json.load(data_file)
     if data:
         for query in data['queries']:
             area_url = "".join([url, query['area']])
             yield Request(area_url, self.parse)
     else:
         yield Request(url, self.parse)
Example #11
    def parse(self, response):
        self.check_code(response)

        movies_url = response.xpath('//a[@class="nbg"]/@href').extract()
        for movie_url in movies_url:
            logging.info("movie_url: %s", movie_url)
            yield Request(movie_url, callback=self.parse_subject)

        next_url = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_url:
            logging.info("tag: %s", next_url[0])
            yield Request(next_url[0])
Example #12
    def parse_pagination(self, response):
        product_urls = response.css('.product-image a ::attr(href)').extract()
        for product_url in product_urls:
            yield Request(url=product_url,
                          meta=response.meta.copy(),
                          callback=self.parse_product)

        raw_url = response.css(
            '.infinite-scroll-placeholder ::attr(data-grid-url)'
        ).extract_first()
        if raw_url:
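            # the data-grid-url attribute holds an HTML-escaped next-page URL, so unescape it before requesting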
            yield Request(url=HTMLParser().unescape(raw_url),
                          callback=self.parse_pagination)
    def parse(self, response):
        root = Selector(response)

        links = root \
            .xpath('//a[@title]/@href').getall()
        for link in links:
            yield Request(urljoin(response.url, link),
                          callback=self.parse_laptop_page)

        next_page_url = root.xpath(
            '//li[@class="listing__pagination-nav"][last()]/a/@href').get()
        if next_page_url:
            yield Request(urljoin(response.url, next_page_url),
                          callback=self.parse)
Example #14
    def parse_pagination(self, response):
        trail = self.add_trail(response)
        product_urls = response.css('a.product-link ::attr(href)').extract()
        for product_url in product_urls:
            yield Request(url=response.urljoin(product_url),
                          meta={"trail": trail},
                          callback=self.parse_product)

        next_page = response.css('a#loadmore ::attr(href)').extract_first()
        if next_page:
            yield Request(url=response.urljoin(next_page),
                          meta={"trail": trail},
                          callback=self.parse_pagination)
Example #15
    def start_requests(self):
        connection = pymongo.MongoClient(settings['MONGODB_ADDR'], )
        db = connection[settings['MONGODB_DB']]
        posts = db['posts']

        logger.info('Querying mongo to get participants')
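        # count how many posts each user participated in per blog, most active users first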
        participants = posts.aggregate(pipeline=[
            {
                '$project': {
                    'users_participated': 1,
                    'blog_id': 1
                }
            },
            {
                '$unwind': '$users_participated'
            },
            {
                '$group': {
                    '_id': {
                        'username': '******',
                        'blog_id': '$blog_id'
                    },
                    'count': {
                        '$sum': 1
                    }
                },
            },
            {
                '$sort': {
                    'count': -1
                }
            },
        ])
        participants = list(participants)
        logger.info('Total {} participants'.format(len(participants)))

        for entry in participants:
            _id = entry['_id']
            blog_url = BLOG_URLS[_id['blog_id']]
            yield Request(
                urlparse.urljoin(
                    blog_url, '/users/{}/favorites/'.format(_id['username'])),
                self.parse_favorites,
                priority=1,
            )
            yield Request(
                urlparse.urljoin(blog_url,
                                 '/users/{}/'.format(_id['username'])),
                self.parse_user,
            )
    def parse_res(self, response):
        resID = int(re.search(r'd\d+', response.url).group(0)[1:])
        # nearby hotels
        url = response.url
        # print(response.url)
        hotelurl = response.url
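        # derive the "hotels near this restaurant" listing URL by rewriting the review URL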
        hotelurl = hotelurl.replace('Restaurant_Review', 'HotelsNear')
        hotelurl = hotelurl.replace('-Reviews-', '-')

        # print(url)
        yield Request(hotelurl,
                      callback=self.parse_nearbyHotel,
                      meta={'resID': resID})

        # nearby restaurants
        resurl = url
        resurl = resurl.replace('Restaurant_Review', 'RestaurantsNear')
        resurl = resurl.replace('-Reviews-', '-')
        for i in range(0, 7):
            if i == 0:
                yield Request(resurl,
                              callback=self.parse_nearbyRes,
                              meta={'resID': resID})
            else:
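                # nearby listings are paginated with an '-oa<offset>' URL segment, 30 entries per page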
                index = re.search(r'd\d+', resurl).span()[-1]
                next_url = resurl[0:index] + "-oa%s" % (i *
                                                        30) + resurl[index:]
                print(next_url)
                yield Request(next_url,
                              callback=self.parse_nearbyRes,
                              meta={'resID': resID})

        # nearby attractions
        spoturl = url
        spoturl = spoturl.replace('Restaurant_Review', 'AttractionsNear')
        spoturl = spoturl.replace('-Reviews-', '-')
        for i in range(0, 7):
            if i == 0:
                yield Request(spoturl,
                              callback=self.parse_nearbySpot,
                              meta={'resID': resID})
            else:
                index = re.search(r'd\d+', spoturl).span()[-1]
                next_url = spoturl[0:index] + "-oa%s" % (i *
                                                         30) + spoturl[index:]
                print(next_url)
                yield Request(next_url,
                              callback=self.parse_nearbySpot,
                              meta={'resID': resID})
Example #17
    def parse_category(self, response):
        prod_links = response.xpath(
            '//a[@class="product-block__image"]/@href').extract()

        for link in prod_links:
            yield Request(response.urljoin(link),
                          self.parse_item,
                          meta={'category': response.meta['category']})

        # Case in pagination
        next_links = response.xpath('//a[@rel="next"]/@href').extract()
        if next_links:
            yield Request(response.urljoin(next_links[0]),
                          self.parse_category,
                          meta={'category': response.meta['category']})
Example #18
 def parse(self, response):
     if self.i == 0:
         citys = response.xpath(
             '//div[@class="geo_name"]/a/@href').extract()
     else:
         citys = response.xpath(
             '//ul[@class="geoList"]/li/a/@href').extract()
     for city in citys:
         url = urllib.parse.urljoin(self.base, city)
         yield Request(url, callback=self.parse_city)
     self.i += 20
     if self.i <= 720:
         next_url = self.base + ("/Restaurants-g294211-oa%s-China.html" %
                                 self.i)
         yield Request(next_url, callback=self.parse)
 def parse(self, response):
     ress = response.xpath('//a[@class="property_title"]/@href').extract()
     for res in ress:
         url = self.base + res
         # print(url)
         yield Request(url, callback=self.parse_rev)
     # paginate through the full restaurant list
     hrefs = response.xpath(
         '//a[@class="nav next rndBtn ui_button primary taLnk"]/@href'
     ).extract()
     if len(hrefs) > 0:
         next_url = (self.base + hrefs[0]).replace(' ', '')
         yield Request(next_url, callback=self.parse)
Example #20
File: xici.py  Project: smallliang/IpPool
    def parse(self, response):
        ip_list = response.xpath('//*[@id="ip_list"]/tr')

        for ip in ip_list[1:]:
            item = CollectipsItem()
            try:
                item['IP'] = ip.xpath('td[2]/text()')[0].extract()
                item['PORT'] = ip.xpath('td[3]/text()')[0].extract()
                item['POSITION'] = ip.xpath('td[4]/a/text()')[0].extract()
                item['TYPE'] = ip.xpath('td[6]/text()')[0].extract()
                item['SPEED'] = ip.xpath('td[7]/div/@title').re(
                    r'\d{0,}\.\d{0,}')[0]
                item['LAST_CHECK_TIME'] = ip.xpath(
                    'td[10]/text()')[0].extract()
                yield item
            except Exception:
                # skip rows that are missing the expected columns
                pass
        # get the next-page link
        next_page_nums = response.xpath('//*[@class="next_page"]/@href')
        if next_page_nums:
            next_page = self.link_url + next_page_nums[0].extract()
            print(next_page)
            yield Request(url=next_page, callback=self.parse)
        else:
            print("Crawling finished")
Example #21
    def start_requests(self):
        self.cp = MyConfigParser()
        self.cp.read("conf/conf.ini")

        self.usr = self.cp['user_info']['user']
        self.pwd = self.cp['user_info']['pwd']
        self.usr_name = self.cp['user_info']['usrname']
        self.usr_IDcard = self.cp['user_info']['usrIDcard']
        self.usr_phnum = self.cp['user_info']['usrphnum']

        self.seat_type = self.cp['user_info']['seat_type']
        self.usr_type = self.cp['user_info']['usr_type']
        self.usr_type_code = self.cp['user_type_code'][self.usr_type]

        self.from_station = self.cp['station_info']['from_station']
        self.from_station_code = stations.get(self.from_station)
        self.to_station = self.cp['station_info']['to_station']
        self.to_station_code = stations.get(self.to_station)

        self.earliest = self.cp['station_info']['earliest']
        self.lastest = self.cp['station_info']['lastest']
        self.date = self.cp['station_info']['date']

        print('start_requests')
        yield Request("http://www.12306.cn/mormhweb/", meta={'cookiejar': self.cookiejar}, headers=self.header)
Example #22
 def start_requests(self):
     # Parse weibo homepage
     home_url = "http://weibo.cn/u/%s" % self.uid
     yield Request(url=home_url,
                   cookies=self.cookie,
                   callback=self._parse_homepage,
                   errback=self.parse_error)
Example #23
    def parse_item(self, response):
        mzitu = response.meta['mzitu']
        mzitu["url"] = response.url
        mzitu["tags"] = response.xpath(
            '//div[@class="main-tags"]/a/text()').extract()
        mzitu["title"] = response.xpath(
            '//h2[@class="main-title"]/text()').extract_first()
        mzitu["classify"] = response.xpath(
            '//div[@class="main-meta"]/span[1]/a/text()').extract_first()
        mzitu["publish_date"] = response.xpath(
            '//div[@class="main-meta"]/span[2]/text()').re(u'发布于 (.*)')[0]
        mzitu["visitors"] = int(
            response.xpath('//div[@class="main-meta"]/span[3]/text()').re(
                u'(\\d+[,\\d+]*)')[0].replace(",", ""))
        mzitu["pics"] = []

        img_urls = response.xpath(
            '//div[@class="main-image"]/descendant::img/@src').extract()
        for img_url in img_urls:
            mzitu["pics"].append(
                re.match("http://[^/]+/(.*)", img_url).group(1))

        # yield mzitu

        next_page = response.xpath(
            '//div[@class="pagenavi"]/a[last()]/@href').extract_first()
        non_re_next_page = re.match(r"http://www.mzitu.com/\d+$", next_page)
        if next_page == response.url or non_re_next_page:
            yield mzitu
        else:
            yield Request(url=urljoin(response.url, next_page),
                          meta={'mzitu': mzitu},
                          dont_filter=True,
                          callback=self.parse_next_page)
Example #24
 def start_requests(self):
     logging.debug("###### Meizitu spider is starting.....%s" % self)
     return [
         Request(url="http://www.meizitu.com/tag/nvshen_460_1.html",
                 callback=self.parse,
                 headers=self.user_header)
     ]
Example #25
    def parse(self, response):
        # print(response.body)
        for picdiv in response.css('div[class="pic"]'):
            image_urls = picdiv.css(
                'a[target="_blank"] img::attr(src)').extract_first()
            image_split = image_urls.split("/")
            image_name = image_split[-3] + image_split[-2] + image_split[-1]
            yield SaveGirlImageItem({
                'name':
                MeiziTuSpider.__remove_html_tags(
                    picdiv.css('a[target="_blank"] img::attr(alt)').extract()
                    [0]),  # name of this photo set
                'url':
                picdiv.css('a[target="_blank"] img::attr(src)').extract_first(
                ),  # link to this photo set
                'image_urls': [
                    picdiv.css(
                        'a[target="_blank"] img::attr(src)').extract_first()
                ],
                'images':
                image_name
            })

        next_page = response.xpath(
            u'//div[@class="navigation"]//li/a[contains(.,"下一页")]/@href'
        ).extract_first()

        if next_page is not None:
            requesturl = "http://www.meizitu.com" + next_page
            yield Request(requesturl,
                          callback=self.parse,
                          headers=self.user_header)
Example #26
    def start_requests(self):
        reported_date = self.reported_date
        _, month, year = self.reported_date.split('.')

        for start_url in self.start_urls:
            yield Request(url=start_url.format(year, month, reported_date),
                          callback=self.parse)
Example #27
 def make_request_from_data_str(self, data_str):
     try:
         return Request(url=self.datastr_to_url(data_str),
                        meta={'id': int(data_str)},
                        dont_filter=False)
     except Exception as e:
         print(e)
Example #28
    def parse_listings(self, response, **kwargs):
        if not isinstance(response, HtmlResponse):
            response = HtmlResponse(response.url, body=response.body, request=response.request)

        raw_movies = json.loads(response.text)['results'][0]

        if not raw_movies:
            return

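        # request every remaining result page of the "films" index, one POST per page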
        for page in range(1, raw_movies['nbPages']):
            listings_formdata = self.listings_formdata.copy()
            listings_formdata["page"] = str(page)

            form_data = {
                "requests":
                    [{
                        "indexName": "films",
                        "params": urllib.parse.urlencode(listings_formdata)
                    }]
            }

            yield Request(
                self.listings_api_url,
                method='POST',
                callback=self.parse_listings,
                body=json.dumps(form_data),
            )

        yield from self._requests_to_follow(response)
 def parse(self, response):
     """
     Parse the JD product category page
     """
     selectors = response.css('.category-items .col .category-item')
     logging.info(u'------------ number of first-level categories found on the homepage: {0} ------------'.format(
         len(selectors)))
     url_count = 0
     for main_cat_sel in selectors:
         # first-level category name
         first_cat = main_cat_sel.css(
             '.mt .item-title span::text').extract_first()
         if first_cat not in self.included_cat_list:
             continue
         logging.info(first_cat)
         # find the second-level category name, plus the third-level categories and their list pages beneath it
         for items_sel in main_cat_sel.css('.mc div.items dl.clearfix'):
             # second-level category name
             second_cat = items_sel.css('dt a::text').extract_first()
             # third-level category name and the link to its product list page
             for item_sel in items_sel.css('dd a'):
                 url_count += 1
                 third_cat = item_sel.xpath('./text()').extract_first()
                 url = item_sel.xpath('./@href').extract_first()
                 req = Request(url=urlparse.urljoin(response.url, url),
                               callback=self.parse_good_list,
                               errback=self.errback_http,
                               dont_filter=True)
                 req.meta[GIF.CATEGORY] = [first_cat, second_cat, third_cat]
                 yield req
     logging.info(
         u'------------ number of third-level categories found on the homepage: {0} ------------'.format(url_count))
Example #30
 def parse(self, response):
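     # issue one start-page request per (country, currency) pair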
     for country, currency in zip(self.country.split(','),
                                  self.currency.split(',')):
         yield Request(url="{}country={}&currency={}".format(
             self.country_code_api, country, currency),
                       dont_filter=True,
                       callback=self.parse_start_page)
	def parse_info(self, response):
		src = response.url
		title = response.css(".photoDetails h1").xpath(".//text()").extract()[0].strip()
		link = response.css("#video-container iframe").xpath("./attribute::src").extract()[0]
		cover = response.css("meta[property=og\\3a image]").xpath("./attribute::content").extract()[0].strip()
		pdate = response.css(".post-info .post-date").extract()[0].strip()
		# print(response.css("meta[property=og\\3a description]").xpath("./attribute::content").extract()[0].strip())
		item = FuviItem()
		item["title"] = title
		item["sapo"] = ""
		item["cover"] = cover
		item["link"] = link
		item["src"] = src
		item["site"] = self.site
		item["catId"] = 1
		request = Request(link, callback=self.parse_item)
		request.meta["item"] = item

		yield(request)