示例#1
0
    def parse_artwork(self, response):
        """Extract an ArtworkItem from an artwork detail page."""
        # Strip any search-query noise from the URL before storing it.
        parts = urlparse.urlparse(response.url)._replace(query='')
        clean_url = urlparse.urlunparse(parts)

        loader = ItemLoader(item=ArtworkItem(), response=response)
        loader.add_value('museum_code', self.name)
        loader.add_value('url', clean_url)
        loader.add_xpath('artist_name',
                         '//div[@id="tombstone"]/p[1]/a/text()[1]')

        # The artist link may be relative; resolve it against the page URL.
        artist_href = response.xpath(
            '//div[@id="tombstone"]/p[1]/a/@href').extract()[0]
        loader.add_value('artist_url',
                         urlparse.urljoin(response.url, artist_href))

        loader.add_css('title', '#tombstone span:nth-of-type(1)::text')
        loader.add_xpath('thumbnail',
                         '//div[@id="artwork-image"]/a/img/@src')
        loader.add_xpath('on_display', ON_DISPLAY_SELECTOR)
        item = loader.load_item()

        self.logger.info('Scraped ' + item['title'][0])

        yield item
    def parse_item(self, response):
        """Load a Product from a product detail page.

        Fixes vs. original: the unused ``sel = Selector(response)`` local is
        removed, and the bare string literals used as mid-function comments
        (no-op expression statements) are real comments now.
        """
        il = ItemLoader(item=Product(), response=response)

        cat = il.get_xpath('//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
        availability = il.get_xpath('//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()')
        price = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text')
        sale = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text')

        # If the xpath doesn't return a category, the product belongs to
        # the Bundle category.
        if not cat:
            il.add_value("category", "Bundle")
        else:
            il.add_value("category", cat)

        il.add_css("title", "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text")
        il.add_value("url", response.url)

        # A product that can be added to the cart is available online.
        if "ADD TO CART" in availability:
            il.add_value("availability", "Product is available online")
        else:
            il.add_value("availability", "Product is not available online")

        # If there's a sale price but no regular price, the site displays
        # the sale price as the regular price — mirror that here.
        if not price:
            il.add_value("regPrice", sale)
            il.add_value("salePrice", None)
        else:
            il.add_value("regPrice", price)
            il.add_value("salePrice", sale)
        return il.load_item()
示例#3
0
 def parse_titles(self, response):
     """Load a Product from the page heading and the entry-list links."""
     loader = ItemLoader(item=Product(), response=response)
     loader.add_css('name', '#content > h1::text')
     loader.add_css('consist', 'div.entries > ul > li a::text')
     return loader.load_item()
示例#4
0
文件: sina.py 项目: bgcolors/ztcrawl
    def parse_item(self, response):
        """Load a ZtArticleItem from a sina.com article page.

        Fixes vs. original: removed a stray trailing semicolon and the
        duplicated time-source extraction/cleanup in the else branch.
        """
        l = ItemLoader(item=ZtArticleItem(), response=response)
        l.add_value('classId', '10')
        l.add_value('cataName', u'私募资讯')
        l.add_value('url', response.urljoin(response.url))
        l.add_css('title', '#artibodyTitle::text')
        l.add_css('seo_keywords', 'meta[name*=keywords]::attr(content)')
        l.add_css('seo_description', 'meta[name*=description]::attr(content)')

        # Publish time and source share one DOM node on sina, and the source
        # markup varies — handle both layouts simply.
        source = response.css('.time-source span a::text').extract()
        if len(source) == 0:
            # No source link: the whole node is "<time><source>" text; the
            # first 16 characters are the timestamp, the rest the source.
            tmp = response.css('.time-source::text').extract()[0]
            tmp = tmp.replace(' ', '').replace('\n', '').replace('\t', '')
            l.add_value('author', tmp[16:])
            l.add_value('source', tmp[16:])
            l.add_value('publishTime', tmp[:16])
        else:
            time_text = response.css('.time-source::text').extract()[0]
            time_text = time_text.replace(' ', '').replace('\n',
                                                           '').replace('\t', '')
            l.add_value('author', source[0])
            l.add_value('source', source[0])
            l.add_value('publishTime', time_text)

        l.add_css('keywords', '.article-keywords a::text')
        # Sina exposes no view counter; use a random three-digit number.
        l.add_value('views', randint(100, 999))

        l.add_css('image_urls', '#artibody img::attr(src)')

        content = response.css('#artibody').extract()[0]
        l.add_value('content', content)
        yield l.load_item()
示例#5
0
    def parse_item(self, response):
        """Parse a career detail page, then request its related majors.

        Fixed: ``ur'...'`` literals are invalid syntax on Python 3; plain
        ``r'...'`` behaves identically here on both Python 2 and 3.
        """
        loader = ItemLoader(GaokaopaiZhiyeItem(), response)
        loader.add_value('url', response.url)
        # The career code is embedded in the URL, e.g. ".../career-XYZ.html".
        loader.add_value('code', response.url, re=r'-([^-]+)\.html')
        loader.add_css('name', u'.modTitle>h1::text')

        def parse_category():
            # Each category link yields its href, the code embedded in the
            # href, and the link text.
            for e in response.css(u'.catType>a'):
                yield {
                    'url': e.css('::attr(href)').extract_first(),
                    'code': e.css('::attr(href)').re_first(r'-([^-]+)\.html'),
                    'name': e.css('::text').extract_first(),
                }

        loader.add_value('category', list(parse_category()))
        loader.add_css('detail', u'.zhiyeShow')

        item = loader.load_item()

        # POST for the majors related to this career code, carrying the
        # scraped item along in the request meta.
        return FormRequest(
            url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
            formdata={'code': item['code'][0]},
            meta={'item': item},
            dont_filter=True,
            callback=self.parse_majors
        )
示例#6
0
def parse(self, response):
    """Populate a Product item from the page.

    Fixed: the stock selector was 'p#stock]' — the stray ']' makes it an
    invalid CSS selector that matches nothing.
    """
    l = ItemLoader(item=Product(), response=response)
    # Two selectors for 'name': the loader collects values from both.
    l.add_xpath('name', '//div[@class="product_name"]')
    l.add_xpath('name', '//div[@class="product_title"]')
    l.add_xpath('price', '//p[@id="price"]')
    l.add_css('stock', 'p#stock')
    l.add_value('last_updated', 'today')
    return l.load_item()
示例#7
0
文件: eol.py 项目: EasyData/gaokao
    def parse_item(self, response):
        """Load an EolZhiyeItem from an occupation detail page."""
        loader = ItemLoader(EolZhiyeItem(), response)
        loader.add_value('url', response.url)
        # The occupation code is the filename part of the URL.
        loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
        loader.add_css('name', 'h1#pagetitle::text')
        # "行业" (industry) and "职业" (occupation) label the two link rows.
        for field, xpath in (
                ('category', u'//div[@id="precontent"]/p[contains(., "行业")]/a/text()'),
                ('category2', u'//div[@id="precontent"]/p[contains(., "职业")]/a/text()'),
        ):
            loader.add_xpath(field, xpath)
        loader.add_xpath(
            'detail',
            u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]',
            Join('\n'))
        yield loader.load_item()
 def parse(self, response):
     """Yield a WineDeciderItem for each row of the results table."""
     for wine in response.css('table.table > tr'):
         loader = ItemLoader(item=WineDeciderItem(), selector=wine)
         loader.add_css('maturity', 'td.cbResultSetTableCell > img::attr(src)')
         loader.add_css('winery_name', 'td.cbResultSetTableCell > h3::text')
         loader.add_css('wine_name', 'td.cbResultSetTableCell::text')
         loader.add_css('vintage', 'td:nth-child(3) h3::text')
         loader.add_css('mark', 'td:nth-child(5) ::text')
         yield loader.load_item()
 def parse(self, response):
     """Build a RecipeItem: name and image from the page; ingredients and
     preparation steps via nested loaders scoped to their list items."""
     loader = ItemLoader(item=RecipeItem(), selector=response)
     loader.add_css('recipe_name', 'div.container h1.main-title span ::text')
     loader.add_css('recipe_image',
                    'div.container div.product-area div.image ::attr(src)')
     ingredients = loader.nested_css('div.ingredient li')
     ingredients.add_css('recipe_ingredients', '::text')
     preparation = loader.nested_css('div.preparation li')
     preparation.add_css('recipe_prepa', 'div > p ::text')
     yield loader.load_item()
示例#10
0
    def startProductLoader(self, response):
        """Build a Product item from the product page markup."""
        loader = ItemLoader(item=Product(), response=response)
        loader.add_css('name', 'h3.product_name a')
        loader.add_css('category', 'div.product_category a')
        loader.add_css('image_urls', 'img.image_main::attr(src)')
        loader.add_css('price', 'span.price')
        loader.add_css('slug', 'h3.product_name a::attr(href)')
        # Constant fields: every scraped product is tagged 'primary' and
        # starts with an empty description.
        loader.add_value('label', 'primary')
        loader.add_value('description', '')
        return loader.load_item()
示例#11
0
 def parse(self, response):
     """Yield one DownloaddmppdfsItem per search result.

     Fixes vs. original: the item is yielded inside the loop (the original
     yielded only the last loader, and raised NameError when there were no
     results); the result-text XPaths use 1-based positions ([0] never
     matches in XPath) and are anchored to the current result with './/'
     instead of the whole document; file_urls uses add_css since the
     selector is CSS, not XPath.
     """
     for result in response.css("div.search-results-listing"):
         l = ItemLoader(DownloaddmppdfsItem(), result)
         l.add_css('name', 'a.result-title > strong::text')
         l.add_css('file_urls', 'a.result-title::attr(href)')
         l.add_xpath('summary', './/span[@class="result-text"][1]')
         l.add_xpath('date_delivered', './/span[@class="result-text"][2]')
         l.add_xpath('parties', './/span[@class="result-text"][3]')
         l.add_xpath('tenement', './/span[@class="result-text"][4]')
         yield l.load_item()
示例#12
0
 def parse_item(self, response):
     """Extract one classified ad into an Annonce item."""
     loader = ItemLoader(item=Annonce(), response=response)
     loader.add_value('url', response.url)
     loader.add_css('titre', 'header h1::text')
     loader.add_css('prix', '.item_price .value::text')
     loader.add_css('date', 'section.properties p.line::text')
     loader.add_css('description', '.properties_description p.value::text')
     # Tags live in the non-price h2 headers, either as plain spans or links.
     loader.add_css(
         'tag',
         '.line h2:not(.item_price) span::text, .line h2:not(.item_price) span a::text'
     )
     return loader.load_item()
示例#13
0
    def parse_item(self, response):
        """Load a Chapter item from a chapter page; first match wins."""
        loader = ItemLoader(item=Chapter(), response=response)
        loader.default_output_processor = TakeFirst()

        loader.add_css('content', '#content')
        loader.add_css('title', '.bookname h1::text')
        loader.add_xpath('book_name', '//div[@class="con_top"]/a[3]/text()')
        # Literal values: identifiers, crawl timestamp and the chapter
        # number carried over from the listing request.
        for field, value in (
                ('url', response.url),
                ('chapter_id', response.url),
                ('crawl_time', datetime.datetime.now()),
                ('number', response.meta['number']),
        ):
            loader.add_value(field, value)

        yield loader.load_item()
示例#14
0
 def parse(self, response):
     """Collect the headline, date and body paragraphs of a news article."""
     loader = ItemLoader(item=NewsParagraph(), response=response)
     loader.add_xpath(
         'title',
         '/html[1]/body[1]/div[2]/div[1]/div[3]/div[1]/div[2]/h3[1]/text()')
     loader.add_xpath(
         'date',
         '/html[1]/body[1]/div[2]/div[1]/div[3]/div[1]/div[2]/h3[1]/div[1]/text()'
     )
     loader.add_css('paragraph_text', 'div.clear > p::text')
     loader.add_value('last_updated', datetime.today())
     return loader.load_item()
    def parse(self, response):
        """Scrape product data from an Amazon results page and paginate.

        Fixed: the original followed the "next" link unconditionally —
        on the last page extract_first() returns None and
        urljoin/Request fail; the pagination is now guarded.
        """
        l = ItemLoader(item=AmazonItem(), response=response)
        # extracting info
        l.add_css('product_name', '.a-color-base.a-text-normal::text')
        l.add_css('author', '.a-size-base.a-link-normal:nth-child(2)::text')
        l.add_css('price', '.a-offscreen')
        l.add_css("image_link", '.s-image::attr(src)')
        l.add_css('stars', ".a-icon-alt::text")

        yield l.load_item()

        next_page = response.css('li.a-last ::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse)
示例#16
0
    def parseEpisode(self, response):
        """For each rundown segment, request its subsection page with a
        pre-seeded ItemLoader carried in the request meta."""
        print("parsing " + self.nprPodcastName + " episode")

        segment_links = response.css(
            'h3.rundown-segment__title a::attr(href)')
        for url in segment_links.extract():
            loader = ItemLoader(PodcastTranscriptsItem(), response)
            loader.add_css('episode_title', 'div.title-description h1')
            # High priority so subsections are fetched before new episodes.
            yield scrapy.Request(url,
                                 callback=self.parseSubsection,
                                 meta={'itemLoader': loader},
                                 dont_filter=True,
                                 priority=75)
示例#17
0
    def parse_ads(self, response: HtmlResponse):
        """Scrape title, photo set and price from an ad detail page."""
        loader = ItemLoader(item=AvitoparserItem(), response=response)
        loader.add_css('title', 'div.AdvertCard_advertTitle__1S1Ak::text')
        loader.add_xpath(
            'photos',
            "//figure[@class='PhotoGallery_photo__36e_r']//source/@srcset")
        loader.add_xpath(
            'price',
            "//div[@class='AdvertCard_price__3dDCr AdvertCard_topAdvertHeaderCommon__2zUjb rouble']/text()")
        yield loader.load_item()
示例#18
0
    def parse_item(self, response):
        """Yield a GanjiItem per listing on a second-hand housing page."""
        listings = response.xpath(
            '//div[@class="f-main-list"]//div[@class="f-list-item ershoufang-list"]'
        )
        for listing in listings:
            loader = ItemLoader(item=GanjiItem(), selector=listing)
            for field, css in (
                    ('title', 'dd.dd-item.title a::text'),
                    ('size', 'dd.dd-item.size span::text'),
                    ('address', 'dd.dd-item.address span ::text'),
                    ('feature', 'dd.dd-item.feature span::text'),
                    ('info', 'dd.dd-item.info div ::text'),
            ):
                loader.add_css(field, css)
            yield loader.load_item()
示例#19
0
 def parse_ads(self, response: HtmlResponse):
     """Fill an AvitoparserItem from a car-ad detail page."""
     loader = ItemLoader(item=AvitoparserItem(), response=response)
     loader.add_xpath(
         'photos',
         '//div[contains(@class, "gallery-img-wrapper")]//div[contains(@class, "gallery-img-frame")]/@data-url'
     )
     loader.add_css('name',
                    'h1.title-info-title span.title-info-title-text::text')
     loader.add_xpath('price', '//span[@class="js-item-price"]/@content')
     loader.add_xpath(
         'currency',
         '//span[@class="price-value-prices-list-item-currency_sign"]/@content'
     )
     # The remaining fields all live in one parameter list; only the
     # 1-based position of the <li> differs per field.
     param_positions = (
         ('car_brand', 1),
         ('car_model', 2),
         ('modification', 4),
         ('year', 5),
         ('mileage', 6),
         ('num_doors', 11),
         ('engine_type', 12),
         ('transmission', 13),
         ('drive', 14),
         ('rudder', 15),
         ('color', 16),
         ('place_inspection', 18),
     )
     for field, position in param_positions:
         loader.add_xpath(
             field,
             '//li[contains(@class, "item-params-list-item")][%d]/text()'
             % position)
     loader.add_value('link', response.url)
     yield loader.load_item()
示例#20
0
    def parse_detail(self, response):
        """Extract a JobBole article page into a JobBoleArticleItem.

        Fixes vs. original: regexes use raw strings, the misspelled local
        ``front_imgae_url`` is renamed, the duplicated digit-extraction
        logic is factored into a helper, and the dead trailing ItemLoader
        (built but never loaded into the yielded item) plus the trailing
        ``pass`` are removed.
        """
        article_item = JobBoleArticleItem()
        # Extract the individual article fields.
        front_image_url = response.meta.get('front_image_url', '')
        title = response.css('.entry-header h1::text').extract()[0]
        create_date = response.css(
            '.entry-meta-hide-on-mobile::text').extract()[0].strip()[0:10]
        praise_nums = int(response.css('.vote-post-up h10::text').extract()[0])

        def first_int(text):
            # Pull the first run of digits out of e.g. " 3 收藏"; 0 if absent.
            match = re.match(r'.*?(\d+).*?', text)
            return int(match.group(1)) if match else 0

        fav_num = first_int(response.css(
            '.btn-bluet-bigger.href-style.bookmark-btn.register-user-only::text'
        ).extract()[0])
        com_num = first_int(response.css(
            '.btn-bluet-bigger.href-style.hide-on-480 ::text').extract()[0])
        content = response.css('.entry').extract()[0]
        tag_list = response.css('.entry-meta-hide-on-mobile a::text').extract()
        tags = ','.join(tag_list)

        article_item['url_object_id'] = get_md5(response.url)
        article_item['title'] = title
        article_item['url'] = response.url
        try:
            create_date = datetime.datetime.strptime(create_date,
                                                     '%Y/%m/%d').date()
        except Exception:
            # Fall back to today's date when the page date is malformed.
            create_date = datetime.datetime.now().date()
        article_item['create_date'] = create_date
        article_item['front_image_url'] = {front_image_url}
        article_item['praise_nums'] = praise_nums
        article_item['comment_nums'] = com_num
        article_item['fav_nums'] = fav_num
        article_item['tags'] = tags
        article_item['content'] = content

        yield article_item
示例#21
0
    def parsepage(self, res):
        """This function parses a movie page.
        @url https://yts.mx/browse-movies
        @returns items 1
        @scrapes movie_title release_year genre imdb_rating
        @scrapes rating_count criticts audience magnet_link
        """
        # Critic/audience percentages share one selector; when either is
        # missing, both stay None.
        percentages = res.css("div.rating-row > span::text").re(r"\d+%")
        try:
            cr, au = percentages[0], percentages[1]
        except IndexError:
            cr = None
            au = None

        loader = ItemLoader(item=YtsItem(), response=res)
        loader.default_output_processor = TakeFirst()

        # Title and year live in the same hidden-xs info panel.
        info_panel = ("div.row > div#movie-info.col-xs-10.col-sm-14"
                      ".col-md-7.col-lg-8.col-lg-offset-1 > div.hidden-xs")
        loader.add_css("movie_title", info_panel + " > h1::text",
                       MapCompose(str.strip, str.title))
        loader.add_css("release_year", info_panel + " > h2::text")
        loader.add_xpath(
            "genre",
            "/html/body/div[4]/div[3]/div[1]/div[4]/div[1]/h2[2]/text()")
        loader.add_css("imdb_rating",
                       'div.rating-row > span[itemprop="ratingValue"]::text')
        loader.add_css("rating_count",
                       'div.rating-row > span[itemprop="ratingCount"]::text')
        loader.add_value("criticts", cr)
        loader.add_value("audience", au)
        loader.add_css(
            "magnet_link",
            "div.modal-torrent a.magnet-download.download-torrent.magnet::attr(href)")

        return loader.load_item()
示例#22
0
    def parse(self, response):
        """Parse a catalogue page: load each book card, follow its detail
        page when available, then move on to the next listing page."""
        self.log(f"I just visited {response.url}")

        for article in response.css("article.product_pod"):
            loader = ItemLoader(item=BookDataItemLoaderItem(),
                                selector=article)
            # Running item counter shared across the spider class.
            BookDataItemLoaderSpider.count += 1
            loader.add_value('item_number', BookDataItemLoaderSpider.count)
            loader.add_css('title', "h3 > a::attr(title)")
            loader.add_css('price', "p.price_color::text")
            loader.add_css('stars', "article > p::attr(class)")
            loader.add_css('thumbnail_path', "div > a > img::attr(src)")
            loader.add_css('detailed_book_url', "div > a::attr(href)")

            detail_url = article.css("div > a::attr(href)").get()
            if not detail_url:
                yield loader.load_item()
                continue
            # Hand the partially-filled item to the detail-page callback.
            yield response.follow(url=detail_url,
                                  callback=self.parse_detailed_book_url,
                                  meta={'item': loader.load_item()},
                                  dont_filter=True)

        # Move on to the following listing pages.
        next_page_url = response.css("li.next > a::attr(href)").get()
        if next_page_url:
            yield response.follow(url=next_page_url, callback=self.parse)
示例#23
0
    def parse(self, response):
        """Load each quote, then follow its author page and pagination."""
        self.logger.info('Parse function called on {}'.format(response.url))
        for quote in response.css('div.quote'):
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            loader.add_css('quote_content', '.text::text')
            loader.add_css('tags', '.tag::text')
            quote_item = loader.load_item()
            # Carry the partially-built item along to the author callback.
            author_url = quote.css('.author + a::attr(href)').get()
            yield response.follow(author_url, self.parse_author,
                                  meta={'quote_item': quote_item})

        for a in response.css('li.next a'):
            yield response.follow(a, self.parse)
示例#24
0
    def parse_item(self, response):
        """Parse a car-series page: emit the on-sale models, then request
        the discontinued model listings per model-year.

        Fixed: the leftover debug ``print(series)`` now goes through the
        spider logger instead of stdout.
        """
        sel = response.css("div.path")

        loader = ItemLoader(item=SeriesItem(), selector=sel)
        loader.add_css("series_id", "a:last-child::attr(href)")
        loader.add_css("series_name", "a:last-child::text")

        series = loader.load_item()
        self.logger.debug(series)

        # Upcoming & currently on-sale models
        for sel in response.css("div.interval01-list-cars-infor"):
            loader = ItemLoader(item=ModelItem(), selector=sel)
            loader.add_css("model_id", "a::attr(href)")
            loader.add_css("model_name", "a::text")
            loader.add_value("series_id", series['series_id'])
            loader.add_value("series_name", series['series_name'])

            yield loader.load_item()

        # Discontinued models, fetched per model-year via the AJAX endpoint
        url = "http://www.autohome.com.cn/ashx/series_allspec.ashx"

        years = response.css(".dropdown-content a::attr(data)")

        for year in years.extract():
            qs = {
                "y": year,
                "s": series["series_id"]
            }

            yield Request(url + "?" + urlencode(qs), self.stop_sale)
示例#25
0
    def parse(self, response, **kwargs):
        """Load a ReservaItem from a protected-area detail page."""
        loader = ItemLoader(item=ReservaItem(), selector=response)
        loader.add_value('url_item', response.url)

        # All four fields sit in consecutive paragraphs of the same table
        # cell; only the p:nth-child position differs.
        cell = ('.item-page > table:nth-child(3) > tbody:nth-child(1) > '
                'tr:nth-child(1) > td:nth-child(2) > p:nth-child(%d)')
        loader.add_css('name', cell % 2)
        loader.add_css('biome', (cell % 3) + '::text')
        loader.add_css('size_area', (cell % 4) + '::text')
        loader.add_css('unity_created_at', (cell % 5) + '::text')

        # Administration, address and phone live in consecutive text nodes
        # of one paragraph.
        detail = response.xpath(
            '/html/body/div[2]/main/div/div/div/section/div/div[1]/table/tbody/tr[1]/td[2]/p[5]/text()'
        ).extract()
        loader.add_value('regional_administration', detail[0])
        loader.add_value('address', detail[1])
        loader.add_value('phones', detail[2])

        yield loader.load_item()
示例#26
0
    def load_lesson(self, selector):
        """Build a Lesson item from one lesson node; the download filename
        is derived from the lesson name plus the url's file extension."""
        lesson_loader = ItemLoader(items.Lesson(), selector)
        name = selector.xpath(
            './/span[@itemprop="name"]/text()').extract_first()
        url = selector.xpath(
            './/link[@itemprop="contentUrl"]/@href').extract_first()
        # e.g. '.../intro.mp4' -> extension 'mp4'
        extension = url.split('.')[-1]
        lesson_loader.add_value('name', name)
        lesson_loader.add_value('file_urls', url)
        lesson_loader.add_value('filename', f'{name}.{extension}')
        lesson_loader.add_css('duration', 'em.lessons-list__duration::text')

        return lesson_loader.load_item()
示例#27
0
    def parse_question(self, response):
        """Parse a zhihu question page into a ZhihuQuestionItem, then
        request the first page of its answers.

        Fixed: the original referenced ``question_id`` even when the URL
        regex did not match (NameError); unmatched URLs are now skipped.
        The pattern is also a raw string now.
        """
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if not match_obj:
            return
        question_id = int(match_obj.group(2))
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # Title
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        # Question body
        item_loader.add_css("content", ".QuestionHeader-detail")
        # Question url
        item_loader.add_value("url", response.url)
        # Question id
        item_loader.add_value("zhihu_id", question_id)
        # Answer count
        item_loader.add_xpath("answer_num",
                              "//h4[@class='List-headerText']/span//text()")
        # Comment count
        item_loader.add_css("comment_num",
                            ".QuestionHeader-Comment button::text")
        # Follower and view counts
        item_loader.add_xpath(
            "watch_user_num",
            "//strong[@class='NumberBoard-itemValue']/text()")
        # Topics the question belongs to
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        # First answers page: question id, page size 20, offset 0.
        yield scrapy.Request(self.start_answer_url.format(
            str(question_id), 20, 0),
                             callback=self.parse_answer)
        yield question_item
示例#28
0
    def parse_product(self, response):
        """Scrape a product detail page, then chain stock/price and review
        requests for the same article."""
        meta = response.meta
        loader = ItemLoader(ProductItem(), selector=response)

        url = response.url
        article_id = self.extract_id_from_url(url)
        if not article_id:
            # Not a recognizable product URL; nothing to emit.
            return
        loader.add_value('article_id', article_id)
        meta.update({'product_id': article_id, 'product_url': url})
        loader.add_value('url', url)
        loader.add_value('list_page', meta.get('list_page', ''))
        loader.add_css('images', '.lh li img::attr(src)')
        loader.add_css('title', '.sku-name::text')
        loader.add_css('currency', '.p-price span::text')
        loader.add_css('description',
                       '.parameter2.p-parameter-list *::text')
        loader.add_value('vendor_id', self.get_vendor_id(response))
        product = loader.load_item()

        yield product
        yield self.get_stock_and_price(product)
        yield response.follow(self.review_url.format(
            article_id, '0', self.comments_request_length),
                              callback=self.parse_review,
                              meta=meta)
示例#29
0
    def parse(self, response):
        """Yield a Producte per product card, then follow pagination."""
        for card in response.css('div.products article.product-miniature'):
            loader = ItemLoader(item=Producte(), selector=card)
            loader.add_css('nom', 'h2.product-title > a::text')
            loader.add_css('url', 'h2.product-title > a::attr(href)')
            loader.add_css('preu',
                           'div.product-price-and-shipping span.price::text')
            loader.add_css(
                'preu_original',
                'div.product-price-and-shipping span.regular-price::text')

            # An 'agotado' flag on the card means the product is sold out.
            if card.css('ul.product-flags li.agotado').get() is None:
                loader.add_value('stock', 'Disponible')
            else:
                loader.add_value('stock', 'Agotado')

            producte = loader.load_item()
            # Store name is fixed for this spider.
            producte['botiga'] = 'Jugamosotra'
            yield producte

        # Following pages
        for next_page in response.css('ul.page-list a[rel=next]::attr(href)'):
            yield response.follow(next_page, self.parse)
示例#30
0
    def parse_questions(self, response):
        """Load a ZhihuQuestionItem, then request its first answers page."""
        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        q_id = response.meta['question_id']
        loader.add_value('q_id', q_id)
        loader.add_value('q_url', response.url)
        loader.add_css('q_title', '.QuestionHeader-tags+h1::text')
        loader.add_css(
            'q_content',
            '.QuestionRichText.QuestionRichText--collapsed span::text')
        loader.add_css('q_topic', '.Tag.QuestionTopic .Popover div::text')
        loader.add_css('q_answers_num', '.List-headerText span::text')
        loader.add_xpath(
            'q_follower',
            '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/button//strong/text()'
        )
        loader.add_xpath(
            'q_watcher',
            '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/div//strong/text()'
        )
        loader.add_value('crawl_time', datetime.datetime.now())
        question_item = loader.load_item()

        # First page of answers: question id, page size 15, offset 0.
        yield Request(url=self.temp_answers_url[0].format(q_id, 15, 0),
                      callback=self.parse_answers,
                      headers=self.headers,
                      meta={'q_id': q_id})

        yield question_item
    def parse(self, response):
        """Yield one ProductoMercadoLibre per result card."""
        for producto in response.css('div.item__info'):
            loader = ItemLoader(item=ProductoMercadoLibre(),
                                selector=producto)
            loader.default_output_processor = TakeFirst()

            loader.add_css(
                'titulo',
                'div>h2.item__title>a.item__info-title>span.main-title::text')
            loader.add_css('precio',
                           'div.price__container>div.item__price')
            # NOTE(review): 'vendidos' and 'lugar' use the exact same
            # selector in the original — verify whether 'lugar' should
            # target a different node.
            loader.add_css(
                'vendidos',
                'div.item__stack_column>div.item__stack_column__info>div.stack_column_item>div.item__status>div.item__condition::text'
            )
            loader.add_css(
                'lugar',
                'div.item__stack_column>div.item__stack_column__info>div.stack_column_item>div.item__status>div.item__condition::text'
            )
            yield loader.load_item()
示例#32
0
 def parse(self, response):
     """Yield a TestprojItem per research-article card."""
     for news in response.css('.cg-research-article-link'):
         # NOTE(review): passing the selector as response= looks wrong —
         # ItemLoader expects a Response there; selector= already scopes
         # the queries. Kept as-is to preserve behavior; confirm intent.
         loader = ItemLoader(item=TestprojItem(),
                             selector=news,
                             response=news)
         loader.add_css('_title', '.cg-research-article-title::text')
         loader.add_css('_abstract', '.cg-research-article-excerpt::text')
         loader.add_css('_author', '.name::text')
         loader.add_css('_image', '.cg-research-article-image::attr(style)')
         loader.add_css('_date', '.dotlist span::text')
         loader.add_value('_source', response.url)
         yield loader.load_item()
示例#33
0
    def parse_detail(self, response):
        """Scrape one proxy-list page into a FreeproxyItem, then follow the
        pagination link.

        Bug fix: the original concatenated ``extract_first()`` directly into
        the next-page URL, which raises TypeError (str + None) on the last
        page where no "next_page" anchor exists; pagination is now guarded.
        """
        loader = ItemLoader(item=FreeproxyItem(),
                            response=response,
                            url=self.start_urls[0])

        # Country comes from the flag image's alt text, lower-cased.
        loader.add_xpath("area", "//tr/td[1]/img/@alt",
                         MapCompose(lambda x: x.lower()))
        loader.add_css("ip", "table > tr > td:nth-of-type(2)::text")
        loader.add_css("port", "table > tr > td:nth-of-type(3)::text")
        loader.add_css("ssl", "table > tr > td:nth-of-type(6)::text")
        loader.add_css("security", "table > tr > td:nth-of-type(5)::text",
                       MapCompose(self.__fix_security))

        yield loader.load_item()

        next_href = response.xpath(
            "//div/a[@class='next_page']/@href").extract_first()
        if next_href:
            yield scrapy.Request(self.start_urls[0] + next_href,
                                 callback=self.parse_detail)
示例#34
0
文件: pedaily.py 项目: 3con/ztcrawl
    def parse_item(self, response):
        """Fill the article item passed in via ``response.meta`` with the
        pedaily.cn article fields, then yield it.

        Bug fix: ``extract().append(...)`` evaluates to None because
        ``list.append`` returns None, so 'image_urls' was always loaded with
        None.  The list is now built first and the inherited URL(s) appended
        afterwards.  Also guards against articles with no author segment.
        """
        l = ItemLoader(item=response.meta['item'], response=response)
        l.add_value('classId', '18')
        l.add_value('cataName', u'私募股权资讯')
        l.add_value('url', response.urljoin(response.url))
        l.add_css('title', 'h1::text')

        # Keywords: drop empty entries produced by consecutive commas.
        keywords = response.css('meta[name*=eywords]::attr(content)').extract()[0]
        keywords_list = [kw for kw in keywords.split(',') if kw]
        l.add_value('keywords', keywords_list)
        l.add_value('seo_keywords', keywords)

        description = response.css('.news-show .subject::text').extract()
        l.add_value('description', description)
        l.add_value('seo_description', description)

        l.add_value('publishTime', response.css('.date::text').extract()[0])

        # "source author" separated by ideographic spaces (U+3000).
        tmp = [part for part in
               response.css('.news-show .box-l::text').extract()[0].split(u'\u3000')
               if part]
        l.add_value('source', tmp[0].replace(' ', ''))
        # Guard: some articles carry no author segment at all.
        author = tmp[1].replace(' ', '') if len(tmp) > 1 and tmp[1] != u'\u3000' else ''
        l.add_value('author', author)

        # pedaily view counts are loaded via ajax; not worth scraping.

        image_urls = response.css('#news-content img::attr(src)').extract()
        # NOTE(review): meta['item']['image_urls'] may itself be a list —
        # append preserves the original one-liner's intent; confirm whether
        # extend() is actually wanted here.
        image_urls.append(response.meta['item']['image_urls'])
        l.add_value('image_urls', image_urls)

        content = response.css('#news-content').extract()[0]

        # Replace each anchor tag with its bare text (de-link the body).
        atags = response.css('#news-content a').extract()
        atexts = response.css('#news-content a::text').extract()
        if len(atags) == len(atexts):
            for index, atag in enumerate(atags):
                content = content.replace(atag, atexts[index])

        l.add_value('content', content)
        yield l.load_item()
示例#35
0
    def parse_user(self, response, user_avatar_url=''):
        """Build a user dict from a profile page and fan out one request per
        follower/following listing page; on any failure yield an error record."""
        loader = ItemLoader(item=User(), response=response)
        for field, css in ((FULL_NAME, '.name-bio-message h3'),
                           (USERNAME, '.profileusername'),
                           (FOLLOWER, '.user-followers span'),
                           (FOLLOWING, '.user-following span')):
            loader.add_css(field, css)

        profile = dict(loader.load_item())
        try:
            # The numeric user id is the last URL path segment.
            profile[USER_ID] = int(response.url.split('/')[-1])
            profile[FOLLOWER] = int(profile[FOLLOWER])
            profile[FOLLOWING] = int(profile[FOLLOWING])
            profile[TYPE] = USER
            profile[USER_AVATAR_URL] = user_avatar_url
            yield profile
            # One request per paginated page: followers first, then follows.
            for f_type, segment in ((FOLLOWER, 'followers'),
                                    (FOLLOWING, 'follows')):
                if profile[f_type]:
                    for page in range(1, 1 + profile[f_type]):
                        yield Request(
                            url=FOLLOW_URL.format(profile[USER_ID], segment,
                                                  page),
                            callback=self.parse_follow,
                            cb_kwargs=dict(user=profile, f_type=f_type))
        except Exception as e:
            yield {TYPE: ERROR, URL: response.url, ERROR: str(e)}
示例#36
0
 def parse_item(self, response):
     """
     Parse an allitebooks property page into an AllitebooksItem.
     :param response:
     :return: item
     """
     loader = ItemLoader(item=AllitebooksItem(), response=response)
     strip = MapCompose(str.strip)
     # Primary fields straight from the page.
     loader.add_css('title', '.single-title::text', strip)
     loader.add_css('cover', '.entry-body-thumbnail>a>img::attr(src)')
     details = response.css('.book-detail>dl>dd::text').extract()
     authors = response.css(
         '.book-detail>dl>dd:nth-child(2)>a::text').extract()
     categories = response.css(
         '.book-detail>dl>dd:nth-child(16)>a::text').extract()
     loader.add_value('author', ','.join(authors), strip)
     loader.add_value('category', ','.join(categories), strip)
     # The remaining <dd> texts line up positionally with these fields.
     details = details[len(authors):(-len(categories))]
     field_names = "isbn year pages language file_size file_format".split()
     for position, field_name in enumerate(field_names):
         loader.add_value(field_name, details[position], strip)
     loader.add_css('description', '.entry-content')
     loader.add_css('download', 'span.download-links>a::attr(href)',
                    strip, TakeFirst())
     # Housekeeping fields
     loader.add_value('url', response.url)
     loader.add_value('spider', self.name)
     loader.add_value('date', datetime.datetime.now())
     yield loader.load_item()
示例#37
0
 def vacansy_parse(self, response):
     """Parse a superjob.ru vacancy page into a JobparserItem.

     Fixes two defects in the original:
     * the class lists were space-separated ('div._3mfro CuJz5 ...'), which
       CSS reads as a descendant chain of (nonexistent) tag names and so
       never matches; classes on one element must be dot-joined.
     * the loaded item was never yielded, so the spider emitted nothing.
     """
     loader = ItemLoader(item=JobparserItem(), response=response)
     loader.add_css('name', 'div._3mfro.CuJz5.PlM3e._2JVkc._3LJqf::text')
     loader.add_value('url_vacancy', response.url)
     loader.add_css('min_salary',
                    'span._3mfro._2Wp8I.ZON4b.PlM3e._2JVkc::text')
     loader.add_value('max_salary', None)
     loader.add_value('source', 'superjob.ru')
     yield loader.load_item()
示例#38
0
    def parse_question(self, response):
        """Extract a ZhihuQuestionItem from a question page and schedule the
        first batch of answers.

        Bug fix: ``question_id`` was only bound when the new-layout branch
        matched, so any other page raised NameError below.  It now defaults
        to None and the method bails out when no id can be parsed.
        """
        question_id = None
        if "QuestionHeader-title" in response.text:
            # New page layout: the question id is embedded in the URL.
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*",
                                 response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
        if question_id is None:
            self.logger.warning('could not extract question id from %s',
                                response.url)
            return

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_xpath(
            "comments_num",
            "//*[@class='QuestionHeader-Comment']/button/text()")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@class='QuestionHeader-follow-status']/div/div/button/div/strong/text()"
        )
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 3, 0),
                             callback=self.parse_answer)
        yield question_item
示例#39
0
    def parse_item(self, response):
        """Scrape one gaokaopai.com major ("zhuanye") page into a
        GaokaopaiZhuanyeItem, then chase the employment-statistics page with
        the loaded item carried in ``meta``.

        NOTE: Python 2 source (``ur''`` string literals).
        """

        loader = ItemLoader(GaokaopaiZhuanyeItem(), response)

        loader.add_value('url', response.url)
        loader.add_css('name', u'.majorTitle>h1::text')

        # The majorBase <h3> rows read "label:value"; re=ur':(.+)' keeps the value.
        loader.add_xpath('code', u'//div[@class="majorBase"]/h3[starts-with(., "专业代码:")]/text()', re=ur':(.+)')
        loader.add_xpath('degree', u'//div[@class="majorBase"]/h3[starts-with(., "授予学位:")]/text()', re=ur':(.+)')
        loader.add_xpath('period', u'//div[@class="majorBase"]/h3[starts-with(., "修学年限:")]/text()', re=ur':(.+)')
        loader.add_xpath('courses', u'//div[@class="course"]/h3[.="开设课程:"]/following-sibling::p/text()')

        def parse_related():
            # Related majors: the <a> siblings after the "相近专业:" heading.

            for e in response.xpath(u'//div[@class="course"]/h3[.="相近专业:"]/following-sibling::a'):
                yield {
                    'url': e.css('::attr(href)').extract_first(),
                    'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                    'name': e.css('::text').extract_first(),
                }

        loader.add_value('related', list(parse_related()))

        def parse_category():
            # Category path: the currently-highlighted entry of each sidebar list.

            category = []

            for i in [u"学历类别", u"学科门类", u"专业类别"]:
                x = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(i)
                e = response.xpath(x)
                category.append({
                    'url': e.css('::attr(href)').extract_first(),
                    'code': e.css('::attr(href)').re_first(ur'/zhuanye([-0-9]*)\.html').strip('-'),
                    'name': e.css('::text').extract_first(),
                })

            return category

        loader.add_value('category', parse_category())
        loader.add_css('detail', u'.majorCon')

        item = loader.load_item()

        # Follow the employment page for this major; parse_jiuye merges into item.
        return Request(
            url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
            meta={'item': item},
            callback=self.parse_jiuye
        )
示例#40
0
 def parse_question(self,response):
     """Extract a ZhihuQuestionItem from a question page and schedule the
     first answers request (20 answers, offset 0).

     Fix: removed the dead trailing statements — a bare ``scrapy.FormRequest``
     expression and a ``pass`` — which were no-ops.
     """
     question_id = response.meta.get('question_id','')
     item_loader = ItemLoader(item = ZhihuQuestionItem(),response=response)
     item_loader.add_css('title','.QuestionHeader h1.QuestionHeader-title::text')
     item_loader.add_css('topics','.QuestionTopic .Popover  div::text')
     item_loader.add_css('content','.QuestionHeader-detail span::text')
     item_loader.add_value('url',response.url)
     item_loader.add_value('zhihu_id',question_id)
     item_loader.add_css('answer_num','.List-headerText span::text ')
     item_loader.add_css('comments_num','.QuestionHeader-Comment button::text')
     item_loader.add_css('watch_user_num','.NumberBoard-itemValue::text')
     question_item = item_loader.load_item()
     yield scrapy.Request(self.start_answer_url.format(question_id,20,0),headers=self.headers, callback=self.parse_answer)
     yield question_item
示例#41
0
文件: eol.py 项目: EasyData/gaokao
    def parse(self, response):
        """Walk the comparison table on an eol.cn page: rows with a centered
        cell introduce a new category (ccode/cname); every left-aligned link
        cell after that becomes one EolZhuanyeItem request.

        NOTE: Python 2 source (``unicode.strip``).
        """

        for outer in response.css('#comapreTable tr:not(:first-child)'):

            # A centered cell starts a new category; it stays in effect for
            # the following rows until the next centered cell appears.
            # NOTE(review): if the very first row lacks a centered cell,
            # ccode/cname are unbound below — assumed never to happen on this
            # table layout; confirm.
            if outer.css('td[align="center"]'):
                ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
                cname = outer.css('td[align="center"]>a::text').extract_first()

            for inner in outer.xpath('td[div[@align="left"]/a]'):
                loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
                loader.add_value('ccode', ccode)
                loader.add_value('cname', cname)
                # Relative hrefs are resolved against the spider's start URL.
                loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
                loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
                loader.add_css('name', 'a::text', MapCompose(unicode.strip))
                item = loader.load_item()

                yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
示例#42
0
文件: forzieri.py 项目: EasyData/shop
    def parse_item(self, response):
        """Load the static fields of a forzieri product page into a ShopItem,
        then follow the "scheda tecnica" (tech-sheet) ajax URL, handing the
        half-filled loader to parse_item_ajax via ``meta``.

        NOTE: Python 2 source (``urlparse`` module).
        """

        loader = ItemLoader(item=ShopItem(), response=response)
        loader.add_css('id', '#breadcrumbs .last-child::text')
        loader.add_value('url', response.url)
        loader.add_css('brand', '.productTitle>.brand_name>a::text')
        loader.add_css('name', '.productTitle>h1::text')
        loader.add_css('desc', 'p[itemprop="description"]')
        # Breadcrumb segments joined into a single category string.
        loader.add_css('cate', '#breadcrumbs span[itemprop="title"]::text', Join())
        loader.add_value('site', 'forzieri')
        loader.add_value('lang', self.lang)
        loader.add_value('time', time.time())

        # Resolve the tech-sheet ajax href against the start URL.
        url = urlparse.urljoin(
            self.start_urls[0],
            response.css('#scheda_tecnica_tab_trigger::attr(href)').extract_first(),
        )
        return Request(url, meta={'loader': loader}, callback=self.parse_item_ajax)
示例#43
0
    def parse(self, response):
        """Scrape one real-estate listing page into a UrbanusItem."""
        loader = ItemLoader(item=UrbanusItem(), response=response)

        # Raw page fragments.
        loader.add_xpath('title', "//section[1]/div/div[2]/div[1]/h1")
        loader.add_xpath('address', "//section[2]/div/div/p")
        loader.add_xpath('photos', "//div[contains(@class, 'slide_gale')]/span/img/@src")
        loader.add_value('url', response.url)

        # Structured attributes pre-parsed out of the page body.
        props = extract_properties(response)
        for field in ('bedrooms', 'bathrooms', 'area_total',
                      'area_constructed', 'garage'):
            loader.add_value(field, getattr(props, field))

        loader.add_css('price', "span.inmueble_price")
        loader.add_css('description', "div.show_detail")

        return loader.load_item()
示例#44
0
文件: shopbop.py 项目: EasyData/shop
    def parse_item(self, response):
        """Scrape a shopbop product page into a ShopItem.

        NOTE: Python 2 source (``unicode.strip`` as a processor).
        """

        loader = ItemLoader(item=ShopItem(), response=response)
        # Product id is the numeric segment of the .htm URL.
        loader.add_value('id', response.url, re=r'/([0-9]+).htm\?')
        loader.add_value('url', response.url)
        loader.add_css('brand', '#product-information h1 .row[itemprop="brand"]::text', TakeFirst(), unicode.strip)
        loader.add_css('name', '#product-information h1 .row[itemprop="name"]::text', TakeFirst(), unicode.strip)
        # Description is stored with markup stripped.
        loader.add_css('desc', 'div[itemprop="description"]', TakeFirst(), remove_tags, unicode.strip)
        # Breadcrumb segments joined into a single category string.
        loader.add_css('cate', '.breadcrumb-list .breadcrumb span::text', Join())
        loader.add_value('site', 'shopbop')
        loader.add_value('lang', self.lang)
        loader.add_value('time', time.time())
        return loader.load_item()
示例#45
0
 def parse_titles(self, response):
     """Emit one BlogCategory item for the article on this page; the hub
     name arrives through ``response.meta['hname']``."""
     item_loader = ItemLoader(item=BlogCategory(), response=response)
     item_loader.add_value('hub', response.meta['hname'])
     for field, css in (('title', 'div.company_post h1 span::text'),
                        ('date', 'div.published::text'),
                        ('article', 'div.content::text')):
         item_loader.add_css(field, css)
     yield item_loader.load_item()
示例#46
0
    def parse_detail(self, response):
        """Build a NewsItem from a detail page using the spider's configured
        CSS hooks (self.title/date/auth/content); category comes via meta."""
        loader = ItemLoader(NewsItem(), response=response)

        selectors = (
            ("title", "%s::text" % self.title),
            ("date", "%s::text" % self.date),
            ("auth", "%s::text" % self.auth),
            ("content", "%s > p::text" % self.content),
        )
        for field, css in selectors:
            loader.add_css(field, css)

        loader.add_value("cate", response.meta["cate"])
        return loader.load_item()
示例#47
0
    def parse_item(self, response):
        """Scrape one article into a ZtArticleItem under the fixed category
        classId=51 ("公募要闻" / public-fund news)."""
        l = ItemLoader(item=ZtArticleItem(), response=response)
        l.add_value('classId', '51');
        l.add_value('cataName', u'公募要闻')
        l.add_value('url', response.urljoin(response.url))
        l.add_css('title', 'h1.title::text')

        # On this site the SEO keywords double as tags and description is the
        # abstract; the description begins with ideographic spaces (U+3000).
        l.add_css('keywords', 'meta[name*=keywords]::attr(content)')
        l.add_css('seo_keywords', 'meta[name*=keywords]::attr(content)')
        
        description = response.css('meta[name*=description]::attr(content)').extract()[0].replace(u'\u3000', '') # strip the leading ideographic spaces
        l.add_value('description', description)
        l.add_value('seo_description', description)
        
        l.add_css('publishTime', '.timer::text')
        source = response.css('.source a::text').extract()
        
        # Prefer the linked source name; fall back to the plain-text source
        # with its "来源:" ("source:") label removed.
        if len(source) == 0:
            tmp = response.css('.source::text').extract()[0]
            tmp = tmp.replace(u'来源:', '')
            l.add_value('source', tmp)
            # l.add_value('author', tmp)
        else:
            l.add_value('source', source[0])
        # No real view count available: fake a random three-digit number.
        l.add_value('views', randint(100, 999))

        l.add_css('image_urls', '#qmt_content_div p img::attr(src)')

        content = response.css('#qmt_content_div').extract()[0]
        # Strip the ad block from the article body.
        content = content.replace(response.css('.visible-lg-block.visible-md-block').extract()[0], '')
        l.add_value('content', content)
        yield l.load_item()
示例#48
0
文件: chsi.py 项目: EasyData/gaokao
    def parse_item(self, response):
        """Scrape one university page on gaokao.chsi.com.cn into a
        ChsiDaxueItem, then follow the "学校简介" (school profile) link with
        the item in ``meta``.

        NOTE: Python 2 source (``ur''`` literals, ``unicode.strip``).
        """

        loader = ItemLoader(ChsiDaxueItem(), response)
        # School id is the schId segment of the URL.
        loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
        loader.add_value('url', response.url)
        loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
        loader.add_css('name', u'.topImg::text')
        loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

        # Collapse runs of whitespace and strip; labels below are e.g.
        # "院校类型:" (school type), "通讯地址:" (address), etc.
        data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
        loader.add_xpath('type', u'//span[@class="f_bold" and .="院校类型:"]/following-sibling::text()', data_clean)
        loader.add_xpath('membership', u'//span[@class="f_bold" and .="院校隶属:"]/following-sibling::text()', data_clean)
        loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
        loader.add_xpath('address', u'//span[@class="f_bold" and .="通讯地址:"]/following-sibling::text()', data_clean)
        loader.add_xpath('phone', u'//span[@class="f_bold" and .="联系电话:"]/following-sibling::text()', data_clean)
        loader.add_xpath('website', u'//span[@class="f_bold" and .="学校网址:"]/following-sibling::a/@href', data_clean)
        loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="特殊招生:"]/following-sibling::text()', data_clean)

        def parse_votes():
            # Satisfaction ratings: overall, campus environment, campus life.
            xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
            get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
            return {
                'overall': get_vote(u'综合满意度'),
                'environment': get_vote(u'校园环境满意度'),
                'life': get_vote(u'生活满意度'),
            }

        loader.add_value('votes', parse_votes())

        def parse_trending():
            # Popular-majors tables: header row skipped, one dict per major.
            css = u'{}>table tr:not(:first-child)'
            def get_trending(what):
                majors = []
                for e in response.css(css.format(what)):
                    majors.append({
                        'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                        'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                        'vote': float(e.css(u'.avg_rank::text').extract_first()),
                        'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                    })
                return majors
            return {
                'count': get_trending(u'#topNoofPTable'),
                'index': get_trending(u'#topIndexTable'),
                'like': get_trending(u'.r_r_box_zymyd'),
            }

        loader.add_value('trending', parse_trending())

        item = loader.load_item()

        # Follow the school-profile page; parse_jianjie merges into the item.
        for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="学校简介"]').extract_links(response):
            yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)
示例#49
0
文件: hexun.py 项目: bgcolors/ztcrawl
    def parse_item(self, response):
        """Scrape one hexun.com article into a ZtArticleItem under the fixed
        category classId=57 ("市场动态" / market news)."""
        l = ItemLoader(item=ZtArticleItem(), response=response)
        l.add_value('classId', '57');
        l.add_value('cataName', u'市场动态')
        l.add_value('url', response.urljoin(response.url))
        l.add_css('title', '#artibodyTitle h1::text')

        # Keywords are space-separated; drop empty entries.
        keywords = response.css('meta[name*=eywords]::attr(content)').extract()[0]
        keywordsList = keywords.split(' ')
        while '' in keywordsList:
            keywordsList.remove('')
            
        l.add_value('keywords', keywordsList)
        l.add_value('seo_keywords', keywords)
        
        # Description: first 200 chars of the concatenated body paragraphs.
        description = ''.join(response.css('#artibody p::text').extract())
        if len(description) > 200:
            description = description[:200]
            
        l.add_value('description', description)
        l.add_value('seo_description', description)
        
        l.add_value('publishTime', response.css('#pubtime_baidu::text').extract()[0])
        l.add_css('source', '#source_baidu a::text')

        # No real view count available: fake a random three-digit number.
        l.add_value('views', randint(100, 999))
        
        l.add_css('image_urls', '#artibody img::attr(src)')

        content = response.css('#artibody').extract()[0]
        
        # Ad removal (disabled on this site):
        # content = content.replace(response.css('.visible-lg-block.visible-md-block').extract()[0], '')

        # Replace each anchor tag with its bare text (de-link the body).
        atags = response.css('#artibody a').extract()
        atexts = response.css('#artibody a::text').extract()
        if (len(atags) == len(atexts)):
            for index, atag in enumerate(atags):
                content = content.replace(atag, atexts[index])

        l.add_value('content', content)
        yield l.load_item()
示例#50
0
    def parse_item(self, response):
        """Scrape one article into a ZtArticleItem under the fixed category
        classId=11 ("私募要闻" / private-equity news)."""
        l = ItemLoader(item=ZtArticleItem(), response=response)
        l.add_value('classId', '11');
        l.add_value('cataName', u'私募要闻')
        l.add_value('url', response.urljoin(response.url))
        l.add_css('title', '.hd h1::text')

        # Keywords are comma-separated; drop empty entries.
        keywords = response.css('meta[name*=eywords]::attr(content)').extract()[0]
        keywordsList = keywords.split(',')
        while '' in keywordsList:
            keywordsList.remove('')

        l.add_value('keywords', keywordsList)
        l.add_value('seo_keywords', keywords)
        
        description = response.css('meta[name*=escription]::attr(content)').extract()[0]
        l.add_value('description', description)
        l.add_value('seo_description', description)
        
        l.add_value('publishTime', response.css('.info::text').extract()[0])
        l.add_css('source', '.info .where::text')

        # No real view count available: fake a random three-digit number.
        l.add_value('views', randint(100, 999))
        
        l.add_css('image_urls', '#qmt_content_div p img::attr(src)')

        content = response.css('#Cnt-Main').extract()[0]
        
        # Ad removal (disabled on this site):
        # content = content.replace(response.css('.visible-lg-block.visible-md-block').extract()[0], '')

        # Replace each anchor tag with its bare text (de-link the body).
        atags = response.css('#Cnt-Main a').extract()
        atexts = response.css('#Cnt-Main a::text').extract()
        if (len(atags) == len(atexts)):
            for index, atag in enumerate(atags):
                content = content.replace(atag, atexts[index])

        # Drop the "(专栏)" ("column") marker from the body.
        content = content.replace(u'(专栏)', '')

        l.add_value('content', content)
        yield l.load_item()
示例#51
0
    def parse_article(self, response):
        """Parse an interview article into one PersonItem plus a list of
        ToolItems (one per link in the article body), yielded together as a
        single dict ``{'person': ..., 'tools': [...]}``."""
        # Initialize some I/O processors
        join_all = Join('')
        take_first = TakeFirst()
        identity = Identity()
        prepend_url = PrependResponseUrl(response.url)
        strip_all, strip_one = StripAll(), StripOne()
        add_space_after_punct = AddSpaceAfterPunct()

        # Load PersonItem
        person_loader = ItemLoader(item=PersonItem(), response=response)
        person_loader.default_output_processor = take_first
        person_loader.add_css('name', 'h3.p-name::text', strip_all)
        person_loader.add_value('article_url', response.url)
        person_loader.add_css('pub_date', 'time.dt-published::attr(datetime)')
        person_loader.add_css('title', 'p.summary.p-summary::text', strip_all)
        person_loader.add_css('img_src', 'img.portrait::attr(src)', prepend_url)
        # The article body is four <h4>-delimited sections; count the
        # preceding <h4> headings to select paragraphs of section 1..4.
        person_loader.add_xpath('bio', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=1]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_loader.add_xpath('hardware', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=2]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_loader.add_xpath('software', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=3]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_loader.add_xpath('dream', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=4]/descendant-or-self::*/text()', join_all, add_space_after_punct)
        person_item = person_loader.load_item()

        # @gbrener 8/16/2015: The following line causes a NotImplementedError
        #object.__setattr__(person_item, 'export_empty_fields', True)
        person_item.fill_empty_fields()

        # Load a list of ToolItems
        tool_items = []
        for tool_selector in response.css('div.e-content p a'):
            tool_loader = ItemLoader(item=ToolItem(), selector=tool_selector, response=response)
            tool_loader.default_output_processor = take_first
            tool_loader.add_xpath('tool_name', './descendant-or-self::*/text()', join_all, strip_one)
            tool_loader.add_xpath('tool_url', './@href')
            tool_item = tool_loader.load_item()

            # @gbrener 8/16/2015: The following line causes a NotImplementedError
            #object.__setattr__(tool_item, 'export_empty_fields', True)
            tool_item.fill_empty_fields()

            tool_items.append(tool_item)

        yield dict(person=person_item, tools=tool_items)
示例#52
0
 def parse_item(self, response):
     """Return an ImageItem holding every <img src> URL found on the page."""
     loader = ItemLoader(item=ImageItem(), response=response)
     loader.add_css('image_urls', 'img::attr(src)')
     return loader.load_item()
示例#53
0
    def parse_item(self, response):
        """Fill the article item passed via ``meta`` with eastmoney.com
        fields under the fixed category classId=49 ("国内经济" / domestic
        economy), then yield it."""
        l = ItemLoader(item=response.meta['item'], response=response)
        l.add_value('classId', '49');
        l.add_value('cataName', u'国内经济')
        l.add_value('url', response.urljoin(response.url))
        l.add_css('title', '.newsContent h1::text')
        l.add_css('seo_title', '.newsContent h1::text')
        l.add_css('seo_keywords', 'meta[name*=keywords]::attr(content)')
        l.add_css('seo_description', 'meta[name*=description]::attr(content)')
        # The source on eastmoney is an image, so hard-code the site name.
        l.add_value('source', u'东方财富网')
        # l.add_value('author', u'东方财富网')
        # No real view count available: fake a random three-digit number.
        l.add_value('views', randint(100, 999))

        l.add_css('keywords', 'meta[name*=keywords]::attr(content)')
        l.add_css('description', '.c_review::text')

        # Normalize e.g. "2015年12月9日 11:09" -> "2015-12-09 11:09" by
        # keeping only digits and re-formatting.
        publishTime = response.css('.Info span:first-child::text').extract()[0]
        publishTime = time.strftime('%Y-%m-%d %H:%M', time.strptime(re.sub('[^0-9]', '', publishTime), '%Y%m%d%H%M'))
        l.add_value('publishTime', publishTime)

        content = response.css('#ContentBody').extract()[0]
        # Strip the abstract block from the body.
        substract = response.css('.c_review').extract();
        if len(substract):
            content = content.replace(substract[0], '')

        # Strip the "related reading" ad block.
        ad = response.css('.reading').extract()
        if len(ad) > 0:
            content = content.replace(ad[0], '')

        # Replace each anchor tag with its bare text (de-link the body);
        # anchors with no text fall back to their first child element.
        atags = response.css('#ContentBody a')
        ataghtml = response.css('#ContentBody a').extract()
        for index, atag in enumerate(atags):
            atext = atag.css('::text').extract()
            if len(atext):
                content = content.replace(ataghtml[index], atext[0])
            else:
                content = content.replace(ataghtml[index], atag.css(':first-child').extract()[0])
                
        l.add_value('content', content)

        yield l.load_item()
示例#54
0
    def parse_content_page(self, response):
        """Scrape one article page into a LuliItem, following one level of
        meta/JS redirect if the body matches ``redirect_re``.

        Each field has two alternative selectors because the site serves two
        different page templates.
        """

        # Detect if this is a redirection page
        m = redirect_re.search(response.body)
        if m:
            # Fetch the redirect target synchronously (bypasses the Scrapy
            # scheduler) and wrap it so the selectors below work unchanged.
            import requests
            new_url = m.group(1)
            new_content = requests.get(new_url).content
            response = scrapy.http.HtmlResponse(new_url, body=new_content)

        # Start scraping
        il = ItemLoader(item = LuliItem(), response=response)
        
        il.add_css('content', 'div#articleNew > p::text')
        il.add_css('content', 'div[itemprop="articleBody"] > p')
        
        il.add_css('date', 'div#articleDate::text')
        il.add_css('date', 'header > time[datetime]::attr(datetime)')
        
        il.add_css('title', 'div#articleNew > h1::text')
        il.add_css('title', 'h1[itemprop="headline"]::text')
        
        il.add_value('url', response.url)

        item = il.load_item() 
        yield item
 def parse_movie(self, response):
     """Scrape a douban movie page into a MovieItem, then follow the
     comments-section link.

     Fixes:
     * 'collections_number' and 'wishes_number' were loaded with
       ``add_value`` although the second argument is an XPath expression, so
       the literal XPath string (which the regex never matches) was stored
       instead of the page data — switched to ``add_xpath``.
     * the comments request is now only scheduled when the link exists
       (``extract_first()`` may return None).

     NOTE: Python 2 source (``unicode.strip`` in processors).
     """
     self.logger.info('Parse movie\'s url %s.', response.url)
     if response.status == 403:
         raise DropItem('Function parse_movie 403 page.')
     l = ItemLoader(item=MovieItem(), response=response)
     # Movie id is the numeric URL segment.
     l.add_value('id', response.url, re=r'/.*?/(\d+)/')
     l.add_xpath('name', '//span[@property="v:itemreviewed"]/text()')
     l.add_xpath('poster', u'//img[@title="点击看更多海报" and @rel="v:image"]/@src')
     l.add_xpath(
         'alternate_name',
         u'//div[@id="info"]/span[@class="pl"][contains(./text(), "又名:")]/following::text()[1]',
         MapCompose(lambda s: s.split('/'), unicode.strip)
     )
     l.add_css('year', '.year::text', re=r'\((\d+)\)')
     l.add_css('rating', '.rating_num::text')
     l.add_xpath('rating_per', '//span[@class="rating_per"]/text()')
     l.add_xpath('rating_betterthan', '//div[@class="rating_betterthan"]/a/text()')
     l.add_xpath('rating_betterthan_href', '//div[@class="rating_betterthan"]/a/@href')
     l.add_xpath('director', '//a[@rel="v:directedBy"]/text()')
     l.add_xpath('director_id', '//a[@rel="v:directedBy"]/@href', re=r'/.*?/(\d+)/')
     l.add_xpath('script_editor', '(//div[@id="info"]//span[@class="attrs"]/a)[2]/text()')
     l.add_xpath('script_editor_id', '(//div[@id="info"]//span[@class="attrs"]/a)[2]/@href', re=r'/.*?/(\d+)/')
     l.add_xpath('genre', '//span[@property="v:genre"]/text()')
     l.add_xpath('tags', '//div[@class="tags-body"]/a/text()')
     l.add_xpath(
         'summary',
         '//span[@property="v:summary"]/text()',
         MapCompose(unicode.strip), Join('<br>')
     )
     l.add_xpath('runtime', '//span[@property="v:runtime"]/text()')
     l.add_xpath('starring', '//a[@rel="v:starring"]/text()')
     l.add_xpath('starring_id', '//a[@rel="v:starring"]/@href', re=r'/.*?/(\d+)/')
     l.add_xpath('initialReleaseDate', '//span[@property="v:initialReleaseDate"]/text()')
     l.add_xpath(
         'region',
         u'//div[@id="info"]/span[@class="pl"][contains(./text(), "制片国家/地区:")]/following::text()[1]',
         MapCompose(unicode.strip)
     )
     l.add_xpath(
         'language',
         u'//div[@id="info"]/span[@class="pl"][contains(./text(), "语言:")]/following::text()[1]',
         MapCompose(unicode.strip)
     )
     l.add_xpath(
         'imdb',
         u'//div[@id="info"]/span[@class="pl"][contains(./text(), "IMDb链接:")]/following::a[1]/text()'
     )
     l.add_xpath(
         'imdb_href',
         u'//div[@id="info"]/span[@class="pl"][contains(./text(), "IMDb链接:")]/following::a[1]/@href'
     )
     l.add_xpath('recommendations_id', '//div[@class="recommendations-bd"]/dl/dd/a/@href', re=r'/.*?/(\d+)/')
     l.add_xpath('recommendations', '//div[@class="recommendations-bd"]/dl/dd/a/text()')
     l.add_xpath(
         'collections_number',
         '//div[@class="subject-others-interests-ft"]/a[1]/text()',
         re=r'(\d+)'
     )
     l.add_xpath(
         'wishes_number',
         '//div[@class="subject-others-interests-ft"]/a[2]/text()',
         re=r'(\d+)'
     )
     l.add_value('last_update_time', str(datetime.utcnow()))
     # download poster image file
     l.add_xpath('image_urls', u'//img[@title="点击看更多海报" and @rel="v:image"]/@src')
     yield l.load_item()
     comments_url = response.xpath(r'//div[@id="comments-section"]/div[@class="mod-hd"]/h2//a/@href').extract_first()
     if comments_url:
         yield Request(
             url=comments_url,
             callback=self.parse_comment
         )
Example #56
0
    def parse_question(self, response):
        """Parse a Zhihu question page and yield a ZhihuQuestionItem.

        Supports both the redesigned layout (detected via the
        "QuestionHeader-title" marker in the page body) and the legacy
        layout.  Also yields a request for the question's first page of
        answers via ``self.start_answer_url``.
        """
        # Extract the numeric question id from the URL once, up front.
        # The original duplicated this in both branches and left
        # question_id unbound (NameError) when the URL did not match;
        # default to None instead and guard the answer request below.
        question_id = None
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # Fields common to both layouts.
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)

        if "QuestionHeader-title" in response.text:
            # New page layout.
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        else:
            # Legacy page layout.
            item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

        # Only request the answers when the question id was actually found;
        # otherwise formatting the URL would embed the string "None".
        if question_id is not None:
            yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                                 headers=self.headers, callback=self.parse_answer)
        yield question_item
Example #57
0
 def parse_titles(self, response):
     """Build a Posts item: the page title (run through the instance's
     content_title_parser) plus every entry title under div.entries."""
     loader = ItemLoader(item=Posts(), response=response)
     loader.add_css('content_title', 'h1.pagetitle::text', self.content_title_parser)
     loader.add_css('post_title', 'div.entries > ul > li a::text')
     return loader.load_item()
Example #58
0
    def parse_item(self, response):
        """Populate a HouseRentingLianjiaItem from a Lianjia listing page."""
        loader = ItemLoader(item=HouseRentingLianjiaItem(), response=response)

        # Provenance of the record.
        loader.add_value('source', self.name)
        loader.add_value('source_url', response.url)

        # Listing content.
        loader.add_css('title', 'div.title *::text')
        loader.add_css('content', 'div.introduction *::text', re=r'\s*(.*)\s*')
        loader.add_css('image_urls', 'div.thumbnail > ul > li > img::attr(src)')
        loader.add_css('publish_time', 'div.zf-room > p::text')
        loader.add_css('price', 'div.price > span.total::text')
        loader.add_css('detail', 'div.zf-room *::text')

        # Listing agent.
        loader.add_css('author', 'div.brokerName > a.name::text')
        loader.add_css('author_link', 'div.brokerName > a.name::attr(href)')

        yield loader.load_item()
Example #59
0
    def parse_item(self, response):
        """Populate a HouseRenting58Item from a 58.com listing page.

        Yields a single item whose fields are scraped from the page's
        title, agent, picture-list, introduction and pricing sections.
        """
        selector = Selector(response=response)
        # NOTE(review): the original called selector.css('div.main-wrap') and
        # discarded the result — .css() returns a new SelectorList and does not
        # narrow `selector` in place, so the call was a no-op and is removed.

        item_loader = ItemLoader(item=HouseRenting58Item(), selector=selector, response=response)
        item_loader.add_css(field_name='title', css='div.house-title > h1::text')
        item_loader.add_value(field_name='source', value=self.name)
        item_loader.add_css(field_name='author', css='div.house-basic-info div.house-agent-info p.agent-name > a::text')
        # Strip any query string from image URLs via the capturing group.
        item_loader.add_css(field_name='image_urls', css='div.basic-pic-list > ul > li > img::attr(data-src)',
                            re=r'(.*)\?.*')
        item_loader.add_css(field_name='author_link',
                            css='div.house-basic-info div.house-agent-info p.agent-name > a::attr(href)')
        item_loader.add_css(field_name='content', css='ul.introduce-item *::text')
        item_loader.add_value(field_name='source_url', value=response.url)
        item_loader.add_css(field_name='publish_time', css='p.house-update-info::text')
        item_loader.add_css(field_name='price', css='div.house-pay-way *::text')
        item_loader.add_css(field_name='detail', css='div.house-desc-item > ul > li > span::text')

        yield item_loader.load_item()