Example #1
    def parse_reviews(self, response):
        """ Parse review page """
        data = json.loads(response.text)

        if ('entities' not in data) or ('RATING' not in data['entities']):
            return

        ratings = data['entities']['RATING']

        for review in data['entities']['REVIEWS'].values():
            yield ScraperItem(
                title="",
                text=review['reviewText'],
                rating=ratings[str(review['rating']['entities'][0]
                                   ['entity_ids'][0])]['rating'],
                source="Zomato",
            )

        if data['page_data']['sections']['SECTION_REVIEWS'][
                'numberOfPages'] > response.meta['page']:
            product_id = response.meta['id']
            next_page = response.meta['page'] + 1
            url = self.review_url(product_id, next_page)

            yield scrapy.Request(url,
                                 callback=self.parse_reviews,
                                 meta={
                                     'page': next_page,
                                     'id': product_id
                                 })
Example #2
    def parse_page(self, response):

        # use scrapy shell to find xpath
        # from scrapy.shell import inspect_response
        # inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath(
                "//div[@class='asset-header-content-inner']/h2/a/text()"
            ).extract()[0]

        except IndexError:
            item['title'] = ""

        item['text'] = " ".join(response.xpath(
            "//div[@class='asset-body']/child::node()").extract())

        try:
            item['date'] = response.xpath(
                "//abbr[@class='datetime']/text()").extract()[0]
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                "//ul[@class='asset-meta-list']/li[1]/a/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"

        yield item
Example #3
    def parse_page(self, response):

        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        # inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath(
                "//dt[@class='entry-title']/text()").extract()[0]
        except IndexError:
            item['title'] = ""

        item['text'] = " ".join(
            response.xpath(
                "//div[@class='entry-content']/child::node()").extract())

        try:
            item['date'] = response.xpath(
                "//abbr[@class='updated']/text()").extract()[0]
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                "//span[@class='comments-count']/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"

        yield item
Example #4
    def parse_page(self, response):

        # use scrapy shell to find xpath
        # from scrapy.shell import inspect_response
        # inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url
        try:
            item['title'] = response.xpath(
                "//div[@id='article']/p/b/text()").extract()[0]
        except IndexError:
            item['title'] = ""

        item['text'] = " ".join(
            response.xpath("//div[@id='article']/p").extract())

        try:
            item['date'] = response.xpath(
                "//div[@id='article']/table/tr[1]/td[1]/span[1]"
                "/table/tr[1]/td[2]/p/text()").extract()[0].split(":")[1]
        except IndexError:
            item['date'] = ""

        try:
            item['comment_count'] = int(response.xpath(
                "//div[@id='article']/table/tr[1]/td[1]/span[1]"
                "/table/tr[1]/td[1]/a/text()").extract()[0]
                .split("(")[1].split(")")[0])
        except IndexError:
            item['comment_count'] = 0
        yield item
Example #5
    def parse_page(self, response):

        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        #inspect_response(response)

        item = ScraperItem()

        item["url"] = response.url

        item["date"] = response.xpath(
            "//p[@class='entry-footer']/text()").extract()[0]

        item["text"] = " ".join(
            response.xpath(
                "//div[@class='entry-body']/child::node()").extract())

        try:
            item["title"] = response.xpath(
                "//h3[@class='entry-header']/text()").extract()[0]
        except IndexError:
            item["title"] = ""

        try:
            item["comment_count"] = response.xpath(
                "//p[@class='entry-footer']/a[3]/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"

        yield item
Example #6
    def extract_page(self, response):
        item_content = response.meta.get('item')
        item_fields_page = self.settings.get('ITEM_FIELDS_PAGE', [])

        for item_field in item_fields_page:

            field = item_field.get('field', '')
            selector = item_field.get('selector', '')
            method = item_field.get('method', '')

            if not selector or not field:
                continue

            extract_value = response.css(selector).getall()

            if method == 'joinlink':
                extract_value = self.extract_links(response, extract_value)
            elif method == 'join':
                # the original duplicated the 'joinlink' condition here,
                # making this branch dead; 'join' is the assumed intent
                extract_value = ' '.join(extract_value)
            else:
                extract_value = extract_value[0]

            item_content['content'][field] = extract_value

        yield item_content
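Note: extract_page() above drives its extraction entirely from spider settings. A minimal sketch of the ITEM_FIELDS_PAGE shape the loop expects (the keys field/selector/method come from the code; the concrete selectors and the 'join' method name are assumptions):

    # settings.py -- hypothetical configuration for extract_page()
    ITEM_FIELDS_PAGE = [
        {'field': 'title', 'selector': 'h1.product-title::text', 'method': ''},
        {'field': 'body', 'selector': 'div.description *::text', 'method': 'join'},
        {'field': 'links', 'selector': 'a::attr(href)', 'method': 'joinlink'},
    ]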
Example #7
    def parse_reviews(self, response):
        """ Parse review page """
        data = json.loads(response.text)

        if 'Results' not in data:
            return

        for review in data['Results']:
            yield ScraperItem(
                title=review['Title'],
                text=review['ReviewText'],
                rating=review['Rating'],
            )

        if data['TotalResults'] > response.meta['page'] * 5:
            product_id = response.meta['id']
            next_page = response.meta['page'] + 1
            url = self.review_url(product_id, next_page)

            yield scrapy.Request(url,
                                 callback=self.parse_reviews,
                                 meta={
                                     'page': next_page,
                                     'id': product_id
                                 })
Example #8
    def parse_item(self, response):
        item_loader = ItemLoader(item=ScraperItem(), response=response)

        item_loader.add_xpath('brand',
                              '(//*[@itemprop="title"])[last()]/text()')

        price = response.xpath(
            '//*[@id="our_price_display"]/text()').extract_first()
        item_loader.add_value(
            'price', price, MapCompose(lambda i: i[2:].replace(',', '.'),
                                       float))

        item_loader.add_xpath('price_currency',
                              '(//*[@itemprop="priceCurrency"])[1]/@content')

        model = response.xpath(
            '//*[@id="columns"]/div[1]/span[2]/text()').extract_first()
        item_loader.add_value('model', model)

        available_sizes = response.xpath(
            '(//*[contains(@class, "attribute_select")]/option)[.!="Select Size"]/text()'
        ).extract()
        item_loader.add_value('available_sizes', available_sizes)

        full_product_name = response.xpath(
            '//*[@class="h4"]/text()').extract_first()
        item_loader.add_value('color', full_product_name.replace(model, ''),
                              MapCompose(str.strip))

        image_src = response.xpath(
            '(//*[@itemprop="image"])[1]/@src').extract_first()
        item_loader.add_value('image', image_src)

        description = response.xpath(
            '//*[@id="prod-desc"]/div/p/text()').extract_first()
        item_loader.add_value('description', description)

        is_discounted = len(
            response.xpath(
                '//*[contains(@class, "price_reduced")]').extract()) != 0
        item_loader.add_value('is_discounted', is_discounted)

        inner_id = response.xpath(
            '//*[@name="id_product"]/@value').extract_first()
        item_loader.add_value('inner_id', inner_id)

        inner_category = response.xpath(
            '//*[@itemtype="http://data-vocabulary.org/Breadcrumb"][last()-1]/a[1]/@title'
        ).extract_first()
        item_loader.add_value(
            'db_category',
            self.category_resolver.resolve(inner_category, inner_id))

        # SJS has only clothing for men
        item_loader.add_value('gender', 'm')
        item_loader.add_value('resource', 'slamjamsocialism')
        item_loader.add_value('url', response.url)
        item_loader.add_value('date', str(datetime.datetime.now()))

        return item_loader.load_item()
Example #9
    def parse_details(self, response):
        item = ScraperItem()
        soup = BeautifulSoup(response.text, 'lxml')
        usefulH2 = soup.find_all('h2')[1]
        
        # Event name & URL
        item['title'] = usefulH2.string
        item['uri'] = response.url
        
        # Build the description from multiple paragraphs that may contain additional tags.
        leftPs = usefulH2.find_next_sibling('div', {'class': 'att-detail-left-col'}).find_all('p')
        item['description'] = " ".join(p.text for p in leftPs[:-2])
        item['description'] = item['description'].replace("\xa0", " ").replace("\n", " ")

        # Date and time
        item = visitingMontgomeryHelper.parseDateTimeString(leftPs[-2].text, item)
        
        # Address
        rightPs = usefulH2.find_next_sibling('div', {'class':'att-detail-right-col'}).find_all('p')
        item = visitingMontgomeryHelper.parseAddressString(rightPs[0].string, item)

        return item
Example #10
    def parse_item(self, response):
        # Check if this item is still in stock
        if response.xpath('//*[@id="sold-out-div"]').extract_first():
            self.logger.info("%s is sold out, skipping", response.url)
            return

        item_loader = ItemLoader(item=ScraperItem(), response=response)

        item_loader.add_xpath('brand', '(//*[@class="u-color--black"])[1]/text()', MapCompose(str.strip))

        price = response.xpath('//*[@id="tr-pdp-price--sale"]/span[1]/text()').extract_first()
        if price:
            item_loader.add_value('is_discounted', True)
        else:
            item_loader.add_value('is_discounted', False)
            price = response.xpath('//*[@id="tr-pdp-price"]/span[1]/text()').extract_first()

        # strip '$' and thousands separators but keep the decimal point, then cast to float
        item_loader.add_value('price', price,
                              MapCompose(lambda i: ''.join(ch for ch in i if ch.isdigit() or ch == '.'),
                                         float))

        # Supposedly currency of all 'forward' items will be in dollars as we asked so in cookies
        item_loader.add_value('price_currency', 'USD')

        model = response.xpath('//*[@class="product_name"]/text()').extract_first()
        item_loader.add_value('model', model)

        # Should we add last in stock info here as well?
        available_sizes = response.xpath('(//*[@id="size-select"]/option)[.!="Select Size"]/text()').extract()
        if not available_sizes:
            available_sizes = ['One Size']
        item_loader.add_value('available_sizes', filter(lambda size: not size.endswith("(Sold Out)"),
                                                        map(str.strip, available_sizes)))

        color = response.xpath('//*[@id="color-select"]/option[1]/text()').extract_first()
        if not color:
            color = response.xpath('//*[contains(@class, "color_dd")]/div[1]/text()').extract_first()
        item_loader.add_value('color', color, MapCompose(str.strip))

        image_src = response.xpath('//*[@class="product-detail-image"]/@src').extract_first()
        item_loader.add_value('image', image_src)

        gender = response.xpath('//*[@class="nav-toggle__item current"]/a[1]/text()').extract_first()
        if gender == "MENS":
            item_loader.add_value('gender', 'm')
        else:
            item_loader.add_value('gender', 'w')

        inner_id = response.xpath('//*[@class="product_detail"]/ul[1]/li[last()]/text()').extract_first()
        inner_id = inner_id.replace('Manufacturer Style No. ', '')
        item_loader.add_value('inner_id', inner_id)

        inner_categories = list(map(str.strip, response.xpath('//*[@id="ctaMainBtn"]/button[1]/@data-category').extract_first().split(':')))
        item_loader.add_value('db_category', self.category_resolver.resolve(inner_categories, inner_id, gender))

        item_loader.add_value('resource', "forward")
        item_loader.add_value('url', response.url)
        item_loader.add_value('date', str(datetime.datetime.now()))

        return item_loader.load_item()
Example #11
    def bruteForceParseEvent(event):
        #        print("!!!!!! ", str(event))
        eventDict = ast.literal_eval(str(event))
        item = ScraperItem()
        item['title'] = eventDict['name']
        item['uri'] = eventDict['url']
        item['description'] = eventDict['description']
        startsAtDate = datetime.strptime(eventDict['startDate'],
                                         "%Y-%m-%dT%H:%M:%S+00:00")
        item['starts_at'] = datetime.strftime(startsAtDate, "%Y-%m-%dT%H:%M")
        endsAtDate = datetime.strptime(eventDict['endDate'],
                                       "%Y-%m-%dT%H:%M:%S+00:00")
        item['ends_at'] = datetime.strftime(endsAtDate, "%Y-%m-%dT%H:%M")
        if "location" in eventDict:
            #            print("======== ", eventDict['location'])
            locationDict = ast.literal_eval(str(eventDict['location']))
            if "name" in locationDict:
                item['location_name'] = locationDict['name']
            if "address" in locationDict:
                addressDict = ast.literal_eval(str(locationDict['address']))
                if "streetAddress" in addressDict:
                    item['location_street1'] = addressDict['streetAddress']
                if "addressLocality" in addressDict:
                    item['location_city'] = addressDict['addressLocality']
                if "addressRegion" in addressDict:
                    item['location_state'] = addressDict['addressRegion']
                if "postalCode" in addressDict:
                    item['location_zip'] = addressDict['postalCode']

        return item
Example #12
    def parse_actor(self, response):

        logging.info("start scrap data from actor sites")
        info = response.xpath('//table[@class="infobox biography vcard"]')
        actorItem = ScraperItem()
        # relative ('.//') XPaths keep the queries scoped to the infobox selection
        actorItem['actorAge'] = info.xpath(
            './/tr/th[contains(text(),"Born")]/following::td[1]/text()'
        ).extract()
        actorItem['actorName'] = info.xpath('.//tr/th/span/text()')[0].extract()
        actorItem['movielist'] = response.xpath(
            '//h2/span[contains(text(),"Filmography")]/following::ul[1]/li/i/a/text()'
        ).extract()

        self.actor_count += 1
        logging.debug("actor scraped: ")
        print(self.actor_count)

        yield actorItem

        for sel in response.xpath(
                '//h2/span[contains(text(),"Filmography")]/following::ul[1]/li'
        ):
            movieUrl = sel.xpath('i/a/@href').extract_first()
            if (movieUrl is not None) and (self.movie_count <= 125):
                logging.info("movie url is valid")
                yield response.follow(movieUrl, callback=self.parse_movie)
Example #13
    def parse_page(self, response):

        # use scrapy shell to find xpath
        # from scrapy.shell import inspect_response
        # inspect_response(response)

        item = ScraperItem()

        item["url"] = response.url

        item["date"] = " ".join(response.xpath(
            "//small[@class='p-time']/child::node()/text()"
        ).extract())

        item["text"] = " ".join(response.xpath(
            "//div[@class='p-con']/child::node()"
        ).extract())

        try:
            item["title"] = response.xpath(
                "//div[@class='p-head']/h1/text()"
            ).extract()[0]
        except IndexError:
            item["title"] = ""
        try:
            item["comment_count"] = response.xpath("//div[@id='comments']/h2/text()").extract()[0]
        except IndexError:
            item["comment_count"] = 0

        yield item
Example #14
    def parse_page(self, response):

        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        #inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath(
                '//h3[@class="entry-header"]/text()').extract()[0]
        except IndexError:
            item['title'] = ""

        try:
            item['text'] = " ".join(
                response.xpath(
                    '//div[@class="entry-body"]/child::node()').extract())
        except IndexError:
            item['text'] = ''

        try:
            item['date'] = response.xpath(
                "//p[@class='entry-footer']/text()").extract()[0]
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                "//p[@class='entry-footer']/a/text()").extract()[2]
        except IndexError:
            item['comment_count'] = '0'

        yield item
Example #15
    def parse_reviews(self, response):
        """ Parse review page """
        data = json.loads(response.text)['avaliacao']

        if data['quantidadeAvaliacoes'] is None:
            return

        for review in data['avaliacoes']:
            yield ScraperItem(
                title=review['titulo'],
                text=review['descricao'],
                rating=review['nota'],
            )

        if data['quantidadeAvaliacoes'] > response.meta['page'] * 5:
            product_id = response.meta['id']
            next_page = response.meta['page'] + 1
            url = self.review_url(product_id, next_page)

            yield scrapy.Request(url,
                                 callback=self.parse_reviews,
                                 meta={
                                     'page': next_page,
                                     'id': product_id
                                 })
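Note: Examples #1, #7, and #15 all paginate through reviews via a review_url() helper that is not shown. A minimal sketch of what such a helper might look like, assuming a hypothetical JSON endpoint and query parameters:

    def review_url(self, product_id, page):
        # hypothetical endpoint; the real URL template is not part of these examples
        return ('https://example.com/api/v1/reviews'
                '?product_id={}&page={}'.format(product_id, page))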
Example #16
    def parse_page(self, response):

        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        #inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath(
                "//div[@class='b-singlepost-wrapper']/h1/text()").extract()[0]
        except IndexError:
            item['title'] = ""

        item['text'] = " ".join(
            response.xpath("//article[2]/child::node()").extract())

        try:
            date = response.xpath("//time/a/text()").extract()
            date.append(response.xpath("//time/text()[3]").extract()[0])
            item['date'] = " ".join(date)
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                "//span[@class='js-amount'][1]/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"

        yield item
Example #17
    def parse_movie(self, response):

        logging.info("start scrap data from movie sites")
        table = response.xpath('//table[@class="infobox vevent"]')
        movieItem = ScraperItem()
        # relative ('.//') XPaths keep the queries scoped to the infobox selection
        movieItem['movieName'] = table.xpath('.//tr/th/text()')[0].extract()
        movieItem['movieYear'] = table.xpath(
            './/tr/th/div[contains(text(),"Release date")]/following::td/div/ul/li[1]/text()'
        ).extract_first()
        movieItem['movieGrossing'] = table.xpath(
            './/tr/th[contains(text(),"Box office")]/following::td/text()'
        ).extract_first()
        movieItem['actorlist'] = response.xpath(
            '//h2/span[contains(text(),"Cast")]/following::ul[1]/li/a/text()'
        ).extract()

        self.movie_count += 1
        logging.debug("movie scraped: ")
        print(self.movie_count)

        yield movieItem

        for sel in response.xpath(
                '//h2/span[contains(text(),"Cast")]/following::ul[1]/li'):
            actorUrl = sel.xpath('a/@href').extract_first()
            if (actorUrl is not None) and (self.actor_count <= 250):
                logging.info("actor url is valid")
                yield response.follow(actorUrl, callback=self.parse_actor)
Example #18
    def parse_page(self, response):

        # use scrapy shell to find xpath
        # from scrapy.shell import inspect_response
        # inspect_response(response)

        item = ScraperItem()
        item['url'] = response.url

        try:
            item['title'] = response.xpath(
                '//h3/text()'
            ).extract()[0]
        except IndexError:
            item['title'] = ""

        try:
            item['text'] = " ".join(
                response.xpath("//div[@class='body'][1]/child::node()").extract())
        except IndexError:
            item['text'] = ''

        try:
            item['date'] = response.xpath(
                '//div[@class="index"][1]/a[1]/text()').extract()[0]
        except IndexError:
            item['date'] = ''

        try:
            item['comment_count'] = response.xpath(
                '//div[@class="index"][1]/a[2]/text()').extract()[0]
        except IndexError:
            item['comment_count'] = '0'

        yield item
Example #19
    def parse(self, response):
        item = ScraperItem()
        item["scrape_target"] = ScrapeTarget.objects.get(
            scrape_url=response.url)
        item["scrape_content"] = response.css(
            ".c-rte--default p.rte__paragraph").get()
        item["scrape_time"] = str(datetime.utcnow())
        yield item
Example #20
    def parse_items(self, response):
        p_tags = response.xpath('//p/text()').extract()
        content = CleanTag(p_tags)
        currentPage = response.request.url
        item = ScraperItem()
        item['page'] = currentPage
        item['content'] = content
        yield item
Example #21
    def parse(self, response):
        """ Parse items from search page """
        data = json.loads(response.text)['complainResult']['complains']['data']

        for review in data:
            yield ScraperItem(
                title=review['evaluation'],
                text=review['description'],
                rating=review['score'],
            )
Example #22
    def parse_movie_page(self, response):
        self.logger.info('Parse function called on %s', response.url)

        item = ScraperItem()
        item['title'] = response.css(
            '.title_wrapper > h1 ::text').extract_first()
        item['genres'] = ", ".join(
            response.css(
                '.see-more.inline.canwrap > a[href*=title_type] ::text').
            extract())
        item['rating'] = response.css(
            '.ratingValue > strong > span ::text').extract_first()
        item['stars'] = ", ".join(
            response.css('td:nth-child(2) > a ::text').extract())
        item['type'] = '-'  # I could not find where this field appears on the page
        item['details'] = {
            'Official Sites: ':
            ", ".join(
                response.css(
                    '#titleDetails >div> a[href*=offsite] ::text').extract()),
            'Country: ':
            ", ".join(
                response.css(
                    '#titleDetails > div > a[href*=country_of_origin] ::text').
                extract()),
            'Language: ':
            ", ".join(
                response.css(
                    '#titleDetails >div > a[href*=primary_language] ::text').
                extract()),
            #'Release Date: ': response.css('#titleDetails > div:nth-child(5)').extract(),
            #'Also Known As: ': response.css('div.txt-block:nth-child(6)').extract(),
            'Filming Locations: ':
            response.css('#titleDetails >div > a[href*=locations] ::text').
            extract_first()
        }
        item['box_office'] = {
            #'Opening Weekend USA: ': response.css('div.txt-block:nth-child(11)').extract(),
            #'Gross USA: ': response.css('#titleDetails > div:nth-child(12)').extract(),
            #'Cumulative Worldwide Gross: ': response.css('#titleDetails > div:nth-child(13)').extract(),
        }
        item['technical_spec'] = {
            'Runtime':
            response.css('.txt-block > time ::text').extract_first(),
            'Sound Mix: ':
            ", ".join(
                response.css(
                    '.txt-block > a[href*=sound_mixes] ::text').extract()),
            'Color: ':
            response.css(
                '.txt-block > a[href*=colors] ::text').extract_first(),
            #'Aspect Ratio: ': response.css('h3 + .txt-block +  .txt-block +  .txt-block +  .txt-block > h4 ').extract(),
        }

        yield item
Example #23
    def parse_actor(self, response):

        logging.info("start scrap data from actor sites")
        info = response.xpath('//table[@class="infobox biography vcard"]')
        actorItem = ScraperItem()
        # relative ('.//') XPaths keep the queries scoped to the infobox selection
        actorItem['actorName'] = info.xpath('.//tr/th/span/text()')[0].extract()
        actorItem['actorAge'] = info.xpath(
            './/tr/th[contains(text(),"Born")]/following::td[1]/text()'
        ).extract()

        yield actorItem
Example #24
    def save_page(self, response):
        """Write the page and assets to local storage."""
        file = response.url.split("/")[-1]
        filename = os.path.join('/home/vagrant/sync/scraper/files', file)
        with open(filename, 'wb') as f:
            f.write(response.body)

        item = ScraperItem()
        file_urls = response.xpath('//img/@src').extract()
        item['file_urls'] = [self.process_url(url) for url in file_urls]
        yield item
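Note: file_urls, populated above, is the field Scrapy's built-in FilesPipeline consumes by default. A minimal settings sketch to enable it (the store path is hypothetical):

    # settings.py
    ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
    FILES_STORE = '/home/vagrant/sync/scraper/downloads'  # hypothetical download dir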
Example #25
    def parse_job(self, response):
        job = ScraperItem()
        sel = Selector(response)
        job['url'] = response.url
        job_offer = sel.xpath('//title/text()').extract()
        job_offer = job_offer[0].strip()
        job_offer = job_offer.split('-')
        job['name'] = job_offer[0].strip()
        job["email"] = None
        job["phone"] = None
        return job
Example #26
    def product_page_cb(self, response: Response) -> ScraperItem:
        sel = Selector(response, type="html")

        # metadata can help extracting more reliable data (json-ld, microdata, ...)
        meta = utils.extract_metadata(url=response.url)
        self.logger.info(meta)

        item = ScraperItem()
        item["url"] = response.url
        item["title"] = sel.xpath("//h1/text()").get()
        return item
Example #27
    def parse_movie(self, response):
        item = ScraperItem()

        item['title'] = response.xpath('//h1/text()').extract_first()
        item['year'] = response.xpath(
            '//*[@id="titleYear"]/a/text()').extract_first()
        # rating= response.xpath('//*[@itemprop="ratingValue"]/text()').extract_first()
        # director=response.xpath('//*[@id="title-overview-widget"]/div[2]/div[1]/div[2]/a/text()').extract_first()
        # writer=response.xpath('//*[@id="title-overview-widget"]/div[2]/div[1]/div[3]/a[1]/text()').extract_first()
        # release_date=response.xpath('//*[@id="titleDetails"]/div[4]/text()').extract()
        # release_date=release_date[1].strip()
        yield item
Example #28
    def tdoll_parser(self, response):
        doll = ScraperItem()
        res = response.xpath('//h1[@id="firstHeading"]').get()
        soup = BeautifulSoup(res, "lxml")
        # get_text() takes a separator as its first argument, so the original
        # get_text('h1') / get_text('td') calls injected tag names into the text
        doll['name'] = soup.get_text()
        skillnamexpath = response.xpath(
            '//div[@class="skilldataraw"]//tr[1]//td[1]').get()
        soup = BeautifulSoup(skillnamexpath, 'lxml')
        doll['skill_name'] = soup.get_text()
        print("parser for tdoll " + doll['name'] +
              ' skill name = ' + doll['skill_name'])
        yield doll
Example #29
    def parse(self, response):
        for joke in response.xpath("//div[@class='jokes']"):
            loader = ItemLoader(item=ScraperItem(), selector=joke)
            loader.add_xpath('joke_text', ".//div[@class='joke-text']/p")
            # yield {
            #     'joke_text': joke.xpath(".//div[@class='joke-text']/p").extract_first()
            # }
            yield loader.load_item()

        next_page = response.xpath(
            "//li[@class='next']/a/@href").extract_first()
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)
Example #30
    def parse_movie(self, response):

        logging.info("start scrap data from movie sites")
        table = response.xpath('//table[@class="infobox vevent"]')
        movieItem = ScraperItem()
        # relative ('.//') XPaths keep the queries scoped to the infobox selection
        movieItem['movieName'] = table.xpath('.//tr/th/text()')[0].extract()
        movieItem['movieYear'] = table.xpath(
            './/tr/th/div[contains(text(),"Release date")]/following::td/div/ul/li[1]/text()'
        ).extract_first()
        movieItem['movieGrossing'] = table.xpath(
            './/tr/th[contains(text(),"Box office")]/following::td/text()'
        ).extract_first()

        yield movieItem