def parse(self, response: TextResponse):
        items = response.css('tbody.search-results tr')
        for item in items:
            yield {
                'models':
                item.xpath('normalize-space(.//td[5]/text())').get().strip(),
                'certification':
                item.xpath('.//td[6]/a/text()').get().strip(),
                'link':
                response.urljoin(item.xpath('.//td[6]/a/@href').get())
            }

        next_page = response.xpath('//a[text()="Next"]/@href').get()
        if next_page:
            yield Request(url=response.urljoin(next_page), callback=self.parse)
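
# For context, a parse() callback like the one above normally lives inside a scrapy.Spider
# subclass. A minimal, self-contained sketch of that scaffolding -- the spider name and the
# start URL are assumptions for illustration, not taken from the original project:
import scrapy
from scrapy import Request
from scrapy.http import TextResponse


class CertificationSpider(scrapy.Spider):
    name = "certifications"
    start_urls = ["https://example.com/search-results"]  # hypothetical listing page

    def parse(self, response: TextResponse):
        # Same row-by-row extraction and "Next"-link pagination as the example above.
        for row in response.css('tbody.search-results tr'):
            yield {
                'models': row.xpath('normalize-space(.//td[5]/text())').get(),
                'certification': row.xpath('.//td[6]/a/text()').get(),
                'link': response.urljoin(row.xpath('.//td[6]/a/@href').get()),
            }
        next_page = response.xpath('//a[text()="Next"]/@href').get()
        if next_page:
            yield Request(url=response.urljoin(next_page), callback=self.parse)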
Example #2
def extract(url):
    res = requests.get(url, headers=header)
    response = TextResponse(url, body=res.text.encode())
    for url in response.xpath(
            "//div[@class='loi ']//div[@id='issueName']/a[@class='issueLinkCon']/@href"
    ):
        yield response.urljoin(url.get())
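
# The 'header' dict used by extract() above is not part of this excerpt; it is presumably a
# requests headers mapping. A minimal assumption, plus hypothetical usage of the generator
# (the journal URL below is a placeholder, not taken from the original project):
header = {"User-Agent": "Mozilla/5.0 (compatible; example-crawler)"}  # assumed value

for issue_url in extract("https://journals.example.com/loi/example"):
    print(issue_url)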
Example #3
    def parse(self, response: TextResponse) -> [Request, YelpService]:
        """
        This is the default callback used by Scrapy to process downloaded responses, when their
        requests don’t specify a callback.

        The parse method is in charge of processing the response and returning scraped data
        and/or more URLs to follow.

        Args:
            :param response: the response to parse
        """
        # Check whether we are on the search results page
        if response.url.startswith("https://www.yelp.com/search?"):
            info_page_urls = response.css(".biz-name::attr(href)")

            # Check whether we got any results
            if info_page_urls is not None:
                for url in info_page_urls[:self.max_results]:
                    # Join the URL found with the domain URL and yield a new Request for it,
                    # which will also be parsed by this method.
                    info_page = response.urljoin(url.extract())
                    yield Request(info_page)

        # We are on the info page, so we can extract the information directly
        else:
            yield self._map_response(response)
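
# For context: parse() above is Scrapy's default callback, so the spider only needs its start
# URL(s) to point at a Yelp search page -- result pages and business info pages both come back
# through this same method. A sketch of that attribute (the query parameters are assumptions):
start_urls = [
    "https://www.yelp.com/search?find_desc=restaurants&find_loc=San+Francisco%2C+CA"
]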
Example #4
    def parse(self, response):
        self.driver.get(response.url)

        try:
            WebDriverWait(self.driver, 30).until(EC.presence_of_element_located((By.XPATH,'//*[@id="views"]/div/div[2]/div[2]/div[3]/div[10]/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/span')))
        except TimeoutException:
            print "Time out"
            return

        # Sync Scrapy and Selenium so they agree on the page we're looking at, then let Scrapy take over
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        for href in resp.xpath('//*[@id="views"]/div/div[2]/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/@href'):
            url = resp.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_property)


        if self.page == 5 :
            return
            
        self.page += 1
        yield scrapy.Request(url="https://www.proptiger.com/noida/property-sale?page=%d" % self.page,
                      headers={"Referer": "https://www.proptiger.com/noida/property-sale", "X-Requested-With": "XMLHttpRequest"},
                      callback=self.parse, 
                      dont_filter=True)
Example #5
    def parse(self, response):
        self.driver.get("http://www.yellowpages.com.au")
        #Handle captcha
        if len(self.driver.find_elements_by_xpath("//div[@class='form']//form[@name='captcha']"))>0:
            WebDriverWait(self.driver, 10000).until(EC.presence_of_element_located((By.XPATH,"//body[contains(@class, 'home-body')]")))

        # generate the search URLs
        if self.au_codes:
            urls = ["http://www.yellowpages.com.au/search/listings?clue={}&locationClue={}&selectedViewMode=list".format(clue.replace(' ','+'),str(code)) for clue, code in zip(self.au_clues, self.au_codes)]
        else:
            urls = ["http://www.yellowpages.com.au/search/listings?clue={}&state={}&selectedViewMode=list".format(clue.replace(' ','+'),state) for clue, state in zip(self.au_clues, self.au_states)]

        for url in urls:
            while True:
                self.driver.get(url)
                # Selenium page_source to scrapy response
                response = TextResponse(url=url, body=self.driver.page_source, encoding='utf-8')

                for elem in response.xpath("//div[contains(@class,'search-result')]//div[contains(@class,'in-area-cell')]"):
                    item = YellowpagesauItem()
                    item['page_url'] = response.url
                    item['link'] = response.urljoin(elem.xpath(".//a[@class='listing-name']/@href").extract_first()) 
                    item['name'] = elem.xpath(".//a[@class='listing-name']/text()").extract_first()
                    item['phone'] = elem.xpath(".//span[@class='contact-text']/text()").extract_first()
                    item['email'] = elem.xpath(".//a[contains(@class,'contact-email')]/@data-email").extract_first()
                    item['website'] = elem.xpath(".//a[contains(@class,'contact-url')]/@href").extract_first()
                    item['address'] = elem.xpath(".//p[contains(@class,'listing-address')]/text()").extract_first()
                    yield item

                # Selenium: follow the "Next" pagination link
                if len(self.driver.find_elements_by_link_text('Next »'))>0:
                    url = self.driver.find_element_by_link_text('Next »').get_attribute("href")
                else:
                    # If there is no "Next" link, stop paginating
                    break
Example #6
    def parse(self, response):
        self.driver.get(
            'http://www.hannovermesse.de/en/exhibition/exhibitors-products/advanced-search/'
        )

        wait = WebDriverWait(self.driver, 5)

        wait.until(
            EC.element_to_be_clickable((
                By.XPATH,
                '//*[@id="searchAP:search"]/section/div[6]/div/div/div[2]/div[2]'
            )))
        showallbutton = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((
                By.XPATH,
                '//*[@id="searchAP:search"]/section/div[6]/div/div/div[2]/div[2]/a'
            )))
        showallbutton.click()
        self.driver.execute_script(
            "document.getElementById('searchAP:zb:442:r').click()")
        self.driver.find_element_by_xpath(
            '//*[@id="searchAP:searchButton2"]').click()

        #searchAP:zb:1:r Agriculture, forestry and fishing
        #searchAP:zb:18:r Mining and extracting rocks and earth
        #searchAP:zb:35:r Manufacturing industry
        #searchAP:zb:220:r Energy supply
        #searchAP:zb:229:r  Water supply, sewage and refuse disposal, sanitation and similar activities
        #searchAP:zb:239:r  Construction/construction industry
        #searchAP:zb:251:r  Sale, maintenance and repair of motor vehicles
        #searchAP:zb:291:r  Transportation and storage
        #searchAP:zb:312:r  Hotels and restaurants/lodging and catering
        #searchAP:zb:320:r Information and communication
        #searchAP:zb:351:r  Provision of financial and insurance services
        #searchAP:zb:357:r  Real estate activities
        #searchAP:zb:361:r Provision of freelanced, scientific and technical services
        #searchAP:zb:379:r Provision of other business activities
        #searchAP:zb:394:r Public administration, defence and social security
        #searchAP:zb:404:r  Education
        #searchAP:zb:412:r Human health and social work activities
        #searchAP:zb:425:r Arts, entertainment and recreation
        #searchAP:zb:430:r Recreational, cultural and sporting activities; other
        #searchAP:zb:439:r Households
        #searchAP:zb:440:r  Extra-territorial organisations and bodies
        #searchAP:zb:441:r All sectors, sector independent
        #searchAP:zb:442:r pupils, students

        # Now that the webpage is fully revealed, Scrapy can collect all the company URLs.
        # I.e. we need to follow the link for every company to get onto its page and extract our data.
        response1 = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
        for href in response1.css('.search-link ::attr(href)'):
            url = response1.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)
Example #7
	def parse(self, response):
		for url in self.list_urls:
			self.driver.get(url)
			self.wait_between(1.5, 3.0)            
			iLoop = True
				
			while iLoop:                           
				CheckBox = WebDriverWait(self.driver, 3000).until(
					EC.presence_of_element_located((By.CSS_SELECTOR ,".emphasise"))
					)
				response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')            
							  
				current_url = self.driver.current_url
				aParts = (current_url.split('?')[1]).split('&')
				for x in aParts:
					params = x.split('=')
					if params[0] == 'clue':
						keyword = params[1].replace('%20', ' ').replace('+', ' ')
					if params[0] == 'locationClue':
						suburb = params[1]
						
				for div in response.xpath('.//div[@class="flow-layout outside-gap-large inside-gap inside-gap-large vertical"]//div[@class="cell in-area-cell middle-cell"]'):					
					if div.xpath('.//a[@class="listing-name"]//text()').extract_first():
						if div.xpath('.//a[@title="Phone"]'):
							phone = div.xpath('.//a[@title="Phone"]//@href').extract_first()
							if phone:
								sphone = phone.replace('tel:', '')
								if self.checkDuplicates(sphone) == False:
									item = {}
									item['Phone'] = sphone								
									item['Suburb'] = suburb
									item['Keyword'] = keyword
									item['CompanyName'] = div.xpath('.//a[@class="listing-name"]//text()').extract_first().strip().encode('utf-8')
									if div.xpath('.//p[@class="listing-short-description"]'):
										item['Description'] = div.xpath('.//p[@class="listing-short-description"]//text()').extract_first().encode('utf-8')
									if div.xpath('.//p[@class="listing-address mappable-address"]'):
										item['Address'] = div.xpath('.//p[@class="listing-address mappable-address"]//text()').extract_first().encode('utf-8')
									if div.xpath('.//p[@class="listing-address mappable-address mappable-address-with-poi"]'):
										item['Address'] = div.xpath('.//p[@class="listing-address mappable-address mappable-address-with-poi"]//text()').extract_first().encode('utf-8')
									
									if div.xpath('.//a[@class="contact contact-main contact-email "]'):
										item['Email'] = div.xpath('.//a[@class="contact contact-main contact-email "]//@data-email').extract_first()
									if div.xpath('.//a[@class="contact contact-main contact-url "]'):
										item['Website'] = div.xpath('.//a[@class="contact contact-main contact-url "]//@href').extract_first()
									
									yield item
				
				# parse next page
				if response.xpath('.//a[contains(@class, "pagination navigation") and contains(text(), "Next")]'):
					next_url = response.xpath('.//a[contains(@class, "pagination navigation") and contains(text(), "Next")]//@href').extract_first()
					self.driver.get(response.urljoin(next_url))					
					self.wait_between(1.5, 3.0)
				else:
					iLoop = False						
Example #8
def extract(url):
    # urls = list(_get_years_urls(response))
    # crawler = Crawler(SciencePaperUrlSpider, get_project_settings())
    # crawler.signals.connect(get_item, signal=signals.item_scraped)
    # crawler.crawl(urls=urls)
    res = requests.get(url, headers=header)
    resp = TextResponse(url, body=res.text.encode())
    for url in _get_years_urls(resp):
        res = requests.get(url, headers=header)
        response = TextResponse(url, body=res.text.encode())
        for url in response.xpath(
                "//ul[@class='issue-month-detail']/li/div/div/a/@href"):
            yield response.urljoin(url.get())
Example #9
    def parse(self, response):
        self.driver.get('http://www.metmuseum.org/art/collection')

        # while True:
        #     try:
        #         show_more = self.driver.find_element_by_class_name("show-more")
        #         time.sleep(2)
        #         show_more.click()
        #     except:
        #         break

        # clicking the show more button
        for i in range(5):
            show_more = self.driver.find_element_by_class_name("show-more")
            time.sleep(3)
            show_more.click()

        response = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        test = response.xpath('//h2[@class="card__title"]/a/@href')
        for href in response.xpath('//h2[@class="card__title"]/a/@href'):
            url = response.urljoin(href.extract())
            print(url)
            # scraping the urls from the first page & creating a list of links
            # card_link_list = self.driver.find_elements_by_xpath('//h2[@class="card__title"]/a')
            # card_link_list = map(lambda x: x.get_attribute('href'), card_link_list)
            self.driver.get(url)
            time.sleep(2)
            response1 = TextResponse(url=self.driver.current_url,
                                     body=self.driver.page_source,
                                     encoding='utf-8')
            item = MetItem()
            for sel in response1.xpath('//div[@class="l-component-block"]'):
                title = self.driver.find_element_by_xpath(
                    '//h1[@class="collection-details__object-title"]').text
                print(title)
                location = self.driver.find_element_by_xpath(
                    '//div[@class="collection-details__location"]').text
                print(location)
                item['title'] = title
                item['location'] = location
            artifact_detail = {}
            for detail in response1.xpath(
                    '//dl[@class="collection-details__tombstone--row"]'
            ).extract():
                key = Selector(text=detail).xpath('//dt/text()').extract()[0]
                value = Selector(text=detail).xpath('//dd/text()').extract()[0]
                artifact_detail[key] = value
            item['artifact_detail'] = artifact_detail
            yield item
Example #10
    def parse(self, response):

        res = json.loads(response.body_as_unicode())
        html = res['data']['html']

        response = TextResponse(url=response.urljoin(''),
                                body=html,
                                encoding='utf-8')
        if len(response.body_as_unicode()) == 0:
            return

        cinemas = response.css('.portal-cinema-list-item')
        c = CinemaLM()
        c['city'] = self.city_name
        for cinema in cinemas:
            self.count = self.count + 1
            id_href = cinema.css('a')
            c['id'] = re.search(
                r'cinemaId=(\d+)',
                id_href.css('::attr(href)').extract()[0]).group(1)
            if c['id'] == '8131':
                print("hi")
            price_node = cinema.css('.portal-cinema-price-num')
            if len(price_node) > 0:
                c['min_price'] = price_node.css('::text').extract()[0]

            if not settings.UPDATE_MIN_PRICE:
                c['name'] = cinema.css('.portal-cinema-name').css(
                    '::text').extract()[0].upper().replace('（', '(').replace(
                        '）', ')')
                c['addr'] = cinema.css('.portal-cinema-address-section').css(
                    '::text').extract()[0].replace('（', '(').replace('）', ')')
                c['phone'] = self.get_phone(c['id'])
                c['district'] = get_district(c['addr'], c['city'])
                lat_lng = parse_addr(c['city'], c['addr'])
                if lat_lng is not None:
                    c['lat_lng'] = lat_lng['lat_lng']
                    c['location'] = lat_lng['location']
                    c['precise'] = lat_lng['precise']
                    c['confidence'] = lat_lng['confidence']
                    if len(c['district']) == 0 or len(c['district']) > 6:
                        c['district'] = get_district_from_lat_lng(c['lat_lng'])
            yield c

        print("total:", self.count)
Example #11
    def parse(self, response):
        self.driver.get('http://www.metmuseum.org/art/collection')

        # while True:
        #     try:
        #         show_more = self.driver.find_element_by_class_name("show-more")
        #         time.sleep(2)
        #         show_more.click()
        #     except:
        #         break

        # clicking the show more button
        for i in range(5):
            show_more = self.driver.find_element_by_class_name("show-more")
            time.sleep(3)
            show_more.click()

        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        test = response.xpath('//h2[@class="card__title"]/a/@href')
        for href in response.xpath('//h2[@class="card__title"]/a/@href'):
            url = response.urljoin(href.extract())
            print(url)
        # scraping the urls from the first page & creating a list of links
        # card_link_list = self.driver.find_elements_by_xpath('//h2[@class="card__title"]/a')
        # card_link_list = map(lambda x: x.get_attribute('href'), card_link_list)
            self.driver.get(url)
            time.sleep(2)
            response1 = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            item = MetItem()
            for sel in response1.xpath('//div[@class="l-component-block"]'):
                title = self.driver.find_element_by_xpath('//h1[@class="collection-details__object-title"]').text
                print(title)
                location = self.driver.find_element_by_xpath('//div[@class="collection-details__location"]').text
                print(location)
                item['title'] = title
                item['location'] = location
            artifact_detail = {}
            for detail in response1.xpath('//dl[@class="collection-details__tombstone--row"]').extract():
                key = Selector(text=detail).xpath('//dt/text()').extract()[0]
                value = Selector(text=detail).xpath('//dd/text()').extract()[0]
                artifact_detail[key] = value
            item['artifact_detail'] = artifact_detail
            yield item
Example #12
    def parse(self, response):
        self.driver.get(response.url)
        try:
            WebDriverWait(self.driver, 30).until(EC.presence_of_element_located((By.XPATH,'//*[@id="views"]/div/div[2]/div[2]/div[3]/div[10]/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/span')))
        except TimeoutException:
            yield scrapy.Request(url="https://www.proptiger.com/%s/property-sale?page=%d" % (self.city,self.page),
                      callback=self.parse)
        # Sync Scrapy and Selenium so they agree on the page we're looking at, then let Scrapy take over
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        for href in resp.xpath('//*[@id="views"]/div/div[2]/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/@href'):
            url = resp.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_property)

        if self.page == self.end_page :
            return
        self.page += 1
        yield scrapy.Request(url="https://www.proptiger.com/%s/property-sale?page=%d" % (self.city,self.page),
                      callback=self.parse)
Example #13
 def parse_dir_contents2(self, response):
     self.driver.get(response.url)
     
     # Press the "Show More" button repeatedly until there are none left, to reveal the whole page.
     # But first we need to scroll to the bottom of the page so that "Show More" can work.
     while True:
         self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(1) # waiting 1 second for the page to load fully
         next = self.driver.find_element_by_xpath('//*[@id="restsPages"]/a')
         try:
             next.click()
             time.sleep(3) # waiting 3 seconds for the page to load fully
         except:
             break
         
     # Now that the webpage is fully revealed, Scrapy can collect all the restaurant URLs.
     # I.e. we need to follow the link for every restaurant to get onto its page and extract our data.
     response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
     for href in response1.xpath('//*[@class="restsRestInfo"]/a/@href'):
         url = response1.urljoin(href.extract())
         yield scrapy.Request(url, callback=self.parse_dir_contents3)
Example #14
    def parse_ongoing(self, response):
        res = json.loads(response.body_as_unicode())
        html = res['data']['html']

        response = TextResponse(url=response.urljoin(''),
                                body=html,
                                encoding='utf-8')
        if len(response.body_as_unicode()) == 0:
            return

        movies = response.css('.movie-list-item')

        for movie in movies:
            m = MovieLM()
            m['id'] = movie.css('.poster-show').css(
                '::attr(data-movieid)').extract()[0]
            m['name'] = movie.css('.movie-name-text').css(
                '::text').extract()[0]
            fen = movie.css('.fen')
            if len(fen) > 0:
                m['score'] = fen.css('::text').extract()[0]
            m['ongoing'] = 1
            yield m
Example #15
 def parse(self, response: TextResponse):
     items = response.css('div.result')
     item: TextResponse
     for item in items:
         yield {
             'certification':
             item.css('div.result_title::text').get(),
             'category':
             item.xpath(
                 './/th/span[contains(text(), "Product")]/following::td/div/text()'
             ).get(),
             'models':
             item.xpath(
                 './/th/span[contains(text(), "Model")]/following::td/div/text()'
             ).get().strip(),
             'date':
             datetime.strptime(
                 item.xpath(
                     './/th/span[contains(text(), "Date")]/following::td/div/text()'
                 ).get().strip(), '%d.%m.%Y').strftime("%Y-%m-%d")
         }
     next_page = response.xpath('//nav/a[1]/@href').get()
     if next_page:
         yield Request(url=response.urljoin(next_page), callback=self.parse)
Example #16
 def parse_building(self, response: http.TextResponse) -> http.FormRequest:
     for link in response.xpath("//table[@summary='Nalezené jednotky']//a"):
         yield scrapy.Request(response.urljoin(link.attrib["href"]),
                              callback=self.parse_flat)
Example #17
    def parse_dir_contents(self, response):
        COMPANY_NAME = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[1]/div/div/div[1]/h3/text()'
        COMPANY_SLOGAN = '//*[@id="exhibitorDetail:exhibitor"]/section[1]/div/div[2]/header/h2/text()'
        BUSINESS_TYPE = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[3]/div/p[1]/text()'
        CONTACT_NAME = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[2]/div/div/div/div[1]/span[2]/text()'
        YEAR_FOUNDED = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[3]/div/p[2]/text()'
        NO_OF_EMPLOYEES = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[3]/div/p[3]/text()'
        COMPANY_DESCRIPTION = '//*[@id="exhibitorDetail:exhibitor"]/div[3]/div/div/article/div/p/text()'
        STREET_ADDRESS = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[1]/div/div/address/div[1]/text()'
        CONTACT_POSITION = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[2]/div/div/div/div[1]/div/span/text()'
        COMPANY_WEBSITE = '.icon-external-link ::attr(href)'
        ZIP_CODE = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[1]/div/div/address/div[2]/text()'
        COUNTRY = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[1]/div/div/address/div[3]/text()'
        COMPANY_PHONE = '//*[@id="exhibitorDetail:exhibitor"]/div[2]/div/section/div/div[1]/div/div/div[2]/div[1]/text()'
        PRODUCT_DESCRIPTION = '.productlist-item p ::text'
        PRODUCT_NAME = '.productlist-item a ::text'
        PRODUCT_CATEGORIES = '[id^="exhibitorDetail:exhibitor:j_idt409"] ::text'

        PRODUCT_NAMES = response.css(PRODUCT_NAME).extract() if response.css(
            PRODUCT_NAME).extract() is not None else ''
        PRODUCT_DESCRIPTIONS = response.css(PRODUCT_DESCRIPTION).extract(
        ) if response.css(PRODUCT_DESCRIPTION).extract() is not None else ''
        PRODUCT_CATEGORIES = response.css(PRODUCT_CATEGORIES).extract(
        ) if response.css(PRODUCT_CATEGORIES).extract() is not None else ''

        productName_list = [
            product_name.replace('read more', '').replace('\n', '').replace(
                '\t', '').replace('...', '') for product_name in PRODUCT_NAMES
        ]
        productDescription_list = [
            product_name.replace('read more', '').replace('\n', '').replace(
                '\t', '').replace('...', '')
            for product_name in PRODUCT_DESCRIPTIONS
        ]
        productCategories_list = [
            product_name.replace('read more', '').replace('\n', '').replace(
                '\t', '').replace('...', '')
            for product_name in PRODUCT_CATEGORIES
        ]

        yield {
            'passiveProfileSource':
            'HannoverMesse',
            'profileCreatedBy':
            'WebScrapper',
            'createdAt_Date':
            time.strftime("%d %B %Y"),
            'createdAt':
            int(time.time()),
            'companyName':
            response.xpath(COMPANY_NAME).extract_first().replace(
                '\"', '').replace('\r', '').split('\n', 1)[0] if
            response.xpath(COMPANY_NAME).extract_first() is not None else '',
            'businessType':
            response.xpath(BUSINESS_TYPE).extract()[1].replace(
                '\n', '').replace('\t', '')
            if response.xpath(BUSINESS_TYPE).extract()
            and response.xpath(BUSINESS_TYPE).extract()[1] is not None
            and self.RepresentsInt(
                response.xpath(BUSINESS_TYPE).extract()[1].replace(
                    '\n', '').replace('\t', '')) is not True and "Status"
            not in response.xpath(BUSINESS_TYPE).extract()[1] else '',
            'contactName':
            response.xpath(CONTACT_NAME).extract_first().replace(
                '\"', '').replace('\r', '').split('\n', 1)[0] if
            response.xpath(CONTACT_NAME).extract_first() is not None and len(
                response.xpath(CONTACT_NAME).extract_first().replace(
                    '\"', '').replace('\r', '').split('\n', 1)[0]) > 4
            and response.xpath(CONTACT_NAME).extract_first().replace(
                '\"', '').replace('\r', '').split('\n', 1)[0] != 'Dipl.-Ing.'
            else '',
            'companySlogan':
            response.xpath(COMPANY_SLOGAN).extract_first().replace(
                '\"', '').replace('\r', '').split('\n', 1)[0] if
            response.xpath(COMPANY_SLOGAN).extract_first() is not None else '',
            'yearFounded':
            response.xpath(YEAR_FOUNDED).extract()[1].replace(
                '\n', '').replace('\t', '')
            if response.xpath(YEAR_FOUNDED).extract()
            and response.xpath(YEAR_FOUNDED).extract()[1] is not None and
            "Status" not in response.xpath(YEAR_FOUNDED).extract()[1] else '',
            'ftes':
            response.xpath(NO_OF_EMPLOYEES).extract()[1].replace(
                '\n', '').replace('\t', '').split('(Status', 1)[0]
            if response.xpath(NO_OF_EMPLOYEES).extract()
            and response.xpath(NO_OF_EMPLOYEES).extract()[1] is not None and
            ("between" in response.xpath(NO_OF_EMPLOYEES).extract()[1].replace(
                '\n', '').replace('\t', '').split('(Status', 1)[0] or "over"
             in response.xpath(NO_OF_EMPLOYEES).extract()[1]) is not True else
            '',
            'revenue':
            response.xpath(NO_OF_EMPLOYEES).extract()[1].replace(
                '\n', '').replace('\t', '').split('(Status', 1)[0]
            if response.xpath(NO_OF_EMPLOYEES).extract()
            and response.xpath(NO_OF_EMPLOYEES).extract()[1] is not None and
            ("between" in response.xpath(NO_OF_EMPLOYEES).extract()[1].replace(
                '\n', '').replace('\t', '').split('(Status', 1)[0] or "over"
             in response.xpath(NO_OF_EMPLOYEES).extract()[1]) else '',
            'companyDescription':
            response.xpath(COMPANY_DESCRIPTION).extract_first().replace(
                '\"', '').replace('\r', '').split('\n', 1)[0]
            if response.xpath(COMPANY_DESCRIPTION).extract_first() is not None
            else '',
            'companyWebsite':
            response.css(COMPANY_WEBSITE).extract_first().replace(
                '\"', '').replace('\r', '').split('\n', 1)[0] if
            response.css(COMPANY_WEBSITE).extract_first() is not None else '',
            'companyPhone':
            response.xpath(COMPANY_PHONE).extract_first().replace(
                '\"', '').replace('Phone: ', '').replace('\r', '').split(
                    '\n', 1)[0] if
            response.xpath(COMPANY_PHONE).extract_first() is not None else '',
            'streetAddress':
            response.xpath(STREET_ADDRESS).extract_first().replace(
                '\"', '').replace('\r', '').split('\n', 1)[0] if
            response.xpath(STREET_ADDRESS).extract_first() is not None and len(
                response.xpath(STREET_ADDRESS).extract_first().replace(
                    '\"', '').replace('\r', '').split('\n', 1)[0]) > 4 else '',
            'zipCode':
            response.xpath(ZIP_CODE).extract_first().replace('\"', '').replace(
                '\r', '').split(' ', 1)[0].split('\n', 1)[0]
            if response.xpath(ZIP_CODE).extract_first() is not None else '',
            'city':
            response.xpath(ZIP_CODE).extract_first().replace('\"', '').replace(
                '\r', '').split(' ', 2)[1].split('\n', 1)[0]
            if response.xpath(ZIP_CODE).extract_first() is not None
            and self.RepresentsInt(
                response.xpath(ZIP_CODE).extract_first().replace(
                    '\"', '').replace('\r', '').split(' ', 2)[1].split(
                        '\n', 1)[0]) is not True and len(
                            response.xpath(ZIP_CODE).extract_first().replace(
                                '\"', '').replace('\r', '').split(
                                    ' ', 2)[1].split('\n', 1)[0]) > 4 else '',
            'country':
            response.xpath(COUNTRY).extract_first().replace('\"', '').replace(
                '\r', '').split('\n', 1)[0]
            if response.xpath(COUNTRY).extract_first() is not None else '',
            'sector':
            'pupils, students',
            'productName':
            '; '.join(list(filter(None, productName_list))),
            'productDescription':
            '; '.join(list(filter(None, productDescription_list))),
            'productCategories':
            '; '.join(list(filter(None, productCategories_list)))
        }

        if self.counter != 4 and self.counter != 5:
            self.counter += 1
        index = str(self.counter)
        try:
            pagination = self.driver.find_element_by_xpath(
                '//*[@id="searchResult:search"]/section[2]/div/div/div[2]/section[23]/div/div/div/ul/li['
                + index + ']/a')
            #time.sleep(3) # sleep for 3 seconds
            pagination.click()
        except WebDriverException:
            self.driver.quit()
            #self.driver.get('http://www.hannovermesse.de/en/exhibition/exhibitors-products/advanced-search/')
            # checkboxArray = ['searchAP:zb:18:r', 'searchAP:zb:35:r', 'searchAP:zb:220:r', 'searchAP:zb:239:r', 'searchAP:zb:251:r', 'searchAP:zb:291:r', 'searchAP:zb:312:r']
            # self.driver.get('http://www.hannovermesse.de/en/exhibition/exhibitors-products/advanced-search/')
            # WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="searchAP:search"]/section/div[6]/div/div/div[2]/div[2]')))
            # self.driver.execute_script("document.getElementById('${checkboxArray[1]}').click()")
            # self.driver.find_element_by_xpath('//*[@id="searchAP:searchButton2"]').click();
            # self.counter = 1;
            # self.parse_dir_contents(self, response)
        response2 = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
        for href in response2.css('.search-link ::attr(href)'):
            url = response2.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)
Example #18
    quotes = response.css("div.quote > span.text::text").extract()
    div_tags =response.css("div.tags ")
    tags = [i.css("a.tag::text").extract() for i in response.css("div.tags")]
    base_url="http://quotes.toscrape.com/"
    rel_hyperlinks=response.css("small.author~a::attr(href)").extract()
    hyperlink=[base_url+i for i in rel_hyperlinks]
    return pd.DataFrame({"quotes":quotes,"author":author,"tags":tags,"author_page":hyperlink})

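# The request() helper called in the loop below is not shown in this excerpt; judging by the
# other examples on this page, it presumably fetches the page with requests and wraps it in a
# TextResponse. A sketch under that assumption:
import requests
from scrapy.http import TextResponse

def request(url):
    page = requests.get(url)
    return TextResponse(body=page.text, url=url, encoding="utf-8")
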
quotes = []
url = "http://quotes.toscrape.com/" 
while True:
    response = request(url)
    quotes.append(quote_scraper(response))
    url_for_next_page = response.css("li.next > a::attr(href)").extract_first()
    if url_for_next_page:
        url = response.urljoin(url_for_next_page)
    else:
        break

quotes = pd.concat(quotes)
quotes

"""# ***Problem 2(Scraping movies)***"""

URL="https://www.imdb.com/chart/moviemeter"

page=requests.get(URL)

response= TextResponse(body=page.text,url=URL,encoding="utf-8")

response.css("td>span.secondaryInfo::text").extract()