def parse(self, response):
    try:
        while self.s_roll <= self.e_roll:
            self.driver.get('http://new.aktu.co.in/')
            try:
                WebDriverWait(self.driver, 10).until(EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_divSearchRes"]/center/table/tbody/tr[4]/td/center/div/div/img')))
            except:
                continue
            # Sync scrapy and selenium so they agree on the page we're looking at, then let scrapy take over
            resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            rollno = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$TextBox1')
            rollno.send_keys(self.s_roll)
            try:
                resp = self.fill_captcha(resp)
                print format(resp.xpath('//*[@id="ContentPlaceHolder1_Label1"]/text()').extract())
                while "Incorrect" in format(resp.xpath('//*[@id="ContentPlaceHolder1_Label1"]/text()').extract()):
                    resp = self.fill_captcha(resp)
            except:
                continue
            self.parse_result(self.driver.current_url)
            self.s_roll += 1
        self.count += 3
        self.sheet.write(self.count, 0, "First")
        self.sheet.write(self.count, 1, self.top[0][0])
        self.sheet.write(self.count + 1, 0, "Last")
        self.sheet.write(self.count + 1, 1, self.top[1][0])
    except:
        self.parse(response)
    finally:
        return
def parse(self, response):
    self.driver.get(response.url)
    try:
        WebDriverWait(self.driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="views"]/div/div[2]/div[2]/div[3]/div[10]/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/span')))
    except TimeoutException:
        print "Time out"
        return
    # Sync scrapy and selenium so they agree on the page we're looking at, then let scrapy take over
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
    for href in resp.xpath('//*[@id="views"]/div/div[2]/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/@href'):
        url = resp.urljoin(href.extract())
        yield scrapy.Request(url, callback=self.parse_property)
    if self.page == 5:
        return
    self.page += 1
    yield scrapy.Request(url="https://www.proptiger.com/noida/property-sale?page=%d" % self.page,
                         headers={"Referer": "https://www.proptiger.com/noida/property-sale",
                                  "X-Requested-With": "XMLHttpRequest"},
                         callback=self.parse, dont_filter=True)
def parse(self, response):
    self.driver.get(response.url)
    while True:
        time.sleep(1)
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located(
                (By.XPATH, '//p[@class="propertyName"]/a')))
        except TimeoutException:
            return
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        urls = resp.xpath('//p[@class="propertyName"]/a/@href').extract()
        pprint(urls)
        #urls=['http://www.magicbricks.com/propertyDetails/270-Sq-ft-Studio-Apartment-FOR-Sale-Vatika-City-in-Gurgaon&id=4d423230333337333839?from=search']
        if len(urls) == 0:
            return
        for url in urls:
            abs_url = 'http://www.squareyards.com' + url
            yield scrapy.Request(abs_url, callback=self.parse_property_info)
        try:
            link = self.driver.find_element_by_xpath('//ul[@class="newpagination"]/li[2]')
            actions = ActionChains(self.driver)
            actions.click(link)
            actions.perform()
        except:
            return
def parse(self, response):
    while self.roll < 1409110903:
        self.driver.get('http://new.aktu.co.in/')
        try:
            WebDriverWait(self.driver, 30).until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_divSearchRes"]/center/table/tbody/tr[4]/td/center/div/div/img')))
        except:
            continue
        # Sync scrapy and selenium so they agree on the page we're looking at, then let scrapy take over
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        rollno = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$TextBox1')
        rollno.send_keys(self.roll)
        captcha_url = format(resp.xpath('//*[@id="ctl00_ContentPlaceHolder1_divSearchRes"]/center/table/tbody/tr[4]/td/center/div/div/img/@src').extract())
        url = "http://new.aktu.co.in/" + captcha_url[3:-2]
        print url
        captcha = url_to_image(url)
        captcha_value = read_captcha(captcha)
        print captcha_value
        captcha_input = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$txtCaptcha')
        captcha_input.send_keys(captcha_value)
        input()
        submit = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$btnSubmit')
        actions = ActionChains(self.driver)
        actions.click(submit)
        actions.perform()
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        if "Incorrect Code" in format(resp.xpath('*').extract()):
            continue
        self.parse_result(self.driver.current_url)
        self.roll += 1
    self.count += 3
    self.sheet.write(self.count, 0, "First")
    self.sheet.write(self.count, 1, self.top[0][0])
    self.sheet.write(self.count + 1, 0, "Last")
    self.sheet.write(self.count + 1, 1, self.top[1][0])
    return
def parse(self, response):
    self.driver.get('https://www.reddit.com/r/technology/')
    response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    posts = response.xpath('//div[@class="entry unvoted"]').extract()
    upvotes = response.xpath('//div[@class="score unvoted"]/text()').extract()
    for i in range(50):
        for j, post in enumerate(posts):
            comment = Selector(text=post).xpath(
                '//ul[@class="flat-list buttons"]/li[@class="first"]/a/text()').extract()
            label = Selector(text=post).xpath(
                '//p[@class="title"]/span[@class="linkflairlabel"]/text()').extract()
            title = Selector(text=post).xpath('//p[@class="title"]/a/text()').extract()
            date = Selector(text=post).xpath(
                '//p[@class="tagline"]/time/@datetime').extract()
            link = Selector(text=post).xpath(
                '//p[@class="title"]/span[@class="domain"]/a/text()').extract()
            upvote = upvotes[j]
            item = RedditItem()
            item['upvotes'] = upvote
            item['comments'] = comment
            item['label'] = label
            item['title'] = title
            item['date'] = date
            item['link'] = link
            yield item
        self.driver.find_element_by_xpath('//a[@rel="nofollow next"]').click()
        time.sleep(2)
def parse(self, response: TextResponse) -> [Request, YelpService]:
    """
    This is the default callback used by Scrapy to process downloaded responses, when their
    requests don't specify a callback. The parse method is in charge of processing the
    response and returning scraped data and/or more URLs to follow.

    Args:
        :param response: the response to parse
    """
    # Check whether we are on the search results page
    if response.url.startswith("https://www.yelp.com/search?"):
        info_page_urls = response.css(".biz-name::attr(href)")

        # Check whether we have any results
        if info_page_urls is not None:
            for url in info_page_urls[:self.max_results]:
                # Join the url found with the domain url, and return a new Request for it,
                # which will be parsed by this same method.
                info_page = response.urljoin(url.extract())
                yield Request(info_page)
    # We are on the info page, so we can already extract the information
    else:
        yield self._map_response(response)
def comment_parse(self, response):
    print response.url
    aid = response.meta['article']['aid']
    date = response.meta['article']['date']
    self.driver.get(response.url)
    time.sleep(3)
    # Keep clicking the "more" button until it can no longer be found or clicked
    while True:
        try:
            button_more = self.driver.find_element_by_xpath('//a[@class="u_cbox_btn_more __cbox_page_button"]')
            button_more.click()
        except:
            break
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
    for site in resp.xpath('.//ul[@class="u_cbox_list"]/li'):
        username = site.xpath('.//span[@class="u_cbox_nick"]/text()').extract()
        like_count = site.xpath('.//em[@class="u_cbox_cnt_recomm"]/text()').extract()
        dislike_count = site.xpath('.//em[@class="u_cbox_cnt_unrecomm"]/text()').extract()
        contents = site.xpath('.//span[@class="u_cbox_contents"]/text()').extract()
        comment = NaverCommentItem()
        comment['aid'] = aid
        comment['username'] = username
        comment['like_count'] = like_count
        comment['dislike_count'] = dislike_count
        comment['contents'] = ''.join(contents)
        comment['date'] = date
        yield comment
def fake_response_from_file(filename, url=None):
    """
    Create a fake Scrapy HTTP response from an HTML file

    @param filename: The relative filename from the responses directory,
                     but absolute paths are also accepted.
    @param url: The URL of the response.
    returns: A scrapy HTTP response which can be used for unit testing.

    taken from http://stackoverflow.com/a/12741030/3605870
    """
    if not url:
        url = 'http://www.example.com'
    request = Request(url=url)
    if not filename[0] == '/':
        responses_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(responses_dir, 'test_spiders_data', filename)
    else:
        file_path = filename
    file_content = open(file_path, 'r').read()
    response = TextResponse(url=url, request=request, body=file_content)
    response._encoding = 'latin-1'
    return response
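# A minimal usage sketch (not from the original source): feeding the fake response into a
# spider callback inside a unit test. The fixture name and MySpider are hypothetical.
def test_parse_extracts_items():
    response = fake_response_from_file('sample_page.html',
                                       url='http://www.example.com/listing')
    spider = MySpider()  # hypothetical spider under test
    results = list(spider.parse(response))
    assert len(results) > 0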
def parse(self, response):
    self.driver.get(response.url)
    try:
        WebDriverWait(self.driver, 10).until(EC.presence_of_element_located(
            (By.XPATH, '//a[@class="list-name"]')))
    except TimeoutException:
        return
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
    urls = resp.xpath('//a[@class="list-name"]/@href').extract()
    old = 0
    new = len(urls)
    while old != new:
        print "\n\n\n", old, new, "\n\n\n"
        for i in xrange(old, new):
            abs_url = 'http://www.housing.com' + urls[i]
            yield scrapy.Request(abs_url, callback=self.parse_property_info)
        try:
            link = self.driver.find_element_by_xpath('//div[@class="show-more-container"]')
            actions = ActionChains(self.driver)
            actions.click(link)
            actions.perform()
        except:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        urls = resp.xpath('//a[@class="list-name"]/@href').extract()
        old = new
        new = len(urls)
def parse(self, response):
    #response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    sel = Selector(response)
    self.driver.get(response.url)
    i = 0
    while True:
        #next = self.driver.find_element_by_xpath('//*[@id="pagnNextString"]')
        next = WebDriverWait(self.driver, 10).until(
            EC.visibility_of_element_located((By.ID, "pagnNextString"))
        )
        #next.click()
        #i = i + 1
        #if i == 2:
        #    break
        #sleep(50000)
        try:
            #self.driver.set_page_load_timeout(10000)
            response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
            sites = response.xpath('//*[@id="s-results-list-atf"]/li')
            for site in sites:
                '''item = EbayItem()
                item['title'] = site.xpath('//a/h2[@class="a-size-base a-color-null s-inline s-access-title a-text-normal"]/text()').extract()
                item['link'] = sel.xpath('//a/@href').extract()
                item['price'] = site.xpath('//span[@class="a-size-base a-color-price s-price a-text-bold"]/span/text()').extract()
                yield item'''
                item = EbayItem()
                #title = site.xpath('//a/h2[@class="a-size-base a-color-null s-inline s-access-title a-text-normal"]/text()').extract()
                item['title'] = site.xpath('div/div[2]/div[1]/a/h2/text()').extract()
                item['link'] = site.xpath('div/div[2]/div[1]/a/@href').extract()
                item['price'] = site.xpath('div/div[3]/div[0]/a/span[0]/text() | div/div[3]/div[1]/a/span/text() ').extract()
                item['image'] = site.xpath('div/div[1]/div/div/a/img/@src').extract()
                #item['rating'] = site.xpath('div/div[5]/span/span/a/i[1]/span/text() | div/div[4]/span/span/a/i[1]/span/text()').extract()
                item['rating'] = site.xpath('div//span[@class="a-icon-alt"]/text()').extract()
                #price = site.xpath('div/div[3]/div[0]/a/span[0]/text() | div/div[3]/div[1]/a/span/text() ').extract()
                #print title, link, price
                #print price
                #sleep(50000)
                yield item
            try:
                self.driver.find_element_by_xpath('//*[@id="pagnNextLink"]')
            except NoSuchElementException:
                break
            next.click()
            # get the data and write it to scrapy items
        except:
            #break
            a = 10
            #next.click()
    self.driver.close()
def parse_selenium(self, response):
    # Use the previous instance of the web browser which was created to visit "response.url"
    self.driver.get(response.url)
    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # All comments have been loaded, once again pass the "body" argument back in
    response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    data = ScrapyTutorialItem()
    data['item'] = {'url': response.url.split("=")[1],
                    'items': response1.xpath("//div[@class='ItemTitle-sc-1bls9ac-0 hrhyAs']/text()").extract()}
    return data
def parse(self, response):
    sel = Selector(response)
    self.driver.get(response.url)
    block = "none"
    hyper = "http://www.snapdeal.com"
    print hyper
    i = 0
    while True:
        self.driver.set_page_load_timeout(10000)
        self.driver.execute_script("window.scrollTo(10000000,10000000)")
        self.driver.set_page_load_timeout(10000)
        try:
            show = self.driver.find_element_by_xpath('//*[@id="seemore"]').value_of_css_property('display')
            print show
            '''if show == block:
                self.driver.find_element_by_xpath('//div[@id="show-more-results"]').click()'''
            no_more = self.driver.find_element_by_xpath('//*[@class="mar_20per_left ajax-loader-icon hidden"]').value_of_css_property('display')
            print no_more
            if no_more == block and show == block:
                break
            time.sleep(5)
            self.driver.execute_script("window.scrollTo(10000000,10000000)")
            self.driver.set_page_load_timeout(10000)
        except NoSuchElementException:
            print "pungi"
            break
    #down = self.driver.find_element_by_xpath('//div[@id="show-more-results"]')
    #location = down.location
    #self.((JavascriptExecutor) driver).executeScript("window.scrollBy(10000,10000);");
    #next = self.driver.find_element_by_xpath('//div[@id="show-more-results"]')
    response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    try:
        #self.driver.set_page_load_timeout(10000)
        #driver.execute_script("window.scrollTo(0, location.get('y')")
        sites = response.xpath('//*[@class="product_grid_box"]')
        #print sites
        for site in sites:
            item = FlipkartItem()
            check = site.xpath('div[@class="productWrapper"]//div[@class="soldout-tag prod-grid-sold-out-lang"]/div/text()').extract()
            if 'SOLD OUT' in check:
                continue
            item['price'] = site.xpath('div[@class="productWrapper"]//div[@class="product-price"]/div/text()').extract()[0]
            data = site.xpath('div[@class="productWrapper"]//div[@class="product-title"]/a/text()').extract()
            item['title'] = data
            item['rating'] = site.xpath('div[@class="productWrapper"]//div[@class="ratingStarsSmall"]/@style | div[@class="productWrapper"]//div[@class="ratingStarsSmall corrClass8"]/@ratings').extract()
            item['image'] = site.xpath('div[@class="productWrapper"]//div[@class=" product-image "]/a/img/@src').extract()
            item['link'] = site.xpath('div[@class="productWrapper"]//div[@class="product-title"]/a/@href').extract()
            yield item
    except:
        print "Loop error"
        a = 10
    self.driver.close()
def parse(self, response):
    try:
        self.driver.get(response.url)
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        blocks = resp.xpath('//div[contains(@id,"resultBlockWrapper")]')
        old = 0
        new = len(blocks)
        while old != new:
            print old, new
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            blocks = resp.xpath('//div[contains(@id,"resultBlockWrapper")]')
            old = new
            new = len(blocks)
        for block in blocks:
            try:
                price = ''.join(block.xpath('div//div[@class="srpColm2"]//span[contains(@id,"pricePropertyVal")]//text()').extract())
                iscr = 'Cr' in price
                islac = 'Lac' in price
                price = price.replace(',', '').replace('Cr', '').replace('Lac', '')
                price = float(price.split()[0])
                price *= 10000000 if iscr else 1
                price *= 100000 if islac else 1
                bhk = ''.join(block.xpath('div//div[@class="srpColm2"]//strong/text()').extract())
                bhk = (''.join(bhk.split()[:2])).replace('.5', '')
                if "bhk" in bhk.lower() and not ("1bhk" in bhk.lower() or '1 bhk' in bhk.lower()):
                    ppf = ''.join(block.xpath('div//div[@class="srpColm2"]//span[@class="proRentArea"]/text()').extract())
                    if ppf == "":
                        ppf = ''.join(block.xpath('div//div[@class="srpColm2"]//span[@class="proNameSizeTxt"]/text()').extract())
                    ppf = float(ppf.split()[0])
                    if bhk in self.obj:
                        self.obj[bhk]['min'] = self.obj[bhk]['min'] if price > self.obj[bhk]['min'] else price
                        self.obj[bhk]['max'] = self.obj[bhk]['max'] if price < self.obj[bhk]['max'] else price
                        self.obj[bhk]['count'] += 1
                        self.obj[bhk]['avg'] += ppf
                    else:
                        self.obj[bhk] = {'min': price, 'max': price, 'count': 1, 'avg': ppf}
            except:
                pass
        with open(os.path.dirname(__file__) + '/../../price.json', 'w') as file:
            file.write(json.dumps(self.obj))
        return
    except:
        with open('cus.txt', 'ab+') as f:
            f.write("some error occurred")
        return
def load_item_from_values(values):
    request = Request(values['url'])
    response = TextResponse(
        url=values['url'],
        status=values['http_status'],
        body=values.get('body', ''),
        request=request,
        headers={
            'Content-Type': values['content_type'],
        }
    )
    if 'redirect_urls' in values:
        response.meta['redirect_urls'] = values['redirect_urls']
    loader = StreamItemLoader(item=StreamItem(), response=response)
    loaded_item = loader.load_item()
    loaded_item['metadata'] = values['metadata']
    return loaded_item
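# A minimal usage sketch (assumed, not from the original source): the `values` dict only
# needs the keys the loader above reads; the URL, body, and metadata here are made up.
values = {
    'url': 'http://www.example.com/article',
    'http_status': 200,
    'body': '<html><body><h1>Example</h1></body></html>',
    'content_type': 'text/html; charset=utf-8',
    'metadata': {'source': 'example'},
}
item = load_item_from_values(values)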
def test_csviter_delimiter(self):
    body = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response, delimiter='\t')

    self.assertEqual([row for row in csv],
                     [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
                      {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
                      {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
                      {u'id': u'4', u'name': u'empty', u'value': u''}])
def parse(self, response):
    self.driver.get(response.url)
    try:
        WebDriverWait(self.driver, 30).until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="views"]/div/div[2]/div[2]/div[3]/div[10]/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/span')))
    except:
        yield scrapy.Request(url="https://www.proptiger.com/%s/property-sale?page=%d" % (self.city, self.page),
                             callback=self.parse)
    # Sync scrapy and selenium so they agree on the page we're looking at, then let scrapy take over
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
    for href in resp.xpath('//*[@id="views"]/div/div[2]/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/@href'):
        url = resp.urljoin(href.extract())
        yield scrapy.Request(url, callback=self.parse_property)
    if self.page == self.end_page:
        return
    self.page += 1
    yield scrapy.Request(url="https://www.proptiger.com/%s/property-sale?page=%d" % (self.city, self.page),
                         callback=self.parse)
def test_csviter_wrong_quotechar(self):
    body = get_testdata('feeds', 'feed-sample6.csv')
    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response)

    self.assertEqual([row for row in csv],
                     [{"'id'": "1", "'name'": "'alpha'", "'value'": "'foobar'"},
                      {"'id'": "2", "'name'": "'unicode'", "'value'": "'\xfan\xedc\xf3d\xe9\u203d'"},
                      {"'id'": "'3'", "'name'": "'multi'", "'value'": "'foo"},
                      {"'id'": "4", "'name'": "'empty'", "'value'": ""}])
def test_remove_multiple_nested_expressions():
    # Mock response using expected article data
    html = """<html>
    <head></head>
    <body>
        <div class="post-content">
            <h1 class="post-title">Article title</h1>
            <div class="post-content">
                <p>Paragraph 1</p>
                <p>Paragraph 2</p>
                <p>Paragraph 3</p>
            </div>
            <div class="bad">
                <div class="social">
                    <p>Twitter</p>
                </div>
                <div class="social">
                    <div class="bad">
                        <p>Facebook</p>
                    </div>
                </div>
            </div>
        </div>
    </body>
    </html>"""
    response = TextResponse(url="http://example.com", body=html, encoding="utf-8")

    # Mock config
    config_yaml = """
    site_name: 'example.com'
    article:
      content:
        select_method: 'xpath'
        select_expression: '//div[@class="post-content"]'
        match_rule: 'first'
        remove_expressions:
          - '//div[@class="social"]'
          - '//div[@class="bad"]'
    """
    config = yaml.load(config_yaml, Loader=yaml.FullLoader)

    # Test content extraction with removal
    expected_html = """
    <div class="post-content">
        <h1 class="post-title">Article title</h1>
        <div class="post-content">
            <p>Paragraph 1</p>
            <p>Paragraph 2</p>
            <p>Paragraph 3</p>
        </div>
    </div>"""
    validate_extract_element(response, config['article']['content'], expected_html)
def test_set(self):
    """EXSLT set manipulation tests"""
    # microdata example from http://schema.org/Event
    body = """
    <div itemscope itemtype="http://schema.org/Event">
      <a itemprop="url" href="nba-miami-philidelphia-game3.html">
        NBA Eastern Conference First Round Playoff Tickets:
        <span itemprop="name"> Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1) </span>
      </a>
      <meta itemprop="startDate" content="2016-04-21T20:00">
      Thu, 04/21/16 8:00 p.m.
      <div itemprop="location" itemscope itemtype="http://schema.org/Place">
        <a itemprop="url" href="wells-fargo-center.html">
          Wells Fargo Center
        </a>
        <div itemprop="address" itemscope itemtype="http://schema.org/PostalAddress">
          <span itemprop="addressLocality">Philadelphia</span>,
          <span itemprop="addressRegion">PA</span>
        </div>
      </div>
      <div itemprop="offers" itemscope itemtype="http://schema.org/AggregateOffer">
        Priced from: <span itemprop="lowPrice">$35</span>
        <span itemprop="offerCount">1938</span> tickets left
      </div>
    </div>
    """
    response = TextResponse(url="http://example.com", body=body)
    sel = self.sscls(response)

    self.assertEqual(
        sel.xpath('''//div[@itemtype="http://schema.org/Event"]
                     //@itemprop''').extract(),
        [u'url', u'name', u'startDate', u'location', u'url', u'address',
         u'addressLocality', u'addressRegion', u'offers', u'lowPrice', u'offerCount']
    )

    self.assertEqual(
        sel.xpath('''
            set:difference(//div[@itemtype="http://schema.org/Event"]
                             //@itemprop,
                           //div[@itemtype="http://schema.org/Event"]
                             //*[@itemscope]/*/@itemprop)''').extract(),
        [u'url', u'name', u'startDate', u'location', u'offers'])
def test_regexp(self):
    """EXSLT regular expression tests"""
    body = """
    <p><input name='a' value='1'/><input name='b' value='2'/></p>
    <div class="links">
      <a href="/first.html">first link</a>
      <a href="/second.html">second link</a>
      <a href="http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml">EXSLT match example</a>
    </div>
    """
    response = TextResponse(url="http://example.com", body=body)
    sel = self.sscls(response)

    # re:test()
    self.assertEqual(
        sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]').extract(),
        [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//a[re:test(@href, "\.html$")]/text()')],
        [u'first link', u'second link'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//a[re:test(@href, "first")]/text()')],
        [u'first link'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//a[re:test(@href, "second")]/text()')],
        [u'second link'])

    # re:match() is rather special: it returns a node-set of <match> nodes
    #[u'<match>http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml</match>',
    # u'<match>http</match>',
    # u'<match>www.bayes.co.uk</match>',
    # u'<match></match>',
    # u'<match>/xml/index.xml?/xml/utils/rechecker.xml</match>']
    self.assertEqual(
        sel.xpath('re:match(//a[re:test(@href, "\.xml$")]/@href,'
                  '"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(),
        [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
         u'http',
         u'www.bayes.co.uk',
         u'',
         u'/xml/index.xml?/xml/utils/rechecker.xml'])

    # re:replace()
    self.assertEqual(
        sel.xpath('re:replace(//a[re:test(@href, "\.xml$")]/@href,'
                  '"(\w+)://(.+)(\.xml)", "","https://\\2.html")').extract(),
        [u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html'])
def parse(self, response):
    response = TextResponse(response.url, encoding='iso-8859-1',
                            body=response.body, request=response.request)
    hxs = HtmlXPathSelector(response)
    cats = []
    if response.url == self.start_urls[0]:
        cats = hxs.select('//td/p/a/@href').extract()
        cats += hxs.select('//li//a[@class="noline2"]/@href').extract()
    price_list_cats = []
    price_list = False
    if not response.meta.get('price_list'):
        price_list_cats = hxs.select(
            '//a[contains(text(), "Price List") and contains(@href, "pricelist")]/@href').extract()
        if price_list_cats:
            price_list = True
        cats += price_list_cats
    cats += hxs.select('//div[@align="center"]/table//font/img/../../@href').extract()
    cats += hxs.select('//div[@align="center"]/table//span/img/../../@href').extract()
    cats += hxs.select('//div[@align="center"]/table//a/img/../@href').extract()
    if not price_list:
        cats += hxs.select('//table//td[@class="catalog3"]//a/@href').extract()
        cats += hxs.select('//table//td[@class="graybg"]//span/../@href').extract()
        cats += hxs.select('//table//td[@class="graybg"]//span/../../@href').extract()
        cats += hxs.select('//table//td[@class="graybg"]//span/a/@href').extract()
    for cat in cats:
        if "action=buy_now" not in cat:
            url = urljoin_rfc(get_base_url(response), cat)
            if len(re.findall('\.htm', url)) > 1 or len(re.findall('\.asp', url)) > 1:
                continue
            yield Request(url, encoding='iso-8859-1',
                          meta={'price_list': price_list or response.meta.get('price_list')})
    for product in self.parse_products(hxs, response):
        yield product
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.
    """

    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = "https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip"
    FILE = "Tabular.xml"
    VERSION = "ICD-10-CM"
    LAST_UPDATED = "2015-10-01"

    # Prepare xml
    zip = requests.get(URL).content
    xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read()
    res = TextResponse(url=URL, body=xml, encoding="utf-8")

    count = 0
    for diag in res.xpath("//diag"):

        # We need only leafs
        childs = diag.xpath("./diag")
        if not childs:
            continue

        # Get data
        data = {}
        data["name"] = diag.xpath("./name/text()").extract_first()
        data["desc"] = diag.xpath("./desc/text()").extract_first()
        data["terms"] = diag.xpath(".//note/text()").extract()
        data["version"] = VERSION
        data["last_updated"] = LAST_UPDATED

        # Create record
        record = Record.create(URL, data)

        # Write record
        record.write(conf, conn)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
def get_details(self, response):
    item = JobkoreaCrawlerItem()
    item["company_name"] = response.xpath(
        '//*[@id="container"]/section/div/article/div[1]/h3/span/text()')[0].extract().strip()
    try:
        item["deadline"] = response.xpath(
            '//*[@id="tab02"]/div/article[1]/div/dl[2]/dd[2]/span/text()')[0].extract()[5:] + " 마감"
    except:
        item["deadline"] = "수시채용"
    url = "http://www.jobkorea.co.kr" + response.xpath(
        '//*/article[contains(@class, "artReadCoInfo") and contains(@class, "divReadBx")]/div/div/p/a/@href')[0].extract()
    req = requests.get(url)
    response_detail_page = TextResponse(req.url, body=req.text, encoding='utf-8')
    item["business"] = response_detail_page.xpath(
        '//*[@id="company-body"]/div[1]/div[1]/div/div/div[9]/div[2]/div/div/text()')[0].extract()
    item['link'] = response.url
    item["position"] = response.xpath(
        '//*[@id="container"]/section/div/article/div[1]/h3/text()')[1].extract().strip()
    try:
        item["job_condition"] = response_detail_page.xpath(
            '//*[@id="company-body"]/div[1]/div[1]/div/div/div[8]/div[2]/div/div/div/div/text()')[0].extract()
    except:
        item["job_condition"] = "회사 내규에 따름 - 연봉 협의"
    item["keyword"] = response.xpath(
        '//*[@id="artKeywordSearch"]/ul/li/button/text()').extract()[:-1]
    yield item
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.
    """

    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'

    # Prepare xml
    zip = requests.get(URL).content
    xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')

    count = 0
    for diag in res.xpath('//diag'):

        # We need only leafs
        childs = diag.xpath('./diag')
        if not childs:
            continue

        # Get data
        data = {}
        data['name'] = diag.xpath('./name/text()').extract_first()
        data['desc'] = diag.xpath('./desc/text()').extract_first()
        data['terms'] = diag.xpath('.//note/text()').extract()
        data['version'] = VERSION
        data['last_updated'] = LAST_UPDATED

        # Create record
        record = Record.create(URL, data)

        # Write record
        record.write(conf, conn)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.
    """

    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'

    # Prepare xml
    zip = requests.get(URL).content
    xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')

    count = 0
    for diag in res.xpath('//diag'):

        # We need only leafs
        childs = diag.xpath('./diag')
        if not childs:
            continue

        # Get data
        data = {}
        data['name'] = diag.xpath('./name/text()').extract_first()
        data['desc'] = diag.xpath('./desc/text()').extract_first()
        data['terms'] = diag.xpath('.//note/text()').extract()
        data['version'] = VERSION
        data['last_updated'] = LAST_UPDATED

        # Create record
        record = Record.create(URL, data)

        # Write record
        base.writers.write_record(conn, record)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
def get_datos(v_url, v_grupo, v_subgrupo):
    import requests
    from scrapy.http import TextResponse
    url = v_url
    user_agent = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Ubuntu Chromium/58.0.3029.110 Chrome/58.0.3029.110 Safari/537.36'
    }
    r = requests.get(url, headers=user_agent)
    response2 = TextResponse(r.url, body=r.text, encoding='utf-8')
    p_normal = response2.xpath('//div[@class="prices"]/div/span//text()').extract()[0].strip()
    t_desc = response2.xpath('//div[@class="product-essential"]/div/h1//text()').extract()[0].strip()
    t_marca = ''
    if len(response2.xpath('//div[@class="manufacturers"]/span/a//text()')) > 0:
        t_marca = response2.xpath('//div[@class="manufacturers"]/span/a//text()').extract()[0].strip()
    else:
        tabla = response2.xpath(
            '//div[@class="productTabs-body"]/div/div/div/table[@class="data-table"]/tbody/tr')
        for t in tabla:
            tag = t.xpath('td//text()').extract()
            titulo = tag[0].strip()
            desc = tag[1].strip()
            if titulo.upper() == 'MARCA':
                t_marca = desc
                break
    skuId = response2.xpath(
        '//div[@class="additional-details"]/div[@class="sku"]/span[@class="value"]//text()').extract()[0].strip()
    t_modelo = response2.xpath(
        '//div[@class="additional-details"]/div[@class="gtin"]/span[@class="value"]//text()').extract()[0].strip()
    row = [v_grupo, v_subgrupo, skuId, t_marca, t_desc, p_normal, t_modelo, v_url]
    return row
def test_csviter_exception(self):
    body = get_testdata('feeds', 'feed-sample3.csv')
    response = TextResponse(url="http://example.com/", body=body)
    iter = csviter(response)
    iter.next()
    iter.next()
    iter.next()
    iter.next()
    self.assertRaises(StopIteration, iter.next)
def parse(self, response: TextResponse):
    # getting the data required to store in the pages table
    r_url = response.url
    r_page = response.text
    r_time = datetime.now()
    print(__file__, "CrawCategory.parse()", "scraping for pages: {}".format(r_url))

    # create SQLAlchemy page object
    pge = Page(url=r_url, html=r_page, date=r_time, category=CrawlCategory.catObject)
    # add page object
    CrawlCategory.dbSession.add(pge)

    # calculating the url for the next page
    next_page = response.css("li.next a").attrib["href"]
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
def fake_response(url, meta=None):
    """Create a fake scrapy HTTP response from a url"""
    response_with_body = requests.get(url)
    request = Request(url=url, meta=meta)
    response = TextResponse(url=url, request=request, body=response_with_body.content)
    return response
def parse2(self, response):
    self.driver.get(response.url)
    time.sleep(5)
    selector = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    sub_category_url = selector.xpath('//a[@class="gb-refinement"]/@href').extract()
    print("-----------------------", sub_category_url)
    for product_details in sub_category_url:
        yield Request(url=product_details, callback=self.parse3)
    if not sub_category_url:
        product_details = response.url
        print("=================parse2========================", product_details)
        yield Request(url=product_details, callback=self.parse3)
def __init__(self, response=None, text=None, namespaces=None, _root=None, _expr=None):
    if text is not None:
        response = TextResponse(url='about:blank',
                                body=unicode_to_str(text, 'utf-8'), encoding='utf-8')
    if response is not None:
        _root = LxmlDocument(response, self._parser)
    self.namespaces = namespaces
    self.response = response
    self._root = _root
    self._expr = _expr
def process_request(self, request, spider) -> process_request_type:
    row_key = spider.name + "_" + Enc.Encrypt.md5(request.url)
    result = self.getCacheResult(request, spider, row_key=row_key)
    if result:
        logger.info("R>>>>\trow_key\t" + str(row_key) + "\t" + request.url)
        encoding = result.get(b"source:encoding", [b"utf-8"])[0].decode("utf-8")
        return TextResponse(url=result.get(b"source:url")[0].decode("utf-8"),
                            body=result.get(b"source:html")[0].decode("utf-8"),
                            status=result.get(b"source:status_code")[0].decode("utf-8"),
                            encoding=encoding)
    else:
        pass
def parse(self, response):
    self.driver.get(response.url)
    urls = []
    for i in range(1, 20):
        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        self.driver.implicitly_wait(10)
        for j in range(1, 31):
            result = response.xpath('//*[@class="col-md-9"]/div[1]/div[' + str(j) + ']/h3/a/@href')
            urls.extend(result)
        next_page = self.driver.find_element_by_xpath('//*[@title="Go to next page"]')
        next_page.click()
def process_request(self, request, spider) -> process_request_type:
    if self.conn_redis7.exists(request.url):
        print("==Read cache 2 redis==")
        url = self.conn_redis7.hget(request.url, "url")
        html = self.conn_redis7.hget(request.url, "html")
        status_code = self.conn_redis7.hget(request.url, "status_code")
        encoding = self.conn_redis7.hget(request.url, "encoding")
        return TextResponse(url=url, body=html, status=status_code, encoding=encoding)
def _login(self):
    """Login module for the Zhihu crawler (simulated login via Selenium)."""
    r = 1
    while r != 0:
        try:
            self.driver.set_page_load_timeout(20)  # guard against the page never finishing loading
            self.driver.get('http://www.zhihu.com/#signin')
            time.sleep(10)  # sleep 10 seconds to wait for the user to enter account credentials
            self.driver.get('http://www.zhihu.com/#signin')
            response = TextResponse(url=self.driver.current_url,
                                    body=self.driver.page_source.encode('utf-8'))
            user_info = response.xpath('/html/body/script[@data-name="current_user"]/text()')
            user_info = user_info.extract()[0].replace('[', '').replace(']', '').replace('\"', '').split(',')
            if not user_info[0] == '':
                print u'User %s logged in successfully' % user_info[0]
                logger.info(u'User %s logged in successfully' % user_info[0])
                break
            else:
                logger.error(u'Incorrect account name or password.')
        except:
            continue
class Quotes:

    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def get_quotes(self):
        return self.response.css("span.text::text").extract()

    def get_authors(self):
        return self.response.css("small.author::text").extract()

    def get_tags(self):
        """Gets all the tags in one list."""
        return self.response.css("div.tags > a.tag::text").extract()

    def get_author_link(self):
        return self.response.css("small.author ~ a::attr(href)").extract()
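# A minimal usage sketch (assumed, not from the original source): any page whose markup
# matches the CSS selectors above works; quotes.toscrape.com is the usual demo site.
quotes = Quotes("http://quotes.toscrape.com/")
print(quotes.get_quotes()[:3])
print(quotes.get_authors()[:3])
print(quotes.get_tags()[:5])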
def parse(self, response):
    b = BeautifulSoup(response.body)
    details = b.findAll(attrs={"class": "detail"})
    for detail in details:
        resp = TextResponse(url="..", status=200, body=detail.text.encode("utf8"))
        for requests_or_item in iterate_spider_output(self.parse_item(resp)):
            yield requests_or_item
def parse_selenium(self, response):
    # Use the previous instance of the web browser which was created to visit "response.url"
    self.driver.get(response.url)
    self.driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);")
    # All comments have been loaded, once again pass the "body" argument back in
    response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    data = ScrapyTutorialItem()
    data['item'] = {
        'url': response.url.split("=")[1],
        'items': response1.xpath(
            "//div[@class='ItemTitle-sc-1bls9ac-0 hrhyAs']/text()").extract()
    }
    return data
def test_deprecated_selectorlist_methods(self):
    sel = Selector(TextResponse(url="http://example.com", body=b'<p>some text</p>'))

    with warnings.catch_warnings(record=True) as w:
        sel.xpath('//p').select('.')
        self.assertSubstring('Use .xpath() instead', str(w[-1].message))

    with warnings.catch_warnings(record=True) as w:
        sel.xpath('//p').extract_unquoted()
        self.assertSubstring('Use .extract() instead', str(w[-1].message))
def test_csviter_exception(self):
    body = get_testdata("feeds", "feed-sample3.csv")
    response = TextResponse(url="http://example.com/", body=body)
    iter = csviter(response)
    next(iter)
    next(iter)
    next(iter)
    next(iter)
    self.assertRaises(StopIteration, next, iter)
def __file_as_response(filename, meta={}):
    from os import path
    filepath = 'fake_files/' + filename
    current_dir = path.dirname(path.abspath(__file__))
    fullpath = path.join(current_dir, filepath)
    with open(fullpath, 'r') as f:
        content = f.read()
    url = 'http://www.example.com'
    req = Request(url, meta=meta)
    return TextResponse(url, request=req, body=content)
def parse(self, response):
    self.driver.get(response.url)
    time.sleep(8)
    sel = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    # total_products = sel.path('//h4[@class="u-d-inlineblock u-smallest-font implicit subsection"]/span[@data-qa-id="results_count"]/text()').extract_first()
    # print '---total--', total_products
    # sel = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    pages_element = sel.xpath(
        '//div[@class="pure-u-14-24"]/ul[@class="c-paginator"]/li[@class=""]/a/text()').extract_first()
    # print '+++pages++++', pages_element
    pages_element = int(pages_element)
    for i in range(1, 18 + 1):
        url = 'https://www.practo.com/delhi/dietitian-nutritionist?page=' + str(i)
        print '------', url
        yield scrapy.Request(url=url, callback=self.product)
def get_chapter_list(self, response: TextResponse):
    chapter_url_list = response.xpath('//div[@id="list"]/dl/dd/a/@href').extract()
    chapter_url_list = ['http://www.xbiquge.la' + i for i in chapter_url_list]
    for chapter_url in chapter_url_list:
        print(chapter_url)
        yield Request(chapter_url, callback=self.get_content, dont_filter=True)
def parse_result(self, response):
    crime = response.meta.get('crime')
    year = format(int(response.meta.get('year')), '02d')
    month = format(int(response.meta.get('month')), '02d')
    filename = "{}/raw/20{}/{}_20{}_{}.xls".format(BASE_PATH, year, crime, year, month)
    print("Writing file {} now!".format(filename))
    # The "with" block closes the file automatically
    with open(filename, 'w+') as csv_file:
        body = TextResponse(response.url, body=response.body)
        csv_file.write(body.text)
def fake_response(url_with_categories, content):
    url = url_with_categories.url
    category = url_with_categories.category
    subcategory = url_with_categories.subcategory
    return TextResponse(url=url, body=content, encoding='utf-8',
                        request=Request(url=url, meta={
                            'category': category,
                            'subcategory': subcategory
                        }))
def test_csviter_falserow(self):
    body = get_testdata('feeds', 'feed-sample3.csv')
    body = b'\n'.join((body, b'a,b', b'a,b,c,d'))

    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response)

    self.assertEqual([row for row in csv],
                     [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
                      {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
                      {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
                      {u'id': u'4', u'name': u'empty', u'value': u''}])
def test_csviter_headers(self):
    sample = get_testdata('feeds', 'feed-sample3.csv').splitlines()
    headers, body = sample[0].split(b','), b'\n'.join(sample[1:])

    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response, headers=[h.decode('utf-8') for h in headers])

    self.assertEqual([row for row in csv],
                     [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
                      {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
                      {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
                      {u'id': u'4', u'name': u'empty', u'value': u''}])
def test_parser_type_wikis(self):
    # testing the "Wikis" parsing
    keywords = ["openstack", "nova", "css"]
    proxies = ["1.2.3.4:5678", "192.168.1.1:8000"]
    type_to_look = "Wikis"
    object_to_serialize = {"keywords": keywords, "proxies": proxies, "type": type_to_look}
    self.init_spider(keywords, proxies, type_to_look)
    current_response = TextResponse(url=self.spider_to_test.get_main_url(),
                                    body=open("test_files/wikis.html", "r", encoding="utf-8").read(),
                                    encoding="utf-8")
    results = self.spider_to_test.parse_main_search(current_response)
    results_list = []
    results_compare_list = [
        {'url': 'https://github.com/vault-team/vault-website/wiki/Quick-installation-guide'},
        {'url': 'https://github.com/iwazirijr/wiki_learn/wiki/Packstack'},
        {'url': 'https://github.com/marcosaletta/Juno-CentOS7-Guide/wiki/2.-Controller-and-Network-Node-Installation'},
        {'url': 'https://github.com/MirantisDellCrowbar/crowbar/wiki/Release-notes'},
        {'url': 'https://github.com/dellcloudedge/crowbar/wiki/Release-notes'},
        {'url': 'https://github.com/eryeru12/crowbar/wiki/Release-notes'},
        {'url': 'https://github.com/rhafer/crowbar/wiki/Release-notes'},
        {'url': 'https://github.com/jamestyj/crowbar/wiki/Release-notes'},
        {'url': 'https://github.com/vinayakponangi/crowbar/wiki/Release-notes'},
        {'url': 'https://github.com/kingzone/node/wiki/Modules'},
    ]
    for result in results:
        results_list.append(result)
    #print(results_list)
    self.assertEqual(results_list, results_compare_list)
def parse(self, response):
    self.driver.get(response.url)
    urls = []
    for i in range(1, 20):
        # self.driver.get(response.url)
        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        self.driver.implicitly_wait(10)
        for j in range(1, 31):
            result = response.xpath('//*[@class="col-md-9"]/div[1]/div[' + str(j) + ']/h3/a/@href')
            urls.extend(result)
        next_page = self.driver.find_element_by_xpath('//*[@title="Go to next page"]')
        next_page.click()
    for href in urls:
        print href
        url = href.extract()
        self.driver.get(url)
        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        item = IndeedItem()
        for sel in response.xpath('//div[@class="col-md-5 col-lg-6"]'):
            item['job_title'] = sel.xpath('//div[@class="col-md-5 col-lg-6"]/h1/text()').extract()
            item['location'] = sel.xpath('//div[@class="col-md-5 col-lg-6"]/ul/li[2]/text()').extract()
            item['company_name'] = sel.xpath('//div[@class="col-md-5 col-lg-6"]/ul/li[1]/a/text()').extract()
        for sel_1 in response.xpath('//*[@id="bd"]/div/div[1]'):
            item['job_type'] = sel_1.xpath('//div[2]/div/div[2]/span/text()').extract()
            item['job_salary'] = sel_1.xpath('//div[3]/div/div[2]/span/text()').extract()
        yield item
    self.driver.close()
def parse(self, response):
    self.driver.get('http://www.the-numbers.com/movie/budgets/all')
    response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    rows = response.xpath('//*[@id="page_filling_chart"]/center/table/tbody/tr').extract()
    for i in range(1, 10250, 2):
        RDate = Selector(text=rows[i]).xpath('//td[2]/a/text()').extract()
        Title = Selector(text=rows[i]).xpath('//td[3]/b/a/text()').extract()
        PBudget = Selector(text=rows[i]).xpath('//td[4]/text()').extract()
        DomesticG = Selector(text=rows[i]).xpath('//td[5]/text()').extract()
        WorldwideG = Selector(text=rows[i]).xpath('//td[6]/text()').extract()
        print RDate, Title, PBudget, DomesticG, WorldwideG
        item = MoviesItem()
        item['RDate'] = RDate
        item['Title'] = Title
        item['PBudget'] = PBudget
        item['DomesticG'] = DomesticG
        item['WorldwideG'] = WorldwideG
        yield item
def parse(self, response):
    self.driver.maximize_window()
    self.driver.get(response.url)
    self.driver.set_page_load_timeout(30)
    self.driver.execute_script("return document.documentElement.innerHTML;")
    scheight = 0.1
    while scheight < 9.9:
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/{})".format(scheight))
        scheight += .01
    res = TextResponse(url=response.url,
                       body=self.driver.execute_script("return document.documentElement.innerHTML;"),
                       encoding='utf-8')
    for item in res.xpath('//div[@class="product-tile"]'):
        item_name = item.xpath('./div[@class="product-name"]/h3/a/text()').extract()[0].strip()
        item_link = item.xpath('./div[@class="product-name"]/h3/a/@href').extract()[0].strip()
        standard_price = item.xpath('./div[@class="product-pricing"]/div/span[@class="text price-standard"]/text()').extract()
        promoted_price = item.xpath('./div[@class="product-pricing"]/div/span[@class="text promotional-price"]/text()').extract()
        standard_price = float(standard_price[0].strip().split('$')[1].replace(',', ''))
        promoted_price = float(promoted_price[0].strip().split('$')[1].replace(',', ''))
        discount_rate = ((standard_price - promoted_price) / standard_price) * 100
        print item_name, ", ", discount_rate, "% OFF", ", ", item_link
    self.driver.close()
def parse(self, response):
    # drive the selenium browser to the page we want to scrape
    self.driver.get(response.url)
    time.sleep(4)
    # scroll down so we can see the 'Load More' button
    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # click on the 'Load More' button
    load_more = self.driver.find_element_by_link_text('Load more')
    load_more.click()
    time.sleep(2)
    # how many times do we need to scroll down? Here I've determined once
    for i in xrange(0, 1):
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
    # pass the response url along with the new scrolled-down page source as the body
    response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
    photo_links = response1.xpath("//a[contains(@class, '_8mlbc _vbtk2 _t5r8b')]/@href").extract()
    for photo in photo_links:
        url = response.urljoin(photo)
        # for each photo loaded on the page, call back the parse_photo function
        yield scrapy.Request(url, callback=self.parse_photo)
def _extract_service_phone(self, response: TextResponse) -> str:
    """
    Extracts the service phone from the response if it can be found, otherwise returns
    an empty string.

    Args:
        :param response: the response received from a `Request` object

    :return: the service phone if it can be found, otherwise an empty string
    """
    phone = response.css(".biz-phone::text").extract_first()
    if not phone:
        self.log("Cannot find the phone of the service: " + response.url, logging.ERROR)
        return ""
    else:
        return phone.strip()
def _extract_service_address(self, response: TextResponse) -> str:
    """
    Extracts the service address from the response if it can be found, otherwise returns
    an empty string.

    Args:
        :param response: the response received from a `Request` object

    :return: the service address if it can be found, otherwise an empty string
    """
    # The address information is formatted by using "<br>" tags, so we need to extract all
    # items within the "<address>" tag and merge them at the end separated by commas.
    address = response.css(".street-address address::text").extract()
    if not address:
        self.log("Cannot find the address of the service: " + response.url, logging.ERROR)
        return ""
    else:
        return ', '.join(address).strip()
def parse(self, response):
    self.driver.get('http://www.metmuseum.org/art/collection')
    # while True:
    #     try:
    #         show_more = self.driver.find_element_by_class_name("show-more")
    #         time.sleep(2)
    #         show_more.click()
    #     except:
    #         break

    # clicking the show more button
    for i in range(5):
        show_more = self.driver.find_element_by_class_name("show-more")
        time.sleep(3)
        show_more.click()
    response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
    test = response.xpath('//h2[@class="card__title"]/a/@href')
    # scraping the urls from the first page & creating a list of links
    # card_link_list = self.driver.find_elements_by_xpath('//h2[@class="card__title"]/a')
    # card_link_list = map(lambda x: x.get_attribute('href'), card_link_list)
    for href in response.xpath('//h2[@class="card__title"]/a/@href'):
        url = response.urljoin(href.extract())
        print url
        self.driver.get(url)
        time.sleep(2)
        response1 = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        item = MetItem()
        for sel in response1.xpath('//div[@class="l-component-block"]'):
            title = self.driver.find_element_by_xpath('//h1[@class="collection-details__object-title"]').text
            print title
            location = self.driver.find_element_by_xpath('//div[@class="collection-details__location"]').text
            print location
            item['title'] = title
            item['location'] = location
            artifact_detail = {}
            for detail in response1.xpath('//dl[@class="collection-details__tombstone--row"]').extract():
                key = Selector(text=detail).xpath('//dt/text()').extract()[0]
                value = Selector(text=detail).xpath('//dd/text()').extract()[0]
                artifact_detail[key] = value
            item['artifact_detail'] = artifact_detail
        yield item
def _build_reponse(self, call, callback=None, errback=None):
    """
    `Scrapy.engine` expects a `Response` or `Request` obj from the spider. As we use api
    calls directly, we don't have one, so we build and return a tuned (fake) `Response`
    obj to cheat scrapy a little bit.
    """
    r = TextResponse(str(call['url']),
                     status=call['status_code'],
                     body=call['content'],
                     encoding='utf-8',  # utf-8 is the standard for a json response
                     request=Request(str(call['url']), method='GET'))
    r.dont_filter = True
    r.priority = 0
    r.method = 'GET'
    r.callback = callback
    r.errback = errback
    return r
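# A minimal usage sketch (assumed, not from the original source): `call` mirrors the dict
# keys the method reads above; the URL, payload, and `spider.parse` callback are made up.
call = {
    'url': 'https://api.example.com/items?page=1',
    'status_code': 200,
    'content': '{"items": []}',
}
fake_response = spider._build_reponse(call, callback=spider.parse)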
def parse_property_info(self, response):
    item = SquareyardItem()
    min_price = max_price = price_per_sqft = min_area = max_area = 0
    is_price_fix = 1
    name = description = code = address = city = location = status = unit_type = property_type = ""
    amenities = {}
    speciality = {}
    wow_factors = {}
    index = {}
    connection = []
    self.driver.get(response.url)
    try:
        WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//img[@src]')))
    except TimeoutException:
        return
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
    try:
        name = ''.join(resp.xpath('//h1[@itemprop="name"]//text()').extract())
    except:
        pass
    try:
        full_price = ''.join(resp.xpath('//span[@class="price-detail-txt"]//text()').extract())
        full_price_list = []
        for i in full_price.split():
            try:
                full_price_list += [float(i)]
            except:
                pass
        min_price = float(full_price_list[0])
        try:
            max_price = float(full_price_list[1])
        except:
            pass
        try:
            if "Lac" in full_price:
                min_price *= 100000
                max_price *= 100000
        except:
            pass
        try:
            if "Cr" in full_price:
                min_price *= 10000000
                max_price *= 10000000
        except:
            pass
    except:
        pass
    try:
        area = ''.join(resp.xpath('//div[@class="proje-detais"]/p//text()').extract())
        area_list = []
        for i in area.split():
            try:
                area_list += [float(i)]
            except:
                pass
        min_area = float(area_list[0])
        max_area = float(area_list[1])
    except:
        max_area = min_area
    try:
        price_per = (''.join(resp.xpath('//div[@class="price-details"]/div/div/p[2]/text()').extract())).replace('\n', '').replace('\t', '').replace(',', '')
        priceunit = price_per
        price_per_sqft = []
        for i in price_per.split():
            try:
                price_per_sqft += [float(i)]
            except:
                pass
        price_per_sqft = int(price_per_sqft[0])
        if "sqyrd" in priceunit:
            price_per_sqft *= 9
    except:
        pass
    try:
        address = (','.join(resp.xpath('//ul[@itemprop="address"]//*[contains(@itemprop,"address")]//text()').extract())).replace('\n', '').replace('\t', '')
        city = address.split(',')[0]
        location = address.split(',')[-1]
        address = ' '.join(address.split(','))
    except:
        pass
    try:
        description = '\n'.join(resp.xpath('//div[@class="aboutTextBox"]/p//text()').extract())
    except:
        pass
    try:
        special = resp.xpath('//div[contains(@class,"AmenitiesBoxBorder")]')
        speciality['other'] = []
        for spec in special:
            try:
                label = (''.join(spec.xpath('span//text()').extract())).encode('utf8')
                if label == "":
                    speciality['other'] += [(''.join(spec.xpath('div//li//span//text()').extract())).encode('utf8')]
                else:
                    speciality[label] = (''.join(spec.xpath('div//li//span//text()').extract())).encode('utf8')
            except:
                pass
    except:
        pass
    try:
        amenity_category = resp.xpath('//div[@class="amenitiesSliderBox"]/div')
        for category in amenity_category:
            try:
                category_name = ''.join(category.xpath('div/div[1]/div//text()').extract()).encode('utf8')
                amenities[category_name] = {}
                aminity_list = category.xpath('div//li')
                for amenity in aminity_list:
                    try:
                        header = (''.join(amenity.xpath('span[2]//text()').extract())).encode('utf8')
                        availability = ''.join(amenity.xpath('span[2]/@class').extract())
                        if "active" in availability:
                            amenities[category_name][header] = True
                        else:
                            amenities[category_name][header] = False
                    except:
                        pass
            except:
                pass
    except:
        pass
    try:
        status = ''.join(resp.xpath('//div[@class="progress-main"]//li[2]//text()').extract())
    except:
        pass
    try:
        code = (response.url).split('/')[-2]
    except:
        pass
    try:
        project_details = resp.xpath('//div[contains(@class,"proje-detais")]')
        for details in project_details:
            if "Unit" in ''.join(details.xpath('p/span/text()').extract()):
                unit_type = (''.join(details.xpath('p/text()').extract())).replace('\n', '')
            if "Property" in ''.join(details.xpath('p/span/text()').extract()):
                property_type = (''.join(details.xpath('p/text()').extract())).replace('\n', '')
    except:
        pass
    try:
        wow_factor = resp.xpath('//div[contains(@class,"wow-Factors-section")]//li')
        for factor in wow_factor:
            value = (''.join(factor.xpath('span//text()').extract())).replace('\n', '').encode('utf8')
            key = (''.join(factor.xpath('small//text()').extract())).replace('\n', '').encode('utf8')
            wow_factors[key] = value
    except:
        pass
    try:
        connected_road = resp.xpath('//div[contains(@class,"connect-roads")]//li')
        for road in connected_road:
            try:
                value = (''.join(road.xpath('span[1]//text()').extract())).split('~')
                dis = float(value[1].split()[0])
                connection += [{'name': value[0].encode('utf8'), 'distance': dis}]
            except:
                pass
    except:
        pass
    try:
        driver_box = resp.xpath('//div[contains(@class,"decisionDriversBox")]/div/div/div')
        for box in driver_box:
            try:
                head = (''.join(box.xpath('div//div[@class="projectCounter"]//div[@class="heading"]/text()').extract())).encode('utf8')
                val = (''.join(box.xpath('div//div[@class="projectCounter"]//div[contains(@class,"Box")]/text()').extract())).encode('utf8')
                index[head] = val
            except:
                pass
    except:
        pass
    item['name'] = name.encode('utf8')
    item['min_price'] = min_price
    item['max_price'] = max_price
    item['price_per_sqft'] = price_per_sqft
    item['address'] = address.encode('utf8')
    item['city'] = city.encode('utf8')
    item['location'] = location.encode('utf8')
    item['min_area'] = min_area
    item['max_area'] = max_area
    item['possession_status'] = status.encode('utf8')
    item['amenities'] = amenities
    item['speciality'] = speciality
    item['url'] = response.url
    item['code'] = code.encode('utf8')
    item['description'] = description.encode('utf8')
    item['unit_type'] = unit_type.encode('utf8')
    item['property_type'] = property_type.encode('utf8')
    item['index'] = index
    item['connecting_road'] = connection
    item['wow_factors'] = wow_factors
    item['more_info'] = {}
    urls = resp.xpath('//div[@class="bhkDetails"]//a/@href').extract()
    for url in urls:
        abs_url = 'http://www.squareyards.com' + url
        self.parse_deep_info(abs_url, item['more_info'])
    yield item
    input()
def parse_deep_info(self, abs_url, main_item):
    item = {}
    self.driver.get(abs_url)
    try:
        WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//span[@itemprop="minPrice"]')))
    except TimeoutException:
        return
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
    balconies = min_price = max_price = living_area = bedrooms = bathrooms = kitchens = servent_rooms = carpet_area = built_up_area = 0
    code = name = ""
    try:
        code = (abs_url.split('/')[-2]).encode('utf8')
    except:
        pass
    try:
        name = (''.join(resp.xpath('//h1[@itemprop="name"]//text()').extract())).split()
        name = ''.join([name[0], name[1]])
    except:
        pass
    try:
        full_price = ''.join(resp.xpath('//span[@itemprop="minPrice"]//text()').extract())
        min_price = float(full_price.split()[0])
        try:
            if "Lac" in full_price:
                min_price *= 100000
        except:
            pass
        try:
            if "Cr" in full_price:
                min_price *= 10000000
        except:
            pass
    except:
        pass
    try:
        full_price = ''.join(resp.xpath('//span[@itemprop="maxPrice"]//text()').extract())
        max_price = float(full_price.split()[0])
        try:
            if "Lac" in full_price:
                max_price *= 100000
        except:
            pass
        try:
            if "Cr" in full_price:
                max_price *= 10000000
        except:
            pass
    except:
        pass
    try:
        more_info = resp.xpath('//div[@class="unit-left-section"]//ul/li')
        for info in more_info:
            value = ''.join(info.xpath('span//text()').extract())
            try:
                if "Living" in value:
                    living_area = int(value.split()[0])
            except:
                pass
            try:
                if "Bed" in value:
                    bedrooms = int(value.split()[0])
            except:
                pass
            try:
                if "Bath" in value:
                    bathrooms = int(value.split()[0])
            except:
                pass
            try:
                if "Kitchen" in value:
                    kitchens = int(value.split()[0])
            except:
                pass
            try:
                if "Servant" in value:
                    servent_rooms = int(value.split()[0])
            except:
                pass
            try:
                if "Balcon" in value:
                    balconies = int(value.split()[0])
            except:
                pass
    except:
        pass
    try:
        more_info = resp.xpath('//div[@class="unit-loder"]//div[@ng-if="!isFragment"]')
        for info in more_info:
            header = ''.join(info.xpath('div//p//text()').extract())
            try:
                if "Carpet" in header:
                    carpet_area = int((''.join(info.xpath('div//small//text()').extract())).split()[0])
            except:
                pass
            try:
                if "BuiltUp" in header:
                    built_up_area = int((''.join(info.xpath('div//small//text()').extract())).split()[0])
            except:
                pass
    except:
        pass
    item['min_price'] = min_price
    item['max_price'] = max_price
    item['carpet_area'] = carpet_area
    item['built_up_area'] = built_up_area
    item['bedrooms'] = bedrooms
    item['bathrooms'] = bathrooms
    item['balconies'] = balconies
    item['servent_room'] = servent_rooms
    item['living_area'] = living_area
    item['kitchen'] = kitchens
    item['code'] = code.encode('utf8')
    if name in main_item:
        main_item[name] += [item]
    else:
        main_item[name] = [item]
def parse_deep_info(self, abs_url, item):
    deep_item = {}
    self.driver.get(abs_url)
    # try:
    #     WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//div[@class="unitTopTable table-responsive"]//tr[2]/td[2]')))
    # except TimeoutException:
    #     return
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

    min_price = max_price = living_area = bedrooms = bathrooms = kitchens = servent_rooms = carpet_area = built_up_area = 0
    code = name = ""
    balconies = {}
    Room_private_area = {}
    living_common_area = {}
    open_area = {}
    additional_area = {}

    try:
        code = abs_url.split('/')[-2]
    except:
        pass
    try:
        # Unit name, e.g. "2 BHK Apartment" -> "2-BHK"
        name = ("".join(resp.xpath('//h1[@itemprop="name"]//text()').extract())).split()[:2]
        name = "-".join(name)
    except:
        pass

    try:
        # Prices are quoted as "<number> L" or "<number> Cr"; normalise to rupees
        min_price = "".join(resp.xpath('//div[@class="unitTopTable table-responsive"]//tr[2]/td[2]//text()').extract())
        isLac = 'L' in min_price
        isCrore = 'Cr' in min_price
        min_price = float(min_price.split()[0])
        try:
            if isLac:
                min_price *= 100000
        except:
            pass
        try:
            if isCrore:
                min_price *= 10000000
        except:
            pass
    except:
        pass
    try:
        max_price = "".join(resp.xpath('//div[@class="unitTopTable table-responsive"]//tr[2]/td[3]//text()').extract())
        isLac = 'L' in max_price
        isCrore = 'Cr' in max_price
        max_price = float(max_price.split()[0])
        try:
            if isLac:
                max_price *= 100000
        except:
            pass
        try:
            if isCrore:
                max_price *= 10000000
        except:
            pass
    except:
        pass

    try:
        # Room counts from the unit summary list
        more_info = resp.xpath('//div[@class="unit-left-section"]//ul/li')
        for info in more_info:
            value = "".join(info.xpath('span//text()').extract())
            try:
                if "Living" in value:
                    living_area = int(value.split()[0])
            except:
                pass
            try:
                if "Bed" in value:
                    bedrooms = int(value.split()[0])
            except:
                pass
            try:
                if "Bath" in value:
                    bathrooms = int(value.split()[0])
            except:
                pass
            try:
                if "Kitchen" in value:
                    kitchens = int(value.split()[0])
            except:
                pass
            try:
                if "Servant" in value:
                    servent_rooms = int(value.split()[0])
            except:
                pass
            try:
                if "Balcon" in value:
                    balconies['count'] = int(value.split()[0])
                    balconies['size_in_sqft'] = int((value.split()[2])[1:])
            except:
                pass
    except:
        pass

    try:
        # Carpet / built-up areas
        more_info = resp.xpath('//div[@class="unit-loder"]//div[@ng-if="!isFragment"]')
        for info in more_info:
            header = "".join(info.xpath('div//p//text()').extract())
            try:
                if "Carpet" in header:
                    carpet_area = int(("".join(info.xpath('div//small//text()').extract())).split()[0])
            except:
                pass
            try:
                if "BuiltUp" in header:
                    built_up_area = int(("".join(info.xpath('div//small//text()').extract())).split()[0])
            except:
                pass
    except:
        pass

    try:
        # Room-wise private areas: each row is (name, length, breadth, area sqft)
        private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[1]/div[1]//tr')
        for area in private_areas:
            try:
                length = breadth = area_sqft = 0.0
                temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                # pprint(temp)
                # input()
                try:
                    length = float(temp[1])
                except:
                    pass
                try:
                    breadth = float(temp[2])
                except:
                    pass
                try:
                    area_sqft = float(temp[3].split()[0])
                except:
                    pass
                try:
                    Room_private_area[temp[0].split()[0].encode('utf8')] = {'Length': length, 'Breadth': breadth, 'Area': area_sqft}
                except:
                    pass
            except:
                pass
    except:
        pass

    try:
        # Living / common areas
        private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[1]/div[2]//tr')
        for area in private_areas:
            try:
                length = breadth = area_sqft = 0.0
                temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                try:
                    length = float(temp[1])
                except:
                    pass
                try:
                    breadth = float(temp[2])
                except:
                    pass
                try:
                    area_sqft = float(temp[3].split()[0])
                except:
                    pass
                try:
                    living_common_area[temp[0].split()[0].encode('utf8')] = {'Length': length, 'Breadth': breadth, 'Area': area_sqft}
                except:
                    pass
            except:
                pass
    except:
        pass

    try:
        # Open areas (balcony, terrace, ...)
        private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[2]/div[1]//tr')
        for area in private_areas:
            try:
                length = breadth = area_sqft = 0.0
                temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                try:
                    length = float(temp[1])
                except:
                    pass
                try:
                    breadth = float(temp[2])
                except:
                    pass
                try:
                    area_sqft = float(temp[3].split()[0])
                except:
                    pass
                try:
                    open_area[temp[0].split()[0].encode('utf8')] = {'Length': length, 'Breadth': breadth, 'Area': area_sqft}
                except:
                    pass
            except:
                pass
    except:
        pass

    try:
        # Additional areas
        private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[2]/div[2]//tr')
        for area in private_areas:
            try:
                length = breadth = area_sqft = 0.0
                temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                try:
                    length = float(temp[1])
                except:
                    pass
                try:
                    breadth = float(temp[2])
                except:
                    pass
                try:
                    area_sqft = float(temp[3].split()[0])
                except:
                    pass
                try:
                    additional_area[temp[0].split()[0].encode('utf8')] = {'Length': length, 'Breadth': breadth, 'Area': area_sqft}
                except:
                    pass
            except:
                pass
    except:
        pass

    deep_item['min_price'] = min_price
    deep_item['max_price'] = max_price
    deep_item['carpet_area'] = carpet_area
    deep_item['built_up_area'] = built_up_area
    deep_item['bedrooms'] = bedrooms
    deep_item['bathrooms'] = bathrooms
    deep_item['balconies'] = balconies
    deep_item['servent_room'] = servent_rooms
    deep_item['living_area'] = living_area
    deep_item['kitchen'] = kitchens
    deep_item['code'] = code.encode('utf8')
    deep_item['room_private_areas'] = Room_private_area
    deep_item['living_common_areas'] = living_common_area
    deep_item['open_areas'] = open_area
    deep_item['additional_areas'] = additional_area

    # Group units of the same configuration under one key in the parent dict
    try:
        item[name.encode('utf8')] += [deep_item]
    except:
        item[name.encode('utf8')] = [deep_item]
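# The four dimension tables above (room-private, living/common, open and
# additional areas) repeat the same length/breadth/area extraction. A minimal
# sketch of that block as one reusable helper; the helper name is an
# illustrative assumption, while the cell XPath mirrors the one used above:
def parse_dimension_rows(rows):
    """Turn <tr> selectors of (name, length, breadth, area) cells into a dict."""
    result = {}
    for row in rows:
        cells = row.xpath('td[@class="ng-binding"]//text()').extract()
        if not cells:
            continue
        entry = {'Length': 0.0, 'Breadth': 0.0, 'Area': 0.0}
        try:
            entry['Length'] = float(cells[1])
        except (IndexError, ValueError):
            pass
        try:
            entry['Breadth'] = float(cells[2])
        except (IndexError, ValueError):
            pass
        try:
            entry['Area'] = float(cells[3].split()[0])
        except (IndexError, ValueError):
            pass
        key_parts = cells[0].split()
        if not key_parts:
            continue
        result[key_parts[0].encode('utf8')] = entry
    return result

# Usage (inside parse_deep_info), e.g.:
# Room_private_area = parse_dimension_rows(
#     resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[1]/div[1]//tr'))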
def parse(self, response):
    fire = firebase.FirebaseApplication('https://abcapp-8345a.firebaseio.com/', None)
    print "some"
    time.sleep(2)

    item = {}
    min_price = max_price = price_per_sqft = min_area = max_area = 0
    is_price_fix = True
    name = description = code = address = city = location = status = unit_type = property_type = ""
    amenities = {}
    speciality = {}
    wow_factors = {}
    index = {}
    connection = {}

    self.driver.get(response.url)
    # try:
    #     WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//img[@src]')))
    # except TimeoutException:
    #     return
    resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

    try:
        name = ("".join(resp.xpath('//h1[@itemprop="name"]//text()').extract())).replace('.', '')
        name = re.sub(r'[^\x00-\x7F]', " ", name)
    except:
        pass

    try:
        # Prices are quoted as "<number> L" or "<number> Cr"; normalise to rupees
        min_price = "".join(resp.xpath('//span[@class="price-detail-txt"]/span[@itemprop="minPrice"]//text()').extract())
        isLac = 'L' in min_price
        isCrore = 'Cr' in min_price
        min_price = float(min_price.split()[0])
        try:
            if isLac:
                min_price *= 100000
        except:
            pass
        try:
            if isCrore:
                min_price *= 10000000
        except:
            pass
        max_price = "".join(resp.xpath('//span[@class="price-detail-txt"]/span[@itemprop="maxPrice"]//text()').extract())
        isLac = 'L' in max_price
        isCrore = 'Cr' in max_price
        max_price = float(max_price.split()[0])
        try:
            if isLac:
                max_price *= 100000
        except:
            pass
        try:
            if isCrore:
                max_price *= 10000000
        except:
            pass
    except:
        min_price = max_price = 0

    try:
        # Min/max area: take the first two numbers that appear in the detail text
        area = "".join(resp.xpath('//div[@class="proje-detais"]/p//text()').extract())
        area_list = []
        for i in area.split():
            try:
                area_list += [float(i)]
            except:
                pass
        min_area = float(area_list[0])
        max_area = float(area_list[1])
    except:
        max_area = min_area

    try:
        # Rescale rates quoted per sq. yard (1 sq yd = 9 sq ft)
        price_per = ("".join(resp.xpath('//div[@class="price-details"]/div/div/p[2]/text()').extract())).replace('\n', "").replace('\t', "").replace(',', "")
        price_per_sqft = float(re.findall('\d+', price_per)[0])
        if "sqyrd" in price_per:
            price_per_sqft *= 9
    except:
        price_per_sqft = -1.0

    try:
        address = (",".join(resp.xpath('//ul[@itemprop="address"]//*[contains(@itemprop,"address")]//text()').extract())).replace('\n', "").replace('\t', "")
        address = re.sub(r'[^\x00-\x7F]', " ", address)
        city = address.split(',')[0]
        location = address.split(',')[-1]
        address = " ".join(address.split(','))
    except:
        pass

    try:
        description = " ".join(resp.xpath('//div[@class="aboutTextBox"]/p//text()').extract())
        description = re.sub(r'[^\x00-\x7F]', " ", description)
    except:
        pass

    try:
        # "Speciality" boxes: labelled boxes keep their label, unlabelled ones go under 'other'
        special = resp.xpath('//div[contains(@class,"AmenitiesBoxBorder")]')
        for spec in special:
            try:
                label = " ".join(spec.xpath('span//text()').extract())
                label = (re.sub(r'[^\x00-\x7F]', " ", label)).encode('utf8')
                if label == "":
                    try:
                        speciality['other'] += [re.sub(r'[^\x00-\x7F]', " ", ("".join(spec.xpath('div//li//span//text()').extract()))).encode('utf8')]
                    except:
                        speciality['other'] = [re.sub(r'[^\x00-\x7F]', " ", ("".join(spec.xpath('div//li//span//text()').extract()))).encode('utf8')]
                else:
                    speciality[label] = re.sub(r'[^\x00-\x7F]', " ", ("".join(spec.xpath('div//li//span//text()').extract()))).encode('utf8')
            except:
                pass
    except:
        pass

    try:
        # Amenities are grouped by category; a 1/0 flag marks whether each is available
        amenity_category = resp.xpath('//div[@class="amenitiesSliderBox"]/div')
        for category in amenity_category:
            try:
                category_name = "".join(category.xpath('div/div[1]/div//text()').extract())
                category_name = re.sub(r'[^\x00-\x7F]', " ", category_name).encode('utf8')
                amenities[category_name] = {}
                aminity_list = category.xpath('div//li')
                for amenity in aminity_list:
                    try:
                        header = ("".join(amenity.xpath('span[2]//text()').extract())).replace("'", "").replace('/', 'OR')
                        header = re.sub(r'[^\x00-\x7F]', " ", header).encode('utf8')
                        availability = "".join(amenity.xpath('span[2]/@class').extract())
                        if "active" in availability:
                            amenities[category_name][header] = 1
                        else:
                            amenities[category_name][header] = 0
                    except:
                        pass
            except:
                pass
    except:
        pass

    try:
        status = "".join(resp.xpath('//div[@class="progress-main"]//li[2]//text()').extract())
        status = re.sub(r'[^\x00-\x7F]', " ", status)
    except:
        pass
    try:
        code = (response.url).split('/')[-2]
    except:
        pass

    try:
        project_details = resp.xpath('//div[contains(@class,"proje-detais")]')
        for details in project_details:
            if "Unit" in "".join(details.xpath('p/span/text()').extract()):
                unit_type = ("".join(details.xpath('p/text()').extract())).replace('\n', "")
                unit_type = re.sub(r'[^\x00-\x7F]', " ", unit_type)
            if "Property" in "".join(details.xpath('p/span/text()').extract()):
                property_type = ("".join(details.xpath('p/text()').extract())).replace('\n', "")
                property_type = re.sub(r'[^\x00-\x7F]', " ", property_type)
    except:
        pass

    try:
        # "Wow factor" tiles; strip characters Firebase rejects in keys ('.' and '/')
        wow_factor = resp.xpath('//div[contains(@class,"wow-Factors-section")]//li')
        for factor in wow_factor:
            value = ("".join(factor.xpath('span//text()').extract())).replace('\n', "")
            key = ("".join(factor.xpath('small//text()').extract())).replace('\n', "").replace('.', '').replace('/', '-')
            value = (re.sub(r'[^\x00-\x7F]', " ", value)).encode('utf8')
            key = (re.sub(r'[^\x00-\x7F]', " ", key)).encode('utf8')
            wow_factors[key] = value
    except:
        pass

    try:
        # Connecting roads are listed as "<name> ~ <distance> km"
        connected_road = resp.xpath('//div[contains(@class,"connect-roads")]//li')
        for road in connected_road:
            try:
                value = ("".join(road.xpath('span[1]//text()').extract())).split('~')
                dis = float(value[1].split()[0])
                connection[value[0].encode('utf8')] = dis
            except:
                pass
    except:
        pass

    try:
        # "Decision driver" boxes: heading -> counter value
        driver_box = resp.xpath('//div[contains(@class,"decisionDriversBox")]/div/div/div')
        for box in driver_box:
            try:
                head = "".join(box.xpath('div//div[@class="projectCounter"]//div[@class="heading"]/text()').extract())
                head = re.sub(r'[^\x00-\x7F]', " ", head).encode('utf8')
                val = "".join(box.xpath('div//div[@class="projectCounter"]//div[contains(@class,"Box")]/text()').extract())
                val = re.sub(r'[^\x00-\x7F]', " ", val).encode('utf8')
                index[head] = val
            except:
                pass
    except:
        pass

    try:
        item['name'] = name.encode('utf8')
        item['min_price'] = min_price
        item['max_price'] = max_price
        item['price_per_sqft'] = price_per_sqft
        item['address'] = address.encode('utf8')
        item['city'] = city.encode('utf8')
        item['location'] = location.encode('utf8')
        item['min_area'] = min_area
        item['max_area'] = max_area
        item['possession_status'] = status.encode('utf8')
        item['amenities'] = amenities
        item['speciality'] = speciality
        item['url'] = response.url
        item['code'] = code.encode('utf8')
        item['description'] = description.encode('utf8')
        item['unit_type'] = unit_type.encode('utf8')
        item['property_type'] = property_type.encode('utf8')
        item['index'] = index
        item['connecting_road'] = connection
        item['wow_factors'] = wow_factors
        item['more_info'] = {}

        # Follow every per-unit ("BHK") link and attach its details to item['more_info']
        urls = resp.xpath('//div[@class="bhkDetails"]//a/@href').extract()
        for url in urls:
            abs_url = 'http://www.squareyards.com' + url
            self.parse_deep_info(abs_url, item['more_info'])

        if item['name'] != "":
            try:
                # convert() sanitises the item before pushing it to Firebase
                item = convert(item)
                print fire.put('/', 'temp', item)
            except:
                print fire.put('/', 'temp', {})
                traceback.print_exc()
        else:
            print fire.put('/', 'temp', {})
            print response.url
    except:
        print fire.put('/', 'temp', {})
        traceback.print_exc()
        print response.url
    return