Example No. 1
    def parse(self, response):
        while self.roll < 1409110903:
            self.driver.get('http://new.aktu.co.in/')
            try:
                WebDriverWait(self.driver, 30).until(EC.presence_of_element_located((By.XPATH,'//*[@id="ctl00_ContentPlaceHolder1_divSearchRes"]/center/table/tbody/tr[4]/td/center/div/div/img')))
            except:
                continue
            # Sync scrapy and selenium so they agree on the page we're looking at, then let scrapy take over
            resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            rollno = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$TextBox1')
            rollno.send_keys(self.roll)
            captcha_url = format(resp.xpath('//*[@id="ctl00_ContentPlaceHolder1_divSearchRes"]/center/table/tbody/tr[4]/td/center/div/div/img/@src').extract())
            url = "http://new.aktu.co.in/" + captcha_url[3:-2]
            print url
            captcha = url_to_image(url)
            captcha_value = read_captcha(captcha)
            print captcha_value
            captcha_input = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$txtCaptcha')
            captcha_input.send_keys(captcha_value)
            input()
            submit = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$btnSubmit')
            actions = ActionChains(self.driver)
            actions.click(submit)
            actions.perform()
            resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            if "Incorrect Code" in format(resp.xpath('*').extract()):
                continue
            self.parse_result(self.driver.current_url)
            self.roll += 1
        self.count +=3
        self.sheet.write(self.count,0,"First")
        self.sheet.write(self.count,1,self.top[0][0])
        self.sheet.write(self.count+1,0,"Last")
        self.sheet.write(self.count+1,1,self.top[1][0])
        return
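The Selenium-to-Scrapy hand-off above recurs in almost every example below. A minimal, self-contained sketch of that pattern as a reusable helper (the helper name is ours, not from the original code):

from scrapy.http import TextResponse

def selenium_to_response(driver):
    # Wrap whatever page Selenium is currently showing in a Scrapy TextResponse,
    # so the rest of the spider can keep using .xpath()/.css() selectors.
    return TextResponse(url=driver.current_url,
                        body=driver.page_source,
                        encoding='utf-8')

With it, the repeated `resp = TextResponse(url=self.driver.current_url, ...)` lines become `resp = selenium_to_response(self.driver)`.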
Example No. 2
def review_cnt_check(keyword, item_count_start=0, item_count_end=100):

    search_url = "http://www.coupang.com/np/search?q={}&isPriceRange=false&page=1&sorter=scoreDesc&listSize=100".format(
        keyword)
    rep = requests.get(search_url)
    response = TextResponse(rep.url, body=rep.text, encoding='utf-8')

    # Before visiting each product's link, load the Top-100 item list from the search page
    products = json.loads(
        response.xpath('//*[@id="productList"]/@data-products').extract()
        [0])['indexes'][item_count_start:item_count_end]

    review_total_cnt = 0

    for idx, product in enumerate(products):

        # Review count for each Top-100 product (this value is loaded later, so it cannot be read directly with XPath from inside the product page)
        try:
            review_count = int(
                response.xpath(
                    '//*[@id="{}"]/a/dl/dd/div/div[4]/div[2]/span[2]/text()'.
                    format(product)).extract()[0].strip("()"))
            review_total_cnt += review_count
            print('item #{} ({}): review count = {}'.format(idx, product,
                                                            review_count))

        except Exception as e:
            print('No reviews')

    print('total review count for items {} through {}: {}'.format(item_count_start,
                                                                   item_count_end - 1,
                                                                   review_total_cnt))
Example No. 3
 def parse(self, response):
     self.driver.get(response.url)
     time.sleep(4)
     new_dict = {}
     selector = TextResponse(url=response.url,
                             body=self.driver.page_source,
                             encoding='utf-8')
     city = selector.xpath(
         '//div[@class="StoreLocationsList"]//ul/li//a/text()').extract()
     a = []
     a.append(city)
     for i in city:
         i = "'" + i + "'"
         self.driver.get(response.url)
         time.sleep(3)
         city = self.driver.find_element_by_xpath(
             "//ul/li//a[contains(text()," + i + ")]").click()
         time.sleep(5)
         selector = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
         address1 = selector.xpath('//div[@class="store-address"]')
         for add in address1:
             address = add.xpath('.//span//text()').extract()
             address = "".join(address).replace('/n', '')
             new_dict["City"] = i
             new_dict["address"] = address
             yield new_dict
Example No. 4
 def parse(self, response):
     try :
         while self.s_roll <= self.e_roll:
             self.driver.get('http://new.aktu.co.in/')
             try:
                 WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH,'//*[@id="ctl00_ContentPlaceHolder1_divSearchRes"]/center/table/tbody/tr[4]/td/center/div/div/img')))
             except:
                 continue
             # Sync scrapy and selenium so they agree on the page we're looking at, then let scrapy take over
             resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
             rollno = self.driver.find_element_by_name('ctl00$ContentPlaceHolder1$TextBox1')
             rollno.send_keys(self.s_roll)
             try :
                 resp = self.fill_captcha(resp)
                 print format(resp.xpath('//*[@id="ContentPlaceHolder1_Label1"]/text()').extract())
                 while "Incorrect" in format(resp.xpath('//*[@id="ContentPlaceHolder1_Label1"]/text()').extract()):
                     resp = self.fill_captcha(resp)
             except :
                 continue
             self.parse_result(self.driver.current_url)
             self.s_roll += 1
         self.count +=3
         self.sheet.write(self.count,0,"First")
         self.sheet.write(self.count,1,self.top[0][0])
         self.sheet.write(self.count+1,0,"Last")
         self.sheet.write(self.count+1,1,self.top[1][0])
     except :
         self.parse(response)
     finally :
         return
Example No. 5
class Jobs:
    def __init__(self,URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text,url=self.URL,encoding="utf-8")

    def get_vacancy(self):
        vac = self.response.xpath('//div[@class="job-inner job-item-title"]/p[@class="font_bold"]/text()').extract()
        return vac 

    def get_company(self):
        comp = self.response.xpath('//div[@class="job-inner job-item-title"]/p[@class="job_list_company_title"]/text()').extract()
        return comp

    def get_deadline(self):
        dl1 = self.response.css('div[class="job-inner job-list-deadline"] p::text').extract()
        dl2 = [''.join(x) for x in zip(dl1[0::2], dl1[1::2])]
        del dl2[1::2]
        dl = [i.replace("\n\n", "").replace("\n"," ").strip() for i in dl2]
        return dl 

    def get_location(self):
        loc = self.response.xpath('//div[@class="job-inner job-list-deadline"]/p[@class="job_location"]/text()').extract()
        loc = [i.replace('\n','').strip() for i in loc]
        return loc 

    def get_ind_page(self):
        ind_page = [base_url + i for i in self.response.xpath('//div[@class="list-view"]/div/div/a/@href').extract()]
        return ind_page

    def get_next(self):
        page = self.response.xpath('//ul[@class="pagination"]/li[@class="next"]/a/@href').extract()
        return page
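A hedged usage sketch for the Jobs class above; the listing URL is only a guess, and `base_url` (used by get_ind_page) plus the requests/TextResponse imports are assumed to exist at module level:

base_url = "https://staff.am"            # assumed module-level constant used by get_ind_page()
jobs = Jobs("https://staff.am/en/jobs")  # hypothetical job-listing URL
for vacancy, company in zip(jobs.get_vacancy(), jobs.get_company()):
    print(vacancy, "-", company)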
Example No. 6
    def parse(self, response):
        #print response.body
        iCnt = len(self.input)
        index = 0
        while index < iCnt:
            print 'Review: ', index
            items = self.input[index]
            self.driver.get(items[1])

            response = TextResponse(url=response.url,
                                    body=self.driver.page_source,
                                    encoding='utf-8')
            for div in response.xpath('.//div[@itemprop="review"]'):
                item = {}
                item['PropertyName'] = items[0]
                item['link'] = items[1]
                item['date'] = div.xpath(
                    './/meta[@itemprop="datePublished"]//@content'
                ).extract_first()
                item['rating'] = div.xpath(
                    './/meta[@itemprop="ratingValue"]//@content'
                ).extract_first()
                item['review'] = div.xpath(
                    './/p[@itemprop="description"]//text()').extract_first(
                    ).strip().encode('utf-8')
                item['reviewer'] = div.xpath(
                    './/meta[@itemprop="author"]//@content').extract_first(
                    ).strip().encode('utf-8')
                yield item

            while response.xpath(
                    './/a[@class="u-decoration-none next pagination-links_anchor"]'
            ):
                next_url = response.xpath(
                    './/a[@class="u-decoration-none next pagination-links_anchor"]//@href'
                ).extract_first()
                self.driver.get(next_url)
                response = TextResponse(url=response.url,
                                        body=self.driver.page_source,
                                        encoding='utf-8')
                for div in response.xpath('.//div[@itemprop="review"]'):
                    item = {}
                    item['PropertyName'] = items[0]
                    item['link'] = items[1]
                    item['date'] = div.xpath(
                        './/meta[@itemprop="datePublished"]//@content'
                    ).extract_first()
                    item['rating'] = div.xpath(
                        './/meta[@itemprop="ratingValue"]//@content'
                    ).extract_first()
                    item['review'] = div.xpath(
                        './/p[@itemprop="description"]//text()').extract_first(
                        ).strip().encode('utf-8')
                    item['reviewer'] = div.xpath(
                        './/meta[@itemprop="author"]//@content').extract_first(
                        ).strip().encode('utf-8')
                    yield item

            index = index + 1
            time.sleep(3)
Example No. 7
	def parse(self, response):
		self.driver.get(response.url)
		try:
			WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//a[@class="list-name"]')))
		except TimeoutException:
			return
		resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
		urls = resp.xpath('//a[@class="list-name"]/@href').extract()
		old = 0
		new = len(urls)
		while old != new:
			print "\n\n\n",old,new,"\n\n\n"
			for i in xrange(old,new):
				abs_url = 'http://www.housing.com' + urls[i]
				yield scrapy.Request(abs_url, callback=self.parse_property_info)
			try :
				link = self.driver.find_element_by_xpath('//div[@class="show-more-container"]')
				actions = ActionChains(self.driver)
				actions.click(link)
				actions.perform()
			except:
				self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			time.sleep(3)
			resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
			urls = resp.xpath('//a[@class="list-name"]/@href').extract()
			old = new
			new = len(urls)
Example No. 8
def jobs_scraper(url):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    companies_name = response.xpath(
        "//p[@class='job_list_company_title']/text()").extract()
    vac_name = response.xpath("//p[@class='font_bold']/text()").extract()
    base_url = "https://staff.am"
    urls = response.xpath(
        "//div[@class='web_item_card hs_job_list_item']/a/@href").extract()
    vacs_url = [base_url + i for i in urls]
    deadline1 = response.css(
        "div[class = 'job-inner job-list-deadline'] >p:not([class='job_location'])"
    )
    deadline2 = [i.css('::text').extract()[1] for i in deadline1]
    deadline = [i.replace('\n', " ") for i in deadline2]
    location1 = response.css(
        "div[class = 'job-inner job-list-deadline'] >p[class='job_location']")
    location2 = [i.css("::text").extract()[1] for i in location1]
    location = [i.replace('\n', "").strip() for i in location2]
    return pd.DataFrame({
        "Companies": companies_name,
        "Vacancies": vac_name,
        'Links': vacs_url,
        "Deadline": deadline,
        'Location': location
    })
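Since jobs_scraper returns a pandas DataFrame, a typical follow-up might look like this (the listing URL is assumed, and pandas is expected to be imported as pd):

df = jobs_scraper("https://staff.am/en/jobs?page=1")  # hypothetical listing URL
df.to_csv("staff_am_jobs.csv", index=False)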
Example No. 9
def get_urls(category='105'):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    date = (datetime.today() - timedelta(1)).strftime('%Y%m%d')
    last_p, urls = '', []
    for page in range(1, 1000, 10):
        # step through pages 10 at a time until the 'next' link disappears, to find the last page
        url = 'https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&listType=title&sid1={}&date={}&page={}'.format(
            category, date, page)
        req = requests.get(url, headers=headers)
        resp = TextResponse(req.url, body=req.text, encoding='utf-8')

        try:
            chk_next = resp.xpath(
                '//div[@class="paging"]/a[@class="next nclicks(fls.page)"]/text()'
            )[0].extract()
        except:
            chk_next = '끝'  # sentinel meaning 'end': no next-page link found

        if chk_next == '끝':
            pages = resp.xpath('//a[@class="nclicks(fls.page)"]/text() | \
                    //*[@id="main_content"]/div[@class="paging"]/strong/text()'
                               ).extract()
            last_p = pages[-1]
            print(last_p)
            break

    for page in range(1, int(last_p) + 1):
        urls.append(
            'https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&listType=title&sid1={}&date={}&page={}'
            .format(category, date, page))
    return urls
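A hedged usage note for get_urls: it returns one listing URL per page of yesterday's headline list, so a caller might simply do the following (the category code is the default from the signature):

urls = get_urls(category='105')   # default category code from the function signature
print(len(urls), urls[:3])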
Example No. 10
    def parse(self, response):
        self.driver.get('https://www.reddit.com/r/technology/')
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')

        posts = response.xpath('//div[@class="entry unvoted"]').extract()
        upvotes = response.xpath('//div[@class="score unvoted"]/text()').extract()

        for i in range(50):
            for j, post in enumerate(posts):
                comment = Selector(text=post).xpath(
                    '//ul[@class="flat-list buttons"]/li[@class="first"]/a/text()').extract()
                label = Selector(text=post).xpath(
                    '//p[@class="title"]/span[@class="linkflairlabel"]/text()').extract()
                title = Selector(text=post).xpath('//p[@class="title"]/a/text()').extract()
                date = Selector(text=post).xpath(
                    '//p[@class="tagline"]/time/@datetime').extract()
                link = Selector(text=post).xpath(
                    '//p[@class="title"]/span[@class="domain"]/a/text()').extract()
                upvote = upvotes[j]
                item = RedditItem()
                item['upvotes'] = upvote
                item['comments'] = comment
                item['label'] = label
                item['title'] = title
                item['date'] = date
                item['link'] = link
                yield item

            self.driver.find_element_by_xpath('//a[@rel="nofollow next"]').click()
            time.sleep(2)
Example No. 11
 def parse(self, response):
     self.driver.get(response.url)
     new_dict = {}
     selector = TextResponse(url=response.url,
                             body=self.driver.page_source,
                             encoding='utf-8')
     text_area = self.driver.find_element_by_xpath(
         '//div/label[@for="store-locator-location"]').click()
     data = [
         'delhi', 'banglore', 'gurgoan', 'noida', 'ahamdabad', 'surat',
         'jaipur', 'kolkata', 'chennai', 'hydrabad', 'pune', 'thane',
         'chandigarh', 'mohali'
     ]
     for i in data:
         text_area = self.driver.find_element_by_xpath(
             '//div/input[@id="store-locator-location"]').clear()
         text_area = self.driver.find_element_by_xpath(
             '//div/input[@id="store-locator-location"]').send_keys(i)
         time.sleep(4)
         search_button = self.driver.find_element_by_xpath(
             '//button[@class="button-primary button-big _searchStores"]')
         search_button.click()
         selector = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')
         city = selector.xpath(
             '//a/span/span[@class="shop-info _shopInfo"]//text()').extract(
             )
         # shop = selector.xpath('//a/span/strong[@class="shop"]//text()').extract()
         shop_address = selector.xpath(
             '//div[@class="address"]//text()').extract()
         # for j in shop_address:
         new_dict["City"] = i,
         new_dict["address"] = shop_address
         yield new_dict
Example No. 12
    def taobao_crawler(self, itemid, current_url):
        from selenium import webdriver
        from scrapy.http import TextResponse
        from fake_useragent import UserAgent

        import pickle
        import requests

        driver = self.__set_cookies_proxies__(current_url)

        driver.get("https://item.taobao.com/item.htm?id={}".format(itemid))

        result = {}
        result['item_id'] = itemid

        # discounted (promotion) price
        try:
            result['promo_price'] = driver.find_element_by_xpath(
                '//*[@id="J_PromoPriceNum"]').text
        except:
            result['promo_price'] = '할인 가격 없음'  # 'no discounted price'

        driver.quit()

        # Use 'requests' to get rest of information
        ua = UserAgent(verify_ssl=False)
        userAgent = ua.random

        headers = {'User-Agent': userAgent}

        url = "https://item.taobao.com/item.htm?id={}".format(itemid)

        # https://stackoverflow.com/questions/32910093/python-requests-gets-tlsv1-alert-internal-err
        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES:!aNULL:!eNULL:!MD5'

        response = requests.get(url=url, headers=headers)
        req = TextResponse(response.url, body=response.text, encoding="utf-8")
        result['item_title'] = req.xpath(
            '//*[@id="J_Title"]/h3/@data-title').extract_first()

        option_title = req.xpath(
            '//*[@id="J_isku"]/div/dl/dd/ul/@data-property').extract()
        r = []
        for title in option_title:
            j = {}
            j['option_title'] = title
            j['option_details'] = req.xpath(
                '//*[@id="J_isku"]/div/dl/dd/ul[@data-property="{}"]/li/a/span/text()'
                .format(title)).extract()
            j['option_image_urls'] = [
                path[17:-29] for path in req.xpath(
                    '//*[@id="J_isku"]/div/dl/dd/ul[@data-property="{}"]/li/a/@style'
                    .format(title)).extract()
            ]
            r.append(j)

        result['options'] = r

        return result
Example No. 13
def get_datos(v_url, v_grupo, v_subgrupo):
    import requests
    from scrapy.http import TextResponse

    my_url = v_url
    #url = 'http://www.hiraoka.com.pe/viewprod.php?id=P000003015&n=Cocina%20a%20Gas'
    user_agent = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58: .0.3029.110 Chrome/58.0.3029.110 Safari/537.36'
    }

    r = requests.get(my_url, headers=user_agent)
    response2 = TextResponse(r.url, body=r.text, encoding='utf-8')

    row = []
    t_nombre = ''
    t_marca = ''
    t_modelo = ''
    skuID = ''
    p_normal = ''
    p_online = ''

    datos = response2.xpath(
        '//div[@class="col-sm-12 col-lg-6 col-md-6"]/div[@class="vpmodelo vptexto"]'
    )
    for d in datos:
        tipo = d.xpath('span//text()').extract()[0].strip().upper()

        valor = ''
        #print(d.xpath('text()').extract())
        if len(d.xpath('text()').extract()) > 0:
            valor = d.xpath('text()').extract()[0].strip()

        if tipo.upper() == 'MARCA:':
            t_marca = valor
        if tipo.upper() == 'MODELO:':
            t_modelo = valor
        if tipo.upper() == 'CóDIGO:':
            skuID = valor
        if tipo.upper() == 'PRECIO NORMAL:':
            p_normal = d.xpath(
                'span[@class="tachado"]//text()').extract()[0].upper()

    online = response2.xpath(
        '//div[@class="col-sm-12 col-lg-6 col-md-6"]/div[@class="blockprecio"]/span[@class="precio"]//text()'
    ).extract()
    p_online = online[0].strip()

    nombre = response2.xpath(
        '//div[@class="col-sm-12 col-lg-6 col-md-6"]/div[@class="vpnombre"]//text()'
    ).extract()
    t_nombre = nombre[0].strip() + ' ' + t_marca + ' ' + t_modelo

    row = [
        v_grupo, v_subgrupo, skuID, t_marca, t_nombre, p_normal, p_online,
        t_modelo, v_url
    ]
    return row
Example No. 14
def get_links(url, proxy='your_proxy_here_with_port'):
    proxy_dict = {
        # 'https': '',
        'http': proxy,
    }
    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.21 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.8'
    }
    link_list = []

    r = session.request('GET',
                        url,
                        headers=headers,
                        proxies=proxy_dict,
                        timeout=1)
    response = TextResponse(r.url, body=r.text, encoding='utf-8')
    last_page_index = int(
        response.xpath(
            './/ul[@class="pagination__list"]/'
            'li[@class="pagination__list__item"]/a/text()')[3].extract())
    page_num = 1
    while page_num <= last_page_index:
        if page_num != 1:
            r = session.request('GET',
                                '{0}?page={1}'.format(url, page_num),
                                headers=headers,
                                proxies=proxy_dict)
            response = TextResponse(r.url, body=r.text, encoding='utf-8')
        for quote in response.xpath('//div[@class="single_list_item"]'):
            link_list.append({
                'link':
                quote.xpath(
                    './/div[@class="single_list_item__company"]/a/@href'
                ).extract_first(),
                'name':
                quote.xpath(
                    './/div[@class="single_list_item__company"]/a/text()').
                extract_first().strip(),
                'id':
                quote.xpath(
                    './/div[@class="single_list_item__fiscal"]/span/text()').
                extract_first(),
                'date':
                quote.xpath(
                    './/div[@class="single_list_item__subscribe_date"]/strong/text()'
                ).extract_first(),
            })
        print(page_num)
        page_num += 1
    return link_list
Example No. 15
 def parse(self, response):
     cur = self.conn.cursor()
     url = 'https://www.mataharimall.com'
     try:
         # import pdb;pdb.set_trace()
         self.driver.get(url)
     except:
         traceback.print_exc()
     for tidur in range(0, 100):
         time.sleep(1)
         try:
             for kat in range(0, 20):
                 response = TextResponse(url=response.url,
                                         body=self.driver.page_source,
                                         encoding='utf-8')
                 url = response.xpath(
                     '//*[contains(@id,"header")]/div[3]/div/div/div[1]/div/ul/li['
                     + str(kat + 1) + ']/a/@href').extract_first()
                 nama_kategori = response.xpath(
                     '//*[contains(@id,"header")]/div[3]/div/div/div[1]/div/ul/li['
                     + str(kat + 1) + ']/a/text()').extract()
                 nama_kategori = ''.join(nama_kategori[1])
                 nama_kategori = nama_kategori.encode('utf-8').replace(
                     "\n", "").strip()
                 time.sleep(2)
                 # import pdb;pdb.set_trace()
                 print "========================================"
                 print(nama_kategori)
                 print(url)
                 print "========================================"
                 sql = "select * from matahari_category where url = '{}' and nama_kategori = '{}'".format(
                     url, nama_kategori)
                 cur.execute(sql)
                 results = cur.fetchall()
                 if len(results) == 0:
                     sql = "INSERT INTO matahari_category VALUES ('{}','{}')".format(
                         url, nama_kategori)
                     print sql
                     cur.execute(sql)
                     self.conn.commit()
                     print "======================================"
                     print "[INFO] Mysql insert sukses : {}".format(sql)
                     print "======================================"
                 else:
                     print "======================================"
                     print "[ERROR] Mysql insert failure : {}".format(sql)
                     print "============s=========================="
         except:
             pass
     cur.close()
     try:
         self.driver.close()
     except:
         pass
Example No. 16
 def grab_proxy(self):
     url = 'https://webanetlabs.net/publ/24'
     resp = TextResponse(url=url, body=urlopen(url).read())
     # grab the rel_path of the newest post with free proxy
     relative_path = resp.xpath('//div[@class ="eTitle"][1]/a[1]/@href').extract_first()
     new_url = '/'.join(url.split('/')[:-1]) + relative_path
     resp = TextResponse(url=new_url, body=urlopen(new_url).read())
     proxy_list = resp.xpath('//span[@itemprop="articleBody"]/p/text()').extract()
     proxy_list= [x.replace('\n', '') for x in proxy_list if len(x) > 5]
     logging.debug('PROXY LIST GRABBED:\n %s'%proxy_list)
     return proxy_list
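A small usage sketch for grab_proxy; picking a random proxy from the returned list is our addition, and `spider` stands in for an instance of whatever class defines the method:

import random

proxies = spider.grab_proxy()     # list of 'ip:port' strings scraped from the post body
proxy = random.choice(proxies)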
Example No. 17
    def parse(self, response):

        init_page = "https://www.frikitrivial.com"
        url = "https://www.frikitrivial.com/game.php"

        self.driver.get(init_page)
        i = 0
        time.sleep(60)

        self.driver.get(url)

        while i < RUNNING_TIMES:
            CORRECT = True
            while CORRECT:

                d1 = self.driver.page_source.encode('utf-8')
                html = str(d1)
                response = TextResponse('none', 200, {}, html, [], None)

                question = response.xpath(
                    '/html/body/div[1]/div/div[3]/text()').extract_first()
                first_answer = response.xpath(
                    '/html/body/div[1]/div/a[1]/text()').extract_first()
                second_answer = response.xpath(
                    '/html/body/div[1]/div/a[2]/text()').extract_first()
                third_answer = response.xpath(
                    '/html/body/div[1]/div/a[3]/text()').extract_first()
                fourth_answer = response.xpath(
                    '/html/body/div[1]/div/a[4]/text()').extract_first()

                qMResponse = self.qM.getAnswer(question, first_answer,
                                               second_answer, third_answer,
                                               fourth_answer)

                if qMResponse[1] == -1:
                    answer = 4
                else:
                    answer = qMResponse[1]

                next = self.driver.find_element_by_xpath(
                    '/html/body/div[1]/div/a[' + str(answer) + ']')
                next.click()

                if self.driver.current_url == 'https://www.frikitrivial.com/end.php':
                    self.qM.processQuestion(qMResponse[1], qMResponse[0], 0)
                    CORRECT = False
                else:
                    self.qM.processQuestion(qMResponse[1], qMResponse[0], 1)

            self.driver.get(url)
            i += 1

        self.qM.backup()
        self.driver.close()
Example No. 18
    def parse2(self, response):
        selector = TextResponse(url=response.url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        new_dict = {}
        name = selector.xpath('//h1[@itemprop="name"]//text()').extract()
        price = selector.xpath('//span[@class="payBlkBig"]//text()').extract()

        new_dict['Name'] = name
        new_dict['Price'] = price
        yield new_dict
Example No. 19
	def parse(self, response):
		for url in self.list_urls:
			self.driver.get(url)
			self.wait_between(1.5, 3.0)            
			iLoop = True
				
			while iLoop:                           
				CheckBox = WebDriverWait(self.driver, 3000).until(
					EC.presence_of_element_located((By.CSS_SELECTOR ,".emphasise"))
					)
				response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')            
							  
				current_url = self.driver.current_url
				aParts = (current_url.split('?')[1]).split('&')
				for x in aParts:
					params = x.split('=')
					if params[0] == 'clue':
						keyword = params[1].replace('%20', ' ').replace('+', ' ')
					if params[0] == 'locationClue':
						suburb = params[1]
						
				for div in response.xpath('.//div[@class="flow-layout outside-gap-large inside-gap inside-gap-large vertical"]//div[@class="cell in-area-cell middle-cell"]'):					
					if div.xpath('.//a[@class="listing-name"]//text()').extract_first():
						if div.xpath('.//a[@title="Phone"]'):
							phone = div.xpath('.//a[@title="Phone"]//@href').extract_first()
							if phone:
								sphone = phone.replace('tel:', '')
								if self.checkDuplicates(sphone) == False:
									item = {}
									item['Phone'] = sphone								
									item['Suburb'] = suburb
									item['Keyword'] = keyword
									item['CompanyName'] = div.xpath('.//a[@class="listing-name"]//text()').extract_first().strip().encode('utf-8')
									if div.xpath('.//p[@class="listing-short-description"]'):
										item['Description'] = div.xpath('.//p[@class="listing-short-description"]//text()').extract_first().encode('utf-8')
									if div.xpath('.//p[@class="listing-address mappable-address"]'):
										item['Address'] = div.xpath('.//p[@class="listing-address mappable-address"]//text()').extract_first().encode('utf-8')
									if div.xpath('.//p[@class="listing-address mappable-address mappable-address-with-poi"]'):
										item['Address'] = div.xpath('.//p[@class="listing-address mappable-address mappable-address-with-poi"]//text()').extract_first().encode('utf-8')
									
									if div.xpath('.//a[@class="contact contact-main contact-email "]'):
										item['Email'] = div.xpath('.//a[@class="contact contact-main contact-email "]//@data-email').extract_first()
									if div.xpath('.//a[@class="contact contact-main contact-url "]'):
										item['Website'] = div.xpath('.//a[@class="contact contact-main contact-url "]//@href').extract_first()
									
									yield item
				
				# parse next page
				if response.xpath('.//a[contains(@class, "pagination navigation") and contains(text(), "Next")]'):
					next_url = response.xpath('.//a[contains(@class, "pagination navigation") and contains(text(), "Next")]//@href').extract_first()
					self.driver.get(response.urljoin(next_url))					
					self.wait_between(1.5, 3.0)
				else:
					iLoop = False						
Example No. 20
 def get_content(self, response: TextResponse):
     book_name = response.xpath(
         '//div[@class="con_top"]/a[3]/text()').extract_first()
     title_name = response.xpath(
         '//div[@class="bookname"]/h1/text()').extract_first().strip('正文')
     article = ''.join(
         response.xpath('string(//div[@id="content"])').extract())
     if book_name and title_name and article:
         with open(
                 '/Users/csdn/Desktop/book/{}_{}.txt'.format(
                     book_name, title_name), 'w') as f:
             f.write(article)
             print(book_name + title_name + '下载成功')  # '... downloaded successfully'
Example No. 21
	def parse(self,response):
		self.driver.get(response.url)
		time.sleep(3)
		selector = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
		href = selector.xpath('//li[@class="city-bar"]/a/@href').extract()
		city = selector.xpath('//li[@class="city-bar"]/a/text()').extract()
		
		for i in href:
			for c in city:
				if c.lower().strip().replace(' ','') in i.replace('-',''):
		
					url= i
					yield Request(url= url, callback= self.parse2 ,meta={"city":c})
Example No. 22
    def parse(self, response):
        self.driver.get('http://www.metmuseum.org/art/collection')

        # while True:
        #     try:
        #         show_more = self.driver.find_element_by_class_name("show-more")
        #         time.sleep(2)
        #         show_more.click()
        #     except:
        #         break

        # clicking the show more button
        for i in range(5):
            show_more = self.driver.find_element_by_class_name("show-more")
            time.sleep(3)
            show_more.click()

        response = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        test = response.xpath('//h2[@class="card__title"]/a/@href')
        for href in response.xpath('//h2[@class="card__title"]/a/@href'):
            url = response.urljoin(href.extract())
            print url
            # scraping the urls from the first page & creating a list of links
            # card_link_list = self.driver.find_elements_by_xpath('//h2[@class="card__title"]/a')
            # card_link_list = map(lambda x: x.get_attribute('href'), card_link_list)
            self.driver.get(url)
            time.sleep(2)
            response1 = TextResponse(url=self.driver.current_url,
                                     body=self.driver.page_source,
                                     encoding='utf-8')
            item = MetItem()
            for sel in response1.xpath('//div[@class="l-component-block"]'):
                title = self.driver.find_element_by_xpath(
                    '//h1[@class="collection-details__object-title"]').text
                print title
                location = self.driver.find_element_by_xpath(
                    '//div[@class="collection-details__location"]').text
                print location
                item['title'] = title
                item['location'] = location
            artifact_detail = {}
            for detail in response1.xpath(
                    '//dl[@class="collection-details__tombstone--row"]'
            ).extract():
                key = Selector(text=detail).xpath('//dt/text()').extract()[0]
                value = Selector(text=detail).xpath('//dd/text()').extract()[0]
                artifact_detail[key] = value
            item['artifact_detail'] = artifact_detail
            yield item
Example No. 23
 def parse(self, response):
     cur = self.conn.cursor()
     url = 'https://www.bukalapak.com/products'
     try:
         # import pdb;pdb.set_trace()
         self.driver.get(url)
     except:
         traceback.print_exc()
     for tidur in range(0, 100):
         time.sleep(1)
         try:
             for kat in range(0, 20):
                 response = TextResponse(url=response.url,
                                         body=self.driver.page_source,
                                         encoding='utf-8')
                 url = response.xpath(
                     '/html/body/div[1]/section/div/nav/div/div/div/ul/li['
                     + str(kat + 1) + ']/a/@href').extract_first()
                 url = "https://bukalapak.com" + url
                 nama_kategori = response.xpath(
                     '/html/body/div[1]/section/div/nav/div/div/div/ul/li['
                     + str(kat + 1) + ']/a/text()').extract_first()
                 time.sleep(2)
                 print "========================================"
                 print(nama_kategori)
                 print(url)
                 print "========================================"
                 sql = "select * from bukalapak_category where url = '{}' and nama_kategori = '{}'".format(
                     url, nama_kategori)
                 cur.execute(sql)
                 results = cur.fetchall()
                 if len(results) == 0:
                     sql = "INSERT INTO bukalapak_category VALUES ('{}','{}')".format(
                         url, nama_kategori)
                     print sql
                     cur.execute(sql)
                     self.conn.commit()
                     print "======================================"
                     print "[INFO] Mysql insert sukses : {}".format(sql)
                     print "======================================"
                 else:
                     print "======================================"
                     print "[ERROR] Mysql insert failure : {}".format(sql)
                     print "============s=========================="
         except:
             pass
     cur.close()
     try:
         self.driver.close()
     except:
         pass
Example No. 24
    def parse(self, response):

        self.driver.get(response.url)
        urls = []

        for i in range(1, 20):

            # self.driver.get(response.url)
            response = TextResponse(url=self.driver.current_url,
                                    body=self.driver.page_source,
                                    encoding='utf-8')
            self.driver.implicitly_wait(10)

            for j in range(1, 31):
                result = response.xpath('//*[@class="col-md-9"]/div[1]/div[' +
                                        str(j) + ']/h3/a/@href')
                urls.extend(result)

            next_page = self.driver.find_element_by_xpath(
                '//*[@title="Go to next page"]')
            next_page.click()

        for href in urls:
            print href
            url = href.extract()
            self.driver.get(url)
            response = TextResponse(url=self.driver.current_url,
                                    body=self.driver.page_source,
                                    encoding='utf-8')
            item = IndeedItem()

            for sel in response.xpath('//div[@class="col-md-5 col-lg-6"]'):
                item['job_title'] = sel.xpath(
                    '//div[@class="col-md-5 col-lg-6"]/h1/text()').extract()
                item['location'] = sel.xpath(
                    '//div[@class="col-md-5 col-lg-6"]/ul/li[2]/text()'
                ).extract()
                item['company_name'] = sel.xpath(
                    '//div[@class="col-md-5 col-lg-6"]/ul/li[1]/a/text()'
                ).extract()

            for sel_1 in response.xpath('//*[@id="bd"]/div/div[1]'):
                item['job_type'] = sel_1.xpath(
                    '//div[2]/div/div[2]/span/text()').extract()
                item['job_salary'] = sel_1.xpath(
                    '//div[3]/div/div[2]/span/text()').extract()

            yield item

        self.driver.close()
Example No. 25
def scrape_data(url, names, values, types):
    response = requests.get(url)
    response = TextResponse(body=response.content, url=url)
    data = {'url': response.url}
    for name in names:
        index = names.index(name)
        value = values[index]
        extract_type = types[index]

        if extract_type == 'list':
            data = {**data, **{name: response.xpath(value).extract()}}
        else:
            data = {**data, **{name: response.xpath(value).extract_first()}}

    return data
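An illustrative call to scrape_data; the URL, field names, and XPaths here are made up for demonstration:

data = scrape_data(
    url="https://example.com/product/123",
    names=["title", "image_urls"],
    values=["//h1/text()", "//img/@src"],
    types=["first", "list"],  # anything other than 'list' falls back to extract_first()
)
print(data["url"], data["title"], len(data["image_urls"]))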
Example No. 26
	def parse(self, response):
		try :		
			self.driver.get(response.url)
			resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
			blocks = resp.xpath('//div[contains(@id,"resultBlockWrapper")]')
			old = 0
			new = len(blocks)
			while old != new:
				print old,new
				self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
				time.sleep(3)
				resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
				blocks = resp.xpath('//div[contains(@id,"resultBlockWrapper")]')
				old = new
				new = len(blocks)

			for block in blocks:
				try :
					price = ''.join(block.xpath('div//div[@class="srpColm2"]//span[contains(@id,"pricePropertyVal")]//text()').extract())
					iscr = 'Cr' in price
					islac = 'Lac' in price
					price = price.replace(',','').replace('Cr','').replace('Lac','')
					price = float(price.split()[0])	
					price *= 10000000 if iscr else 1
					price *= 100000 if islac else 1
					bhk = ''.join(block.xpath('div//div[@class="srpColm2"]//strong/text()').extract())
					bhk = (''.join(bhk.split()[:2])).replace('.5','')
					if "bhk" in bhk.lower() and not("1bhk" in bhk.lower() or ('1 bhk') in bhk.lower()) :
						ppf = ''.join(block.xpath('div//div[@class="srpColm2"]//span[@class="proRentArea"]/text()').extract())
						if ppf == "":
							ppf = ''.join(block.xpath('div//div[@class="srpColm2"]//span[@class="proNameSizeTxt"]/text()').extract())
						ppf = float(ppf.split()[0])
						if bhk in self.obj:
							self.obj[bhk]['min'] = self.obj[bhk]['min'] if price > self.obj[bhk]['min'] else price
							self.obj[bhk]['max'] = self.obj[bhk]['max'] if price < self.obj[bhk]['max'] else price
							self.obj[bhk]['count'] += 1
							self.obj[bhk]['avg'] += ppf
						else :
							self.obj[bhk] = {'min':price, 'max':price, 'count':1,'avg':ppf}
				except :
					pass
			with open(os.path.dirname(__file__) +'/../../price.json','w')as file:
				file.write(json.dumps(self.obj))
			return
		except :
			with open('cus.txt','ab+') as f:
				f.write("some error occur")
			return	
Example No. 27
 def start_requests(self):
     self.driver.get("https://www.yahoo.com")
     time.sleep(1)
     keyword = ["pas foto", "pas foto 3x4", "pas foto 4x6", "pas foto ktp"]
     for a in range(len(keyword)):
         try:
             image = self.driver.find_element_by_xpath('//input[@id="UHSearchBox"]')
         except:
             try:
                 image = self.driver.find_element_by_xpath('//div[@id="sbq-wrap"]/input')
             except Exception, e:
                 print e
         image.click()
         time.sleep(1)
         image.send_keys(keyword[a])
         time.sleep(1)
         image.send_keys(Keys.ENTER)
         time.sleep(3)
         self.driver.find_element_by_xpath('//div[@class="compList mt-5"]/ul/li[2]/a').click()
         time.sleep(2)
         self.driver.find_element_by_xpath('//ul[@id="filt-tabs-v2"]/li[3]/ul/li[5]/a').click()
         time.sleep(2)
         count = 0
         for i in range(1, 10000):
             count += 1
             # import pdb;pdb.set_trace()
             try:
                 self.driver.find_element_by_xpath('/html/body').send_keys(Keys.END)
                 time.sleep(3)
                 response = TextResponse(self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
                 end = response.xpath('//section[@id="results"]/button/text()').extract()
                 link = response.xpath('//section[@id="results"]/div/ul/li[' + str(i) + ']/a/img/@src').extract()
                 link = ''.join(link).encode('utf-8')
                 link = link +'.jpg'
                 end = ''.join(end).encode('utf-8')
                 nama = str(keyword[a]).replace(' ', '_') + '_ke-' + str(count)
                 direktori = 'C:\Users\EB-NB19\Documents\pict\ ' + nama + '.jpg'
                 urllib.urlretrieve(link, direktori)
                 time.sleep(1)
                 print "=========================="
                 print nama
                 print "=========================="
                 if end == "Tampilkan Lebih Banyak Gambar":
                     self.driver.find_element_by_xpath('//section[@id="results"]/button').click()
                 else:
                     pass
             except Exception, e:
                 print e
Example No. 28
    def parse(self, response):
        self.driver.get(response.url)

        try:
            WebDriverWait(self.driver, 30).until(EC.presence_of_element_located((By.XPATH,'//*[@id="views"]/div/div[2]/div[2]/div[3]/div[10]/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/span')))
        except TimeoutException:
            print "Time out"
            return

        # Sync scrapy and selenium so they agree on the page we're looking at then let scrapy take over
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        for href in resp.xpath('//*[@id="views"]/div/div[2]/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/@href'):
            url = resp.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_property)


        if self.page == 5 :
            return
            
        self.page += 1
        yield scrapy.Request(url="https://www.proptiger.com/noida/property-sale?page=%d" % self.page,
                      headers={"Referer": "https://www.proptiger.com/noida/property-sale", "X-Requested-With": "XMLHttpRequest"},
                      callback=self.parse, 
                      dont_filter=True)
Example No. 29
def get_url_detalle():
    import requests
    from scrapy.http import TextResponse

    web_main = "http://www.hiraoka.com.pe/"
    user_agent = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58: .0.3029.110 Chrome/58.0.3029.110 Safari/537.36'
    }

    r = requests.get(web_main, headers=user_agent)
    response2 = TextResponse(r.url, body=r.text, encoding='utf-8')

    rows = []
    pag = response2.xpath('//div[@class="inners"]')
    for p in pag:
        sub_grupo = p.xpath('div[@class="popgrupo"]')

        for s in sub_grupo:
            grupo = s.xpath(
                'div[@class="poptitulogrupo"]//text()').extract()[0].strip()
            #print(re.sub('[\W]+','', s.xpath('div[@class="poptitulogrupo"]//text()').extract()[0].strip()) )

            for a in s.xpath('a[@class="popitem"]'):
                url = web_main + '' + a.xpath('@href').extract()[0].strip()
                sg = a.xpath('text()').extract()[0].strip()
                #re.sub('[\W]+', '', url)
                row = [url, sg, grupo]
                #print(re.sub('[\W]+', '', a.xpath('@href').extract()[0].strip()))

                rows.append(row)
    df_url = pd.DataFrame(rows, columns=['Url_Grupo', 'Sub_Grupo', 'Grupo'])
    return df_url
Example No. 30
    def parse(self, response):
        self.driver.get(response.url)

        while True:
            time.sleep(1)
            try:
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//p[@class="propertyName"]/a')))
            except TimeoutException:
                return
            resp = TextResponse(url=self.driver.current_url,
                                body=self.driver.page_source,
                                encoding='utf-8')
            urls = resp.xpath('//p[@class="propertyName"]/a/@href').extract()
            pprint(urls)
            #urls=['http://www.magicbricks.com/propertyDetails/270-Sq-ft-Studio-Apartment-FOR-Sale-Vatika-City-in-Gurgaon&id=4d423230333337333839?from=search']
            if len(urls) == 0:
                return
            for url in urls:
                abs_url = 'http://www.squareyards.com' + url
                yield scrapy.Request(abs_url,
                                     callback=self.parse_property_info)

            try:
                link = self.driver.find_element_by_xpath(
                    '//ul[@class="newpagination"]/li[2]')
                actions = ActionChains(self.driver)
                actions.click(link)
                actions.perform()
            except:
                return
Example No. 31
    def parse(self, response):
        # instantiate a selenium driver as the object we scrape
        self.driver.get(response.url)
        time.sleep(4)
        #scroll down so we can see the 'Load More' button
        self.driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        #click on the 'Load More' button
        load_more = self.driver.find_element_by_link_text('Load more')
        load_more.click()
        time.sleep(2)
        #how many times do we need to scroll down? Here I've determined once
        for i in xrange(0, 1):
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)

    #pass the response url along with the new scrolled-down website (body = )

        response1 = TextResponse(url=response.url,
                                 body=self.driver.page_source,
                                 encoding='utf-8')

        photo_links = response1.xpath(
            "//a[contains(@class, '_8mlbc _vbtk2 _t5r8b')]/@href").extract()
        for photo in photo_links:
            url = response.urljoin(photo)
            #for each photo loaded on the page, callback the parse_photo function
            yield scrapy.Request(url, callback=self.parse_photo)
Example No. 32
    def comment_parse(self, response):

        #try:
        print response.url
        aid = response.meta['article']['aid']
        date = response.meta['article']['date']
        self.driver.get(response.url)
        time.sleep(3)

        while True:
            button_more = self.driver.find_element_by_xpath('//a[@class="u_cbox_btn_more __cbox_page_button"]')
            try:
                button_more.click()
            except:
                break

        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        for site in resp.xpath('.//ul[@class="u_cbox_list"]/li'):
            username = site.xpath('.//span[@class="u_cbox_nick"]/text()').extract()
            like_count = site.xpath('.//em[@class="u_cbox_cnt_recomm"]/text()').extract()
            dislike_count = site.xpath('.//em[@class="u_cbox_cnt_unrecomm"]/text()').extract()
            contents = site.xpath('.//span[@class="u_cbox_contents"]/text()').extract()
            comment = NaverCommentItem()
            comment['aid'] = aid
            comment['username'] = username
            comment['like_count'] = like_count
            comment['dislike_count'] = dislike_count
            comment['contents'] = ''.join(contents)
            comment['date'] = date
            yield comment
Example No. 33
def extract(url):
    res = requests.get(url, headers=header)
    response = TextResponse(url, body=res.text.encode())
    for url in response.xpath(
            "//div[@class='loi ']//div[@id='issueName']/a[@class='issueLinkCon']/@href"
    ):
        yield response.urljoin(url.get())
Example No. 34
    def parse(self, response):
        self.driver.get(response.url)

        while True:
            time.sleep(1)
            try:
                WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//p[@class="propertyName"]/a')))
            except TimeoutException:
                return
            resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            urls = resp.xpath('//p[@class="propertyName"]/a/@href').extract()
            pprint(urls)
            #urls=['http://www.magicbricks.com/propertyDetails/270-Sq-ft-Studio-Apartment-FOR-Sale-Vatika-City-in-Gurgaon&id=4d423230333337333839?from=search']
            if len(urls) == 0:
                return
            for url in urls:
                abs_url = 'http://www.squareyards.com' + url
                yield scrapy.Request(abs_url, callback=self.parse_property_info)

            try :
                link = self.driver.find_element_by_xpath('//ul[@class="newpagination"]/li[2]')
                actions = ActionChains(self.driver)
                actions.click(link)
                actions.perform()
            except:
                return
Example No. 35
 def get_article_list(self, response: TextResponse):
     article_url_list = response.xpath(
         '//div[@class="novellist"]/ul/li/a/@href').extract()
     for article_url in article_url_list:
         yield Request(article_url,
                       callback=self.get_chapter_list,
                       dont_filter=True)
Example No. 36
    def parse(self, response):
        self.driver.get('http://www.the-numbers.com/movie/budgets/all')
        response = TextResponse(url=response.url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        rows = response.xpath(
            '//*[@id="page_filling_chart"]/center/table/tbody/tr').extract()

        for i in range(1, 10250, 2):
            RDate = Selector(text=rows[i]).xpath('//td[2]/a/text()').extract()
            Title = Selector(
                text=rows[i]).xpath('//td[3]/b/a/text()').extract()
            PBudget = Selector(text=rows[i]).xpath('//td[4]/text()').extract()
            DomesticG = Selector(
                text=rows[i]).xpath('//td[5]/text()').extract()
            WorldwideG = Selector(
                text=rows[i]).xpath('//td[6]/text()').extract()

            print RDate, Title, PBudget, DomesticG, WorldwideG

            item = MoviesItem()
            item['RDate'] = RDate
            item['Title'] = Title
            item['PBudget'] = PBudget
            item['DomesticG'] = DomesticG
            item['WorldwideG'] = WorldwideG

            yield item
Example No. 37
	def parse(self, response):
		#response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
		sel = Selector(response)
		self.driver.get(response.url)
		i=0
		while True:
			#next = self.driver.find_element_by_xpath('//*[@id="pagnNextString"]')
			
			next = WebDriverWait(self.driver, 10).until(
				EC.visibility_of_element_located((By.ID, "pagnNextString"))
			)
			#next.click()
			#i=i+1
			#if(i==2):
			#	break
			#sleep(50000)
			try:
				#self.driver.set_page_load_timeout(10000) 
				response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
				sites = response.xpath('//*[@id="s-results-list-atf"]/li')
				for site in sites:
			
					'''item = EbayItem()
					item['title'] = site.xpath('//a/h2[@class="a-size-base a-color-null s-inline s-access-title a-text-normal"]/text()').extract()
					item['link'] = sel.xpath('//a/@href').extract()
					item['price'] = site.xpath('//span[@class="a-size-base a-color-price s-price a-text-bold"]/span/text()').extract()
					yield item'''
					item=EbayItem()
					#title = site.xpath('//a/h2[@class="a-size-base a-color-null s-inline s-access-title a-text-normal"]/text()').extract()
					item['title'] = site.xpath('div/div[2]/div[1]/a/h2/text()').extract()
					item['link'] = site.xpath('div/div[2]/div[1]/a/@href').extract()
					item['price'] = site.xpath('div/div[3]/div[0]/a/span[0]/text() | div/div[3]/div[1]/a/span/text() ').extract()
					item['image'] = site.xpath('div/div[1]/div/div/a/img/@src').extract()
					#item['rating'] = site.xpath('div/div[5]/span/span/a/i[1]/span/text() | div/div[4]/span/span/a/i[1]/span/text()').extract()
					item['rating'] = site.xpath('div//span[@class="a-icon-alt"]/text()').extract()
					
					#price = site.xpath('div/div[3]/div[0]/a/span[0]/text() | div/div[3]/div[1]/a/span/text() ').extract()
					#print title,link, price
					#print price
					#sleep(50000)
					yield item
				try:
					self.driver.find_element_by_xpath('//*[@id="pagnNextLink"]')
				except NoSuchElementException:
					break
				next.click()
				
				# get the data and write it to scrapy items
				
			except:
				#break
				a=10
				#next.click()
		self.driver.close()

Example No. 38
    def parse_selenium(self, response):

        #Use the previous instance of the webrowser which was created to go to visit the "response.url"
        self.driver.get(response.url)
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        #All comments have been loaded, once again pass the "body" argument back in
        response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        data = ScrapyTutorialItem()
        data['item'] = {'url': response.url.split("=")[1], 'items': response1.xpath("//div[@class='ItemTitle-sc-1bls9ac-0 hrhyAs']/text()").extract()}
        return data
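
# The parse_selenium method above uses a pattern that recurs throughout these
# examples: let Selenium render (and scroll) the page, then wrap
# driver.page_source in a TextResponse so Scrapy's XPath API works on the final
# DOM. Below is a minimal, hypothetical helper that isolates that handoff; the
# function name, scroll count and pause are illustrative assumptions, and an
# already-constructed Selenium WebDriver is assumed to be passed in.
import time

from scrapy.http import TextResponse


def rendered_response(driver, url, scrolls=1, pause=2):
    """Load `url` in the given Selenium driver, scroll to the bottom a few
    times to trigger lazy loading, and return the rendered page wrapped in a
    TextResponse so Scrapy selectors can be applied to it. (Illustrative
    helper, not part of the original spiders.)"""
    driver.get(url)
    for _ in range(scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
    return TextResponse(url=driver.current_url,
                        body=driver.page_source,
                        encoding='utf-8')
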
Exemplo n.º 39
0
    def parse(self, response):
        self.driver.get('http://www.metmuseum.org/art/collection')

        # while True:
        #     try:
        #         show_more = self.driver.find_element_by_class_name("show-more")
        #         time.sleep(2)
        #         show_more.click()
        #     except:
        #         break

        # clicking the show more button
        for i in range(5):
            show_more = self.driver.find_element_by_class_name("show-more")
            time.sleep(3)
            show_more.click()

        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        test = response.xpath('//h2[@class="card__title"]/a/@href')
        for href in response.xpath('//h2[@class="card__title"]/a/@href'):
            url = response.urljoin(href.extract())
            print url
        # scraping the urls from the first page & creating a list of links
        # card_link_list = self.driver.find_elements_by_xpath('//h2[@class="card__title"]/a')
        # card_link_list = map(lambda x: x.get_attribute('href'), card_link_list)
            self.driver.get(url)
            time.sleep(2)
            response1 = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            item = MetItem()
            for sel in response1.xpath('//div[@class="l-component-block"]'):
                title = self.driver.find_element_by_xpath('//h1[@class="collection-details__object-title"]').text
                print title
                location = self.driver.find_element_by_xpath('//div[@class="collection-details__location"]').text
                print location
                item['title'] = title
                item['location'] = location
            artifact_detail = {}
            for detail in response1.xpath('//dl[@class="collection-details__tombstone--row"]').extract():
                key = Selector(text=detail).xpath('//dt/text()').extract()[0]
                value = Selector(text=detail).xpath('//dd/text()').extract()[0]
                artifact_detail[key] = value
            item['artifact_detail'] = artifact_detail
            yield item
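
# The artifact_detail loop in the parse method above pairs each <dt> label with
# its <dd> value by re-parsing every row fragment and indexing extract()[0],
# which raises IndexError when a row is empty. A hedged sketch of the same idea
# using extract_first(); the helper name is illustrative and not part of the
# original spider.
from scrapy.selector import Selector


def tombstone_details(row_fragments):
    """Build a {label: value} dict from <dl> row fragments, pairing each <dt>
    label with its <dd> value and skipping rows where either part is missing."""
    details = {}
    for row in row_fragments:
        sel = Selector(text=row)
        key = sel.xpath('//dt/text()').extract_first()
        value = sel.xpath('//dd/text()').extract_first()
        if key is not None and value is not None:
            details[key] = value
    return details
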
Exemplo n.º 40
0
    def parse_url(self, response):
        self.driver.get(response.url)
        time.sleep(5)      #Pause so page has enough time for AJAX to load
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')

        item = AsoiafItem()     #item consists of Appearing, POV, ChapterNum, Book, ChapterName, Summary, Blurb, Score

        #Get features from url
        item['Appearing'] = map(lambda s: str(s), response.xpath('//div[@id="appearances"]/ol/li/a/text()').extract())
        item['POV'] = str(response.xpath('//div[@class = "jumplist"]/ul/li/a/text()')[0].extract())
        item['ChapterNum'] = int(response.xpath('//span[@class = "teaser"]/b/text()').extract()[0].split()[-1])
        book_chapter = str(response.xpath('//*[@id="headline"]/h2/text()').extract()[0])
        item['Book'] = book_chapter.split()[0]
        item['ChapterName'] = ' '.join(book_chapter.split()[1:])
        item['Blurb'] = str(response.xpath('//*[@id="content"]/div[2]/div[1]/div[1]/span/text()[2]').extract()[0])

        #Score is the variable that requires selenium. It renders in AJAX.
        item['Score'] = float(response.xpath('//div[@class = "score"]/text()').extract()[0])


        if item['Book'] == "ADWD" and item['ChapterNum'] > 14:      #Not all chapters have summaries yet (work in progress)
            item['Summary'] = " "
        else:
            #Try to get the summary. This attempts to extract the summary words that are not contained in hyperlinks.
            summary_no_href = str(''.join(response.xpath('//*[@id="content"]/div[2]/div[2]/div[1]/p/text()').extract()))

            #Some summaries are a single paragraph whose html is not wrapped in a /p/ tag, so
            # summary_no_href above comes back empty. In that case, get the summary without /p/ tags.
            if summary_no_href == "":
                #Get words in summary without hyperlinks.
                summary_no_href = str(
                    ''.join(response.xpath('//*[@id="content"]/div[2]/div[2]/div[1]/text()').extract()))
                #Get words in hyperlinks.
                hrefs = str(' '.join(response.xpath('//*[@id="content"]/div[2]/div[2]/div[1]/a/text()').extract()))
            else:
                #This is the case the summary is in the /p/ tag. Get words that are in hyperlinks.
                hrefs = str(' '.join(response.xpath('//*[@id="content"]/div[2]/div[2]/div[1]/p/a/text()').extract()))

            #Summary is the summary without hyperlinks and hyperlinks put together. Note: This is not in correct order,
            # but contains all words.
            item['Summary'] = summary_no_href + ' ' + hrefs
        return item
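
# The summary logic above extracts the non-link text and the link text
# separately and concatenates them, which (as its own comment notes) loses word
# order. A minimal alternative sketch: selecting descendant text nodes with
# //text() returns all words, hyperlinked or not, in document order. The
# container XPath is copied from the code above; whether it still matches the
# live page is an assumption.
def extract_summary(response):
    """Return the chapter summary as one string with words in document order,
    including text that sits inside hyperlinks (expects a Scrapy response)."""
    parts = response.xpath('//*[@id="content"]/div[2]/div[2]/div[1]//text()').extract()
    return ' '.join(part.strip() for part in parts if part.strip())
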
    def parse(self, response):
		sel = Selector(response) 
		self.driver.get(response.url)
		block="none"
		hyper="http://www.snapdeal.com"
		print hyper
		i=0
		while True:
			self.driver.set_page_load_timeout(10000)
			self.driver.execute_script("window.scrollTo(10000000,10000000)")
			self.driver.set_page_load_timeout(10000)
			try:
				show = self.driver.find_element_by_xpath('//*[@id="seemore"]').value_of_css_property('display')
				print show
				'''if show==block:
					self.driver.find_element_by_xpath('//div[@id="show-more-results"]').click()'''
				no_more = self.driver.find_element_by_xpath('//*[@class="mar_20per_left ajax-loader-icon hidden"]').value_of_css_property('display')
				print no_more
				if no_more==block and show==block:
					break
				time.sleep(5)
				self.driver.execute_script("window.scrollTo(10000000,10000000)")
				self.driver.set_page_load_timeout(10000)
			except NoSuchElementException:
				print "pungi"
				break
		#down = self.driver.find_element_by_xpath('//div[@id="show-more-results"]')
		#location = down.location
		#self.((JavascriptExecutor) driver).executeScript("window.scrollBy(10000,10000);");
		#next = self.driver.find_element_by_xpath('//div[@id="show-more-results"]')
		response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
		try:
			#self.driver.set_page_load_timeout(10000)
			#driver.execute_script("window.scrollTo(0, location.get('y')")
			sites = response.xpath('//*[@class="product_grid_box"]')
			#print sites
			for site in sites:
				item = FlipkartItem()
				check = site.xpath('div[@class="productWrapper"]//div[@class="soldout-tag prod-grid-sold-out-lang"]/div/text()').extract()
				# extract() returns a list, so test the joined text rather than comparing the list to a string
				if 'SOLD OUT' in ''.join(check):
					continue
				item['price'] =  site.xpath('div[@class="productWrapper"]//div[@class="product-price"]/div/text()').extract()[0]
				data = site.xpath('div[@class="productWrapper"]//div[@class="product-title"]/a/text()').extract()
				item['title'] = data
				item['rating'] = site.xpath('div[@class="productWrapper"]//div[@class="ratingStarsSmall"]/@style | div[@class="productWrapper"]//div[@class="ratingStarsSmall corrClass8"]/@ratings').extract()
				item['image'] = site.xpath('div[@class="productWrapper"]//div[@class=" product-image "]/a/img/@src').extract()
				item['link'] = site.xpath('div[@class="productWrapper"]//div[@class="product-title"]/a/@href').extract()
				yield item
		except:
			print "Loop error"
		self.driver.close()
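
# The while-loop above decides when to stop scrolling by inspecting the CSS
# display of the site's "see more" and loader elements, which is specific to
# that page. A more generic, hedged sketch: compare the document height before
# and after each scroll and stop once it no longer grows. The pause and round
# limit are illustrative assumptions.
import time


def scroll_to_end(driver, pause=3, max_rounds=50):
    """Scroll to the bottom repeatedly until the page height stops growing,
    i.e. the infinite-scroll listing has nothing more to load."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
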

		
Exemplo n.º 42
0
    def parse(self, response):

        self.driver.get(response.url)
        urls = []

        for i in range(1,20):

            # self.driver.get(response.url)
            response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            self.driver.implicitly_wait(10)

            for j in range(1, 31):
                result = response.xpath('//*[@class="col-md-9"]/div[1]/div['+str(j)+']/h3/a/@href')
                urls.extend(result)

            next_page = self.driver.find_element_by_xpath('//*[@title="Go to next page"]')
            next_page.click()


        for href in urls:
            print href
            url = href.extract()
            self.driver.get(url)
            response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
            item = IndeedItem()

            for sel in response.xpath('//div[@class="col-md-5 col-lg-6"]'):
                item['job_title'] = sel.xpath('//div[@class="col-md-5 col-lg-6"]/h1/text()').extract()
                item['location'] = sel.xpath('//div[@class="col-md-5 col-lg-6"]/ul/li[2]/text()').extract()
                item['company_name'] = sel.xpath('//div[@class="col-md-5 col-lg-6"]/ul/li[1]/a/text()').extract()

            for sel_1 in response.xpath('//*[@id="bd"]/div/div[1]'):
                item['job_type'] = sel_1.xpath('//div[2]/div/div[2]/span/text()').extract()
                item['job_salary'] = sel_1.xpath('//div[3]/div/div[2]/span/text()').extract()


            yield item


        self.driver.close()
Exemplo n.º 43
0
    def parse(self, response):
        self.driver.get(response.url)
        try:
            WebDriverWait(self.driver, 30).until(EC.presence_of_element_located((By.XPATH,'//*[@id="views"]/div/div[2]/div[2]/div[3]/div[10]/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/span')))
        except:
            yield scrapy.Request(url="https://www.proptiger.com/%s/property-sale?page=%d" % (self.city,self.page),
                      callback=self.parse)
        # Sync scrapy and selenium so they agree on the page we're looking at then let scrapy take over
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8');

        for href in resp.xpath('//*[@id="views"]/div/div[2]/div[2]/div[3]/div/div/div/div/div/div[2]/div[1]/div[1]/div[1]/div[1]/div/a/@href'):
            url = resp.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_property)

        if self.page == self.end_page :
            return
        self.page += 1
        yield scrapy.Request(url="https://www.proptiger.com/%s/property-sale?page=%d" % (self.city,self.page),
                      callback=self.parse)
Exemplo n.º 44
0
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.
    """

    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = "https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip"
    FILE = "Tabular.xml"
    VERSION = "ICD-10-CM"
    LAST_UPDATED = "2015-10-01"

    # Prepare xml
    zip = requests.get(URL).content
    xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read()
    res = TextResponse(url=URL, body=xml, encoding="utf-8")

    count = 0
    for diag in res.xpath("//diag"):

        # We need only leaf nodes
        childs = diag.xpath("./diag")
        if childs:
            continue

        # Get data
        data = {}
        data["name"] = diag.xpath("./name/text()").extract_first()
        data["desc"] = diag.xpath("./desc/text()").extract_first()
        data["terms"] = diag.xpath(".//note/text()").extract()
        data["version"] = VERSION
        data["last_updated"] = LAST_UPDATED

        # Create record
        record = Record.create(URL, data)

        # Write record
        record.write(conf, conn)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
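
# collect() above downloads a zip archive, reads Tabular.xml out of it and
# walks the <diag> tree, keeping only leaf codes. A minimal, hedged sketch of
# that download-and-filter step in isolation; the generator name is
# illustrative, and the url/member arguments are expected to be the constants
# defined above.
import io
import zipfile

import requests
from scrapy.http import TextResponse


def iter_leaf_diags(url, member):
    """Yield <diag> nodes that have no child <diag> elements (leaf codes) from
    an XML file stored inside the zip archive at `url`."""
    archive = zipfile.ZipFile(io.BytesIO(requests.get(url).content))
    xml = archive.open(member).read()
    res = TextResponse(url=url, body=xml, encoding='utf-8')
    for diag in res.xpath('//diag'):
        if not diag.xpath('./diag'):   # no nested <diag> means a leaf node
            yield diag
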
Exemplo n.º 45
0
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.
    """

    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'

    # Prepare xml
    zip = requests.get(URL).content
    xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')

    count = 0
    for diag in res.xpath('//diag'):

        # We need only leaf nodes
        childs = diag.xpath('./diag')
        if childs:
            continue

        # Get data
        data = {}
        data['name'] = diag.xpath('./name/text()').extract_first()
        data['desc'] = diag.xpath('./desc/text()').extract_first()
        data['terms'] = diag.xpath('.//note/text()').extract()
        data['version'] = VERSION
        data['last_updated'] = LAST_UPDATED

        # Create record
        record = Record.create(URL, data)

        # Write record
        base.writers.write_record(conn, record)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
Exemplo n.º 46
0
 def _login(self):
     """Zhihu spider login module (login is simulated here with selenium)."""
     r = 1
     while r != 0:
         try:
             self.driver.set_page_load_timeout(20)  # guard against the page never finishing loading
             self.driver.get('http://www.zhihu.com/#signin')
             time.sleep(10)  # sleep 10 seconds so the user can type in their account details
             self.driver.get('http://www.zhihu.com/#signin')
             response = TextResponse(url=self.driver.current_url, body=self.driver.page_source.encode('utf-8'), encoding='utf-8')
             user_info = response.xpath('/html/body/script[@data-name="current_user"]/text()')
             user_info = user_info.extract()[0].replace('[', '').replace(']', '').replace('\"', '').split(',')
             if not user_info[0] == '':
                 print u'User %s logged in successfully' % user_info[0]
                 logger.info(u'User %s logged in successfully' % user_info[0])
                 break
             else:
                 logger.error(u'Incorrect account name or password.')
         except:
             continue
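
# _login above sleeps for a fixed 10 seconds and then reloads the page to see
# whether the user has finished entering credentials. A hedged alternative
# sketch polls for an element that only exists once the session is
# authenticated; the marker XPath and timeout below are assumptions, not taken
# from zhihu.com.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_manual_login(driver, marker_xpath='//button[@id="logout"]', timeout=300):
    """Block until an element that only a logged-in page contains appears,
    giving the user up to `timeout` seconds to complete the login by hand.
    (Illustrative helper; the marker XPath is an assumption.)"""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, marker_xpath))
    )
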
Exemplo n.º 47
0
    def parse(self, response):
        self.driver.get('http://www.the-numbers.com/movie/budgets/all')
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        rows = response.xpath('//*[@id="page_filling_chart"]/center/table/tbody/tr').extract()

        for i in range(1, 10250, 2):
            RDate = Selector(text=rows[i]).xpath('//td[2]/a/text()').extract()
            Title = Selector(text=rows[i]).xpath('//td[3]/b/a/text()').extract()
            PBudget = Selector(text=rows[i]).xpath('//td[4]/text()').extract()
            DomesticG = Selector(text=rows[i]).xpath('//td[5]/text()').extract()
            WorldwideG = Selector(text=rows[i]).xpath('//td[6]/text()').extract()

            print RDate, Title, PBudget, DomesticG, WorldwideG

            item = MoviesItem()
            item['RDate'] = RDate
            item['Title'] = Title
            item['PBudget'] = PBudget
            item['DomesticG'] = DomesticG
            item['WorldwideG'] = WorldwideG

            yield item
Exemplo n.º 48
0
    def parse(self, response):
        self.driver.maximize_window()
        self.driver.get(response.url)
        self.driver.set_page_load_timeout(30)
        self.driver.execute_script("return document.documentElement.innerHTML;")
        # Nudge the scroll position through many fractions of the page height to trigger lazy-loaded product tiles
        scheight = 0.1
        while scheight < 9.9:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight/{})".format(scheight))
            scheight += .01

        res = TextResponse(url=response.url, body=self.driver.execute_script("return document.documentElement.innerHTML;"), encoding='utf-8')

        for item in res.xpath('//div[@class="product-tile"]'):
            item_name = item.xpath('./div[@class="product-name"]/h3/a/text()').extract()[0].strip()
            item_link = item.xpath('./div[@class="product-name"]/h3/a/@href').extract()[0].strip()
            standard_price = item.xpath('./div[@class="product-pricing"]/div/span[@class="text price-standard"]/text()').extract()
            promoted_price = item.xpath('./div[@class="product-pricing"]/div/span[@class="text promotional-price"]/text()').extract()
            standard_price = float(standard_price[0].strip().split('$')[1].replace(',', ''))
            promoted_price = float(promoted_price[0].strip().split('$')[1].replace(',', ''))
            discount_rate = ((standard_price - promoted_price) / standard_price) * 100
            print item_name, ", ", discount_rate, "% OFF", ", ", item_link

        self.driver.close()
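
# The loop above turns "$1,299.00"-style price strings into floats and derives
# a discount percentage from the standard and promotional prices. The same
# arithmetic pulled out into a small helper; the function name is illustrative
# and the dollar-sign format is the only one handled, as in the code above.
def discount_rate(standard, promoted):
    """Return the percentage discount of `promoted` relative to `standard`,
    where both are price strings such as "$1,299.00"."""
    std = float(standard.strip().split('$')[1].replace(',', ''))
    promo = float(promoted.strip().split('$')[1].replace(',', ''))
    return (std - promo) / std * 100

# Example: discount_rate("$1,299.00", "$974.25") evaluates to 25.0.
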
    def parse(self, response):
#       point the existing selenium driver at the page we want to scrape
        self.driver.get(response.url)
        time.sleep(4)
        #scroll down so we can see the 'Load More' button
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        #click on the 'Load More' button
        load_more = self.driver.find_element_by_link_text('Load more')
        load_more.click()
        time.sleep(2)
        #how many times do we need to scroll down? Here I've determined once        
        for i in xrange(0, 1):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            
       #pass the response url along with the new scrolled-down website (body = )

        response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')  

        photo_links = response1.xpath("//a[contains(@class, '_8mlbc _vbtk2 _t5r8b')]/@href").extract()
        for photo in photo_links:
            url = response.urljoin(photo)
            #for each photo loaded on the page, callback the parse_photo function
            yield scrapy.Request(url, callback=self.parse_photo)
Exemplo n.º 50
0
    def parse_result(self, response):
        item = {}
        # Load the current page into Selenium
        
        self.driver.get(response)
        try:
            WebDriverWait(self.driver, 40).until(EC.presence_of_element_located((By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_imgstud"]')))
        except TimeoutException:
            print "Time out"
            return
        # Sync scrapy and selenium so they agree on the page we're looking at then let scrapy take over
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8');
        temp = format(resp.xpath('//*[@id="lblname"]/text()').extract())
        item['name'] = temp[3:-2]

        temp = format(resp.xpath('//*[@id="lblfname"]/text()').extract())
        item['father'] = temp[3:-2]

        temp = format(resp.xpath('//*[@id="lblrollno"]/text()').extract())
        item['roll'] = temp[3:-2]

        temp = format(resp.xpath('//*[@id="lblenrollno"]/text()').extract())
        item['enroll'] = temp[3:-2]

        temp = format(resp.xpath('//*[@id="lblbranch"]/text()').extract())
        item['branch'] = temp[3:-2]

        temp = format(resp.xpath('//*[@id="lblcollegename"]/text()').extract())
        item['clg'] = temp[3:-2]

        temp = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[2]/td[2]/b/text()').extract())
        item['s1'] = temp[3:-2]

        t1 = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[2]/td[3]/b/text()').extract())
        t2  = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[2]/td[4]/b/text()').extract())
        item[item['s1']] = t1[3:-2] + ' , '  + t2[3:-2]

        temp = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[3]/td[2]/b/text()').extract())
        item['s2'] = temp[3:-2]
        t1 = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[3]/td[3]/b/text()').extract())
        t2  = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[3]/td[4]/b/text()').extract())
        item[item['s2']] = t1[3:-2] + ' , '  + t2[3:-2]


        temp = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[4]/td[2]/b/text()').extract())
        item['s3'] = temp[3:-2]
        t1 = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[4]/td[3]/b/text()').extract())
        t2  = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[4]/td[4]/b/text()').extract())
        item[item['s3']] = t1[3:-2] + ' , '  + t2[3:-2]    

        temp = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[5]/td[2]/b/text()').extract())
        item['s4'] = temp[3:-2]
        t1 = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[5]/td[3]/b/text()').extract())
        t2  = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[5]/td[4]/b/text()').extract())
        item[item['s4']] = t1[3:-2] + ' , '  + t2[3:-2]

        temp = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[6]/td[2]/b/text()').extract())
        item['s5'] = temp[3:-2]
        t1 = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[6]/td[3]/b/text()').extract())
        t2  = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[6]/td[4]/b/text()').extract())
        item[item['s5']] = t1[3:-2] + ' , '  + t2[3:-2]

        temp = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[7]/td[2]/b/text()').extract())
        item['s6'] = temp[3:-2]
        t1 = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[7]/td[3]/b/text()').extract())
        t2  = format(resp.xpath('//*[@id="Pane0_content"]/table[1]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[7]/td[4]/b/text()').extract())
        item[item['s6']] = t1[3:-2] + ' , '  + t2[3:-2]

        temp = format(resp.xpath('//*[@id="ctl00_ContentPlaceHolder1_tr1"]/td[3]/text()').extract())
        item['gp'] = temp[3:-2]
        temp = format(resp.xpath('//*[@id="Pane0_content"]/table[3]/tbody/tr[2]/td[3]/text()').extract())
        print temp[5:-7]
        item['tot'] = temp[5:-7]
        self.add_in_sheet(item)
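
# parse_result above reads every field as format(resp.xpath(...).extract()) and
# then slices off the "[u'" prefix and "']" suffix. A hedged sketch of the more
# direct route using extract_first(), which also avoids an exception when a
# field is missing; the helper name is illustrative.
def first_text(resp, xpath, default=''):
    """Return the first text node matched by `xpath`, or `default` when the
    selector matches nothing (replaces the format(...)[3:-2] slicing trick)."""
    return resp.xpath(xpath).extract_first(default=default)

# Example: item['name'] = first_text(resp, '//*[@id="lblname"]/text()')
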
    def parse_photo(self, response):

        self.driver.get(response.url)
        #find the 'Load More' button in the 'comments' section and load all of them
        try:
            while True:                
                self.driver.find_element_by_xpath('//button[@class="_l086v _ifrvy"]').click()
        except:
            pass
        #All comments have been loaded, once again pass the "body" argument back in
        response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')

        li_class = response1.xpath("//li[@class='_nk46a']")

        data = InstagramItem()
        data['href'] = response1.url
        data['username'] = response1.xpath(".//header/div//a[1]/@title").extract()
        data['username_href'] = response1.xpath(".//header//div/a[1]/@href").extract()
        data['location'] = response1.xpath(".//header//div//a[2]/@title").extract()
        data['location_href'] = response1.xpath(".//header//div//a[2]/@href").extract()
        data['likes'] = response1.xpath(".//span[@class='_tf9x3']/span/text()").extract()
        data['time'] = response1.xpath(".//a[@class='_rmo1e']/time/@datetime").extract()
        data['comments'] = defaultdict()
        for i in li_class:
            try:
                data['comments'][str(i.xpath(".//a/@title").extract())] = i.xpath(".//span//text()").extract()
            except:
                pass
        yield data
        #the data is stored in a csv by the command which is run from the console
        #'scrapy crawl instascraper -o data.csv'
###########################################################################
##parse_commenters begins to follow the commenters to their pages where we
##can scrape their data. It has several callback functions, without which
##it does not provide full functionality
#
#----Collect all of the urls of each person who commented on a photo------
#    def parse_commenters(self, response):
#        
#        self.driver.get(response.url)
#        for i in xrange(0, 2):
#            try:                
#                self.driver.find_element_by_xpath('//button[@class="_l086v _ifrvy"]').click()
#            except:
#                pass
#      
#        response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
#        
#        user_links = response1.xpath("//a[@class='_4zhc5 notranslate _iqaka']/@href").extract()
#        for user in user_links:
#            url = response.urljoin(user)
#            yield scrapy.Request(url, callback=self.parse_commenter_hrefs)
##-----------Load all the photos of each person who commented--------------       
#    def parse_commenter_hrefs(self, response):
#
#        self.driver.get(response.url)
#        time.sleep(3)
#        posts = self.driver.find_element_by_class_name("_bkw5z").text
##-------How many posts are there? How many time should we scroll down?----
#        p = int(posts)   
#        
#        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#        try:
#            load_more = self.driver.find_element_by_link_text('Load more')
#            load_more.click()
#        except:
#            pass
#        
#     
#        for i in xrange(0, p/12):
#            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#            time.sleep(3)
#        
#        try:
#            response1 = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')  
##-------For each photo on the page, call the scraping function------------
#            photo_links = response1.xpath("//a[contains(@class, '_8mlbc _vbtk2 _t5r8b')]/@href").extract()
#            for photo in photo_links:
#                url = response.urljoin(photo)
#                yield scrapy.Request(url, callback=self.parse_locations)
#        except:
#            pass
#        
##-- This function will scrape the data frome each photo and store in an object-------    
#    def parse_locations(self, response):
#        data = InstagramItem()
#        data['href'] = response.url
#        data['username'] = response.xpath("//a[@class='_4zhc5 notranslate _ook48']/@title").extract()
#        data['username_href'] = response.xpath("//a[@class='_4zhc5 notranslate _ook48']/@href").extract()
#        data['location'] = response.xpath("//a[@class='_kul9p _rnlnu']/@title").extract()
#        data['location_href'] = response.xpath("//a[@class='_kul9p _rnlnu']/@href").extract()
##        data['likes'] = response.xpath(".//span[@class='_tf9x3']/span/text()").extract()
##        data['time'] = response.xpath(".//a[@class='_rmo1e']/time/@datetime").extract()
#        yield data
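
# The commented-out parse_commenter_hrefs above works out how many times to
# scroll a profile from its post counter, assuming roughly 12 thumbnails load
# per scroll (the p/12 loop in the original). A small sketch of that
# calculation on its own; the parameter names are illustrative.
import math


def scrolls_needed(post_count, per_scroll=12, initially_visible=12):
    """Number of scroll-to-bottom rounds needed to reveal `post_count` posts
    when `initially_visible` thumbnails are already on screen and each round
    loads about `per_scroll` more."""
    remaining = max(0, int(post_count) - initially_visible)
    return int(math.ceil(float(remaining) / per_scroll))

# Example: scrolls_needed("40") evaluates to 3.
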
Exemplo n.º 52
0
	def parse(self,response):
		item = {}
		min_booking_price = min_area = max_booking_price = max_area = area = max_resale_price = min_resale_price= -1
		status = apartment_type = apartment_bhk = project_detail= builderName= address ="" 
		superBuiltupArea = builtupArea = min_book_price = max_book_price = min_sale_price = max_sale_price = bhk = -1
		try :
			name = ''.join(response.xpath('//div[@class="bannerOver"]//span[@itemprop="name"]//text()').extract())
		except:
			name = ""
		
		try :
			facts = response.xpath('//div[@id="xidFactTable"]/div[contains(@class,"factBox")]')
			for fact in facts:
				try :
					head = ''.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factLbl")]//text()').extract())
					if "Possession" in head:
						status =' '.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal1")]//text()').extract())
					
					if "Address" in head:
						address =' '.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal")]//text()').extract())
				
					if "Configuration" in head:
						apartment_type =''.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal1")]//text()').extract())
						apartment_bhk = ''.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal2")]//text()').extract()).replace('BHK','')
					
					if "Total Project Area" in head:
						area =''.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal")]//text()').extract()).replace('Acres','')
						area  = float(area)*43560
					
					if "Saleable Area" in head:
						areas = ''.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal")]//text()').extract())
						areas = re.findall('\d+', areas)
						min_area = float(areas[0])
						max_area = float(areas[1])

					if "Resale Price" in head:
						price =''.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal")]//text()').extract())
						price = (re.sub(r'[^\x00-\x7F]', " ", price))
						iscr = 'Crore' in price
						islac = 'Lac' in price
						price = price.replace('Crore','').replace('Lac','').strip()
						min_resale_price,max_resale_price = map(float,price.split('to'))
						min_resale_price *= 10000000 if iscr else 1
						max_resale_price *= 10000000 if iscr else 1
						min_resale_price *= 100000 if islac else 1
						max_resale_price *= 100000 if islac else 1

					if "New Booking Base Price" in head:
						price =''.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal")]//text()').extract())
						price = (re.sub(r'[^\x00-\x7F]', " ", price))
						iscr = 'Crore' in price
						islac = 'Lac' in price
						price = price.replace('Crore','').replace('Lac','').strip()
						min_booking_price, max_booking_price = map(float,price.split('to'))
						min_booking_price *= 10000000 if iscr else 1
						max_booking_price *= 10000000 if iscr else 1
						min_booking_price *= 100000 if islac else 1
						max_booking_price *= 100000 if islac else 1

					if "Project Details" in head:
						project_detail =' '.join(fact.xpath('div[contains(@class,"factData")]//div[contains(@class,"factVal")]//text()').extract())
						project_detail = (re.sub(r'[^\x00-\x7F]', " ", project_detail))
						
				except:
					pass
		except :
			pass
		self.driver.get(response.url)
		amenity = []
		try :
			
			resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
			basic = resp.xpath('//div[contains(@class,"Basic")]/div/div')
			for bas in basic:
				try :
					amen = ''.join(bas.xpath('div/text()').extract())
					amenity += [amen.encode('utf8')]
				except:
					pass
		except :
			pass
		try :
			extras = resp.xpath('//div[@class="xidPrmAmn"]//li[not(@class)]/text()').extract()
			for extra in extras:
				try :
					amenity += [extra.encode('utf8')]
				except:
					pass
		except:
			pass

		det = []
		try :
			details = resp.xpath('//div[@class="elems-cs"]')
			for detail in details:
				try :
					try :
						bhk = ''.join(detail.xpath('div[@class="head"]/div//text()').extract())[0].encode('utf8')
						bhk = int(bhk.replace('BHK Apartment',''))
					except :
						pass

					more_det = detail.xpath('div[@class="unit-d-cs"]')
					for internal_det in more_det:
						try :
							if "super built-up area" in ''.join(internal_det.xpath('div[1]/text()').extract()):
								superBuiltupArea = (''.join(internal_det.xpath('div[2]/text()').extract())).encode('utf8')

							if "Built-up area" in ''.join(internal_det.xpath('div[1]/text()').extract()):
								builtupArea = (''.join(internal_det.xpath('div[2]/text()').extract())).encode('utf8')

							if "New Booking Base Price" in ''.join(internal_det.xpath('div[1]/text()').extract()):
								price = (''.join(internal_det.xpath('div[2]/text()').extract()))
								iscr = 'Crore' in price
								islac = 'Lac' in price
								price = price.replace('Crore','').replace('Lac','')
								if '-' in price:
									price = price.split('-')
									min_book_price = float(price[0])
									max_book_price = float(price[1])
								else :
									min_book_price = max_book_price = float(price)
								min_book_price *= 10000000 if iscr else 1
								max_book_price *= 10000000 if iscr else 1
								min_book_price *= 100000 if islac else 1
								max_book_price *= 100000 if islac else 1

							if "Resale Price" in ''.join(internal_det.xpath('div[1]/text()').extract()):
								price = (''.join(internal_det.xpath('div[2]/text()').extract()))
								iscr = 'Crore' in price
								islac = 'Lac' in price
								price = price.replace('Crore','').replace('Lac','')
								if '-' in price:
									price = price.split('-')
									min_sale_price = float(price[0])
									max_sale_price = float(price[1])
								else :
									min_sale_price = max_sale_price = float(price)
								min_sale_price *= 10000000 if iscr else 1
								max_sale_price *= 10000000 if iscr else 1
								min_sale_price *= 100000 if islac else 1
								max_sale_price *= 100000 if islac else 1
						except:
							pass
					det += [{'bhk':bhk,
					'superBuiltupArea':superBuiltupArea,
					'builtupArea':builtupArea,
					'min_book_price':min_book_price,
					'max_book_price':max_book_price,
					'min_sale_price':min_sale_price,
					'max_sale_price':max_sale_price}]
				except :
					pass
		except :
			pass

		try :
			builderName = ''.join(resp.xpath('//span[@id="item_manufacturer"]//text()').extract())
		except :
			pass
		item['url'] = response.url
		item['projectName'] = name.encode('utf8')
		item['status'] = status.encode('utf8')
		item['projectType'] = apartment_type.encode('utf8')
		item['bhk'] = apartment_bhk.encode('utf8')
		item['area'] = area
		item['min_area'] = min_area
		item['max_area'] = max_area
		item['min_booking_price'] = min_booking_price
		item['max_booking_price'] = max_booking_price
		item['min_resale_price'] = min_resale_price
		item['max_resale_price'] = max_resale_price
		item['amenity'] = amenity
		item['address'] = address.encode('utf8')
		item['units'] = det
		item['website']  = (response.url).split('/')[2].split('.')[1]
		item['project_detail'] = project_detail.encode('utf8')
		item['builderName'] = builderName.encode('utf-8')

		try :
			item = convert(item)
		except :
			traceback.print_exc()
			item = {}
		fire = firebase.FirebaseApplication('https://abcapp-8345a.firebaseio.com/',None)
		fire.put('/','temp',item)
		return
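
# The parse method above, like several spiders that follow, repeats the same
# dance for Indian price strings: detect "Lac"/"Crore" (or "L"/"Cr"), strip the
# unit and scale by 1e5 or 1e7. A hedged helper sketch for one such string; the
# unit spellings handled are the ones seen in these examples, and the function
# name is illustrative.
import re


def to_rupees(price_text):
    """Convert an Indian price string such as "1.25 Crore", "85 Lac" or "95 L"
    into a float number of rupees; returns None when no number is present."""
    numbers = re.findall(r'[\d.]+', price_text.replace(',', ''))
    if not numbers:
        return None
    value = float(numbers[0])
    if re.search(r'cr', price_text, re.I):
        value *= 10000000      # 1 Crore = 1,00,00,000 rupees
    elif re.search(r'\bl(ac|akh)?\b', price_text, re.I):
        value *= 100000        # 1 Lac = 1,00,000 rupees
    return value

# Example: to_rupees("1.25 Crore") -> 12500000.0, to_rupees("85 Lac") -> 8500000.0
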
Exemplo n.º 53
0
	def parse(self, response):
		self.driver.get(response.url)
		try:
			WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//div[@class="row listing"]')))
		except TimeoutException:
			return
		resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
		block = resp.xpath('//div[@class="row listing"]')
		for box in block:
			try :
				price = ''.join(box.xpath('div[2]/div[2]/div/div[1]/div[2]/p/span[2]/text()').extract())
				iscr = 'Cr' in price
				islac = 'L' in price
				price = price.replace(',','').replace('Cr','').replace('L','')
				price = float(price.encode('utf8'))
				price *= 10000000 if iscr else 1
				price *= 100000 if islac else 1
				bhk = ''.join(box.xpath('div[2]/div[1]/div/div/h4/a/span[1]/text()').extract())
				bhk = bhk.split()[0].replace('.5','')
				if ("bhk" in bhk.lower()) and not("1bhk" in bhk.lower() or ('1 bhk') in bhk.lower()) :
					ppf = ''.join(box.xpath('div[2]/div[2]/div/div[2]/div[2]/p/text()').extract())
					ppf = float(ppf)
					ppf = price/ppf
					if bhk in self.obj:
						self.obj[bhk]['min'] = self.obj[bhk]['min'] if price > self.obj[bhk]['min'] else price
						self.obj[bhk]['max'] = self.obj[bhk]['max'] if price < self.obj[bhk]['max'] else price
						self.obj[bhk]['count'] += 1
						self.obj[bhk]['avg'] += ppf
					else :
						self.obj[bhk] = {'min':price, 'max':price, 'count':1,'avg':ppf}
			except :
				pass

		while 1:
			next_button = self.driver.find_element_by_xpath('//span[@class="icon-navigate_next"]')
			check = resp.xpath('//ul[@class="pagination pageNumber"]/li/@style').extract()[-1]
			if "none" in check:
				print self.obj
				with open(os.path.dirname(__file__) +'/../../price.json','w')as file:
					file.write(json.dumps(self.obj))
				return
			actions = ActionChains(self.driver)
			actions.click(next_button)
			actions.perform()
			time.sleep(2)
			resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
			block = resp.xpath('//div[@class="row listing"]')
			for box in block:
				try :
					price = ''.join(box.xpath('div[2]/div[2]/div/div[1]/div[2]/p/span[2]/text()').extract())
					iscr = 'Cr' in price
					islac = 'L' in price
					price = price.replace(',','').replace('Cr','').replace('L','')
					price = float(price.encode('utf8'))
					price *= 10000000 if iscr else 1
					price *= 100000 if islac else 1
					bhk = ''.join(box.xpath('div[2]/div[1]/div/div/h4/a/span[1]/text()').extract())
					bhk = bhk.split()[0].replace('.5','')
					if "bhk" in bhk.lower() and not("1bhk" in bhk.lower() or ('1 bhk') in bhk.lower()) :
						ppf = ''.join(box.xpath('div[2]/div[2]/div/div[2]/div[2]/p/text()').extract())
						ppf = float(ppf)
						ppf = price/ppf
						if bhk in self.obj:
							self.obj[bhk]['min'] = self.obj[bhk]['min'] if price > self.obj[bhk]['min'] else price
							self.obj[bhk]['max'] = self.obj[bhk]['max'] if price < self.obj[bhk]['max'] else price
							self.obj[bhk]['count'] += 1
							self.obj[bhk]['avg'] += ppf
						else :
							self.obj[bhk] = {'min':price, 'max':price, 'count':1,'avg':ppf}
				except :
					pass

		return 
Exemplo n.º 54
0
    def parse(self, response):
        fire = firebase.FirebaseApplication('https://abcapp-8345a.firebaseio.com/',None)
        print "some"
        time.sleep(2)
        item = {}
        min_price = max_price = price_per_sqft = min_area = max_area = 0
        is_price_fix = True
        name = description =  code = address = city = location =  status = unit_type = property_type  =""
        amenities ={}
        speciality = {}
        wow_factors =  {}
        index = {}
        connection = {}
        self.driver.get(response.url)
        # try:
        #     WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//img[@src]')))
        # except TimeoutException:
        #     return
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        try:
            name = ("".join(resp.xpath('//h1[@itemprop="name"]//text()').extract())).replace('.','')
            name = (re.sub(r'[^\x00-\x7F]', " ", name))
        except:
            pass

        try:
            min_price = "".join(resp.xpath('//span[@class="price-detail-txt"]/span[@itemprop="minPrice"]//text()').extract())
            isLac = 'L' in min_price
            isCrore = 'Cr' in min_price
            min_price = float(min_price.split()[0])
            try:
                if isLac:
                    min_price *= 100000
            except:
                pass
            try:
                if isCrore:
                    min_price *= 10000000
            except:
                pass
            max_price = "".join(resp.xpath('//span[@class="price-detail-txt"]/span[@itemprop="maxPrice"]//text()').extract())
            isLac = 'L' in max_price
            isCrore = 'Cr' in max_price
            max_price = float(max_price.split()[0])
            try:
                if isLac:
                    max_price *= 100000
            except:
                pass
            try:
                if isCrore:
                    max_price *= 10000000
            except:
                pass
        except :
            min_price = max_price = 0
            pass

        try:
            area = "".join(resp.xpath('//div[@class="proje-detais"]/p//text()').extract())
            area_list = []
            for i in area.split():
                try:
                    area_list += [float(i)]
                except:
                    pass
            min_area = float(area_list[0])
            max_area = float(area_list[1])
        except:
            max_area = min_area

        try:
            price_per = ("".join(resp.xpath('//div[@class="price-details"]/div/div/p[2]/text()').extract())).replace('\n',"").replace('\t',"").replace(',',"")
            price_per_sqft = float(re.findall('\d+', price_per)[0])
            if "sqyrd" in price_per:
                price_per_sqft *= 9
        except:
            price_per_sqft = -1.0

        try:
            address = (",".join(resp.xpath('//ul[@itemprop="address"]//*[contains(@itemprop,"address")]//text()').extract())).replace('\n',"").replace('\t',"")
            address = (re.sub(r'[^\x00-\x7F]', " ", address))
            city = address.split(',')[0]
            location = address.split(',')[-1]
            address = " ".join(address.split(','))
        except:
            pass

        try:
            description = " ".join(resp.xpath('//div[@class="aboutTextBox"]/p//text()').extract())
            description = (re.sub(r'[^\x00-\x7F]', " ", description))
        except:
            pass

        try:
            special = resp.xpath('//div[contains(@class,"AmenitiesBoxBorder")]')
            for spec in special:
                try:
                    label = (" ".join(spec.xpath('span//text()').extract()))
                    label = (re.sub(r'[^\x00-\x7F]', " ", label)).encode('utf8')
                    if label == "":
                        try:
                            speciality['other'] += [re.sub(r'[^\x00-\x7F]'," ",("".join(spec.xpath('div//li//span//text()').extract()))).encode('utf8')]
                        except:
                            speciality['other'] = [re.sub(r'[^\x00-\x7F]'," ",("".join(spec.xpath('div//li//span//text()').extract()))).encode('utf8')]
                    else:
                        speciality[label] = re.sub(r'[^\x00-\x7F]'," ",("".join(spec.xpath('div//li//span//text()').extract()))).encode('utf8')
                except:
                    pass
        except:
            pass

        try:
            amenity_category = resp.xpath('//div[@class="amenitiesSliderBox"]/div')
            for category in amenity_category:
                try:
                    category_name = "".join(category.xpath('div/div[1]/div//text()').extract())
                    category_name = re.sub(r'[^\x00-\x7F]', " ",category_name).encode('utf8')
                    amenities[category_name] = {}
                    aminity_list = category.xpath('div//li')
                    for amenity in aminity_list:
                        try:
                            header = ("".join(amenity.xpath('span[2]//text()').extract())).replace("'","").replace('/','OR')
                            header = re.sub(r'[^\x00-\x7F]'," ",header).encode('utf8')
                            availability = "".join(amenity.xpath('span[2]/@class').extract())
                            if "active" in availability:
                                amenities[category_name][header] = 1
                            else:
                                amenities[category_name][header] = 0
                        except:
                            pass
                except:
                    pass
        except:
            pass
        try:
            status = "".join(resp.xpath('//div[@class="progress-main"]//li[2]//text()').extract())
            status =  re.sub(r'[^\x00-\x7F]'," ",status)
        except:
            pass

        try:
            code = (response.url).split('/')[-2]
        except:
            pass

        try:
            project_details = resp.xpath('//div[contains(@class,"proje-detais")]')
            for details in project_details:
                if "Unit" in "".join(details.xpath('p/span/text()').extract()):
                    unit_type = ("".join(details.xpath('p/text()').extract())).replace('\n',"")
                    unit_type = re.sub(r'[^\x00-\x7F]'," ",unit_type)
                if "Property" in "".join(details.xpath('p/span/text()').extract()):
                    property_type = ("".join(details.xpath('p/text()').extract())).replace('\n',"")
                    property_type = re.sub(r'[^\x00-\x7F]', " ",property_type)
        except:
            pass

        try:
            wow_factor = resp.xpath('//div[contains(@class,"wow-Factors-section")]//li')
            for factor in wow_factor:
                value = ("".join(factor.xpath('span//text()').extract())).replace('\n',"")
                key = ("".join(factor.xpath('small//text()').extract())).replace('\n',"").replace('.','').replace('/','-')
                value = (re.sub(r'[^\x00-\x7F]', " ", value)).encode('utf8')
                key = (re.sub(r'[^\x00-\x7F]', " ", key)).encode('utf8')
                wow_factors[key] = value
        except:
            pass

        try:
            connected_road = resp.xpath('//div[contains(@class,"connect-roads")]//li')
            for road in connected_road:
                try:
                    value = ("".join(road.xpath('span[1]//text()').extract())).split('~')
                    dis = float(value[1].split()[0])
                    connection[value[0].encode('utf8')] = dis
                except:
                    pass
        except:
            pass

        try:
            driver_box = resp.xpath('//div[contains(@class,"decisionDriversBox")]/div/div/div')
            for box in driver_box:
                try:
                    head = ("".join(box.xpath('div//div[@class="projectCounter"]//div[@class="heading"]/text()').extract()))
                    head  = re.sub(r'[^\x00-\x7F]'," ",head).encode('utf8')
                    val = ("".join(box.xpath('div//div[@class="projectCounter"]//div[contains(@class,"Box")]/text()').extract()))
                    val = re.sub(r'[^\x00-\x7F]'," ",val).encode('utf8')
                    index[head] = val  
                except:
                    pass     
        except:
            pass

        try:
            item['name'] = name.encode('utf8')
            item['min_price'] = min_price
            item['max_price'] = max_price
            item['price_per_sqft'] = price_per_sqft
            item['address'] = address.encode('utf8')
            item['city'] = city.encode('utf8')
            item['location'] = location.encode('utf8')
            item['min_area'] = min_area
            item['max_area'] = max_area
            item['possession_status'] = status.encode('utf8')
            item['amenities'] = amenities
            item['speciality'] = speciality
            item['url'] = response.url
            item['code'] = code.encode('utf8')
            item['description'] = description.encode('utf8')
            item['unit_type'] = unit_type.encode('utf8')
            item['property_type'] = property_type.encode('utf8')
            item['index'] = index
            item['connecting_road'] = connection
            item['wow_factors'] = wow_factors
            item['more_info'] = {}

            urls = resp.xpath('//div[@class="bhkDetails"]//a/@href').extract()
            for url in urls:
                abs_url = 'http://www.squareyards.com' + url
                self.parse_deep_info(abs_url, item['more_info'])
            if item['name'] != "":
                try :
                    item = convert(item)
                    print fire.put('/','temp',item)
                except:
                    print fire.put('/','temp',{})
                    traceback.print_exc()
            else:
                print fire.put('/','temp',{})
                print response.url
        except:
            print fire.put('/','temp',{})
            traceback.print_exc()
            print response.url
        return
Exemplo n.º 55
0
    def parse_deep_info(self, abs_url, item):
        deep_item = {}
        self.driver.get(abs_url)
        # try:
        #     WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//div[@class="unitTopTable table-responsive"]//tr[2]/td[2]')))
        # except TimeoutException:
        #     return
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        min_price = max_price = living_area  = bedrooms = bathrooms = kitchens = servent_rooms = carpet_area = built_up_area =  0
        code = name = ""
        balconies = {}
        Room_private_area = {}
        living_common_area = {}
        open_area = {}
        additional_area = {}
        try:
            code = abs_url.split('/')[-2]
        except:
            pass

        try:
            name = ("".join(resp.xpath('//h1[@itemprop="name"]//text()').extract())).split()[:2]
            name = "-".join(name)
        except:
            pass

        try:
            min_price = "".join(resp.xpath('//div[@class="unitTopTable table-responsive"]//tr[2]/td[2]//text()').extract())
            isLac = 'L' in min_price
            isCrore = 'Cr' in min_price
            min_price = float(min_price.split()[0])
            try:
                if isLac:
                    min_price *= 100000
            except:
                pass
            try:
                if isCrore:
                    min_price  *= 10000000
            except:
                pass
        except:
            pass

        try :
            max_price = "".join(resp.xpath('//div[@class="unitTopTable table-responsive"]//tr[2]/td[3]//text()').extract())
            isLac = 'L' in max_price
            isCrore = 'Cr' in max_price
            max_price = float(max_price.split()[0])   
            try :
                if isLac:
                    max_price *= 100000
            except :
                pass
            try :
                if isCrore:
                    max_price  *= 10000000
            except :
                pass
        except:
            pass    

        try :
            more_info = resp.xpath('//div[@class="unit-left-section"]//ul/li')
            for info in more_info:
                value = "".join(info.xpath('span//text()').extract())
                try :
                    if "Living" in value:
                        living_area = int(value.split()[0])
                except :
                    pass
                try :
                    if "Bed" in value:
                        bedrooms = int(value.split()[0])
                except:
                    pass
                try :
                    if "Bath" in value:
                        bathrooms = int(value.split()[0])
                except :
                    pass
                try :
                    if "Kitchen" in value:
                        kitchens = int(value.split()[0])
                except :
                    pass
                try :
                    if "Servant" in value:
                        servent_rooms = int(value.split()[0])
                except :
                    pass
                try :
                    if "Balcon" in value:

                        balconies['count'] = int(value.split()[0])
                        balconies['size_in_sqft'] = int((value.split()[2])[1:])
                except :
                    pass
        except:
            pass        

        try :
            more_info = resp.xpath('//div[@class="unit-loder"]//div[@ng-if="!isFragment"]')
            for info in more_info:
                header = "".join(info.xpath('div//p//text()').extract())
                try :
                    if "Carpet" in header:
                        carpet_area = int(("".join(info.xpath('div//small//text()').extract())).split()[0])
                except :
                    pass
                try :
                    if "BuiltUp" in header:
                        built_up_area = int(("".join(info.xpath('div//small//text()').extract())).split()[0])
                except :
                    pass
        except:
            pass

        try :
            private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[1]/div[1]//tr')
            for area in private_areas:
                try :
                    length = breadth = area_sqft = 0.0
                    temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                    # pprint(temp)
                    # input()
                    try :
                        length = float(temp[1])
                    except :
                        pass
                    try :
                        breadth = float(temp[2])
                    except :
                        pass
                    try :
                        area_sqft =  float(temp[3].split()[0])
                    except :
                        pass
                    try :
                        Room_private_area[temp[0].split()[0].encode('utf8')] = {'Length':length, 'Breadth':breadth,'Area' : area_sqft}
                    except :
                        pass
                except :
                    pass
        except :
            pass
        try :
            private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[1]/div[2]//tr')
            for area in private_areas:
                try :
                    length = breadth = area_sqft = 0.0
                    temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                    # pprint(temp)
                    # input()
                    try :
                        length = float(temp[1])
                    except :
                        pass
                    try :
                        breadth = float(temp[2])
                    except :
                        pass
                    try :
                        area_sqft =  float(temp[3].split()[0])
                    except :
                        pass
                    try :
                        living_common_area[temp[0].split()[0].encode('utf8')] = {'Length':length, 'Breadth':breadth,'Area' : area_sqft}
                    except :
                        pass
                except :
                    pass
        except :
            pass
        try :
            private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[2]/div[1]//tr')
            for area in private_areas:
                try:
                    length = breadth = area_sqft = 0.0
                    temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                    # pprint(temp)
                    # input()
                    try:
                        length = float(temp[1])
                    except:
                        pass
                    try:
                        breadth = float(temp[2])
                    except:
                        pass
                    try:
                        area_sqft = float(temp[3].split()[0])
                    except:
                        pass
                    try:
                        open_area[temp[0].split()[0].encode('utf8')] = {'Length':length, 'Breadth':breadth,'Area' : area_sqft}
                    except:
                        pass
                except:
                    pass
        except:
            pass

        try:
            private_areas = resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[2]/div[2]//tr')
            for area in private_areas:
                try:
                    length = breadth = area_sqft = 0.0
                    temp = area.xpath('td[@class="ng-binding"]//text()').extract()
                    # pprint(temp)
                    # input()
                    try:
                        length = float(temp[1])
                    except:
                        pass
                    try:
                        breadth = float(temp[2])
                    except:
                        pass
                    try:
                        area_sqft = float(temp[3].split()[0])
                    except:
                        pass
                    try:
                        additional_area[temp[0].split()[0].encode('utf8')] = {'Length': length, 'Breadth': breadth,'Area' : \
                        area_sqft}
                    except:
                        pass
                except:
                    pass
        except:
            pass

        deep_item['min_price'] = min_price
        deep_item['max_price'] = max_price
        deep_item['carpet_area'] = carpet_area
        deep_item['built_up_area'] = built_up_area
        deep_item['bedrooms'] = bedrooms
        deep_item['bathrooms'] = bathrooms
        deep_item['balconies'] = balconies
        deep_item['servent_room'] = servent_rooms
        deep_item['living_area'] = living_area
        deep_item['kitchen'] = kitchens
        deep_item['code'] = code.encode('utf8')
        deep_item['room_private_areas'] = Room_private_area
        deep_item['living_common_areas'] = living_common_area
        deep_item['open_areas'] = open_area
        deep_item['additional_areas'] = additional_area

        try :
            item[name.encode('utf8')] += [deep_item]
        except:
            item[name.encode('utf8')] = [deep_item]
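The three dimension tables above (living/common, open and additional areas) are parsed with the same row logic repeated verbatim. A helper along the following lines could replace the duplicated try blocks; it is only a sketch that assumes the same column order as the loops above (label, length, breadth, "NNN sqft"), and the method name parse_dimension_rows is hypothetical:

    def parse_dimension_rows(self, rows):
        # Turn a SelectorList of <tr> rows into {label: {'Length', 'Breadth', 'Area'}}.
        # Missing or non-numeric cells fall back to 0.0, matching the loops above.
        def to_float(cell):
            try:
                return float(cell.split()[0])
            except (IndexError, ValueError):
                return 0.0

        parsed = {}
        for row in rows:
            cells = row.xpath('td[@class="ng-binding"]//text()').extract()
            if not cells or not cells[0].split():
                continue
            label = cells[0].split()[0].encode('utf8')
            parsed[label] = {
                'Length': to_float(cells[1]) if len(cells) > 1 else 0.0,
                'Breadth': to_float(cells[2]) if len(cells) > 2 else 0.0,
                'Area': to_float(cells[3]) if len(cells) > 3 else 0.0,
            }
        return parsed

Each block above would then reduce to one call, e.g. living_common_area = self.parse_dimension_rows(resp.xpath('//div[contains(@class,"unitdimensionsArea")]/div/div[1]/div[2]//tr')).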
Exemplo n.º 56
0
    def parse_property_info(self, response):
        item = SquareyardItem()

        min_price = max_price = price_per_sqft  = min_area = max_area  =  0
        is_price_fix = 1
        name = description =  code = address = city = location =  status = unit_type = property_type  =""
        amenities ={}
        speciality = {}
        wow_factors =  {}
        index = {}
        connection = []
        self.driver.get(response.url)
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//img[@src]')))
        except TimeoutException:
            return
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        try :
            name = ''.join(resp.xpath('//h1[@itemprop="name"]//text()').extract())
        except :
            pass

        try :
            full_price = ''.join(resp.xpath('//span[@class="price-detail-txt"]//text()').extract())
            full_price_list = []
            for i in full_price.split() :
                try :
                    full_price_list += [float(i)]
                except :
                    pass
            min_price = float(full_price_list[0])
            try :
                max_price = float(full_price_list[1])
            except :
                pass
            try :
                if "Lac" in full_price:
                    min_price *= 100000
                    max_price *= 100000
            except :
                pass
            try :
                if "Cr" in full_price:
                    min_price  *= 10000000
                    max_price  *= 10000000
            except :
                pass
        except :
            pass

        try :
            area = ''.join(resp.xpath('//div[@class="proje-detais"]/p//text()').extract())
            area_list = []
            for i in area.split() :
                try :
                    area_list += [float(i)]
                except :
                    pass
            min_area = float(area_list[0])
            max_area = float(area_list[1])
        except :
            max_area = min_area

        try:
            price_per = (''.join(resp.xpath('//div[@class="price-details"]/div/div/p[2]/text()').extract())).replace('\n','').replace('\t','').replace(',','')
            priceunit = price_per
            price_per_sqft = []
            for i in price_per.split() :
                try :
                    price_per_sqft += [float(i)]
                except :
                    pass
            price_per_sqft = int(price_per_sqft[0])
            if "sqyrd" in priceunit:
                # 1 sq yd = 9 sq ft, so a per-sq-yd rate is divided by 9 to get a per-sq-ft rate
                price_per_sqft = int(round(price_per_sqft / 9.0))
            
        except:
            pass

        try :
            address = (','.join(resp.xpath('//ul[@itemprop="address"]//*[contains(@itemprop,"address")]//text()').extract())).replace('\n','').replace('\t','')
            city = address.split(',')[0]
            location = address.split(',')[-1]
            address = ' '.join(address.split(','))
        except:
            pass
        
        try:
            description = '\n'.join(resp.xpath('//div[@class="aboutTextBox"]/p//text()').extract())
        except:
            pass

        try :
            special = resp.xpath('//div[contains(@class,"AmenitiesBoxBorder")]')
            speciality['other'] = []
            for spec in special:
                try :
                    label = (''.join(spec.xpath('span//text()').extract())).encode('utf8')
                    if label == "":
                        speciality['other'] += [(''.join(spec.xpath('div//li//span//text()').extract())).encode('utf8')]
                    else :
                        speciality[label] = (''.join(spec.xpath('div//li//span//text()').extract())).encode('utf8')
                except :
                    pass
        except :
            pass

        try :
            amenity_category = resp.xpath('//div[@class="amenitiesSliderBox"]/div')
            for category in amenity_category:
                try :
                    category_name = ''.join(category.xpath('div/div[1]/div//text()').extract()).encode('utf8')
                    amenities[category_name] = {}
                    aminity_list = category.xpath('div//li')
                    for amenity in aminity_list:
                        try :
                            header = (''.join(amenity.xpath('span[2]//text()').extract())).encode('utf8')
                            availability = ''.join(amenity.xpath('span[2]/@class').extract())
                            if "active" in availability:
                                amenities[category_name][header] = True
                            else :
                                amenities[category_name][header] = False
                        except :
                            pass
                except :
                    pass
        except :
            pass
        try :
            status = ''.join(resp.xpath('//div[@class="progress-main"]//li[2]//text()').extract())
        except :
            pass

        try :
            code = (response.url).split('/')[-2]
        except :
            pass

        try :
            project_details = resp.xpath('//div[contains(@class,"proje-detais")]')
            for details in project_details:
                if "Unit" in ''.join(details.xpath('p/span/text()').extract()):
                    unit_type = (''.join(details.xpath('p/text()').extract())).replace('\n','')
                if "Property" in ''.join(details.xpath('p/span/text()').extract()):
                    property_type = (''.join(details.xpath('p/text()').extract())).replace('\n','')
        except :
            pass

        try :
            wow_factor = resp.xpath('//div[contains(@class,"wow-Factors-section")]//li')
            for factor in wow_factor:
                value = (''.join(factor.xpath('span//text()').extract())).replace('\n','').encode('utf8')
                key = (''.join(factor.xpath('small//text()').extract())).replace('\n','').encode('utf8')
                wow_factors[key] = value
        except :
            pass

        try :
            connected_road = resp.xpath('//div[contains(@class,"connect-roads")]//li')
            for road in connected_road:
                try :
                    value = (''.join(road.xpath('span[1]//text()').extract())).split('~')
                    dis = float(value[1].split()[0])
                    connection += [{'name':value[0].encode('utf8'), 'distance': dis}]
                except :
                    pass
        except :
            pass

        try :
            driver_box = resp.xpath('//div[contains(@class,"decisionDriversBox")]/div/div/div')
            for box in driver_box:
                try :
                    head = (''.join(box.xpath('div//div[@class="projectCounter"]//div[@class="heading"]/text()').extract())).encode('utf8')
                    val = (''.join(box.xpath('div//div[@class="projectCounter"]//div[contains(@class,"Box")]/text()').extract())).encode('utf8')
                    index[head] = val  
                except :
                    pass     
        except :
            pass

        item['name'] = name.encode('utf8')
        item['min_price'] = min_price
        item['max_price'] = max_price
        item['price_per_sqft'] = price_per_sqft
        item['address'] = address.encode('utf8')
        item['city'] = city.encode('utf8')
        item['location'] = location.encode('utf8')
        item['min_area'] = min_area
        item['max_area'] = max_area
        item['possession_status'] = status.encode('utf8')
        item['amenities'] = amenities
        item['speciality'] = speciality
        item['url'] = response.url
        item['code'] = code.encode('utf8')
        item['description'] = description.encode('utf8')
        item['unit_type'] = unit_type.encode('utf8')
        item['property_type'] = property_type.encode('utf8')
        item['index'] = index
        item['connecting_road'] = connection
        item['wow_factors'] = wow_factors
        item['more_info'] = {}

        urls = resp.xpath('//div[@class="bhkDetails"]//a/@href').extract()
        for url in urls:
            abs_url = 'http://www.squareyards.com' + url
            self.parse_deep_info(abs_url, item['more_info'])
        
        yield item
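The Lac/Cr handling above is repeated for min_price, max_price and again in parse_deep_info below. A standalone converter, sketched here (price_to_rupees is an illustrative name, not part of the spider), keeps the unit logic in one place:

def price_to_rupees(price_text):
    # Convert a display string such as '85.5 Lac' or '1.2 Cr' to rupees,
    # using the first token that parses as a number (commas stripped).
    amount = 0.0
    for token in price_text.split():
        try:
            amount = float(token.replace(',', ''))
            break
        except ValueError:
            continue
    if 'Cr' in price_text:
        return amount * 10000000   # 1 crore = 1e7
    if 'Lac' in price_text or 'Lakh' in price_text:
        return amount * 100000     # 1 lakh = 1e5
    return amount

For a range such as "85 - 95 Lac" the caller still splits out the two numbers first; the helper only applies the unit multiplier.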
Exemplo n.º 57
0
    def parse_deep_info(self, abs_url, main_item):
        item = {}
        self.driver.get(abs_url)
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//span[@itemprop="minPrice"]')))
        except TimeoutException:
            return
        resp = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')

        balconies = min_price = max_price = living_area = bedrooms = bathrooms = kitchens = servent_rooms = carpet_area = built_up_area = 0
        code = name = ""
        try :
            # this helper is called with abs_url, not a scrapy response
            code = (abs_url.split('/')[-2]).encode('utf8')
        except :
            pass

        try :
            name = (''.join(resp.xpath('//h1[@itemprop="name"]//text()').extract())).split()
            name = ''.join([name[0],name[1]])
        except :
            pass

        try :
            full_price = ''.join(resp.xpath('//span[@itemprop="minPrice"]//text()').extract())
            min_price = float(full_price.split()[0])   
            try :
                if "Lac" in full_price:
                    min_price *= 100000
            except :
                pass
            try :
                if "Cr" in full_price:
                    min_price  *= 10000000
            except :
                pass
        except:
            pass

        try :
            full_price = ''.join(resp.xpath('//span[@itemprop="maxPrice"]//text()').extract())
            max_price = float(full_price.split()[0])   
            try :
                if "Lac" in full_price:
                    max_price *= 100000
            except :
                pass
            try :
                if "Cr" in full_price:
                    max_price  *= 10000000
            except :
                pass
        except:
            pass    

        try :
            more_info = resp.xpath('//div[@class="unit-left-section"]//ul/li')
            for info in more_info:
                value = ''.join(info.xpath('span//text()').extract())
                try :
                    if "Living" in value:
                        living_area = int(value.split()[0])
                except :
                    pass
                try :
                    if "Bed" in value:
                        bedrooms = int(value.split()[0])
                except:
                    pass
                try :
                    if "Bath" in value:
                        bathrooms = int(value.split()[0])
                except :
                    pass
                try :
                    if "Kitchen" in value:
                        kitchens = int(value.split()[0])
                except :
                    pass
                try :
                    if "Servant" in value:
                        servent_rooms = int(value.split()[0])
                except :
                    pass
                try :
                    if "Balcon" in value:
                        balconies = int(value.split()[0])
                except :
                    pass
        except:
            pass        

        try :
            more_info = resp.xpath('//div[@class="unit-loder"]//div[@ng-if="!isFragment"]')
            for info in more_info:
                header = ''.join(info.xpath('div//p//text()').extract())
                try :
                    # the block label is in 'header'; 'value' was a leftover from the previous loop
                    if "Carpet" in header:
                        carpet_area = int((''.join(info.xpath('div//small//text()').extract())).split()[0])
                except :
                    pass
                try :
                    if "BuiltUp" in header:
                        built_up_area = int((''.join(info.xpath('div//small//text()').extract())).split()[0])
                except :
                    pass
        except:
            pass

        item['min_price'] = min_price
        item['max_price'] = max_price
        item['carpet_area'] = carpet_area
        item['built_up_area'] = built_up_area
        item['bedrooms'] = bedrooms
        item['bathrooms'] = bathrooms
        item['balconies'] = balconies
        item['servent_room'] = servent_rooms
        item['living_area'] = living_area
        item['kitchen'] = kitchens
        item['code'] = code.encode('utf8')

        if name in main_item:
            main_item[name] += [item]
        else :
            main_item[name] = [item]
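parse_property_info and parse_deep_info repeat the same Selenium-to-Scrapy handoff: driver.get, an explicit wait, then wrapping page_source in a TextResponse. A shared helper is sketched below using only the imports these spiders already rely on; the function name selenium_response is illustrative:

from scrapy.http import TextResponse
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def selenium_response(driver, url, wait_xpath, timeout=10):
    # Load the page in the shared webdriver and wait for wait_xpath to appear,
    # then hand the rendered HTML back as a TextResponse so .xpath() works on it.
    driver.get(url)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, wait_xpath)))
    except TimeoutException:
        return None   # caller treats None like the early returns above
    return TextResponse(url=driver.current_url,
                        body=driver.page_source,
                        encoding='utf-8')

parse_deep_info could then start with resp = selenium_response(self.driver, abs_url, '//span[@itemprop="minPrice"]') and return early when resp is None.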
Exemplo n.º 58
0
	def parse_property_info(self, response):
		item = BuyItem()
		self.driver.get(response.url)
		try:
			# wait on the element itself; an XPath ending in //text() selects text nodes, which Selenium cannot locate
			WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, '//div[@class="npPrice"]')))
		except TimeoutException:
			return
		response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
		is_resale = price = bedrooms = bathrooms = price_per_sqft = 0
		is_price_fix = 1
		print "\n",response.url,"\n"
		try :
			full_price = ','.join(response.xpath('//div[@class="npPrice"]//text()').extract())
			print full_price
			price = float(full_price.split(',')[3])
			if 'Cr' in full_price :
				price *= 10000000
			if "Lac" in full_price :
				price *= 100000
		except :
			pass

		if price == 0:
			try :
				full_price = ' '.join(response.xpath('//span[@id="pdPrice"]//text()').extract())
				print full_price
				price = float(full_price.split()[0])
				if 'Cr' in full_price :
					price *= 10000000
				if "Lacs" in full_price :
					price *= 100000
			except :
				pass
		print price

		try :
			price_per_sqft = float((response.xpath('//div[@class="npBasePrice"]/span/text()').extract())[3])
		except :
			pass

		try :
			# extract() returns a list; join it before splitting out the numeric part
			price_per_sqft = float((''.join(response.xpath('//div[@id="pricePerUnitArea"]/text()').extract())).split()[1])
		except :
			pass


		city  = address = location = ""
		try :
			address = (''.join(response.xpath('//div[@class="project-location"]/span//text()').extract())).replace('\n','')
			city = address.split(',')[-2]
			location = (response.xpath('//a[@class="ttlLink"]/text()').extract()[1])		
		except :
			pass

		if address == "":
			try :
				address = (''.join(response.xpath('//span[@id="address"]/text()').extract())).replace('\n','')
				city = address.split(',')[-2]
				location = (response.xpath('//a[@class="ttlLink"]/text()').extract()[1])	
			except :
				pass

		status =  ""
		min_area = max_area = 0.0
		try :
			status = ''.join(response.xpath('//div[@class="npPossessionDate"]/text()').extract()[2])	
		except:
			pass

		if status == "":
			try :
				status = ''.join(response.xpath('//div[@class="pdDetailInfoOther"]/div[3]/span/text()').extract())	
			except:
				pass
		try :
			temp = ''.join(response.xpath('//div[@class="npAreaPrice"]/span[1]/text()').extract())
			temp = temp.split()
			temp = [float(i) for i in temp if i.isdigit()]
			try :
				min_area = temp[0]
				max_area = temp[1]
			except :
				max_area = min_area
		except :
			pass

		SuperBuiltupArea = 0.0
		try :
			SuperBuiltupArea = ' '.join(response.xpath('//div[@class="npPrjArea"]/span//text()').extract())
			if "acres" in SuperBuiltupArea:
				SuperBuiltupArea = float(SuperBuiltupArea.split()[0])*43560
			else :
				SuperBuiltupArea = float(SuperBuiltupArea.split()[0])
		except :
			pass


		if min_area == 0.0 :
			try :
				min_area = float(''.join(response.xpath('//span[@id="superbuiltupArea_span"]/text()').extract()))
				max_area = min_area

			except :
				pass

			try :
				SuperBuiltupArea = ''.join(response.xpath('//div[@id="socAreaOccupied"]/text()').extract())
				if "acres" in SuperBuiltupArea:
					SuperBuiltupArea = float(SuperBuiltupArea.split()[0])*43560
				else :
					SuperBuiltupArea = float(SuperBuiltupArea.split()[0])	
			except:
				pass


		launch_date = CarpetArea = posted_on = ''

		try :
			posted_on = (''.join(response.xpath('//span[@class="pdPropDate"]/text()').extract()).replace(',','')).split()
			posted_on[0],posted_on[1] = posted_on[1],posted_on[0]
			posted_on[1] =  find_month(posted_on[1])
			posted_on = ' '.join(posted_on)

		except :
			pass



		Description =amenities  = age_of_property = ''
		speciality = {}

		try :
			Description = (''.join(response.xpath('//div[@id = "description"]//text()').extract())).replace('\n','')
		except:
			pass
		
		try :
			amenities  = ','.join(response.xpath('//div[@id="amenitiesSection"]/div/div[2]/div/div/div//text()').extract())
		except:
			pass

		if amenities == "":
			try :
				amenities  = ','.join(response.xpath('//div[@id="features"]/div/div//text()').extract())
			except:
				pass

		try :
			special  = response.xpath('//div[@class=" pdOtherFacts responsive"]/div')
			for spec in special :
				try :
					# read the label and value from the current row ('spec'), not the whole selector list
					header = ''.join(spec.xpath('span[1]//text()').extract())
					text = ''.join(spec.xpath('span[2]//text()').extract())
					speciality[header] = text
				except:
					pass

		except :
			pass



		agent_name = agent_type =""
		try :
			agent_type = ''.join(response.xpath('//div[@id="QryFormPd"]//span[@class="dealerWidgetHeading"]//text()').extract())
			agent_type = agent_type.replace('Details','')
			agent_name = (','.join(response.xpath('//div[@id="QryFormPd"]//div[@class="c2dInfo"]//text()').extract())).split()[0]
		except :
			pass

		if agent_name == "" :
			try :
				agent_name = (' '.join(response.xpath('//div[@id="QryFormPd"]//div[@class="c2dRunCaptionAbtDev "]//span[@class="spanBold"]//text()').extract()))
				agent_name = agent_name.replace('About ','')
			except :
				pass

		try :
			resale = response.xpath('//span[@id="transactionType"]//text()').extract()
			if 'Resale' in resale:
				is_resale = 1
		except:
			pass

		try :
			bedrooms = int((''.join(response.xpath('//div[@id="bedRoomNum"]//text()').extract())).split()[0])
		except :
			pass

		try :
			bathrooms = int((''.join(response.xpath('//div[@id="bathroomNum"]//text()').extract())).split()[0])
		except :
			pass

		try :
			age_of_property = ''.join(response.xpath('//div[@id="agePossessionLbl"]//text()').extract())
		except :
			pass

		try :
			additional_rooms = ''.join(response.xpath('//div[@id="additionalRooms"]//text()').extract())
			amenities += (", " + additional_rooms)	
		except :
			pass

		more_info = {}

		item['price'] = price
		item['price_per_sqft'] = price_per_sqft
		item['is_price_fix'] = is_price_fix
		item['address'] = address.encode('utf8')
		item['city'] = city.encode('utf8')
		item['location'] = location.encode('utf8')
		item['min_area'] = min_area
		item['max_area'] = max_area
		item['bathrooms'] = bathrooms
		item['bedrooms'] = bedrooms
		item['SuperBuiltupArea'] = SuperBuiltupArea
		item['age_of_property'] = age_of_property.encode('utf8')
		item['launch_date'] = launch_date.encode('utf8')
		item['possession_status'] = status.encode('utf8')
		item['agent_name'] = agent_name.encode('utf8')
		item['agent_type'] = agent_type.encode('utf8')
		item['amenities'] = amenities.encode('utf8')
		item['speciality'] = speciality
		item['more_info'] = more_info
		item['is_resale'] = is_resale
		item['url'] = response.url

		yield item
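The acres-to-square-feet conversion for SuperBuiltupArea appears twice above with the same constant. A small converter, sketched here (area_to_sqft is not an existing helper; it assumes the first token of the text is the number, as in the xpaths above), would make the fallback path read the same as the primary one:

def area_to_sqft(area_text):
	# Convert strings like '2.5 acres' or '1200 sq ft' to square feet.
	try:
		value = float(area_text.split()[0])
	except (IndexError, ValueError):
		return 0.0
	if 'acre' in area_text.lower():
		value *= 43560   # 1 acre = 43,560 sq ft
	return value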