Python get_html_from_url 예제들, request_url.get_html_from_url Python 예제들

예제 #1

0

파일 보기

    def parse_product_data(self, url):
        """Fetch a lazada product page and upsert its data into MongoDB.

        The page is downloaded with ``request_url.get_html_from_url``. If
        the helper returns the string ``'404'``, the document stored under
        ``url`` is flagged ``is_active = 0`` and nothing else happens.
        Otherwise the html is parsed with html5lib and, when an
        ``<h1 id="prod_title">`` element exists, the product id, name,
        image and price are upserted keyed on ``product_id``.

        Every exception is caught and appended to ``fail.txt``; nothing is
        raised to the caller and the method always returns ``None``.

        :param url: product page url; the digits right before the trailing
            ``.html`` are used as the product id.
        """
        try:
            html = request_url.get_html_from_url(url, USE_TOR)
            if not html:
                return

            if html == '404':
                # helper returned the '404' marker: flag the stored
                # document as inactive instead of re-parsing it
                self.mongo_collection.update(
                    {"url": url}, {"$set": {"is_active": 0}})
                return

            parsed_html = BeautifulSoup(html, 'html5lib')

            product_name_obj = parsed_html.body.find(
                'h1', {'id': 'prod_title'})
            if not product_name_obj:
                # no product title on the page, nothing to store
                return

            # product id comes from the url, e.g. ".../name-12345.html"
            product_id = int(re.search(r'(\d+)\.html$', url).group(1))

            product_name = product_name_obj.text.strip()

            # image url is read from the first span.productImage element
            image_spans = parsed_html.body.findAll(
                'span', {"class": "productImage"})
            product_image = image_spans[0]['data-image']

            price = parsed_html.body.find(
                'span', {'id': 'product_price'}).text.strip()

            # strip a whitespace-prefixed "VND" suffix and a trailing
            # ".<digits>" part; raw string avoids invalid-escape warnings
            price = re.sub(r'\s+VND|\.\d+$', '', price)

            product_data = {
                'product_id': product_id,
                'name': product_name,
                'image': product_image,
                'price': int(price),
                'url': url,
                'is_active': 1,
            }

            # insert or replace the document for this product
            self.mongo_collection.update(
                {'product_id': product_id}, product_data, upsert=True)
        except Exception as e:
            # best-effort crawler: record the failure and carry on
            #@TODO: send mail notify
            with open('fail.txt', 'a') as file_:
                # newline-terminated so appended entries do not run together
                file_.write('Cannot parse data from lazada. Error: ' +
                            str(e.args) + '\n')

예제 #2

0

파일 보기

파일: tiki.py 프로젝트: nguyenhoaibao/crawler

	def parse_product_data(self, url):
		"""Fetch a tiki product page and upsert its data into MongoDB.

		The page is downloaded with ``request_url.get_html_from_url``. If
		the helper returns the string ``'404'``, the document stored under
		``url`` is flagged ``is_active = 0`` and nothing else happens.
		Otherwise the html is parsed and, when an ``<h1 class="item-name">``
		element exists, the product id, name, image and price are upserted
		keyed on ``product_id``.

		Every exception is caught and appended to ``fail.txt``; nothing is
		raised to the caller and the method always returns ``None``.

		:param url: product page url; the digits in its ``p<digits>.html``
			part are used as the product id.
		"""
		try:
			html = request_url.get_html_from_url(url, USE_TOR)
			if not html:
				return

			if html == '404':
				# helper returned the '404' marker: flag the stored
				# document as inactive instead of re-parsing it
				self.mongo_collection.update({"url" : url}, {"$set" : {"is_active" : 0}})
				return

			parsed_html = BeautifulSoup(html.encode('utf-8'))

			product_obj = parsed_html.body.find('h1', {'class' : 'item-name'})
			if not product_obj:
				# no product title on the page, nothing to store
				return

			product_name = product_obj.text.strip()

			# product id comes from the url, e.g. ".../name-p12345.html"
			product_id = int(re.search(r'.*p(\d+)\.html', url).group(1))

			product_image = parsed_html.body.find('img', attrs={'itemprop': 'image'})['src']

			price = parsed_html.body.find('span', attrs={'itemprop': 'price'}).text.strip()
			# drop non-ASCII characters (presumably the currency sign), then
			# decode back to text: on Python 3 encode() returns bytes, which
			# cannot be mixed with str patterns or str.replace arguments
			price = (u'%s' % price).encode("ascii", "ignore").decode("ascii")

			# remove the dots used as thousands separators
			price = price.replace('.', '')

			product_data = {
				'product_id' : product_id,
				'name'  : product_name,
				'image' : product_image,
				'price' : int(price),
				'url'   : url,
				'is_active': 1
			}

			# insert or replace the document for this product
			self.mongo_collection.update({'product_id': product_id}, product_data, upsert = True)
		except Exception as e:
			# best-effort crawler: record the failure and carry on
			#@TODO: send mail notify
			with open('fail.txt', 'a') as file_:
				# newline-terminated so appended entries do not run together
				file_.write('Cannot parse data from tiki. Error: ' + str(e.args) + '\n')

예제 #3

0

파일 보기

	def parse_product_data(self, url):
		"""Fetch a cdiscount product page and upsert its data into MongoDB.

		The page is downloaded with ``request_url.get_html_from_url`` and
		parsed. When an ``<h1 itemprop="name">`` element exists, the product
		id, name, image and price are upserted keyed on ``product_id``.

		Every exception is caught and appended to ``fail.txt``; nothing is
		raised to the caller and the method always returns ``None``.

		:param url: product page url, stored alongside the parsed data.
		"""
		try:
			html = request_url.get_html_from_url(url, USE_TOR)
			if not html:
				return

			parsed_html = BeautifulSoup(html.encode('utf-8'))

			product_obj = parsed_html.body.find('h1', attrs={'itemprop': 'name'})
			if not product_obj:
				# no product title on the page, nothing to store
				return

			product_name = product_obj.text.strip()

			# product id is carried by the data-pid attribute of the
			# "estimated-time-select" dropdown
			product_id = int(parsed_html.body.find('select', {'id': 'estimated-time-select'})['data-pid'])

			product_image = parsed_html.body.find('a', {'id': 'zoom1'}).find('img')['src']

			# every span.price that has an id attribute; when exactly two are
			# found the second one is used, otherwise the first
			price_tags = parsed_html.body.findAll('span', attrs={'class': 'price', 'id': re.compile(r".*")})
			chosen_tag = price_tags[1] if len(price_tags) == 2 else price_tags[0]
			price = u'%s' % chosen_tag.text.strip()

			# drop non-ASCII characters (presumably the currency sign), then
			# decode back to text: on Python 3 encode() returns bytes, which
			# cannot be mixed with str patterns or str.replace arguments
			price = price.encode("ascii", "ignore").decode("ascii")

			# remove the dots used as thousands separators
			price = price.replace('.', '')

			product_data = {
				'product_id' : product_id,
				'name'  : product_name,
				'image' : product_image,
				# was "(int)price", a C-style cast that is a SyntaxError
				'price' : int(price),
				'url'   : url
			}

			# insert or replace the document for this product
			self.mongo_collection.update({'product_id': product_id}, product_data, upsert = True)
		except Exception as e:
			# best-effort crawler: record the failure and carry on
			#@TODO: send mail notify
			with open('fail.txt', 'a') as file_:
				# newline-terminated so appended entries do not run together
				file_.write('Cannot parse data from cdiscount. Error: ' + str(e.args) + '\n')

예제 #4

0

파일 보기

파일: lazada.py 프로젝트: nguyenhoaibao/crawler

	def parse_product_data(self, url):
		"""Fetch a lazada product page and upsert its data into MongoDB.

		The page is downloaded with ``request_url.get_html_from_url``. If
		the helper returns the string ``'404'``, the document stored under
		``url`` is flagged ``is_active = 0`` and nothing else happens.
		Otherwise the html is parsed with html5lib and, when an
		``<h1 id="prod_title">`` element exists, the product id, name,
		image and price are upserted keyed on ``product_id``.

		Every exception is caught and appended to ``fail.txt``; nothing is
		raised to the caller and the method always returns ``None``.

		:param url: product page url; the digits right before the trailing
			``.html`` are used as the product id.
		"""
		try:
			html = request_url.get_html_from_url(url, USE_TOR)
			if not html:
				return

			if html == '404':
				# helper returned the '404' marker: flag the stored
				# document as inactive instead of re-parsing it
				self.mongo_collection.update({"url" : url}, {"$set" : {"is_active" : 0}})
				return

			parsed_html = BeautifulSoup(html, 'html5lib')

			product_name_obj = parsed_html.body.find('h1', {'id' : 'prod_title'})
			if not product_name_obj:
				# no product title on the page, nothing to store
				return

			# product id comes from the url, e.g. ".../name-12345.html"
			product_id = int(re.search(r'(\d+)\.html$', url).group(1))

			product_name = product_name_obj.text.strip()

			# image url is read from the first span.productImage element
			image_spans = parsed_html.body.findAll('span', {"class": "productImage"})
			product_image = image_spans[0]['data-image']

			price = parsed_html.body.find('span', {'id': 'product_price'}).text.strip()

			# strip a whitespace-prefixed "VND" suffix and a trailing
			# ".<digits>" part; raw string avoids invalid-escape warnings
			price = re.sub(r'\s+VND|\.\d+$', '', price)

			product_data = {
				'product_id' : product_id,
				'name'  : product_name,
				'image' : product_image,
				'price' : int(price),
				'url'   : url,
				'is_active': 1
			}

			# insert or replace the document for this product
			self.mongo_collection.update({'product_id': product_id}, product_data, upsert = True)
		except Exception as e:
			# best-effort crawler: record the failure and carry on
			#@TODO: send mail notify
			with open('fail.txt', 'a') as file_:
				# newline-terminated so appended entries do not run together
				file_.write('Cannot parse data from lazada. Error: ' + str(e.args) + '\n')

예제 #5

0

파일 보기

파일: crawl.py 프로젝트: nguyenhoaibao/crawler

	def get_soup_html(self, url):
		"""Download ``url`` and return it as a pre-processed soup object.

		The html is fetched with ``request_url.get_html_from_url`` using
		``self.use_tor``. An empty string is returned when nothing was
		downloaded; otherwise the parsed document is passed through
		``self.before_find_link`` and that result is returned.

		:param url: address of the page to download.
		"""
		page_source = request_url.get_html_from_url(url, self.use_tor)
		if not page_source:
			return ''

		# trick for lazada pages: they are parsed with html5lib, every other
		# site goes through BeautifulSoup's default parser
		#TODO: test other page
		parser_args = ('html5lib',) if self.init_url == 'http://www.lazada.vn' else ()
		document = BeautifulSoup(page_source, *parser_args)

		# let the crawler format the soup before links are searched in it
		return self.before_find_link(document)

예제 #6

0

파일 보기

    def get_soup_html(self, url):
        """Download ``url`` and return it as a pre-processed soup object.

        The html is fetched with ``request_url.get_html_from_url`` using
        ``self.use_tor``. An empty string is returned when nothing was
        downloaded; otherwise the parsed document is passed through
        ``self.before_find_link`` and that result is returned.

        :param url: address of the page to download.
        """
        page_source = request_url.get_html_from_url(url, self.use_tor)
        if not page_source:
            return ''

        # trick for lazada pages: they are parsed with html5lib, every other
        # site goes through BeautifulSoup's default parser
        #TODO: test other page
        if self.init_url == 'http://www.lazada.vn':
            parser_args = ('html5lib',)
        else:
            parser_args = ()
        document = BeautifulSoup(page_source, *parser_args)

        # let the crawler format the soup before links are searched in it
        return self.before_find_link(document)

예제 #7

0

파일 보기

	def parse_product_data(self, url):
		"""Fetch a nguyenkim product page and upsert its data into MongoDB.

		The page is downloaded with ``request_url.get_html_from_url`` and
		parsed. When an ``<h1 class="block_product-title">`` element exists,
		the product id, name, image and price are upserted keyed on
		``product_id`` (kept as the text found on the page, not converted
		to an integer).

		Every exception is caught and appended to ``fail.txt``; nothing is
		raised to the caller and the method always returns ``None``.

		:param url: product page url, stored alongside the parsed data.
		"""
		try:
			html = request_url.get_html_from_url(url, USE_TOR)
			if not html:
				return

			parsed_html = BeautifulSoup(html.encode('utf-8'))

			product_obj = parsed_html.body.find('h1', {'class' : 'block_product-title'})
			if not product_obj:
				# no product title on the page, nothing to store
				return

			product_name = product_obj.text.strip()

			# product id is the text of the span whose id starts with
			# "product_code"
			product_id = parsed_html.body.find('span', attrs={'id': re.compile(r"product_code.*")}).text.strip()

			product_image = parsed_html.body.find('img', {"class": "pict"})['src']

			# price is read from the first span.price-num element
			price = parsed_html.body.findAll('span', {'class' : 'price-num'})[0].text.strip()
			# strip a whitespace-prefixed "VND" suffix and every dot; raw
			# string avoids invalid-escape warnings
			price = re.sub(r'\s+VND|\.', '', price)

			product_data = {
				'product_id' : product_id,
				'name'  : product_name,
				'image' : product_image,
				# was "(int)price", a C-style cast that is a SyntaxError
				'price' : int(price),
				'url'   : url
			}

			# insert or replace the document for this product
			self.mongo_collection.update({'product_id': product_id}, product_data, upsert = True)
		except Exception as e:
			# best-effort crawler: record the failure and carry on
			#@TODO: send mail notify
			with open('fail.txt', 'a') as file_:
				# newline-terminated so appended entries do not run together
				file_.write('Cannot parse data from nguyenkim. Error: ' + str(e.args) + '\n')

예제 #8

0

파일 보기

    def parse_product_data(self, url):
        """Fetch a tiki product page and upsert its data into MongoDB.

        The page is downloaded with ``request_url.get_html_from_url``. If
        the helper returns the string ``'404'``, the document stored under
        ``url`` is flagged ``is_active = 0`` and nothing else happens.
        Otherwise the html is parsed and, when an ``<h1 class="item-name">``
        element exists, the product id, name, image and price are upserted
        keyed on ``product_id``.

        Every exception is caught and appended to ``fail.txt``; nothing is
        raised to the caller and the method always returns ``None``.

        :param url: product page url; the digits in its ``p<digits>.html``
            part are used as the product id.
        """
        try:
            html = request_url.get_html_from_url(url, USE_TOR)
            if not html:
                return

            if html == '404':
                # helper returned the '404' marker: flag the stored
                # document as inactive instead of re-parsing it
                self.mongo_collection.update(
                    {"url": url}, {"$set": {"is_active": 0}})
                return

            parsed_html = BeautifulSoup(html.encode('utf-8'))

            product_obj = parsed_html.body.find('h1', {'class': 'item-name'})
            if not product_obj:
                # no product title on the page, nothing to store
                return

            product_name = product_obj.text.strip()

            # product id comes from the url, e.g. ".../name-p12345.html"
            product_id = int(re.search(r'.*p(\d+)\.html', url).group(1))

            product_image = parsed_html.body.find(
                'img', attrs={'itemprop': 'image'})['src']

            price = parsed_html.body.find(
                'span', attrs={'itemprop': 'price'}).text.strip()
            # drop non-ASCII characters (presumably the currency sign), then
            # decode back to text: on Python 3 encode() returns bytes, which
            # cannot be mixed with str patterns or str.replace arguments
            price = (u'%s' % price).encode("ascii", "ignore").decode("ascii")

            # remove the dots used as thousands separators
            price = price.replace('.', '')

            product_data = {
                'product_id': product_id,
                'name': product_name,
                'image': product_image,
                'price': int(price),
                'url': url,
                'is_active': 1,
            }

            # insert or replace the document for this product
            self.mongo_collection.update(
                {'product_id': product_id}, product_data, upsert=True)
        except Exception as e:
            # best-effort crawler: record the failure and carry on
            #@TODO: send mail notify
            with open('fail.txt', 'a') as file_:
                # newline-terminated so appended entries do not run together
                file_.write('Cannot parse data from tiki. Error: ' +
                            str(e.args) + '\n')