def parse_product_data(self, url):
    """Scrape one lazada product page and upsert it into the mongo collection.

    The page is downloaded with ``request_url.get_html_from_url``. When the
    helper returns the string ``'404'`` the document matching ``url`` is
    flagged ``is_active = 0`` and nothing else happens. Otherwise the product
    id (trailing digits of the url), name, image and price are extracted and
    the document keyed by ``product_id`` is upserted with ``is_active = 1``.

    This is best-effort: any exception is appended to ``fail.txt`` (one line
    per failure) and swallowed, so the method always returns ``None``.

    :param url: product page url; it must end in ``<digits>.html`` or the
        product is logged as a failure.
    """
    try:
        html = request_url.get_html_from_url(url, USE_TOR)
        if not html:
            return

        if html == '404':
            self.mongo_collection.update(
                {"url": url}, {"$set": {"is_active": 0}})
            return

        parsed_html = BeautifulSoup(html, 'html5lib')
        product_name_obj = parsed_html.body.find('h1', {'id': 'prod_title'})
        if not product_name_obj:
            # Not a product page (no title element): nothing to store.
            return

        # get product id
        product_id = int(re.search(r'(\d+)\.html$', url).group(1))
        # parse product name
        product_name = product_name_obj.text.strip()
        # parse image
        product_image = parsed_html.body.findAll(
            'span', {"class": "productImage"})[0]['data-image']
        # parse price
        price = parsed_html.body.find(
            'span', {'id': 'product_price'}).text.strip()
        # strip the " VND" suffix and a trailing ".<digits>" part
        price = re.sub(r'\s+VND|\.\d+$', '', price)

        product_data = {
            'product_id': product_id,
            'name': product_name,
            'image': product_image,
            'price': int(price),
            'url': url,
            'is_active': 1,
        }
        # insert data to mongo
        self.mongo_collection.update(
            {'product_id': product_id}, product_data, upsert=True)
    except Exception as e:
        # log info here
        # @TODO: send mail notify
        with open('fail.txt', 'a') as file_:
            # Trailing newline keeps each failure on its own line.
            file_.write('Cannot parse data from lazada. Error: ' +
                        str(e.args) + '\n')
def parse_product_data(self, url):
    """Scrape one tiki product page and upsert it into the mongo collection.

    The page is downloaded with ``request_url.get_html_from_url``. When the
    helper returns the string ``'404'`` the document matching ``url`` is
    flagged ``is_active = 0`` and nothing else happens. Otherwise the product
    id (the digits after ``p`` in ``...p<digits>.html``), name, image and
    price are extracted and the document keyed by ``product_id`` is upserted
    with ``is_active = 1``.

    This is best-effort: any exception is appended to ``fail.txt`` (one line
    per failure) and swallowed, so the method always returns ``None``.

    :param url: product page url containing ``p<digits>.html``.
    """
    try:
        html = request_url.get_html_from_url(url, USE_TOR)
        if not html:
            return

        if html == '404':
            self.mongo_collection.update(
                {"url": url}, {"$set": {"is_active": 0}})
            return

        parsed_html = BeautifulSoup(html.encode('utf-8'))
        # parse product name
        product_obj = parsed_html.body.find('h1', {'class': 'item-name'})
        if not product_obj:
            # Not a product page (no title element): nothing to store.
            return

        # product name
        product_name = product_obj.text.strip()
        # get product id
        product_id = int(re.search(r'.*p(\d+)\.html', url).group(1))
        # parse image
        product_image = parsed_html.body.find(
            'img', attrs={'itemprop': 'image'})['src']
        # parse price
        price = parsed_html.body.find(
            'span', attrs={'itemprop': 'price'}).text.strip()
        price = u'%s' % price
        # Drop every non-ASCII character, then decode back to text so the
        # str regex below works on Python 3 as well as Python 2 (a bytes
        # subject with a str pattern raises TypeError on Python 3).
        price = price.encode("ascii", "ignore").decode("ascii")
        # remove the dot thousands separators
        price = re.sub(r'\.', '', price)

        product_data = {
            'product_id': product_id,
            'name': product_name,
            'image': product_image,
            'price': int(price),
            'url': url,
            'is_active': 1,
        }
        # insert data to mongo
        self.mongo_collection.update(
            {'product_id': product_id}, product_data, upsert=True)
    except Exception as e:
        # log info here
        # @TODO: send mail notify
        with open('fail.txt', 'a') as file_:
            # Trailing newline keeps each failure on its own line.
            file_.write('Cannot parse data from tiki. Error: ' +
                        str(e.args) + '\n')
def parse_product_data(self, url):
    """Scrape one cdiscount product page and upsert it into the mongo collection.

    The page is downloaded with ``request_url.get_html_from_url``. The
    product name, id (``data-pid`` of the ``estimated-time-select`` element),
    image and price are extracted and the document keyed by ``product_id``
    is upserted with the fields ``product_id``, ``name``, ``image``,
    ``price`` and ``url``.

    This is best-effort: any exception is appended to ``fail.txt`` (one line
    per failure) and swallowed, so the method always returns ``None``.

    :param url: product page url.
    """
    try:
        html = request_url.get_html_from_url(url, USE_TOR)
        if not html:
            return

        parsed_html = BeautifulSoup(html.encode('utf-8'))
        # parse product name
        product_obj = parsed_html.body.find('h1', attrs={'itemprop': 'name'})
        if not product_obj:
            # Not a product page (no title element): nothing to store.
            return

        # product name
        product_name = product_obj.text.strip()
        # get product id
        product_id = int(parsed_html.body.find(
            'select', {'id': 'estimated-time-select'})['data-pid'])
        # parse image
        product_image = parsed_html.body.find(
            'a', {'id': 'zoom1'}).find('img')['src']
        # parse price: every span with class "price" that carries an id
        price_spans = parsed_html.body.findAll(
            'span', attrs={'class': 'price', 'id': re.compile(r".*")})
        # With exactly two matches the second one is used, otherwise the
        # first (an empty result raises IndexError and is logged below).
        chosen = price_spans[1] if len(price_spans) == 2 else price_spans[0]
        price = u'%s' % chosen.text.strip()
        # Drop every non-ASCII character, then decode back to text so the
        # str regex below works on Python 3 as well as Python 2 (a bytes
        # subject with a str pattern raises TypeError on Python 3).
        price = price.encode("ascii", "ignore").decode("ascii")
        # remove the dot thousands separators
        price = re.sub(r'\.', '', price)

        product_data = {
            'product_id': product_id,
            'name': product_name,
            'image': product_image,
            # was the C-style cast "(int)price", which is a syntax error
            'price': int(price),
            'url': url,
        }
        # insert data to mongo
        self.mongo_collection.update(
            {'product_id': product_id}, product_data, upsert=True)
    except Exception as e:
        # log info here
        # @TODO: send mail notify
        with open('fail.txt', 'a') as file_:
            # Trailing newline keeps each failure on its own line.
            file_.write('Cannot parse data from cdiscount. Error: ' +
                        str(e.args) + '\n')
def parse_product_data(self, url):
    """Scrape one lazada product page and upsert it into the mongo collection.

    The page is downloaded with ``request_url.get_html_from_url``. When the
    helper returns the string ``'404'`` the document matching ``url`` is
    flagged ``is_active = 0`` and nothing else happens. Otherwise the product
    id (trailing digits of the url), name, image and price are extracted and
    the document keyed by ``product_id`` is upserted with ``is_active = 1``.

    This is best-effort: any exception is appended to ``fail.txt`` (one line
    per failure) and swallowed, so the method always returns ``None``.

    :param url: product page url; it must end in ``<digits>.html`` or the
        product is logged as a failure.
    """
    try:
        html = request_url.get_html_from_url(url, USE_TOR)
        if not html:
            return

        if html == '404':
            self.mongo_collection.update(
                {"url": url}, {"$set": {"is_active": 0}})
            return

        parsed_html = BeautifulSoup(html, 'html5lib')
        product_name_obj = parsed_html.body.find('h1', {'id': 'prod_title'})
        if not product_name_obj:
            # Not a product page (no title element): nothing to store.
            return

        # get product id
        product_id = int(re.search(r'(\d+)\.html$', url).group(1))
        # parse product name
        product_name = product_name_obj.text.strip()
        # parse image
        product_image = parsed_html.body.findAll(
            'span', {"class": "productImage"})[0]['data-image']
        # parse price
        price = parsed_html.body.find(
            'span', {'id': 'product_price'}).text.strip()
        # strip the " VND" suffix and a trailing ".<digits>" part
        price = re.sub(r'\s+VND|\.\d+$', '', price)

        product_data = {
            'product_id': product_id,
            'name': product_name,
            'image': product_image,
            'price': int(price),
            'url': url,
            'is_active': 1,
        }
        # insert data to mongo
        self.mongo_collection.update(
            {'product_id': product_id}, product_data, upsert=True)
    except Exception as e:
        # log info here
        # @TODO: send mail notify
        with open('fail.txt', 'a') as file_:
            # Trailing newline keeps each failure on its own line.
            file_.write('Cannot parse data from lazada. Error: ' +
                        str(e.args) + '\n')
def get_soup_html(self, url):
    """Download ``url`` and return its parsed, pre-processed soup.

    The html is fetched with ``request_url.get_html_from_url`` using
    ``self.use_tor``. If nothing truthy comes back, the empty string ``''``
    is returned. Otherwise the html is parsed and handed to
    ``self.before_find_link``, whose result is returned.
    """
    # download html
    page = request_url.get_html_from_url(url, self.use_tor)
    if not page:
        return ''

    # trick for parse lazada page: it is parsed with html5lib, every other
    # site with the default parser
    # TODO: test other page
    is_lazada = self.init_url == 'http://www.lazada.vn'
    parsed = BeautifulSoup(page, 'html5lib') if is_lazada else BeautifulSoup(page)

    # format soup before find link from soup
    return self.before_find_link(parsed)
def parse_product_data(self, url):
    """Scrape one nguyenkim product page and upsert it into the mongo collection.

    The page is downloaded with ``request_url.get_html_from_url``. The
    product name, id (text of the span whose id starts with
    ``product_code``), image and price are extracted and the document keyed
    by ``product_id`` is upserted with the fields ``product_id``, ``name``,
    ``image``, ``price`` and ``url``. The product id is stored as the
    scraped text, not converted to an integer.

    This is best-effort: any exception is appended to ``fail.txt`` (one line
    per failure) and swallowed, so the method always returns ``None``.

    :param url: product page url.
    """
    try:
        html = request_url.get_html_from_url(url, USE_TOR)
        if not html:
            return

        parsed_html = BeautifulSoup(html.encode('utf-8'))
        # parse product name
        product_obj = parsed_html.body.find(
            'h1', {'class': 'block_product-title'})
        if not product_obj:
            # Not a product page (no title element): nothing to store.
            return

        # product name
        product_name = product_obj.text.strip()
        # get product id
        product_id = parsed_html.body.find(
            'span', attrs={'id': re.compile(r"product_code.*")}).text.strip()
        # parse image
        product_image = parsed_html.body.find('img', {"class": "pict"})['src']
        # parse price
        price = parsed_html.body.findAll(
            'span', {'class': 'price-num'})[0].text.strip()
        # strip the " VND" suffix and the dot thousands separators
        price = re.sub(r'\s+VND|\.', '', price)

        product_data = {
            'product_id': product_id,
            'name': product_name,
            'image': product_image,
            # was the C-style cast "(int)price", which is a syntax error
            'price': int(price),
            'url': url,
        }
        # insert data to mongo
        self.mongo_collection.update(
            {'product_id': product_id}, product_data, upsert=True)
    except Exception as e:
        # log info here
        # @TODO: send mail notify
        with open('fail.txt', 'a') as file_:
            # Trailing newline keeps each failure on its own line.
            file_.write('Cannot parse data from nguyenkim. Error: ' +
                        str(e.args) + '\n')
def parse_product_data(self, url):
    """Scrape one tiki product page and upsert it into the mongo collection.

    The page is downloaded with ``request_url.get_html_from_url``. When the
    helper returns the string ``'404'`` the document matching ``url`` is
    flagged ``is_active = 0`` and nothing else happens. Otherwise the product
    id (the digits after ``p`` in ``...p<digits>.html``), name, image and
    price are extracted and the document keyed by ``product_id`` is upserted
    with ``is_active = 1``.

    This is best-effort: any exception is appended to ``fail.txt`` (one line
    per failure) and swallowed, so the method always returns ``None``.

    :param url: product page url containing ``p<digits>.html``.
    """
    try:
        html = request_url.get_html_from_url(url, USE_TOR)
        if not html:
            return

        if html == '404':
            self.mongo_collection.update(
                {"url": url}, {"$set": {"is_active": 0}})
            return

        parsed_html = BeautifulSoup(html.encode('utf-8'))
        # parse product name
        product_obj = parsed_html.body.find('h1', {'class': 'item-name'})
        if not product_obj:
            # Not a product page (no title element): nothing to store.
            return

        # product name
        product_name = product_obj.text.strip()
        # get product id
        product_id = int(re.search(r'.*p(\d+)\.html', url).group(1))
        # parse image
        product_image = parsed_html.body.find(
            'img', attrs={'itemprop': 'image'})['src']
        # parse price
        price = parsed_html.body.find(
            'span', attrs={'itemprop': 'price'}).text.strip()
        price = u'%s' % price
        # Drop every non-ASCII character, then decode back to text so the
        # str regex below works on Python 3 as well as Python 2 (a bytes
        # subject with a str pattern raises TypeError on Python 3).
        price = price.encode("ascii", "ignore").decode("ascii")
        # remove the dot thousands separators
        price = re.sub(r'\.', '', price)

        product_data = {
            'product_id': product_id,
            'name': product_name,
            'image': product_image,
            'price': int(price),
            'url': url,
            'is_active': 1,
        }
        # insert data to mongo
        self.mongo_collection.update(
            {'product_id': product_id}, product_data, upsert=True)
    except Exception as e:
        # log info here
        # @TODO: send mail notify
        with open('fail.txt', 'a') as file_:
            # Trailing newline keeps each failure on its own line.
            file_.write('Cannot parse data from tiki. Error: ' +
                        str(e.args) + '\n')