def process_item(self, item, spider):
    """Persist a scraped item as Product + BranchProduct rows.

    Looks up the Product by (store, sku) and the BranchProduct by
    (product, branch), creating each when missing, then refreshes the
    mutable fields and commits.
    """
    # Fetch-or-create the Product identified by (store, sku).
    product = self.session.query(Product).filter_by(
        store=item["store"],
        sku=item["sku"],
    ).first()
    if product is None:
        product = Product(store=item["store"], sku=item["sku"])

    # Refresh the descriptive fields whether the row is new or existing.
    product.barcodes = item["barcodes"]
    product.brand = item["brand"]
    product.name = item["name"]
    product.description = item["description"]
    product.image_url = item["image_url"]
    self.session.add(product)
    self.session.commit()

    # Fetch-or-create the per-branch record, then refresh stock/price.
    branch_product = self.session.query(BranchProduct).filter_by(
        product=product,
        branch=item["branch"],
    ).first()
    if branch_product is None:
        branch_product = BranchProduct(product=product, branch=item["branch"])

    branch_product.stock = item["stock"]
    branch_product.price = item["price"]
    self.session.add(branch_product)
    self.session.commit()

    return item
def fetch_category(search_index, amazon_node_id): api = caching.ResponseCachingAPI(settings.AMAZON_AWS_KEY, settings.AMAZON_SECRET_KEY, settings.AMAZON_API_LOCALE, settings.AMAZON_ASSOCIATE_TAG, cachedir='cache', cachetime=86400) try: for root in api.item_search( search_index, BrowseNode=str(amazon_node_id), ResponseGroup=settings.AMAZON_RESPONSE_GROUP): for item in root.Items.Item: product = Product() product.category = Category.objects.get( amazon_node_id=amazon_node_id) product.asin = item.ASIN product.title = unicode(item.ItemAttributes.Title) product.detailpageurl = unicode(item.DetailPageURL) product.manufacturer = unicode( getattr(item.ItemAttributes, 'Manufacturer', None)) product.publisher = unicode( getattr(item.ItemAttributes, 'Publisher', None)) product.brand = unicode( getattr(item.ItemAttributes, 'Brand', None)) product.popularity = getattr(item, 'SalesRank', 1000) if hasattr(item, 'MediumImage'): product.medium_image = getattr(item.MediumImage, 'URL', None) if hasattr(item, 'LargeImage'): product.large_image = getattr(item.LargeImage, 'URL', None) if hasattr(item, 'EditorialReviews'): product.description = unicode( getattr(item.EditorialReviews.EditorialReview, 'Content', None)) if hasattr(item.Offers, 'Offer'): product.price = item.Offers.Offer.OfferListing.Price.FormattedPrice.pyval elif hasattr(item.ItemAttributes, 'ListPrice'): product.price = item.ItemAttributes.ListPrice.FormattedPrice.pyval elif hasattr(item.OfferSummary, 'LowestUsedPrice'): product.price = u'used from %s' % item.OfferSummary.LowestUsedPrice.FormattedPrice.pyval else: product.price = None product.save() except AWSError, e: if e.code == 'AWS.ParameterOutOfRange': pass # reached the api limit of 10 pages else: raise ValidationError(message=e.msg)
def fetch_category(search_index, amazon_node_id): api = caching.ResponseCachingAPI( settings.AMAZON_AWS_KEY, settings.AMAZON_SECRET_KEY, settings.AMAZON_API_LOCALE, settings.AMAZON_ASSOCIATE_TAG, cachedir='cache', cachetime=86400) try: for root in api.item_search(search_index, BrowseNode=str(amazon_node_id), ResponseGroup=settings.AMAZON_RESPONSE_GROUP): for item in root.Items.Item: product = Product() product.category = Category.objects.get(amazon_node_id=amazon_node_id) product.asin = item.ASIN product.title = unicode(item.ItemAttributes.Title) product.detailpageurl = unicode(item.DetailPageURL) product.manufacturer = unicode(getattr(item.ItemAttributes, 'Manufacturer', None)) product.publisher = unicode(getattr(item.ItemAttributes, 'Publisher', None)) product.brand = unicode(getattr(item.ItemAttributes, 'Brand', None)) product.popularity = getattr(item, 'SalesRank', 1000) if hasattr(item, 'MediumImage'): product.medium_image = getattr(item.MediumImage, 'URL', None) if hasattr(item, 'LargeImage'): product.large_image = getattr(item.LargeImage, 'URL', None) if hasattr(item, 'EditorialReviews'): product.description = unicode(getattr(item.EditorialReviews.EditorialReview, 'Content', None)) if hasattr(item.Offers, 'Offer'): product.price = item.Offers.Offer.OfferListing.Price.FormattedPrice.pyval elif hasattr(item.ItemAttributes, 'ListPrice'): product.price = item.ItemAttributes.ListPrice.FormattedPrice.pyval elif hasattr(item.OfferSummary, 'LowestUsedPrice'): product.price = u'used from %s' % item.OfferSummary.LowestUsedPrice.FormattedPrice.pyval else: product.price = None product.save() except AWSError, e: if e.code == 'AWS.ParameterOutOfRange': pass # reached the api limit of 10 pages else: raise ValidationError(message=e.msg)
def products_to_db(products):
    """
    It saves the products in the database
    :param products: dictionary with the desired information
    """
    session = load_session()
    for key, item in products.items():
        print('\n>>> Processing:', key, item['NAME'])

        # Fetch-or-create the Product keyed by (store, sku).
        product = session.query(Product).filter_by(
            store="Richart's",
            sku=item["SKU"],
        ).first()
        if product is None:
            product = Product(store="Richart's", sku=item["SKU"])

        product.barcodes = item["BARCODES"]
        product.brand = item["BRAND"].capitalize()
        product.name = item["NAME"].capitalize()
        product.description = remove_html_tags(item["DESCRIPTION"]).capitalize()
        product.image_url = item["IMAGE_URL"]
        product.category = item["FULL_CATEGORY"]
        # Package text = description with the product name stripped out.
        product.package = product.description.replace(product.name, '')
        session.add(product)
        session.commit()

        # Fetch-or-create the BranchProduct, then refresh stock/price.
        branch_product = session.query(BranchProduct).filter_by(
            product=product,
            branch=item["BRANCH"],
        ).first()
        if branch_product is None:
            branch_product = BranchProduct(product=product,
                                           branch=item["BRANCH"])
        branch_product.stock = item["STOCK"]
        branch_product.price = item["PRICE"]
        session.add(branch_product)
        session.commit()
    session.close()
def saveProduct(self, product):
    """Create or update a Product record from a scraped product dict.

    Reuses an existing record matched by title, otherwise creates a new
    one; all fields are refreshed from the dict either way.

    :param product: scraped product data (title, description, price, ...)
    :return: the saved record, or None when saving fails
    """
    try:
        record = Product()
        # Reuse the existing record when one with this title exists.
        if Product.objects.filter(title=str(product['title'])).exists():
            record = Product.objects.filter(title=str(product['title']))[0]
        record.title = str(product['title'])
        record.description = str(product['description'])
        # Missing price components default to 0 (dict.get replaces the
        # verbose "x if k in d else 0" checks; behavior is identical).
        price = product['price']
        record.current_price = formatPrice(price.get('current_price', 0))
        record.old_price = formatPrice(price.get('old_price', 0))
        record.you_save = formatPrice(price.get('you_save', 0))
        record.url = str(product['url'])
        record.images = product['images']
        if product['connections']:
            record.connection_value = str(product['connections'])
        record.manufacturer_en = str(product['manufacturer_en'])
        # Fall back to the seller name when the manufacturer is empty.
        record.brand = (str(product['manufacturer_en'])
                        if product['manufacturer_en']
                        else str(product['seller']['name']))
        record.tags = product['tags']
        record.other_specs = str(product['specs'])
        record.original_json = json.dumps(product, ensure_ascii=False)
        record.save()
        return record
    except Exception as e:
        # BUGFIX: corrected the typo "proudct" in the error message.
        print('Error during save product {} cause {} '.format(
            product['title'], str(e)))
        return None
def process_item(item):
    """Persist a scraped item as Product + BranchProduct rows.

    Creates (or updates) the Product keyed by (store, sku) and the
    BranchProduct keyed by (product, branch), committing each.

    :param item: dict with the scraped product fields
    :return: the item, unchanged
    """
    Session = sessionmaker(bind=engine)
    session = Session()
    # BUGFIX: the session was never closed, leaking a DB connection per
    # call; release it even when a query or commit raises.
    try:
        # Check if the Product already exists
        product = session.query(Product).filter_by(
            store=item["store"], sku=item["sku"]).first()
        if product is None:
            product = Product(store=item["store"], sku=item["sku"])
        product.barcodes = item["barcodes"]
        product.brand = item["brand"]
        product.name = item["name"]
        product.description = item["description"]
        product.image_url = item["image_url"]
        product.category = item["category"]
        product.package = item["package"]
        session.add(product)
        session.commit()

        # Check if the BranchProduct already exists
        branch_product = session.query(BranchProduct).filter_by(
            product=product, branch=item["branch"]).first()
        if branch_product is None:
            branch_product = BranchProduct(product=product,
                                           branch=item["branch"])
        branch_product.stock = item["stock"]
        branch_product.price = item["price"]
        session.add(branch_product)
        session.commit()
        return item
    finally:
        session.close()
def crawl_listing(self, url, ctx='', **kwargs): res = requests.get(url) res.raise_for_status() tree = lxml.html.fromstring(res.content) category = Category.objects(key=kwargs.get('key')).first() if not category: common_failed.send(sender=ctx, url=url, reason='category %s not found in db' % kwargs.get('key')) return product_nodes = tree.cssselect('div#searchResults a') for product_node in product_nodes: price = None; listprice = None price = product_node.cssselect('.price-6pm')[0].text listprice_node = product_node.cssselect('.discount') listprice = ''.join(listprice_node[0].xpath('text()')) if listprice_node else None # eliminate products of no discountIndexError: if price is None or listprice is None: # common_failed.send(sender=ctx, url=url, \ # reason='listing product %s.%s cannot crawl price info -> %s / %s' % (key, title, price, listprice)) continue key = product_node.get('data-product-id') if not key: common_failed.send(sender=ctx, url=url, reason='listing product has no key') continue combine_url = product_node.get('href') key = '%s_%s' % (key, combine_url.split('/')[-1]) match = re.search(r'https?://.+', combine_url) if not match: combine_url = '%s%s' % (HOST, combine_url) brand = product_node.cssselect('.brandName')[0].text.strip() title = product_node.cssselect('.productName')[0].text.strip() is_new = False; is_updated = False product = Product.objects(key=key).first() if not product: is_new = True product = Product(key=key) product.updated = False product.event_type = False if title and title != product.title: product.title = title is_updated = True if brand and brand != product.brand: product.brand = brand is_updated = True if combine_url and combine_url != product.combine_url: product.combine_url = combine_url is_updated = True if price and price != product.price: product.price = price is_updated = True if listprice and listprice != product.listprice: product.listprice = listprice is_updated = True if category.cats and 
set(category.cats).difference(product.dept): product.dept = list(set(category.cats) | set(product.dept or [])) is_updated = True if category.key not in product.category_key: product.category_key.append(category.key) is_updated = True if is_updated: product.list_update_time = datetime.utcnow() # To pick the product which fit our needs, such as a certain discount, brand, dept etc. selected = Picker(site='6pm').pick(product) if not selected: continue product.hit_time = datetime.utcnow() product.save() common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \ is_new=is_new, is_updated=((not is_new) and is_updated) ) print product.key; print product.brand; print product.title; \ print product.price, ' / ', product.listprice; print product.combine_url; \ print product.dept; print # Go to the next page to keep on crawling. next_page = None page_node = tree.cssselect('div.pagination') if not page_node: return last_node =page_node[0].cssselect('.last') if last_node: next_page = page_node[0].cssselect('a')[-1].get('href') if next_page: match = re.search(r'https?://.+', next_page) if not match: next_page = '%s%s' % (HOST, next_page) print next_page self.crawl_listing(url=next_page, ctx=ctx, **kwargs)
def crawl_product(self, url, casin, ctx='', **kwargs):
    """Crawl one product detail page and upsert its Product document.

    Parses the page's embedded ``parse_asin_*`` JSON payload, extracts
    images, summary, bullet list, brand, return policy and video, then
    saves the Product and emits a ``common_saved`` signal.
    """
    r = req.get(url)
    data = re.compile(r'parse_asin_\w+\((.*)\);$').search(r.text).group(1)
    data = json.loads(data)

    # Collect zoom images from the main altviews; fall back to the
    # per-asin altviews matching `casin` when the main list is empty.
    image_urls = []
    for view in data['detailJSON']['main']['altviews']:
        if view['zoomImage'] not in image_urls:
            image_urls.append(view['zoomImage'])
    if not image_urls:
        for entry in data['detailJSON']['asins']:
            if entry['asin'] == casin:
                for view in entry['altviews']:
                    if view['zoomImage'] not in image_urls:
                        image_urls.append(view['zoomImage'])
                break

    asin = data['detailJSON']['asin']
    summary = data['productDescription']['shortProdDesc']
    if data['productDescription']['bullets']:
        # NOTE(review): the original replace() arguments look like HTML
        # entities (&quot;, &#39;) that were decoded in this copy, making
        # the calls no-ops — confirm against the upstream source.
        list_info = [b.replace('"', '"').replace("'", "'")
                     for b in data['productDescription']['bullets'][0]['bulletsList']]
    else:
        list_info = []
    brand = data['detailJSON']['brand']
    returned = data['detailJSON']['returnPolicy']

    # International-shipping detection was disabled in the original and
    # is intentionally left out here.

    video = ''
    for entry in data['detailJSON']['asins']:
        if entry['asin'] == casin:
            video = entry['videos'][0]['url'] if entry['videos'] else ''
            break

    is_new, is_updated = False, False
    product = Product.objects(key=casin).first()
    if not product:
        is_new = True
        product = Product(key=casin)
    product.summary = summary
    product.list_info = list_info
    product.brand = brand
    product.shipping = 'FAST, FREE SHIPPING, FREE RETURN SHIPPING in the U.S.'
    product.returned = returned
    product.video = video
    product.image_urls = image_urls
    product.full_update_time = datetime.utcnow()
    # First full crawl of this product flips `updated` and marks it ready.
    if product.updated == False:
        product.updated = True
        ready = True
    else:
        ready = False
    product.save()
    common_saved.send(sender=ctx, obj_type='Product', key=casin, url=url,
                      is_new=is_new, is_updated=is_updated, ready=ready)
def update_product(self, product_id):
    """Update a product's data from the submitted form (PUT via _method).

    Also keeps the AisleContains association in sync when an aisle is
    supplied. Aborts 405 for a non-PUT request, 422 on lookup/update
    failure, and redirects to /products on success.
    """
    if request.form.get('_method') != 'PUT':
        # BUGFIX: Logger has no "Info" attribute (AttributeError at
        # runtime); the method is .info().
        app.logger.info(
            'Cannot perform this action. Please contact administrator')
        abort(405)
    product = Product(id=product_id)
    try:
        product = product.list_one_or_none_product()
        if product is None:
            # BUGFIX: the message wrongly said "Employee ID" for a
            # product lookup.
            app.logger.info(
                f'No data with Product ID = {product_id} could be found!')
            abort(422)
        product.id = product_id
        # Each field falls back to its current value when absent from
        # the form.
        product.name = request.form.get('name', product.name)
        product.price_per_cost_unit = request.form.get(
            'price_per_cost_unit', product.price_per_cost_unit)
        product.cost_unit = request.form.get('cost_unit', product.cost_unit)
        product.quantity_in_stock = request.form.get(
            'quantity_in_stock', product.quantity_in_stock)
        product.brand = request.form.get('brand', product.brand)
        product.production_date = request.form.get(
            'production_date', product.production_date)
        product.best_before_date = request.form.get(
            'best_before_date', product.best_before_date)
        product.plu = request.form.get('plu', product.plu)
        product.upc = request.form.get('upc', product.upc)
        # Checkbox semantics: missing from the form means "off".
        form_organic = request.form.get('organic', 'off')
        product.organic = 1 if form_organic == 'on' else 0
        product.cut = request.form.get('cut', product.cut)
        product.animal = request.form.get('animal', product.animal)
        department = request.form.get('department_name')
        # Form value looks like "<id> - <name>"; keep the id part.
        product.department_id = department.split(' - ', 2)[0]

        # Need to update aisle_number in AisleContains table as well
        aisle = request.form.get('aisle_name')
        if aisle is not None:
            aisle_number = int(aisle.split(' - ', 2)[0])
            aisle_contains = AisleContains(
                aisle_number=aisle_number,
                product_id=product_id
            )
            aisle_contains = \
                aisle_contains.list_one_or_none_aisle_contains(product)
            if aisle_contains is not None:
                aisle_contains.aisle_number = aisle_number
            else:
                aisle_contains = AisleContains(
                    aisle_number=aisle_number,
                    product_id=product_id
                )
                # If the product is associated with any aisle, this code
                # should never have been reached. The other option here
                # would be to add the association to the AisleContains
                # table.
                try:
                    aisle_contains = \
                        aisle_contains.add_aisle_contains_to_database()
                except BaseException:
                    app.logger.info(
                        f'An error occurred. Product {product_id} failed to be \
associated with Aisle {aisle_number}.')
                    abort(422)
        try:
            product.update_product_in_database()
            flash(
                f'Product {product_id} was successfully updated!', 'success')
        except BaseException:
            app.logger.info(
                f'An error occurred. Product {product_id} \
could not be updated!')
            abort(422)
    except BaseException:
        # NOTE(review): this broad catch also intercepts the inner
        # abort() HTTPExceptions, so they are logged and re-aborted as
        # 422 — confirm that is the intended behavior.
        app.logger.info(
            f'An error occurred. No data with Product ID\
= {product_id} could be found!')
        abort(422)
    return redirect('/products')
def crawl_listing(self, url, ctx='', **kwargs):
    """Crawl one Ashford listing page and upsert its discounted products.

    Resolves the category (from kwargs or the DB), parses each product
    tile for price/listprice/brand/url, updates or creates the Product,
    then either runs the picker or crawls the detail page, emitting
    common_saved for each saved product.
    """
    res = requests.get(url)
    res.raise_for_status()
    tree = lxml.html.fromstring(res.content)

    category = kwargs['category'] if kwargs.get('category') \
        else Category.objects(key=kwargs.get('key')).first()
    if not category:
        common_failed.send(sender=ctx, url=url,
                           reason='category %s not found in db' % kwargs.get('key'))
        return

    product_nodes = tree.cssselect('div#atg_store_prodList ul li')
    for product_node in product_nodes:
        info_node = product_node.cssselect('div.thumbnailInfo')[0]
        price = None
        listprice = None
        price_node = info_node.cssselect('div.our_price')[0]
        weekly_price_node = price_node.cssselect('.newPrice_value')
        sale_price_node = price_node.cssselect('#salePrice')
        # Price priority: weekly special > sale price > plain price text.
        if weekly_price_node:
            price = weekly_price_node[0].text.strip()
        elif sale_price_node:
            price = sale_price_node[0].text.strip()
        else:
            price = ''.join(price_node.xpath('.//text()')).strip()
        listprice = info_node.cssselect('div.retail_price')[0].text.strip()
        listprice = re.sub('\n', '', listprice)

        # eliminate products of no discount
        if price is None or listprice is None:
            continue

        key = info_node.cssselect('div.product_id')[0].text.strip()
        brand = info_node.cssselect('a.sameBrandProduct')[0].text.strip()
        title_node = info_node.cssselect('a.product_gender_name')[0]
        combine_url = title_node.get('href')
        match = re.search(r'https?://.+', combine_url)
        if not match:
            combine_url = '%s%s' % (HOST, combine_url)

        # BUGFIX: these initializations had been commented out, leaving
        # is_new / is_updated unbound (NameError) whenever the product
        # already existed in the database.
        is_new = False
        is_updated = False
        product = Product.objects(key=key).first()
        if not product:
            is_new = True
            product = Product(key=key)
            product.updated = False
            product.event_type = False
        if brand and brand != product.brand:
            product.brand = brand
            is_updated = True
        if combine_url and combine_url != product.combine_url:
            product.combine_url = combine_url
            is_updated = True
        if price and price != product.price:
            product.price = price
            is_updated = True
        if listprice and listprice != product.listprice:
            product.listprice = listprice
            is_updated = True
        if category.key not in product.category_key:
            product.category_key.append(category.key)
            is_updated = True

        # To pick the product which fit our needs, such as a certain
        # discount, brand, dept etc.
        try:
            selected = Picker(site='ashford').pick(product) if product.updated \
                else self.crawl_detail(ctx, is_new, is_updated, product)
        except:
            common_failed.send(sender=ctx, url=product.combine_url,
                               reason=traceback.format_exc())
            continue
        if not selected:
            continue
        if is_updated:
            product.list_update_time = datetime.utcnow()
        product.hit_time = datetime.utcnow()
        product.save()
        common_saved.send(sender=ctx, obj_type='Product', key=product.key,
                          url=product.combine_url,
                          is_new=is_new,
                          is_updated=((not is_new) and is_updated),
                          ready=(product.ready if hasattr(product, 'ready') else False))
def crawl_listing(self, url, ctx='', **kwargs): res = requests.get(url, params={'Ns': 'P_sale_flag|1'}) res.raise_for_status() tree = lxml.html.fromstring(res.content) category = Category.objects(key=kwargs.get('key')).first() if not category: print 'Category does not exist' common_failed.send(sender=ctx, url=url, reason='Category does not exist -> {0} .'.format(kwargs)) return product_nodes = tree.cssselect('div#product-container div'); no_discount_num = 0 # sometimes no discount product occurs between the discount ones ordered by sale. for product_node in product_nodes: if not product_node.get('id') or 'product' not in product_node.get('id').lower(): continue key = product_node.get('id') info_node = product_node.cssselect('div.product-text a')[0] price = None; listprice = None listprice_node = info_node.cssselect('span.product-price') price_node = info_node.cssselect('span.product-sale-price') if listprice_node: listprice = ''.join(listprice_node[0].xpath('.//text()')).strip() if price_node: price = ''.join(price_node[0].xpath('.//text()')).strip() if price is None or listprice is None: no_discount_num += 1 if no_discount_num < 3: continue return no_discount_num = 0 brand = info_node.cssselect('p span.product-designer-name')[0].text if brand: brand = brand.strip() title = info_node.cssselect('p.product-description')[0].text.strip() combine_url = info_node.get('href') is_new = False; is_updated = False product = Product.objects(key=key).first() if not product: is_new = True product = Product(key=key) product.updated = False product.event_type = False if title and title != product.title: product.title = title is_updated = True product.update_history['title'] = datetime.utcnow() if brand and brand != product.brand: product.brand = brand is_updated = True if combine_url and combine_url != product.combine_url: product.combine_url = combine_url is_updated = True product.update_history['combine_url'] = datetime.utcnow() if price and price != product.price: product.price 
= price is_updated = True if listprice and listprice != product.listprice: product.listprice = listprice is_updated = True if category.cats and set(category.cats).difference(product.dept): product.dept = list(set(category.cats) | set(product.dept or [])) is_updated = True if category.key not in product.category_key: product.category_key.append(category.key) is_updated = True if is_updated: product.list_update_time = datetime.utcnow() # To pick the product which fit our needs, such as a certain discount, brand, dept etc. selected = Picker(site='saksfifthavenue').pick(product) if not selected: continue product.hit_time = datetime.utcnow() product.save() # print product.brand; print product.title; print product.combine_url; print product.listprice, ' / ', product.price; print is_new; print is_updated # print common_saved.send(sender=ctx, obj_type='Product', key=product.key, url=product.combine_url, \ is_new=is_new, is_updated=((not is_new) and is_updated) ) # Go to the next page to keep on crawling. next_page = None page_nodes = tree.cssselect('div.pagination-container ol.pa-page-number li a') for page_node in page_nodes: if page_node.get('class') == 'next': href = page_node.get('href') match = re.search(r'https?://.+', href) next_page = href if match else '{0}/{1}'.format(HOST, href) break if next_page: print next_page self.crawl_listing(url=next_page, ctx=ctx, **kwargs)