def crawl_items():
    url = helpers.dequeue_items_url()
    if not url:
        helpers.log("WARNING: No URLs found in the queue. Retrying...")
        # pile.spawn(crawl_items)
        return

    product = Product(category="node", list_url=url, crawl_time=datetime.now(),
                      asin="", title="", product_url="", price="",
                      img_url="", img_path="")
    product.save()

    page, html = helpers.make_request(url)
    if not page:
        return

    next_link_tag = page.select("a#pagnNextLink")
    if next_link_tag:
        helpers.log("  Found 'Next' link on {}: {}".format(
            url, next_link_tag[0]["href"]))
        helpers.enqueue_items_url(next_link_tag[0]["href"])

    items = page.select('.s-result-list li.s-result-item')
    category = extractors.get_category(page)

    for item in items:
        asin = extractors.get_asin(item)
        title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        list_url = url
        price = extractors.get_price(item)
        img_url = extractors.get_primary_img(item)
        img_path = extractors.download_img(img_url, category.split(":::")[-1], asin)

        product = Product(category=category, asin=asin, title=title,
                          product_url=product_url, list_url=list_url,
                          price=price, img_url=img_url, img_path=img_path,
                          crawl_time=datetime.now())
        product.save()

    pile.spawn(crawl_items)
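# crawl_items() above relies on helpers.enqueue_items_url/dequeue_items_url,
# which are not shown in this file. A minimal sketch of what they could look
# like, assuming a Redis list backs the work queue; the client name
# (redis_client) and the key name ("items_url_queue") are assumptions, not
# taken from the original helpers module.
import redis

redis_client = redis.StrictRedis(host="localhost", port=6379, db=0)


def enqueue_items_url(url):
    # Push a listing URL onto the shared work queue.
    return redis_client.lpush("items_url_queue", url)


def dequeue_items_url():
    # Pop the next listing URL, or return None when the queue is empty.
    url = redis_client.rpop("items_url_queue")
    return url.decode("utf-8") if url else None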
def fetch_listing():
    global crawl_time

    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return
    # dequeue_url() hands back bytes from Redis; decode only after the
    # empty-queue check so a None result doesn't blow up on .decode()
    url = url.decode('utf-8')

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("div", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log("  Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])

    pile.spawn(fetch_listing)
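# fetch_listing() above expects make_request(url) to return a parsed page plus
# the raw HTML. A rough sketch assuming requests + BeautifulSoup; the
# User-Agent header and timeout are placeholder values, not taken from the
# original code.
import requests
from bs4 import BeautifulSoup


def make_request(url):
    # Fetch the page and return (parsed soup, raw html), or (None, None) on failure.
    try:
        response = requests.get(url,
                                headers={"User-Agent": "Mozilla/5.0"},
                                timeout=10)
    except requests.RequestException as e:
        log("WARNING: Request for {} failed: {}".format(url, e))
        return None, None
    if response.status_code != 200:
        log("WARNING: Got status {} for {}".format(response.status_code, url))
        return None, None
    html = response.text
    return BeautifulSoup(html, "html.parser"), html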
def fetch_listing():
    '''
    This is the root function that green threads call.

    This is essentially step 1 (but step 0 is above!)
    '''
    global crawl_time

    # Pop a random URL from the Redis listing_url_queue
    url = helpers.dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = helpers.make_request(url)
    if not page:
        return

    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = extractors.get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        product_price = extractors.get_price(item)

        product = models.ProductRecord(
            title=product_title,
            product_url=helpers.format_url(product_url),
            listing_url=helpers.format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time)
        product_id = product.save()
        helpers.download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log("  Found 'Next' link on {}: {}".format(url, next_link["href"]))
        helpers.enqueue_url(next_link["href"])

    pile.spawn(fetch_listing)
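# The docstring above refers to "step 0": setting up the green-thread pool
# (pile), the shared crawl_time, and seeding the queue before fetch_listing()
# is first spawned. A sketch of that bootstrap, assuming gevent; the pool size
# and settings.start_urls are illustrative assumptions, not values from the
# original project.
from gevent import monkey
monkey.patch_all()  # make blocking I/O (requests, redis) cooperative

from datetime import datetime
from gevent.pool import Pool

crawl_time = datetime.now()
pile = Pool(20)  # max number of concurrent green threads

if __name__ == "__main__":
    # Seed the Redis queue with the starting listing URLs, then spawn workers.
    for seed_url in settings.start_urls:
        helpers.enqueue_url(seed_url)
    for _ in range(pile.size):
        pile.spawn(fetch_listing)
    pile.join()  # wait for all green threads to finish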
def fetch_listing():
    global crawl_time

    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log("  Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])

    pile.spawn(fetch_listing)
def fetch_listing(ASIN, marketplace):
    global crawl_time

    url = marketplace.country_host + "/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=" + ASIN
    if not url:
        log("WARNING: No URLs {} found in the queue. Retrying...".format(url))
        # pile.spawn(fetch_listing)
        return

    page, html = make_request(ASIN, marketplace.country_host)
    if not page:
        log("WARNING: No page. Retrying")
        # sleep(3)
        # fetch_listing(ASIN, marketplace)
    if page is None:
        # No parsed page at all; fall back to the API lookup straight away
        return amazon_api(ASIN, url, marketplace.country_code)

    item = page

    product_image = get_primary_img(item)
    if not product_image:
        log("No product image detected, skipping")
        # continue

    product_title = get_title(item)
    product_url = get_url(item)
    product_price = get_price(item)
    product_indexing = get_indexing(item)

    if (product_title == '<missing product title>' and
            product_url == '<missing product url>'):
        # Scrape came back empty; fall back to the API result instead
        product = amazon_api(ASIN, url, marketplace.country_code)
    else:
        product = ProductRecord(title=product_title,
                                product_url=format_url(product_url),
                                listing_url=format_url(url),
                                price=product_price,
                                primary_img=product_image,
                                product_indexing=product_indexing,
                                crawl_time=crawl_time,
                                asin=ASIN)
    return product
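# Example of driving the parameterised fetch_listing(ASIN, marketplace) above.
# The Marketplace container, hosts, country codes, and the ASIN are made-up
# placeholders; the real marketplace objects only need to expose .country_host
# and .country_code for this function.
from collections import namedtuple

Marketplace = namedtuple("Marketplace", ["country_host", "country_code"])

marketplaces = [
    Marketplace("https://www.amazon.com", "US"),
    Marketplace("https://www.amazon.de", "DE"),
]

for marketplace in marketplaces:
    record = fetch_listing("B01EXAMPLE", marketplace)
    if record is not None:
        print(record)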