Example #1
def begin_crawl(url):

    page, html = make_request(url)
    count = 0
    # look for subcategory links on this page
    subcategories = page.findAll("div",
                                 "bxc-grid__image")  # downward arrow graphics
    subcategories.extend(page.findAll(
        "li", "sub-categories__list__item"))  # carousel hover menu
    sidebar = page.find("div", "browseBox")
    if sidebar:
        subcategories.extend(sidebar.findAll("li"))  # left sidebar

    for subcategory in subcategories:
        link = subcategory.find("a")
        if not link:
            continue
        link = link["href"]
        count += 1
        enqueue_url(link)

    log("Found {} subcategories on {}".format(count, line))
Example #2
def fetch_listing():

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        # product_title = get_title(item)
        product_url = get_url(item)
        data = ParseReviews(product_url)
        # product_price = get_price(item)
        data.update({
            "Product URL": format_url(product_url),
            "Listing URL": format_url(url),
            "Product Image": product_image,
        })

        # append this item's record to the output file
        with open('data.json', 'a') as f:
            json.dump(data, f, indent=4)
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        host = get_host(url)
        enqueue_url(host + next_link["href"])
        pile.spawn(fetch_listing)
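
begin_crawl and fetch_listing share a URL queue through enqueue_url() / dequeue_url(), which are not shown. A minimal in-memory sketch is below; a real deployment would more likely back this with Redis or another store shared across workers, and the names here simply mirror the calls above.

from collections import deque

url_queue = deque()

def enqueue_url(url):
    # skip duplicates so the same page is not queued twice
    if url not in url_queue:
        url_queue.append(url)

def dequeue_url():
    # return the next URL to crawl, or None when the queue is empty
    try:
        return url_queue.popleft()
    except IndexError:
        return None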
Example #3
def crawlamazon(url):

    begin_crawl(url)  # put a bunch of subcategory URLs into the queue
    log("Beginning crawl at {}".format(crawl_time))
    for _ in range(settings.max_threads):
        pile.spawn(fetch_listing)
    pool.waitall()
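
pile and pool are assumed to be module-level objects; the spawn()/waitall() calls most closely match eventlet's GreenPile/GreenPool API, so a plausible setup (an assumption, not shown in the original) is:

import eventlet
eventlet.monkey_patch()  # commonly called early so blocking I/O yields to other green threads

import settings

# one pool bounds overall concurrency; the pile collects spawned workers
pool = eventlet.GreenPool(settings.max_threads)
pile = eventlet.GreenPile(pool)

With this in place, pile.spawn(fetch_listing) schedules each worker as a green thread and pool.waitall() blocks until every spawned worker has finished.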
Example #4
def dump_latest_scrape():

    # `conn` is assumed to be an open connection to the products database
    # (PostgreSQL, given the DISTINCT ON query below)
    cur = conn.cursor()

    # Export only the latest crawl
    # cur.execute("SELECT products.* FROM products JOIN (SELECT MAX(crawl_time) FROM products) AS p ON products.crawl_time = p.max;")

    # Dedupe products on their primary_img URL
    cur.execute("SELECT DISTINCT ON (primary_img) * FROM products;")
    return cur.fetchall()
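
The cursor above needs a live database connection; DISTINCT ON is PostgreSQL syntax, so a plausible setup looks like the following (the conn name and the connection parameters are assumptions, not part of the original):

import psycopg2

# hypothetical connection details, presumably kept in the settings module
conn = psycopg2.connect(host=settings.db_host, dbname=settings.db_name,
                        user=settings.db_user, password=settings.db_password)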


def write_to_csv(data):

    file_name = "{}-amazon-crawl.csv".format(
        datetime.today().strftime("%Y-%m-%d"))
    file_path = os.path.join(settings.export_dir, file_name)

    with open(file_path, "w") as f:
        writer = csv.writer(f)
        for row in data:
            writer.writerow(row)

    return file_path


if __name__ == '__main__':
    log("Beginning export")

    rows = dump_latest_scrape()
    log("Got {} rows from database".format(len(rows)))

    file_path = write_to_csv(rows)
    log("Wrote data to {}".format(file_path))