Example #1
def begin_crawl():
    # explode out all of our category start_urls into subcategories
    with open(settings.start_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines

            page, html = make_request(line)
            count = 0

            # look for subcategory links on this page
            subcategories = page.findAll("div", "bxc-grid__image")  # downward arrow graphics
            subcategories.extend(page.findAll("li", "sub-categories__list__item"))  # carousel hover menu
            sidebar = page.find("div", "browseBox")
            if sidebar:
                subcategories.extend(sidebar.findAll("li"))  # left sidebar

            for subcategory in subcategories:
                link = subcategory.find("a")
                if not link:
                    continue
                link = link["href"]
                count += 1
                enqueue_url(link)

            log("Found {} subcategories on {}".format(count, line))
Example #2
def begin_crawl():

    # explode out all of our category `start_urls` into subcategories
    with open(settings.start_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines

            page, html = make_request(line)
            count = 0

            # look for subcategory links on this page
            subcategories = page.findAll("div", "bxc-grid__image")  # downward arrow graphics
            subcategories.extend(page.findAll("li", "sub-categories__list__item"))  # carousel hover menu
            sidebar = page.find("div", "browseBox")
            if sidebar:
                subcategories.extend(sidebar.findAll("li"))  # left sidebar

            for subcategory in subcategories:
                link = subcategory.find("a")
                if not link:
                    continue
                link = link["href"]
                count += 1
                enqueue_url(link)

            log("Found {} subcategories on {}".format(count, line))
Example #3
def fetch_listing():

    global crawl_time
    out_0 = open("products-0.txt", "a")
    out_many = open("products-many.txt", "a")

    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return

    # print(url)
    items = page.find_all("li", class_="s-result-item")
    log("Found {} items on {}".format(len(items), url))

    if len(items) == 0:
        out_0.write(str(url) + "\n")
    else:
        out_many.write(str(url) + "\n")

        # input()

    # for item in items[:settings.max_details_per_listing]:
    #     try:
    #         out.write(item.get_text() + "\n")
    #     except:
    #         pass

    #     product_url = get_url(item)
    #     product_price = get_price(item)

    #     product = ProductRecord(
    #         title=product_title,
    #         product_url=format_url(product_url),
    #         listing_url=format_url(url),
    #         price=product_price,
    #         primary_img=product_image,
    #         crawl_time=crawl_time

    #     )
    #     product_id = product.save()
    #     # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        page_number = int(url.split("&page=")[1]) + 1
        enqueue_url(url.split("&page=")[0] + "&page=" + str(page_number))
        pile.spawn(fetch_listing)
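The pagination step above derives the next page number by splitting the URL on "&page=", which breaks when other parameters follow the page number. A hedged alternative using only the standard library (the parameter name "page" is taken from the example; the helper name is illustrative):

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def next_page_url(url):
    # rewrite the "page" query parameter instead of splitting the raw string
    parts = urlsplit(url)
    query = parse_qs(parts.query)
    current = int(query.get("page", ["1"])[0])
    query["page"] = [str(current + 1)]
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))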
Example #4
def begin_crawl_detail():
    print("pushing asin in the stack")
    # explode out all of our category `start_urls` into subcategories
    with open(settings.detail_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines
            enqueue_url(line)

    print(get_queue_length())
    set_header_id(0)
    print("finished asin pushing")
Example #5
def begin_crawl():
    flush_all()
    out = open("nodes.txt", "a")
    # explode out all of our category `start_urls` into subcategories
    count = 0
    with open(settings.start_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines
            enqueue_url("https://www.amazon.com/b/?node=" + line + "&page=1")
            count += 1
        log("Found {} subcategories".format(count))
        return count
Example #6
def fetch_listing():

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return
    url = url.decode('utf-8')  # Redis returns bytes; decode only after the empty-queue check

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("div", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time

        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
Example #7
def fetch_listing():
    '''
    This is the root function that green threads call.
    This is essentially step 1 (but step 0 is above!)
    '''
    global crawl_time

    # Pop a random URL from the Redis listing_url_queue
    url = helpers.dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = helpers.make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = extractors.get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        product_price = extractors.get_price(item)

        product = models.ProductRecord(
            title=product_title,
            product_url=helpers.format_url(product_url),
            listing_url=helpers.format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time)
        product_id = product.save()
        helpers.download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        helpers.enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
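These fetch_listing variants re-spawn themselves via pile.spawn to keep a pool of green threads crawling. A minimal sketch of how that pile might be wired up with eventlet; the pool size and the monkey-patching call are assumptions for illustration:

import eventlet
eventlet.monkey_patch()  # make socket I/O cooperative for green threads

pool = eventlet.GreenPool(size=20)
pile = eventlet.GreenPile(pool)

if __name__ == "__main__":
    for _ in range(20):
        pile.spawn(fetch_listing)  # seed the pool with crawler green threads
    pool.waitall()                 # block until every green thread finishes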
Example #8
def fetch_listing():

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time

        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
Example #9
def begin_crawl():

    # explode out all of our category `start_urls` into subcategories
    with open(settings.start_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines

            #page, html = make_request(line)
            count = 0

            # look for subcategory links on this page
            #subcategories = page.findAll("div", "bxc-grid__image")  # downward arrow graphics
            #subcategories = page.findAll("li", "acs-ln-special-link") # Only "Shop all" links
            #sidebar = page.find("div", "browseBox")
            #if sidebar:
            #    subcategories.extend(sidebar.findAll("li"))  # left sidebar
            enqueue_url(line)
Example #10
def begin_crawl():
    '''
    Initialize everything (except the PostgreSQL database)
    The PostgreSQL database tables are set up by directly executing models.py
    This is essentially step 0
    '''
    # explode out all of our category `start_urls` into subcategories
    with open(settings.start_file, "r") as f:
        # read each url in the start-urls.txt file
        for line in f:
            # Count the number of subcategories found from Starting URL
            subcategory_count = 0

            # remove all leading and trailing whitespace and commented out URLs
            url = line.strip()
            if not url or url.startswith("#"):
                continue

            # Make a request.  This properly parses the url and makes a green request
            # page - <BeautifulSoup> constructed from html
            # html - string of html text from the request
            page, html = helpers.make_request(url)

            # look for subcategory links on this page
            subcategories = page.findAll(
                "div", "bxc-grid__image")  # downward arrow graphics
            subcategories.extend(
                page.findAll(
                    "li", "sub-categories__list__item"))  # carousel hover menu
            sidebar = page.find("div", "browseBox")
            if sidebar:
                subcategories.extend(sidebar.findAll("li"))  # left sidebar

            for subcategory in subcategories:
                link = subcategory.find("a")
                if not link:
                    continue
                link = link["href"]
                subcategory_count += 1
                # Add the subcategory link to Redis
                helpers.enqueue_url(link)
            log("Found {} subcategories on {}".format(subcategory_count, line))
Example #11
def begin_crawl():
    print("pushing url info in to the stack")
    # explode out all of our category `start_urls` into subcategories
    completed_target = 0
    url = get_category_info(completed_target)  # list of category records (despite the name)
    print(len(url))

    #initialize queue
    if get_queue_length() > 0:
        clean_url()
    #print(url)

    for index in range(0, len(url)):
        #populate urls from page 1 to max
        if url[index]['completed'] == completed_target:
            new_url = {}
            #print(url[index])
            try:
                new_url['category1'] = url[index]['category1']
                new_url['category2'] = url[index]['category2']
                new_url['category3'] = url[index]['category3']
                new_url['category4'] = url[index]['category4']
                new_url['category5'] = url[index]['category5']
                new_url['category6'] = url[index]['category6']
                new_url['category7'] = url[index]['category7']
                new_url['pageunit'] = url[index]['pageunit']
                new_url['url'] = url[index]['url']

                #print(url[index]['url'].split("page=")[1])
                if url[index]['url'].split("page=")[1] == "1":
                    new_url['url'] = url[index]['url'].replace(
                        "?", "?bbn=1&dc&")
                    #print(new_url['url'])
            except KeyError:
                # malformed category record; skip it rather than enqueueing a partial entry
                print(url[index])
                continue

            enqueue_url(new_url)

    print(get_queue_length())
    set_header_id(0)
    print("completed url pushing")
Example #12
def fetch_products():
    item = dequeue_url()
    print(item)

    page, html = make_request_cfg(item["url"])

    document = {}
    #print(page)
    if page is not None:
        captcha = get_captcha(page)
        #print("--------------")
        if captcha is not None:
            print("[Warning] caught by captcha!!! id: {}".format(
                get_header_id()))
            enqueue_url(item)
            set_header_id((get_header_id() + 1) % 6)
        if captcha is None:
            # print("no captcha")
            # look for subcategory links on this page
            #print(item['url'])

            #print(page)
            asins = get_asin(page)
            #print(asins)
            #print(len(asins))

            titles = get_title(page)
            #print(titles)
            #print(len(titles))

            #stars = get_star(page)
            #print(stars)
            #print("----------------")
            #print(len(stars))
            #print("----------------")

            #reviewnums = get_reviewnum(page)
            #print(reviewnums)
            #print(len(reviewnums))
            if asins is not None:
                if len(asins) != 0:
                    for index in range(0, len(asins)):
                        document = {}
                        document['asin'] = asins[index]
                        #document['title'] = titles[index]
                        document['title'] = ""
                        #document['star'] = stars[index]
                        #document['reviewnum'] = reviewnums[index]
                        document['category1'] = item['category1']
                        document['category2'] = item['category2']
                        document['category3'] = item['category3']
                        document['category4'] = item['category4']
                        document['category5'] = item['category5']
                        document['category6'] = item['category6']
                        document['category7'] = item['category7']
                        document['date'] = datetime.now()

                        print(document)
                        #print("inserting")
                        save_DB(document)

                    #complete-> 1
                    prod = {}
                    prod['cat1'] = item['category1']
                    prod['cat2'] = item['category2']
                    prod['cat3'] = item['category3']
                    prod['cat4'] = item['category4']
                    prod['cat5'] = item['category5']
                    prod['cat6'] = item['category6']
                    prod['cat7'] = item['category7']
                    if item['url'].split("page=")[1] == "1":
                        prod['url'] = item['url'].replace("?bbn=1&dc&", "?")
                    else:
                        prod['url'] = item['url']
                    prod['completed'] = 1
                    #print("save complete as 1")
                    save_DB_completed(prod)
                else:
                    print("[Warning] missing product info1")
                    prod = {}
                    prod['cat1'] = item['category1']
                    prod['cat2'] = item['category2']
                    prod['cat3'] = item['category3']
                    prod['cat4'] = item['category4']
                    prod['cat5'] = item['category5']
                    prod['cat6'] = item['category6']
                    prod['cat7'] = item['category7']
                    if item['url'].split("page=")[1] == "1":
                        prod['url'] = item['url'].replace("?bbn=1&dc&", "?")
                    else:
                        prod['url'] = item['url']
                    prod['completed'] = 1

                    save_DB_completed(prod)
            else:
                print(item)
                print("[Warning] missing product info2")
                prod = {}
                prod['cat1'] = item['category1']
                prod['cat2'] = item['category2']
                prod['cat3'] = item['category3']
                prod['cat4'] = item['category4']
                prod['cat5'] = item['category5']
                prod['cat6'] = item['category6']
                prod['cat7'] = item['category7']
                if item['url'].split("page=")[1] == "1":
                    prod['url'] = item['url'].replace("?bbn=1&dc&", "?")
                else:
                    prod['url'] = item['url']
                prod['completed'] = 1

                save_DB_completed(prod)
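Example #12 builds the same prod dictionary in three places. A sketch of how that block could be factored into one helper (hypothetical name; it reuses the same keys and the same "?bbn=1&dc&" rewrite for page-1 URLs):

def build_completed_record(item):
    # map category1..category7 onto cat1..cat7 and mark the URL as completed
    prod = {"cat{}".format(i): item["category{}".format(i)] for i in range(1, 8)}
    if item["url"].split("page=")[1] == "1":
        prod["url"] = item["url"].replace("?bbn=1&dc&", "?")
    else:
        prod["url"] = item["url"]
    prod["completed"] = 1
    return prod

Each of the three branches above would then reduce to save_DB_completed(build_completed_record(item)).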
Example #13
def fetch_products_detail():
    asin = dequeue_url()
    url = 'https://www.amazon.com/dp/' + asin
    print(url)

    DB_product = get_DB_product(asin)[0]

    item = {}
    item['code'] = DB_product['code']
    item['title'] = DB_product['title']
    item['price'] = DB_product['price']
    item['byLineInfo'] = DB_product['byLineInfo']
    item['sellerNum'] = DB_product['sellerNum']
    item['salesRank'] = DB_product['salesRank']
    item['avgRating'] = DB_product['avgRating']
    item['ratingNum'] = DB_product['ratingNum']

    page, html = make_request_cfg(url)

    product = {}
    if page == 503:
        return None
    elif page is not None:

        captcha = get_captcha(page)
        #print(captcha)
        if captcha is not None:
            print("[Warning] caught by captcha!!! id: {}".format(
                get_header_id()))
            enqueue_url(asin)
            set_header_id((get_header_id() + 1) % 6)
            #set_header_id( (get_header_id()) % 6)
        else:
            #output.write(str(page))
            #output.close()
            # look for subcategory links on this page
            title = get_title_detail(page)
            #print(title)

            price = get_price_detail(page)
            #print(price)

            byLineInfo = get_byLineInfo_detail(page)
            #print(byLineInfo)

            sellerNum = get_sellerNum_detail(page)
            #print(sellerNum)

            salesRank = get_salesRank_detail(page)
            #print(salesRank)

            avgRating = get_avgRating_detail(page)
            #print(avgRating)

            ratingNum = get_ratingNum_detail(page)
            #print(ratingNum)
            #print("----------------")

            product['code'] = asin
            product['title'] = title
            product['price'] = price
            product['byLineInfo'] = byLineInfo
            product['sellerNum'] = sellerNum
            product['salesRank'] = salesRank
            product['avgRating'] = avgRating
            product['ratingNum'] = ratingNum

            #print(item)
            #print("-----------------------vs-------------------------")
            #print(product)

            if item != product:
                #price change
                flag = 0

                message_price = ''
                message_salesRank = ''
                message_sellerNum = ''

                if item['price'] != product['price']:
                    print('price changed')
                    message_price = str(item['price']) + '-> ' + str(
                        product['price']) + '\n'
                    flag = 1
                #salesRank change
                if item['salesRank'] != product['salesRank']:

                    if len(item['salesRank']) == 2 or len(
                            item['salesRank']) == 4 or len(
                                item['salesRank']) == 6:
                        # increased
                        cat = item['salesRank'][1]
                        previousRank = int(item['salesRank'][0].replace(
                            ",", ""))
                        currentRank = int(product['salesRank'][0].replace(
                            ",", ""))
                        sign = ''
                        threshold = 5
                        if previousRank != currentRank:
                            if previousRank < currentRank:
                                diff = currentRank - previousRank
                                diff_percentage = float(diff / previousRank *
                                                        100)
                                sign = '+'
                            #decreased
                            elif previousRank > currentRank:
                                diff = previousRank - currentRank
                                diff_percentage = float(diff / previousRank *
                                                        100)
                                sign = '-'
                            if diff_percentage > threshold and sign == '+':
                                diff_percentage = '%.2f' % diff_percentage
                                message_salesRank += cat + ' (' + sign + str(
                                    diff_percentage) + '%, ' + sign + str(
                                        diff
                                    ) + ') ' + product['salesRank'][0] + '\n'
                                flag = 1
                    if len(item['salesRank']) == 4 or len(item['salesRank']) == 6:
                        cat = item['salesRank'][3]
                        previousRank = int(item['salesRank'][2].replace(",", ""))
                        currentRank = int(product['salesRank'][2].replace(",", ""))
                        sign = ''
                        if previousRank != currentRank:
                            if previousRank < currentRank:
                                diff = currentRank - previousRank
                                diff_percentage = float(diff / previousRank * 100)
                                sign = '+'
                            # decreased
                            elif previousRank > currentRank:
                                diff = previousRank - currentRank
                                diff_percentage = float(diff / previousRank * 100)
                                sign = '-'
                            if diff_percentage > threshold and sign == '+':
                                diff_percentage = '%.2f' % diff_percentage
                                message_salesRank += cat + ' (' + sign + str(diff_percentage) + '%, ' + sign + str(diff) + ') ' + product['salesRank'][2] + '\n'
                                flag = 1
                    if len(item['salesRank']) == 6:
                        cat = item['salesRank'][5]
                        previousRank = int(item['salesRank'][4].replace(",", ""))
                        currentRank = int(product['salesRank'][4].replace(",", ""))
                        sign = ''
                        if previousRank != currentRank:
                            if previousRank < currentRank:
                                diff = currentRank - previousRank
                                diff_percentage = float(diff / previousRank * 100)
                                sign = '+'
                            # decreased
                            elif previousRank > currentRank:
                                diff = previousRank - currentRank
                                diff_percentage = float(diff / previousRank * 100)
                                sign = '-'
                            if diff_percentage > threshold and sign == '+':
                                diff_percentage = '%.2f' % diff_percentage
                                message_salesRank += cat + ' (' + sign + str(diff_percentage) + '%, ' + sign + str(diff) + ') ' + product['salesRank'][4] + '\n'
                                flag = 1
                    #print(message_salesRank)
                #seller number change
                if int(item['sellerNum']) != int(product['sellerNum']):
                    print('sellerNum changed')
                    message_sellerNum = str(item['sellerNum']) + '-> ' + str(
                        product['sellerNum']) + '\n'
                    flag = 1

                if flag == 1:
                    message = str(datetime.now()).split('.')[
                        0] + '\t' + 'https://amazon.com/dp/' + str(asin) + '\n'
                    if message_price != '':
                        message += 'price changed: ' + message_price
                    if message_salesRank != '':
                        message += 'sales rank changed:\n' + message_salesRank
                    if message_sellerNum != '':
                        message += 'seller number changed: ' + message_sellerNum
                    print(message)
                    #trigger_slackmessage(message)
                    update_DB_detail(asin, product)
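The sales-rank comparison in Example #13 repeats the same logic for each (rank, category) pair in item['salesRank']. A hedged sketch of a helper that walks those pairs once; it mirrors the branches above (percentage change against the previous rank, a 5% threshold, and reporting increases only), and the function name is illustrative:

def sales_rank_changes(old_ranks, new_ranks, threshold=5):
    # old_ranks/new_ranks alternate rank strings and category names:
    # [rank0, cat0, rank1, cat1, ...]
    messages = []
    for i in range(0, len(old_ranks), 2):
        cat = old_ranks[i + 1]
        previous = int(old_ranks[i].replace(",", ""))
        current = int(new_ranks[i].replace(",", ""))
        if previous == current:
            continue
        diff = abs(current - previous)
        diff_percentage = float(diff) / previous * 100
        sign = '+' if current > previous else '-'
        if sign == '+' and diff_percentage > threshold:
            messages.append("{} ({}{:.2f}%, {}{}) {}".format(
                cat, sign, diff_percentage, sign, diff, new_ranks[i]))
    return messages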
Example #14
def fetch_listing():

    global crawl_time
    url, category_code, mode = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    # make request through selenium
    products_robot = ProductsRobot().run(url)
    page = BeautifulSoup(products_robot.page_source, "html.parser")
    try:
        # wait briefly for the product-details pane to render; the element itself is not used
        WebDriverWait(products_robot, 2).until(
            EC.presence_of_element_located((By.ID, "prodDetails"))
        )
    except TimeoutException:
        pass
    finally:
        products_robot.quit()
    # put this logic in get_products_link
    items = []
    items_container = page.find(id="mainResults")

    if items_container:
        items = items_container.find_all(id=re.compile(r'result_\d*'))

    log("Found {} items on {}".format(len(items), url))

    crawler = CrawlerAmazonContext().define_type_product_detail_crawler(category_code)
    if mode == settings.LINK_DETAIL_PRODUCT:
        crawler.get_products_link(items, category_code)
    elif mode == settings.PRODUCT_CRAWLER:
        crawler.get_product_info(url, page, category_code)

    # page, html = make_request(url) TODO: delete
    # if not page:
    #     return
    #
    # items = page.findAll("li", "s-result-item")
    '''
    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time,
            category_code=category_code,
            category=CATEGORY_LABELS[int(category_code)]
        )
        product_id = product.save()
        # download_image(product_image, product_id)
    '''
    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"],  category_code)
        pile.spawn(fetch_listing)
Example #15
 def get_products_link(self, items, category_code):
     for item in items[:settings.max_details_per_listing]:
         product_url = get_url(item)
         enqueue_url(product_url, category_code, settings.PRODUCT_CRAWLER)
         pile.spawn(fetch_listing)
Example #16
 def crawl_href_and_enqueue(self, element):
     link = element["href"]
     self.count += 1
     enqueue_url(link, self.category_label)
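Examples #14-#16 enqueue a URL together with a category code (and sometimes a crawl mode), and Example #14 dequeues a (url, category_code, mode) triple. A hedged sketch of queue helpers that serialise such records as JSON in Redis; the key name, connection details, and default mode are assumptions:

import json
import redis

redis_client = redis.StrictRedis(host="localhost", port=6379, db=0)
QUEUE_KEY = "listing_url_queue"  # assumed key name

def enqueue_url(url, category_code, mode=None):
    payload = json.dumps({"url": url, "category_code": category_code, "mode": mode})
    return redis_client.lpush(QUEUE_KEY, payload)

def dequeue_url():
    payload = redis_client.rpop(QUEUE_KEY)
    if payload is None:
        return None, None, None
    record = json.loads(payload)
    return record["url"], record["category_code"], record["mode"]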