def fetch_listing():
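    """Pop a listing URL, count the search-result items on it, and record
    the URL in products-0.txt (zero results) or products-many.txt
    (one or more). Then queue the next results page and spawn a new worker."""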

    global crawl_time

    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return

    items = page.find_all("li", class_="s-result-item")
    log("Found {} items on {}".format(len(items), url))

    if len(items) == 0:
        # Record listings that returned no results separately.
        with open("products-0.txt", "a") as out_0:
            out_0.write(str(url) + "\n")
    else:
        with open("products-many.txt", "a") as out_many:
            out_many.write(str(url) + "\n")

    # for item in items[:settings.max_details_per_listing]:
    #     try:
    #         out.write(item.get_text() + "\n")
    #     except:
    #         pass

    #     product_url = get_url(item)
    #     product_price = get_price(item)

    #     product = ProductRecord(
    #         title=product_title,
    #         product_url=format_url(product_url),
    #         listing_url=format_url(url),
    #         price=product_price,
    #         primary_img=product_image,
    #         crawl_time=crawl_time

    #     )
    #     product_id = product.save()
    #     # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        # Guard: only paginate URLs that already carry a "&page=" parameter.
        if "&page=" in url:
            page_number = int(url.split("&page=")[1]) + 1
            enqueue_url(url.split("&page=")[0] + "&page=" + str(page_number))
        pile.spawn(fetch_listing)
Example #2
def fetch_listing():
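    """Pop a listing URL (raw bytes from the queue), save up to
    settings.max_details_per_listing products as ProductRecords, then
    queue the "Next" page link and spawn a new worker."""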

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return
    # The queue returns bytes; decode only after the empty check,
    # otherwise a None result would raise AttributeError.
    url = url.decode('utf-8')

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("div", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
Example #3
def fetch_listing():
    '''
    This is the root function that green threads call.
    This is essentially step 1 (but step 0 is above!)
    '''
    global crawl_time

    # Pop a random URL from the Redis listing_url_queue
    url = helpers.dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = helpers.make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = extractors.get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        product_price = extractors.get_price(item)

        product = models.ProductRecord(
            title=product_title,
            product_url=helpers.format_url(product_url),
            listing_url=helpers.format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time)
        product_id = product.save()
        helpers.download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        helpers.enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
Example #4
def fetch_listing():
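    """Pop a listing URL, save each result that has a primary image as a
    ProductRecord, then queue the "Next" page link and spawn a new worker."""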

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
Example #5
def exploit_listings_urls(platform):
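    """Drain the 'listing_files' queue for the given platform, handing each
    fetched page to handlers.handle_listing. A URL whose handler fails is
    re-queued before the exception is propagated."""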
    print('\n---\n\nStart exploitation\n\n---\n')
    # go through start urls
    time_watcher = models.TimeWatcher(MIN_SLEEP_TIME)
    sql_handler = models.SQLHandler()
    flag = True

    while flag:
        url = helpers.dequeue_url('listing_files', platform)
        if url:
            page = helpers.make_request(url, platform, time_watcher)
            if not page:
                continue
            try:
                handlers.handle_listing(page, platform, url, sql_handler)
            except Exception as e:
                # Re-queue the URL so it is not lost, then re-raise with the
                # original exception chained for a full traceback.
                helpers.queue_url(url, 'listing_files', platform)
                raise Exception('Exception: {}'.format(e)) from e
        else:
            flag = False
Example #6
def fetch_products():
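    """Pop a category listing item, detect captcha pages (rotating through
    six request-header profiles on a hit), save every ASIN found on the
    page, and mark the listing URL as completed."""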
    item = dequeue_url()
    print(item)

    page, html = make_request_cfg(item["url"])

    if page is not None:
        captcha = get_captcha(page)
        if captcha is not None:
            # Captcha page served: re-queue the item and rotate to the
            # next of the six request-header profiles before retrying.
            print("[Warning] caught by captcha!!! id: {}".format(
                get_header_id()))
            enqueue_url(item)
            set_header_id((get_header_id() + 1) % 6)
        else:
            # No captcha: extract the product data from this listing page.

            asins = get_asin(page)
            titles = get_title(page)  # fetched but not stored below
            # Star and review-count extraction are disabled:
            #stars = get_star(page)
            #reviewnums = get_reviewnum(page)
            def mark_completed(listing_item):
                # Record this listing URL as fully crawled; page-1 URLs drop
                # the "?bbn=1&dc&" query prefix that was added for crawling.
                prod = {}
                for i in range(1, 8):
                    prod['cat{}'.format(i)] = \
                        listing_item['category{}'.format(i)]
                if listing_item['url'].split("page=")[1] == "1":
                    prod['url'] = listing_item['url'].replace("?bbn=1&dc&", "?")
                else:
                    prod['url'] = listing_item['url']
                prod['completed'] = 1
                save_DB_completed(prod)

            if asins:
                for index in range(0, len(asins)):
                    document = {}
                    document['asin'] = asins[index]
                    document['title'] = ""  # titles[index] disabled
                    for i in range(1, 8):
                        document['category{}'.format(i)] = \
                            item['category{}'.format(i)]
                    document['date'] = datetime.now()
                    print(document)
                    save_DB(document)
                mark_completed(item)
            elif asins is not None:
                print("[Warning] missing product info1")
                mark_completed(item)
            else:
                print(item)
                print("[Warning] missing product info2")
                mark_completed(item)
Example #7
def fetch_products_detail():
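    """Pop an ASIN, re-scrape its product page, diff the result against the
    stored record, and print a change report (price, sales rank, seller
    count) before updating the database."""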
    asin = dequeue_url()
    url = 'https://www.amazon.com/dp/' + asin
    print(url)

    DB_product = get_DB_product(asin)[0]
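    # DB_product is the snapshot saved on a previous crawl; the item dict
    # below copies the fields that get diffed against the fresh page.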

    item = {}
    item['code'] = DB_product['code']
    item['title'] = DB_product['title']
    item['price'] = DB_product['price']
    item['byLineInfo'] = DB_product['byLineInfo']
    item['sellerNum'] = DB_product['sellerNum']
    item['salesRank'] = DB_product['salesRank']
    item['avgRating'] = DB_product['avgRating']
    item['ratingNum'] = DB_product['ratingNum']

    page, html = make_request_cfg(url)

    product = {}
    if page == 503:
        # make_request_cfg signals a 503 response with the literal int 503.
        return None
    elif page is not None:

        captcha = get_captcha(page)
        if captcha is not None:
            # Captcha page served: re-queue the ASIN and rotate to the
            # next of the six request-header profiles before retrying.
            print("[Warning] caught by captcha!!! id: {}".format(
                get_header_id()))
            enqueue_url(asin)
            set_header_id((get_header_id() + 1) % 6)
        else:
            # Extract each tracked field from the freshly fetched page.
            title = get_title_detail(page)
            price = get_price_detail(page)
            byLineInfo = get_byLineInfo_detail(page)
            sellerNum = get_sellerNum_detail(page)
            salesRank = get_salesRank_detail(page)
            avgRating = get_avgRating_detail(page)
            ratingNum = get_ratingNum_detail(page)

            product['code'] = asin
            product['title'] = title
            product['price'] = price
            product['byLineInfo'] = byLineInfo
            product['sellerNum'] = sellerNum
            product['salesRank'] = salesRank
            product['avgRating'] = avgRating
            product['ratingNum'] = ratingNum

            # Compare the stored snapshot with the fresh scrape; any
            # differing field counts as a change.
            if item != product:
                flag = 0  # set to 1 when a reportable change is found

                message_price = ''
                message_salesRank = ''
                message_sellerNum = ''

                if item['price'] != product['price']:
                    print('price changed')
                    message_price = str(item['price']) + '-> ' + str(
                        product['price']) + '\n'
                    flag = 1
                #salesRank change
                if item['salesRank'] != product['salesRank']:
                    # salesRank is a flat list of rank/category pairs:
                    # [rank1, cat1, rank2, cat2, rank3, cat3] with 2, 4 or
                    # 6 entries. Alert only on rank increases above 5%.
                    threshold = 5
                    if len(item['salesRank']) in (2, 4, 6):
                        for i in range(0, len(item['salesRank']), 2):
                            cat = item['salesRank'][i + 1]
                            previousRank = int(
                                item['salesRank'][i].replace(",", ""))
                            currentRank = int(
                                product['salesRank'][i].replace(",", ""))
                            if previousRank == currentRank:
                                continue
                            sign = '+' if currentRank > previousRank else '-'
                            diff = abs(currentRank - previousRank)
                            diff_percentage = float(diff / previousRank * 100)
                            if diff_percentage > threshold and sign == '+':
                                diff_percentage = '%.2f' % diff_percentage
                                message_salesRank += cat + ' (' + sign + str(
                                    diff_percentage) + '%, ' + sign + str(
                                        diff) + ') ' + \
                                    product['salesRank'][i] + '\n'
                                flag = 1
                #seller number change
                if int(item['sellerNum']) != int(product['sellerNum']):
                    print('sellerNum changed')
                    message_sellerNum = str(item['sellerNum']) + '-> ' + str(
                        product['sellerNum']) + '\n'
                    flag = 1

                if flag == 1:
                    message = str(datetime.now()).split('.')[
                        0] + '\t' + 'https://amazon.com/dp/' + str(asin) + '\n'
                    if message_price != '':
                        message += 'price changed: ' + message_price
                    if message_salesRank != '':
                        message += 'sales rank changed:\n' + message_salesRank
                    if message_sellerNum != '':
                        message += 'seller number changed: ' + message_sellerNum
                    print(message)
                    #trigger_slackmessage(message)
                    update_DB_detail(asin, product)
Example #8
def fetch_listing():
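    """Selenium variant: drive ProductsRobot to the queued listing URL,
    parse the rendered page with BeautifulSoup, and dispatch to a
    category-specific crawler based on the queued mode."""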

    global crawl_time
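    # The queue yields a (url, category_code, mode) tuple; mode selects
    # between gathering product links and crawling full product details.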
    url, category_code, mode = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    # make request through selenium
    products_robot = ProductsRobot().run(url)
    try:
        # Wait for the product-details pane so page_source includes it
        # before parsing; the original parsed first, making the wait moot.
        WebDriverWait(products_robot, 2).until(
            EC.presence_of_element_located((By.ID, "prodDetails"))
        )
    except TimeoutException:
        pass
    finally:
        page = BeautifulSoup(products_robot.page_source, "html.parser")
        products_robot.quit()
    # TODO: move this logic into get_products_link
    items = []
    items_container = page.find(id="mainResults")

    if items_container:
        items = items_container.find_all(id=re.compile(r'result_\d*'))

    log("Found {} items on {}".format(len(items), url))

    crawler = CrawlerAmazonContext().define_type_product_detail_crawler(category_code)
    if mode == settings.LINK_DETAIL_PRODUCT:
        crawler.get_products_link(items, category_code)
    elif mode == settings.PRODUCT_CRAWLER:
        crawler.get_product_info(url, page, category_code)

    # page, html = make_request(url) TODO: delete
    # if not page:
    #     return
    #
    # items = page.findAll("li", "s-result-item")
    '''
    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time,
            category_code=category_code,
            category=CATEGORY_LABELS[int(category_code)]
        )
        product_id = product.save()
        # download_image(product_image, product_id)
    '''
    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"],  category_code)
        pile.spawn(fetch_listing)