Example #1
def scrape_all():

    print('INFO scrape_all(): Start scraping')

    results = []
    queue = []

    # Get all sub category links
    categories = db.execute_query(
        "SELECT category_id, url, weight, count FROM categories WHERE parent IS NOT NULL;"
    )

    for cat in categories:
        url = cat[1]
        cat_id = cat[0]
        weight = cat[2]
        count = cat[3]

        max_products = round(weight * count)  # maximum number of products to scrape for this category

        queue.append((cat_id, url, max_products))

    while queue:
        # take the last sub category entry out of the queue
        cat_id, url, max_product = queue.pop()
        new_rows = scraping_products_on_page(cat_id, url)

        print('INFO scrape_all(): scraped {} products from {}'.format(
            len(new_rows), url))

        if new_rows:
            results += new_rows

            # Insert products to database
            for product in new_rows:
                db.insert_row(product, "products")

            total_product = len(new_rows)
            # build the URL of the next result page: bump the existing
            # &page= number, or start at page 2 if there is none yet
            parts = url.split('&page=')
            page = int(parts[1]) + 1 if len(parts) > 1 else 2
            new_url = parts[0] + '&page=' + str(page)

            max_product -= total_product
            if max_product >= 0:
                queue.append((cat_id, new_url, max_product))
                print("{} {} {}".format(cat_id, new_url, max_product))

    # Return the final list of all products
    return results
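
The page-increment step inside scrape_all() is easy to exercise on its own. Here is a minimal sketch of that logic as a standalone helper (the name next_page_url is illustrative; it is not part of the original code):

def next_page_url(url):
    # bump the existing &page= number, or start at page 2
    # when the URL carries no &page= parameter yet
    parts = url.split('&page=')
    page = int(parts[1]) + 1 if len(parts) > 1 else 2
    return parts[0] + '&page=' + str(page)

# next_page_url('https://tiki.vn/cat?id=1')         -> 'https://tiki.vn/cat?id=1&page=2'
# next_page_url('https://tiki.vn/cat?id=1&page=2')  -> 'https://tiki.vn/cat?id=1&page=3'
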
def is_category_existed(url):
    """
    Check whether a category already exists in the database,
    identified by its URL. Return True if it does, False otherwise.
    """

    # NOTE: url is interpolated directly into the SQL string, so it must be
    # trusted input; a parameterized query would be safer (sketch below)
    existed_category = db.execute_query(
        "SELECT 1 FROM categories WHERE url = '" + url + "'")

    return bool(existed_category)
def add_sub_categories():
    """
    Scrape the sub categories of every root category, insert the new ones
    into the database, and update each root category's product count.
    """

    categories = db.execute_query(
        "SELECT category_id, url, weight FROM categories \
                                    WHERE parent IS NULL;")

    for item in categories:
        print('INFO: Getting sub-categories of {}'.format(item[1]))
        total_products_sub = 0
        parent_id = item[0]
        weight = item[2]

        # parse the root category's html
        s = parse(item[1])

        # Find all sub category of the current root category
        sub_categories = s.findAll('div',
                                   {'class': 'list-group-item is-child'})

        # loop through each sub category
        for sub in sub_categories:

            url = TIKI_URL + sub.a['href']

            # if the current sub category is not in the database, then insert it
            if not is_category_existed(url):
                # the link text looks like 'Name (count)'; strip it once
                # so the same index is valid for both name and count
                text = sub.a.text.strip()
                index = text.rindex('(')
                name = text[:index].strip()
                count = int(text[index + 1:-1])
                created_on = datetime.datetime.now()

                print('INFO: {} has {} products'.format(name, count))
                # add this sub category's count to the root's running total
                total_products_sub += count

                # insert sub category into db
                db.insert_row(
                    (name, url, parent_id, weight, count, created_on),
                    'categories')

        # After inserting all sub categories, update the root category
        # with the total number of products across its sub categories
        query = "UPDATE categories SET count = {} WHERE category_id = {}".format(
            total_products_sub, parent_id)
        db.update_query(query)
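
parse() and TIKI_URL are defined elsewhere in the project. Judging by the findAll() calls, parse() returns a BeautifulSoup tree; a plausible minimal implementation, assuming requests and BeautifulSoup (a sketch, not the project's actual code):

import requests
from bs4 import BeautifulSoup

TIKI_URL = 'https://tiki.vn'  # assumed base URL

def parse(url):
    # fetch the page and wrap the HTML in a BeautifulSoup tree
    # so callers can query it with find()/findAll()
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
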
Example #4
def add_sub_categories():
    """
    Scrape the sub categories of every category in the db
    and insert them into the categories table.
    """

    categories = db.execute_query(
        "SELECT category_id, url, weight FROM categories;")

    for item in categories:
        print('INFO: Getting sub-categories of {}'.format(item[1]))
        s = parse(item[1])
        sub_categories = s.findAll('div',
                                   {'class': 'list-group-item is-child'})
        for sub in sub_categories:
            url = TIKI_URL + sub.a['href']
            # strip the link text once so the same index is valid
            # for both the name and the count
            text = sub.a.text.strip()
            index = text.rindex('(')
            name = text[:index].strip()
            count = int(text[index + 1:-1])
            created_on = datetime.datetime.now()

            db.insert_row((name, url, item[0], item[2], count, created_on),
                          'categories')
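
Both versions parse the sub category's link text, which is expected to look like 'Name (count)'. The slicing can be checked in isolation (the sample string is illustrative):

text = 'Điện Thoại - Máy Tính Bảng (52631)'.strip()
index = text.rindex('(')         # position of the count's opening '('
name = text[:index].strip()      # 'Điện Thoại - Máy Tính Bảng'
count = int(text[index + 1:-1])  # 52631
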
def scrape_all():
    """
    This function will:
    1. Get the list of sub categories from the db
    2. For each sub category, it will scrape the products 
        until the total scraped products reach the maximum scraping products
        we calculate with weight number. At that point, it will change to another sub category.
    """

    print('INFO scrape_all(): Start scraping')

    results = []
    queue = []

    # Get all sub category links
    categories = db.execute_query("SELECT category_id, url, weight, count \
                                    FROM categories WHERE parent IS NOT NULL;")

    # Compute the list of sub categories
    for cat in categories:
        url = cat[1]
        cat_id = cat[0]
        weight = cat[2]
        count = cat[3]

        # Set the maximum number of products to scrape for each sub category
        max_products = round(weight * count)

        queue.append((cat_id, url, max_products))

    # loop through the queue of all sub categories
    while queue:

        # take the last sub category entry out of the queue;
        # each entry is processed exactly once
        cat_id, url, max_product = queue.pop()

        # get the products
        new_rows = scraping_products_on_page(cat_id, url)

        # if the product page has products
        if new_rows:
            results += new_rows

            # Insert products to database
            for product in new_rows:
                db.insert_row(product, "products")

            total_product = len(new_rows)

            # create the URL of the next product page: bump the existing
            # &page= number, or start at page 2 if there is none yet
            parts = url.split('&page=')
            page = int(parts[1]) + 1 if len(parts) > 1 else 2
            new_url = parts[0] + '&page=' + str(page)

            # reduce the remaining budget by the number of products
            # just scraped from the current sub category
            max_product -= total_product

            # while the remaining budget is still >= 0, append the next
            # product page to the queue so that it is scraped next
            if max_product >= 0:
                queue.append((cat_id, new_url, max_product))
                print(
                    "Scraping data from sub category: {},\n page url: {}\n max products remaining: {}"
                    .format(cat_id, new_url, max_product))

    # Return the final list of all products
    return results
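
Putting the pieces together, a typical driver script might look like the following (the module name scraper is an assumption; the examples above do not show where db, parse, and scraping_products_on_page live):

# hypothetical driver, assuming the functions above live in scraper.py
from scraper import add_sub_categories, scrape_all

if __name__ == '__main__':
    add_sub_categories()     # populate the categories table first
    products = scrape_all()  # then scrape products category by category
    print('Scraped {} products in total'.format(len(products)))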