def is_category_existed(url):
    """
    Check whether a category already exists in the db by looking up its url.
    Return True if it exists, otherwise False.
    """
    # NOTE: the url is interpolated directly into the query; a parameterised
    # query would be safer if the db layer supports it
    existed_category = db.execute_query(
        "SELECT 1 FROM categories WHERE url = '" + url + "'")
    # the query only returns a row when the category is already in the database
    if existed_category:
        return True
    else:
        return False
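# The functions below rely on a parse() helper and a TIKI_URL constant that are
# defined elsewhere in the project (as is scraping_products_on_page()). The
# findAll() calls suggest BeautifulSoup, so a minimal sketch of what they might
# look like, assuming requests + BeautifulSoup; the base url and this
# implementation are assumptions, not the project's actual code:
import datetime  # needed by add_sub_categories() below

import requests
from bs4 import BeautifulSoup

TIKI_URL = 'https://tiki.vn'  # assumed base url for the relative category links


def parse(url):
    # Sketch: download the page and return a BeautifulSoup tree for findAll()
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')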
def add_sub_categories():
    categories = db.execute_query(
        "SELECT category_id, url, weight FROM categories \
        WHERE parent IS NULL;")
    for item in categories:
        print('INFO: Get sub-categories of {}'.format(item[1]))
        total_products_sub = 0
        parent_id = item[0]
        weight = item[2]
        # parse the root category's html
        s = parse(item[1])
        # Find all sub categories of the current root category
        sub_categories = s.findAll('div',
                                   {'class': 'list-group-item is-child'})
        # loop through each sub category
        for sub in sub_categories:
            url = TIKI_URL + sub.a['href']
            # only insert the sub category if it is not in the database yet
            if not is_category_existed(url):
                # the link text looks like "Category name (count)"
                text = sub.a.text.strip()
                index = text.rindex('(')
                name = text[:index].strip()
                count = int(text[index + 1:-1])
                created_on = datetime.datetime.now()
                print(count)
                # add this sub category's product count to the running total
                total_products_sub += count
                # insert the sub category into the db
                db.insert_row(
                    (name, url, parent_id, weight, count, created_on),
                    'categories')
        # After inserting all sub categories, update the root category's
        # total number of products
        query = "UPDATE categories SET count = {} WHERE category_id = {}".format(
            total_products_sub, parent_id)
        db.update_query(query)
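# The queries above and below reference a `categories` table with the columns
# category_id, name, url, parent, weight, count and created_on. A sketch of the
# schema they imply follows (SQLite flavour; the constant name and the column
# types are assumptions, only the column names come from the queries):
CREATE_CATEGORIES_SQL = """
CREATE TABLE IF NOT EXISTS categories (
    category_id INTEGER PRIMARY KEY AUTOINCREMENT,
    name        TEXT,
    url         TEXT,
    parent      INTEGER,    -- NULL for root categories
    weight      REAL,       -- sampling weight used by scrape_all()
    count       INTEGER,    -- number of products reported by the site
    created_on  TIMESTAMP
);
"""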
def scrape_all(): """ This function will: 1. Get the list of sub categories from the db 2. For each sub category, it will scrape the products until the total scraped products reach the maximum scraping products we calculate with weight number. At that point, it will change to another sub category. """ print('INFO scrape_all(): Start craping') results = [] queue = [] # Get all sub category links categories = db.execute_query("SELECT category_id, url, weight, count \ FROM categories WHERE parent IS NOT NULL;") # Compute the list of sub categories for cat in categories: url = cat[1] cat_id = cat[0] weight = cat[2] count = cat[3] # Set the maximum number of products will be scraped for each sub category max_products = round(weight * count) queue.append((cat_id, url, max_products)) # loop through the queue of all sub categories while queue: # process the last sub category url in the queue url = queue[-1][1] cat_id = queue[-1][0] max_product = queue[-1][2] # remove the last sub category url out of the queue # because we have already extracted data from it queue.pop() # get the products new_rows = scraping_products_on_page(cat_id, url) # if the product page has products if new_rows: results += new_rows # Insert products to database for product in new_rows: db.insert_row(product, "products") total_product = len(new_rows) # create the new product page that will be scraped page = int(url.split('&page=')[1]) + 1 if len( url.split('&page=')) > 1 else 2 new_url = url.split('&page=')[0] + '&page=' + str(page) # subtract maximum number of products to check when to stop scraping the products # of the current sub category max_product -= total_product # when the maximum number of products still >= 0, append this new product page # to the queue so that it will be scraped next if max_product >= 0: queue.append((cat_id, new_url, max_product)) print( "Scraping data from sub category: {}, \n page url: {}\n max product remains {}" .format(cat_id, new_url, max_product)) # Return the final list of all products return results