class ParserLaucher(JsonSaveMixin, UrlProcessMixin, DatabaseSaver):
    """Drive a ``Parser`` instance and persist whatever it returns.

    Each ``fetch_*`` method parses one level of the site rooted at
    ``BASE_URL`` (parent categories, subcategories, items) and then,
    depending on the flags given to the constructor, writes the result
    to a .json file and/or to the database.
    """

    BASE_URL = 'https://www.ozon.ru'

    def __init__(self, result_save_directory, save_to_json=False,
                 save_to_db=False, *args, **kwargs):
        """Store the output settings and create the parser.

        Args:
            result_save_directory: Passed to ``save_to_jsonfile`` as the
                target directory for every .json dump.
            save_to_json: When truthy, results are written to .json files.
            save_to_db: When truthy, results are written to the database.
            *args: Forwarded to the parent classes' ``__init__``.
            **kwargs: Forwarded to the parent classes' ``__init__``.
        """
        # The settings are assigned before the cooperative __init__ call,
        # in the same order as before, in case a parent initializer reads them.
        self.result_save_directory = result_save_directory
        self.save_to_json = save_to_json
        self.save_to_db = save_to_db
        super().__init__(*args, **kwargs)
        self.parser = Parser()

    @staticmethod
    def get_parent_categories_from_db():
        """Return all ``Category`` rows matching ``Category.is_parent()``."""
        parents_query = Category.query.filter(Category.is_parent())
        return parents_query.all()

    def fetch_parent_categories(self):
        """Parse the parent categories from ``BASE_URL`` and persist them."""
        print('Fetching parent categories...', file=sys.stdout)

        # The parser result is indexed as result[0][BASE_URL]; only that
        # entry is persisted.
        parsed = self.parser.get_parent_categories(self.BASE_URL)
        parent_data = parsed[0][self.BASE_URL]

        if self.save_to_json:
            print('Saving to .json file...', file=sys.stdout)
            self.save_to_jsonfile(
                'parent_categories',
                parent_data,
                self.result_save_directory,
            )

        if self.save_to_db:
            print('Saving to database...', file=sys.stdout)
            self.save_categories_to_database(parent_data)

    def fetch_subcategories(self):
        """Parse subcategories of every stored parent category and persist them."""
        print('Fetching subcategories...', file=sys.stdout)

        parents = self.get_parent_categories_from_db()
        parent_urls = [
            self.get_full_url(self.BASE_URL, parent.url)
            for parent in parents
        ]
        parsed_entries = self.parser.get_subcategories(parent_urls)

        # Each parsed entry is a mapping; only its first key is used. That
        # key is reduced with get_url_path() and becomes the key of the
        # merged mapping.
        by_url_path = {}
        for entry in parsed_entries:
            entry_url = list(entry)[0]
            by_url_path[self.get_url_path(entry_url)] = entry[entry_url]

        if self.save_to_json:
            print('Saving to .json file...', file=sys.stdout)
            self.save_to_jsonfile(
                'subcategories',
                by_url_path,
                self.result_save_directory,
            )

        if self.save_to_db:
            print('Saving to database...', file=sys.stdout)
            self.save_subcategories_to_database(by_url_path, parents)

    def fetch_items(self):
        """Parse the items of every leaf category and persist them.

        Leaf categories are looked up per stored parent category: rows
        matching ``Category.has_no_children()`` whose path is a descendant
        of the parent's path. Results are saved one leaf category at a time.
        """
        print('Fetching items...', file=sys.stdout)

        for parent in self.get_parent_categories_from_db():
            leaves_query = Category.query.filter(
                Category.has_no_children(),
                Category.path.descendant_of(parent.path),
            )

            for leaf in leaves_query.all():
                print('Parsing category:', leaf.name, file=sys.stdout)
                leaf_url = self.get_full_url(self.BASE_URL, leaf.url)
                parsed_items = self.parser.get_items(leaf_url)

                if self.save_to_json:
                    print('Saving to .json file...', file=sys.stdout)
                    json_name = 'items_{}'.format(leaf.slug)
                    self.save_to_jsonfile(
                        json_name,
                        parsed_items,
                        self.result_save_directory,
                    )

                if self.save_to_db:
                    print('Saving to database...', file=sys.stdout)
                    self.save_items_to_database(parsed_items, leaf)