def preparse(self):
    """Extract recipe fields from schema.org metadata in ``self.text``.

    Runs ``scrape_schema_recipe`` over the raw text and, for every schema
    key listed in ``self.schema_org_mappings``, records ``(value, output_key)``
    pairs in ``self.preparsed_elements``.  String values are taken verbatim,
    list values are flattened one entry at a time, and ``timedelta`` values
    are rendered as ``"N min"`` (partial hours) or ``"N h"`` (whole hours).
    Falls back to ``BaseParser.preparse`` when no structured data — or no
    mapped field — is found.
    """
    self.preparsed_elements = []
    self.data = scrape_schema_recipe.scrape(self.text, python_objects=True)
    if not self.data:
        # No schema.org payload at all: delegate to the generic parser.
        BaseParser.preparse(self)
        return
    self.recipe = self.data[0]
    for schema_key, output_key in self.schema_org_mappings.items():
        if schema_key not in self.recipe:
            continue
        value = self.recipe[schema_key]
        if isinstance(value, str):
            self.preparsed_elements.append((value, output_key))
        # NOTE: deliberately a fresh `if`, not `elif` — a str is never a
        # list, so the dispatch below is unaffected.
        if isinstance(value, list):
            self.preparsed_elements.extend(
                (entry, output_key) for entry in value)
        elif isinstance(value, timedelta):
            total_minutes = int(value.total_seconds() // 60)
            if total_minutes % 60:
                # Not a whole number of hours: report minutes.
                self.preparsed_elements.append(
                    ("{} min".format(total_minutes), output_key))
            else:
                # Whole hours only.
                self.preparsed_elements.append(
                    ("{} h".format(total_minutes // 60), output_key))
    if self.preparsed_elements:
        # Structured data sufficed; suppress free-text parsing of the rest.
        self.ignore_unparsed = True
    else:
        BaseParser.preparse(self)
def test_recipe2(self):
    """The truffles fixture exposes exactly the expected schema.org keys."""
    recipes = scrape(
        f"{DATA_PATH}/sweetestkitchen-truffles.html",
        python_objects=True,
        migrate_old_schema=False,
    )
    expected_output = [
        "prepTime",
        "cookTime",
        "name",
        "recipeYield",
        "recipeCategory",
        "image",
        "description",
        "@type",
        "author",
        "aggregateRating",
        "ingredients",
        "recipeInstructions",
        "totalTime",
        "@context",
    ]
    input_keys = list(recipes[0].keys())
    assert lists_are_equal(expected_output, input_keys)
def test_recipe2(self):
    """The truffles fixture exposes exactly the expected schema.org keys."""
    recipes = scrape('test_data/sweetestkitchen-truffles.html',
                     python_objects=True, migrate_old_schema=False)
    # Note: local was previously misspelled "expectated_output".
    expected_output = ['prepTime', 'cookTime', 'name', 'recipeYield',
                       'recipeCategory', 'image', 'description', '@type',
                       'author', 'aggregateRating', 'ingredients',
                       'recipeInstructions', 'totalTime', '@context']
    input_keys = list(recipes[0].keys())
    assert lists_are_equal(expected_output, input_keys)
def setUpClass(cls):
    """Scrape the allrecipes fixture once; expose the first recipe to tests."""
    scraped = scrape(f"{DATA_PATH}/allrecipes-moscow-mule.html",
                     python_objects=True)
    cls.recipes = scraped
    cls.recipe = scraped[0]
def test_scrape(self):
    """Scraping ``self.url`` yields the Irish Coffee recipe first."""
    self.recipes = scrape(self.url)
    first = self.recipes[0]
    self.recipe = first
    assert first["name"] == "Irish Coffee"
def test_scrape(self):
    """Passing a non-string/non-path argument raises ``SSRTypeError``."""
    with self.assertRaises(SSRTypeError):
        # An int is not a valid scrape() input.
        scrape(0xBEE)
def setUpClass(cls):
    """Scrape the crumb lemon-tea-cakes fixture once for the whole class."""
    scraped = scrape(f"{DATA_PATH}/crumb-lemon-tea-cakes-2018.html",
                     python_objects=True)
    cls.recipes = scraped
    cls.recipe = scraped[0]
def setUpClass(cls):
    """Scrape the bevvy irish-coffee fixture once for the whole class.

    Note: ``python_objects`` is deliberately NOT passed here, unlike the
    sibling fixtures.
    """
    scraped = scrape(f"{DATA_PATH}/bevvy-irish-coffee-2018.html")
    cls.recipes = scraped
    cls.recipe = scraped[0]
def setUpClass(cls):
    """Scrape the sweetestkitchen truffles fixture once for the whole class."""
    scraped = scrape(f"{DATA_PATH}/sweetestkitchen-truffles.html",
                     python_objects=True)
    cls.recipes = scraped
    cls.recipe = scraped[0]
def setUpClass(cls):
    """Scrape the sweetestkitchen truffles fixture once for the whole class."""
    scraped = scrape('test_data/sweetestkitchen-truffles.html',
                     python_objects=True)
    cls.recipes = scraped
    cls.recipe = scraped[0]
def test_scrape(self):
    """Scraping ``self.url`` yields the Irish Coffee recipe first."""
    self.recipes = scrape(self.url)
    first = self.recipes[0]
    self.recipe = first
    assert first['name'] == 'Irish Coffee'
def setUpClass(cls):
    """Scrape the crumb lemon-tea-cakes fixture once for the whole class."""
    scraped = scrape('test_data/crumb-lemon-tea-cakes-2018.html',
                     python_objects=True)
    cls.recipes = scraped
    cls.recipe = scraped[0]
def setUpClass(cls):
    """Scrape the bevvy irish-coffee fixture once for the whole class.

    Note: ``python_objects`` is deliberately NOT passed here, unlike the
    sibling fixtures.
    """
    scraped = scrape('test_data/bevvy-irish-coffee-2018.html')
    cls.recipes = scraped
    cls.recipe = scraped[0]
urls = list(f.readlines()) # create a new list with cleaned urls - add https:// and strip '//' and '\n' from each url urls_cleaned = ['https:' + url.strip('\n') for url in urls] # urls_subset = urls_cleaned[:5] # initialize an empty list to contain recipes recipes = [] # loop through list of cleaned urls for url in urls_cleaned: try: # scrape url and obtain page information recipe_list = scrape_schema_recipe.scrape(url) # get relevant information out of the page recipe = recipe_list[0] # print('Recipe: \n', recipe) # transform the time values to a string of the total time of the recipes transform_time(recipe) # add recipe to list of recipes recipes.append(recipe) # print that the page has finished being scraped print('Page Done') print(url)