class EpicuriousSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'epicurious'
    robots_url = 'http://www.epicurious.com/robots.txt'
    ignore_recipe_pattern = ['/review']
    recipe_url_pattern = ['www.epicurious.com/recipes/food/views']
    parser = HRecipeParser.get_parser()
def test_hrecipe_parser(self):
    """Download each sample page and check that the hRecipe parser extracts the expected fields."""
    parser = HRecipeParser.get_parser()
    for site, url in self.sample_sites_hrecipe.items():
        req = request.Request(url, data=None, headers={'User-Agent': get_agent()})
        resp = request.urlopen(req)
        data = parser(BeautifulSoup(resp.read(), 'lxml'))
        self.assertNotIn(data['title'], [None, ''],
                         "No title found for site {0}".format(site))
        self.assertNotEqual(
            data['ingredients'], [],
            "No ingredients found for site {0}".format(site))
        if site not in ['williamssonoma', 'thekitchn']:
            self.assertNotEqual(
                data['instructions'], [],
                "No instructions found for site {0}".format(site))
        if site in TEST_REVIEWS_DATA:
            self.assertTrue(
                data['reviews']['text'] not in [None, [], ''] or
                data['reviews']['ratings'] not in [None, [], ''],
                "No review body/text for site: {0}".format(site))
        if site in TEST_PREP_TIMES:
            self.assertNotEqual(
                data['time']['cookTime'], [],
                "No cookTimes found for site {0}".format(site))
            self.assertNotEqual(
                data['time']['prepTime'], [],
                "No prepTimes found for site {0}".format(site))
        if DEBUG_PRINT:
            print("-----------------{0}---------------".format(site))
            print(str(json.dumps(data, indent=4)))
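# Illustrative sketch (not part of the source): based on the assertions in the test
# above, the parser callable is assumed to return a dict shaped roughly like the
# literal below. The field names come from the test; the values and the name
# EXAMPLE_PARSED_RECIPE are made up for illustration only.
EXAMPLE_PARSED_RECIPE = {
    'title': 'Example Stew',
    'ingredients': ['1 cup stock', '2 carrots, diced'],
    'instructions': ['Combine everything.', 'Simmer for an hour.'],
    'reviews': {'text': ['Great weeknight dinner.'], 'ratings': [4.5]},
    'time': {'prepTime': ['PT15M'], 'cookTime': ['PT1H']},
}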
class ChowSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'chow'
    robots_url = 'http://www.chowhound.com/robots.txt'
    recipe_url_pattern = ['www.chowhound.com/recipes/']
    parser = HRecipeParser.get_parser()
class TheKitchnSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'thekitchn'
    robots_url = ['http://www.thekitchn.com/sitemap.xml']  # points directly at the sitemap rather than robots.txt
    recipe_url_pattern = ['thekitchn.com/recipe']
    parser = HRecipeParser.get_parser()
class FineDiningSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'fine_dining'
    robots_url = 'https://www.finedininglovers.com/robots.txt'
    recipe_url_pattern = ['finedininglovers.com/recipes/']
    parser = HRecipeParser.get_parser()
class WillimasonomaSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'william_sonoma'
    robots_url = 'http://www.williams-sonoma.com/robots.txt'
    recipe_url_pattern = ['http://www.williams-sonoma.com/recipe']
    parser = HRecipeParser.get_parser()
class BBCFoodSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'bbc_food'
    robots_url = 'https://www.bbcgoodfood.com/robots.txt'
    recipe_url_pattern = ['bbcgoodfood.com/recipes/']
    parser = HRecipeParser.get_parser()
class SimplyRecipesSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'simply_recipes'
    robots_url = 'http://www.simplyrecipes.com/robots.txt'
    recipe_url_pattern = ['http://www.simplyrecipes.com/recipes/']
    parser = HRecipeParser.get_parser()
class RecipeDepositorySiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'recipedepository'
    robots_url = 'http://www.therecipedepository.com/robots.txt'
    recipe_url_pattern = ['www.therecipedepository.com/recipe']
    parser = HRecipeParser.get_parser()
class AllRecipesSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'allrecipes'
    robots_url = 'http://allrecipes.com/robots.txt'
    recipe_url_pattern = ['allrecipes.com/recipe/']
    parser = HRecipeParser.get_parser()
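# Illustrative sketch (not part of the source): every downloader above follows the
# same declarative pattern: subclass SiteMapDownloader and set where output is
# written, which robots.txt (or sitemap) to read, which URL substrings mark recipe
# pages, and which parser to apply. A hypothetical new site would look like this;
# "example_site", ExampleSiteMapDownloader, and the URLs below are made up.
class ExampleSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'example_site'                    # output folder for scraped pages
    robots_url = 'http://www.example.com/robots.txt'        # robots.txt listing the site's sitemaps
    recipe_url_pattern = ['example.com/recipes/']           # substrings that identify recipe URLs
    parser = HRecipeParser.get_parser()                     # parser applied to each downloaded page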
import argparse
import asyncio
import sys

from recipe_scraper.recipe_parsers import HRecipeParser, JsonLdParser
from recipe_scraper.async_scraper import AsyncScraper, AsyncSraperSiteMap
from recipe_scraper.tools.log_inspector import LogInspector
from recipe_scraper.tools.data_loader import DataLoader
from recipe_scraper.tools import SITEMAP_DOWNLOADERS, SiteMapDownloader

# Default start_ids are the minimum id that returns a valid result.
SCRAPER_CONFIGS = {
    'allrecipes': {
        'base_path': ['allrecipes.com/recipe'],
        'parser': HRecipeParser.get_parser(),
        'url_id_format': 'http://allrecipes.com/recipe/{0}',
        'start_id': 6663,
    },
    'foodnetwork': {
        'base_path': ['foodnetwork.com/recipes', '/recipes'],
        'parser': JsonLdParser.get_parser(),
        'url_id_format': 'http://www.foodnetwork.com/recipes/{0}',
        'start_id': 3,
    },
    'epicurious': {
        'base_path': ['epicurious.com/recipes/food/views', '/recipes/food/views/'],
        'parser': HRecipeParser.get_parser(),
        'url_id_format': 'http://www.epicurious.com/recipes/food/views/{0}',
        # 'start_id': 412,  # initial start, but it looks like there is a large gap
        'start_id': 4000,
    },
    'recipedepository': {