class EpicuriousSiteMapDownloader(SiteMapDownloader):

    subdirectory_output = 'epicurious'
    robots_url = 'http://www.epicurious.com/robots.txt'
    ignore_recipe_pattern = ['/review']
    recipe_url_pattern = ['www.epicurious.com/recipes/food/views']
    parser = HRecipeParser.get_parser()

    def test_hrecipe_parser(self):
        parser = HRecipeParser.get_parser()
        for site, url in self.sample_sites_hrecipe.items():
            req = request.Request(url,
                                  data=None,
                                  headers={'User-Agent': get_agent()})
            resp = request.urlopen(req)
            data = parser(BeautifulSoup(resp.read(), 'lxml'))

            self.assertNotIn(data['title'], [None, ''],
                             "No title found for site {0}".format(site))
            self.assertNotEqual(
                data['ingredients'], [],
                "No ingredients found for site {0}".format(site))
            if site not in ['williamssonoma', 'thekitchn']:
                self.assertNotEqual(
                    data['instructions'], [],
                    "No instructions found for site {0}".format(site))
            if site in TEST_REVIEWS_DATA:
                self.assertTrue(
                    data['reviews']['text'] not in [None, [], '']
                    or data['reviews']['ratings'] not in [None, [], ''],
                    "No review body/text for site: {0}".format(site))
            if site in TEST_PREP_TIMES:
                self.assertNotEqual(
                    data['time']['cookTime'], [],
                    "No cookTimes found for site {0}".format(site))
                self.assertNotEqual(
                    data['time']['prepTime'], [],
                    "No prepTimes found for site {0}".format(site))
            if DEBUG_PRINT:
                print("-----------------{0}---------------".format(site))
                print(json.dumps(data, indent=4))
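For reference, here is a minimal standalone sketch of the fetch-and-parse pattern the test above exercises. It assumes only what the test itself shows: HRecipeParser.get_parser() returns a callable that takes a BeautifulSoup tree and yields a dict with 'title', 'ingredients', 'instructions', and so on. The URL slug and User-Agent string are illustrative placeholders, not values from the original code.

import json
from urllib import request

from bs4 import BeautifulSoup
from recipe_scraper.recipe_parsers import HRecipeParser

parser = HRecipeParser.get_parser()
# Placeholder URL; any page matching the recipe_url_pattern above should work.
url = 'http://www.epicurious.com/recipes/food/views/<recipe-slug>'
req = request.Request(url, data=None,
                      headers={'User-Agent': 'recipe-scraper-example'})
with request.urlopen(req) as resp:
    data = parser(BeautifulSoup(resp.read(), 'lxml'))
print(json.dumps(data, indent=4))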
class ChowSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'chow'
    robots_url = 'http://www.chowhound.com/robots.txt'
    recipe_url_pattern = ['www.chowhound.com/recipes/']
    parser = HRecipeParser.get_parser()


class TheKitchnSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'thekitchn'
    # Points directly at the sitemap rather than robots.txt.
    robots_url = 'http://www.thekitchn.com/sitemap.xml'
    recipe_url_pattern = ['thekitchn.com/recipe']
    parser = HRecipeParser.get_parser()


class FineDiningSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'fine_dining'
    robots_url = 'https://www.finedininglovers.com/robots.txt'
    recipe_url_pattern = ['finedininglovers.com/recipes/']
    parser = HRecipeParser.get_parser()


class WilliamsSonomaSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'william_sonoma'
    robots_url = 'http://www.williams-sonoma.com/robots.txt'
    recipe_url_pattern = ['http://www.williams-sonoma.com/recipe']
    parser = HRecipeParser.get_parser()


class BBCFoodSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'bbc_food'
    robots_url = 'https://www.bbcgoodfood.com/robots.txt'
    recipe_url_pattern = ['bbcgoodfood.com/recipes/']
    parser = HRecipeParser.get_parser()


class SimplyRecipesSiteMapDownloader(SiteMapDownloader):
    subdirectory_output = 'simply_recipes'
    robots_url = 'http://www.simplyrecipes.com/robots.txt'
    recipe_url_pattern = ['http://www.simplyrecipes.com/recipes/']
    parser = HRecipeParser.get_parser()


class RecipeDepositorySiteMapDownloader(SiteMapDownloader):

    subdirectory_output = 'recipedepository'
    robots_url = 'http://www.therecipedepository.com/robots.txt'
    recipe_url_pattern = ['www.therecipedepository.com/recipe']
    parser = HRecipeParser.get_parser()


class AllRecipesSiteMapDownloader(SiteMapDownloader):

    subdirectory_output = 'allrecipes'
    robots_url = 'http://allrecipes.com/robots.txt'
    recipe_url_pattern = ['allrecipes.com/recipe/']
    parser = HRecipeParser.get_parser()
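Each downloader above varies only in the same four class attributes, so supporting a new site amounts to declaring one more subclass. A hypothetical sketch follows; the site name, paths, and URL are invented for illustration, and the comment on each attribute is an inference from its name and usage above rather than documented behavior.

class ExampleSiteMapDownloader(SiteMapDownloader):
    # Directory (under the scraper's output root) where fetched pages land.
    subdirectory_output = 'example_site'
    # robots.txt entry point used for sitemap discovery.
    robots_url = 'http://www.example.com/robots.txt'
    # Substring(s) a sitemap URL must contain to be treated as a recipe page.
    recipe_url_pattern = ['example.com/recipes/']
    # Callable that extracts recipe fields from a fetched page.
    parser = HRecipeParser.get_parser()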
Example #11
import argparse
import asyncio
import sys

from recipe_scraper.recipe_parsers import HRecipeParser, JsonLdParser
from recipe_scraper.async_scraper import AsyncScraper, AsyncSraperSiteMap
from recipe_scraper.tools.log_inspector import LogInspector
from recipe_scraper.tools.data_loader import DataLoader
from recipe_scraper.tools import SITEMAP_DOWNLOADERS, SiteMapDownloader

# Default start_ids are the minimum IDs that return a valid result for each site.
SCRAPER_CONFIGS = {
    'allrecipes': {
        'base_path': ['allrecipes.com/recipe'],
        'parser': HRecipeParser.get_parser(),
        'url_id_format': 'http://allrecipes.com/recipe/{0}',
        'start_id': 6663,
    },
    'foodnetwork': {
        'base_path': ['foodnetwork.com/recipes', '/recipes'],
        'parser': JsonLdParser.get_parser(),
        'url_id_format': 'http://www.foodnetwork.com/recipes/{0}',
        'start_id': 3,
    },
    'epicurious': {
        'base_path': ['epicurious.com/recipes/food/views', '/recipes/food/views/'],
        'parser': HRecipeParser.get_parser(),
        'url_id_format': 'http://www.epicurious.com/recipes/food/views/{0}',
        # 'start_id': 412,  # original start; there appears to be a large gap in valid IDs
        'start_id': 4000,
    },
    'recipedepository': {