Example #1
    def recipe_scraper2json(args, url):
        from recipe_scrapers import scrape_me

        print_debug("Using recipe-scraper module...")

        recipe_json={}
        recipe_json['url'] = url

        try:
            scraper = scrape_me(url)

            recipe_json['title'] = scraper.title()
            recipe_json['description'] = ''
            recipe_json['yield'] = scraper.yields()
            recipe_json['preptime'] = ''
            recipe_json['cooktime'] = ''
            recipe_json['totaltime'] = minutes2time(scraper.total_time())
            recipe_json['ingredient_groups'] = []
            recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
            recipe_json['ingredient_groups'][0]['ingredients'] = scraper.ingredients()
            recipe_json['direction_groups'] = []
            recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
            instructions = scraper.instructions().split('\n')
            recipe_json['direction_groups'][0]['directions'] = instructions

        except Exception:
            raise UrlError(url, 'URL not supported.')

        return recipe_json
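
A minimal stand-alone sketch of the recipe_scrapers calls used above (the URL is hypothetical, and the exact method set depends on the installed version of the library):

    from recipe_scrapers import scrape_me

    scraper = scrape_me("https://www.example.com/some-recipe")  # hypothetical URL
    print(scraper.title())         # recipe title as plain text
    print(scraper.yields())        # e.g. "4 servings"
    print(scraper.total_time())    # total time in minutes
    print(scraper.ingredients())   # list of ingredient strings
    print(scraper.instructions())  # newline-separated instruction text
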
Example #2
def main(args=None):
    if args is None:
        args = parse_arguments()

    print_debug(args)
    if args.quick_tests:
        quick_tests(args)
    else:
        if args.URL != [[]]:
            for url in args.URL[0]:
                try:
                    recipe_json = url2recipe_json(args, url)
                except UrlError as err:
                    print_error("Specified URL Not suported!")
                    sys.exit(os.EX_SOFTWARE)
                except Exception as err:
                    print_error(err.args[1])  # arguments stored in .args
                    sys.exit(os.EX_TEMPFAIL)
                recipe_output(args, recipe_json)
        else:
            if args.infile is not None and args.infile != "":
                print_info("Processing %s..." % args.infile)
                with open(args.infile) as json_file:
                    recipe_json = json.load(json_file)
                    recipe_output(args, recipe_json)
            else:
                print_error(
                    "You must specify an input URL or input JSON file.")
                parse_arguments(print_usage=True)
                sys.exit(os.EX_USAGE)
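
The os.EX_* codes used here come from BSD's sysexits.h and are only defined in the os module on Unix-like platforms. A hedged, portable fallback:

    import os

    # Fall back to the conventional sysexits values where os.EX_* is missing (e.g. on Windows).
    EX_USAGE = getattr(os, 'EX_USAGE', 64)
    EX_SOFTWARE = getattr(os, 'EX_SOFTWARE', 70)
    EX_TEMPFAIL = getattr(os, 'EX_TEMPFAIL', 75)
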
Example #3
            def get_page_using_session(args, url):

                print_debug ("Getting page using sessions...")

                auth_json = get_credentials()

                session_requests = requests.session()

                domain = url2domain(url)
                signin_url = "https://" + domain +"/sign_in?next=%2F"

                signin_page = session_requests.get(signin_url)
                tree = html.fromstring(signin_page.text)
                action = tree.xpath('//form[@class="appForm"]/@action')[0]

                payload={}
                input_elements = tree.xpath('//form[@class="appForm"]//input')
                for input_element in input_elements:
                    payload[input_element.name] = input_element.value
                payload['utf8'] = '✓'
                payload['user[email]'] = auth_json['user']
                payload['user[password]'] = auth_json['pass']

                # Perform login
                authorize_url = "https://" + domain + action + "?next=%2F"
                result = session_requests.post(authorize_url, data = payload, headers = dict(referer = signin_url))

                save_cookies(session_requests.cookies, url)

                # Grab page
                recipe_page = session_requests.get(url, headers = dict(referer = url))

                return recipe_page.text
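
The pattern above — fetch the sign-in form, copy every <input> into the payload so hidden CSRF/state tokens survive, then POST to the form's action — can be sketched on its own (the URL, form selector, and field names are assumptions):

    import requests
    from lxml import html

    session = requests.session()
    resp = session.get("https://example.com/sign_in")  # hypothetical sign-in page
    tree = html.fromstring(resp.text)

    payload = {}
    for inp in tree.xpath('//form//input'):  # harvest hidden fields, CSRF tokens, etc.
        if inp.name is not None:
            payload[inp.name] = inp.value
    payload['user[email]'] = "user@example.com"  # assumed field names
    payload['user[password]'] = "secret"

    action = tree.xpath('//form/@action')[0]
    session.post("https://example.com" + action, data=payload,
                 headers={'referer': resp.url})
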
Example #4
        def get_json(url):
            """ Find and load "standardized" json document containing recipe """

            return_value = None

            user_agent = {'User-agent': 'Mozilla/5.0'}
            page = requests.get(url, headers = user_agent)

            match = re.search(r'<script[^>]*type=.?application/ld\+json.?[^>]*>', page.text)
            if match:
                print_debug("Found an occurance of 'application/ld+json'")
                soup = BeautifulSoup(page.text, 'html5lib')
                scripts = soup.findAll('script', attrs = {'type':'application/ld+json'})
                for script in scripts:
                    json_stripped = re.sub(r'^[^{\[]*', '', script.text)
                    raw_json = json.loads(json_stripped)
                    if isinstance(raw_json, list):
                        return_value = json_find_array_element(raw_json, '@type', 'Recipe')
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', '')
                            if return_value['publisher'] == '':
                                return_value['publisher'] = json_clean_value(json_find_array_element(raw_json, '@type', 'Organization'), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)
                    elif '@graph' in raw_json and isinstance(raw_json['@graph'], list):
                        return_value = json_find_array_element(raw_json['@graph'], '@type', 'Recipe')
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', '')
                            if return_value['publisher'] == '':
                                return_value['publisher'] = json_clean_value(json_find_array_element(raw_json['@graph'], '@type', 'Organization'), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)
                    else:
                        if return_value is None:
                            try:
                                if raw_json['@type'] == 'Recipe' and 'recipeIngredient' in raw_json:
                                    return_value = raw_json
                                else:
                                    return_value = None
                            except Exception:
                                return_value = None
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)

                    if return_value is None or 'recipeIngredient' not in return_value:
                        return_value = None
            return return_value
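
For reference, the stripping step exists because some sites prefix the embedded JSON with JavaScript; a quick offline illustration of the same regex (sample text is illustrative):

    import json
    import re

    script_text = 'window.__data = {"@type": "Recipe", "recipeIngredient": ["1 egg"]}'
    json_stripped = re.sub(r'^[^{\[]*', '', script_text)  # drop any JS prefix before the JSON
    print(json.loads(json_stripped)['recipeIngredient'])  # -> ['1 egg']
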
Example #5
    def stcg2json(args, url):
        """ Loads Saveur URL and builds recipe JSON """

        print_debug("Using Sam the Cooking Guy scraper...")
        recipe_json={}
        recipe_json['url'] = url

        page = BeautifulSoup(requests.get(url).text.replace("\u2014"," "), 'html5lib')

        title = page.select_one('title').text
        recipe_json['title'] = re.sub('. SAM THE COOKING GUY', '', title)
        recipe_json['yield'] = page.select_one('div.sqs-block-content p').text
        paragraphs = page.select('div.sqs-block-content p')
        if len(paragraphs) > 1:
            recipe_json['description'] = paragraphs[1].text

        # Parse Times
        minutes_prep = 0
        minutes_cook = 0
        #minutes_cook = iso8601.to_minutes(page.select_one('div.cook-time meta')['content'])
        minutes_total = minutes_prep + minutes_cook
        if minutes_prep == 0 and minutes_total > 0 and minutes_cook > 0:
            minutes_prep = minutes_total - minutes_cook
        # recipe_json['preptime'] = minutes2time(minutes_prep, '')
        # recipe_json['cooktime'] = minutes2time(minutes_cook, '')
        # recipe_json['totaltime'] = minutes2time(minutes_total)

        recipe_json['author'] = url2publisher(url)

        # Ingredients
        recipe_json['ingredient_groups'] = []
        recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
        ingredients = page.select('div.sqs-layout div.row div.sqs-block div.sqs-block-content p')
        if not ingredients:
            ingredients = page.select_one('div.sqs-block-content ul').find_all('li', attrs={'class': None})
        for ingredient in ingredients:
            recipe_json['ingredient_groups'][0]['ingredients'].append(ingredient.text.replace("\n","").strip())

        # Directions
        out_instruction=[]
        instructions = page.select_one('div.sqs-block-content ul').find_all('li', attrs={'class': None})
        if not instructions:
            instructions = page.select_one('div.sqs-block-content').find('ul', attrs={'data-rte-list': 'default'}).find_all('li', attrs={'class': None})
        for instruction in instructions:
            try:
                out_instruction.append(instruction['text'].text.replace("\n", "").strip())
            except Exception:
                out_instruction.append(instruction.text.replace("\n", "").strip())
        recipe_json['direction_groups'] = []
        recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
        recipe_json['direction_groups'][0]['directions'] = out_instruction

        return recipe_json
Example #6
            def get_page_using_cookie(args, url, cookies = None):
                """ Load page using existing cookies """

                print_debug ("Getting page using cookies...")

                recipe_page = None

                if cookies is None:
                    #load cookies and do a request
                    cookies = load_cookies(url)

                if cookies is not None:
                    print_debug('cookies = ' + str(requests.utils.dict_from_cookiejar(cookies)))
                    recipe_page = requests.get(url, cookies=cookies).text

                return recipe_page
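
save_cookies and load_cookies (Example #8 and further below) pair up with this function as a plain pickle round-trip of the session's cookie jar; a self-contained sketch (file name and URL are arbitrary):

    import pickle

    import requests

    session = requests.session()
    session.get("https://httpbin.org/cookies/set?demo=1")  # any URL that sets a cookie

    with open("demo.cookies", "wb") as f:  # save the jar
        pickle.dump(session.cookies, f)

    with open("demo.cookies", "rb") as f:  # load it back
        cookies = pickle.load(f)
    print(requests.utils.dict_from_cookiejar(cookies))  # -> {'demo': '1'}
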
Example #7
    def saveur2json(args, url):
        """ Loads Saveur URL and builds recipe JSON """

        print_debug("Using Saveur scraper...")
        recipe_json={}
        recipe_json['url'] = url

        page = BeautifulSoup(requests.get(url).text.replace("\u2014"," "), 'html5lib')

        recipe_json['title'] = page.select_one('.entry-title').text
        #recipe_json['description'] = page.select_one('p.paragraph:first-child').text
        recipe_json['description'] = page.find("div", {'property':'description'}).text
        recipe_json['yield'] = page.select_one('div.yield span').text

        # Parse Times
        minutes_prep = 0
        minutes_cook = iso8601.to_minutes(page.select_one('div.cook-time meta')['content'])
        minutes_total = minutes_prep + minutes_cook
        if minutes_prep == 0 and minutes_total > 0 and minutes_cook > 0:
            minutes_prep = minutes_total - minutes_cook
        recipe_json['preptime'] = minutes2time(minutes_prep, '')
        recipe_json['cooktime'] = minutes2time(minutes_cook, '')
        recipe_json['totaltime'] = minutes2time(minutes_total)

        recipe_json['author'] = url2publisher(url)

        # Ingredients
        recipe_json['ingredient_groups'] = []
        recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
        for ingredient in page.find_all("li", class_="ingredient"):
            recipe_json['ingredient_groups'][0]['ingredients'].append(ingredient.text.replace("\n","").strip())

        # Directions
        out_instruction=[]
        for instruction in page.find_all("li", class_="instruction"):
            try:
                out_instruction.append(instruction['text'].text.replace("\n", "").strip())
            except Exception:
                out_instruction.append(instruction.text.replace("\n", "").strip())
        recipe_json['direction_groups'] = []
        recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
        recipe_json['direction_groups'][0]['directions'] = out_instruction
        #raise UrlError(url, 'URL not supported.')
        return recipe_json
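
The iso8601.to_minutes helper used above is not shown in these examples; a minimal sketch consistent with how it is called (an ISO-8601 duration such as 'PT1H30M' in, whole minutes out, 0 for empty input) might look like:

    import re

    def to_minutes(duration=''):
        """Convert an ISO-8601 duration like 'PT1H30M' to whole minutes (sketch)."""
        if not duration:
            return 0
        match = re.match(r'P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?', str(duration))
        if match is None:
            return 0
        days, hours, minutes = (int(g) if g else 0 for g in match.groups())
        return days * 24 * 60 + hours * 60 + minutes

    print(to_minutes('PT1H30M'))  # -> 90
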
Example #8
            def save_cookies(requests_cookiejar, url):
                """ save cookie jar """
                filename = cookie_filename(url)
                # Check if ~/.config/recipe-dl exists
                path = os.path.expanduser('~') + "/.config/recipe-dl"
                filename = path + '/' + filename
                if not os.path.isdir(path):
                    # If not check for ~/.config and create recipe-dl
                    if os.path.isdir(os.path.expanduser('~') + "/.config"):
                        try:
                            os.makedirs(path)
                        except OSError:
                            if not os.path.isdir(path):
                                raise
                    else:
                        path = os.path.expanduser('.')
                        if os.path.isdir(path):
                            filename = path + '/' + filename

                print_debug ('Saving cookies to ' + path )
                with open(filename, 'wb') as f:
                    pickle.dump(requests_cookiejar, f)
Example #9
        def recipe_json2doc(args, recipe_json, format='rst', base_level=1):
            """ Build reStructuredText from recipe JSON """
            def format2text(format):
                """ Formats output ext to human readable format name """

                format_text = ''
                if format == 'json':
                    format_text = 'JSON'
                elif format == 'md':
                    format_text = 'Markdown'
                elif format == 'rst':
                    format_text = 'reStructuredText'
                else:
                    format_text = "Unknown format [%s]" % (format)
                    print_warning("Unknown format [%s]" % (format))
                    #raise ("ERROR: Unknown format [%s]" % (format))
                return format_text

            def output_header(header_text, format='rst', level=1):
                """ returns string containg formated header """

                out_string = ''
                if format == 'md':
                    out_string += '#' * (level + 1)
                    out_string += ' '
                out_string += header_text + '\n'
                if format == 'rst':
                    level_chars = ['=', '-', '^']
                    level_char = level_chars[level - 1]
                    out_string += re.sub('.', level_char, header_text) + '\n'
                out_string += '\n'

                return out_string

            def output_group(json_obj,
                             group_key,
                             item_key,
                             item_prefix,
                             item_wrap=False,
                             format='rst',
                             base_level=2):
                """ returns string containg formated groups/lists """

                out_string = ''
                group_count = len(json_clean_value(recipe_json, group_key))
                for group_index, group in enumerate(
                        json_clean_value(recipe_json, group_key)):
                    group_title = json_clean_value(group, 'title')

                    if group_title != '':
                        if group_index > 0:
                            out_string += '\n'
                        out_string += output_header(group_title,
                                                    format=format,
                                                    level=(base_level + 1))

                    for item_count, item in enumerate(
                            json_clean_value(group, item_key), 1):
                        if item_prefix == '#':
                            prefix = str(item_count).strip() + '. '
                        else:
                            prefix = item_prefix.strip() + ' '
                        if item_wrap:
                            item_lines = textwrap.wrap(
                                item,
                                width=75,
                                initial_indent=prefix,
                                subsequent_indent=re.sub('.', ' ', prefix))
                            for line in item_lines:
                                out_string += line + '\n'
                        else:
                            out_string += prefix.strip() + ' ' + str(
                                item) + '\n'

                return out_string

            format_prefix = '-'
            if format == 'md':
                format_prefix = '*'

            print_debug("Building " + format2text(format) +
                        " from recipe JSON...")
            print_debug(recipe_json)

            output = output_header(json_clean_value(recipe_json, 'title'),
                                   format)

            recipe_yield = json_clean_value(recipe_json, 'yield')
            preptime = json_clean_value(recipe_json, 'preptime')
            cooktime = json_clean_value(recipe_json, 'cooktime')
            totaltime = json_clean_value(recipe_json, 'totaltime')

            info = "| "
            if preptime != '':
                info += 'Prep: ' + preptime + ' | '
            if totaltime != '':
                info += 'Total: ' + totaltime + ' | '
            if recipe_yield != '':
                info += 'Yield: ' + str(recipe_yield) + ' | '
            info = info.strip()

            if info != '|':
                divider_line = re.sub('[^|]', '-', info)
                if format == 'rst':
                    divider_line = re.sub('[|]', '+', divider_line)
                output += divider_line + '\n' + info + '\n' + divider_line + '\n\n'

            # TODO: make this work with markdown and missing URL
            url = json_clean_value(recipe_json, 'url')
            author = json_clean_value(recipe_json, 'author')
            if url is None or url == '':
                if author is not None and author != '':
                    output += 'Source: ' + author + '\n\n'
            else:
                if author is None or author == '':
                    author = url2domain(url)
                if format == 'md':
                    output += 'Source: [' + author + '](' + url + ')\n\n'
                elif format == 'rst':
                    output += 'Source: `' + author + ' <' + url + '>`__\n\n'
                else:
                    output += 'Source: ' + author + '\n\n'

            description = textwrap.wrap(json_clean_value(
                recipe_json, 'description'),
                                        width=75)
            for line in description:
                output += line + '\n'

            output += '\n'
            output += output_header('Ingredients', format=format, level=2)
            output += output_group(recipe_json,
                                   'ingredient_groups',
                                   'ingredients',
                                   format_prefix,
                                   format=format,
                                   base_level=2)

            output += '\n'
            output += output_header('Directions', format=format, level=2)
            output += output_group(recipe_json,
                                   'direction_groups',
                                   'directions',
                                   '#',
                                   item_wrap=True,
                                   format=format,
                                   base_level=2)

            notes = json_clean_value(recipe_json, 'notes')
            if notes is not None and notes != '':
                output += '\n'
                output += output_header('Notes', format=format, level=2)

                for note in notes:
                    note = re.sub(r'\*\*\*', '', note)
                    if len(notes) > 1:
                        note_prefix = format_prefix.strip() + ' '
                        for line in textwrap.wrap(note,
                                                  width=75,
                                                  initial_indent=note_prefix,
                                                  subsequent_indent=re.sub(
                                                      '.', ' ', note_prefix)):
                            output += line + '\n'
                    else:
                        for line in textwrap.wrap(note, width=75):
                            output += line + '\n'
                    output += '\n'

            return output
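
The underline trick in output_header — re.sub('.', level_char, header_text) — replaces every character of the title with the level character, yielding an underline of exactly the right length:

    import re

    title = 'Ingredients'
    print(title)                    # Ingredients
    print(re.sub('.', '-', title))  # -----------
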
Example #10
def parse_arguments(print_usage=False, detail=False):
    """ Creates a new argument parser. """

    parser = argparse.ArgumentParser('recipe-dl')
    version = '%(prog)s v' + __version__
    parser.add_argument('--version', action='version', version=version)
    parser.add_argument(
        '-a',
        '--authorize',
        action="store_true",
        dest="authorize_ci",
        default=False,
        help="Force authorization of Cook's Illustrated sites",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        dest="debug",
        default=False,
        help="Add additional Output",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        dest="quiet",
        default=None,
        #help="Suppress most output aka Silent Mode.",
        help=argparse.SUPPRESS)
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        help="Make output verbose",
    )
    parser.add_argument(
        "-j",
        "--output-json",
        action="store_true",
        dest="output_json",
        default=False,
        help="Output results in JSON format.",
    )
    parser.add_argument(
        "-m",
        "--output-md",
        action="store_true",
        dest="output_md",
        default=False,
        help="Output results in Markdown format.",
    )
    parser.add_argument(
        "-r",
        "--output-rst",
        action="store_true",
        dest="output_rst",
        default=False,
        help="Output results in reStructuredText format.",
    )
    parser.add_argument(
        '-i',
        '--infile',
        action="store",
        dest="infile",
        help="Specify input json file infile.",
    )
    parser.add_argument(
        '-o',
        '--outfile',
        action="store",
        dest="outfile",
        help="Specify output file outfile.",
    )
    parser.add_argument(
        "-s",
        "--save-to-file",
        action="store_true",
        dest="save_to_file",
        default=False,
        help="Save output file(s).",
    )
    parser.add_argument(
        "-f",
        "--force-recipe-scraper",
        action="store_true",
        dest="force_recipe_scraper",
        default=False,
        help="For the use of the recipe scraper where applicable.",
    )
    parser.add_argument("--quick-tests",
                        action="store_true",
                        dest="quick_tests",
                        help=argparse.SUPPRESS,
                        default=False)

    parser.add_argument(
        'URL',
        nargs='*',
        action="append",
        default=[],
    )

    if print_usage:
        if detail:
            parser.print_help()
        else:
            parser.print_usage()
    else:
        args = parser.parse_args()

        if args.quiet is None:
            args.quiet = not args.verbose

        if args.debug and args.quiet:
            args.quiet = False
            print_warning(
                "Debug option selected. Cannot run in \"Silent Mode\"")

        custom_print_init(quiet=args.quiet, debug=args.debug)

        filetype_count = 0
        if args.output_json:
            filetype_count += 1
        if args.output_md:
            filetype_count += 1
        if args.output_rst:
            filetype_count += 1

        print_debug("filetype_count=%s" % filetype_count)
        if filetype_count == 0:
            args.output_rst = True
        elif filetype_count > 1:
            print_warning(
                "More than one output file type selected. Assuming 'Save to File'."
            )
            args.save_to_file = True

        if not args.save_to_file and args.outfile is not None and args.outfile != '':
            args.save_to_file = True

        return args
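
A hedged illustration of what the parser returns; note that the positional URL argument combines nargs='*' with action="append", so parsed URLs land in a nested list — which is why main() in Example #2 compares against [[]]:

    import sys

    # Simulate: recipe-dl -j https://example.com/recipe   (hypothetical invocation)
    sys.argv = ['recipe-dl', '-j', 'https://example.com/recipe']
    args = parse_arguments()
    print(args.output_json)  # -> True
    print(args.URL)          # -> [['https://example.com/recipe']]
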
Example #11
    def ci2json(args, url):
        """ Loads Cook's Illustrated (and affiliated) URL and checks for
            authentication and then builds Recipe JSON
        """

        def get_json(args, url):
            """ Get JSON from page """

            import pickle

            def find_script(source_html):
                if source_html is None:
                    return None
                else:
                    tree = html.fromstring(source_html)
                    script_element = tree.xpath('//script[@id="__NEXT_DATA__"]')[0]
                    return json.loads(script_element.text)

            def found_paywall(source_json):
                ret_value = False
                paywall_json = list(json_find_key(source_json, 'paywall'))

                if paywall_json and (paywall_json[0] in (True, 'TRUE') or (len(paywall_json) > 1 and json_clean_value(paywall_json[1], 'status') == "READY")):
                    ret_value = True
                return ret_value

            def cookie_filename(url):
                return '.' + url2domain(url) + '.cookies'

            def save_cookies(requests_cookiejar, url):
                """ save cookie jar """
                filename = cookie_filename(url)
                # Check if ~/.config/recipe-dl exists
                path = os.path.expanduser('~') + "/.config/recipe-dl"
                filename = path + '/' + filename
                if not os.path.isdir(path):
                    # If not check for ~/.config and create recipe-dl
                    if os.path.isdir(os.path.expanduser('~') + "/.config"):
                        try:
                            os.makedirs(path)
                        except OSError:
                            if not os.path.isdir(path):
                                raise
                    else:
                        path = os.path.expanduser('.')
                        if os.path.isdir(path):
                            filename = path + '/' + filename

                print_debug ('Saving cookies to ' + path )
                with open(filename, 'wb') as f:
                    pickle.dump(requests_cookiejar, f)

            def load_cookies(url):
                """ Loads Cookie jar """
                print_debug ('Loading cookies...')

                filename = cookie_filename(url)
                # First look in current directory
                if not os.path.isfile(filename):
                    print_debug ("Unable to find " + filename + ' locally.')
                    # Next look in ~/.config/recipe-dl
                    path = os.path.expanduser('~') + "/.config/recipe-dl"
                    print_debug ("Searching " + path)
                    if os.path.isfile(path + '/' + filename):
                        filename = path + '/' + filename
                    else:
                        # Lastly look where the script is located.
                        print_debug ("Not found. Using script location.")
                        filename = os.path.dirname(os.path.abspath(__file__)) + "/" + filename

                if os.path.isfile(filename):
                    print_debug("found.")
                    with open(filename, 'rb') as f:
                        return pickle.load(f)
                else:
                    print_debug ("Unable to find " + filename)
                    return None

            def get_credentials():
                """ Retrieve Credentals """

                def input_credential(prompt):
                    """ Prompt and input credentals """
                    credential = ''
                    while credential == '':
                        print_to_console(prompt)
                        credential = input()
                    return credential

                credential_json = {}
                credential_json['user'] = input_credential("Enter email address:")
                credential_json['pass'] = input_credential("Enter password:"******""" Load page using existing cookies """

                print_debug ("Getting page using cookies...")

                recipe_page = None

                if cookies is None:
                    #load cookies and do a request
                    cookies = load_cookies(url)

                if cookies is not None:
                    print_debug('cookies = ' + str(requests.utils.dict_from_cookiejar(cookies)))
                    recipe_page = requests.get(url, cookies=cookies).text

                return recipe_page

            def get_page_using_session(args, url):

                print_debug ("Getting page using sessions...")

                auth_json = get_credentials()

                session_requests = requests.session()

                domain = url2domain(url)
                signin_url = "https://" + domain +"/sign_in?next=%2F"

                signin_page = session_requests.get(signin_url)
                tree = html.fromstring(signin_page.text)
                action = tree.xpath('//form[@class="appForm"]/@action')[0]

                payload={}
                input_elements = tree.xpath('//form[@class="appForm"]//input')
                for input_element in input_elements:
                    payload[input_element.name] = input_element.value
                payload['utf8'] = '&#x2713;'
                payload['user[email]'] = auth_json['user']
                payload['user[password]'] = auth_json['pass']

                # Perform login
                authorize_url = "https://" + domain + action + "?next=%2F"
                result = session_requests.post(authorize_url, data = payload, headers = dict(referer = signin_url))

                save_cookies(session_requests.cookies, url)

                # Grab page
                recipe_page = session_requests.get(url, headers = dict(referer = url))

                return recipe_page.text

            raw_json = json.loads('{ "paywall": true }')
            raw_html = None
            if not args.authorize_ci:
                # Getting file using cookies
                raw_html = get_page_using_cookie(args, url)
                if raw_html is not None:
                    raw_json = find_script(raw_html)

            if args.authorize_ci or raw_html is None or found_paywall(raw_json):
                print_debug('Getting page using full authentication')
                raw_html = get_page_using_session(args, url)
                raw_json = find_script(raw_html)

            raw_json = raw_json['props']['initialState']['content']['documents']
            raw_json = raw_json[list(json.loads(json.dumps(raw_json)))[0]]

            return raw_json

        print_debug("Using Cook's Illustrated scraper...")
        recipe_json={}
        recipe_json['url'] = url

        source_json = get_json(args, url)

        if source_json is not None:
            print_debug(str(source_json))
            recipe_json['title'] = json_clean_value(source_json, 'title')
            recipe_json['description'] = strip_tags(json_clean_value(source_json['metaData']['fields'], 'description'), strip_newline = True)
            recipe_json['yield'] = json_clean_value(source_json, 'yields')

            # Parse Times
            time_note = json_clean_value(source_json, 'recipeTimeNote')
            if time_note == '':
                time_note = 'TBD'
            recipe_json['preptime'] = ''
            recipe_json['cooktime'] = ''
            recipe_json['totaltime'] = time_note

            author = json_clean_value(source_json['metaData']['fields'], 'source')
            if author == '':
                author = url2publisher(url)
            recipe_json['author'] = author

            # Ingredients
            recipe_json['ingredient_groups'] = []
            ingredient_groups = json_clean_value(source_json, "ingredientGroups")
            for group in ingredient_groups:
                group_json = json.loads('{"title":"","ingredients":[]}')
                if len(ingredient_groups) > 1:
                    group_json['title'] = json_clean_value(group['fields'], 'title')
                ingredients = json_clean_value(group['fields'], "recipeIngredientItems")
                for ingredient in ingredients:
                    qty  = json_clean_value(ingredient['fields'], "qty")
                    unit  = json_clean_value(ingredient['fields'], "preText")
                    item  = json_clean_value(json_clean_value(ingredient['fields'], "ingredient", json.loads('{"fields": ""}'))['fields'], 'title')
                    modifier  = json_clean_value(ingredient['fields'], "postText")
                    group_json['ingredients'].append(strip_tags("%s %s %s%s" % (qty, unit, item, modifier), strip_newline = True))
                recipe_json['ingredient_groups'].append(group_json)

            # Directions
            recipe_json['direction_groups'] = []
            group_json = json.loads('{"group":"","directions":[]}')
            steps = json_clean_value(source_json, "instructions")
            for step in steps:
                group_json['directions'].append(strip_tags(json_clean_value(step['fields'], "content"), strip_newline = True))
            recipe_json['direction_groups'].append(group_json)

            recipe_json['notes'] = []
            recipe_json['notes'].append(strip_tags(json_clean_value(source_json, 'headnote'), strip_newline = True))

        else:
            raise UrlError(url, 'URL not supported.')

        return recipe_json
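
find_script relies on Next.js embedding page state in a <script id="__NEXT_DATA__"> element; the extraction can be exercised in isolation (sample HTML is illustrative):

    import json

    from lxml import html

    sample = '<html><body><script id="__NEXT_DATA__">{"props": {"initialState": {}}}</script></body></html>'
    tree = html.fromstring(sample)
    script_element = tree.xpath('//script[@id="__NEXT_DATA__"]')[0]
    print(json.loads(script_element.text))  # -> {'props': {'initialState': {}}}
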
Example #12
    def generic2json(args, url):
        """ Loads generic URL and builds recipe JSON """

        def get_json(url):
            """ Find and load "standardized" json document containing recipe """

            return_value = None

            user_agent = {'User-agent': 'Mozilla/5.0'}
            page = requests.get(url, headers = user_agent)

            match = re.search(r'<script[^>]*type=.?application/ld\+json.?[^>]*>', page.text)
            if match:
                print_debug("Found an occurance of 'application/ld+json'")
                soup = BeautifulSoup(page.text, 'html5lib')
                scripts = soup.findAll('script', attrs = {'type':'application/ld+json'})
                for script in scripts:
                    json_stripped = re.sub(r'^[^{\[]*', '', script.text)
                    raw_json = json.loads(json_stripped)
                    if isinstance(raw_json, list):
                        return_value = json_find_array_element(raw_json, '@type', 'Recipe')
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', '')
                            if return_value['publisher'] == '':
                                return_value['publisher'] = json_clean_value(json_find_array_element(raw_json, '@type', 'Organization'), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)
                    elif '@graph' in raw_json and isinstance(raw_json['@graph'], list):
                        return_value = json_find_array_element(raw_json['@graph'], '@type', 'Recipe')
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', '')
                            if return_value['publisher'] == '':
                                return_value['publisher'] = json_clean_value(json_find_array_element(raw_json['@graph'], '@type', 'Organization'), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)
                    else:
                        if return_value is None:
                            try:
                                if raw_json['@type'] == 'Recipe' and 'recipeIngredient' in raw_json:
                                    return_value = raw_json
                                else:
                                    return_value = None
                            except Exception:
                                return_value = None
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)

                    if return_value is None or 'recipeIngredient' not in return_value:
                        return_value = None
            return return_value

        print_debug("Using generic scraper...")
        recipe_json={}
        recipe_json['url'] = url
        source_json = get_json(url)

        if source_json is None:
            print_info("No application+ld json attempting to use recipe-scrapers...")
            recipe_json = recipe_scraper2json(args, url)
        else:
            print_debug(json.dumps(source_json))

            recipe_json['title'] = json_clean_value(source_json, 'headline', json_clean_value(source_json, 'name'))
            recipe_json['description'] = json_clean_value(source_json, 'description')
            if 'recipeYield' in source_json and isinstance(source_json['recipeYield'], list):
                recipe_json['yield'] = max(source_json['recipeYield'])
            else:
                recipe_json['yield'] = json_clean_value(source_json, 'recipeYield')

            # Parse Times
            minutes_total = iso8601.to_minutes(json_clean_value(source_json, 'totalTime'))
            minutes_cook = iso8601.to_minutes(json_clean_value(source_json, 'cookTime'))
            minutes_prep = iso8601.to_minutes(json_clean_value(source_json, 'prepTime'))
            if minutes_prep == 0 and minutes_total > 0 and minutes_cook > 0:
                minutes_prep = minutes_total - minutes_cook
            if minutes_total == 0 and (minutes_prep > 0 or minutes_cook > 0):
                minutes_total = minutes_prep + minutes_cook
            recipe_json['preptime'] = minutes2time(minutes_prep, '')
            recipe_json['cooktime'] = minutes2time(minutes_cook, '')
            recipe_json['totaltime'] = minutes2time(minutes_total)

            # Parse Author
            publisher = json_clean_value(source_json, 'publisher')
            author = json_clean_value(source_json, 'author')
            if isinstance(author, list):
                if 'name' in author[0]:
                    author = author[0]['name']
            elif 'name' in author:
                author = author['name']
            if publisher != "":
                if author == "" or publisher == author:
                    author = publisher
                else:
                    if publisher not in author:
                        author = publisher + ' (' + author + ')'
            recipe_json['author'] = author

            # Ingredients
            ingredients = list(json_find_key(source_json, "recipeIngredient"))[0]
            recipe_json['ingredient_groups'] = []
            recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
            out_ingredients = []
            for ingredient in ingredients:
                out_ingredients.append(strip_tags(ingredient))
            recipe_json['ingredient_groups'][0]['ingredients'] = out_ingredients

            # Directions
            out_instruction=[]
            instructionsSection=list(json_find_key(source_json, 'recipeInstructions'))[0]
            try:
                instructions=list(json_find_key(source_json, 'itemListElement'))[0]
            except IndexError:
                instructions=instructionsSection
            print_debug(str(instructions))
            if isinstance(instructions, list):
                for instruction in instructions:
                    try:
                        out_instruction.append(strip_tags(instruction['text']))
                    except Exception:
                        out_instruction.append(strip_tags(str(instruction)))
            else:
                out_instruction.append(strip_tags(str(instructions)))

            recipe_json['direction_groups'] = []
            recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
            recipe_json['direction_groups'][0]['directions'] = out_instruction

        return recipe_json
Example #13
    def epicurious2json(args, url):
        """ Loads Epicurious URL and builds recipe JSON """

        def get_json(args, url):
            """ Find and load "standardized" json document containing recipe """
            return_value = None

            page = BeautifulSoup(requests.get(url).text, 'html5lib')
            scripts = page.findAll('script')
            for script in scripts:
                match = re.search(r'root\.__INITIAL_STATE__\.store', script.text)
                if match:
                    for line in iter(script.text.splitlines()):
                        match = re.search(r'root\.__INITIAL_STATE__\.store', line)
                        if match:
                            raw_json_text = re.sub('[^}]*$','', line)
                            raw_json_text = re.sub('^[^{]*', '', raw_json_text)
                            raw_json_text = re.sub('"email":{"regExp":.*,"password"', '"email":{"regExp":"","password"', raw_json_text)
                            raw_json_text = re.sub('"password":{"regExp":.*,"messages"', '"password":{"regExp":""},"messages"', raw_json_text)
                            raw_json = json.loads(raw_json_text)
                            return_value = json_clean_value(raw_json, 'content', json.loads('{}'))
                            #print_debug(json.dumps(return_value, indent=4))
            return return_value

        print_debug("Using Epicurious scraper...")
        recipe_json={}
        recipe_json['url'] = url

        source_json = get_json(args, url)

        if source_json is not None:
            recipe_json['title'] = json_clean_value(source_json, 'hed')
            recipe_json['description'] = strip_tags(json_clean_value(source_json, 'dek'))
            recipe_json['yield'] = json_clean_value(json_clean_value(source_json, 'servingSizeInfo',json.loads('{}')), 'servingSizeDescription')

            # Parse Times
            minutes_prep = iso8601.to_minutes(json_clean_value(source_json, 'formattedPrepTime'))
            minutes_cook = iso8601.to_minutes(json_clean_value(source_json, 'formattedCookTime'))
            minutes_total = minutes_prep + minutes_cook
            if minutes_prep == 0 and minutes_total > 0 and minutes_cook > 0:
                minutes_prep = minutes_total - minutes_cook
            recipe_json['preptime'] = minutes2time(minutes_prep, '')
            recipe_json['cooktime'] = minutes2time(minutes_cook, '')
            recipe_json['totaltime'] = minutes2time(minutes_total)

            # Parse Author
            publisher = "Epicurious"
            author = json_clean_value(source_json, 'author', '')
            if isinstance(author, list):
                if 'name' in author[0]:
                    author = author[0]['name']
            elif 'name' in author:
                author = author['name']
            if publisher != "":
                if author == "" or publisher == author:
                    author = publisher
                else:
                    if publisher not in author:
                        author = publisher + ' (' + author + ')'
            recipe_json['author'] = author

            # Ingredients
            recipe_json['ingredient_groups'] = []
            ingredient_groups = json_clean_value(source_json, "ingredientGroups")
            for group in ingredient_groups:
                group_json = json.loads('{"title":"","ingredients":[]}')
                if len(ingredient_groups) > 1:
                    group_json['title'] = json_clean_value(group, "hed")
                ingredients = json_clean_value(group, "ingredients")
                for ingredient in ingredients:
                    group_json['ingredients'].append(strip_tags(json_clean_value(ingredient, "description")))
                recipe_json['ingredient_groups'].append(group_json)

            # Directions
            recipe_json['direction_groups'] = []
            direction_groups = json_clean_value(source_json, "preparationGroups")
            for group in direction_groups:
                group_json = json.loads('{"group":"","directions":[]}')
                if len(direction_groups) > 1:
                    group_json['group'] = strip_tags(json_clean_value(group, "hed"))
                steps = json_clean_value(group, "steps")
                for step in steps:
                    group_json['directions'].append(strip_tags(json_clean_value(step, "description")))
                recipe_json['direction_groups'].append(group_json)

        else:
            raise UrlError(url, 'URL not supported.')

        return recipe_json
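
The two regExp-scrubbing substitutions in get_json exist because the page embeds client-side validation patterns that are not valid JSON string content; a reduced illustration of why the blanking is needed (sample text is illustrative):

    import json

    # A validation pattern embedded verbatim (a JS regex literal) breaks json.loads
    # until the field is blanked out, which is what the re.sub calls above do:
    raw = '{"email": {"regExp": /^[^@]+@[^@]+$/}, "messages": {}}'
    try:
        json.loads(raw)
    except json.JSONDecodeError as err:
        print("unparseable until scrubbed:", err)
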
Example #14
def url2recipe_json(args, url):
    """ Loads recipe JSON from URL """

    def json_find_key(dictionary, key):
        """ Finds a key and returns value(s) """

        for k, v in dictionary.items():
            if k == key:
                yield v
            elif isinstance(v, dict):
                for result in json_find_key(v, key):
                    yield result
            elif isinstance(v, list):
                for d in v:
                    if isinstance(d, dict):
                        for result in json_find_key(d, key):
                            yield result
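
    # Worked example (derived from the generator above):
    #   list(json_find_key({'a': {'recipeIngredient': ['1 egg']}}, 'recipeIngredient'))
    # returns [['1 egg']]: the search walks nested dicts and lists depth-first.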

    def json_find_array_element(json_obj, key, value):
        """ Searches for key in JSON array and returns value """

        ret_value = None
        for array in json_obj:
            if array[key] == value:
                ret_value = array
                break
        return ret_value

    def minutes2time(minutes = 0, default = 'TBD'):
        """ Takes minutes and returns a human friendly version """

        return_time = ''

        if minutes > 0:
            return_hours = int( minutes/60 )
            return_minutes = ( minutes - ( return_hours*60 ) )

            if return_hours > 0:
                return_time = str(return_hours)
                if return_hours > 1:
                    return_time = return_time + ' hours '
                else:
                    return_time = return_time + ' hour '
            if return_minutes > 0:
                return_time = return_time + str(return_minutes)
                if return_minutes > 1:
                    return_time = return_time + ' minutes'
                else:
                    return_time = return_time + ' minute'
        else:
            return_time = default

        return return_time
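
    # Worked examples (derived from the logic above): minutes2time(135) returns
    # '2 hours 15 minutes', minutes2time(1) returns '1 minute', and
    # minutes2time(0) falls back to the default, 'TBD'.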

    def ci2json(args, url):
        """ Loads Cook's Illustrated (and affiliated) URL and checks for
            authentication and then builds Recipe JSON
        """

        def get_json(args, url):
            """ Get JSON from page """

            import pickle

            def find_script(source_html):
                if source_html is None:
                    return None
                else:
                    tree = html.fromstring(source_html)
                    script_element = tree.xpath('//script[@id="__NEXT_DATA__"]')[0]
                    return json.loads(script_element.text)

            def found_paywall(source_json):
                ret_value = False
                paywall_json = list(json_find_key(source_json, 'paywall'))

                if paywall_json and (paywall_json[0] in (True, 'TRUE') or (len(paywall_json) > 1 and json_clean_value(paywall_json[1], 'status') == "READY")):
                    ret_value = True
                return ret_value

            def cookie_filename(url):
                return '.' + url2domain(url) + '.cookies'

            def save_cookies(requests_cookiejar, url):
                """ save cookie jar """
                filename = cookie_filename(url)
                # Check if ~/.config/recipe-dl exists
                path = os.path.expanduser('~') + "/.config/recipe-dl"
                filename = path + '/' + filename
                if not os.path.isdir(path):
                    # If not check for ~/.config and create recipe-dl
                    if os.path.isdir(os.path.expanduser('~') + "/.config"):
                        try:
                            os.makedirs(path)
                        except OSError:
                            if not os.path.isdir(path):
                                raise
                    else:
                        path = os.path.expanduser('.')
                        if os.path.isdir(path):
                            filename = path + '/' + filename

                print_debug ('Saving cookies to ' + path )
                with open(filename, 'wb') as f:
                    pickle.dump(requests_cookiejar, f)

            def load_cookies(url):
                """ Loads Cookie jar """
                print_debug ('Loading cookies...')

                filename = cookie_filename(url)
                # First look in current directory
                if not os.path.isfile(filename):
                    print_debug ("Unable to find " + filename + ' locally.')
                    # Next look in ~/.config/recipe-dl
                    path = os.path.expanduser('~') + "/.config/recipe-dl"
                    print_debug ("Searching " + path)
                    if os.path.isfile(path + '/' + filename):
                        filename = path + '/' + filename
                    else:
                        # Lastly look where the script is located.
                        print_debug ("Not found. Using script location.")
                        filename = os.path.dirname(os.path.abspath(__file__)) + "/" + filename

                if os.path.isfile(filename):
                    print_debug("found.")
                    with open(filename, 'rb') as f:
                        return pickle.load(f)
                else:
                    print_debug ("Unable to find " + filename)
                    return None

            def get_credentials():
                """ Retrieve Credentals """

                def input_credential(prompt):
                    """ Prompt and input credentals """
                    credential = ''
                    while credential == '':
                        print_to_console(prompt)
                        credential = input()
                    return credential

                credential_json = {}
                credential_json['user'] = input_credential("Enter email address:")
                credential_json['pass'] = input_credential("Enter password:"******""" Load page using existing cookies """

                print_debug ("Getting page using cookies...")

                recipe_page = None

                if cookies is None:
                    #load cookies and do a request
                    cookies = load_cookies(url)

                if cookies is not None:
                    print_debug('cookies = ' + str(requests.utils.dict_from_cookiejar(cookies)))
                    recipe_page = requests.get(url, cookies=cookies).text

                return recipe_page

            def get_page_using_session(args, url):

                print_debug ("Getting page using sessions...")

                auth_json = get_credentials()

                session_requests = requests.session()

                domain = url2domain(url)
                signin_url = "https://" + domain +"/sign_in?next=%2F"

                signin_page = session_requests.get(signin_url)
                tree = html.fromstring(signin_page.text)
                action = tree.xpath('//form[@class="appForm"]/@action')[0]

                payload={}
                input_elements = tree.xpath('//form[@class="appForm"]//input')
                for input_element in input_elements:
                    payload[input_element.name] = input_element.value
                payload['utf8'] = '&#x2713;'
                payload['user[email]'] = auth_json['user']
                payload['user[password]'] = auth_json['pass']

                # Perform login
                authorize_url = "https://" + domain + action + "?next=%2F"
                result = session_requests.post(authorize_url, data = payload, headers = dict(referer = signin_url))

                save_cookies(session_requests.cookies, url)

                # Grab page
                recipe_page = session_requests.get(url, headers = dict(referer = url))

                return recipe_page.text

            raw_json = json.loads('{ "paywall": true }')
            raw_html = None
            if not args.authorize_ci:
                # Getting file using cookies
                raw_html = get_page_using_cookie(args, url)
                if raw_html is not None:
                    raw_json = find_script(raw_html)

            if args.authorize_ci or raw_html is None or found_paywall(raw_json):
                print_debug('Getting page using full authentication')
                raw_html = get_page_using_session(args, url)
                raw_json = find_script(raw_html)

            raw_json = raw_json['props']['initialState']['content']['documents']
            raw_json = raw_json[list(json.loads(json.dumps(raw_json)))[0]]

            return raw_json

        print_debug("Using Cook's Illustrated scraper...")
        recipe_json={}
        recipe_json['url'] = url

        source_json = get_json(args, url)

        if source_json is not None:
            print_debug(str(source_json))
            recipe_json['title'] = json_clean_value(source_json, 'title')
            recipe_json['description'] = strip_tags(json_clean_value(source_json['metaData']['fields'], 'description'), strip_newline = True)
            recipe_json['yield'] = json_clean_value(source_json, 'yields')

            # Parse Times
            time_note = json_clean_value(source_json, 'recipeTimeNote')
            if time_note == '':
                time_note = 'TBD'
            recipe_json['preptime'] = ''
            recipe_json['cooktime'] = ''
            recipe_json['totaltime'] = time_note

            author = json_clean_value(source_json['metaData']['fields'], 'source')
            if author == '':
                author = url2publisher(url)
            recipe_json['author'] = author

            # Ingredients
            recipe_json['ingredient_groups'] = []
            ingredient_groups = json_clean_value(source_json, "ingredientGroups")
            for group in ingredient_groups:
                group_json = json.loads('{"title":"","ingredients":[]}')
                if len(ingredient_groups) > 1:
                    group_json['title'] = json_clean_value(group['fields'], 'title')
                ingredients = json_clean_value(group['fields'], "recipeIngredientItems")
                for ingredient in ingredients:
                    qty  = json_clean_value(ingredient['fields'], "qty")
                    unit  = json_clean_value(ingredient['fields'], "preText")
                    item  = json_clean_value(json_clean_value(ingredient['fields'], "ingredient", json.loads('{"fields": ""}'))['fields'], 'title')
                    modifier  = json_clean_value(ingredient['fields'], "postText")
                    group_json['ingredients'].append(strip_tags("%s %s %s%s" % (qty, unit, item, modifier), strip_newline = True))
                recipe_json['ingredient_groups'].append(group_json)

            # Directions
            recipe_json['direction_groups'] = []
            group_json = json.loads('{"group":"","directions":[]}')
            steps = json_clean_value(source_json, "instructions")
            for step in steps:
                group_json['directions'].append(strip_tags(json_clean_value(step['fields'], "content"), strip_newline = True))
            recipe_json['direction_groups'].append(group_json)

            recipe_json['notes'] = []
            recipe_json['notes'].append(strip_tags(json_clean_value(source_json, 'headnote'), strip_newline = True))

        else:
            raise UrlError(url, 'URL not supported.')

        return recipe_json
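
    # Hedged sketch of the minutes2time() helper these scrapers call,
    # assuming it renders a minute count as an "H:MM" string and returns
    # `default` when the count is zero; underscore-named so it cannot shadow
    # the real helper defined elsewhere in the module.
    def _minutes2time_sketch(minutes, default=''):
        if not minutes:
            return default
        return '%d:%02d' % (minutes // 60, minutes % 60)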

    def saveur2json(args, url):
        """ Loads Saveur URL and builds recipe JSON """

        print_debug("Using Saveur scraper...")
        recipe_json={}
        recipe_json['url'] = url

        page = BeautifulSoup(requests.get(url).text.replace("\u2014"," "), 'html5lib')

        recipe_json['title'] = page.select_one('.entry-title').text
        #recipe_json['description'] = page.select_one('p.paragraph:first-child').text
        recipe_json['description'] = page.find("div", {'property':'description'}).text
        recipe_json['yield'] = page.select_one('div.yield span').text

        # Parse Times (Saveur only publishes a cook time)
        minutes_cook = iso8601.to_minutes(page.select_one('div.cook-time meta')['content'])
        recipe_json['preptime'] = minutes2time(0, '')
        recipe_json['cooktime'] = minutes2time(minutes_cook, '')
        recipe_json['totaltime'] = minutes2time(minutes_cook)

        recipe_json['author'] = url2publisher(url)

        # Ingredients
        recipe_json['ingredient_groups'] = []
        recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
        for ingredient in page.find_all("li", class_="ingredient"):
            recipe_json['ingredient_groups'][0]['ingredients'].append(ingredient.text.replace("\n","").strip())

        # Directions
        out_instruction = []
        for instruction in page.find_all("li", class_="instruction"):
            out_instruction.append(instruction.text.replace("\n", "").strip())
        recipe_json['direction_groups'] = []
        recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
        recipe_json['direction_groups'][0]['directions'] = out_instruction
        return recipe_json
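
    # Hedged sketch of the iso8601.to_minutes() helper used above, assuming
    # it converts an ISO-8601 duration string such as "PT1H30M" into whole
    # minutes; the real implementation lives in the iso8601 module.
    def _iso8601_to_minutes_sketch(duration):
        match = re.match(r'P(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?', duration or '')
        if not match:
            return 0
        days, hours, minutes = (int(g) if g else 0 for g in match.groups())
        return days * 24 * 60 + hours * 60 + minutes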

    def stcg2json(args, url):
        """ Loads Saveur URL and builds recipe JSON """

        print_debug("Using Sam the Cooking Guy scraper...")
        recipe_json={}
        recipe_json['url'] = url

        page = BeautifulSoup(requests.get(url).text.replace("\u2014"," "), 'html5lib')

        title = page.select_one('title').text
        # Strip the trailing " SAM THE COOKING GUY" site suffix (the '.' matches the separator character)
        recipe_json['title'] = re.sub(r'.\s*SAM THE COOKING GUY\s*$', '', title).strip()
        paragraphs = page.select('div.sqs-block-content p')
        recipe_json['yield'] = paragraphs[0].text if paragraphs else ''
        if len(paragraphs) > 1:
            recipe_json['description'] = paragraphs[1].text

        # Parse Times (times are not parsed for this site, so leave them blank)
        recipe_json['preptime'] = ''
        recipe_json['cooktime'] = ''
        recipe_json['totaltime'] = ''

        recipe_json['author'] = url2publisher(url)

        # Ingredients
        recipe_json['ingredient_groups'] = []
        recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
        ingredients = page.select('div.sqs-layout div.row div.sqs-block div.sqs-block-content p')
        if not ingredients:
            ingredients = page.select_one('div.sqs-block-content ul').find_all('li', attrs={'class': None})
        for ingredient in ingredients:
            recipe_json['ingredient_groups'][0]['ingredients'].append(ingredient.text.replace("\n","").strip())

        # Directions
        out_instruction = []
        instructions = page.select_one('div.sqs-block-content ul').find_all('li', attrs={'class': None})
        if not instructions:
            instructions = page.select_one('div.sqs-block-content').find('ul', attrs={'data-rte-list': 'default'}).find_all('li', attrs={'class': None})
        for instruction in instructions:
            out_instruction.append(instruction.text.replace("\n", "").strip())
        recipe_json['direction_groups'] = []
        recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
        recipe_json['direction_groups'][0]['directions'] = out_instruction

        return recipe_json
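
    # Hedged sketches of the url2domain()/url2publisher() helpers used
    # throughout these scrapers, assuming they extract the host name and a
    # human-readable publisher name from a URL; the real implementations
    # live elsewhere in the module.
    def _url2domain_sketch(url):
        from urllib.parse import urlparse
        return urlparse(url).netloc

    def _url2publisher_sketch(url):
        domain = _url2domain_sketch(url)
        # e.g. "www.thecookingguy.com" -> "Thecookingguy"
        return domain.split('.')[-2].capitalize()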

    def epicurious2json(args, url):
        """ Loads Epicurious URL and builds recipe JSON """

        def get_json(args, url):
            """ Find and load "standardized" json document containing recipe """
            return_value = None
            page = BeautifulSoup(requests.get(url).text, 'html5lib')
            scripts = page.find_all('script')
            for script in scripts:
                match = re.search(r'root\.__INITIAL_STATE__\.store', script.text)
                if match:
                    for line in iter(script.text.splitlines()):
                        match = re.search(r'root\.__INITIAL_STATE__\.store', line)
                        if match:
                            raw_json_text = re.sub('[^}]*$','', line)
                            raw_json_text = re.sub('^[^{]*', '', raw_json_text)
                            raw_json_text = re.sub('"email":{"regExp":.*,"password"', '"email":{"regExp":"","password"', raw_json_text)
                            raw_json_text = re.sub('"password":{"regExp":.*,"messages"', '"password":{"regExp":""},"messages"', raw_json_text)
                            raw_json = json.loads(raw_json_text)
                            return_value = json_clean_value(raw_json, 'content', json.loads('{}'))
                            #print_debug(json.dumps(return_value, indent=4))
            return return_value

        print_debug("Using Epicurious scraper...")
        recipe_json={}
        recipe_json['url'] = url

        source_json = get_json(args, url)

        if source_json is not None:
            recipe_json['title'] = json_clean_value(source_json, 'hed')
            recipe_json['description'] = strip_tags(json_clean_value(source_json, 'dek'))
            recipe_json['yield'] = json_clean_value(json_clean_value(source_json, 'servingSizeInfo',json.loads('{}')), 'servingSizeDescription')

            # Parse Times
            minutes_prep = iso8601.to_minutes(json_clean_value(source_json, 'formattedPrepTime'))
            minutes_cook = iso8601.to_minutes(json_clean_value(source_json, 'formattedCookTime'))
            minutes_total = minutes_prep + minutes_cook
            if minutes_prep == 0 and minutes_total > 0 and minutes_cook > 0:
                minutes_prep = minutes_total - minutes_cook
            recipe_json['preptime'] = minutes2time(minutes_prep, '')
            recipe_json['cooktime'] = minutes2time(minutes_cook, '')
            recipe_json['totaltime'] = minutes2time(minutes_total)

            # Parse Author
            publisher = "Epicurious"
            author = json_clean_value(source_json, 'author', '')
            if isinstance(author, list):
                if 'name' in author[0]:
                    author = author[0]['name']
            elif 'name' in author:
                author = author['name']
            if publisher != "":
                if author == "" or publisher == author:
                    author = publisher
                elif publisher not in author:
                    author = publisher + ' (' + author + ')'
            recipe_json['author'] = author

            # Ingredients
            recipe_json['ingredient_groups'] = []
            ingredient_groups = json_clean_value(source_json, "ingredientGroups")
            for group in ingredient_groups:
                group_json = json.loads('{"title":"","ingredients":[]}')
                if len(ingredient_groups) > 1:
                    group_json['title'] = json_clean_value(group, "hed")
                ingredients = json_clean_value(group, "ingredients")
                for ingredient in ingredients:
                    group_json['ingredients'].append(strip_tags(json_clean_value(ingredient, "description")))
                recipe_json['ingredient_groups'].append(group_json)

            # Directions
            recipe_json['direction_groups'] = []
            direction_groups = json_clean_value(source_json, "preparationGroups")
            for group in direction_groups:
                group_json = json.loads('{"group":"","directions":[]}')
                if len(direction_groups) > 1:
                    group_json['group'] = strip_tags(json_clean_value(group, "hed"))
                steps = json_clean_value(group, "steps")
                for step in steps:
                    group_json['directions'].append(strip_tags(json_clean_value(step, "description")))
                recipe_json['direction_groups'].append(group_json)

        else:
            raise UrlError(url, 'URL not supported.')

        return recipe_json
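
    # Hedged sketch of json_clean_value(), which these scrapers use as a
    # null-safe dictionary lookup: return obj[key] when present and non-None,
    # otherwise the supplied default (an empty string unless stated). The
    # real helper is defined elsewhere; this copy is for illustration only.
    def _json_clean_value_sketch(obj, key, default=''):
        value = obj.get(key) if isinstance(obj, dict) else None
        return value if value is not None else default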

    def recipe_scraper2json(args, url):
        from recipe_scrapers import scrape_me

        print_debug("Using recipe-scraper module...")

        recipe_json={}
        recipe_json['url'] = url

        try:
            scraper = scrape_me(url)

            recipe_json['title'] = scraper.title()
            recipe_json['description'] = ''
            recipe_json['yield'] = scraper.yields()
            recipe_json['preptime'] = ''
            recipe_json['cooktime'] = ''
            recipe_json['totaltime'] = minutes2time(scraper.total_time())
            recipe_json['ingredient_groups'] = []
            recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
            recipe_json['ingredient_groups'][0]['ingredients'] = scraper.ingredients()
            recipe_json['direction_groups'] = []
            recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
            instructions = scraper.instructions().split('\n')
            recipe_json['direction_groups'][0]['directions'] = instructions

        except Exception:
            raise UrlError(url, 'URL not supported.')

        return recipe_json
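
    # Standalone illustration of the recipe-scrapers API exercised above;
    # the URL here is a placeholder, not one taken from this project.
    def _recipe_scrapers_demo(url='https://www.example.com/some-recipe'):
        from recipe_scrapers import scrape_me
        scraper = scrape_me(url)
        return scraper.title(), scraper.yields(), scraper.total_time()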

    def generic2json(args, url):
        """ Loads generic URL and builds recipe JSON """

        def get_json(url):
            """ Find and load "standardized" json document containing recipe """

            return_value = None

            user_agent = {'User-agent': 'Mozilla/5.0'}
            page = requests.get(url, headers = user_agent)

            match = re.search(r'<script[^>]*type=.?application/ld\+json.?[^>]*>', page.text)
            if match:
                print_debug("Found an occurance of 'application/ld+json'")
                soup = BeautifulSoup(page.text, 'html5lib')
                scripts = soup.find_all('script', attrs = {'type':'application/ld+json'})
                for script in scripts:
                    json_stripped = re.sub(r'^[^{\[]*', '', script.text)
                    raw_json = json.loads(json_stripped)
                    if isinstance(raw_json, list):
                        return_value = json_find_array_element(raw_json, '@type', 'Recipe')
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', '')
                            if return_value['publisher'] == '':
                                return_value['publisher'] = json_clean_value(json_find_array_element(raw_json, '@type', 'Organization'), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)
                    elif '@graph' in raw_json and isinstance(raw_json['@graph'], list):
                        return_value = json_find_array_element(raw_json['@graph'], '@type', 'Recipe')
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', '')
                            if return_value['publisher'] == '':
                                return_value['publisher'] = json_clean_value(json_find_array_element(raw_json['@graph'], '@type', 'Organization'), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)
                    else:
                        if return_value is None:
                            try:
                                if raw_json['@type'] == 'Recipe' and 'recipeIngredient' in raw_json:
                                    return_value = raw_json
                                else:
                                    return_value = None
                            except Exception:
                                return_value = None
                        try:
                            return_value['publisher'] = json_clean_value(json_clean_value(return_value, 'publisher', json.loads('{}')), 'name', url2publisher(url))
                        except Exception:
                            if return_value is not None:
                                return_value['publisher'] = url2publisher(url)

                    if return_value is None or 'recipeIngredient' not in return_value:
                        return_value = None
            return return_value
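
        # Hedged sketch of json_find_array_element(), which get_json() relies
        # on above; assumed to return the first dict in `items` whose `key`
        # entry equals `value`, or None when nothing matches.
        def _json_find_array_element_sketch(items, key, value):
            for item in items:
                if isinstance(item, dict) and item.get(key) == value:
                    return item
            return None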

        print_debug("Using generic scraper...")
        recipe_json={}
        recipe_json['url'] = url
        source_json = get_json(url)

        if source_json is None:
            print_info("No application+ld json attempting to use recipe-scrapers...")
            recipe_json = recipe_scraper2json(args, url)
        else:
            print_debug(json.dumps(source_json))

            recipe_json['title'] = json_clean_value(source_json, 'headline', json_clean_value(source_json, 'name'))
            recipe_json['description'] = json_clean_value(source_json, 'description')
            if 'recipeYield' in source_json and isinstance(source_json['recipeYield'], list):
                recipe_json['yield'] = max(source_json['recipeYield'])
            else:
                recipe_json['yield'] = json_clean_value(source_json, 'recipeYield')

            # Parse Times
            minutes_total = iso8601.to_minutes(json_clean_value(source_json, 'totalTime'))
            minutes_cook = iso8601.to_minutes(json_clean_value(source_json, 'cookTime'))
            minutes_prep = iso8601.to_minutes(json_clean_value(source_json, 'prepTime'))
            if minutes_prep == 0 and minutes_total > 0 and minutes_cook > 0:
                minutes_prep = minutes_total - minutes_cook
            if minutes_total == 0 and (minutes_prep > 0 or minutes_cook > 0):
                minutes_total = minutes_prep + minutes_cook
            recipe_json['preptime'] = minutes2time(minutes_prep, '')
            recipe_json['cooktime'] = minutes2time(minutes_cook, '')
            recipe_json['totaltime'] = minutes2time(minutes_total)

            # Parse Author
            publisher = json_clean_value(source_json, 'publisher')
            author = json_clean_value(source_json, 'author')
            if isinstance(author, list):
                if 'name' in author[0]:
                    author = author[0]['name']
            elif 'name' in author:
                author = author['name']
            if publisher != "":
                if author == "" or publisher == author:
                    author = publisher
                elif publisher not in author:
                    author = publisher + ' (' + author + ')'
            recipe_json['author'] = author

            # Ingredients
            ingredients = list(json_find_key(source_json, "recipeIngredient"))[0]
            recipe_json['ingredient_groups'] = []
            recipe_json['ingredient_groups'].append(json.loads('{"title":"","ingredients":[]}'))
            out_ingredients = []
            for ingredient in ingredients:
                out_ingredients.append(strip_tags(ingredient))
            recipe_json['ingredient_groups'][0]['ingredients'] = out_ingredients

            # Directions
            out_instruction = []
            instructionsSection = list(json_find_key(source_json, 'recipeInstructions'))[0]
            try:
                instructions = list(json_find_key(source_json, 'itemListElement'))[0]
            except IndexError:
                instructions = instructionsSection
            print_debug(str(instructions))
            if isinstance(instructions, list):
                for instruction in instructions:
                    try:
                        # HowToStep dicts carry the step text in a 'text' field
                        out_instruction.append(strip_tags(instruction['text']))
                    except (TypeError, KeyError):
                        # Fall back to treating the step as a plain string
                        out_instruction.append(strip_tags(str(instruction)))
            else:
                out_instruction.append(strip_tags(str(instructions)))

            recipe_json['direction_groups'] = []
            recipe_json['direction_groups'].append(json.loads('{"group":"","directions":[]}'))
            recipe_json['direction_groups'][0]['directions'] = out_instruction

        return recipe_json
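
    # Hedged sketch of the json_find_key() generator used by generic2json(),
    # assumed to walk a nested JSON structure and yield every value stored
    # under `key`, wherever it appears.
    def _json_find_key_sketch(obj, key):
        if isinstance(obj, dict):
            for k, v in obj.items():
                if k == key:
                    yield v
                else:
                    yield from _json_find_key_sketch(v, key)
        elif isinstance(obj, list):
            for item in obj:
                yield from _json_find_key_sketch(item, key)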

    custom_print_init (quiet=args.quiet, debug=args.debug)

    print_info ("Processsing %s..." % (url))

    # Branch based on domain
    domain = url2domain(url)
    print_debug ("Branching based on domain (%s)..." % domain)
    if domain in [ 'www.americastestkitchen.com','www.cookscountry.com','www.cooksillustrated.com' ]:
        recipe_json = ci2json(args, url)
    elif domain == 'www.epicurious.com':
        if args.force_recipe_scraper:
            try:
                recipe_json = recipe_scraper2json(args, url)
            except Exception:
                recipe_json = epicurious2json(args, url)
        else:
            recipe_json = epicurious2json(args, url)
    elif domain == 'www.saveur.com':
        recipe_json = saveur2json(args, url)
    elif domain == 'www.thecookingguy.com':
        recipe_json = stcg2json(args, url)
    else:
        if args.force_recipe_scraper:
            try:
                recipe_json = recipe_scraper2json(args, url)
            except Exception:
                recipe_json = generic2json(args, url)
        else:
            recipe_json = generic2json(args, url)
    return recipe_json
Example #15
0
            def load_cookies(url):
                """ Loads Cookie jar """
                print_debug ('Loading cookies...')

                filename = cookie_filename(url)
                # First look in current directory
                if not os.path.isfile(filename):
                    print_debug ("Unable to find " + filename + ' locally.')
                    # Next look in ~/.config/recipe-dl
                    path = os.path.expanduser('~') + "/.config/recipe-dl"
                    print_debug ("Searching " + path)
                    if os.path.isfile(path + '/' + filename):
                        filename = path + '/' + filename
                    else:
                        # Lastly look where the script is located.
                        print_debug ("Not found. Using script location.")
                        filename = os.path.dirname(os.path.abspath(__file__)) + "/" + filename

                if os.path.isfile(filename):
                    print_debug("found.")
                    with open(filename, 'rb') as f:
                        return pickle.load(f)
                else:
                    print_debug ("Unable to find " + filename)
                    return None
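
            # Hedged sketch of the save_cookies() counterpart called after the
            # session login above, assuming it pickles the cookie jar to the
            # same per-domain file that load_cookies() searches for.
            def _save_cookies_sketch(cookies, url):
                filename = cookie_filename(url)
                with open(filename, 'wb') as f:
                    pickle.dump(cookies, f)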