def parse_schema_ingredient(ing):
    """Parse one ingredient and return the corrected string suitable for
    ConvenientImporter.add_ing_from_text()

    The first whitespace-separated token is treated as the amount: a German
    decimal comma is replaced by a dot and the value is rewritten as a
    fraction string (for example "1,5 kg Mehl" becomes "3/2 kg Mehl").
    An ingredient consisting of a single token, or whose first token is not
    a valid number, is returned unchanged.

    @param ing: the ingredient
    @ptype ing: string
    @return: the adjusted ingredient string suitable for the default importer
    @rtype: string
    """
    parts = ing.split(None, 1)
    if len(parts) < 2:
        # a single token (or an empty string) carries no separate amount
        return ing
    raw_amount, rest = parts
    try:
        # replace german comma with dot, then convert to a fraction
        amount = Fraction(raw_amount.replace(',', '.'))
    except (ValueError, ZeroDivisionError):
        # Fraction raises ZeroDivisionError (not ValueError) for a zero
        # denominator such as "1/0"; treat it like any other bad amount
        warn("could not parse amount %r" % raw_amount)
        return ing
    # %s renders the fraction as a string so non-integer fractions can be
    # parsed by gourmet
    return "%s %s" % (amount, rest)
def getdatafromurl(url, content_type_check=None):
    """Download data from URL.

    The connection object obtained from get_url_socket() is closed before
    returning, whether or not the download succeeded.

    @param url: the URL to download
    @ptype url: string
    @param content_type_check: if non-empty, only return data if the
        Content-Type header starts with the given string (compared
        case-insensitively)
    @ptype content_type_check: string
    @return: URL data or None, and the url (which may be redirected to a
        new URL)
    @rtype: tuple (string, string) or (None, string)
    """
    data = None
    sock = None
    try:
        sock = get_url_socket(url)
        # update URL in case of redirects
        url = sock.geturl()
        if content_type_check:
            content_type = sock.info().get('content-type',
                                           'application/octet-stream')
            # lowercase both sides so a mixed-case prefix still matches
            if content_type.lower().startswith(content_type_check.lower()):
                data = sock.read()
        else:
            data = sock.read()
    except urllib.error.URLError as msg:
        warn("could not get data from URL %r: %s" % (url, msg))
    finally:
        # release the connection; get_url_socket() is defined elsewhere, so
        # only call close() when the returned object actually provides it
        close = getattr(sock, 'close', None)
        if callable(close):
            close()
    return data, url
def do_run(self):
    """Build the recipe and its ingredients from the parsed JSON data.

    When no JSON recipe data can be extracted from self.data a warning is
    issued and no recipe is created. In both cases the parent class's
    do_run() is called afterwards and its result is returned.
    """
    recipe_json = parse_json_from_data(self.data, self.url)
    if not recipe_json:
        warn(
            "could not find recipe data in chefkoch.de URL %s - please submit a bug report" %
            self.url)
    else:
        # start_rec() initializes self.rec to an empty recipe dict
        self.start_rec()
        self.rec['source'] = 'Web'
        self.rec['link'] = self.url
        parse_schema_recipe(recipe_json, self.rec)
        for ingredient_text in parse_schema_ingredients(recipe_json):
            # add_ing_from_text() uses self.db.parse_ingredient() on the text
            self.add_ing_from_text(ingredient_text)
        self.commit_rec()
    return super(ChefkochDeParser, self).do_run()
def parse_schema_recipe(json_data, recipe):
    """Fill given recipe dict with data from JSON.

    Only the 'name' key is required. Every other key ('recipeCategory',
    'cookTime', 'prepTime', 'recipeInstructions', 'recipeYield',
    'aggregateRating', 'image') is copied when present and skipped when
    missing, so an incomplete schema does not abort the import.

    @param json_data: the parsed JSON recipe schema data
    @ptype json_data: dict (with various keys and values)
    @param recipe: the gourmet recipe to fill
    @ptype recipe: dict
    @return: nothing, the recipe will be modified instead
    @rtype: None
    """
    recipe['title'] = json_data['name']

    categories = json_data.get('recipeCategory')
    if isinstance(categories, str):
        # a single category given as plain text must not be indexed or
        # joined character by character
        categories = [categories]
    if categories:
        # gourmet only has one category per recipe, so get the first one
        recipe['category'] = categories[0]
        if len(categories) > 1:
            # If there are several categories add them to the modifications.
            recipe['modifications'] = "Kategorien: %s" % (
                ", ".join(categories))

    # both durations are parsed the same way; only the key names differ
    for json_key, recipe_key in (('cookTime', 'cooktime'),
                                 ('prepTime', 'preptime')):
        if json_key not in json_data:
            continue
        try:
            duration = parse_iso8601_duration(json_data[json_key])
        except ValueError:
            warn("could not parse %s %r" % (json_key, json_data[json_key]))
        else:
            recipe[recipe_key] = duration

    if 'recipeInstructions' in json_data:
        recipe['instructions'] = json_data['recipeInstructions']
    if 'recipeYield' in json_data:
        recipe['yields'] = json_data['recipeYield']

    if 'aggregateRating' in json_data:
        rating = float(json_data['aggregateRating']['ratingValue'])
        # adjust "1 to 5" rating of chefkoch to "1 to 10" of gourmet
        recipe['rating'] = int(rating * 2)

    image = json_data.get('image')
    if image:
        if isinstance(image, list):
            # several images may be listed; use the first one
            image = image[0]
        # the (possibly redirected) URL returned alongside is not needed
        recipe['image'], _final_url = getdatafromurl(
            image, content_type_check="image/")
def parse_schema_ingredient(ing):
    """Parse one chefkoch.de ingredient and return the corrected string
    suitable for ConvenientImporter.add_ing_from_text()

    Note that this parser has specific features only suitable for german
    recipes from chefkoch.de.

    An ingredient starting with a space has no amount: it is stripped and a
    leading "evtl." is replaced by the english "optional: " tag. Otherwise
    the first token is treated as the amount: a German decimal comma is
    replaced by a dot and the value is rewritten as a fraction string. If
    the amount cannot be parsed the ingredient is returned unchanged.

    @param ing: the ingredient
    @ptype ing: string
    @return: the adjusted ingredient string suitable for the default importer
    @rtype: string
    """
    if ing.startswith(" "):
        # amount is missing when ingredient starts with a space
        # this is likely specific for chefkoch.de
        ing = ing.strip()
        if ing.lower().startswith("evtl."):
            # add english optional tag in place of the 5-character "evtl."
            ing = "optional: " + ing[5:]
        return ing

    parts = ing.split(None, 1)
    if len(parts) < 2:
        # a single token (or an empty string) carries no separate amount
        return ing
    raw_amount, rest = parts
    try:
        # replace german comma with dot, then convert to a fraction
        amount = Fraction(raw_amount.replace(',', '.'))
    except (ValueError, ZeroDivisionError):
        # Fraction raises ZeroDivisionError (not ValueError) for a zero
        # denominator such as "1/0"; treat it like any other bad amount
        warn("could not parse amount %r" % raw_amount)
        return ing
    # %s renders the fraction as a string so non-integer fractions can be
    # parsed by gourmet
    return "%s %s" % (amount, rest)
def parse_schema_recipe(soup, recipe):
    """Fill given recipe dict with data from HTML.

    Reads the itemprop attributes "name", "recipeCategory", "ratingValue",
    "image", "performTime", "totalTime", "recipeYield" and "description",
    plus the first paragraph of a div with class "tips" and all elements
    with id "recipesCatFilterLink". Each value is only stored when the
    corresponding element is found.

    @param soup: the parsed HTML data from BeautifulSoup
    @ptype soup: BeautifulSoup.Tag
    @param recipe: the gourmet recipe to fill
    @ptype recipe: dict
    @return: nothing, the recipe will be modified instead
    @rtype: None
    """
    # matches any attribute value with at least one character
    nonempty = re.compile(r".+")

    tag = soup.find(itemprop="name", content=nonempty)
    if tag:
        recipe['title'] = tag["content"]

    tag = soup.find(itemprop="recipeCategory")
    if tag:
        recipe['category'] = tag.text

    tag = soup.find(itemprop="ratingValue", content=nonempty)
    if tag:
        rating = float(tag["content"])
        # adjust "1 to 5" rating to "1 to 10" of gourmet
        recipe['rating'] = int(rating * 2)

    tag = soup.find(itemprop="image", src=nonempty)
    if tag:
        image = tag["src"]
        # the (possibly redirected) URL returned alongside is not needed
        recipe['image'], unused = getdatafromurl(
            image, content_type_check="image/")

    # stays 0 when performTime is missing or unparsable, so the cook time
    # below then equals the total time
    preptime = 0
    tag = soup.find(itemprop="performTime", content=nonempty)
    if tag:
        try:
            preptime = parse_iso8601_duration(tag['content'])
        except ValueError:
            warn("could not parse prepTime %r" % tag['content'])
        else:
            recipe['preptime'] = preptime

    tag = soup.find(itemprop="totalTime", content=nonempty)
    if tag:
        try:
            totaltime = parse_iso8601_duration(tag['content'])
        except ValueError:
            # name the field that actually failed to parse
            warn("could not parse totalTime %r" % tag['content'])
        else:
            # the cooking time is the difference between total and prep time
            recipe['cooktime'] = totaltime - preptime

    tag = soup.find(itemprop="recipeYield")
    if tag and tag.text:
        recipe['yields'] = tag.text.strip()

    tag = soup.find(itemprop="description")
    if tag:
        # replace images in the text with alternative text since they
        # are sometimes used in place
        for img in tag.find_all('img'):
            img.string = translate_image_text("[" + get_alt_text(img) + "]")
        recipe['instructions'] = tag.get_text()

    tag = soup.find("div", attrs={"class": "tips"})
    if tag:
        p = tag.find("p")
        if p:
            recipe['modifications'] = p.text.strip()

    categories = soup.find_all(id="recipesCatFilterLink")
    if categories:
        cattext = ", ".join(cat.text.strip() for cat in categories)
        # _() is presumably the gettext translation function; confirm
        # against the module's imports
        text = "%s %s" % (_("Category:"), cattext)
        # append the category list to any tips already stored
        if "modifications" in recipe:
            recipe["modifications"] += "\n\n" + text
        else:
            recipe["modifications"] = text