def _get_recipe_overviews(self, overview_soup, entry_url):
    """Collect recipes linked from an overview page, keyed by recipe id.

    Every ``div.waku`` element that contains a link yields one recipe. The
    linked detail page is then downloaded; when it contains an
    ``#other-recipe`` element, the recipe linked from there is added too.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page; links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``.

    Raises:
        AttributeError: If a detail URL does not match ``.*/(.*)\\.html``.
    """
    id_pattern = r".*/(.*)\.html"

    def fetch_linked_recipe(page_url):
        # Returns the recipe referenced by "#other-recipe", or None when the
        # request is not OK or the page has no such element.
        # TLS certificate verification is disabled for this request.
        response = requests.get(page_url, verify=False)
        if not response.ok:
            return None
        page = BeautifulSoup(response.content,
                             "html5lib",
                             from_encoding=response.apparent_encoding)
        node = page.select_one("#other-recipe")
        if not node:
            return None
        linked = Recipe()
        linked.detail_url = urllib.parse.urljoin(page_url, node.a["href"])
        linked.id = re.search(id_pattern, linked.detail_url).group(1)
        return linked

    found = dict()  # key: Recipe.id, value: Recipe
    for box in overview_soup.find_all("div", "waku"):
        if not box.a:
            continue
        entry = Recipe()
        entry.detail_url = urllib.parse.urljoin(entry_url, box.a["href"])
        entry.id = re.search(id_pattern, entry.detail_url).group(1)
        found[entry.id] = entry

        extra = fetch_linked_recipe(entry.detail_url)
        if extra:
            found[extra.id] = extra
    return found
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per anchor whose text is exactly "レシピ".

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page (not used by this parser).

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``. The id is the stem of the
        anchor's ``href`` as computed by ``pathlib.Path``.
    """
    by_id = dict()  # key: Recipe.id, value: Recipe
    for anchor in overview_soup.find_all("a"):
        if anchor.text != "レシピ":
            continue
        entry = Recipe()
        # The href is stored as-is; it is not resolved against entry_url.
        entry.detail_url = anchor["href"]
        entry.id = pathlib.Path(entry.detail_url).stem
        entry.program_name = self.program_name
        by_id[entry.id] = entry
    return by_id
def get_other_recipes(detail_url):
    """Download a detail page and return the additional recipes it links to.

    Each ``div.detail-more-title`` element on the page yields one recipe.
    Its id is the last two path segments of the resolved link joined by
    ``_`` (taken from the pattern ``.*/(.*)/(.*)/``).

    Args:
        detail_url: URL of the recipe detail page to download.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``; empty when the response is
        not OK or no matching element exists.
    """
    # TLS certificate verification is disabled for this request.
    response = requests.get(detail_url, verify=False)
    if not response.ok:
        return dict()

    page = BeautifulSoup(response.content,
                         "html5lib",
                         from_encoding=response.apparent_encoding)
    linked = dict()  # key: Recipe.id, value: Recipe
    for title_node in page.find_all("div", "detail-more-title"):
        entry = Recipe()
        entry.detail_url = urllib.parse.urljoin(detail_url,
                                                title_node.a["href"])
        segments = re.search(r".*/(.*)/(.*)/", entry.detail_url).groups()
        entry.id = "_".join(segments)
        linked[entry.id] = entry
    return linked
def get_other_recipe(detail_url):
    """Download a detail page and return the recipe its "#other-recipe" links to.

    Args:
        detail_url: URL of the recipe detail page to download.

    Returns:
        A ``Recipe`` with ``detail_url`` and ``id`` set, or ``None`` when the
        response is not OK or the page has no ``#other-recipe`` element.
    """
    # TLS certificate verification is disabled for this request.
    response = requests.get(detail_url, verify=False)
    if not response.ok:
        return None

    page = BeautifulSoup(response.content,
                         "html5lib",
                         from_encoding=response.apparent_encoding)
    node = page.select_one("#other-recipe")
    if not node:
        return None

    linked = Recipe()
    linked.detail_url = urllib.parse.urljoin(detail_url, node.a["href"])
    # The id is the file name of the resolved link without ".html".
    linked.id = re.search(r".*/(.*)\.html", linked.detail_url).group(1)
    return linked
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Parse every recipe embedded in a single program page.

    The recipes live under the parent of the first ``<h2>`` whose text
    matches ``レシピ.*``. Each ``<h3>`` below that parent marks one recipe;
    the recipe's data is read from the ``<h3>``'s parent element.

    Args:
        overview_soup: Parsed page (BeautifulSoup-style object).
        entry_url: URL of the page. Its path stem, prefixed with "20", is
            parsed as the program date; image links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``. Ids have the form
        ``YYYYMMDD_<index>`` where index is the recipe's position on the
        page. Empty when the page has no recipe heading.
    """
    recipe_title_node = overview_soup.find("h2", text=re.compile(r"レシピ.*"))
    if recipe_title_node is None:
        logger.info("{} have no recipe.".format(entry_url))
        return dict()
    recipe_root_node = recipe_title_node.parent

    # Raw strings: "\(" and "\d" are invalid escapes in ordinary literals.
    image_pattern = re.compile(r"background-image:url\((.*?)\);")
    step_pattern = re.compile(r"^(\d+)(.*)")
    marukakko_table = self.__class__._TABLE_REPLACE_MARUKAKKO

    recipes = dict()  # key: Recipe.id, value: Recipe
    for index, h3 in enumerate(recipe_root_node.find_all("h3")):
        recipe_node = h3.parent

        recipe = Recipe()
        recipe.program_date = dateutil.parser.parse("20{}".format(
            pathlib.Path(entry_url).stem))
        recipe.program_name = self.program_name
        recipe.detail_url = entry_url
        recipe.cooking_name = recipe_node.h3.text
        # The image URL is taken from the inline CSS of the first <img>.
        image_path = image_pattern.search(recipe_node.img["style"]).group(1)
        recipe.image_urls.append(urllib.parse.urljoin(entry_url, image_path))

        is_material_area = False
        is_recipe_step_area = False
        # The second <p> holds both the material list and the steps.
        for line in recipe_node.find_all("p")[1].text.splitlines():
            if not line.strip():
                continue

            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source. The whole 【材料】 branch is treated as applying only
            # before the steps section has started - confirm.
            if "【材料】" in line and not is_recipe_step_area:
                is_material_area = True
                heading = line.replace("【材料】", "").translate(
                    marukakko_table).strip()
                if heading:
                    recipe.materials.append(RecipeText(heading))
                continue

            if "【作り方】" in line:
                is_material_area = False
                is_recipe_step_area = True
                continue

            if is_material_area:
                recipe.materials.append(
                    RecipeText(line.replace(":", ": ")))
            elif is_recipe_step_area:
                step_text = line
                numbered = step_pattern.match(line)
                if numbered:
                    # Rewrite a leading step number "1..." as "(1)...".
                    number, rest = numbered.groups()
                    step_text = "({}){}".format(number, rest.strip())
                recipe.recipe_steps.append(RecipeText(step_text))

        recipe.id = "{:%Y%m%d}_{}".format(recipe.program_date, index)
        recipes[recipe.id] = recipe
    return recipes
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Parse the recipes on a page laid out as subtitle and recipe sections.

    All ``<section>`` elements except the first are walked in order. A
    section whose ``h2.option-sub-title`` has no following ``<p>`` sibling
    sets the current subtitle. Any other section containing an ``<h2>`` is
    parsed as a recipe belonging to that subtitle.

    Args:
        overview_soup: Parsed page (BeautifulSoup-style object).
        entry_url: URL of the page; stored as each recipe's ``detail_url``
            and used to resolve image links.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``. Ids have the form
        ``YYYYMMDD_<md5 of "subtitle/name">``. Recipes whose parsed program
        date is not earlier than the current time are skipped.

    Raises:
        TypeError: If a recipe section is reached before any subtitle.
        AttributeError: If the subtitle contains no "<digits><sep><digits>"
            date.
    """
    remove_kakko_table = self.__class__._TABLE_REMOVE_KAKKO
    marukakko_table = self.__class__._TABLE_REPLACE_MARUKAKKO
    # Raw string: "\d" and "\D" are invalid escapes in an ordinary literal.
    date_pattern = re.compile(r"(\d+)\D+(\d+)\D+")

    recipes = dict()  # key: Recipe.id, value: Recipe
    current_subtitle = None
    for item in overview_soup.find_all("section")[1:]:
        if item.h1:
            continue

        subtitle_node = item.find("h2", "option-sub-title")
        if subtitle_node and subtitle_node.find_next_sibling("p") is None:
            # NOTE(review): this assignment was commented out, which left
            # current_subtitle as None forever and made the date regex below
            # raise TypeError for every recipe. Re-enabled; confirm it
            # matches the current page layout.
            current_subtitle = subtitle_node.text.translate(
                remove_kakko_table).strip()
            continue

        if not item.h2:
            continue

        title_node = item
        recipe = Recipe()
        recipe.detail_url = entry_url
        recipe.cooking_name = title_node.h2.text.translate(
            remove_kakko_table).strip()
        recipe.cooking_name_sub = current_subtitle
        recipe.program_name = self.program_name
        # The subtitle carries the date as two numbers, parsed as "a/b".
        recipe.program_date = dateutil.parser.parse("{}/{}".format(
            *date_pattern.search(recipe.cooking_name_sub).groups()))
        recipe.image_urls.append(
            urllib.parse.urljoin(entry_url, title_node.img["src"]))

        is_material_area = False
        is_recipe_step_area = False
        body = title_node.find("div", "option-media-row").get_text("\n")
        for line in body.splitlines():
            if not line.strip():
                continue
            if "<材料>" in line:
                is_material_area = True
                # Whatever follows the marker is stored, even when empty.
                recipe.materials.append(
                    RecipeText(
                        line.replace("<材料>",
                                     "").translate(marukakko_table)))
                continue
            if "<作り方>" in line:
                is_material_area = False
                is_recipe_step_area = True
                continue

            if is_material_area:
                # One material per whitespace-separated token.
                recipe.materials.extend([
                    RecipeText(token.replace(":", ": "))
                    for token in line.split()
                ])
            elif is_recipe_step_area:
                recipe.recipe_steps.append(RecipeText(line))

        if not recipe.program_date < datetime.datetime.now():
            logger.debug("{} is invalid date".format(recipe.program_date))
            continue

        if recipe.cooking_name_sub:
            hash_source = "{}/{}".format(recipe.cooking_name_sub,
                                         recipe.cooking_name)
        else:
            hash_source = recipe.cooking_name
        recipe.id = "{:%Y%m%d}_{}".format(
            recipe.program_date,
            hashlib.md5(hash_source.encode("utf-8")).hexdigest())
        recipes[recipe.id] = recipe
    return recipes
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Collect recipes from a search-result style overview page.

    Every ``div.result-title`` yields one recipe whose link is the ``href``
    of the element's parent. Each detail page is then downloaded, and the
    recipes linked from its ``div.detail-more-title`` elements are merged
    into the result.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page; links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``.
    """

    def fetch_linked_recipes(page_url):
        # Recipes referenced by a detail page; empty when the request is not
        # OK. TLS certificate verification is disabled for this request.
        linked = dict()  # key: Recipe.id, value: Recipe
        response = requests.get(page_url, verify=False)
        if not response.ok:
            return linked
        page = BeautifulSoup(response.content,
                             "html5lib",
                             from_encoding=response.apparent_encoding)
        for title_node in page.find_all("div", "detail-more-title"):
            extra = Recipe()
            extra.detail_url = urllib.parse.urljoin(page_url,
                                                    title_node.a["href"])
            # Id is the last two path segments joined by "_".
            segments = re.search(r".*/(.*)/(.*)/", extra.detail_url).groups()
            extra.id = "_".join(segments)
            linked[extra.id] = extra
        return linked

    found = dict()  # key: Recipe.id, value: Recipe
    for title in overview_soup.find_all("div", "result-title"):
        entry = Recipe()
        entry.detail_url = urllib.parse.urljoin(entry_url,
                                                title.parent["href"])
        entry.id = re.search(r".*/(.*)/", entry.detail_url).group(1)
        found[entry.id] = entry

        found.update(fetch_linked_recipes(entry.detail_url))
    return found
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per ``<li>`` of the ``ul.recipeTable`` list.

    Each list item must contain exactly four ``<p>`` elements: the first
    holds the program date and the third the episode subtitle.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page; links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` (file name of the detail page without
        ``.html``) to ``Recipe``.

    Raises:
        ValueError: If a list item does not contain exactly four ``<p>``.
    """
    date_pattern = r"(\d+)\D+(\d+)\D+(\d+)\D*"

    by_id = dict()  # key: Recipe.id, value: Recipe
    table = overview_soup.find("ul", "recipeTable")
    for row in table.find_all("li"):
        entry = Recipe()
        entry.detail_url = urllib.parse.urljoin(entry_url, row.a["href"])
        entry.id = re.search(r".*/(.*)\.html", entry.detail_url).group(1)

        date_node, _, subtitle_node, _ = row.find_all("p")
        entry.cooking_name_sub = "〜{}〜より".format(subtitle_node.text)
        entry.program_name = self.program_name

        # The date text is three numbers separated by non-digits.
        date_parts = re.match(date_pattern, date_node.text).groups()
        year, month, day = (int(part) for part in date_parts)
        entry.program_date = datetime.date(year, month, day)

        by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per ``div.item`` element of the overview page.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page (not used by this parser).

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``. The id is
        ``<digits>_<name>`` taken from a ``/<digits>/<name>.html`` link.
    """
    url_pattern = re.compile(r"/(\d+)/(.*)?\.html")

    by_id = dict()  # key: Recipe.id, value: Recipe
    for box in overview_soup.find_all("div", "item"):
        entry = Recipe()
        # The href is stored as-is; it is not resolved against entry_url.
        entry.detail_url = box.a["href"]
        url_match = url_pattern.search(entry.detail_url)
        entry.id = "{}_{}".format(*url_match.group(1, 2))
        entry.cooking_name = box.h4.text
        entry.program_name = self.program_name
        date_text = box.find("div", "date").text
        entry.program_date = dateutil.parser.parse(date_text)
        by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per ``div.titletext`` element of the overview page.

    The program date is read from the ``date=<digits>`` part of the link
    inside the element's ``p.title``; the same digits, as an int, form the id.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page (not used by this parser).

    Returns:
        dict mapping ``Recipe.id`` (int) to ``Recipe``.
    """
    by_id = dict()  # key: Recipe.id, value: Recipe
    for block in overview_soup.find_all("div", "titletext"):
        title = block.find("p", "title")

        entry = Recipe()
        # The href is stored as-is; it is not resolved against entry_url.
        entry.detail_url = title.a["href"]
        date_digits = re.search(r"date=(\d+)\D?", entry.detail_url).group(1)
        entry.id = int(date_digits)
        entry.cooking_name = title.text
        entry.program_name = self.program_name
        entry.program_date = dateutil.parser.parse(date_digits)

        by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per link found in the weekday cells of the page.

    Cells are selected with ``td .mon,.tue,.wed,.thu,.fri``; every ``<a>``
    inside a selected element yields a recipe.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page; links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` (int, the digits of ``/<digits>.html``)
        to ``Recipe``.
    """
    by_id = dict()  # key: Recipe.id, value: Recipe
    for cell in overview_soup.select("td .mon,.tue,.wed,.thu,.fri"):
        for anchor in cell.find_all("a"):
            entry = Recipe()
            entry.detail_url = urllib.parse.urljoin(entry_url,
                                                    anchor["href"])
            date_digits = re.search(r"/(\d+)\.html",
                                    entry.detail_url).group(1)
            entry.id = int(date_digits)
            # The dish name is the last whitespace-separated token of the
            # link text.
            words = anchor.text.split()
            entry.cooking_name = words[-1]
            entry.program_name = self.program_name
            entry.program_date = dateutil.parser.parse(date_digits)
            by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per ``div.recipe-piece`` element of the page.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page; links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` (int, the digits of ``/<digits>.html``)
        to ``Recipe``. The same digits are parsed as the program date.
    """
    by_id = dict()  # key: Recipe.id, value: Recipe
    for piece in overview_soup.find_all("div", "recipe-piece"):
        detail_url = urllib.parse.urljoin(entry_url, piece.a["href"])
        date_digits = re.search(r"/(\d+)\.html", detail_url).group(1)

        entry = Recipe()
        entry.detail_url = detail_url
        entry.id = int(date_digits)
        entry.program_name = self.program_name
        entry.program_date = dateutil.parser.parse(date_digits)
        by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per ``<li>`` of the ``ul.recipe-list`` list.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page (not used by this parser).

    Returns:
        dict mapping ``Recipe.id`` (int, the trailing ``/<digits>/`` of the
        link) to ``Recipe``.
    """
    date_pattern = r"(\d+)\D+(\d+)\D+(\d+)\D*"

    by_id = dict()  # key: Recipe.id, value: Recipe
    listing = overview_soup.find("ul", "recipe-list")
    for row in listing.find_all("li"):
        name_node = row.find("p", "name")
        date_node = row.find("p", "date")

        entry = Recipe()
        # The href is stored as-is; it is not resolved against entry_url.
        entry.detail_url = row.a["href"]
        id_digits = re.search(r".*/(\d+)/$", entry.detail_url).group(1)
        entry.id = int(id_digits)
        entry.cooking_name = name_node.text
        entry.program_name = self.program_name

        # The date text is three numbers separated by non-digits.
        date_parts = re.match(date_pattern, date_node.text).groups()
        year, month, day = (int(part) for part in date_parts)
        entry.program_date = datetime.date(year, month, day)

        by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, jdata, entry_url):
    """Build recipes from an already-decoded JSON listing.

    Each element of ``jdata["result"]`` yields one recipe. The program date
    comes from the first broadcast event whose ``misc.releaseLevel`` is
    ``"original"``; when one exists, the id becomes ``<date>_<item id>``,
    otherwise it stays the bare item id.

    Args:
        jdata: Mapping with a ``"result"`` list of recipe items.
        entry_url: URL the listing was fetched from (not used here).

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``. Items whose program date
        is not earlier than today are skipped.
    """
    by_id = dict()  # key: Recipe.id, value: Recipe
    for item in jdata["result"]:
        entry = Recipe()
        entry.detail_url = item["url"]
        entry.id = item["id"]
        # The episode name is stored without its first and last character.
        episode_name = item["identifierGroup"]["episodeName"]
        entry.cooking_name_sub = episode_name[1:-1]
        entry.program_name = self.program_name

        original_event = next(
            (event for event in item["broadcastEvent"]
             if event["misc"]["releaseLevel"] == "original"),
            None,
        )
        if original_event is not None:
            date_text = original_event["identifierGroup"]["date"]
            entry.program_date = dateutil.parser.parse(date_text).date()
            entry.id = "{}_{}".format(date_text, item["id"])

        # NOTE(review): placement after the broadcast-event scan was
        # reconstructed from whitespace-collapsed source - confirm.
        if not entry.program_date < datetime.date.today():
            logger.debug("{} is invalid date".format(entry.program_date))
            continue

        by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per anchor that wraps an image.

    The recipe id is the integer following the first ``=`` in the resolved
    link. When the image file name contains six consecutive digits, they are
    read as ``yymmdd`` and stored as the program date.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page; links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` (int) to ``Recipe``.

    Raises:
        TypeError: If a link contains no ``=``.
        ValueError: If the text after ``=`` is not an integer.
    """
    image_date_pattern = re.compile(r".*?(\d{6}).*")

    recipes = dict()  # key: Recipe.id, value: Recipe
    for link in overview_soup.find_all("a"):
        if not link.img:
            continue

        recipe = Recipe()
        recipe.detail_url = urllib.parse.urljoin(entry_url, link["href"])

        # str.partition replaces the deprecated, undocumented
        # urllib.parse.splitvalue(); both split on the first "=".
        _, separator, id_text = recipe.detail_url.partition("=")
        if not separator:
            # splitvalue() returned None here and int(None) raised
            # TypeError; the exception type is kept for existing callers.
            raise TypeError(
                "detail URL has no '=' to take the recipe id from: "
                "{!r}".format(recipe.detail_url))
        recipe.id = int(id_text)
        recipe.program_name = self.program_name
        recipes[recipe.id] = recipe

        date_match = image_date_pattern.match(
            pathlib.Path(link.img["src"]).name)
        if date_match:
            yymmdd = date_match.group(1)
            logger.debug("program_date:{}".format(yymmdd))
            recipe.program_date = dateutil.parser.parse(
                "20{}".format(yymmdd))
    return recipes
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Build one recipe per ``div.waku`` element that contains a link.

    The last run of digits in the final path segment of the link is used
    both as the integer id and, parsed as a date, as the program date.

    Args:
        overview_soup: Parsed overview page (BeautifulSoup-style object).
        entry_url: URL of the overview page; links are resolved against it.

    Returns:
        dict mapping ``Recipe.id`` (int) to ``Recipe``.
    """
    by_id = dict()  # key: Recipe.id, value: Recipe
    for box in overview_soup.find_all("div", "waku"):
        if not box.a:
            continue
        entry = Recipe()
        entry.detail_url = urllib.parse.urljoin(entry_url, box.a["href"])
        id_digits = re.search(r"/\D*(\d+)\D*$", entry.detail_url).group(1)
        entry.id = int(id_digits)
        entry.program_name = self.program_name
        entry.program_date = dateutil.parser.parse(str(entry.id))
        by_id[entry.id] = entry
    return by_id
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Parse recipes from a page made of ``<section>`` groups split by ``<hr>``.

    The ``section``/``hr`` elements are walked in document order, skipping
    the first and the last. Within each ``<hr>``-delimited group, the first
    section is the subtitle (its ``<h2>`` carries the broadcast date) and
    every following section is one recipe.

    Args:
        overview_soup: Parsed page (BeautifulSoup-style object).
        entry_url: URL of the page; stored as each recipe's ``detail_url``
            and used to resolve image links.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``. The id is ``YYYYMMDD`` for
        the first recipe of a group and ``YYYYMMDD_<n>`` for the n-th
        (n >= 2). Recipes whose parsed program date is not earlier than the
        current time are skipped.

    Raises:
        AttributeError: If a subtitle contains no "<digits><sep><digits>"
            date, or a recipe section lacks ``div.option-media-row``.
    """
    remove_kakko_table = self.__class__._TABLE_REMOVE_KAKKO
    marukakko_table = self.__class__._TABLE_REPLACE_MARUKAKKO
    # Raw string: "\d" and "\D" are invalid escapes in an ordinary literal.
    date_pattern = re.compile(r"(\d+)\D+(\d+)\D+")

    recipes = dict()  # key: Recipe.id, value: Recipe
    subtitle_node = None
    title_node_counter = 0
    for item in overview_soup.select("section,hr")[1:-1]:
        if item.name == "hr":
            # A rule ends the current group.
            subtitle_node = None
            title_node_counter = 0
            continue
        if subtitle_node is None:
            subtitle_node = item
            continue

        title_node = item
        title_node_counter += 1

        recipe = Recipe()
        recipe.detail_url = entry_url
        # Fall back to the first <p> when the section has no <h2>.
        name_node = title_node.h2 if title_node.h2 else title_node.p
        recipe.cooking_name = name_node.text.translate(
            remove_kakko_table).strip()
        recipe.cooking_name_sub = subtitle_node.h2.text.strip()
        recipe.program_name = self.program_name
        # The subtitle carries the date as two numbers, parsed as "a/b".
        recipe.program_date = dateutil.parser.parse("{}/{}".format(
            *date_pattern.search(recipe.cooking_name_sub).groups()))
        if title_node.img:
            recipe.image_urls.append(
                urllib.parse.urljoin(entry_url, title_node.img["src"]))

        is_material_area = False
        is_recipe_step_area = False
        body = title_node.find("div", "option-media-row").get_text("\n")
        for line in body.splitlines():
            if not line.strip():
                continue

            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source. The whole 【材料】 branch is treated as applying only
            # before the steps section has started - confirm.
            if "【材料】" in line and not is_recipe_step_area:
                is_material_area = True
                heading = line.replace("【材料】", "").translate(
                    marukakko_table).strip()
                if heading:
                    recipe.materials.append(RecipeText(heading))
                continue

            if "【作り方】" in line:
                is_material_area = False
                is_recipe_step_area = True
                continue

            if is_material_area:
                # Lines from splitlines() hold no "\n", so each line is
                # exactly one material; no further split is needed.
                material = line.replace("… ", "…").replace("…", ": ")
                if material.startswith("・"):
                    material = material[1:]
                recipe.materials.append(RecipeText(material))
            elif is_recipe_step_area:
                recipe.recipe_steps.append(RecipeText(line))

        if not recipe.program_date < datetime.datetime.now():
            logger.debug("{} is invalid date".format(recipe.program_date))
            continue

        recipe.id = "{:%Y%m%d}".format(recipe.program_date)
        if 1 < title_node_counter:
            recipe.id += "_{}".format(title_node_counter)
        recipes[recipe.id] = recipe
    return recipes
def _get_recipe_overviews(self, overview_soup, entry_url):
    """Parse recipes from a page of subtitle, note and recipe sections.

    All ``<section>`` elements except the first are walked in order:

    * sections containing a ``<table>`` or an ``<h1>`` are ignored;
    * a section with an ``<h2>`` starts a new subtitle and resets the
      collected important points;
    * a section with a ``<p>`` but no ``<img>`` contributes the lines of
      that ``<p>`` as important points;
    * a section with a ``<p>`` and an ``<img>`` is a recipe. It is named by
      its ``<h3>`` when present, otherwise by the current subtitle.

    Args:
        overview_soup: Parsed page (BeautifulSoup-style object).
        entry_url: URL of the page; stored as each recipe's ``detail_url``
            and used to resolve image links.

    Returns:
        dict mapping ``Recipe.id`` to ``Recipe``. The id is the MD5 hex
        digest of ``"<subtitle>/<name>"``, or of the name alone when the
        recipe has no ``cooking_name_sub``.
    """
    remove_kakko_table = self.__class__._TABLE_REMOVE_KAKKO
    marukakko_table = self.__class__._TABLE_REPLACE_MARUKAKKO
    step_pattern = re.compile(r"(\d+).\s*(.*)")

    by_id = dict()  # key: Recipe.id, value: Recipe
    subtitle = None
    pending_points = list()
    for section in overview_soup.find_all("section")[1:]:
        if section.table or section.h1:
            continue
        if section.h2:
            subtitle = section.h2.text.translate(remove_kakko_table).strip()
            pending_points.clear()
            continue
        if section.p is None:
            continue

        entry = Recipe()
        entry.detail_url = entry_url
        entry.program_name = self.program_name
        entry.program_date = None

        if section.img is None:
            # Image-less sections only carry notes for upcoming recipes.
            pending_points.extend(
                RecipeText(text)
                for text in section.p.get_text("\n").splitlines())
            continue

        if section.h3:  # multiple recipe
            entry.cooking_name = section.h3.text
            entry.cooking_name_sub = subtitle
        else:  # single recipe
            entry.cooking_name = subtitle
        entry.important_points.extend(pending_points)
        entry.image_urls.append(
            urllib.parse.urljoin(entry_url, section.img["src"]))

        in_materials = False
        in_steps = False
        for text in section.p.get_text("\n").splitlines():
            if not text.strip():
                continue

            if "◎材料" in text:
                in_materials = True
                heading = text.replace("◎材料",
                                       "").translate(marukakko_table).strip()
                if heading:
                    entry.materials.append(RecipeText(heading))
                continue

            if "<作り方>" in text:
                in_materials = False
                in_steps = True
                continue

            if in_materials:
                # Re-attach the counters so split() keeps them with their
                # quantity; one material per whitespace-separated token.
                joined = text.replace(" 本", "本").replace(" 個", "個")
                entry.materials.extend([
                    RecipeText(token.replace(":", ": "))
                    for token in joined.split()
                ])
            elif in_steps:
                numbered = step_pattern.match(text)
                if numbered:
                    # Rewrite "<n><any char> text" as "(<n>)text".
                    number_text, step_body = numbered.groups()
                    entry.recipe_steps.append(
                        RecipeText("({}){}".format(int(number_text),
                                                   step_body)))
                else:
                    entry.recipe_steps.append(RecipeText(text))

        if entry.cooking_name_sub:
            hash_source = "{}/{}".format(entry.cooking_name_sub,
                                         entry.cooking_name)
        else:
            hash_source = entry.cooking_name
        entry.id = hashlib.md5(hash_source.encode("utf-8")).hexdigest()
        by_id[entry.id] = entry
    return by_id