Пример #1
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Build a dict of the recipes linked from an overview page.

        Every ``div.waku`` element that contains an anchor yields one
        ``Recipe``. Each detail page is then downloaded, and when it has an
        ``#other-recipe`` element the recipe that element links to is added
        to the result as well.

        Args:
            overview_soup: Parsed overview page.
            entry_url: URL of the overview page; relative links are resolved
                against it.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        id_pattern = r".*/(.*)\.html"

        def fetch_linked_recipe(page_url):
            """Return the recipe referenced by ``#other-recipe``, or None."""
            response = requests.get(page_url, verify=False)
            if not response.ok:
                return None

            page = BeautifulSoup(response.content,
                                 "html5lib",
                                 from_encoding=response.apparent_encoding)
            node = page.select_one("#other-recipe")
            if not node:
                return None

            linked = Recipe()
            linked.detail_url = urllib.parse.urljoin(page_url, node.a["href"])
            linked.id = re.search(id_pattern, linked.detail_url).group(1)
            return linked

        recipes = {}  # key: Recipe.id, value: Recipe
        for box in overview_soup.find_all("div", "waku"):
            # Boxes without a link carry no recipe.
            if not box.a:
                continue

            entry = Recipe()
            entry.detail_url = urllib.parse.urljoin(entry_url, box.a["href"])
            entry.id = re.search(id_pattern, entry.detail_url).group(1)
            recipes[entry.id] = entry

            linked = fetch_linked_recipe(entry.detail_url)
            if linked:
                recipes[linked.id] = linked

        return recipes
Пример #2
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per anchor whose text is exactly "レシピ".

        The anchor's ``href`` is stored unchanged as the detail URL, and its
        path stem becomes the recipe id.

        Args:
            overview_soup: Parsed overview page.
            entry_url: Not used by this implementation.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        for anchor in overview_soup.find_all("a"):
            if anchor.text != "レシピ":
                continue

            entry = Recipe()
            entry.detail_url = anchor["href"]
            entry.id = pathlib.Path(entry.detail_url).stem
            entry.program_name = self.program_name
            recipes[entry.id] = entry

        return recipes
Пример #3
0
 def get_other_recipes(detail_url):
     """Download a detail page and collect the further recipes it links to.

     Each ``div.detail-more-title`` element contributes one ``Recipe`` whose
     id is the two captured URL path segments joined with ``_``.

     Args:
         detail_url: URL of the detail page to download.

     Returns:
         dict mapping ``Recipe.id`` to ``Recipe``; empty when the response
         is not OK.
     """
     found = {}  # key: Recipe.id, value: Recipe
     response = requests.get(detail_url, verify=False)
     if not response.ok:
         return found

     page = BeautifulSoup(response.content,
                          "html5lib",
                          from_encoding=response.apparent_encoding)
     for title_node in page.find_all("div", "detail-more-title"):
         linked = Recipe()
         linked.detail_url = urllib.parse.urljoin(detail_url,
                                                  title_node.a["href"])
         id_parts = re.search(r".*/(.*)/(.*)/", linked.detail_url).groups()
         linked.id = "_".join(id_parts)
         found[linked.id] = linked
     return found
Пример #4
0
 def get_other_recipe(detail_url):
     """Return the recipe referenced by a page's ``#other-recipe`` element.

     Args:
         detail_url: URL of the detail page to download.

     Returns:
         A ``Recipe`` with ``detail_url`` and ``id`` set, or None when the
         response is not OK or the page has no ``#other-recipe`` element.
     """
     response = requests.get(detail_url, verify=False)
     if not response.ok:
         return None

     page = BeautifulSoup(response.content,
                          "html5lib",
                          from_encoding=response.apparent_encoding)
     node = page.select_one("#other-recipe")
     if not node:
         return None

     linked = Recipe()
     linked.detail_url = urllib.parse.urljoin(detail_url, node.a["href"])
     linked.id = re.search(r".*/(.*)\.html", linked.detail_url).group(1)
     return linked
Пример #5
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Parse every recipe embedded in a single entry page.

        The recipes live under the parent of the first ``<h2>`` whose text
        matches ``レシピ.*``; each ``<h3>`` below that parent marks one recipe.
        Materials and steps are read from the second ``<p>`` of the recipe
        node: lines after a ``【材料】`` marker are materials, lines after a
        ``【作り方】`` marker are steps.

        Args:
            overview_soup: Parsed entry page.
            entry_url: URL of the entry page. Its path stem, prefixed with
                "20", is parsed as the program date; it is also stored as
                each recipe's detail URL.

        Returns:
            dict mapping ``Recipe.id`` (``YYYYMMDD_<index>``) to ``Recipe``;
            empty when the page has no recipe heading.
        """
        recipe_title_node = overview_soup.find("h2", text=re.compile(r"レシピ.*"))
        if recipe_title_node is None:
            logger.info("{} have no recipe.".format(entry_url))
            return dict()

        recipe_root_node = recipe_title_node.parent

        recipes = dict()  # key: Recipe.id, value: Recipe
        recipe_nodes = [h3.parent for h3 in recipe_root_node.find_all("h3")]
        for ii, recipe_node in enumerate(recipe_nodes):
            recipe = Recipe()

            recipe.program_date = dateutil.parser.parse("20{}".format(
                pathlib.Path(entry_url).stem))
            recipe.program_name = self.program_name
            recipe.detail_url = entry_url
            recipe.cooking_name = recipe_node.h3.text
            # The image is only available as an inline CSS background URL.
            recipe.image_urls.append(
                urllib.parse.urljoin(
                    entry_url,
                    re.search(r"background-image:url\((.*?)\);",
                              recipe_node.img["style"]).group(1)))

            is_material_area = False
            is_recipe_step_area = False
            for line in recipe_node.find_all("p")[1].text.splitlines():
                if not line.strip():
                    continue

                # A material marker seen after the steps have started is not
                # a section header; it falls through and is handled as a
                # regular line.
                if "【材料】" in line and not is_recipe_step_area:
                    is_material_area = True
                    heading = line.replace("【材料】", "").translate(
                        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                    if heading:
                        recipe.materials.append(RecipeText(heading))
                    continue
                if "【作り方】" in line:
                    is_material_area = False
                    is_recipe_step_area = True
                    continue

                if is_material_area:
                    recipe.materials.append(
                        RecipeText(line.replace(":", ": ")))
                elif is_recipe_step_area:
                    step_text = line
                    # Rewrite a leading step number "1..." as "(1)...".
                    numbered = re.match(r"^(\d+)(.*)", line)
                    if numbered:
                        num, rest = numbered.groups()
                        step_text = "({}){}".format(num, rest.strip())
                    recipe.recipe_steps.append(RecipeText(step_text))

            recipe.id = "{:%Y%m%d}_{}".format(recipe.program_date, ii)

            recipes[recipe.id] = recipe

        return recipes
Пример #6
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Parse the recipes embedded in the ``<section>`` elements of a page.

        The first section is skipped, as is any section containing ``<h1>``.
        A section whose ``h2.option-sub-title`` has no following ``<p>``
        sibling only sets the subtitle for the recipes after it. Any other
        section containing ``<h2>`` is parsed as one recipe: lines after a
        ``<材料>`` marker are materials, lines after ``<作り方>`` are steps.

        Args:
            overview_soup: Parsed page.
            entry_url: URL of the page; stored as each recipe's detail URL
                and used to resolve the image URL.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``. Recipes whose parsed
            program date is not in the past are left out.
        """
        recipes = dict()  # key: Recipe.id, value: Recipe

        current_subtitle = None
        for item in overview_soup.find_all("section")[1:]:
            if item.h1:
                continue

            subtitle_node = item.find("h2", "option-sub-title")
            if subtitle_node and subtitle_node.find_next_sibling("p") is None:
                # Subtitle-only section: remember it for following recipes.
                current_subtitle = subtitle_node.text.translate(
                    self.__class__._TABLE_REMOVE_KAKKO).strip()
                continue

            if not item.h2:
                continue

            recipe = Recipe()
            recipe.detail_url = entry_url
            recipe.cooking_name = item.h2.text.translate(
                self.__class__._TABLE_REMOVE_KAKKO).strip()
            recipe.cooking_name_sub = current_subtitle
            recipe.program_name = self.program_name
            # The date is built from the first two digit groups of the
            # subtitle, joined as "<first>/<second>".
            date_parts = re.search(r"(\d+)\D+(\d+)\D+",
                                   recipe.cooking_name_sub).groups()
            recipe.program_date = dateutil.parser.parse(
                "{}/{}".format(*date_parts))
            recipe.image_urls.append(
                urllib.parse.urljoin(entry_url, item.img["src"]))

            is_material_area = False
            is_recipe_step_area = False
            body_text = item.find("div", "option-media-row").get_text("\n")
            for line in body_text.splitlines():
                if not line.strip():
                    continue

                if "<材料>" in line:
                    is_material_area = True
                    # NOTE(review): the heading remainder is appended even
                    # when it is empty and is not stripped - confirm this is
                    # intended.
                    recipe.materials.append(
                        RecipeText(
                            line.replace("<材料>", "").translate(
                                self.__class__._TABLE_REPLACE_MARUKAKKO)))
                    continue
                if "<作り方>" in line:
                    is_material_area = False
                    is_recipe_step_area = True
                    continue

                if is_material_area:
                    # One line may hold several whitespace-separated items.
                    recipe.materials.extend(
                        [RecipeText(m.replace(":", ": "))
                         for m in line.split()])
                elif is_recipe_step_area:
                    recipe.recipe_steps.append(RecipeText(line))

            if not recipe.program_date < datetime.datetime.now():
                logger.debug("{} is invalid date".format(recipe.program_date))
                continue

            if recipe.cooking_name_sub:
                name_key = "{}/{}".format(recipe.cooking_name_sub,
                                          recipe.cooking_name)
            else:
                name_key = recipe.cooking_name
            recipe.id = "{:%Y%m%d}_{}".format(
                recipe.program_date,
                hashlib.md5(name_key.encode("utf-8")).hexdigest())
            recipes[recipe.id] = recipe

        return recipes
Пример #7
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Collect recipes from an overview page plus those their pages link.

        Every ``div.result-title`` element yields one ``Recipe`` whose detail
        URL comes from the ``href`` of the element's parent. Each detail page
        is then downloaded and the recipes behind its
        ``div.detail-more-title`` elements are merged into the result.

        Args:
            overview_soup: Parsed overview page.
            entry_url: URL of the overview page; relative links are resolved
                against it.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        def fetch_linked_recipes(page_url):
            """Return the recipes linked from the page at ``page_url``."""
            found = {}  # key: Recipe.id, value: Recipe
            response = requests.get(page_url, verify=False)
            if not response.ok:
                return found

            page = BeautifulSoup(response.content,
                                 "html5lib",
                                 from_encoding=response.apparent_encoding)
            for title_node in page.find_all("div", "detail-more-title"):
                linked = Recipe()
                linked.detail_url = urllib.parse.urljoin(
                    page_url, title_node.a["href"])
                id_parts = re.search(r".*/(.*)/(.*)/",
                                     linked.detail_url).groups()
                linked.id = "_".join(id_parts)
                found[linked.id] = linked
            return found

        recipes = {}  # key: Recipe.id, value: Recipe
        for title in overview_soup.find_all("div", "result-title"):
            entry = Recipe()
            entry.detail_url = urllib.parse.urljoin(entry_url,
                                                    title.parent["href"])
            entry.id = re.search(r".*/(.*)/", entry.detail_url).group(1)
            recipes[entry.id] = entry

            recipes.update(fetch_linked_recipes(entry.detail_url))

        return recipes
Пример #8
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per ``<li>`` of the ``ul.recipeTable`` list.

        Each list item must contain exactly four ``<p>`` elements: the first
        holds the program date and the third the sub-title text.

        Args:
            overview_soup: Parsed overview page.
            entry_url: URL of the overview page; relative links are resolved
                against it.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        rows = overview_soup.find("ul", "recipeTable").find_all("li")
        for row in rows:
            entry = Recipe()
            entry.detail_url = urllib.parse.urljoin(entry_url, row.a["href"])
            entry.id = re.search(r".*/(.*)\.html", entry.detail_url).group(1)

            date_node, _, subtitle_node, _ = row.find_all("p")
            entry.cooking_name_sub = "〜{}〜より".format(subtitle_node.text)
            entry.program_name = self.program_name

            # Three digit groups, passed to datetime.date in the order found.
            date_match = re.match(r"(\d+)\D+(\d+)\D+(\d+)\D*", date_node.text)
            entry.program_date = datetime.date(*map(int, date_match.groups()))
            recipes[entry.id] = entry

        return recipes
Пример #9
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per ``div.item`` element of the overview page.

        The id joins two parts captured from the detail URL: a numeric path
        segment and the text between it and ``.html``.

        Args:
            overview_soup: Parsed overview page.
            entry_url: Not used by this implementation.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        for box in overview_soup.find_all("div", "item"):
            entry = Recipe()
            entry.detail_url = box.a["href"]

            url_match = re.search(r"/(\d+)/(.*)?\.html", entry.detail_url)
            entry.id = "{}_{}".format(*url_match.group(1, 2))

            entry.cooking_name = box.h4.text
            entry.program_name = self.program_name
            date_text = box.find("div", "date").text
            entry.program_date = dateutil.parser.parse(date_text)
            recipes[entry.id] = entry

        return recipes
Пример #10
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per ``div.titletext`` element.

        The digits following ``date=`` in the detail URL serve both as the
        integer recipe id and as the text parsed into the program date.

        Args:
            overview_soup: Parsed overview page.
            entry_url: Not used by this implementation.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        for block in overview_soup.find_all("div", "titletext"):
            title_node = block.find("p", "title")

            entry = Recipe()
            entry.detail_url = title_node.a["href"]
            date_match = re.search(r"date=(\d+)\D?", entry.detail_url)
            date_digits = date_match.group(1)
            entry.id = int(date_digits)
            entry.cooking_name = title_node.text
            entry.program_name = self.program_name
            entry.program_date = dateutil.parser.parse(date_digits)
            recipes[entry.id] = entry

        return recipes
Пример #11
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per link inside the weekday elements.

        The numeric file name of the detail URL serves both as the integer
        recipe id and as the text parsed into the program date. The cooking
        name is the last whitespace-separated token of the link text.

        Args:
            overview_soup: Parsed overview page.
            entry_url: URL of the overview page; relative links are resolved
                against it.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        day_cells = overview_soup.select("td .mon,.tue,.wed,.thu,.fri")
        for cell in day_cells:
            for anchor in cell.find_all("a"):
                entry = Recipe()
                entry.detail_url = urllib.parse.urljoin(entry_url,
                                                        anchor["href"])
                date_match = re.search(r"/(\d+)\.html", entry.detail_url)
                date_digits = date_match.group(1)
                entry.id = int(date_digits)
                entry.cooking_name = anchor.text.split()[-1]
                entry.program_name = self.program_name
                entry.program_date = dateutil.parser.parse(date_digits)
                recipes[entry.id] = entry

        return recipes
Пример #12
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per ``div.recipe-piece`` element.

        The numeric file name of the detail URL serves both as the integer
        recipe id and as the text parsed into the program date.

        Args:
            overview_soup: Parsed overview page.
            entry_url: URL of the overview page; relative links are resolved
                against it.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        for piece in overview_soup.find_all("div", "recipe-piece"):
            entry = Recipe()
            entry.detail_url = urllib.parse.urljoin(entry_url, piece.a["href"])
            date_match = re.search(r"/(\d+)\.html", entry.detail_url)
            date_digits = date_match.group(1)
            entry.id = int(date_digits)
            entry.program_name = self.program_name
            entry.program_date = dateutil.parser.parse(date_digits)
            recipes[entry.id] = entry

        return recipes
Пример #13
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per ``<li>`` of the ``ul.recipe-list`` list.

        The id is the trailing numeric path segment of the detail URL; the
        name and date come from the item's ``p.name`` and ``p.date``.

        Args:
            overview_soup: Parsed overview page.
            entry_url: Not used by this implementation.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        rows = overview_soup.find("ul", "recipe-list").find_all("li")
        for row in rows:
            entry = Recipe()
            name_node = row.find("p", "name")
            date_node = row.find("p", "date")

            entry.detail_url = row.a["href"]
            id_match = re.search(r".*/(\d+)/$", entry.detail_url)
            entry.id = int(id_match.group(1))
            entry.cooking_name = name_node.text
            entry.program_name = self.program_name

            # Three digit groups, passed to datetime.date in the order found.
            date_match = re.match(r"(\d+)\D+(\d+)\D+(\d+)\D*", date_node.text)
            entry.program_date = datetime.date(*map(int, date_match.groups()))
            recipes[entry.id] = entry

        return recipes
Пример #14
0
    def _get_recipe_overviews(self, jdata, entry_url):
        """Create recipes from the ``result`` list of a decoded JSON payload.

        The first broadcast event whose release level is ``"original"``
        provides the program date and changes the id to
        ``<date string>_<item id>``. Items whose program date is not before
        today are left out.

        Args:
            jdata: Decoded JSON mapping with a ``"result"`` list.
            entry_url: Not used by this implementation.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        for item in jdata["result"]:
            entry = Recipe()
            entry.detail_url = item["url"]
            entry.id = item["id"]
            # Drop the first and last character of the episode name.
            episode_name = item["identifierGroup"]["episodeName"]
            entry.cooking_name_sub = episode_name[1:-1]
            entry.program_name = self.program_name

            for event in item["broadcastEvent"]:
                if event["misc"]["releaseLevel"] != "original":
                    continue
                date_text = event["identifierGroup"]["date"]
                entry.program_date = dateutil.parser.parse(date_text).date()
                entry.id = "{}_{}".format(date_text, item["id"])
                break

            if not entry.program_date < datetime.date.today():
                logger.debug("{} is invalid date".format(entry.program_date))
                continue
            recipes[entry.id] = entry
        return recipes
Пример #15
0
 def _get_recipe_overviews(self, overview_soup, entry_url):
     """Create one recipe per anchor that wraps an image.

     The recipe id is the integer that follows the first ``=`` in the
     resolved detail URL. When the image file name contains six consecutive
     digits they are prefixed with "20" and parsed as the program date.

     Args:
         overview_soup: Parsed overview page.
         entry_url: URL of the overview page; relative links are resolved
             against it.

     Returns:
         dict mapping ``Recipe.id`` to ``Recipe``.

     Raises:
         TypeError: If a detail URL contains no ``=``.
         ValueError: If the text after ``=`` is not an integer.
     """
     recipes = dict()  # key: Recipe.id, value: Recipe
     links = [a for a in overview_soup.find_all("a") if a.img]
     for link in links:
         recipe = Recipe()
         recipe.detail_url = urllib.parse.urljoin(entry_url, link["href"])

         # str.partition replaces urllib.parse.splitvalue, an undocumented
         # helper deprecated since Python 3.8. Both split on the first "=".
         _, separator, id_text = recipe.detail_url.partition("=")
         if not separator:
             # Same exception type the splitvalue-based code raised here.
             raise TypeError("detail URL has no '=' to take an id from: "
                             "{!r}".format(recipe.detail_url))
         recipe.id = int(id_text)
         recipe.program_name = self.program_name
         recipes[recipe.id] = recipe

         m = re.match(r".*?(\d{6}).*", pathlib.Path(link.img["src"]).name)
         if m:
             yymmdd = m.group(1)
             logger.debug("program_date:{}".format(yymmdd))
             recipe.program_date = dateutil.parser.parse("20{}".format(yymmdd))
     return recipes
Пример #16
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Create one recipe per ``div.waku`` element that contains a link.

        The last run of digits in the final path segment of the detail URL
        becomes the integer id, and that id's string form is parsed into the
        program date.

        Args:
            overview_soup: Parsed overview page.
            entry_url: URL of the overview page; relative links are resolved
                against it.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe
        for box in overview_soup.find_all("div", "waku"):
            # Boxes without a link carry no recipe.
            if not box.a:
                continue

            entry = Recipe()
            entry.detail_url = urllib.parse.urljoin(entry_url, box.a["href"])
            id_match = re.search(r"/\D*(\d+)\D*$", entry.detail_url)
            entry.id = int(id_match.group(1))
            entry.program_name = self.program_name
            entry.program_date = dateutil.parser.parse(str(entry.id))
            recipes[entry.id] = entry

        return recipes
Пример #17
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Parse recipes from ``<section>`` groups separated by ``<hr>``.

        The first and last elements matched by ``section,hr`` are ignored.
        Within each ``<hr>``-delimited group the first section is the
        subtitle and every following section is one recipe. Lines after a
        ``【材料】`` marker are materials; lines after ``【作り方】`` are steps.

        Args:
            overview_soup: Parsed page.
            entry_url: URL of the page; stored as each recipe's detail URL
                and used to resolve image URLs.

        Returns:
            dict mapping ``Recipe.id`` to ``Recipe``. The id is ``YYYYMMDD``,
            with ``_<n>`` appended from the second recipe of a group on.
            Recipes whose parsed program date is not in the past are left
            out.
        """
        recipes = dict()  # key: Recipe.id, value: Recipe
        items = overview_soup.select("section,hr")[1:-1]

        subtitle_node = None
        title_node = None
        title_node_counter = 0
        for item in items:
            if item.name == "hr":
                # A rule ends the current group; reset per-group state.
                subtitle_node = None
                title_node = None
                title_node_counter = 0
                continue
            if subtitle_node is None:
                subtitle_node = item
                continue

            title_node = item
            title_node_counter += 1

            recipe = Recipe()
            recipe.detail_url = entry_url
            # Fall back to <p> when the section has no <h2> (2020.01.10).
            name_node = title_node.h2 if title_node.h2 else title_node.p
            recipe.cooking_name = name_node.text.translate(
                self.__class__._TABLE_REMOVE_KAKKO).strip()
            recipe.cooking_name_sub = subtitle_node.h2.text.strip()
            recipe.program_name = self.program_name
            # The date is built from the first two digit groups of the
            # subtitle, joined as "<first>/<second>".
            date_parts = re.search(r"(\d+)\D+(\d+)\D+",
                                   recipe.cooking_name_sub).groups()
            recipe.program_date = dateutil.parser.parse(
                "{}/{}".format(*date_parts))
            if title_node.img:
                recipe.image_urls.append(
                    urllib.parse.urljoin(entry_url, title_node.img["src"]))

            is_material_area = False
            is_recipe_step_area = False
            body_text = title_node.find("div",
                                        "option-media-row").get_text("\n")
            for line in body_text.splitlines():
                if not line.strip():
                    continue

                # A material marker seen after the steps have started is not
                # a section header; it falls through and is handled as a
                # regular line.
                if "【材料】" in line and not is_recipe_step_area:
                    is_material_area = True
                    heading = line.replace("【材料】", "").translate(
                        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                    if heading:
                        recipe.materials.append(RecipeText(heading))
                    continue
                if "【作り方】" in line:
                    is_material_area = False
                    is_recipe_step_area = True
                    continue

                if is_material_area:
                    # The line comes from splitlines() and is known to be
                    # non-blank, so it is exactly one material entry.
                    material = line.replace("… ", "…").replace("…", ": ")
                    if material.startswith("・"):
                        material = material[1:]
                    recipe.materials.append(RecipeText(material))
                elif is_recipe_step_area:
                    recipe.recipe_steps.append(RecipeText(line))

            if not recipe.program_date < datetime.datetime.now():
                logger.debug("{} is invalid date".format(recipe.program_date))
                continue

            recipe.id = "{:%Y%m%d}".format(recipe.program_date)
            if 1 < title_node_counter:
                recipe.id += "_{}".format(title_node_counter)

            recipes[recipe.id] = recipe

        return recipes
Пример #18
0
    def _get_recipe_overviews(self, overview_soup, entry_url):
        """Parse the recipes embedded in the ``<section>`` elements of a page.

        The first section is skipped, as are sections containing a table or
        an ``<h1>``, and sections without a ``<p>``. A section with ``<h2>``
        sets the current subtitle and clears the collected important points.
        A section with a paragraph but no image adds its lines to the
        important points. Any other section is one recipe: lines after
        ``◎材料`` are materials, lines after ``<作り方>`` are steps.

        Args:
            overview_soup: Parsed page.
            entry_url: URL of the page; stored as each recipe's detail URL
                and used to resolve the image URL.

        Returns:
            dict mapping ``Recipe.id`` (an MD5 hex digest of the recipe's
            name, prefixed by the subtitle when set) to ``Recipe``.
        """
        recipes = {}  # key: Recipe.id, value: Recipe

        subtitle = None
        pending_points = []
        for section in overview_soup.find_all("section")[1:]:
            if section.table:
                continue

            if section.h1:
                continue

            if section.h2:
                subtitle = section.h2.text.translate(
                    self.__class__._TABLE_REMOVE_KAKKO).strip()
                pending_points.clear()
                continue

            if section.p is None:
                continue

            entry = Recipe()
            entry.detail_url = entry_url
            entry.program_name = self.program_name
            entry.program_date = None

            if section.img is None:
                # Text-only section: keep its lines as important points.
                for point in section.p.get_text("\n").splitlines():
                    pending_points.append(RecipeText(point))
                continue

            if section.h3:
                # Several recipes share the subtitle.
                entry.cooking_name = section.h3.text
                entry.cooking_name_sub = subtitle
            else:
                # A single recipe is named by the subtitle itself.
                entry.cooking_name = subtitle

            entry.important_points.extend(pending_points)
            entry.image_urls.append(
                urllib.parse.urljoin(entry_url, section.img["src"]))

            in_materials = False
            in_steps = False
            for line in section.p.get_text("\n").splitlines():
                if not line.strip():
                    continue

                if "◎材料" in line:
                    in_materials = True
                    heading = line.replace("◎材料", "").translate(
                        self.__class__._TABLE_REPLACE_MARUKAKKO).strip()
                    if heading:
                        entry.materials.append(RecipeText(heading))
                    continue
                if "<作り方>" in line:
                    in_materials = False
                    in_steps = True
                    continue

                if in_materials:
                    # Rejoin counters split from their number by a space.
                    line = line.replace(" 本", "本").replace(" 個", "個")
                    entry.materials.extend(
                        [RecipeText(part.replace(":", ": "))
                         for part in line.split()])
                elif in_steps:
                    numbered = re.match(r"(\d+).\s*(.*)", line)
                    if numbered:
                        number_text, step_body = numbered.groups()
                        entry.recipe_steps.append(
                            RecipeText("({}){}".format(
                                int(number_text), step_body)))
                    else:
                        entry.recipe_steps.append(RecipeText(line))

            if entry.cooking_name_sub:
                name_key = "{}/{}".format(entry.cooking_name_sub,
                                          entry.cooking_name)
            else:
                name_key = entry.cooking_name
            entry.id = hashlib.md5(name_key.encode("utf-8")).hexdigest()
            recipes[entry.id] = entry

        return recipes