def build_node(k, v):
    """Build the tree node for a single key/value pair.

    Dict and list values yield a branch node: ``name`` and ``value`` are
    both the key, and ``children`` holds the converted contents.  Any other
    value yields a leaf whose ``name`` is ``"<key>: <normalized text>"`` and
    whose ``value`` is the normalized text of ``str(v)``.
    """
    if isinstance(v, dict):
        return {'name': k, 'value': k, 'children': get_children_dict(v)}
    if isinstance(v, list):
        return {'name': k, 'value': k, 'children': get_children_list(v)}

    # Scalar (or otherwise unrecognised) value: render it as text.
    label = k + ": " + normalize_string(str(v))
    return {'name': label, 'value': normalize_string(str(v))}
def instructions(self):
    """Return the recipe steps as a single newline-separated string.

    Looks up every ``li`` with class ``recipe__list-step`` in ``self.soup``
    and joins the normalized text of each item's ``span``.
    """
    step_items = self.soup.findAll("li", {"class": "recipe__list-step"})
    step_texts = []
    for step in step_items:
        step_texts.append(normalize_string(step.span.get_text()))
    return "\n".join(step_texts)
def get_children_list(children):
    """Convert the items of a list into tree nodes.

    Args:
        children: Iterable of parsed values (lists, dicts or scalars).

    Returns:
        A list of node dicts:

        * a nested list becomes one ``"unknown list"`` branch node whose
          ``children`` are its recursively converted items;
        * a dict contributes one node per key/value pair (built with
          ``build_node``), added directly to the result without a wrapper;
        * anything else becomes a leaf whose ``name`` and ``value`` are the
          normalized text of ``str(item)``.
    """
    kid_list = []
    for kid in children:
        # isinstance (not ``type(...) ==``) so list/dict subclasses are
        # expanded too, matching the checks done in build_node.
        if isinstance(kid, list):
            kid_list.append({
                'name': "unknown list",
                'value': "unknown list",
                'children': get_children_list(kid),
            })
        elif isinstance(kid, dict):
            kid_list.extend(build_node(k, v) for k, v in kid.items())
        else:
            text = normalize_string(str(kid))
            kid_list.append({'name': text, 'value': text})
    return kid_list
def get_recipe_from_source(text, url, space):
    """Extract recipe data and a browsable JSON tree from source text.

    ``text`` is URL-unquoted and first tried as a JSON document.  If that
    raises ``JSONDecodeError`` it is parsed as HTML instead, and every
    ``application/ld+json`` script and ``application/json`` element is
    collected after being passed through ``remove_graph``.

    Args:
        text: Raw source, either a JSON document or an HTML page.
        url: URL of the source.  When falsy it is replaced by the ``url``
            key of the parsed JSON / ld+json data, if one is present.
        space: Passed through unchanged to ``helper.get_from_scraper``.

    Returns:
        A 4-tuple ``(recipe_json, recipe_tree, html_data, images)``:

        * ``recipe_json`` - the result of ``helper.get_from_scraper``;
        * ``recipe_tree`` - one ``{'name': ..., 'children': [...]}`` entry
          per collected JSON object, named ``'ld+json'`` when its ``@type``
          is ``'Recipe'`` and ``'json'`` otherwise;
        * ``html_data`` - the result of ``get_from_html`` (``[]`` when the
          input was JSON);
        * ``images`` - the results of ``get_images_from_source`` (``[]``
          when the input was JSON).
    """

    def build_node(k, v):
        """Return the tree node for one key/value pair."""
        if isinstance(v, dict):
            return {'name': k, 'value': k, 'children': get_children_dict(v)}
        if isinstance(v, list):
            return {'name': k, 'value': k, 'children': get_children_list(v)}
        leaf = normalize_string(str(v))
        return {'name': k + ": " + leaf, 'value': leaf}

    def get_children_dict(children):
        """Return one node per key/value pair of a dict."""
        return [build_node(k, v) for k, v in children.items()]

    def get_children_list(children):
        """Return the nodes for the items of a list."""
        kid_list = []
        for kid in children:
            if isinstance(kid, list):
                kid_list.append({
                    'name': "unknown list",
                    'value': "unknown list",
                    'children': get_children_list(kid),
                })
            elif isinstance(kid, dict):
                # Dict items are added directly, without a wrapper node.
                kid_list.extend(build_node(k, v) for k, v in kid.items())
            else:
                leaf = normalize_string(str(kid))
                kid_list.append({'name': leaf, 'value': leaf})
        return kid_list

    def add_parsed(parsed):
        """Queue parsed data: lists are flattened one level, dicts kept."""
        if isinstance(parsed, list):
            parse_list.extend(parsed)
        elif isinstance(parsed, dict):
            parse_list.append(parsed)
        # Anything else (scalars, unparsed tags) carries no key/value data.

    recipe_tree = []
    parse_list = []
    html_data = []
    images = []

    text = unquote(text)

    try:
        parsed = remove_graph(json.loads(text))
        add_parsed(parsed)
        # Only a dict can carry a 'url' key; guarding on the type avoids
        # substring / membership false positives on strings and lists.
        if not url and isinstance(parsed, dict) and 'url' in parsed:
            url = parsed['url']
        scrape = text_scraper(
            "<script type='application/ld+json'>" + text + "</script>",
            url=url,
        )
    except JSONDecodeError:
        # Not a JSON document: treat the text as an HTML page.
        soup = BeautifulSoup(text, "html.parser")
        html_data = get_from_html(soup)
        images += get_images_from_source(soup, url)

        for el in soup.find_all('script', type='application/ld+json'):
            el = remove_graph(el)
            if not url and isinstance(el, dict) and 'url' in el:
                url = el['url']
            add_parsed(el)

        for el in soup.find_all(type='application/json'):
            add_parsed(remove_graph(el))

        scrape = text_scraper(text, url=url)

    recipe_json = helper.get_from_scraper(scrape, space)

    for el in parse_list:
        if isinstance(el, Tag):
            # TypeError: the tag has no single string to parse;
            # JSONDecodeError: its content is not valid JSON.
            try:
                el = json.loads(el.string)
            except (TypeError, JSONDecodeError):
                continue
        if not isinstance(el, dict):
            # Flattened list members may be scalars or lists; only objects
            # have key/value pairs to display.
            continue

        temp_tree = [build_node(k, v) for k, v in el.items()]
        tree_name = 'ld+json' if el.get('@type') == 'Recipe' else 'json'
        recipe_tree.append({'name': tree_name, 'children': temp_tree})

    return recipe_json, recipe_tree, html_data, images