Python normalize_string示例，recipe_scrapers._utils.normalize_string Python示例

示例#1

0

显示文件

文件： recipe_html_import.py 项目： bmendric/recipes

 def build_node(k, v):
     """Build one tree node for the key/value pair ``k``/``v``.

     Dict and list values become branch nodes whose ``children`` come from
     ``get_children_dict`` / ``get_children_list``. Any other value becomes a
     leaf whose text is ``normalize_string(str(v))``.
     """
     # Containers: the key alone labels the node, the content goes below it.
     if isinstance(v, dict):
         return {'name': k, 'value': k, 'children': get_children_dict(v)}
     if isinstance(v, list):
         return {'name': k, 'value': k, 'children': get_children_list(v)}

     # Scalars: show "key: value" as the label and keep the bare value.
     return {
         'name': k + ": " + normalize_string(str(v)),
         'value': normalize_string(str(v)),
     }

示例#2

0

显示文件

    def instructions(self):
        """Return the recipe steps joined into one newline-separated string.

        Every ``<li class="recipe__list-step">`` element found in
        ``self.soup`` contributes the normalized text of its ``span`` child.
        """
        step_items = self.soup.findAll("li", {"class": "recipe__list-step"})

        cleaned_steps = []
        for step in step_items:
            cleaned_steps.append(normalize_string(step.span.get_text()))
        return "\n".join(cleaned_steps)

示例#3

0

显示文件

文件： recipe_html_import.py 项目： bmendric/recipes

 def get_children_list(children):
     """Convert the items of a list into tree nodes.

     Args:
         children: Iterable of list items.

     Returns:
         A list of node dicts. A nested list becomes an ``"unknown list"``
         branch node built recursively, a dict contributes one node per
         key/value pair (via ``build_node``), and any other item becomes a
         leaf holding ``normalize_string(str(item))``.
     """
     kid_list = []
     for kid in children:
         # isinstance (not an exact type comparison) so that list/dict
         # subclasses are treated as containers, matching build_node.
         if isinstance(kid, list):
             kid_list.append({
                 'name': "unknown list",
                 'value': "unknown list",
                 'children': get_children_list(kid)
             })
         elif isinstance(kid, dict):
             # A dict inside a list has no key of its own, so its entries
             # are added directly at this level.
             kid_list.extend(build_node(k, v) for k, v in kid.items())
         else:
             kid_list.append({
                 'name': normalize_string(str(kid)),
                 'value': normalize_string(str(kid))
             })
     return kid_list

示例#4

0

显示文件

文件： recipe_html_import.py 项目： bmendric/recipes

def get_recipe_from_source(text, url, space):
    """Extract recipe data and a browsable JSON tree from pasted source text.

    ``text`` is URL-unquoted and first tried as a raw JSON document. If it is
    not valid JSON it is parsed as HTML instead, and every
    ``application/ld+json`` script and ``application/json`` element found in
    it is collected.

    Args:
        text: Pasted source, either a JSON document or HTML markup.
        url: Source URL handed to ``text_scraper``. When falsy it is replaced
            by the ``url`` entry of the parsed data, if one is found.
        space: Passed through unchanged to ``helper.get_from_scraper``.

    Returns:
        A 4-tuple ``(recipe_json, recipe_tree, html_data, images)``:

        * ``recipe_json`` -- result of ``helper.get_from_scraper``.
        * ``recipe_tree`` -- one ``{'name': ..., 'children': [...]}`` entry
          per collected JSON object; the name is ``'ld+json'`` when the
          object's ``@type`` is ``'Recipe'`` and ``'json'`` otherwise.
        * ``html_data`` -- result of ``get_from_html`` (empty list when the
          input was raw JSON).
        * ``images`` -- results of ``get_images_from_source`` (empty list
          when the input was raw JSON).
    """
    def build_node(k, v):
        """Build one tree node for the key/value pair ``k``/``v``."""
        if isinstance(v, dict):
            node = {'name': k, 'value': k, 'children': get_children_dict(v)}
        elif isinstance(v, list):
            node = {'name': k, 'value': k, 'children': get_children_list(v)}
        else:
            node = {
                'name': k + ": " + normalize_string(str(v)),
                'value': normalize_string(str(v))
            }
        return node

    def get_children_dict(children):
        """Return one node per key/value pair of ``children``."""
        return [build_node(k, v) for k, v in children.items()]

    def get_children_list(children):
        """Return nodes for the items of a list, recursing into containers."""
        kid_list = []
        for kid in children:
            if isinstance(kid, list):
                kid_list.append({
                    'name': "unknown list",
                    'value': "unknown list",
                    'children': get_children_list(kid)
                })
            elif isinstance(kid, dict):
                kid_list.extend(build_node(k, v) for k, v in kid.items())
            else:
                kid_list.append({
                    'name': normalize_string(str(kid)),
                    'value': normalize_string(str(kid))
                })
        return kid_list

    def collect(parsed):
        """Queue a parsed value for tree building, flattening one list level.

        Values that are neither a list nor a dict are ignored.
        """
        if isinstance(parsed, list):
            parse_list.extend(parsed)
        elif isinstance(parsed, dict):
            parse_list.append(parsed)

    recipe_tree = []
    parse_list = []
    html_data = []
    images = []
    text = unquote(text)

    try:
        # Raw JSON input. A top-level array is flattened exactly like the
        # arrays found inside HTML below; appending it whole used to crash
        # later when the tree builder called ``.items()`` on the list.
        collect(remove_graph(json.loads(text)))
        if (not url and parse_list and isinstance(parse_list[0], dict)
                and 'url' in parse_list[0]):
            url = parse_list[0]['url']
        scrape = text_scraper("<script type='application/ld+json'>" + text +
                              "</script>",
                              url=url)

    except JSONDecodeError:
        # Not JSON: treat the text as HTML and harvest its embedded JSON.
        soup = BeautifulSoup(text, "html.parser")
        html_data = get_from_html(soup)
        images += get_images_from_source(soup, url)
        for el in soup.find_all('script', type='application/ld+json'):
            el = remove_graph(el)
            if not url and 'url' in el:
                url = el['url']
            collect(el)
        for el in soup.find_all(type='application/json'):
            collect(remove_graph(el))
        scrape = text_scraper(text, url=url)

    recipe_json = helper.get_from_scraper(scrape, space)

    for el in parse_list:
        if isinstance(el, Tag):
            try:
                el = json.loads(el.string)
            except (TypeError, JSONDecodeError):
                # No string content or undecodable content: skip this tag.
                continue

        if not isinstance(el, dict):
            # Only JSON objects can be rendered as a key/value tree.
            continue

        temp_tree = [build_node(k, v) for k, v in el.items()]
        tree_name = 'ld+json' if el.get('@type') == 'Recipe' else 'json'
        recipe_tree.append({'name': tree_name, 'children': temp_tree})

    return recipe_json, recipe_tree, html_data, images