Python html2dom示例

编程语言: Python

命名空间/包名称: dom

方法/功能: html2dom

hotexamples.com的示例: 4

Python html2dom - 共找到 4 个示例。以下是从开源项目中提取的、评价最高的 dom.html2dom 真实 Python 代码示例。您可以为示例评分，帮助我们提高示例质量。

示例#1

显示文件

文件： zhihu.py 项目： linzelong/zhihu-archive

def parse_answer_pure(content):
    """Extract the question title, description, answer body and vote count.

    Args:
        content: Raw page bytes. Decoded with the ``bytes.decode()`` default
            (UTF-8) before being parsed by ``dom.html2dom``.

    Returns:
        A ``(question, descript, answer, vote)`` tuple where
        ``question`` is the ``text`` of the first grandchild of
        ``#zh-question-title``, ``descript`` is the serialized children of the
        first child of ``#zh-question-detail``, ``answer`` is the serialized
        children of the last ``div`` carrying the ``zm-editable-content``
        class inside ``#zh-question-answer-wrap``, and ``vote`` is the integer
        parsed from the first ``span`` selected with class ``count``.

    Raises:
        Exception: If ``#zh-question-answer-wrap`` has length 0, or if no
            ``zm-editable-content`` div is found inside it.
    """
    # Decode once and reuse the text for both the debug dump and the parser.
    html = content.decode()

    # Debug snapshot of the last parsed page. The encoding is explicit so the
    # dump does not depend on the platform's locale default, which may be
    # unable to represent the page's characters.
    with open('last.html', 'w', encoding='utf-8') as f:
        f.write(html)

    doc = dom.html2dom(html)
    answerdom = doc.get_element_by_id('zh-question-answer-wrap')
    # NOTE(review): len() is kept (rather than truthiness) because the type
    # returned by get_element_by_id is not visible here -- confirm.
    if len(answerdom) == 0:
        slog('warinng: no #zh-question-answer-wrap')
        raise Exception("no #zh-question-answer-wrap")

    answer = None
    for div in answerdom.iter('div'):
        classes = div.get('class')
        if classes is None:
            continue
        # split() without arguments tolerates any run of whitespace between
        # class names.
        if 'zm-editable-content' in classes.split():
            # If several divs match, the last one wins (as before).
            answer = ''.join([dom.c14n(e) for e in div])
    if answer is None:
        # Previously this case surfaced as an UnboundLocalError at the return
        # statement; fail with a message that names the missing element.
        raise Exception("no .zm-editable-content in #zh-question-answer-wrap")

    # presumably get_list_by_attrib keeps the elements whose attribute equals
    # the given value -- verify against its definition.
    span = get_list_by_attrib(answerdom.iter('span'), 'class', 'count')[0]
    vote = int(span.text)

    q = doc.get_element_by_id('zh-question-title')
    try:
        a = q[0][0]
    except Exception:
        # Dump the title node to help diagnose unexpected markup, then
        # propagate the original error with its traceback intact.
        print(dom.c14n(q))
        raise
    question = a.text

    descript = doc.get_element_by_id('zh-question-detail')
    descript = ''.join([dom.c14n(e) for e in descript[0]])

    return (question, descript, answer, vote)

示例#2

显示文件

文件： zhihu.py 项目： linzelong/zhihu-archive

def get_avatar_src(content):
    """Return the ``src`` attribute of the profile avatar image.

    Args:
        content: Raw page bytes, decoded with the ``bytes.decode()`` default
            before parsing.

    Returns:
        The ``src`` attribute of the first ``img`` inside ``#zh-pm-page-wrap``
        that ``get_list_by_attrib`` selects for the avatar class string.
    """
    avatar_class = 'zm-profile-header-img zg-avatar-big zm-avatar-editor-preview'

    page = dom.html2dom(content.decode())
    container = page.get_element_by_id('zh-pm-page-wrap')
    # presumably get_list_by_attrib keeps the elements whose attribute equals
    # the given value -- verify against its definition.
    candidates = get_list_by_attrib(container.iter('img'), 'class', avatar_class)
    # Indexing raises if nothing was selected, exactly as before.
    return candidates[0].get('src')

示例#3

显示文件

文件： zhihu.py 项目： linzelong/zhihu-archive

def get_username_list(content):
    """Collect the usernames referenced by ``/people/...`` links on a page.

    Args:
        content: Raw page bytes. Decoded with the ``bytes.decode()`` default
            (UTF-8) before being parsed by ``dom.html2dom``.

    Returns:
        A dict mapping the text captured after ``/people/`` in each matching
        ``href`` to that anchor's ``text``. When the same username appears
        more than once, the last anchor's text is kept.
    """
    # Decode once and reuse the text for both the debug dump and the parser.
    html = content.decode()

    # Debug snapshot of the last parsed page. The encoding is explicit so the
    # dump does not depend on the platform's locale default, which may be
    # unable to represent the page's characters.
    with open('last_question.html', 'w', encoding='utf-8') as f:
        f.write(html)

    doc = dom.html2dom(html)
    regex = re.compile('/people/(.+)')
    ret = {}
    for node in doc.root.iter('a'):
        href = node.get('href')
        if href is None:
            # Anchor without an href cannot point at a profile.
            continue
        matches = regex.search(href)
        if matches is not None:
            ret[matches.group(1)] = node.text
    return ret

示例#4

显示文件

文件： zhihu.py 项目： linzelong/zhihu-archive

def get_answer_link_list(content):
    """Return the ``href`` of every question link in the profile answer list.

    Args:
        content: Raw page bytes, decoded with the ``bytes.decode()`` default
            before parsing.

    Returns:
        A list with the ``href`` attribute of each ``a`` element inside
        ``#zh-profile-answer-list`` that ``get_list_by_attrib`` selects for
        class ``question_link`` (``None`` entries for anchors lacking one).
    """
    page = dom.html2dom(content.decode())
    answer_list = page.get_element_by_id('zh-profile-answer-list')
    anchors = answer_list.iter('a')
    # presumably get_list_by_attrib keeps the elements whose attribute equals
    # the given value -- verify against its definition.
    links = get_list_by_attrib(anchors, 'class', 'question_link')
    return [link.get('href') for link in links]