示例#1
0
def parse_answer_pure(content):
    """Extract question title, description, answer body and vote count.

    The decoded page is also dumped to ``last.html`` in the current working
    directory (presumably as a debugging aid -- the code never reads it back).

    Args:
        content: Raw page body; must provide ``decode()`` (e.g. ``bytes``).
            It is decoded with the default codec (UTF-8).

    Returns:
        A tuple ``(question, descript, answer, vote)`` where ``question`` is
        the text of the first grandchild of ``#zh-question-title``,
        ``descript`` and ``answer`` are concatenated ``dom.c14n``
        serializations of child nodes, and ``vote`` is an ``int``.

    Raises:
        Exception: If ``#zh-question-answer-wrap`` is empty, or if it holds
            no ``div`` carrying the ``zm-editable-content`` class.
    """
    # Decode once and reuse the text for both the dump and the parser.
    html = content.decode()

    # Write the dump as UTF-8 explicitly: the locale default encoding may be
    # unable to represent characters that were just decoded from UTF-8.
    with open('last.html', 'w', encoding='utf-8') as f:
        f.write(html)

    doc = dom.html2dom(html)
    answerdom = doc.get_element_by_id('zh-question-answer-wrap')
    if len(answerdom) == 0:
        slog('warinng: no #zh-question-answer-wrap')
        raise Exception("no #zh-question-answer-wrap")

    # When several divs match, the last one wins (same as before).
    answer = None
    for div in answerdom.iter('div'):
        classes = div.get('class')
        if classes is not None and 'zm-editable-content' in classes.split(' '):
            answer = ''.join(dom.c14n(e) for e in div)
    if answer is None:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise Exception("no .zm-editable-content in #zh-question-answer-wrap")

    # NOTE(review): presumably get_list_by_attrib returns the spans whose
    # class attribute is 'count' -- confirm against its definition.
    span = get_list_by_attrib(answerdom.iter('span'), 'class', 'count')[0]
    vote = int(span.text)

    q = doc.get_element_by_id('zh-question-title')
    try:
        a = q[0][0]
    except Exception:
        # Print the unexpected title markup, then re-raise unchanged.
        print(dom.c14n(q))
        raise
    question = a.text

    descript = doc.get_element_by_id('zh-question-detail')
    descript = ''.join(dom.c14n(e) for e in descript[0])

    return (question, descript, answer, vote)
示例#2
0
def get_avatar_src(content):
    """Return the ``src`` attribute of the profile avatar image.

    ``content`` is decoded and parsed, then the ``img`` elements under
    ``#zh-pm-page-wrap`` are filtered through ``get_list_by_attrib`` using
    the avatar class string; the first result's ``src`` is returned.
    An empty filter result makes the ``[0]`` lookup fail.
    """
    avatar_class = 'zm-profile-header-img zg-avatar-big zm-avatar-editor-preview'
    page = dom.html2dom(content.decode())
    container = page.get_element_by_id('zh-pm-page-wrap')
    candidates = get_list_by_attrib(container.iter('img'), 'class', avatar_class)
    return candidates[0].get('src')
示例#3
0
def get_username_list(content):
    """Map user names found in ``/people/...`` links to their link text.

    The decoded page is also dumped to ``last_question.html`` in the current
    working directory (presumably as a debugging aid -- the code never reads
    it back).

    Args:
        content: Raw page body; must provide ``decode()`` (e.g. ``bytes``).
            It is decoded with the default codec (UTF-8).

    Returns:
        A dict whose keys are everything following ``/people/`` in an
        anchor's ``href`` and whose values are that anchor's ``text``.
        When the same key occurs more than once, the last anchor wins.
    """
    # Decode once and reuse the text for both the dump and the parser.
    html = content.decode()

    # Write the dump as UTF-8 explicitly: the locale default encoding may be
    # unable to represent characters that were just decoded from UTF-8.
    with open('last_question.html', 'w', encoding='utf-8') as f:
        f.write(html)

    doc = dom.html2dom(html)
    ret = {}
    regex = re.compile('/people/(.+)')
    for node in doc.root.iter('a'):
        href = node.get('href')
        if href is None:
            continue
        matches = regex.search(href)
        if matches is not None:
            ret[matches.group(1)] = node.text
    return ret
示例#4
0
def get_answer_link_list(content):
    """Return the ``href`` of each question link in the profile answer list.

    ``content`` is decoded and parsed, then the ``a`` elements under
    ``#zh-profile-answer-list`` are filtered through ``get_list_by_attrib``
    with class ``question_link``; one ``href`` is returned per result.
    """
    page = dom.html2dom(content.decode())
    answer_section = page.get_element_by_id('zh-profile-answer-list')
    anchors = get_list_by_attrib(
        answer_section.iter('a'), 'class', 'question_link'
    )
    return [anchor.get('href') for anchor in anchors]