def parse_answer_pure(content):
    """Extract question, description, answer and vote count from a page.

    As a debugging aid, the decoded page is written to ``last.html`` in the
    current working directory on every call.

    Args:
        content: Raw page bytes. Decoded with ``bytes.decode()`` defaults
            (UTF-8, strict).

    Returns:
        A tuple ``(question, descript, answer, vote)``:
        ``question`` is the ``text`` of the first grandchild of the
        ``zh-question-title`` element; ``descript`` is the joined
        ``dom.c14n`` output of the children of the first child of
        ``zh-question-detail``; ``answer`` is the joined ``dom.c14n`` output
        of the children of the last ``div`` whose class list contains
        ``zm-editable-content``; ``vote`` is ``int(span.text)`` for the first
        element returned by ``get_list_by_attrib(..., 'class', 'count')``.

    Raises:
        Exception: If ``zh-question-answer-wrap`` has no children, or if no
            ``zm-editable-content`` div is found inside it.
    """
    # Decode once and reuse the text for both the dump and the parser.
    html = content.decode()

    # Explicit encoding: the locale default may be unable to represent the
    # characters that were just decoded from UTF-8.
    with open('last.html', 'w', encoding='utf-8') as f:
        f.write(html)

    doc = dom.html2dom(html)
    answerdom = doc.get_element_by_id('zh-question-answer-wrap')
    # Keep the explicit len() test; element truthiness is not guaranteed to
    # mean "has children" for DOM nodes.
    if len(answerdom) == 0:
        slog('warinng: no #zh-question-answer-wrap')
        raise Exception("no #zh-question-answer-wrap")

    answer = None
    for div in answerdom.iter('div'):
        classes = div.get('class')
        if classes is not None and 'zm-editable-content' in classes.split(' '):
            # No break: as before, the last matching div wins.
            answer = ''.join([dom.c14n(e) for e in div])
    if answer is None:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise Exception("no .zm-editable-content in #zh-question-answer-wrap")

    span = get_list_by_attrib(answerdom.iter('span'), 'class', 'count')[0]
    vote = int(span.text)

    q = doc.get_element_by_id('zh-question-title')
    try:
        a = q[0][0]
    except Exception:
        # Dump the unexpected title markup before propagating the error.
        print(dom.c14n(q))
        raise
    question = a.text

    descript = doc.get_element_by_id('zh-question-detail')
    descript = ''.join([dom.c14n(e) for e in descript[0]])

    return (question, descript, answer, vote)
def get_avatar_src(content):
    """Return the ``src`` attribute of the avatar image on a page.

    The page is parsed, the element with id ``zh-pm-page-wrap`` is looked up,
    and its ``img`` descendants are passed to ``get_list_by_attrib`` together
    with the avatar class string. The ``src`` of the first element returned
    is the result.

    Args:
        content: Raw page bytes, decoded with ``bytes.decode()`` defaults.

    Returns:
        The value of ``get('src')`` on the first selected image.
    """
    page = dom.html2dom(content.decode())
    container = page.get_element_by_id('zh-pm-page-wrap')
    avatar_class = 'zm-profile-header-img zg-avatar-big zm-avatar-editor-preview'
    candidates = get_list_by_attrib(container.iter('img'), 'class', avatar_class)
    # Indexing fails if nothing was selected; no fallback is provided.
    return candidates[0].get('src')
def get_username_list(content):
    """Collect ``/people/<name>`` links from a page.

    As a debugging aid, the decoded page is written to
    ``last_question.html`` in the current working directory on every call.

    Args:
        content: Raw page bytes. Decoded with ``bytes.decode()`` defaults
            (UTF-8, strict).

    Returns:
        A dict mapping the text captured after ``/people/`` in each ``a``
        element's ``href`` to that element's ``text``. The capture is greedy,
        so it includes everything following ``/people/`` in the href. When
        the same key occurs more than once, the last anchor wins.
    """
    # Decode once and reuse the text for both the dump and the parser.
    html = content.decode()

    # Explicit encoding: the locale default may be unable to represent the
    # characters that were just decoded from UTF-8.
    with open('last_question.html', 'w', encoding='utf-8') as f:
        f.write(html)

    doc = dom.html2dom(html)
    regex = re.compile('/people/(.+)')
    ret = {}
    for node in doc.root.iter('a'):
        href = node.get('href')
        if href is None:
            continue
        matches = regex.search(href)
        if matches is not None:
            ret[matches.group(1)] = node.text
    return ret
def get_answer_link_list(content):
    """Return the ``href`` values of the question links on a page.

    The ``a`` descendants of the element with id ``zh-profile-answer-list``
    are passed to ``get_list_by_attrib`` with ``'class'`` and
    ``'question_link'``; the ``href`` of every element it returns is
    collected in order.

    Args:
        content: Raw page bytes, decoded with ``bytes.decode()`` defaults.

    Returns:
        A list with one ``get('href')`` result per selected anchor.
    """
    page = dom.html2dom(content.decode())
    anchors = page.get_element_by_id('zh-profile-answer-list').iter('a')
    selected = get_list_by_attrib(anchors, 'class', 'question_link')
    return [anchor.get('href') for anchor in selected]