def parse_question(self, content):
    """ Parse the question page """
    html = etree.HTML(content)
    question_tag = []
    question_title = question_text = follower_count = ''
    try:
        question_tag = [
            i.strip()
            for i in html.xpath('//a[@class="zm-item-tag"]/text()')
        ] or []
        question_title_xpath = html.xpath(
            '//h2[@class="zm-item-title"]/span/text()')
        question_text_xpath = html.xpath(
            '//div[@class="zm-editable-content"]/text()')
        follower_count_xpath = html.xpath(
            '//div[@class="zg-gray-normal"]/a/strong/text()')
        answer_count_xpath = html.xpath(
            '//h3[@id="zh-question-answer-num"]/text()')
        question_title = question_title_xpath[0] if question_title_xpath else ''
        question_text = question_text_xpath[0] if question_text_xpath else ''
        follower_count = follower_count_xpath[0] if follower_count_xpath else 0
        _answer_count = answer_count_xpath[0] if answer_count_xpath else ''
    except Exception as e:
        log.error(
            'question_parse_question: question_url={} except={}'.format(
                self.question_url, str(e)))
def insert_question_id(questions_id, topic_id):
    """ Store a question_id in the MySQL question table """
    in_sql = 'INSERT INTO question (question_id, topic_id) VALUES (%s, %s);'
    values = (questions_id, topic_id)
    try:
        MYSQLHANDLER.insert(in_sql, values)
    except Exception as e:
        log.error('main_main insert_question_id {}-{} except={}'.format(
            topic_id, questions_id, str(e)))
def redis_check(table, table_id):
    """ Check whether a question or answer id has already been stored """
    if table == 'question':
        return REDIS_CLI.sismember('zhihu_topic_question_id', str(table_id))
    elif table == 'answer':
        return REDIS_CLI.sismember('zhihu_topic_answer_id', str(table_id))
    else:
        log.error('main_redis_check illegal table')
        return False
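# A minimal usage sketch of the dedup check; the answer id is illustrative.
# It mirrors the pattern insert_answer_item uses further down: skip ids seen
# in a previous run, otherwise remember the new one in the Redis set.
def _example_dedup_check():
    if not redis_check('answer', 12345678):
        REDIS_CLI.sadd('zhihu_topic_answer_id', '12345678')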
def crawl_question(self):
    """ Fetch the question page by question_url """
    self.question_url = self.get_question_url()
    content = crawler.crawl(self.question_url)
    #with open('./data/questions/38589246.html', 'rb') as _r:
    #    content = _r.read()
    if not content:
        log.error('question_crawl_question: content is None')
        return
    self.webpage_save(content)
    return self.parse_question(content)
def update_question_item(questions_id, q_item, topic_id):
    """ Store the remaining question fields in the MySQL question table """
    up_sql = 'UPDATE question SET question_url=%s, question_title=%s, question_text=%s, follower_count=%s,\
        scan_count=%s, answer_count=%s, question_tag=%s WHERE question_id=%s AND topic_id=%s;'
    values = (q_item.get('question_url'), q_item.get('question_title'),
              q_item.get('question_text'), q_item.get('follower_count'),
              q_item.get('scan_count'), q_item.get('answer_count'),
              '|'.join(q_item.get('question_tag')), questions_id, topic_id)
    try:
        MYSQLHANDLER.insert(up_sql, values)
    except Exception as e:
        log.error('main_main update_question_item {}-{} except={}'.format(
            topic_id, questions_id, str(e)))
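# The question table is filled in two passes: insert_question_id records the
# bare (question_id, topic_id) pair, and update_question_item later fills in
# the crawled fields. A minimal sketch of that flow; the ids and field values
# below are illustrative, not real data.
def _example_store_question():
    q_item = {
        'question_url': 'https://www.zhihu.com/question/38589246',
        'question_title': '...', 'question_text': '...',
        'follower_count': 0, 'scan_count': 0, 'answer_count': 0,
        'question_tag': ['tag1', 'tag2'],  # joined with '|' before storage
    }
    insert_question_id(38589246, 19550517)
    update_question_item(38589246, q_item, 19550517)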
def get_topic_max_page(self):
    """ Find the total number of pages under this topic from its entry link """
    url = crawl_topic_url.format(self.topic_id)
    content = crawler.crawl(url)
    max_page = 0
    try:
        page_num = re.findall(r'\?page=(\d+)', content)
        if page_num:
            max_page = max(int(i) for i in page_num)
    except Exception as e:
        log.error('topic_topic: get_topic_max_page except={}'.format(e))
    return max_page
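# A tiny illustration of the pager regex used above; the HTML snippet is
# made up, only the '?page=N' pattern matters.
import re
_sample_pager = '<a href="?page=2">2</a> ... <a href="?page=48">48</a>'
assert max(int(i) for i in re.findall(r'\?page=(\d+)', _sample_pager)) == 48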
def parse_answer(self, content_json):
    """ Parse the answer data """
    for d in content_json.get('data'):
        try:
            self.answer_id = d.get('id') or ''
            if not self.answer_id:
                raise Exception
            self.author_name = d.get('author').get('name') or ''
            self.author_domain = d.get('author').get('url_token') or ''
            self.author_type = d.get('author').get('type') or ''
            self.author_headline = d.get('author').get('headline') or ''
            self.author_id = d.get('author').get('id') or ''
            self.content = d.get('content')
            self.voteup_count = d.get('voteup_count') or 0
            self.comment_count = d.get('comment_count') or 0
            self.answer_url = answer_url.format(self.question_id, self.answer_id)
            _answer_updated_time = d.get('updated_time') or 0
            _answer_create_time = d.get('created_time') or 0
            if _answer_updated_time:
                self.answer_updated_time = datetime.datetime.fromtimestamp(
                    int(_answer_updated_time)).strftime('%Y-%m-%d %H:%M:%S')
            if _answer_create_time:
                self.answer_create_time = datetime.datetime.fromtimestamp(
                    int(_answer_create_time)).strftime('%Y-%m-%d %H:%M:%S')
            _answer = {
                'question_id': self.question_id,
                'answer_id': self.answer_id,
                'answer_url': self.answer_url,
                'author_name': self.author_name,
                'author_domain': self.author_domain,
                'author_type': self.author_type,
                'author_headline': self.author_headline,
                'author_id': self.author_id,
                'content': self.content,
                'answer_updated_time': self.answer_updated_time,
                'answer_create_time': self.answer_create_time,
                'voteup_count': self.voteup_count,
                'comment_count': self.comment_count
            }
            self.answers.append(_answer)
        except Exception as e:
            log.error(
                'answer_parse_anwser: question_id={} except={}'.format(
                    self.question_id, str(e)))
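# Shape of one element of content_json['data'] as read by parse_answer above;
# the field list is inferred from the .get() calls, the values are illustrative.
_sample_answer = {
    'id': 111222333,
    'content': '<p>answer body html</p>',
    'voteup_count': 10,
    'comment_count': 2,
    'updated_time': 1489737600,  # unix timestamps, formatted with
    'created_time': 1489651200,  # datetime.fromtimestamp() above
    'author': {
        'name': 'some user',
        'url_token': 'some-user',
        'type': 'people',
        'headline': '...',
        'id': 'abcdef1234567890',
    },
}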
def crawl_question():
    """ Crawl questions """
    ## Fetch topics
    topics = get_topic_mysql('crawled')
    for topic_id in topics:
        saved_questions_id = get_exists_question_id(topic_id, q_type='empty')
        for q_id in saved_questions_id:
            question_item = Question.from_question(q_id, topic_id).crawl_question()
            if not question_item:
                log.error('main_crawl_question: question_item is None question_id={}'.format(q_id))
                break
            update_question_item(q_id, question_item, topic_id)
            time.sleep(0.2)
def get_exists_question_id(topic_id, q_type='all'):
    """ Look up the question_ids already saved in MySQL """
    questions = list()
    if q_type == 'all':
        se_sql = 'SELECT question_id FROM question WHERE topic_id=%s;'
    elif q_type == 'normal':
        se_sql = 'SELECT question_id FROM question WHERE topic_id=%s AND question_title IS NOT NULL;'
    elif q_type == 'empty':
        se_sql = 'SELECT question_id FROM question WHERE topic_id=%s AND question_title IS NULL;'
    else:
        log.error('main_get_exists_question_id: qType = {}'.format(q_type))
        return questions
    values = (topic_id, )
    infos = MYSQLHANDLER.select(se_sql, values)
    for info in infos:
        questions.append(info.get('question_id'))
    return list(set(questions))
def insert_answer_item(a_items):
    """ Store answer items in the MySQL answer table """
    in_sql = 'INSERT INTO answer (answer_id, answer_url, question_id, author_name, author_domain, author_type, author_headline,\
        author_id, content, answer_updated_time, answer_create_time, voteup_count, comment_count)\
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=NOW();'
    for a_item in a_items:
        ## Skip answers that already exist
        if redis_check('answer', a_item.get('answer_id')):
            continue
        values = (a_item.get('answer_id'), a_item.get('answer_url'),
                  a_item.get('question_id'), a_item.get('author_name'),
                  a_item.get('author_domain'), a_item.get('author_type'),
                  a_item.get('author_headline'), a_item.get('author_id'),
                  a_item.get('content'), a_item.get('answer_updated_time'),
                  a_item.get('answer_create_time'), a_item.get('voteup_count'),
                  a_item.get('comment_count'))
        try:
            MYSQLHANDLER.insert(in_sql, values)
        except Exception as e:
            log.error('main_main insert_answer_item {}-{} except={}'.format(
                a_item.get('question_id'), a_item.get('answer_id'), str(e)))
        REDIS_CLI.sadd('zhihu_topic_answer_id', str(a_item.get('answer_id')))
def call_questionAPI(self):
    """ Fetch answer data from the question API """
    while True:
        url_api = self.get_questionAPI_url()
        #print url_api
        content = crawler.crawl(url_api)
        time.sleep(0.2)
        if not content:
            log.error('answer_call_questionAPI: url_api={}'.format(url_api))
            break
        content_json = None
        try:
            content_json = json.loads(content)
        except Exception as e:
            log.error('answer_call_questionAPI: except={} url_api={}'.format(
                str(e), url_api))
        if not content_json or not content_json.get('data') or len(
                content_json.get('data')) == 0:
            break
        self.parse_answer(content_json)
import os
import sys

import requests

reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append(os.path.abspath('..'))

from util._log import log
from util import common
from util import headers
from util import cookie
""" This module downloads pages """


def crawl(url):
    """ Return the downloaded page """
    session = cookie.get_cookie()
    try:
        resp = session.get(url, headers=headers.get_headers())
    except Exception as e:
        log.error('util_crawler: crawl {} except={}'.format(url, str(e)))
        return
    if resp.status_code != 200:
        log.error('util_crawler: resp.status_code={}'.format(resp.status_code))
        return
    resp.encoding = 'utf-8'
    return resp.text
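# A minimal usage sketch; the URL is illustrative. Callers treat a None
# return (request error or non-200 status) as a failed download.
if __name__ == '__main__':
    page = crawl('https://www.zhihu.com/question/38589246')
    if page:
        print page[:200]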