def _extract_review_fields(html_new):
    """Parse one de-obfuscated review page into its record fields.

    Args:
        html_new: Review page HTML as returned by ``get_html_full_review``.

    Returns:
        A tuple ``(userid, restid, rating, rating_env, rating_flavor,
        rating_service, timestamp, comment)``, where the ratings are floats
        and ``timestamp`` is the ``time.mktime`` value (epoch seconds) of the
        review time. Returns ``None`` when the page lacks any of the
        elements or sub-scores needed to build a complete record.
    """
    if 'span class="score"' not in html_new:
        return None
    tree = HTMLParser(html_new)

    # The sub-scores are read from the second "score" span on the page.
    score_nodes = tree.css("span[class='score']")
    if len(score_nodes) < 2:
        return None
    score_html = score_nodes[1].html.strip()
    if '口味' not in score_html:
        return None

    # Collect the sub-scores fresh for every page so that a missing item can
    # never fall back to a value left over from a previously parsed review.
    scores = {}
    for node in HTMLParser(score_html).css("span[class='item']"):
        text = node.text()
        for label in ('环境', '服务', '口味'):
            if label in text:
                scores[label] = ''.join(
                    ch for ch in text if ch.isdigit() or ch == '.')
                break
    try:
        rating_env = float(scores['环境'])
        rating_service = float(scores['服务'])
        rating_flavor = float(scores['口味'])
    except (KeyError, ValueError):
        # A sub-score is absent or contains no parsable number.
        return None
    rating = sum([rating_env, rating_service, rating_flavor]) / 3

    nav_nodes = tree.css("div[class='review-detail-nav']")
    word_nodes = tree.css("div[class='review-words']")
    time_nodes = tree.css("span[class='time']")
    if not (nav_nodes and word_nodes and time_nodes):
        return None

    nav = HTMLParser(nav_nodes[0].html)
    spans = nav.css("span")
    links = nav.css("a")
    if not (spans and links):
        return None
    userid = spans[0].text().strip()
    restid = links[-1].text().strip()
    comment = word_nodes[0].text().strip().replace('\n', '')

    stamp = time_nodes[0].text().strip()
    if '更新于' in stamp:
        # Keep the part after the "updated at" marker; strip it so leading
        # whitespace cannot break strptime.
        stamp = stamp.split('更新于')[1].strip()
    try:
        parsed = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M")
    except ValueError:
        return None
    timestamp = time.mktime(parsed.timetuple())

    return (userid, restid, rating, rating_env, rating_flavor,
            rating_service, timestamp, comment)


def get_ratings(initial_num):
    """Crawl Dianping review pages and collect their ratings.

    Starting from ``initial_num``, the review id is decreased on every
    attempt (by the loop index plus a random 0-4) and the matching review
    page is fetched. Each successfully parsed review is appended to the
    result table and written as a ``' | '``-separated line to
    ``./dazhong.txt``. The crawl stops after 1000 attempts, after 300
    successful reviews, or as soon as the verification page is served.

    Args:
        initial_num: Review id to start walking down from.

    Returns:
        The result table: a header row followed by one row per parsed
        review.
    """
    results = [[
        'id', 'userid', 'restid', 'rating', 'rating_env', 'rating_flavor',
        'rating_service', 'timestamp', 'comment', 'url'
    ]]
    success_count = 0
    for i in range(1000):
        initial_num = initial_num - i - int(5 * random.random())
        url = "http://www.dianping.com/review/{}".format(initial_num)
        r = get_html(url, headers)
        print('url', url)

        html = None
        if r.status_code == 200:
            if '页面无法访问' in r.text:
                # "Page cannot be accessed": back off, then try the next id.
                time.sleep(10 + 15 * random.random())
                continue
            html = r.content.decode()
            if "<div class='logo' id='logo'>验证中心</div>" in html:
                # The verification page is being served; stop crawling.
                print('------需要验证呀,我先退出------')
                break

        if html:
            css_content = get_css_content(html, headers, r_sessoion)
            if css_content == 'error':
                time.sleep(10 + 15 * random.random())
                continue
            font_dic, y_list = get_font_dic(css_content)
            html_new = get_html_full_review(html, css_content, font_dic,
                                            y_list)
            fields = _extract_review_fields(html_new)
            if fields is not None:
                (userid, restid, rating, rating_env, rating_flavor,
                 rating_service, timestamp, comment) = fields
                results.append([
                    success_count, userid, restid, rating, rating_env,
                    rating_flavor, rating_service, timestamp, comment, url
                ])
                # The id written to the file is offset by 212 relative to
                # the in-memory row id.
                mystr = ' | '.join(
                    str(x) for x in [
                        success_count + 212, userid, restid, rating,
                        rating_env, rating_flavor, rating_service,
                        timestamp, comment, url
                    ])
                with open('./dazhong.txt', 'a', encoding='utf8') as the_file:
                    the_file.write(mystr + '\n')
                success_count += 1
                print(results)
                if success_count == 300:
                    break

        # Random 10-25 s pause between requests.
        time.sleep(10 + 15 * random.random())
    return results
def _parse_review_for_db(html_new):
    """Parse one de-obfuscated review page into the fields Process stores.

    Args:
        html_new: Review page HTML as returned by
            ``DianPing.get_html_full_review``.

    Returns:
        A tuple ``(userid, restid, rating, rating_env, rating_flavor,
        rating_service, timestamp, comment)``, where the ratings are floats
        and ``timestamp`` is the ``time.mktime`` value of the review time
        multiplied by 1000 (milliseconds, as a float). Returns ``None`` when
        the page lacks any of the elements or sub-scores needed to build a
        complete record.
    """
    if 'span class="score"' not in html_new:
        return None
    tree = HTMLParser(html_new)

    # The sub-scores are read from the second "score" span on the page.
    score_nodes = tree.css("span[class='score']")
    if len(score_nodes) < 2:
        return None
    score_html = score_nodes[1].html.strip()
    if '口味' not in score_html:
        return None

    # Collect the sub-scores fresh for every page so that a missing item can
    # never fall back to a value left over from a previously parsed review.
    scores = {}
    for node in HTMLParser(score_html).css("span[class='item']"):
        text = node.text()
        for label in ('环境', '服务', '口味'):
            if label in text:
                scores[label] = ''.join(
                    ch for ch in text if ch.isdigit() or ch == '.')
                break
    try:
        rating_env = float(scores['环境'])
        rating_service = float(scores['服务'])
        rating_flavor = float(scores['口味'])
    except (KeyError, ValueError):
        # A sub-score is absent or contains no parsable number.
        return None
    rating = sum([rating_env, rating_service, rating_flavor]) / 3

    nav_nodes = tree.css("div[class='review-detail-nav']")
    word_nodes = tree.css("div[class='review-words']")
    time_nodes = tree.css("span[class='time']")
    if not (nav_nodes and word_nodes and time_nodes):
        return None

    nav = HTMLParser(nav_nodes[0].html)
    spans = nav.css("span")
    links = nav.css("a")
    if not (spans and links):
        return None
    userid = spans[0].text().strip()
    restid = links[-1].text().strip()
    comment = word_nodes[0].text().strip().replace('\n', '')

    stamp = time_nodes[0].text().strip()
    if '更新于' in stamp:
        # Keep the part after the "updated at" marker; strip it so leading
        # whitespace cannot break strptime.
        stamp = stamp.split('更新于')[1].strip()
    try:
        parsed = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M")
    except ValueError:
        return None
    timestamp = time.mktime(parsed.timetuple()) * 1000

    return (userid, restid, rating, rating_env, rating_flavor,
            rating_service, timestamp, comment)


def Process():
    """Crawl a small batch of Dianping reviews and store them in the database.

    Starts from the id returned by ``DianPing.get_initial_num()`` and walks
    upward one review id per attempt. Each successfully parsed review is
    appended to the result table, written as a ``' | '``-separated line to
    ``./dazhong.txt``, and inserted through ``Rating(...).insert()``; the id
    is then passed to ``DianPing.update_db``. The crawl stops after 1000
    attempts, after 3 stored reviews, or as soon as the verification page
    is served.

    Returns:
        The result table: a header row followed by one row per stored
        review.
    """
    initial_num = DianPing.get_initial_num()
    results = [['id', 'userid', 'restid', 'rating', 'rating_env',
                'rating_flavor', 'rating_service', 'timestamp', 'comment',
                'url']]
    success_count = 0
    for i in range(1000):
        initial_num += 1
        url = "http://www.dianping.com/review/{}".format(initial_num)
        r = DianPing.get_html(url, DianPing.headers)
        print('url: ', url)
        print('r.status_code: ', r.status_code)

        html = None
        if r.status_code == 200:
            if '页面无法访问' in r.text:
                # "Page cannot be accessed": back off, then try the next id.
                print('页面无法访问')
                time.sleep(10 + 15 * random.random())
                continue
            html = r.content.decode()
            if "<div class='logo' id='logo'>验证中心</div>" in html:
                # The verification page is being served; stop crawling.
                print('------需要验证呀,我先退出------')
                break
        print('html: ', html)

        if html:
            css_content = DianPing.get_css_content(
                html, DianPing.headers, DianPing.r_sessoion)
            if css_content == 'error':
                print('css_content: error')
                time.sleep(10 + 15 * random.random())
                continue
            font_dic, y_list = DianPing.get_font_dic(css_content)
            html_new = DianPing.get_html_full_review(
                html, css_content, font_dic, y_list)
            fields = _parse_review_for_db(html_new)
            if fields is not None:
                (userid, restid, rating, rating_env, rating_flavor,
                 rating_service, timestamp, comment) = fields
                results.append([success_count, userid, restid, rating,
                                rating_env, rating_flavor, rating_service,
                                timestamp, comment, url])
                # The id written to the file is offset by 212 relative to
                # the in-memory row id.
                mystr = ' | '.join(str(x) for x in [
                    success_count + 212, userid, restid, rating, rating_env,
                    rating_flavor, rating_service, timestamp, comment, url])
                with open('./dazhong.txt', 'a', encoding='utf8') as the_file:
                    the_file.write(mystr + '\n')

                print('------开始在数据库写数据------')
                # Positional Rating arguments, in order: review id, user id
                # (left empty), user name, an empty string, restaurant,
                # star rating, comment, url, integer timestamp, source label.
                record = Rating(initial_num, '', userid, '', restid, rating,
                                comment, url, int(timestamp), '大众点评')
                record.insert()
                success_count += 1
                print(results)
                # NOTE(review): progress is recorded only after a stored
                # review; confirm this matches the intended resume behavior.
                DianPing.update_db(initial_num)
                if success_count == 3:
                    # Stop after three stored reviews to avoid being blocked.
                    print('------爬到了3个数据,休息一下,免的被封------')
                    break

        # Random 10-25 s pause between requests.
        time.sleep(10 + 15 * random.random())
    return results