def record_result(self, result, color='default', font_size=16, strong=False,
                  type='word', br=True, default=False, new_line=False):
    """Wrap *result* in HTML (or render it from markdown) and push it to
    redis for live display in the browser.

    Args:
        result: the text to publish; markdown source when type == 'image'.
        color: CSS color used when styling a plain word.
        font_size: CSS font size in px.
        strong: wrap in <strong> instead of <span>.
        type: 'word' -> inline HTML styling; 'image' -> markdown rendering.
        default: when True, skip HTML styling entirely.
        br: append '<br>' after the result.
        new_line: append a trailing '\n'.

    NOTE: ``type`` shadows the builtin but is kept — callers pass it by
    keyword, so renaming would break the interface.
    """
    # full_result is reset every call: each push carries only the current
    # fragment plus its separators, not an accumulated transcript.
    self.full_result = ''
    if type == 'word' and not default:
        if strong:
            result = '<strong style="color: %s; font-size: %spx;">%s</strong>' % (
                color, font_size, result)
        else:
            result = '<span style="color: %s; font-size: %spx;">%s</span>' % (
                color, font_size, result)
    elif type == 'image':
        # Render markdown (e.g. image links) to HTML before publishing.
        result = markdown2.markdown(result)

    self.full_result += result
    if br:
        self.full_result += '<br>'
    if new_line:
        self.full_result += '\n'

    # BUG FIX: dropped the leftover "RealTimeAnalysis 1111" debug line and
    # downgraded the two per-call traces from ERROR to DEBUG severity.
    logging.debug('full_result:%s result:%s' % (self.full_result, result))
    logging.debug('guid:%s, user_id:%s, info:%s, type:%s' %
                  (self.guid, self.user_id, self.full_result, type))

    utils.push_redis(guid=self.guid, user_id=self.user_id,
                     info=self.full_result, type=type)
def record_result(self, result, color='default', font_size=16, strong=False,
                  type='word', br=True, default=False, new_line=False):
    """Wrap *result* in HTML (or render it from markdown) and push it to
    redis for live display, keyed by product rather than user.

    Args:
        result: the text to publish; markdown source when type == 'image'.
        color: CSS color used when styling a plain word.
        font_size: CSS font size in px.
        strong: wrap in <strong> instead of <span>.
        type: 'word' -> inline HTML styling; 'image' -> markdown rendering.
        default: when True, skip HTML styling entirely.
        br: append '<br>' after the result.
        new_line: append a trailing '\n'.

    NOTE: ``type`` shadows the builtin but is kept — callers pass it by
    keyword, so renaming would break the interface.
    """
    # full_result is reset every call: each push carries only the current
    # fragment plus its separators.
    self.full_result = ''
    if type == 'word' and not default:
        if strong:
            result = '<strong style="color: %s; font-size: %spx;">%s</strong>' % (
                color, font_size, result)
        else:
            result = '<span style="color: %s; font-size: %spx;">%s</span>' % (
                color, font_size, result)
    elif type == 'image':
        # Render markdown (e.g. image links) to HTML before publishing.
        result = markdown2.markdown(result)

    self.full_result += result
    if br:
        self.full_result += '<br>'
    if new_line:
        self.full_result += '\n'

    utils.push_redis(guid=self.guid, product_id=self.product_id,
                     info=self.full_result, type=type)
def init(self):
    """Create the per-product comment table (if missing) and announce on
    redis that the crawl of this JD product's reviews has started.

    The table name comes from ``self.item_table``; the three TIMESTAMP
    columns auto-update on row modification (MySQL ON UPDATE semantics).
    """
    command = (
        "CREATE TABLE IF NOT EXISTS {} ("
        "`id` BIGINT (15) NOT NULL AUTO_INCREMENT,"  # comment id
        "`content` TEXT NOT NULL,"  # comment body
        "`creation_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"  # when the comment was posted
        "`reply_count` INT(4) DEFAULT NULL ,"  # number of replies
        "`score` INT(2) DEFAULT NULL,"  # star rating
        "`useful_vote_count` INT(5) DEFAULT NULL,"  # "useful" votes from other users
        "`useless_vote_count` INT(4) DEFAULT NULL,"  # "useless" votes from other users
        "`user_level_id` INT(4) DEFAULT NULL,"  # commenting user's level id
        '`user_province` CHAR(8) DEFAULT NULL,'  # user's province
        '`nickname` CHAR(20) DEFAULT NULL,'  # user's nickname
        '`product_color` CHAR(50) DEFAULT NULL,'  # product color
        "`product_size` CHAR(50) DEFAULT NULL,"  # product size
        "`user_level_name` CHAR(20) DEFAULT NULL,"  # user's level name
        "`user_client` INT(5) DEFAULT NULL,"  # platform the review came from (code)
        "`user_client_show` CHAR(20) DEFAULT NULL,"  # platform display name
        "`is_mobile` INT(3) DEFAULT NULL,"  # whether the review was made on mobile
        "`days` INT(3) DEFAULT NULL,"  # days between purchase and comment
        "`reference_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"  # purchase time
        "`after_days` INT(3) DEFAULT NULL,"  # days until the follow-up comment
        "`images_count` INT(3) DEFAULT NULL,"  # number of images on the comment
        "`ip` CHAR(20) DEFAULT NULL,"  # ip address of the follow-up comment
        "`after_content` TEXT DEFAULT NULL,"  # follow-up comment body
        "`save_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"  # time this row was scraped
        "PRIMARY KEY(id)"
        ") ENGINE=InnoDB".format(self.item_table))
    self.sql.create_table(command)

    utils.push_redis(self.guid, self.product_id, '开始抓取京东商城该商品的评价信息...')
def get_comment_count(self, response):
    """Parse the JD product page: record the product name, extract the
    comment-version token embedded in the page script, then request the
    first comments page (time-sorted).

    Yields:
        scrapy.Request for the JSONP comment endpoint; handled by
        ``get_all_comment``.
    """
    self.save_page('%s.html' % self.product_id, response.body)

    name = response.xpath('//head/title/text()').extract_first()
    self.log('name:%s' % name)
    utils.push_redis(
        self.guid, self.product_id,
        '商品名称:%s 链接:<a href="%s" target="_blank">%s' % (name, self.url, self.url))

    # Related sku ids displayed on the page; stored with the product record.
    ids = response.xpath('//div[@class="dd"]/div/@data-sku').extract()
    item_ids = ','.join(ids)
    self.log('item_ids:%s' % item_ids)

    # BUG FIX: raw string so \d is a regex digit class rather than an
    # (invalid) string escape; the compiled pattern is unchanged.
    pattern = re.compile(r"commentVersion:'(\d+)'", re.S)
    # NOTE(review): raises AttributeError when the token is absent —
    # scrapy will log the failure; behavior deliberately unchanged.
    comment_version = re.search(pattern, response.body).group(1)

    # sort type 5: by recommendation, 6: by time
    url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv' \
          '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page=0&pageSize=10' \
          '&isShadowSku=0'. \
        format(product_id = self.product_id, comment_version = comment_version, sort_type = '6')

    yield Request(
        url=url,
        headers={
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Host': 'club.jd.com',
            'Referer': 'https://item.jd.com/%s.html' % self.product_id,
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
                          'Firefox/52.0',
        },
        method='GET',
        meta={
            'name': name,
            'comment_version': comment_version,
            'item_ids': item_ids,
        },
        dont_filter=True,
        callback=self.get_all_comment)
def handle(self, *args, **options):
    """Entry point of the real-time analysis management command.

    Parses ``-a key=value`` spider arguments, validates guid/user_id,
    runs the crawler, then runs the analysis pass over the stored data.
    """
    # Python 2 only: force utf-8 as the default encoding so the Chinese
    # status messages survive logging / redis round-trips.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    os.chdir(sys.path[0])

    spargs = utils.arglist_to_dict(options['spargs'])

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(filename='log/%s.log' % spargs.get('user_id'),
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.ERROR)

    guid = spargs.get('guid', '0')
    user_id = spargs.get('user_id', '0')
    # BUG FIX: previously logged the literal string 'user_id' instead of
    # the actual value.
    logging.warn('user_id:%s' % user_id)

    if guid == '0' or user_id == '0':
        utils.log('分析数据传入参数不对,接收到的参数为: spargs:%s' % spargs)
        utils.push_redis(guid=guid, user_id=user_id,
                         info='分析数据传入参数不对,接收到的参数为:%s' % spargs)
        utils.push_redis(guid=guid, user_id=user_id, info='finish')
        return

    utils.log('开始分析:%s' % spargs)

    sql = SqlHelper()
    # NOTE(review): the config attribute is spelled redis_part — looks like
    # a typo for redis_port, but it is declared elsewhere so left as-is.
    red = redis.StrictRedis(host=config.redis_host, port=config.redis_part,
                            db=config.redis_db, password=config.redis_pass)
    spargs['sql'] = sql
    spargs['red'] = red

    # Crawl first, then analyse what the crawl stored.
    logging.warn(spargs)
    runspider(spargs)

    logging.warn(spargs)
    analysis = RealTimeAnalysis(**spargs)
    analysis.run()
def close(spider, reason):
    """Spider shutdown hook: persist the product summary, clean up the
    distributed-crawl bookkeeping in redis, publish final counts, commit.

    Args:
        spider: the finished spider instance.
        reason: scrapy close reason (unused here).
    """
    # BUG FIX (idiom): `is not None` instead of `!= None`.
    if spider.product_msg is not None:
        spider.sql.insert_json(spider.product_msg, config.jd_item_table)

    # Distributed mode: clear this product's job queue and page counter.
    if config.is_distributed:
        utils.red.delete('%s_page' % spider.product_id)
        utils.red.delete(spider.product_id)
        spider.log('clear redis product_id:%s' % spider.product_id)
        # Give sibling crawl processes a moment to finish their writes.
        time.sleep(5)

    table = 'item_%s' % spider.product_id

    def query_count(where=''):
        # One uncommitted SELECT COUNT(*) over the item table, optionally
        # filtered; the single commit happens at the end of close().
        command = "SELECT COUNT(*) FROM {}{}".format(table, where)
        spider.sql.execute(command, commit=False)
        (n, ) = spider.sql.cursor.fetchone()
        return n

    count = query_count()
    good_count = query_count(" WHERE score=5")
    general_count = query_count(" WHERE score>=3 and score <=4")
    poor_count = query_count(" WHERE score<=2")

    utils.push_redis(
        spider.guid, spider.product_id,
        info=
        '抓取信息完成,实际抓取评价信息,<strong style="color: red; font-size: 24px;">总共抓取评价数:%s、好评数:%s、'
        '中评数:%s、差评数:%s</strong>' % (count, good_count, general_count, poor_count))

    # Commit the whole crawl's inserts in one transaction.
    spider.sql.commit()
def randitem(spargs):
    """Pick a random product id from the recommendation feed, publish its
    link, then either start a fresh crawl+analysis or replay the cached
    analysis rows from MySQL.

    Args:
        spargs: dict of spider arguments; only 'guid' is read here.
    """
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    # NOTE(review): the endpoint is diviner.taobao.com but the cookies are
    # __jd* values — presumably copied from the JD variant; verify the feed
    # actually honors them before relying on this.
    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    cookies = {
        '__jda': '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }
    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)

    # BUG FIX: raw string for the regex; \d stays a digit class.
    pattern = re.compile(r'"sku":(\d+),', re.S)
    ids = re.findall(pattern, r.text)
    # ``id`` shadows the builtin; kept for consistency with the file.
    id = random.choice(ids)
    url = 'https://item.taobao.com/%s.html' % str(id)
    utils.push_redis(guid, 0,
                     '生成商品链接:<a href="%s" target="_blank">%s' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}". \
        format(table = config.tb_item_table, product_id = id)
    result = sql.query_one(command)

    # BUG FIX (idiom): `is None` instead of `== None`.
    if result is None:
        # Not yet crawled: spawn the crawl + real-time analysis command.
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'. \
            format(url = str(url), name = 'tb', dir = settings.BASE_DIR,
                   guid = guid, product_id = id)
        subprocess.Popen(cmd, shell=True)
    else:
        # Already crawled: replay the stored analysis rows in order.
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
            format(config.analysis_item_table, id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
def runspider(request):
    """Django view helper for JD product urls: validate the url, then
    either start a crawl, replay cached analysis, or force a re-analysis.

    Builds a ``data`` dict with status/guid/info for the caller.
    NOTE(review): no ``return`` is visible in this chunk — presumably the
    caller wraps ``data`` in a response elsewhere; confirm before relying
    on the value.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }
    try:
        # Production uses POST.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        # BUG FIX: raw string for the regex digit class.
        pattern = re.compile(r'\d+', re.S)
        product_id = re.search(pattern, url).group()
        sql = SqlHelper()
        utils.log('product_id:%s' % product_id)

        # BUG FIX (idiom): `is not None` instead of `!= None`.
        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            # BUG FIX: removed the trailing comma that turned info into a
            # one-element tuple instead of a string.
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table = config.jd_item_table, product_id = product_id)
            result = sql.query_one(command)

            if result is None:
                # First time: spawn the crawl + real-time analysis command.
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR,
                           guid = data.get('guid'), product_id = product_id)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # BUG FIX: column was misspelled `produce_id`, so the
                    # stale rows were never deleted before re-analysis.
                    command = "DELETE FROM {0} WHERE product_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)

                    # Re-run the analysis from scratch.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url = url, name = 'jd', dir = settings.BASE_DIR,
                               guid = data.get('guid'), product_id = product_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://item.jd.com/3995645.html'
    # py2.6+/py3-portable except syntax (was `except Exception, e`).
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
def get_all_comment(self, response):
    """Parse the JSONP comment-summary response: persist the product
    summary, publish JD's own counters to redis, then schedule one request
    per comments page — optionally fanning extra pages out to sibling
    crawlers through redis when distributed mode is on.
    """
    self.save_page('%s_all_comment.html' % self.product_id, response.body)

    # Detect the payload encoding instead of assuming utf-8/gbk.
    detect = chardet.detect(response.body)
    encoding = detect.get('encoding', '')
    body = response.body.decode(encoding, 'ignore')

    # Strip the JSONP wrapper: capture everything between '(' and ');'.
    pattern = re.compile('\((.*?)\);', re.S)
    item = re.search(pattern, body)
    if item != None and item.group(1) != None:
        data = json.loads(item.group(1))

        # productCommentSummary: aggregate counters reported by JD itself.
        pcs = data.get('productCommentSummary')
        self.product_msg = {
            'id': self.product_id,
            'name': response.meta.get('name'),
            'good_rate_show': pcs.get('goodRateShow'),
            'poor_rate_show': pcs.get('poorRateShow'),
            'average_score': pcs.get('averageScore'),
            'good_count': pcs.get('goodCount'),
            'general_rate': pcs.get('generalRate'),
            'general_count': pcs.get('generalCount'),
            'poor_rate': pcs.get('poorRate'),
            'after_count': pcs.get('afterCount'),
            'good_rate_style': pcs.get('goodRateStyle'),
            'poor_count': pcs.get('poorCount'),
            'poor_rate_style': pcs.get('poorRateStyle'),
            'general_rate_style': pcs.get('generalRateStyle'),
            'comment_count': pcs.get('commentCount'),
            'product_id': pcs.get('productId'),
            'good_rate': pcs.get('goodRate'),
            'general_rate_show': pcs.get('generalRateShow'),
            'url': self.url,
            'item_ids': response.meta.get('item_ids'),
            'save_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Rates arrive as fractions (e.g. 0.98) and are shown as percent.
        info = '京东商城显示的评价信息,<strong style="color: red; font-size: 24px;">总的评价数:{comment_count}、好评数:{good_count}、' \
               '好评百分比:{good_rate}%、中评数:{general_count}、中评百分比:{general_rate}%、差评数:{poor_count}、差评百分比:{poor_rate}% ' \
               '</strong>' \
            .format(comment_count = pcs.get('commentCount'), good_count = pcs.get('goodCount'),
                    general_count = pcs.get('generalCount'), poor_count = pcs.get('poorCount'),
                    good_rate = pcs.get('goodRate', 0) * 100,
                    general_rate = pcs.get('generalRate', 0) * 100,
                    poor_rate = pcs.get('poorRate', 0) * 100)
        utils.push_redis(self.guid, self.product_id, info)

        # Show a loading spinner while the per-page requests run.
        utils.push_redis(
            self.guid, self.product_id,
            '<li id="loader"><img src="/static/loader.gif" height="90" width="90"></li>',
            type='image', save_to_mysql=False)

        comment_version = response.meta.get('comment_version')
        comment_count = pcs.get('commentCount')
        # NOTE(review): integer division; the extra +10 pages read like a
        # safety margin — the original author's own comment asked why 10.
        page_count = int(comment_count) / 10 + 10
        inner_crawl_page = get_project_settings().get(
            'INNER_CRAWL_PAGE', 20)

        # Distributed mode: push the pages beyond our local quota into a
        # redis list, record the remaining-page counter, and signal every
        # registered sibling crawler (the 'spiders' list) to pick them up.
        if page_count > inner_crawl_page and config.is_distributed:
            for i in range(inner_crawl_page, page_count):
                data = {
                    # NOTE(review): 'prodyct_id' looks like a typo for
                    # 'product_id' — but the redis consumer is not visible
                    # here, so it may rely on this exact key; verify both
                    # sides before fixing.
                    'prodyct_id': self.product_id,
                    'comment_version': comment_version,
                    'sort_type': '6',
                    'page': i
                }
                self.red.rpush(self.product_id, json.dumps(data))

            count = self.red.llen('spiders')
            self.red.set('%s_page' % self.product_id, page_count - inner_crawl_page)
            for i in range(count):
                guid = self.red.lindex('spiders', i)
                self.red.rpush(guid, self.product_id)

        # Crawl the first pages in this process.
        count = min(page_count, inner_crawl_page)
        for i in range(count):
            # sort type 5: by recommendation, 6: by time
            url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv' \
                  '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page={page}&' \
                  'pageSize=10&isShadowSku=0'. \
                format(product_id = self.product_id, comment_version = comment_version,
                       sort_type = '6', page = i)

            yield Request(
                url=url,
                headers={
                    'Accept': '*/*',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'club.jd.com',
                    'Referer': 'https://item.jd.com/%s.html' % self.product_id,
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
                                  'Firefox/52.0',
                },
                method='GET',
                meta={
                    'page': i,
                    'name': response.meta.get('name'),
                },
                dont_filter=True,
                callback=self.parse_comment)
def runspider(request):
    """Django view helper for taobao seller-rating urls: extract the seller
    id from a rate.taobao.com url, then start a crawl, replay cached
    analysis, or force a re-analysis.

    Builds a ``data`` dict with status/guid/info for the caller.
    NOTE(review): no ``return`` is visible in this chunk — presumably the
    caller wraps ``data`` in a response elsewhere; confirm before relying
    on the value.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }
    try:
        # Production uses POST.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        # The seller id follows 'user-rate-' in the url path.
        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]
        # BUG FIX: raw string for the regex word-character class.
        pattern = re.compile(r'\w+', re.S)
        user_id = re.search(pattern, user_id).group()
        sql = SqlHelper()
        utils.log('user_id:%s' % user_id)

        # BUG FIX (idiom): `is not None` instead of `!= None`.
        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000, 9999999999999)) + '_' + str(
                random.randint(100, 999))
            # BUG FIX: removed the trailing comma that turned info into a
            # one-element tuple instead of a string.
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table = config.tb_item_table, user_id = user_id)
            result = sql.query_one(command)

            if result is None:
                # First time: spawn the crawl + real-time analysis command.
                # BUG FIX: restored the `cd {dir};` prefix — `dir` was
                # passed to format() but never used, so the subprocess ran
                # in the wrong working directory (inconsistent with the JD
                # variant of this view).
                name = 'tb_comment'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR,
                           guid = data.get('guid'), user_id = user_id)
                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # BUG FIX: column was misspelled `produce_id`, so the
                    # stale rows were never deleted before re-analysis.
                    command = "DELETE FROM {0} WHERE product_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)

                    # Re-run the analysis from scratch.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url = url, name = 'tb', dir = settings.BASE_DIR,
                               guid = data.get('guid'), user_id = user_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    # py2.6+/py3-portable except syntax (was `except Exception, e`).
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e