class JdCommentParse(object):
    """Fetch buyer comments for a JD product and normalise them.

    Comments are requested from the JD mobile-site JSON endpoint
    (``newCommentsDetail.json``) and converted into the list-of-dicts layout
    produced by :meth:`_get_comment_list`.
    """

    def __init__(self, logger=None):
        """Set up logger, request headers and session cookies.

        :param logger: optional logger object; when ``None`` a file/console
            logger is created by :meth:`_set_logger`.
        """
        self.result_data = {}
        self.msg = ''
        self._set_logger(logger)
        self._set_headers()
        # Pause (in seconds) after each comment-page request.
        self.comment_page_switch_sleep_time = 1.2
        self.my_phantomjs = MyPhantomjs()
        self._add_headers_cookies()

    def _get_comment_data(self, goods_id):
        """Download and convert the comments of one product.

        :param goods_id: JD product id; an empty string is rejected.
        :return: ``{}`` when ``goods_id`` is empty or the conversion of the
            downloaded comments fails, otherwise a ``CommentItem`` carrying
            ``goods_id``, ``create_time``, ``modify_time`` and
            ``_comment_list``. The same value is stored in
            ``self.result_data``.
        """
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        self.goods_id = goods_id
        self.headers.update({
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # Comments come from the JD mobile product-review endpoint.
        _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        _tmp_comment_list = []
        for current_page in range(1, 3):
            params = self._set_params(goods_id=goods_id, current_page=current_page)
            body = MyRequests.get_url_body(url=_url, headers=self.headers, params=params)
            page_data = self._json_2_dict(body)
            _tmp_comment_list += self._extract_comment_info_list(page_data)
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            # Any malformed comment invalidates the whole result for this id.
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data

    @staticmethod
    def _extract_comment_info_list(page_data):
        """Return ``wareDetailComment.commentInfoList`` from one page, or ``[]``.

        Tolerates a payload that is not a dict and keys whose value is
        ``None`` (JSON ``null``), both of which would otherwise raise outside
        of any error handling.
        """
        if not isinstance(page_data, dict):
            return []
        detail = page_data.get('wareDetailComment') or {}
        if not isinstance(detail, dict):
            return []
        comment_info_list = detail.get('commentInfoList') or []
        return comment_info_list if isinstance(comment_info_list, list) else []

    @staticmethod
    def _require_non_empty(value, message):
        """Raise ``AssertionError(message)`` when ``value`` is the empty string.

        An explicit raise is used instead of ``assert`` so the check is still
        performed when Python runs with ``-O``.
        """
        if value == '':
            raise AssertionError(message)

    def _get_comment_list(self, _tmp_comment_list):
        """Convert raw comment dicts into the required result layout.

        :param _tmp_comment_list: list of raw comment dicts from the endpoint.
        :return: list of dicts with the keys ``buyer_name``, ``comment``,
            ``quantify``, ``head_img`` and ``append_comment``.
        :raises AssertionError: when a comment lacks its date, content,
            buyer nickname or avatar.
        """
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = item.get('commentDate', '')
            self._require_non_empty(_comment_date, '得到的_comment_date为空str!请检查!')

            # Some comments carry no sku attributes, so an empty sku_info is
            # accepted without a check.
            ware_attributes = item.get('wareAttributes') or []
            sku_info = ' '.join([
                i.get('key', '') + ':' + i.get('value', '')
                for i in ware_attributes
            ])

            _comment_content = item.get('commentData', '')
            self._require_non_empty(_comment_content, '得到的评论内容为空str!请检查!')
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = item.get('userNickName', '')
            self._require_non_empty(buyer_name, '得到的用户昵称为空值!请检查!')

            # JD does not expose the purchased quantity; default to 1.
            quantify = 1

            head_img = item.get('userImgURL', '')
            self._require_non_empty(head_img, '得到的用户头像为空值!请检查!')
            # NOTE(review): assumes the endpoint returns a scheme-less URL - confirm.
            head_img = 'https://' + head_img

            # Pictures attached to the first (non-follow-up) comment.
            _comment_img_list = [
                {'img_url': img.get('largePicURL', '')}
                for img in (item.get('pictureInfoList') or [])
            ]

            # Follow-up comments are not collected; always an empty dict.
            append_comment = {}

            star_level = int(item.get('commentScore', '5'))

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': star_level,
                'video': '',
            }]

            _comment_list.append({
                'buyer_name': buyer_name,           # buyer nickname
                'comment': comment,                 # comment content
                'quantify': quantify,               # purchased quantity
                'head_img': head_img,               # avatar URL
                'append_comment': append_comment,   # follow-up comment
            })

        return _comment_list

    def _add_headers_cookies(self):
        """Fetch session cookies through PhantomJS and add them to the headers.

        The original author's note says the endpoint has to be called with
        cookies, specifically ones containing ``sid``.
        """
        _cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://item.m.jd.com/')
        self.headers.update({
            'cookie': _cookies,
        })

        return None

    def _set_logger(self, logger):
        """Use ``logger`` when given, otherwise create a dated log-file logger."""
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/京东/comment/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR
            )
        else:
            self.my_lg = logger

    def _set_headers(self):
        """Initialise ``self.headers`` with a randomly chosen user agent."""
        self.headers = {
            'origin': 'https://item.m.jd.com',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS) - 1)],
            'content-type': 'application/x-www-form-urlencoded',
            'accept': 'application/json',
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=5025518',
            'x-requested-with': 'XMLHttpRequest',
        }

    def _wash_comment(self, comment):
        """Clean a comment text.

        Removes ``jd``/``Jd``/``JD`` and newlines, then replaces ``京东`` with
        ``优秀网``.

        :param comment: raw comment text.
        :return: cleaned comment text.
        """
        comment = re.sub(r'jd|\n|Jd|JD', '', comment)
        comment = re.sub('京东', '优秀网', comment)

        return comment

    def _json_2_dict(self, json_str):
        """Decode ``json_str``; log an error and return ``{}`` on failure.

        :param json_str: JSON text (anything else is treated as a failure).
        :return: the decoded object, or ``{}`` when decoding fails.
        """
        try:
            return json.loads(json_str)
        except (TypeError, ValueError):
            # goods_id may be a non-str value or not set yet; the error path
            # must never raise while building its own message.
            self.my_lg.error(
                'json.loads转换json_str时出错! 出错goods_id: ' + str(getattr(self, 'goods_id', '')))
            return {}

    def _set_params(self, goods_id, current_page):
        """Build the query parameters for one comment page.

        :param goods_id: JD product id, sent as ``wareId``.
        :param current_page: page number, sent as ``offset``.
        :return: list of ``(name, value)`` tuples.
        """
        _params = [
            ('wareId', goods_id),
            ('offset', str(current_page)),
            ('num', '10'),
            ('checkParam', 'LUIPPTP'),
            ('category', '670_671_1105'),
            ('isUseMobile', 'true'),
            ('evokeType', ''),
            ('type', '3'),  # '0' all comments | '3' positive comments
            ('isCurrentSku', 'false'),
        ]

        return _params

    def __del__(self):
        """Drop references to the logger, PhantomJS wrapper and headers."""
        # Remove each attribute independently so one missing attribute (e.g.
        # after a failed __init__) does not prevent releasing the others.
        for attr_name in ('my_lg', 'my_phantomjs', 'headers'):
            self.__dict__.pop(attr_name, None)
        gc.collect()
def run_forever(self):
    """Refresh every JuMei flash-sale row returned by the database.

    For each stored row the flash-sale end time is classified with
    ``self.is_recent_time``: a result of ``0`` deletes the row as expired,
    ``2`` leaves it untouched, and anything else re-crawls the goods page
    and updates the row. After the pass the method sleeps (5.5 hours when
    the Shanghai hour is 0, otherwise 5 seconds).

    :return: ``False`` when session cookies could not be obtained,
        otherwise ``None``.
    """
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_jumeiyoupin_xianshimiaosha_all_goods_id())
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1

        jumeiyoupin_spike = JuMeiYouPinSpike()

        # Obtain session cookies, then release the PhantomJS wrapper at once.
        my_phantomjs = MyPhantomjs()
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        del my_phantomjs
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        for item in result:
            # Decode the stored flash-sale window once and reuse it below.
            end_time_str = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                time.mktime(time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

            # Created per row on purpose: the original author notes that a
            # single long-lived parser instance used far more memory.
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:
                # Reconnect every 50 rows so one long-lived connection does
                # not become unresponsive.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                # Evaluate once: two separate calls could disagree if the
                # clock crosses a boundary between them.
                recent_state = self.is_recent_time(miaosha_end_time)
                if recent_state == 0:
                    tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                        goods_id=item[0])
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!' % end_time_str)

                elif recent_state == 2:
                    # Deliberately not ``break``: the rows returned by the
                    # database are not ordered by time.
                    pass

                else:
                    # Any other state means the row is inside the update window.
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))

                    this_page_all_goods_list = self.get_one_page_all_goods_list(
                        item[2])

                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        # As before, ``continue`` also skips the index
                        # increment and gc pass at the bottom of the loop.
                        continue

                    elif this_page_all_goods_list == []:
                        print(
                            '#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                            item[0])
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')

                    else:
                        # Per the original author, goods are not withdrawn
                        # early, so everything still on sale is updated.
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(
                            item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()

                        if goods_data == {}:
                            # Nothing could be parsed for this row; skip it.
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time':
                                goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time':
                                goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data[
                                'miaosha_end_time'] = jumeiyoupin_spike.get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])

                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                            sleep(JUMEIYOUPIN_SLEEP_TIME)

            else:
                print('数据库连接失败,数据库可能关闭或者维护中')

            index += 1
            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates during the hours after midnight.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
class JuMeiYouPinSpike(object):
    """Crawl the JuMei (h5.jumei.com) flash-sale listings and store new goods."""

    def __init__(self):
        self._set_headers()

    def _set_headers(self):
        """Initialise ``self.headers`` with a randomly chosen user agent."""
        self.headers = {
            'Accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'h5.jumei.com',
            'Referer': 'https://h5.jumei.com/',
            'Cache-Control': 'max-age=0',
            'X-Requested-With': 'XMLHttpRequest',
            # Index range follows the real list length instead of a fixed 34.
            'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
        }

    def get_spike_hour_goods_info(self):
        """Collect the on-sale and pre-sale flash-sale goods and store them.

        Both listings are paged through the ``ajaxDealactList`` endpoint,
        de-duplicated by ``item_id`` and handed to :meth:`deal_with_data`.

        :return: ``False`` when session cookies could not be obtained,
            otherwise ``True``.
        """
        my_phantomjs = MyPhantomjs()
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        del my_phantomjs
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        all_goods_list = []
        seen_item_ids = set()

        print('开始抓取在售商品...')
        self._crawl_listing('formal', '1521336720', seen_item_ids, all_goods_list)

        print('开始抓取预售商品...')
        self._crawl_listing('pre', '1521858480', seen_item_ids, all_goods_list)

        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', len(all_goods_list))

        self.deal_with_data(all_goods_list)

        return True

    def _crawl_listing(self, list_type, page_key, seen_item_ids, collected):
        """Page through one listing until an empty page is returned.

        :param list_type: value of the ``type`` query parameter
            (``'formal'`` or ``'pre'``).
        :param page_key: value of the ``page_key`` query parameter.
        :param seen_item_ids: set of item ids already collected (mutated).
        :param collected: list receiving the new items (mutated).
        """
        for page in range(1, 50):
            tmp_url = (
                'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}'
                '&platform=wap&type={1}&page_key={2}'
            ).format(str(page), list_type, page_key)
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)

            this_page_item_list = self._parse_item_list(body)
            if not this_page_item_list:
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break

            self._collect_new_items(this_page_item_list, page, seen_item_ids, collected)
            sleep(.5)

    @staticmethod
    def _parse_item_list(body):
        """Return the ``item_list`` of a listing response, or ``[]``.

        Undecodable bodies, non-dict payloads and a missing/``null``
        ``item_list`` all yield an empty list.
        """
        try:
            json_body = json.loads(body)
        except (TypeError, ValueError):
            print('json.loads转换body时出错!请检查')
            return []
        if not isinstance(json_body, dict):
            return []
        return json_body.get('item_list') or []

    @staticmethod
    def _collect_new_items(page_items, page, seen_item_ids, collected):
        """Append items whose ``item_id`` was not seen yet, tagging their page.

        A set of seen ids replaces rebuilding the id list for every item.
        """
        for item in page_items:
            item_id = item.get('item_id', '')
            if item_id in seen_item_ids:
                continue
            seen_item_ids.add(item_id)
            item['page'] = page
            collected.append(item)

    def deal_with_data(self, *params):
        """Parse and insert the flash-sale goods not yet in the database.

        :param params: ``params[0]`` is the list of dicts with the keys
            ``goods_id``, ``type`` and ``page``.
        :return: ``None``
        """
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, page, goods_url from dbo.jumeiyoupin_xianshimiaosha where site_id=26'
            db_goods_ids = {row[0] for row in my_pipeline._select_table(sql_str=sql_str)}

            for item in item_list:
                goods_id = item.get('goods_id', '')
                if goods_id in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                jumei = JuMeiYouPinParse()
                goods_type = item.get('type', '')
                tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(goods_id, goods_type)
                jumei.get_goods_data(goods_id=[goods_id, goods_type])
                goods_data = jumei.deal_with_data()

                if goods_data == {}:
                    pass

                elif goods_data.get('is_delete', 0) == 1:
                    print('------>>>| 该商品库存为0,已被抢光!')

                else:
                    # Complete the record and insert it.
                    goods_data['goods_url'] = tmp_url
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                        'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                    }
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                        self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                    goods_data['page'] = item.get('page')

                    jumei.insert_into_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                    # Throttle between inserts.
                    sleep(JUMEIYOUPIN_SLEEP_TIME)

                # Release the parser before the next item is processed.
                del jumei

        else:
            print('数据库连接失败,此处跳过!')

        gc.collect()

    def get_miaosha_begin_time_and_miaosha_end_time(self, miaosha_time):
        """Convert the flash-sale window strings to ``datetime`` objects.

        :param miaosha_time: dict with ``miaosha_begin_time`` and
            ``miaosha_end_time`` in ``'%Y-%m-%d %H:%M:%S'`` format.
        :return: tuple ``(miaosha_begin_time, miaosha_end_time)``.
        """
        time_format = '%Y-%m-%d %H:%M:%S'
        miaosha_begin_time = datetime.datetime.strptime(
            miaosha_time.get('miaosha_begin_time'), time_format)
        miaosha_end_time = datetime.datetime.strptime(
            miaosha_time.get('miaosha_end_time'), time_format)

        return miaosha_begin_time, miaosha_end_time

    def __del__(self):
        gc.collect()