def _deal_with_jd_goods(goods_link, my_lg):
    '''
    Handle one JD (jd.com) goods link: extract the goods_id, crawl the item
    page, normalize the crawled data, store it (best effort) and return an
    API-compatible result.

    :param goods_link: raw JD goods url to process
    :param my_lg: logger instance
    :return: dict from compatible_api_goods_data(...) on success,
             or _error_data(...) on failure
    '''
    my_lg.info('进入京东商品处理接口...')
    goods_id = _get_jd_goods_id(goods_link)
    if goods_id == '':
        # could not extract a goods_id from the given url
        msg = 'goods_id匹配失败!请检查url是否正确!'
        return _error_data(msg=msg)

    jd_url = 'https://item.jd.com/{0}.html'.format(goods_id)
    data = get_one_jd_data(wait_to_deal_with_url=jd_url)
    if data.get('msg', '') == 'data为空!':
        # the crawler returned an empty payload for this goods_id
        msg = '该goods_id:{0}, 抓取数据失败!'.format(goods_id)
        return _error_data(msg=msg)

    site_id = _from_jd_type_get_site_id(type=data.get('jd_type'))
    data = _get_right_model_data(data=data, site_id=site_id, logger=my_lg)

    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    my_lg.info('------>>>| 正在存储的数据为: ' + data.get('goods_id', ''))
    params = _get_db_jd_insert_params(item=data)
    sql_str = 'insert into dbo.GoodsInfoAutoGet(GoodsID, GoodsUrl, UserName, CreateTime, ModfiyTime, ShopName, Account, GoodsName, SubTitle, LinkName, Price, TaoBaoPrice, PriceInfo, SKUName, SKUInfo, ImageUrl, PropertyInfo, DetailInfo, SellCount, SiteID, IsDelete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    # Best-effort insert: the result is deliberately ignored so that an
    # already-stored goods_id does not make the whole request fail.
    my_pipeline._insert_into_table(sql_str=sql_str, params=params)

    return compatible_api_goods_data(data=data, my_lg=my_lg)
def get_all_user_and_their_recommend_goods_list(self):
    """
    Crawl the JD "优选达人" (featured influencer) ajax feed pages.

    For every influencer on every page: store the influencer's recommendation
    article into dbo.jd_youxuan_daren_recommend, then crawl every recommended
    goods item that is not already stored in dbo.GoodsInfoAutoGet.

    Returns:
        [] when a json payload cannot be parsed or an article's feed detail
        cannot be fetched; otherwise None after all pages are processed.
    """
    for index in range(1, 100):
        # time.time().__round__() keeps the timestamp rounded to whole seconds
        t = str(time.time().__round__()) + str(randint(100, 999))
        # ajax url of the influencer-recommendation feed
        tmp_url = 'https://wq.jd.com/shopgroup_feed/GetDarenFeeds?pageno={}&pagesize=5&darenType=0&perDarenFeedNum=3&totalpage=1&_={}&callback=jsonpCBKC&g_ty=ls'.format(
            str(index), t
        )
        self.from_ip_pool_set_proxy_ip_to_phantomjs()
        self.driver.set_page_load_timeout(15)  # 15s cap to avoid bad data
        try:
            self.driver.get(tmp_url)
            self.driver.implicitly_wait(15)
        except Exception as e:  # on timeout, abort loading and carry on
            print('-->>time out after 15 seconds when loading page')
            # when loading exceeds the limit, stop it via JavaScript so the
            # subsequent steps can still run
            self.driver.execute_script('window.stop()')
            # pass
        body = self.driver.page_source
        body = re.compile(r'\n').sub('', body)
        body = re.compile(r'\t').sub('', body)
        body = re.compile(r' ').sub('', body)
        # print(body)
        # the jsonp payload is wrapped in square(...)
        body = re.compile(r'square\((.*)\)').findall(body)
        if body != []:
            body = body[0]
            try:
                data = json.loads(body)
                # pprint(data)
            except:
                print('json.loads转换body得到data时出错!')
                return []
            if data.get('user_list') is None:
                # no more data: the server returns
                # square({"errmsg":"","iRet":0,"totalnum":347})
                print('body中获取的user_list为None!')
                pass
            else:
                user_list = data.get('user_list', [])
                # pprint(user_list)
                for item in user_list:
                    # influencer nickname
                    nick_name = item.get('nickname', '')
                    # influencer avatar
                    head_url = item.get('headurl', '')
                    head_url = re.compile(r'http:').sub('', head_url)
                    if re.compile(r'^http').findall(head_url) != []:
                        pass
                    else:
                        head_url = 'http:' + head_url
                    # influencer bio / signature
                    profile = item.get('profile', '')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    sql_str = r'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=7 or SiteID=8 or SiteID=9 or SiteID=10'
                    _ = my_pipeline._select_table(sql_str=sql_str)
                    # goods ids already stored (JD site ids 7-10)
                    db_goods_id = [j[1] for j in list(_)] if _ is not None else []
                    # print(db_goods_id)
                    sql_str = r'select share_id from dbo.jd_youxuan_daren_recommend'
                    # article share ids already stored
                    db_share_id = [j[0] for j in list(my_pipeline._select_table(sql_str=sql_str))]
                    # print(db_share_id)
                    jd = JdParse()
                    # goods info recommended by the influencer
                    feed_list = item.get('feed_list', [])
                    for feed_list_item in feed_list:
                        if feed_list_item.get('shareid', '') in db_share_id:
                            print('该share_id({})已存在于数据库中, 此处跳过!'.format(feed_list_item.get('shareid', '')))
                            pass
                        else:
                            # share_id
                            share_id = feed_list_item.get('shareid', '')
                            article_url = 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id
                            print('------>>>| 正在抓取的jd优选达人推荐文章的地址为: ', 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id)
                            # picture info of the article
                            tmp_share_img_url_list = []
                            for item2 in feed_list_item.get('sharepicurl', '').split(','):
                                if re.compile(r'^//').findall(item2) == []:
                                    tmp_share_img_url = 'https://img14.360buyimg.com/evalpic/s800x800_' + item2
                                else:
                                    tmp_share_img_url = 'http:' + item2
                                tmp_share_img_url_list.append(tmp_share_img_url)
                            share_img_url_list = [{'img_url': item5} for item5 in tmp_share_img_url_list]
                            # build the div holding the influencer's photos
                            tmp_img_div_desc = ''
                            for item4 in tmp_share_img_url_list:
                                tmp_img_div = r'<img src="{}" style="height:auto;width:100%;"/>'.format(item4)
                                tmp_img_div_desc += tmp_img_div
                            my_img_div = '<div>' + tmp_img_div_desc + '</div>'
                            # print(my_img_div)
                            # fetch goods_id and first_text of the article
                            share_url = 'https://wq.jd.com/shopgroup_feed/FeedDetail?shareid=' + feed_list_item.get('shareid', '') + '&g_tk=1975813451'
                            try:
                                self.from_ip_pool_set_proxy_ip_to_phantomjs()
                                self.driver.get(share_url)
                                self.driver.implicitly_wait(15)
                            except Exception as e:  # on timeout, abort loading and carry on
                                print('-->>time out after 15 seconds when loading page')
                                # stop loading via JavaScript so the following steps can run
                                self.driver.execute_script('window.stop()')
                                # pass
                            feed_detail_body = self.driver.page_source
                            feed_detail_body = re.compile(r'\n').sub('', feed_detail_body)
                            feed_detail_body = re.compile(r'\t').sub('', feed_detail_body)
                            feed_detail_body = re.compile(r' ').sub('', feed_detail_body)
                            feed_data = re.compile(r'square\((.*)\)').findall(feed_detail_body)
                            # print(feed_data)
                            if feed_data != []:
                                feed_data = feed_data[0]
                                try:
                                    feed_data = json.loads(feed_data)
                                except:
                                    print('json.loads转换feed_data失败,此处跳过!')
                                    break  # break out; the outer else below then runs
                                # article title
                                title = feed_data.get('feeddata', {}).get('title', '')
                                title = re.compile(r'12.12').sub('', title)
                                # influencer comment content
                                tmp_comment_content = feed_data.get('feeddata', {}).get('commentcontent', '')
                                tmp_comment_content = re.compile(r'&').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'\n').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'12.12').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'11.11').sub('', tmp_comment_content)
                                comment_content = tmp_comment_content
                                if title == '':
                                    # title is empty: promote comment_content to the
                                    # title and clear comment_content
                                    title = comment_content
                                    comment_content = ''
                                # print('该文章的标题为: ', title)
                                # print('达人的评论内容为: ', comment_content)
                                # first_text (first comment paragraph of the article)
                                first_text = feed_data.get('feeddata', {}).get('firsttext', '')
                                first_text = re.compile(r'12.12').sub('', first_text)
                                first_text = re.compile(r'11.11').sub('', first_text)
                                # print('first_text为: ', first_text)
                                sku_id = feed_data.get('feeddata', {}).get('skuid')
                                if sku_id == '0':
                                    # sku_id == '0' means there is no sku_id
                                    sku_id = ''
                                # print('sku_id为: ', sku_id)
                                share_id = feed_list_item.get('shareid', '')
                                tmp_div_body_dict = self.get_div_body(share_id=share_id)
                                # pprint(tmp_div_body_dict)
                                if tmp_div_body_dict['sku_info'] == [] and sku_id != '':
                                    # sku_info is empty: the second part carries no
                                    # goods_id, so fall back to the single sku_id
                                    goods_id_list = [{'goods_id': sku_id}]
                                else:
                                    # goods_ids recommended by this article (the first
                                    # one is the goods_id used when div_body is absent)
                                    goods_id_list = [{'goods_id': item6} for item6 in tmp_div_body_dict['sku_info']]
                                tmp_div_body = '<div>' + '<h3>{}</h3>'.format(title) + '<p>{}</p>'.format(comment_content) + my_img_div + tmp_div_body_dict['div_body']
                                # print('该文章推荐的商品goods_id的list为: ', goods_id_list)
                                # print(tmp_div_body)
                            else:
                                print('获取feed_data失败!')
                                return []
                            # post processing
                            if comment_content == '':
                                comment_content = first_text
                            # timezone handling: normalize the time to Shanghai time
                            ''' 时区处理,时间处理到上海时间 '''
                            tz = pytz.timezone('Asia/Shanghai')  # build the timezone object
                            now_time = datetime.datetime.now(tz)
                            # keep seconds precision and drop the tz suffix
                            now_time = re.compile(r'\..*').sub('', str(now_time))
                            # convert the string back into a datetime
                            now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
                            create_time = now_time  # creation time of the article
                            result = {
                                'nick_name': nick_name,  # influencer nickname
                                'head_url': head_url,  # influencer avatar
                                'profile': profile,  # influencer bio / signature
                                'share_id': share_id,  # share_id of the article
                                'article_url': article_url,  # original article url
                                'title': title,  # article title
                                'comment_content': comment_content,  # influencer comment
                                'share_img_url_list': share_img_url_list,  # influencer photo list
                                # 'first_text': first_text,  # first comment paragraph of the article
                                'goods_id_list': goods_id_list,  # all goods_ids recommended in the article
                                'div_body': tmp_div_body,  # article body div
                                'create_time': create_time,  # article creation time
                            }
                            # pprint(result)
                            print(result)
                            params = self._get_db_insert_params(item=result)
                            sql_str = r'insert into dbo.jd_youxuan_daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_img_url_list, goods_id_list, div_body, create_time) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                            my_pipeline._insert_into_table(sql_str=sql_str, params=params)
                            print('准备开始抓取该文章中的所有推荐商品'.center(30, '-'))
                            for i in goods_id_list:
                                if i.get('goods_id', '') in db_goods_id:
                                    print('该goods_id({})已经存在于数据库中, 此处跳过!'.format(i.get('goods_id', '')))
                                    pass
                                else:
                                    tmp_goods_id_url = 'https://item.jd.com/' + i.get('goods_id', '') + '.html'
                                    goods_id = jd.get_goods_id_from_url(jd_url=tmp_goods_id_url)
                                    jd.get_goods_data(goods_id=goods_id)
                                    tmp_jd_data = jd.deal_with_data(goods_id=goods_id)
                                    tmp_jd_data['spider_url'] = tmp_goods_id_url
                                    tmp_jd_data['username'] = '******'
                                    tmp_jd_data['goods_id'] = goods_id[1]
                                    jd.insert_into_jd_table(data=tmp_jd_data, pipeline=my_pipeline)
                            print('该文章内推荐的商品全部抓取完毕'.center(30, '-'))
        else:
            print('body为空list!')
async def _insert_into_table(self):
    """
    Persist self.target_data (a 3-level region tree: province -> city ['o']
    -> district ['w']) into dbo.Region, one level at a time.

    :return: True after all three levels have been written.
    """
    pipeline = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = 'insert into dbo.Region(c_name, code, parent_code, parent_name) values(%s, %s, %s, %s)'

    def _save(node, parent_code='', parent_name=''):
        # one row per region node; echoes the params like the original did
        params = (
            node.get('c_name'),
            node.get('code'),
            parent_code,
            parent_name,
        )
        print(params)
        pipeline._insert_into_table(sql_str=sql_str, params=params)

    # level 1: provinces (no parent)
    print('第一级别'.center(100, '-'))
    for item in self.target_data:
        _save(item)

    # level 2: cities, parented by their province
    print('第二级别'.center(100, '-'))
    for item in self.target_data:
        # guard: 'o' may be missing/None, which previously raised TypeError
        for i in item.get('o') or []:
            _save(i, parent_code=item.get('code'), parent_name=item.get('c_name'))

    # level 3: districts, parented by their city
    print('第三级别'.center(100, '-'))
    for item in self.target_data:
        for i in item.get('o') or []:
            w = i.get('w')
            if w is not None:
                for j in w:
                    _save(j, parent_code=i.get('code'), parent_name=i.get('c_name'))
            else:
                # eg: 北京市 (Beijing) only has two levels; no third level
                pass

    print('全部写入完毕!'.center(100, '*'))
    return True
def run(self):
    """
    Coupon consumer loop: drain jsonp coupon payloads from the global
    coupon_queue, parse each coupon, persist new coupons into
    dbo.coupon_info, and — exactly once per coupon — apply the coupon to the
    matching goods price in dbo.GoodsInfoAutoGet.

    Runs forever; failures are printed and the loop continues.
    """
    global coupon_queue, goods_id_and_coupon_url_list, unique_coupon_id_list

    while True:
        sql_cli = None
        try:
            if coupon_queue.qsize() >= 1:
                # todo some coupon urls are for pre-sale (deposit) goods; not handled here
                coupon_item = coupon_queue.get()
                # the payload is jsonp: unwrap the (...) and parse the json inside
                ori_coupon_list = json_2_dict(
                    json_str=re.compile('\((.*)\)').findall(coupon_item)[0],
                    default_res={},
                ).get('data', {}).get('resultList', [])
                assert ori_coupon_list != []
                # pprint(ori_coupon_list)
                # todo: "buy several items" coupons live in the 'nCouponInfoMap'
                # field of the response; only single-item coupons are supported
                coupon_list = []
                for item in ori_coupon_list:
                    try:
                        goods_id = str(item.get('itemId', ''))
                        assert goods_id != ''
                        # a coupon can be used only once per account
                        # coupon display name, eg: '优惠券'
                        coupon_display_name = '优惠券'
                        # coupon value, i.e. the discount in yuan
                        ori_coupon_value = item.get('couponAmount', '')
                        assert ori_coupon_value != ''
                        coupon_value = str(float(ori_coupon_value).__round__(2))
                        # minimum spend required to use the coupon
                        ori_thresold = item.get('couponStartFee', '')
                        assert ori_thresold != ''
                        threshold = str(float(ori_thresold).__round__(2))
                        begin_time = str(timestamp_to_regulartime(int(item.get('couponEffectiveStartTime', '')[0:10])))
                        end_time = str(timestamp_to_regulartime(int(item.get('couponEffectiveEndTime', '')[0:10])))
                        # human-readable usage description
                        use_method = '满{}元, 减{}元'.format(threshold, coupon_value)
                        if string_to_datetime(end_time) <= get_shanghai_time():
                            print('该券已过期[goods_id: {}]'.format(goods_id))  # expired
                            continue
                        if datetime_to_timestamp(string_to_datetime(end_time)) - datetime_to_timestamp(string_to_datetime(begin_time)) \
                                <= 60 * 60 * 36:
                            print('该券小于1.5天[goods_id: {}], pass'.format(goods_id))
                            continue
                        # todo the same goods may carry the same coupon across
                        # different activity windows, yielding several coupons per
                        # goods and (historically) repeated price reductions.
                        # Therefore the table keeps only ONE coupon per goods_id:
                        # the unique id is derived from goods_id alone, i.e. only
                        # the strongest coupon is stored.
                        # (previous scheme, kept for reference:)
                        # unique_id = str(get_uuid3(
                        #     target_str=goods_id \
                        #     + coupon_value \
                        #     + threshold \
                        #     + str(datetime_to_timestamp(string_to_datetime(begin_time)))[0:10]\
                        #     + str(datetime_to_timestamp(string_to_datetime(end_time)))[0:10]))
                        unique_id = str(get_uuid3(target_str=goods_id))
                        # coupon claiming url
                        # pprint(goods_id_and_coupon_url_list)
                        coupon_url = ''
                        for j in goods_id_and_coupon_url_list:
                            tmp_goods_id = j['goods_id']
                            tmp_coupon_url = j['coupon_url']
                            if goods_id == tmp_goods_id:
                                print('@@@ 成功匹配到goods_id: {} 的领券地址: {}!!'.format(goods_id, tmp_coupon_url))
                                coupon_url = tmp_coupon_url
                                break
                            else:
                                continue
                        assert coupon_url != ''
                        coupon_list.append({
                            'unique_id': unique_id,
                            'goods_id': goods_id,
                            'coupon_url': coupon_url,
                            'coupon_display_name': coupon_display_name,
                            'coupon_value': coupon_value,
                            'threshold': threshold,
                            'begin_time': begin_time,
                            'end_time': end_time,
                            'use_method': use_method,
                        })
                    except Exception as e:
                        print(e)
                        continue
                # pprint(coupon_list)
                if coupon_list != []:
                    # persist
                    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                    if not sql_cli.is_connect_success:
                        raise SqlServerConnectionException
                    for item in coupon_list:
                        unique_id = item['unique_id']
                        goods_id = item['goods_id']
                        if unique_id not in unique_coupon_id_list:
                            save_res = sql_cli._insert_into_table(
                                sql_str=
                                'insert into dbo.coupon_info(unique_id, create_time, goods_id, coupon_url, coupon_display_name, coupon_value, threshold, begin_time, end_time, use_method) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                                params=(
                                    unique_id,
                                    str(get_shanghai_time()),
                                    goods_id,
                                    item['coupon_url'],
                                    item['coupon_display_name'],
                                    Decimal(item['coupon_value']).__round__(2),
                                    Decimal(item['threshold']).__round__(2),
                                    item['begin_time'],
                                    item['end_time'],
                                    item['use_method'],
                                ),
                                repeat_insert_default_res=False,  # avoid repeated re-pricing
                            )
                            if save_res:
                                # todo update the price only once, otherwise
                                # repeated updates corrupt the price
                                # dedupe
                                unique_coupon_id_list.append(unique_id)
                                # propagate the coupon-driven price change to the goods table
                                sql_str = '''
                                select top 1 Price, TaoBaoPrice, SKUInfo 
                                from dbo.GoodsInfoAutoGet 
                                where GoodsID=%s
                                '''
                                db_res = []
                                try:
                                    db_res = list(sql_cli._select_table(
                                        sql_str=sql_str,
                                        params=(goods_id, ),
                                    ))
                                except Exception as e:
                                    print(e)
                                if db_res != []:
                                    # mark the goods price change caused by the coupon
                                    try:
                                        # subtract the coupon value
                                        coupon_value = float(item['coupon_value'])
                                        threshold = float(item['threshold'])
                                        # restore the original (pre-profit) price
                                        db_price = float(db_res[0][0]) * (1 - CP_PROFIT)
                                        db_taobao_price = float(db_res[0][1]) * (1 - CP_PROFIT)
                                        # subtract the coupon, then re-apply CP_PROFIT
                                        # to obtain the final price to store
                                        new_price = ((db_price - coupon_value if db_price >= threshold else db_price) * (1 + CP_PROFIT)).__round__(2)
                                        new_taobao_price = ((db_taobao_price - coupon_value if db_taobao_price >= threshold else db_taobao_price) * (1 + CP_PROFIT)).__round__(2)
                                        new_sku_info = get_new_sku_info_from_old_sku_info_subtract_coupon_and_add_cp_profit(
                                            old_sku_info=json_2_dict(
                                                json_str=db_res[0][2],
                                                default_res=[],
                                            ),
                                            threshold=threshold,
                                            coupon_value=coupon_value,
                                        )
                                        sql_str2 = '''
                                        update dbo.GoodsInfoAutoGet 
                                        set Price=%s, TaoBaoPrice=%s, SKUInfo=%s, ModfiyTime=%s, sku_info_trans_time=%s, IsPriceChange=1, PriceChangeInfo=SKUInfo 
                                        where GoodsID=%s
                                        '''
                                        now_time = get_shanghai_time()
                                        sql_cli._update_table(
                                            sql_str=sql_str2,
                                            params=(
                                                Decimal(new_price).__round__(2),
                                                Decimal(new_taobao_price).__round__(2),
                                                dumps(new_sku_info, ensure_ascii=False),
                                                now_time,
                                                now_time,
                                                goods_id,
                                            ),
                                        )
                                    except Exception as e:
                                        print(e)
                                else:
                                    pass
                            else:
                                continue
                        else:
                            continue
                else:
                    continue
        except IndexError:
            # skip the index error raised by identical api payloads
            continue
        except Exception as e:
            print(e)
        finally:
            try:
                del sql_cli
            except:
                pass