def run_forever():
    """Endless refresh loop for JD goods rows already stored in the database.

    Each pass:
      1. pulls every (site_id, goods_id, is_delete, shelf/down-time json) row
         via ``select_jd_all_goods_id_url()``,
      2. re-crawls each goods with ``JdParse`` and updates the row,
      3. sleeps (long after midnight Shanghai time, short otherwise) and repeats.

    Relies on module-level names: SqlServerMyPageInfoSaveItemPipeline, JdParse,
    get_shanghai_time, json, gc, sleep.  Never returns.
    """
    while True:
        #### Real-time data refresh
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_jd_all_goods_id_url())
        except TypeError as e:
            # select returned a non-iterable (presumably the DB connection
            # failed / is under maintenance — TODO confirm what the pipeline
            # returns on failure).
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            # Nothing to update this round; fall through to the sleep below.
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # Re-crawl one goods.  `data` is declared per-iteration and the
                # parser is re-created so memory can be reclaimed each loop
                # (original author's note: declare locally then release to
                # reduce memory footprint).
                data = {}
                jd = JdParse()
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a long-lived connection
                    # going silently dead.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[1], index))
                    # tmp_item = [site-type-code, goods_id]; DB site ids 7/8 map
                    # to internal type 0, 9 -> 1, 10 -> 2 (meaning of the codes
                    # is defined elsewhere — TODO confirm).
                    tmp_item = []
                    if item[0] == 7 or item[0] == 8:
                        tmp_item.append(0)
                    elif item[0] == 9:
                        tmp_item.append(1)
                    elif item[0] == 10:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    jd.get_goods_data(goods_id=tmp_item)
                    data = jd.deal_with_data(goods_id=tmp_item)
                    if data != {}:
                        data['goods_id'] = item[1]
                        # Maintain the last shelf/off-shelf timestamps.
                        # Author's rule: is_delete 0->1 sets down_time,
                        # is_delete 1->0 sets shelf_time.
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[2]:
                            # State changed since last crawl (item[2] is the
                            # stored is_delete, data['is_delete'] the fresh one).
                            # NOTE(review): with that reading, the condition
                            # below (new==0, old==1, i.e. 1->0 / back on shelf)
                            # sets down_time — the opposite of the rule stated
                            # above.  Possible inverted branch; confirm the
                            # column order of `item` before changing anything.
                            if data['is_delete'] == 0 and item[2] == 1:
                                my_shelf_and_down_time['down_time'] = str(
                                    get_shanghai_time())
                            else:
                                my_shelf_and_down_time['shelf_time'] = str(
                                    get_shanghai_time())
                        else:
                            # State unchanged: if the stored json is missing or
                            # still the initial placeholder (the 35-char default
                            # string), stamp the current state's time; otherwise
                            # keep the stored timestamps untouched.
                            if item[3] is None or item[
                                3] == '{"shelf_time": "", "down_time": ""}' or len(
                                    item[3]) == 35:
                                if data['is_delete'] == 0:
                                    # Currently on shelf.
                                    my_shelf_and_down_time['shelf_time'] = str(
                                        get_shanghai_time())
                                else:
                                    # Currently off shelf.
                                    my_shelf_and_down_time['down_time'] = str(
                                        get_shanghai_time())
                            else:
                                # Preserve the previously stored value.
                                tmp_shelf_and_down_time = item[3]
                                my_shelf_and_down_time = json.loads(
                                    tmp_shelf_and_down_time)  # stored as json text
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        # Parser returned no data for this goods; skip it.
                        pass
                else:
                    # Connection flag says the DB is down; nothing we can do
                    # for this item.
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
        # After midnight (Shanghai) pause for 5.5 h; otherwise poll again soon.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def get_all_user_and_their_recommend_goods_list(self):
    """Crawl JD "优选达人" (featured-author) feed pages 1..99.

    For every author found: store the recommendation article (title, images,
    body html, timestamps) via the pipeline, then crawl and store each goods
    the article recommends, skipping share_ids / goods_ids already present in
    the database.

    Uses self.driver (a PhantomJS/Selenium WebDriver — presumably; confirmed
    only by the method names used) plus self.from_ip_pool_set_proxy_ip_to_phantomjs
    and self.get_div_body.  Returns [] early on parse failures; otherwise
    returns None after the page loop completes.
    """
    for index in range(1, 100):
        # Author-recommendation listing endpoint (ajax, jsonp-wrapped).
        tmp_url = 'https://wq.jd.com/shopgroup_feed/GetDarenFeeds?pageno={}&pagesize=5&perDarenFeedNum=3&g_tk=1975813451'.format(str(index))
        self.from_ip_pool_set_proxy_ip_to_phantomjs()
        self.driver.set_page_load_timeout(15)  # 15 s cap to avoid hanging loads
        try:
            self.driver.get(tmp_url)
            self.driver.implicitly_wait(15)
        except Exception as e:
            # Load timed out: stop the in-flight load via JS and keep going
            # with whatever page_source we have.
            print('-->>time out after 15 seconds when loading page')
            self.driver.execute_script('window.stop()')
        body = self.driver.page_source
        # Strip whitespace noise so the jsonp regex below can match.
        body = re.compile(r'\n').sub('', body)
        body = re.compile(r'\t').sub('', body)
        body = re.compile(r' ').sub('', body)
        # Unwrap the jsonp: square({...})
        body = re.compile(r'square\((.*)\)').findall(body)
        if body != []:
            body = body[0]
            try:
                data = json.loads(body)
            except:  # NOTE(review): bare except hides the real json error
                print('json.loads转换body得到data时出错!')
                return []
            if data.get('user_list') is None:
                # No more data: server returns square({"errmsg":"","iRet":0,"totalnum":347})
                print('body中获取的user_list为None!')
                pass
            else:
                user_list = data.get('user_list', [])
                for item in user_list:
                    # Author nickname
                    nick_name = item.get('nickname', '')
                    # Author avatar; normalize to a protocol-qualified url.
                    head_url = item.get('headurl', '')
                    head_url = re.compile(r'http:').sub('', head_url)
                    if re.compile(r'^http').findall(head_url) != []:
                        pass
                    else:
                        head_url = 'http:' + head_url
                    # Author bio / signature
                    profile = item.get('profile', '')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    # Existing goods ids (column 1) and share ids (column 0)
                    # used below for de-duplication.  NOTE(review): re-queried
                    # for every author — could be hoisted, left as-is.
                    db_goods_id = [j[1] for j in list(my_pipeline.select_jd_all_goods_id_url())]
                    db_share_id = [j[0] for j in list(my_pipeline.select_jd_youxuan_daren_recommend_all_share_id())]
                    jd = JdParse()
                    # Goods recommendations published by this author.
                    feed_list = item.get('feed_list', [])
                    for feed_list_item in feed_list:
                        if feed_list_item.get('shareid', '') in db_share_id:
                            # Article already stored; skip.
                            print('该share_id({})已存在于数据库中, 此处跳过!'.format(feed_list_item.get('shareid', '')))
                            pass
                        else:
                            share_id = feed_list_item.get('shareid', '')
                            article_url = 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id
                            print('------>>>| 正在抓取的jd优选达人推荐文章的地址为: ', 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id)
                            # Article images: comma-separated url fragments;
                            # bare names get the evalpic prefix, //-urls get http:.
                            tmp_share_img_url_list = []
                            for item2 in feed_list_item.get('sharepicurl', '').split(','):
                                if re.compile(r'^//').findall(item2) == []:
                                    tmp_share_img_url = 'https://img14.360buyimg.com/evalpic/s800x800_' + item2
                                else:
                                    tmp_share_img_url = 'http:' + item2
                                tmp_share_img_url_list.append(tmp_share_img_url)
                            share_img_url_list = [{'img_url': item5} for item5 in tmp_share_img_url_list]
                            # Build the <div> of author photos for the article body.
                            tmp_img_div_desc = ''
                            for item4 in tmp_share_img_url_list:
                                tmp_img_div = r'<img src="{}" style="height:auto;width:100%;"/>'.format(item4)
                                tmp_img_div_desc += tmp_img_div
                            my_img_div = '<div>' + tmp_img_div_desc + '</div>'
                            # Fetch article detail (jsonp again) for goods_id / first_text.
                            share_url = 'https://wq.jd.com/shopgroup_feed/FeedDetail?shareid=' + feed_list_item.get('shareid', '') + '&g_tk=1975813451'
                            try:
                                self.from_ip_pool_set_proxy_ip_to_phantomjs()
                                self.driver.get(share_url)
                                self.driver.implicitly_wait(15)
                            except Exception as e:
                                # Same timeout handling as the listing page.
                                print('-->>time out after 15 seconds when loading page')
                                self.driver.execute_script('window.stop()')
                            feed_detail_body = self.driver.page_source
                            feed_detail_body = re.compile(r'\n').sub('', feed_detail_body)
                            feed_detail_body = re.compile(r'\t').sub('', feed_detail_body)
                            feed_detail_body = re.compile(r' ').sub('', feed_detail_body)
                            feed_data = re.compile(r'square\((.*)\)').findall(feed_detail_body)
                            if feed_data != []:
                                feed_data = feed_data[0]
                                try:
                                    feed_data = json.loads(feed_data)
                                except:  # NOTE(review): bare except, see above
                                    print('json.loads转换feed_data失败,此处跳过!')
                                    # Original comment claimed this break falls
                                    # through to the outer else — it does not:
                                    # it only exits the feed_list loop and moves
                                    # to the next author.
                                    break
                                # Article title (promo tags stripped).
                                title = feed_data.get('feeddata', {}).get('title', '')
                                title = re.compile(r'12.12').sub('', title)
                                # Author comment text, cleaned of '&', newlines
                                # and promo tags.
                                tmp_comment_content = feed_data.get('feeddata', {}).get('commentcontent', '')
                                tmp_comment_content = re.compile(r'&').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'\n').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'12.12').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'11.11').sub('', tmp_comment_content)
                                comment_content = tmp_comment_content
                                if title == '':
                                    # Empty title: promote the comment to title
                                    # and clear the comment.
                                    title = comment_content
                                    comment_content = ''
                                # First paragraph of the article.
                                first_text = feed_data.get('feeddata', {}).get('firsttext', '')
                                first_text = re.compile(r'12.12').sub('', first_text)
                                first_text = re.compile(r'11.11').sub('', first_text)
                                sku_id = feed_data.get('feeddata', {}).get('skuid')
                                if sku_id == '0':
                                    # '0' means no sku attached.
                                    sku_id = ''
                                share_id = feed_list_item.get('shareid', '')
                                tmp_div_body_dict = self.get_div_body(share_id=share_id)
                                if tmp_div_body_dict['sku_info'] == [] and sku_id != '':
                                    # Body carried no goods ids: fall back to the
                                    # article-level sku_id.
                                    goods_id_list = [{'goods_id': sku_id}]
                                else:
                                    # All goods ids recommended in this article.
                                    goods_id_list = [{'goods_id': item6} for item6 in tmp_div_body_dict['sku_info']]
                                # Assemble the stored article html.
                                tmp_div_body = '<div>' + '<h3>{}</h3>'.format(title) + '<p>{}</p>'.format(comment_content) + my_img_div + tmp_div_body_dict['div_body']
                            else:
                                print('获取feed_data失败!')
                                return []
                            # Post-processing: backfill an empty comment with
                            # the first paragraph.
                            if comment_content == '':
                                comment_content = first_text
                            # Timestamp in Asia/Shanghai, truncated to seconds
                            # and stripped of tzinfo (round-trip through str).
                            tz = pytz.timezone('Asia/Shanghai')
                            now_time = datetime.datetime.now(tz)
                            now_time = re.compile(r'\..*').sub('', str(now_time))
                            now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
                            create_time = now_time
                            result = {
                                'nick_name': nick_name,                    # author nickname
                                'head_url': head_url,                      # author avatar url
                                'profile': profile,                        # author bio
                                'share_id': share_id,                      # article share id
                                'article_url': article_url,                # original article url
                                'title': title,                            # article title
                                'comment_content': comment_content,        # author comment
                                'share_img_url_list': share_img_url_list,  # author photo urls
                                'goods_id_list':goods_id_list,             # recommended goods ids
                                'div_body': tmp_div_body,                  # article body html
                                'create_time': create_time,                # crawl timestamp
                            }
                            print(result)
                            my_pipeline.insert_into_jd_youxuan_daren_recommend_table(item=result)
                            print('准备开始抓取该文章中的所有推荐商品'.center(30, '-'))
                            # Crawl each recommended goods not yet in the DB.
                            for i in goods_id_list:
                                if i.get('goods_id', '') in db_goods_id:
                                    print('该goods_id({})已经存在于数据库中, 此处跳过!'.format(i.get('goods_id', '')))
                                    pass
                                else:
                                    tmp_goods_id_url = 'https://item.jd.com/' + i.get('goods_id', '') + '.html'
                                    goods_id = jd.get_goods_id_from_url(jd_url=tmp_goods_id_url)
                                    jd.get_goods_data(goods_id=goods_id)
                                    tmp_jd_data = jd.deal_with_data(goods_id=goods_id)
                                    tmp_jd_data['spider_url'] = tmp_goods_id_url
                                    tmp_jd_data['username'] = '******'
                                    tmp_jd_data['goods_id'] = goods_id[1]
                                    jd.insert_into_jd_table(data=tmp_jd_data, pipeline=my_pipeline)
                            print('该文章内推荐的商品全部抓取完毕'.center(30, '-'))
        else:
            # jsonp wrapper not found in the page source.
            print('body为空list!')