def run_forever():
    while True:
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=z8_delete_str_1)
            result = list(tmp_sql_server._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            for item in result:  # 实时更新数据
                # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:  # 每50次重连一次, 避免单次长连无响应报错
                    print('正在重置, 并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if index % 300 == 0:  # 每更新300个, 休眠3分钟
                    sleep_time = 3 * 60
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if tmp_sql_server.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=item[0])
                    # 不用这个了, 因为会影响到正常情况的商品
                    try:
                        # 单独处理商品页面不存在的情况
                        if isinstance(tmp_tmp, str) and re.compile(r'^ze').findall(tmp_tmp) != []:
                            print('@@ 该商品的页面已经不存在! 此处将其删除!')
                            tmp_sql_server._delete_table(sql_str=z8_delete_str_2, params=(item[0],))
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                        else:
                            pass
                    except:
                        pass

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        if item[1] == 1:
                            tmp_sql_server._delete_table(sql_str=z8_delete_str_2, params=(item[0],))
                            print('该goods_id[{0}]已过期, 删除成功!'.format(item[0]))
                        else:
                            print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                            zhe_800_pintuan.to_right_and_update_data(data=data, pipeline=tmp_sql_server)
                    else:  # 表示返回的data值为空值
                        pass
                else:  # 表示数据库连接失败
                    print('数据库连接失败, 数据库可能关闭或者维护中')
                    pass

                index += 1
                try:
                    del zhe_800_pintuan
                except:
                    pass
                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)

            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
class Z8Updater(AsyncCrawler):
    """折800常规商品实时更新"""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/折800/实时更新/')
        self.sql_cli = None
        self.goods_index = 1
        # 并发量
        self.concurrency = 10

    async def _get_db_old_data(self):
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=z8_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_ali_obj(self, index) -> None:
        if index % 10 == 0:
            try:
                del self.zhe_800
            except:
                pass
            collect()
            self.zhe_800 = Zhe800Parse()

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        更新一个goods的信息
        :param db_goods_info_obj:
        :param index:
        :return: ['goods_id', bool:'成功与否']
        '''
        res = False
        await self._get_new_ali_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli, index=index, logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(
                db_goods_info_obj.goods_id, index))
            self.zhe_800.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            data = self.zhe_800.deal_with_data()
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='z8',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                res = self.zhe_800.to_right_and_update_data(data=data, pipeline=self.sql_cli)
            else:  # 表示返回的data值为空值
                pass
        else:  # 表示数据库连接失败
            self.lg.error('数据库连接失败, 数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(2.)

        return [db_goods_info_obj.goods_id, res]

    async def _update_db(self):
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.zhe_800 = Zhe800Parse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # 全部提取完毕, 正常退出
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = Z8DbGoodsInfoObj(item=item, logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(db_goods_info_obj.goods_id))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(
                            db_goods_info_obj=db_goods_info_obj,
                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                    try:
                        del tasks
                    except:
                        pass

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:  # 0点以后不更新
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)

            try:
                del self.zhe_800
            except:
                pass
            collect()

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
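# 以下为用法示意(假设的入口写法, 非本文件原有代码): 常驻运行折800实时更新协程.
# 假设点: AsyncCrawler 提供 self.loop(上文 _update_db 已通过 self.loop.create_task 使用), 且本模块可作脚本直接运行.
# def main():
#     updater = Z8Updater()
#     updater.loop.run_until_complete(updater._update_db())
#
# if __name__ == '__main__':
#     main()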
def run_forever():
    while True:
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=vip_select_str_1))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            continue

        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        index = 1
        for item in result:  # 实时更新数据
            # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
            vip = VipParse()
            if index % 50 == 0:  # 每50次重连一次, 避免单次长连无响应报错
                print('正在重置, 并与数据库建立新连接中...')
                # try:
                #     del tmp_sql_server
                # except:
                #     pass
                # gc.collect()
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                vip.get_goods_data(goods_id=[0, item[0]])
                data = vip.deal_with_data()
                if data != {}:
                    data['goods_id'] = item[0]
                    data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[1],
                        shelf_time=item[4],
                        delete_time=item[5])
                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[2],
                        old_taobao_price=item[3],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price'])

                    try:
                        old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[6]), site_id=25)
                    except AttributeError:  # 处理已被格式化过的
                        old_sku_info = item[6]
                    data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                        old_sku_info=old_sku_info,
                        new_sku_info=format_price_info_list(data['price_info_list'], site_id=25),
                        is_price_change=item[7] if item[7] is not None else 0)

                    vip.to_right_and_update_data(data=data, pipeline=tmp_sql_server)
                else:  # 表示返回的data值为空值
                    pass
            else:  # 表示数据库连接失败
                print('数据库连接失败, 数据库可能关闭或者维护中')
                pass
            index += 1
            # try:
            #     del vip
            # except:
            #     pass
            gc.collect()
            sleep(VIP_SLEEP_TIME)

        print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(30)
        # del vip
        gc.collect()
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
            kaola = KaoLaParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    try:
                        del kaola
                    except:
                        pass
                    kaola = KaoLaParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # 每10次重连一次, 避免单次长连无响应报错
                    my_lg.info('正在重置, 并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    data = kaola._get_goods_data(goods_id=item[1])
                    if data.get('is_delete') == 1:  # 单独处理下架商品
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        kaola.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        gc.collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        if data.get('is_delete') == 1:
                            my_lg.info('@@@ 该商品已下架...')
                            tmp_sql_server._update_table_2(sql_str=kl_update_str_2, params=(item[1],), logger=my_lg)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[7]), site_id=29)
                        except AttributeError:  # 处理已被格式化过的
                            old_sku_info = item[7]
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['price_info_list'], site_id=29),
                            is_price_change=item[8] if item[8] is not None else 0)

                        kaola.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:  # 表示数据库连接失败
                    my_lg.error('数据库连接失败, 数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
def deal_with_data(self, *params):
    '''
    处理并存储相关秒杀商品数据
    :param params: 相关参数
    :return:
    '''
    item_list = params[0]
    chuchujie = ChuChuJie_9_9_Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        sql_str = r'select goods_id, miaosha_time, gender, page, goods_url from dbo.chuchujie_xianshimiaosha where site_id=24'
        db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=sql_str))]
        # print(db_goods_id_list)
        # my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        # index = 1
        for item in item_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                goods_id = item.get('goods_id', '')
                tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(goods_id)
                chuchujie.get_goods_data(goods_id=goods_id)
                goods_data = chuchujie.deal_with_data()

                if goods_data == {}:  # 返回的data为空则跳过
                    pass
                elif goods_data.get('is_delete', 0) == 1:  # is_delete=1(即库存为0)则跳过
                    print('------>>>| 该商品库存为0, 已被抢光!')
                    pass
                else:  # 否则就解析并且插入
                    my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
                    # 获取剩余时间
                    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                        url=tmp_url,
                        css_selector='p#activityTime span')
                    # print(tmp_body)
                    try:
                        del my_phantomjs
                    except:
                        pass
                    gc.collect()

                    if tmp_body == '':  # 获取手机版的页面完整html失败
                        sleep(.4)
                        pass
                    else:
                        # p#activityTime span
                        _t = Selector(text=tmp_body).css('p#activityTime span::text').extract_first()
                        _t = re.compile(r'剩余').sub('', _t)
                        # print(_t)
                        if _t == '' or _t is None:
                            print('获取到的_t为空值, 严重错误! 请检查!')
                            continue  # 避免用空值解析秒杀结束时间

                        miaosha_end_time = self.get_miaosha_end_time(_t)

                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time': timestamp_to_regulartime(int(time.time())),
                            'miaosha_end_time': timestamp_to_regulartime(int(miaosha_end_time)),
                        }
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data['miaosha_time'])
                        goods_data['gender'] = str(item.get('gender', '0'))
                        goods_data['page'] = item.get('page')

                        # pprint(goods_data)
                        # print(goods_data)
                        chuchujie.insert_into_chuchujie_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        # sleep(CHUCHUJIE_SLEEP_TIME)  # 放慢速度: 由于初始化用了phantomjs时间久, 于是就不睡眠
            # index += 1
    else:
        print('数据库连接失败, 此处跳过!')
        pass

    try:
        del chuchujie
    except:
        pass
    gc.collect()
def deal_with_data(self, *params):
    '''
    处理并存储相关拼团商品的数据
    :param params: 待传参数
    :return:
    '''
    goods_list = params[0]

    mogujie = MoGuJieParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=mg_select_str_1))]
        print(db_goods_id_list)

        for item in goods_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                goods_id = str(item.get('goods_id', ''))
                tmp_url = 'https://shop.mogujie.com/detail/' + str(goods_id)
                mogujie.get_goods_data(goods_id=str(goods_id))
                goods_data = mogujie.deal_with_data()

                if goods_data == {}:  # 返回的data为空则跳过
                    pass
                else:  # 否则就解析并且插入
                    # 规范化
                    goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(goods_data['price_info_list'])

                    goods_data['goods_url'] = tmp_url
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['pintuan_time'] = item.get('pintuan_time', {})
                    goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item.get('pintuan_time', {}))
                    goods_data['all_sell_count'] = item.get('all_sell_count', '')
                    goods_data['fcid'] = str(item.get('fcid'))
                    goods_data['page'] = str(item.get('page'))
                    goods_data['sort'] = str(item.get('sort', ''))

                    # pprint(goods_data)
                    # print(goods_data)
                    _r = mogujie.insert_into_mogujie_pintuan_table(data=goods_data, pipeline=my_pipeline)
                    if _r:  # 更新
                        db_goods_id_list.append(goods_id)
                        db_goods_id_list = list(set(db_goods_id_list))

                sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度
    else:
        print('数据库连接失败, 此处跳过!')
        pass

    try:
        del mogujie
    except:
        pass
    gc.collect()
def run_forever():
    while True:
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, schedule, is_delete from dbo.juanpi_pintuan where site_id=18'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
            juanpi_pintuan = JuanPiParse()
            for item in result:  # 实时更新数据
                data = {}
                if index % 6 == 0:
                    try:
                        del juanpi_pintuan
                    except:
                        pass
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:  # 每50次重连一次, 避免单次长连无响应报错
                    print('正在重置, 并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    pintuan_end_time = json.loads(item[1])[0].get('end_time')
                    pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                    # print(pintuan_end_time)
                    if item[2] == 1 or pintuan_end_time < int(time.time()):
                        sql_str = r'delete from dbo.juanpi_pintuan where goods_id=%s'
                        tmp_sql_server._delete_table(sql_str=sql_str, params=(item[0],))
                        print('该goods_id[{0}]已过期或者售完, 删除成功!'.format(item[0]))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()
                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(data=data, pipeline=tmp_sql_server)
                        else:  # 表示返回的data值为空值
                            pass
                else:  # 表示数据库连接失败
                    print('数据库连接失败, 数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del juanpi_pintuan
                # except:
                #     pass
                gc.collect()
                sleep(1.2)

            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
class ZWMSpider(AsyncCrawler):
    def __init__(self):
        AsyncCrawler.__init__(
            self,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/zwm/_/',)
        self.init_zwm_pwd()
        self.concurrency = 10
        self.num_retries = 6
        self.max_transaction_details_page_num = 20           # 交易截止抓取页
        self.max_business_settlement_records_page_num = 20   # 商户结算记录截止抓取页
        self.max_business_manage_page_num = 80                # 商户及门店管理截止抓取页(单数据也超过此数量就得进行修改)
        self.login_cookies_dict = {}
        self.sleep_time = 5

    def init_zwm_pwd(self):
        ori_data = ''
        with open(ZWM_PWD_PATH, 'r') as f:
            for line in f:
                ori_data += line.replace('\n', '').replace(' ', '')
        data = json_2_dict(
            json_str=ori_data,
            logger=self.lg,
            default_res={},)
        self.zwm_username, self.zwm_pwd = data['username'], data['pwd']
        assert self.zwm_username != '' and self.zwm_pwd != ''

    async def _fck_run(self) -> None:
        while True:
            try:
                login_res = await self._login()
                assert login_res is True, '登录失败, 退出后续同步操作!'

                # 获取所有交易明细(自己有接口, 不需要了)
                # all_transaction_details = await self._get_all_transaction_details()
                # pprint(all_transaction_details)
                # self.lg.info('len_all_transaction_details: {}'.format(len(all_transaction_details)))
                # await self._wash_and_save_all_transaction_details(target_list=all_transaction_details)

                # 获取所有商户结算记录
                self.lg.info('获取所有商户结算记录...')
                all_business_settlement_records = await self._get_all_business_settlement_records_by_something()
                # pprint(all_business_settlement_records)
                self.lg.info('len_now_business_settlement_records: {}'.format(len(all_business_settlement_records)))
                await self._wash_save_all_business_settlement_records(target_list=all_business_settlement_records)
                self.lg.info('\n')

                # 获取所有商户及门店管理记录
                self.lg.info('获取所有商户及门店管理记录 ...')
                all_business_manage_records = await self._get_all_business_manage_records_by_something()
                # pprint(all_business_manage_records)
                self.lg.info('len_all_business_manage_records: {}'.format(len(all_business_manage_records)))
                await self._wash_save_all_business_manage_records(target_list=all_business_manage_records)
                self.lg.info('\n')
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            self.lg.info('## 同步完成 ##')
            self.lg.info('休眠 {} minutes ...'.format(self.sleep_time))
            # 定时
            await async_sleep(60 * self.sleep_time)

    async def _login(self) -> bool:
        """
        登录
        :return:
        """
        headers = await self._get_random_pc_headers()
        headers.update({
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/loginNew.jsp',
        })
        file_load = {
            'loginName': self.zwm_username,
            'userPassword': self.zwm_pwd,
        }
        m = MultipartEncoder(fields=file_load)
        # self.lg.info(m)
        headers.update({'Content-Type': m.content_type})
        login_url = 'https://agent.yrmpay.com/JHAdminConsole/foreigncard/permissionsLogin.do'

        with session() as _session:
            try:
                response = _session.post(
                    url=login_url,
                    headers=headers,
                    data=m,
                    proxies=self._get_proxies(),)
                login_res = json_2_dict(
                    json_str=response.text,
                    default_res={},
                    logger=self.lg,).get('message', '')
                assert login_res == '登录成功', '登录失败!'
                self.lg.info(login_res)
                self.login_cookies_dict = response.cookies.get_dict()
                assert self.login_cookies_dict != {}, 'self.login_cookies_dict 不为空dict!'
                # pprint(self.login_cookies_dict)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                return False

        return True

    async def _wash_save_all_business_manage_records(self, target_list: list):
        """
        清洗并存储所有未存储的 or 更新所有已存储的business manage records
        :param target_list:
        :return:
        """
        all_res = []
        for item in target_list:
            try:
                now_time = get_shanghai_time()
                create_time, modify_time, approval_status_change_time = now_time, now_time, now_time
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                shop_type = item['merType']
                is_high_quality_shop = item['isHighQualityMer']
                if is_high_quality_shop == '否':
                    is_high_quality_shop = 0
                elif is_high_quality_shop == '是':
                    is_high_quality_shop = 1
                else:
                    raise ValueError('is_high_quality_shop value: {} 异常!'.format(is_high_quality_shop))
                shop_id = item.get('jhmid', '')
                assert shop_id != ''
                shop_chat_name = item.get('merchantName', '')
                assert shop_chat_name != ''
                phone_num = item.get('phone', '')
                assert phone_num != ''
                shop_chant_num = int(item['merchantNum'])
                sale = item['sale']
                is_real_time = 0 if item['isRealTime'] == '未开通' else 1
                approve_date = date_parse(item['approveDate'])
                rate = Decimal(item['rate']).__round__(4)
                account_type = item['accType']
                apply_time = date_parse(item['applyTime'])
                # 可为空值
                process_context = item.get('processContext', '')
                is_non_contact = 0 if item['isNonContact'] == '未开通' else 1
                approval_status = item['approvalStatus']
                if approval_status == '待审核':
                    approval_status = 1
                elif approval_status == '审核通过':
                    approval_status = 0
                elif approval_status == '退回':
                    approval_status = 2
                else:
                    raise ValueError('approval_status value: {} 异常'.format(approval_status))
                # 用其原值为定值不变, 且唯一
                unique_id = item['id']
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            zwm_item = ZWMBusinessManageRecordItem()
            zwm_item['unique_id'] = unique_id
            zwm_item['create_time'] = create_time
            zwm_item['modify_time'] = modify_time
            zwm_item['agent_name'] = agent_name
            zwm_item['top_agent_name'] = top_agent_name
            zwm_item['shop_type'] = shop_type
            zwm_item['is_high_quality_shop'] = is_high_quality_shop
            zwm_item['shop_id'] = shop_id
            zwm_item['shop_chat_name'] = shop_chat_name
            zwm_item['phone_num'] = phone_num
            zwm_item['shop_chant_num'] = shop_chant_num
            zwm_item['sale'] = sale
            zwm_item['is_real_time'] = is_real_time
            zwm_item['approve_date'] = approve_date
            zwm_item['rate'] = rate
            zwm_item['account_type'] = account_type
            zwm_item['apply_time'] = apply_time
            zwm_item['process_context'] = process_context
            zwm_item['is_non_contact'] = is_non_contact
            zwm_item['approval_status'] = approval_status
            zwm_item['approval_status_change_time'] = approval_status_change_time
            all_res.append(dict(zwm_item))

            # 查看
            # if shop_id == 'YRMPAY100038574':
            # if phone_num == '18192242001':
            # if shop_chat_name == '哇哇叫':
            #     pprint(dict(zwm_item))

        # pprint(all_res)
        await self._insert_or_update_shop_manage_records_table(all_res=all_res)
        try:
            del all_res
        except:
            pass

        return None

    async def _insert_or_update_shop_manage_records_table(self, all_res: list):
        """
        插入or update原数据
        :param all_res:
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_2,
                params=None,
                logger=self.lg,)
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        new_add_count = 0
        for item in all_res:
            unique_id = item['unique_id']
            if unique_id not in db_unique_id_list:
                # 插入
                self.lg.info('inserting unique_id: {} ...'.format(unique_id))
                params = await self._get_insert_item_params2(item=item)
                try:
                    res = self.sql_cli._insert_into_table_2(
                        sql_str=zwm_insert_str_2,
                        params=params,
                        logger=self.lg)
                    if res:
                        new_add_count += 1
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue
            else:
                db_old_approval_status, db_old_approval_status_change_time = await self._get_dd_old_approval_status_and_approval_status_change_time(
                    db_data=db_data,
                    unique_id=unique_id,)
                item['approval_status_change_time'] = await self._get_new_approval_status_change_time(
                    db_old_approval_status=db_old_approval_status,
                    db_old_approval_status_change_time=db_old_approval_status_change_time,
                    new_approval_status=item['approval_status'],
                    new_approval_status_change_time=item['approval_status_change_time'])
                # 更新
                self.lg.info('updating unique_id: {} ...'.format(unique_id))
                params = await self._get_update_item_params(item=item)
                try:
                    res = self.sql_cli._update_table_2(
                        sql_str=zwm_update_str_1,
                        params=params,
                        logger=self.lg)
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue

            if not self.sql_cli.is_connect_success:
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            else:
                pass

        try:
            del db_data
            del db_unique_id_list
        except:
            pass
        self.lg.info('table.zwm_buss_manage_records新增个数: {}'.format(new_add_count))

    async def _get_new_approval_status_change_time(self,
                                                   db_old_approval_status,
                                                   db_old_approval_status_change_time,
                                                   new_approval_status,
                                                   new_approval_status_change_time):
        """
        获取新的approval_status_change_time
        :return:
        """
        if db_old_approval_status_change_time is not None:
            new_approval_status_change_time = db_old_approval_status_change_time \
                if db_old_approval_status == new_approval_status \
                else get_shanghai_time()
        else:
            pass

        return new_approval_status_change_time

    async def _get_dd_old_approval_status_and_approval_status_change_time(self,
                                                                          db_data: list,
                                                                          unique_id: str) -> tuple:
        """
        获取db 原先的approval_status
        :param db_data:
        :param unique_id:
        :return:
        """
        for item in db_data:
            if unique_id == item[0]:
                return item[1], item[2]
            else:
                continue

    async def _get_all_business_manage_records_by_something(self,):
        """
        获取所有商户及门店管理记录
        :return:
        """
        async def get_tasks_params_list(max_business_manage_page_num) -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, max_business_manage_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_manage_page_num=self.max_business_manage_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_manage_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_manage_records_by_something(self,
                                                           page_num: int,
                                                           start_date: str = None,
                                                           end_date: str = None,):
        """
        获取单页商户及门店管理记录
        :param page_num:
        :param start_date: 默认设置前一个月27号, eg: '2019-01-27 00:00'
        :param end_date: eg: '2019-07-20 09:39'
        :return:
        """
        # todo 获取最开始->至今的, 即采集所有,
        #  避免老店铺的审核状态变动, 而后台无法同步状态, 审核时间
        # start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0] + ' 00:00'
        start_date = '2018-01-01 00:00'
        end_date = (str(get_shanghai_time()) if end_date is None else end_date)[0:16]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/page.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'merchantCode': '',
            'accType': '',
            'phone': '',
            'approveDate': '',
            'merchantName': '',
            'processStatus': '',
            'startTime': start_date,
            'endTime': end_date,
            'agentName': '',
            'page': str(page_num),
            'start': str((page_num - 1) * 100),  # 开始位置0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/materialList.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'

        res = json_2_dict(json_str=body, logger=self.lg, default_res={}).get('materialList', [])
        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,))

        return res

    async def _wash_save_all_business_settlement_records(self, target_list):
        """
        清洗并存储 未被存储的所有商户结算记录
        :param target_list:
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_1,
                params=None,
                logger=self.lg,)
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        all_res = []
        for item in target_list:
            try:
                create_time = get_shanghai_time()
                shop_name = item.get('merName', '')
                assert shop_name != ''
                shop_id = item.get('mid', '')
                assert shop_id != ''
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                date_settle_type = item['settleType']
                trans_amount = item.get('transAmt', '')
                assert trans_amount != ''
                trans_amount = Decimal(trans_amount).__round__(2)
                service_charge = Decimal(item['mda']).__round__(2)
                accounting_amount = Decimal(item['mnamt']).__round__(2)
                trans_date = date_parse(item['txnDay'])
                trans_status = item['status']
                if trans_status == '已结算':
                    trans_status = 0
                else:
                    raise ValueError('trans_status: {}, 未知交易状态!'.format(trans_status))
                settle_type = item['type']
                settle_date = date_parse(item['minDay'])
                # 生成唯一标识码
                unique_id = get_uuid3(
                    target_str=shop_id + str(date_settle_type) + str(trans_amount) + \
                               str(service_charge) + str(trans_date) + \
                               str(settle_type) + str(settle_date),)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            if unique_id in db_unique_id_list:
                # self.lg.info('该record[unique_id: {}]已存在!'.format(unique_id))
                continue

            settle_record_item = ZWMBusinessSettlementRecordItem()
            settle_record_item['unique_id'] = unique_id
            settle_record_item['create_time'] = create_time
            settle_record_item['shop_name'] = shop_name
            settle_record_item['shop_id'] = shop_id
            settle_record_item['agent_name'] = agent_name
            settle_record_item['top_agent_name'] = top_agent_name
            settle_record_item['date_settle_type'] = date_settle_type
            settle_record_item['trans_amount'] = trans_amount
            settle_record_item['service_charge'] = service_charge
            settle_record_item['accounting_amount'] = accounting_amount
            settle_record_item['trans_date'] = trans_date
            settle_record_item['trans_status'] = trans_status
            settle_record_item['settle_type'] = settle_type
            settle_record_item['settle_date'] = settle_date
            all_res.append(dict(settle_record_item))

        # pprint(all_res)
        self.lg.info('未存储个数: {}'.format(len(all_res)))
        await self._save_all_business_settlement_records(all_res=all_res)
        try:
            del all_res
        except:
            pass

        return None

    async def _save_all_business_settlement_records(self, all_res) -> None:
        """
        存储新增的商家提现记录
        :param all_res:
        :return:
        """
        new_add_count = 0
        for item in all_res:
            # 处理未存储的新数据
            unique_id = item['unique_id']
            self.lg.info('saving unique_id: {} ...'.format(unique_id))
            params = await self._get_insert_item_params(item=item)
            try:
                res = self.sql_cli._insert_into_table_2(
                    sql_str=zwm_insert_str_1,
                    params=params,
                    logger=self.lg)
                if res:
                    new_add_count += 1
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            if not self.sql_cli.is_connect_success:
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            else:
                pass

        self.lg.info('新增个数: {}'.format(new_add_count))

        return None

    async def _get_insert_item_params(self, item) -> tuple:
        """
        待插入对象
        :param item:
        :return:
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['shop_name'],
            item['shop_id'],
            item['agent_name'],
            item['top_agent_name'],
            item['date_settle_type'],
            item['trans_amount'],
            item['service_charge'],
            item['accounting_amount'],
            item['trans_date'],
            item['trans_status'],
            item['settle_type'],
            item['settle_date'],
        ])

    async def _get_insert_item_params2(self, item) -> tuple:
        """
        待插入对象, zwm_buss_manage_records table
        :param item:
        :return:
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
        ])

    async def _get_update_item_params(self, item: dict) -> tuple:
        """
        更新对象, zwm_buss_manage_records table
        :param item:
        :return:
        """
        return tuple([
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
            item['unique_id'],
        ])

    async def _wash_and_save_all_transaction_details(self, target_list: list):
        """
        清洗并存储所有交易明细
        :param target_list:
        :return:
        """
        pass

    async def _get_all_business_settlement_records_by_something(self):
        """
        获取所有商户结算记录
        :return:
        """
        async def get_tasks_params_list(max_business_settlement_records_page_num) -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, max_business_settlement_records_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_settlement_records_page_num=self.max_business_settlement_records_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_settlement_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_settlement_records_by_something(self,
                                                               page_num: int,
                                                               start_date: str = None,
                                                               end_date: str = None,
                                                               mid: str = '',
                                                               agent_name: str = '') -> list:
        """
        得到单页商户结算记录
        :param page_num:
        :param start_date: 默认设置前一个月27号, eg: '2019-07-01'
        :param end_date: eg: '2019-07-16'
        :param mid: 商户编号
        :param agent_name: 顶级机构名称
        :return:
        """
        start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0]
        # start_date = '2018-01-01'
        end_date = (str(get_shanghai_time()) if end_date is None else end_date).split(' ')[0]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))

        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merSettle/querySettleJsp.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'mid': mid,
            'agentName': agent_name,
            'loginAgentId': self.zwm_username[0:8],  # 前8位
            'page': str(page_num),
            'start': str((page_num - 1) * 100),      # 开始位置, 0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merSettle/queryMerSettleList.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        # self.lg.info(body)
        assert body != '', 'body不为空值!'

        res = json_2_dict(json_str=body, logger=self.lg, default_res={}).get('data', [])
        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,))

        return res

    async def _get_all_transaction_details(self) -> list:
        """
        获取所有交易流水
        :return:
        """
        async def _get_tasks_params_list() -> list:
            """获取tasks_params_list"""
            tasks_params_list = []
            for page_num in range(1, self.max_transaction_details_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        tasks_params_list = await _get_tasks_params_list()
        tasks_params_list_obj = TasksParamsListObj(
            tasks_params_list=tasks_params_list,
            step=self.concurrency,)
        all_res = []
        while True:
            try:
                slice_params_list = tasks_params_list_obj.__next__()
            except AssertionError:
                break

            tasks = []
            for k in slice_params_list:
                page_num = k['page_num']
                self.lg.info('create task[where page_num: {}]...'.format(page_num))
                func_args = [
                    page_num,
                ]
                tasks.append(self.loop.create_task(unblock_func(
                    func_name=self._get_one_page_transaction_details_by_something,
                    func_args=func_args,
                    logger=self.lg,)))

            one_res = await async_wait_tasks_finished(tasks=tasks)
            try:
                del tasks
            except:
                pass
            for i in one_res:
                for j in i:
                    all_res.append(j)

        return all_res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_transaction_details_by_something(self,
                                                       page_num: int,
                                                       start_date: str = None,
                                                       end_date: str = None,
                                                       transaction_status: str = '',
                                                       mer_name: str = '',
                                                       order_no: str = '',
                                                       mid: str = '',
                                                       agent_name: str = '',
                                                       pay_channel: str = '',
                                                       sale_name: str = '',) -> list:
        """
        获取单页交易流水
        :param page_num: 开始页面, eg: 1, 2, 3
        :param start_date: eg: '2019-07-16 00:00'
        :param end_date: eg: '2019-07-16 10:02'
        :param transaction_status: 交易状态 | 选择全部: '' or 交易成功: '1' or 退款成功: '3'
        :param mer_name: 待查询的商户名称
        :param order_no: 订单号
        :param mid: 商户编号
        :param agent_name: 顶级机构名称
        :param pay_channel: 支付渠道 | 请选择: '' or 微信: '50' or 支付宝: '51' or 微信条码: '55' or 支付宝条码: '56' or 微信小程序: '67'
        :param sale_name: 销售名称
        :return:
        """
        res = []
        start_date = self.get_0_00_on_the_day() if start_date is None else start_date
        end_date = str(get_shanghai_time()) if end_date is None else end_date

        headers = self.get_random_pc_headers()
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/transflow.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'type': '2',
            'status': transaction_status,
            'payChannel': pay_channel,
            'orderNo': order_no,
            'merName': mer_name,
            'mid': mid,
            'agentName': agent_name,
            'saleName': sale_name,
            'page': str(page_num),
            'start': str((page_num - 1) * 20),  # 开始位置, 0, 20, 40
            'limit': '20',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/querylimafuTransFlow.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'

        res = json_2_dict(json_str=body, logger=self.lg, default_res={}).get('data', [])
        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,))

        return res

    def get_0_00_on_the_day(self) -> str:
        """
        获取当天的0点
        :return:
        """
        now_time = get_shanghai_time()

        return str(datetime(
            year=now_time.year,
            month=now_time.month,
            day=now_time.day))

    def get_1_on_the_month(self) -> str:
        """
        获取当月的第一天
        :return:
        """
        now_time = get_shanghai_time()
        # 避免月底流水无法获取
        day = 5
        now_month = now_time.month
        if now_month > 1:
            now_month -= 1
        else:
            # now_month为1月份
            now_month = 12

        return str(datetime(
            year=now_time.year,
            month=now_month,
            day=day,))

    def _get_proxies(self) -> dict:
        """
        获取代理
        :return:
        """
        proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type,)
        assert proxies != {}, 'proxies不为空dict!'

        return proxies

    async def _get_random_pc_headers(self) -> dict:
        """
        :return:
        """
        return self.get_random_pc_headers()

    @staticmethod
    def get_random_pc_headers() -> dict:
        headers = get_random_headers(
            upgrade_insecure_requests=False,
            cache_control='',)
        headers.update({
            'Origin': 'https://agent.yrmpay.com',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundarytSJCAoaErjNY4IbM',
            'accept': 'text/plain, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
        })

        return headers

    def __del__(self):
        try:
            del self.lg
            del self.login_cookies_dict
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
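# 以下为用法示意(假设的入口写法, 非本文件原有代码): 常驻运行 zwm 后台结算/商户管理记录同步.
# 假设点: AsyncCrawler 提供 self.loop(上文 _get_all_business_manage_records_by_something 已将其作为 loop 参数使用),
# 且 ZWM_PWD_PATH 对应的账号文件已存在.
# if __name__ == '__main__':
#     zwm = ZWMSpider()
#     zwm.loop.run_until_complete(zwm._fck_run())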
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>0.3
        # and MainGoodsID is not null
        sql_str = '''
        select SiteID, GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet
        where SiteID=30 and GETDATE()-ModfiyTime>0.3 and MainGoodsID is not null
        order by ID asc'''
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # 每10次重连一次, 避免单次长连无响应报错
                    my_lg.info('正在重置, 并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])
                    data = yanxuan._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])

                        if data.get('is_delete') == 1:  # 单独处理下架商品
                            my_lg.info('@@@ 该商品已下架...')
                        else:
                            data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                            # my_lg.info(str(data['_is_price_change']) + ' ' + str(data['_price_change_info']))

                        yanxuan.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:  # 表示数据库连接失败
                    my_lg.error('数据库连接失败, 数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
def run_forever():
    while True:
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>1
        sql_str = '''
        select GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet
        where SiteID=2 and MainGoodsID is not null and GETDATE()-ModfiyTime>1
        order by ID desc
        '''
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            # 释放内存, 在外面声明就会占用很大的, 所以此处优化内存的方法是声明后再删除释放
            ali_1688 = ALi1688LoginAndParse()
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:  # 每50次重连一次, 避免单次长连无响应报错
                    print('正在重置, 并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:  # 单独处理返回tt为4041
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:  # 单独处理【原先插入】就是下架状态的商品
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        print('上架时间:', data['shelf_time'], '下架时间:', data['delete_time'])
                        # print('------>>>| 爬取到的数据为: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(1.5)  # 避免服务器更新太频繁
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        print('上架时间:', data['shelf_time'], '下架时间:', data['delete_time'])

                        '''为了实现这个就必须保证price, taobao_price在第一次抓下来后一直不变, 变动则记录到_price_change_info字段中'''
                        # 业务逻辑:
                        #   公司后台 modify_time > 转换时间, is_price_change=1, 然后对比price_change里面的数据, 要是一样就不提示平台员工改价格
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])

                        # print('------>>>| 爬取到的数据为: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(.3)  # 避免服务器更新太频繁
                    else:  # 表示返回的data值为空值
                        pass
                else:  # 表示数据库连接失败
                    print('数据库连接失败, 数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2.2)

            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
class XiaoHongShuParse(Crawler):
    def __init__(self, logger=None, by_wx=False):
        '''
        :param logger:
        :param by_wx: 抓取wx小程序(弊端: 没有tags值; 优点: 可长期采集, 不容易被封) √ vs 抓取app(弊端: 测试发现就算用高匿proxy每跑20个, 就被封3-5分钟, 效率低)
        '''
        super(XiaoHongShuParse, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=logger,
            log_save_path=MY_SPIDER_LOGS_PATH + '/小红书/_/',)
        self._set_headers()
        self.by_wx = by_wx
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.index = 0
        self.success_insert_db_num = 0
        self.CRAWL_ARTICLE_SLEEP_TIME = 1  # 抓每天文章的sleep_time(wx=1/app=2)
        self.LONG_SLEEP_TIME = 0           # 每抓10条休眠时间
        self.db_share_id = []              # db原先存在的
        self.ip_pool_type = IP_POOL_TYPE

    def _set_headers(self):
        self.headers = {
            'authority': 'www.xiaohongshu.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }

    def _get_xiaohongshu_home_aritles_info(self):
        '''
        小红书主页json模拟获取(模拟app端主页请求)
        :return:
        '''
        headers = {
            'Accept-Encoding': 'br, gzip, deflate',
            'Connection': 'keep-alive',
            # 'device_id': '2AEEF650-2CAE-480F-B30C-CA5CABC26193',
            'Accept': 'application/json',
            'Host': 'www.xiaohongshu.com',
            'User-Agent': 'discover/5.19.1 (iPhone; iOS 11.0; Scale/3.00) Resolution/1242*2208 Version/5.19.1 Build/5191001 Device/(Apple Inc.;iPhone7,1)',
            # 'Authorization': 'session.1210427606534613282',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
            'X-Tingyun-Id': 'LbxHzUNcfig;c=2;r=551911068',
        }
        # 下面参数每个都是必须的, 且不变
        params = (
            ('deviceId', '2AEEF650-2CAE-480F-B30C-CA5CABC26193'),
            ('device_fingerprint', '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('device_fingerprint1', '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('lang', 'zh'),
            ('num', '10'),
            ('oid', 'homefeed_recommend'),
            ('platform', 'iOS'),
            ('sid', 'session.1210427606534613282'),
            ('sign', 'c9a9eadc6c46823ae3075d7b28fe97fa'),
            ('t', '1531010946'),  # 用原来的避免sign错误
            # ('t', int(time.time())),
        )
        url = 'https://www.xiaohongshu.com/api/sns/v6/homefeed'

        body = Requests.get_url_body(
            url=url,
            headers=headers,
            params=params,
            cookies=None,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        # self.lg.info(body)
        if body == '':
            self.lg.error('获取到的body为空值!请检查!')
            return []
        if re.compile(r'<title>403 Forbidden</title>').findall(body) != []:
            self.lg.info('此次抓取被403禁止!')
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            return []

        _ = json_2_dict(body, logger=self.lg).get('data', [])
        # pprint(_)
        if _ == []:
            self.lg.error('获取到的data为空值!请检查!')
            return []

        _ = [{
            'share_link': item.get('share_link', ''),
            'likes': item.get('likes', 0),
        } for item in _]

        return _

    def _deal_with_home_article(self):
        home_articles_link_list = self._get_xiaohongshu_home_aritles_info()
        # pprint(home_articles_link_list)
        self.lg.info(home_articles_link_list)
        # self.lg.info(str(home_articles_link_list) + '\n')
        data = self._deal_with_articles(articles_list=home_articles_link_list)
        # pprint(data)
        self._save_articles(data=data)
        self.lg.info('一次采集完毕, 进入{0}s休眠...'.format(self.LONG_SLEEP_TIME))
        sleep(self.LONG_SLEEP_TIME)  # 设置休眠, 实现周期抓取, 避免频繁抓取被封禁(测试发现抓20个就会封一会)

        return True

    def _deal_with_articles(self, articles_list):
        '''
        处理给与小红书地址(articles_list)
        :param articles_list: 待抓取的文章地址list eg: [{'share_link':'小红书地址', 'likes': 111}, ...]
                              (likes可以为空)
        :return: data a list
        '''
        data = []
        _db = self.my_pipeline._select_table(sql_str='select share_id from dbo.daren_recommend')
        if _db is not None and _db != [] and _db != [()]:
            self.db_share_id = [item[0] for item in _db]
            # self.lg.info(self.db_share_id)

        for item in articles_list:
            self.index += 1
            article_link = item.get('share_link', '')
            article_likes = item.get('likes', 0)
            article_id = re.compile(r'/item/(\w+)').findall(article_link)[0]
            if article_id in self.db_share_id:
                self.lg.info('该{0}已存在于db中...跳过!'.format(article_id))
                continue  # 已存在则跳过

            self.lg.info('[+] {0}'.format(article_link))
            if article_link != '':
                if not self.by_wx:  # 通过pc端
                    params = (
                        ('_at', '499a292d16aed3d80a068fc60e0c1e3ee3410'),
                    )
                    body = Requests.get_url_body(
                        url=article_link,
                        headers=self.headers,
                        params=params,
                        high_conceal=True,
                        ip_pool_type=self.ip_pool_type)
                    # self.lg.info(str(body))
                    try:
                        article_info = re.compile('window.__INITIAL_SSR_STATE__=(.*?)</script>').findall(body)[0]
                        # self.lg.info(str(article_info))
                    except IndexError:
                        self.lg.error('获取article_info时IndexError!请检查!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue

                    article_info = self._wash_article_info(json_2_dict(json_str=article_info, logger=self.lg))
                    # pprint(article_info)
                    article_info = self._parse_page(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                    # pprint(article_info)
                else:  # 通过wx小程序
                    # url = "https://www.xiaohongshu.com/wx_mp_api/sns/v1/note/" + article_id
                    # wx接口改版, 需要一个参数Auth认证, 暂时没处理
                    url = 'https://www.xiaohongshu.com/sapi/wx_mp_api/sns/v1/note/' + article_id
                    params = {
                        "sid": "session.1210427606534613282",  # 对方服务器用来判断登录是否过期(过期则替换这个即可再次采集)
                    }
                    body = Requests.get_url_body(
                        url=url,
                        headers=self.headers,
                        params=params,
                        ip_pool_type=self.ip_pool_type)
                    # self.lg.info(str(body))
                    if body == '':
                        self.lg.error('获取到的article的body为空值!跳过!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue

                    article_info = self._wash_article_info_from_wx(json_2_dict(json_str=body, logger=self.lg))
                    article_info = self._parse_page_from_wx(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                    # pprint(article_info)

                data.append(article_info)
                sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            else:
                pass

        self.lg.info('@@@ 抓取完毕!')
        # pprint(data)

        return data

    def _parse_page(self, **kwargs):
        '''
        解析单个article的info
        :return: a dict
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('NoteView', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())
        error_msg = '出错article_url: {0}'.format(article_link)

        try:
            nick_name = article_info.get('noteInfo', {}).get('user', {}).get('nickname', '')
            assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg

            head_url = article_info.get('noteInfo', {}).get('user', {}).get('image', '')
            assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg

            profile = ''  # 个人简介或者个性签名(留空)
            share_id = article_info.get('noteInfo', {}).get('id', '')
            assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg

            title = article_info.get('noteInfo', {}).get('title', '')  # title默认留空
            comment_content = self.wash_sensitive_info(article_info.get('noteInfo', {}).get('desc', ''))
            assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg

            share_img_url_list = [{  # 如果是视频的话, 则里面第一张图片就是视频第一帧
                'img_url': item.get('original', ''),
                'height': item.get('height'),  # 图片高宽
                'width': item.get('width'),
            } for item in article_info.get('noteInfo', {}).get('images', [])]
            assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg
            div_body = ''  # 默认留空
            gather_url = article_link

            # 原文章原始的创建日期
            tmp_create_time = article_info.get('noteInfo', {}).get('time', '')
            assert tmp_create_time != '', '获取到的create_time为空值!请检查!'
            create_time = string_to_datetime(tmp_create_time + ':00')

            site_id = 3  # 小红书
            goods_url_list = []  # 该文章待抓取的商品地址
            share_goods_base_info = []
            tags = self._get_tags(article_info=article_info)

            # 视频播放地址
            tmp_video_url = article_info.get('noteInfo', {}).get('video', '')
            tmp_video_url = 'https:' + tmp_video_url if tmp_video_url != '' else ''
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)

            likes = article_likes
            collects = article_info.get('noteInfo', {}).get('collects', None)
            assert collects is not None, '获取到的collects为None!请检查!' + error_msg
        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('遇到错误: ', exc_info=True)
            return {}

        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects

        return _

    def _parse_page_from_wx(self, **kwargs):
        '''
        解析wx单个article的info
        :param kwargs:
        :return: a WellRecommendArticle object
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('data', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())
        error_msg = '出错article_url: {0}'.format(article_link)

        try:
            nick_name = article_info.get('user', {}).get('nickname', '')
            assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg

            head_url = article_info.get('user', {}).get('images', '')
            assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg

            profile = ''  # 个人简介或者个性签名(留空)
            share_id = article_info.get('id', '')
            assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg

            title = self.wash_sensitive_info(article_info.get('title', ''))  # title默认留空
            comment_content = self.wash_sensitive_info(article_info.get('desc', ''))
            assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg

            share_img_url_list = [{  # 如果是视频的话, 则里面第一张图片就是视频第一帧
                'img_url': item.get('original', ''),
                'height': item.get('height'),  # 图片高宽
                'width': item.get('width'),
            } for item in article_info.get('images_list', [])]
            assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg

            div_body = ''  # 默认留空
            gather_url = article_link

            # 原文章原始的创建日期
            tmp_create_time = article_info.get('time', '')
            assert tmp_create_time != '', '获取到的create_time为空值!请检查!'
            create_time = string_to_datetime(tmp_create_time + ':00')

            site_id = 3  # 小红书
            goods_url_list = []  # 该文章待抓取的商品地址
            share_goods_base_info = []
            # wx端tags没有返回值
            tags = self._get_tags_from_wx(article_info=article_info)

            # 视频播放地址
            tmp_video_url = article_info.get('video', '')
            tmp_video_url = re.compile('\?.*').sub('', tmp_video_url)
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)

            likes = article_likes
            collects = article_info.get('fav_count', None)
            assert collects is not None, '获取到的collects为None!请检查!' + error_msg
        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('遇到错误:', exc_info=True)
            return {}

        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects

        return _

    def _save_articles(self, data):
        '''
        存储数据
        :param data:
        :return:
        '''
        self.lg.info('即将开始存储该文章...')
        sql_str = 'insert into dbo.daren_recommend(share_id, nick_name, head_url, profile, gather_url, title, comment_content, share_img_url_list, div_body, create_time, site_id, tags, video_url, likes, collects) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        for item in data:
            if self.index % 20 == 0:
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

            if self.my_pipeline.is_connect_success:
                share_id = item.get('share_id', '')
                if share_id == '':
                    continue

                self.lg.info('------>>>| 正在存储share_id: {0}...'.format(share_id))
                try:
                    params = self._get_db_insert_into_params(item=item)
                except Exception:
                    continue
                result = self.my_pipeline._insert_into_table_2(
                    sql_str=sql_str,
                    params=params,
                    logger=self.lg)
                if result:
                    self.success_insert_db_num += 1
            else:
                self.lg.error('db连接失败!存储失败! 出错article地址:{0}'.format(item.get('gather_url', '')))

        self.lg.info('@' * 9 + ' 目前成功存储{0}个!'.format(self.success_insert_db_num))

        return True

    def _get_db_insert_into_params(self, item):
        '''
        得到待存储的数据
        :param item:
        :return:
        '''
        params = [
            item['share_id'],
            item['nick_name'],
            item['head_url'],
            item['profile'],
            item['gather_url'],
            item['title'],
            item['comment_content'],
            dumps(item['share_img_url_list'], ensure_ascii=False),
            # dumps(item['goods_id_list'], ensure_ascii=False),
            # dumps(item['share_goods_base_info'], ensure_ascii=False),
            item['div_body'],
            item['create_time'],
            item['site_id'],
            dumps(item['tags'], ensure_ascii=False),
            item['video_url'],
            item['likes'],
            item['collects'],
        ]

        return tuple(params)

    def _get_tags(self, article_info):
        '''
        获取tags
        :return:
        '''
        tmp_tags = list_duplicate_remove([
            str(item.get('name', ''))
            for item in article_info.get('noteInfo', {}).get('relatedTags', [])
        ])
        # self.lg.info(str(tmp_tags))

        # list先转str, 去掉敏感字眼, 再转list, 并去除''元素, 得到最后list
        tmp_tags = delete_list_null_str(self.wash_sensitive_info('|'.join(tmp_tags)).split('|'))
        tags = [{  # tags可以为空list!
'keyword': item, } for item in tmp_tags ] return tags def _get_tags_from_wx(self, article_info): ''' 从wx获取tags :param article_info: :return: ''' return [] def _wash_article_info(self, _dict): ''' 清洗无用字段 :param _dict: :return: ''' try: _dict['NoteView']['commentInfo'] = {} # 评论信息 _dict['NoteView']['panelData'] = [] # 相关笔记 except: pass return _dict def _wash_article_info_from_wx(self, _dict): ''' 清洗wx无用字段 :param _dict: :return: ''' try: _dict['data']['mini_program_info'] = {} # 推荐首页的缩略信息 _dict['data']['share_info'] = {} # 分享的信息 except: pass return _dict def wash_sensitive_info(self, data): ''' 清洗敏感信息 :param data: :return: ''' replace_str_list = [ ('小红书', '优秀网'), ('xiaohongshu', '优秀网'), ('XIAOHONGSHU', '优秀网'), ('某宝', '优秀网'), ('薯队长', '秀队长'), ('薯宝宝', '秀客'), ('红薯们', '秀客们'), ('小红薯', '小秀客'), ] add_sensitive_str_list = [ '#.*#', '@.*?薯', ] return wash_sensitive_info( data=data, replace_str_list=replace_str_list, add_sensitive_str_list=add_sensitive_str_list, is_default_filter=True, is_lower=False, ) def __del__(self): try: del self.lg del self.my_pipeline except: pass gc.collect()
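# A minimal, self-contained sketch of what the wash_sensitive_info(...) call above
# presumably does with replace_str_list and add_sensitive_str_list: plain string
# replacement for branded terms, then regex removal of the extra sensitive patterns.
# The real project helper also takes is_default_filter / is_lower, which are not
# modelled here; the names below are illustrative only.
import re
from typing import List, Tuple


def wash_text(data: str,
              replace_pairs: List[Tuple[str, str]],
              sensitive_patterns: List[str]) -> str:
    # Swap branded words for neutral ones.
    for old, new in replace_pairs:
        data = data.replace(old, new)
    # Strip anything matching the extra sensitive regexes.
    for pattern in sensitive_patterns:
        data = re.sub(pattern, '', data)
    return data


# e.g. wash_text('小红书 #tag# 好物', [('小红书', '优秀网')], [r'#.*#'])
# -> '优秀网  好物'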
class RecommendGoodOps(AsyncCrawler): """荐好ops""" def __init__(self): AsyncCrawler.__init__( self, log_print=True, is_new_loop=False, log_save_path=MY_SPIDER_LOGS_PATH + '/荐好/ops/', ip_pool_type=IP_POOL_TYPE, ) self.request_num_retries = 6 self.article_type = 'zq' self.yx_username = input('请输入yx_username:'******'请输入yx_password:'******'yx_username: {}, yx_password: {}'.format( self.yx_username, self.yx_password)) # 不支持https了, 原先支持 self.publish_url = 'http://configadmin.yiuxiu.com/Business/Index' self.select_sql0 = 'SELECT unique_id FROM dbo.recommend_good_ops_article_id_duplicate_removal' self.insert_sql0 = 'INSERT INTO dbo.recommend_good_ops_article_id_duplicate_removal(unique_id, create_time) values(%s, %s)' self.min_article_id = 0 self.max_article_id = 0 self.driver_headless = True # 必须使用代理, yx限制ip频繁 self.driver_use_proxy = True # 荐好管理label self.recommend_good_label_css_selector = 'span.nav-label' # 设置开眼的min_article_id, max_article_id self.ky_min_article_id, self.ky_max_article_id = 4000, 60000 # article_id 截取数 self.zq_intercept_num = 2 self.hk_intercept_num = 1 self.lfd_intercept_num = 1 self.gxg_intercept_num = 1 self.pp_intercept_num = 2 self.kr_intercept_num = 1 self.dfsp_intercept_num = 1 self.jrxsp_intercept_num = 1 self.ky_intercept_num = 1 # 增加全屏视频数 self.lsp_intercept_num = 2 self.mp_intercept_num = 1 self.klm_intercept_num = 2 self.article_parser = None # 暂存好看视频list的dict self.hk_cache_dict = {} self.lfd_cache_dict = {} self.gxg_cache_dict = {} self.pp_cache_dict = {} self.kr_cache_dict = {} self.dfsp_cache_dict = {} self.lsp_cache_dict = {} self.mp_cache_dict = {} self.klm_cache_dict = {} self.jrxsp_cache_dict = {} async def _fck_run(self): # 休眠7.5分钟, 避免频繁发!(5分钟还是太快, 删不过来)(增加较多视频, 失败率较高故还是5分钟) # sleep_time = 0. sleep_time = 60 * 5. self.db_article_id_list = await self.get_db_unique_id_list() assert self.db_article_id_list != [] self.lg.info('db_article_id_list_len: {}'.format( len(self.db_article_id_list))) _timeout = await self.get_auto_publish_articles_timeout() while True: if get_shanghai_time().hour == 0: # 夜晚休眠 await async_sleep(60 * 60 * 4.) 
else: pass try: try: await async_wait_for( self.auto_publish_articles(), timeout=_timeout, ) except AsyncTimeoutError: raise PublishOneArticleFailException except ( ArticleTitleOverLongException, LoginFailException, ArticleTitleContainSensitiveWordsException, PublishOneArticleFailException, EnterTargetPageFailException, ): self.lg.error('遇到错误:', exc_info=True) continue except Exception: self.lg.error('遇到错误:', exc_info=True) self.lg.info('休眠{}s...'.format(sleep_time)) await async_sleep(sleep_time) async def get_auto_publish_articles_timeout(self): """ 获取自动发布文章的超时时长 :return: """ all_intercept_num = self.zq_intercept_num \ + self.hk_intercept_num \ + self.lfd_intercept_num \ + self.gxg_intercept_num \ + self.pp_intercept_num \ + self.kr_intercept_num \ + self.dfsp_intercept_num \ + self.lsp_intercept_num \ + self.mp_intercept_num \ + self.klm_intercept_num \ + self.jrxsp_intercept_num \ + self.ky_intercept_num _timeout = all_intercept_num * 2.5 * 60 return _timeout async def get_db_unique_id_list(self) -> list: """ 获取db的unique_id_list :return: """ self.sql_cli = SqlServerMyPageInfoSaveItemPipeline() if not self.sql_cli.is_connect_success: raise SqlServerConnectionException res = [] try: res = self.sql_cli._select_table( sql_str=self.select_sql0, logger=self.lg, ) except Exception: self.lg.error('遇到错误:', exc_info=True) res = [] if res is None else res return [item[0] for item in res] async def auto_publish_articles(self): """ 自动发布文章 :return: """ self.sql_cli = get_new_sql_cli(sql_cli=self.sql_cli) if not self.sql_cli.is_connect_success: raise SqlServerConnectionException else: pass if self.min_article_id == 0\ or self.max_article_id == 0: self.article_parser = ArticleParser(logger=self.lg) article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type=self.article_type, )) assert article_list != [] self.min_article_id, self.max_article_id = self.get_latest_max_and_min_artcile_id_from_article_list( article_list=article_list, ) self.lg.info('最新的min_article_id: {}, max_article_id: {}'.format( self.min_article_id, self.max_article_id, )) else: pass # todo shadow模式下老是登录失败, 建议关闭shadow, 不使用其 or lantern or 使用shadow其全局代理模式即可 # 创建目标集合 # zq_article_list = [] # hk_article_list = [] # lfd_article_list = [] # gxg_article_list = [] # pp_article_list = [] # kr_article_list = [] # dfsp_article_list = [] # lsp_article_list = [] # mp_article_list = [] # klm_article_list = [] # jrxsp_article_list = [] zq_article_list = self.get_zq_own_create_article_id_list( min_article_id=self.min_article_id, max_article_id=self.max_article_id, ) hk_article_list = self.get_hk_article_id_list() lfd_article_list = self.get_lfd_article_id_list() gxg_article_list = self.get_gxg_article_id_list() pp_article_list = self.get_pp_article_id_list() kr_article_list = self.get_kr_article_id_list() dfsp_article_list = self.get_dfsp_article_id_list() lsp_article_list = self.get_lsp_article_id_list() mp_article_list = self.get_mp_article_id_list() klm_article_list = self.get_klm_article_id_list() jrxsp_article_list = self.get_jrxsp_article_id_list() ky_article_list = self.get_ky_own_create_article_id_list() # 测试用 # article_id = '17300123' # article_list = [{ # 'uid': get_uuid3(target_str='{}::{}'.format('zq', article_id)), # 'article_type': 'zq', # 'article_id': article_id, # 'title': '未知', # 'article_url': 'https://focus.youth.cn/mobile/detail/id/{}#'.format(article_id), # }] # 文章在前的发布顺序, 视频在后(避免视频发过多) article_list = zq_article_list \ + pp_article_list \ + kr_article_list \ + dfsp_article_list 
\ + hk_article_list \ + klm_article_list \ + jrxsp_article_list \ + mp_article_list \ + lsp_article_list \ + ky_article_list \ + lfd_article_list \ + gxg_article_list assert article_list != [] # pprint(article_list) target_article_list = self.get_target_article_list( article_list=article_list) if target_article_list == []: self.lg.info('待发布的target_article_list为空list, pass!') return driver = None try: try: # rasp上代理模式启动chromedriver具有一定的失败率, 故还是mac driver = BaseDriver( type=CHROME, executable_path=CHROME_DRIVER_PATH, # 本地老是出错 # type=FIREFOX, # executable_path=FIREFOX_DRIVER_PATH, load_images=True, logger=self.lg, headless=self.driver_headless, driver_use_proxy=self.driver_use_proxy, ip_pool_type=self.ip_pool_type, ) self.login_bg(driver=driver) self.get_into_recommend_good_manage(driver=driver) except (FZTimeoutError, WebDriverException): raise LoginFailException for item in target_article_list: uid = item.get('uid', '') title = item.get('title', '') article_url = item.get('article_url', '') self.lg.info('正在发布文章 title: {}, article_url: {} ...'.format( title, article_url)) try: self.publish_one_article( driver=driver, article_url=article_url, ) except FZTimeoutError: raise PublishOneArticleFailException # 新增, 以及插入db self.db_article_id_list.append(uid) self.sql_cli._insert_into_table_2( sql_str=self.insert_sql0, params=( uid, get_shanghai_time(), ), logger=self.lg, ) except ( ArticleTitleOverLongException, LoginFailException, ArticleTitleContainSensitiveWordsException, PublishOneArticleFailException, EnterTargetPageFailException, ) as e: # 抛出异常 raise e except Exception: self.lg.error('遇到错误:', exc_info=True) finally: try: # ** 注意: 不可直接del driver, 测试发现浏览器未被正确关闭, 还存在!! # del driver # 关闭浏览器 driver.driver.quit() # 只关闭当前窗口, 不关闭浏览器 # driver.driver.close() self.lg.info('driver 释放成功!') except: try: driver.driver.quit() except: pass collect() return def get_ky_own_create_article_id_list(self): """ 获取ky article_list :return: """ article_id_list = [ str(article_id) for article_id in range(self.ky_min_article_id, self.ky_max_article_id) ] # 截取 article_id_list = random_sample(article_id_list, self.ky_intercept_num) res = [{ 'uid': get_uuid3(target_str='{}::{}'.format('ky', article_id)), 'article_type': 'ky', 'title': '未知', 'article_id': article_id, 'article_url': 'https://www.kaiyanapp.com/detail.html?vid={}'.format(article_id), } for article_id in article_id_list] return res def get_jrxsp_article_id_list(self): """ 获取目标article_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.jrxsp_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='jrxsp', )) self.jrxsp_cache_dict['data'] = article_list self.jrxsp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.jrxsp_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 40 * 60: # klm 每日更新数量有限, 每过40分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='jrxsp', )) self.jrxsp_cache_dict['data'] = article_list self.jrxsp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.jrxsp_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.jrxsp_intercept_num) return article_list def get_klm_article_id_list(self): """ 获取目标article_id_list :return: """ if not 
isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.klm_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='klm', )) self.klm_cache_dict['data'] = article_list self.klm_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.klm_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # klm 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='klm', )) self.klm_cache_dict['data'] = article_list self.klm_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.klm_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.klm_intercept_num) return article_list def get_mp_article_id_list(self): """ 获取mp 目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.mp_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='mp', )) self.mp_cache_dict['data'] = article_list self.mp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.mp_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # mp 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='mp', )) self.mp_cache_dict['data'] = article_list self.mp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.mp_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.mp_intercept_num) return article_list def get_lsp_article_id_list(self): """ 获取lsp 目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.lsp_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='lsp', )) self.lsp_cache_dict['data'] = article_list self.lsp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.lsp_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # dfsp 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='lsp', )) self.lsp_cache_dict['data'] = article_list self.lsp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.lsp_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.lsp_intercept_num) return article_list def get_dfsp_article_id_list(self): """ 获取dfsp 目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.dfsp_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='dfsp', )) self.dfsp_cache_dict['data'] = article_list self.dfsp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = 
self.dfsp_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # dfsp 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='dfsp', )) self.dfsp_cache_dict['data'] = article_list self.dfsp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.dfsp_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.dfsp_intercept_num) return article_list def get_kr_article_id_list(self): """ 获取kr 目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.kr_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='kr', )) self.kr_cache_dict['data'] = article_list self.kr_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.kr_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # pp 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='kr', )) self.kr_cache_dict['data'] = article_list self.kr_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.kr_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.kr_intercept_num) return article_list def get_pp_article_id_list(self): """ 获取pp目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.pp_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='pp', )) self.pp_cache_dict['data'] = article_list self.pp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.pp_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # pp 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='pp', )) self.pp_cache_dict['data'] = article_list self.pp_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.pp_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.pp_intercept_num) return article_list def get_gxg_article_id_list(self): """ 获取gxg目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.gxg_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='gxg', )) self.gxg_cache_dict['data'] = article_list self.gxg_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.gxg_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # gxg 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='gxg', )) self.gxg_cache_dict['data'] = article_list self.gxg_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.gxg_cache_dict['data'] if 
article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.gxg_intercept_num) return article_list def get_lfd_article_id_list(self): """ 获取lfd目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.lfd_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='lfd', )) self.lfd_cache_dict['data'] = article_list self.lfd_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.lfd_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 30 * 60: # lfd 每日更新数量有限, 每过30分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='lfd', )) self.lfd_cache_dict['data'] = article_list self.lfd_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.lfd_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.lfd_intercept_num) return article_list def get_hk_article_id_list(self): """ 获取hk 目标article_id_list :return: """ if not isinstance(self.article_parser, ArticleParser): self.article_parser = ArticleParser(logger=self.lg) else: pass if self.hk_cache_dict == {}: # 首次启动 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='hk', )) self.hk_cache_dict['data'] = article_list self.hk_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: cache_time = self.hk_cache_dict['cache_time'] if datetime_to_timestamp( get_shanghai_time()) - cache_time > 12 * 60: # 每过12分钟重新获取一次 article_list = self.loop.run_until_complete( self.article_parser.get_article_list_by_article_type( article_type='hk', )) self.hk_cache_dict['data'] = article_list self.hk_cache_dict['cache_time'] = datetime_to_timestamp( get_shanghai_time()) else: article_list = self.hk_cache_dict['data'] if article_list != []: # 截取1个(与图文穿插) article_list = random_sample(article_list, self.hk_intercept_num) return article_list def get_latest_max_and_min_artcile_id_from_article_list( self, article_list) -> tuple: """ 获取最新范围的article_id最大, 最小的article_id(目的动态的自己创建值) :return: (int, int) """ latest_article_id_list = [] for item in article_list: # eg: zq是'17296475' article_id = item.get('article_id', '') if len(article_id) >= 8: latest_article_id_list.append(int(article_id)) else: continue assert latest_article_id_list != [] latest_article_id_list = sorted(latest_article_id_list) # pprint(latest_article_id_list) return (latest_article_id_list[0], latest_article_id_list[-1]) def get_zq_own_create_article_id_list(self, min_article_id: int, max_article_id: int): """ 自己create的article_id_list :return: """ # 取中间值, 避免老是在发老新闻 middle_article_id = int((min_article_id + max_article_id) / 2) self.lg.info('middle_article_id: {}'.format(middle_article_id)) article_id_list = [ str(article_id) for article_id in range(middle_article_id, max_article_id) ] # 截取3 article_id_list = random_sample(article_id_list, self.zq_intercept_num) res = [{ 'uid': get_uuid3(target_str='{}::{}'.format('zq', article_id)), 'article_type': 'zq', 'title': '未知', 'article_id': article_id, 'article_url': 'https://focus.youth.cn/mobile/detail/id/{}#'.format(article_id), } for article_id in article_id_list] new_res = res # 本地不检测了 # article_parser = ArticleParser(logger=self.lg) # # article_list = 
self.loop.run_until_complete(article_parser.get_article_list_by_article_type( # # article_type=self.article_type,)) # new_res = [] # for item in res: # article_url = item.get('article_url', '') # try: # self.lg.info('本地检测url: {}'.format(article_url)) # _ = self.loop.run_until_complete(article_parser._parse_article( # article_url=article_url,)) # title = _.get('title', '') # assert title != '' # # 标题必须小于等于30 # assert len(title) <= 30 # except Exception: # continue # # item.update({ # 'title': title, # }) # new_res.append(item) return new_res def get_target_article_list(self, article_list: list) -> list: """ 获取未被发布的item :return: """ target_article_list = [] for item in article_list: try: title = item.get('title', '') assert title != '' uid = item.get('uid', '') assert uid != '' article_url = item.get('article_url', '') assert article_url != '' if uid not in self.db_article_id_list: target_article_list.append(item) else: # 已发布的跳过 self.lg.info('该文章之前已被发布![where title: {}, url: {}]'.format( title, article_url)) continue except Exception: self.lg.error('遇到错误:', exc_info=True) continue return target_article_list @fz_set_timeout(seconds=1.5 * 60) def login_bg(self, driver: BaseDriver): """ login :return: """ self.lg.info('login ...') body = driver.get_url_body( url=self.publish_url, timeout=30, ) try: assert body != '' driver.find_element(value='input#loginName').send_keys( self.yx_username) driver.find_element(value='input#loginPwd').send_keys( self.yx_password) driver.find_element(value='button#subbut').click() except ( NoSuchElementException, SeleniumTimeoutException, AssertionError, WebDriverException, AttributeError, ): # 抛出登录异常 raise LoginFailException try: self.wait_for_recommend_good_label_appear(driver=driver) except FZTimeoutError: # 进入目标页失败, 则抛出异常! raise EnterTargetPageFailException @fz_set_timeout(seconds=10.) def wait_for_recommend_good_label_appear(self, driver: BaseDriver): """ 直到出现荐好管理label :param driver: :return: """ while True: recommend_good_label_text = driver.find_element( value=self.recommend_good_label_css_selector).text # self.lg.info('recommend_good_label_text: {}'.format(recommend_good_label_text)) if recommend_good_label_text == '荐好管理': break else: continue self.lg.info('login success!') @fz_set_timeout(seconds=60.) def get_into_recommend_good_manage(self, driver: BaseDriver): """ 进入荐好管理 :param driver: :return: """ try: driver.find_element( value=self.recommend_good_label_css_selector).click() # 等待下方标签出现 sleep(.5) driver.find_element(value='a.J_menuItem').click() except SeleniumTimeoutException: # 进入目标页失败, 则抛出异常! 
raise EnterTargetPageFailException @fz_set_timeout(seconds=2.5 * 60) def publish_one_article(self, driver: BaseDriver, article_url: str): """ 发布一篇图文 :param driver: :param article_url: :return: """ try: # 切换到目标iframe(用index有时候不准, pass) # driver.switch_to_frame(frame_reference=1) iframe_ele_list = driver.find_elements(by=By.TAG_NAME, value='iframe') # pprint(iframe_ele_list) assert iframe_ele_list != [] target_iframe_ele = iframe_ele_list[1] if len( iframe_ele_list) > 1 else iframe_ele_list[0] driver.switch_to_frame(frame_reference=target_iframe_ele) except (NoSuchFrameException, ) as e: # 没匹配到frame(可能是原先就在目标iframe, eg: title过长的, 再切回iframe, 但是iframe_ele_list为0) raise e try: # 清空输入框 input_box_ele = driver.find_element(value='input#SnatchUrl') input_box_ele.clear() # 输入待采集地址 input_box_ele.send_keys(article_url) # 点击采集按钮 driver.find_elements( value='span.input-group-btn button')[0].click() self.wait_for_delete_img_appear(driver=driver) except (FZTimeoutError, NoSuchElementException, WebDriverException): # 发布某文章超时失败or无元素存在, 则抛出发布异常 raise PublishOneArticleFailException # 获取输入框的值 title = driver.find_element( value='input#RecommendName').get_attribute('value') self.lg.info('title: {}'.format(title)) if target_str_contain_some_char_check( target_str=title, check_char_obj=ARTICLE_TITLE_SENSITIVE_STR_TUPLE): raise ArticleTitleContainSensitiveWordsException else: pass if isinstance(title, str) and len(title) > 30: # 标题过长则return, 不发布 self.lg.info('@@@ title 标题过长, 无法发布!! 跳过!') # 由于标题过长后, 无法处理后续文章, 故不return, 直接抛出异常 # return raise ArticleTitleOverLongException else: pass try: # 点击发布按钮 driver.find_elements( value='span.input-group-btn button')[1].click() except WebDriverException: # 处理发布单篇异常! # 处理报错: Message: unknown error: Element <iframe class="J_iframe" name="iframe0" raise PublishOneArticleFailException # 切换至主页面 driver.switch_to_default_content() # 填写被发布人 random_phone = self.get_random_phone() driver.find_element( value='input.layui-layer-input').send_keys(random_phone) # 点击确定 driver.find_element(value='a.layui-layer-btn0').click() self.lg.info('url: {} 发布成功!'.format(article_url)) # 发布成功, 等待8.5秒, 等待页面元素置空 sleep(8.5) return @fz_set_timeout(seconds=70.) def wait_for_delete_img_appear(self, driver: BaseDriver): """ 直至出现图片, 超时退出(并且避免发布无图文章) :return: """ while True: # 改用 不宜用下面方式 长期跑电脑卡死 try: delete_btn_text = driver.find_element( value='div.deletebut').text except NoSuchElementException: # 处理这个异常, 并继续等待 sleep(.3) continue # 原先 但是老是发布失败!! # delete_btn_text = driver.find_element(value='div.deletebut').text # self.lg.info('delete_btn_text: {}'.format(delete_btn_text)) if delete_btn_text == '删除': break else: continue self.lg.info('该url采集完毕!') def get_random_phone(self) -> int: """ 随机个手机号 :return: """ phone_list = [] with open('../tools/phone.txt', 'r') as f: for line in f: try: phone_list.append(int(line.replace('\n', ''))) except Exception: continue # pprint(phone_list) random_phone = phone_list[randint(0, len(phone_list) - 1)] self.lg.info('random_phone: {}'.format(random_phone)) return random_phone def __del__(self): try: del self.lg del self.loop del self.db_article_id_list del self.publish_url del self.article_parser del self.hk_cache_dict except: pass collect()
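# The many get_*_article_id_list methods above all repeat the same cache-and-refresh
# pattern: fetch the article list once, keep it in a {'data', 'cache_time'} dict,
# re-fetch when the cache is older than N minutes, then random-sample intercept_num
# items. A condensed sketch of that pattern as one parameterized helper follows;
# time.time() and random.sample stand in for the project's
# datetime_to_timestamp(get_shanghai_time()) and random_sample, so this is an
# illustration of the idea, not the project's actual helper.
import random
import time
from typing import Callable, Dict, List


def get_cached_article_list(cache: Dict,
                            fetch: Callable[[], List[dict]],
                            intercept_num: int,
                            refresh_seconds: float = 30 * 60) -> List[dict]:
    now = time.time()
    if not cache or now - cache.get('cache_time', 0) > refresh_seconds:
        # First use, or the cached list is stale: re-fetch it.
        cache['data'] = fetch()
        cache['cache_time'] = now
    article_list = cache['data']
    if article_list:
        # Sample a small slice so articles and videos stay interleaved.
        article_list = random.sample(article_list,
                                     min(intercept_num, len(article_list)))
    return article_list


# e.g. hk articles, refreshed every 12 minutes:
# hk_article_list = get_cached_article_list(
#     cache=self.hk_cache_dict,
#     fetch=lambda: self.loop.run_until_complete(
#         self.article_parser.get_article_list_by_article_type(article_type='hk')),
#     intercept_num=self.hk_intercept_num,
#     refresh_seconds=12 * 60)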
def run_forever(): #### 实时更新数据 while True: # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() tmp_sql_server = SqlPools() # 使用sqlalchemy管理数据库连接池 tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline() sql_str = 'select GoodsID, IsDelete, MyShelfAndDownTime from dbo.GoodsInfoAutoGet where SiteID=1' sql_str_2 = 'select GoodsOutUrl, goods_id from db_k85u.dbo.goodsinfo where OutGoodsType<=13 and onoffshelf=1 and not exists (select maingoodsid from gather.dbo.GoodsInfoAutoGet c where c.maingoodsid=goodsinfo.goods_id)' try: # result = list(tmp_sql_server.select_taobao_all_goods_id()) result = tmp_sql_server._select_table(sql_str=sql_str, params=None) result_2 = list( tmp_sql_server_2._select_table(sql_str=sql_str_2, params=None)) # print(result_2) except TypeError: print('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(result_2) print('--------------------------------------------------------') print('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) index = 1 new_table_ali_1688_all_goods_id_list = [item[0] for item in result] for item in result_2: # 实时更新数据 data = {} taobao = TaoBaoLoginAndParse() if index % 50 == 0: # 每50次重连一次,避免单次长连无响应报错 print('正在重置,并与数据库建立新连接中...') # try: # del tmp_sql_server # except: # pass # gc.collect() tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline() tmp_sql_server = SqlPools() print('与数据库的新连接成功建立...') if tmp_sql_server.is_connect_success: goods_id = taobao.get_goods_id_from_url(item[0]) if goods_id == '': print('@@@ 原商品的地址为: ', item[0]) continue else: if goods_id in new_table_ali_1688_all_goods_id_list: print('该goods_id已经存在于数据库中, 此处跳过!') continue else: print( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index)) tt = taobao.get_goods_data(goods_id) if tt.get('is_delete') == 1: # 处理已下架的但是还是要插入的 tt['goods_id'] = goods_id tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str( goods_id) tt['username'] = '******' tt['main_goods_id'] = item[1] # print('------>>>| 爬取到的数据为: ', data) taobao.old_taobao_goods_insert_into_new_table( data=tt, pipeline=tmp_sql_server_2) index += 1 gc.collect() sleep(TAOBAO_REAL_TIMES_SLEEP_TIME) continue else: pass data = taobao.deal_with_data(goods_id=goods_id) if data != {}: data['goods_id'] = goods_id data[ 'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str( goods_id) data['username'] = '******' data['main_goods_id'] = item[1] # print('------>>>| 爬取到的数据为: ', data) taobao.old_taobao_goods_insert_into_new_table( data, pipeline=tmp_sql_server_2) else: pass else: # 表示返回的data值为空值 print('数据库连接失败,数据库可能关闭或者维护中') pass index += 1 # try: # del taobao # except: # pass gc.collect() # 国外服务器上可以缩短时间, 可以设置为0s sleep(TAOBAO_REAL_TIMES_SLEEP_TIME) # 不能太频繁,与用户请求错开尽量 print('全部数据更新完毕'.center(100, '#')) # sleep(60*60) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(5) gc.collect()
# coding:utf-8

'''
@author = super_fazai
@File : test_sql_str.py
@Time : 2018/6/14 07:41
@connect : [email protected]
'''

import sys
sys.path.append('..')

from pprint import pprint
from json import dumps
from my_pipeline import SqlServerMyPageInfoSaveItemPipeline

_ = SqlServerMyPageInfoSaveItemPipeline()
sql_str = 'select gather_url, MainID from dbo.daren_recommend where site_id=2 and MainID is not null'
params = None
result = _._select_table(sql_str=sql_str, params=params)
pprint(result)

# update
# sql_str_2 = 'UPDATE dbo.daren_recommend set share_img_url_list=NULL, goods_id_list=NULL, share_goods_base_info=%s where MainID=579;'
# result = _._update_table(sql_str=sql_str_2, params=params)
# print(result)
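# Note on the commented-out update above: the sql uses a %s placeholder, so enabling
# it with params=None would fail; it needs a matching params tuple. A sketch (the
# field value here is just an illustrative empty JSON list):
# new_share_goods_base_info = dumps([], ensure_ascii=False)
# sql_str_2 = ('UPDATE dbo.daren_recommend set share_img_url_list=NULL, '
#              'goods_id_list=NULL, share_goods_base_info=%s where MainID=579;')
# result = _._update_table(sql_str=sql_str_2, params=(new_share_goods_base_info, ))
# print(result)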
class Z8Updater(AsyncCrawler): def __init__(self, *params, **kwargs): AsyncCrawler.__init__( self, *params, **kwargs, log_print=True, log_save_path=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/', ip_pool_type=IP_POOL_TYPE, ) self.tmp_sql_server = None self.goods_index = 1 self.concurrency = 8 # 并发量 self.delete_sql_str = z8_delete_str_3 async def _get_db_old_data(self): self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() result = None try: self.tmp_sql_server._delete_table(sql_str=z8_delete_str_4, params=None) await async_sleep(5) result = list( self.tmp_sql_server._select_table(sql_str=z8_select_str_4)) except TypeError: self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)') await _print_db_old_data(logger=self.lg, result=result) return result async def _get_miaosha_begin_time(self, miaosha_time) -> int: miaosha_begin_time = json_2_dict(miaosha_time).get( 'miaosha_begin_time') miaosha_begin_time = int( str( time.mktime( time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10]) return miaosha_begin_time async def _get_new_z8_obj(self, index): if index % 10 == 0: # 不能共享一个对象了, 否则驱动访问会异常! try: del self.zhe_800_spike except: pass collect() self.zhe_800_spike = Zhe800Spike() async def _update_is_delete(self, goods_id) -> bool: ''' 下架商品逻辑删除 :param goods_id: :return: ''' delete_str = 'update dbo.zhe_800_xianshimiaosha set is_delete=1 where goods_id=%s' res = self.tmp_sql_server._update_table(sql_str=delete_str, params=(goods_id, )) await async_sleep(.3) return res async def _update_one_goods_info(self, item, index) -> tuple: ''' 更新单个 :param item: :param index: :return: ''' res = False goods_id = item[0] miaosha_time = item[1] session_id = item[2] miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time) # self.lg.info(str(miaosha_begin_time)) await self._get_new_z8_obj(index=index) self.tmp_sql_server = await _get_new_db_conn( db_obj=self.tmp_sql_server, index=index, logger=self.lg, remainder=30) if self.tmp_sql_server.is_connect_success: is_recent_time = await self._is_recent_time(miaosha_begin_time) if is_recent_time == 0: res = await self._update_is_delete(goods_id=goods_id) self.lg.info( '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format( goods_id, json.loads(item[1]).get('miaosha_begin_time'))) index += 1 self.goods_index = index res = True await async_sleep(.3) return goods_id, res elif is_recent_time == 2: # 可能包括过期的 self.lg.info('未来时间暂时不更新! {}'.format( timestamp_to_regulartime(miaosha_begin_time))) index += 1 self.goods_index = index return goods_id, res else: # 返回1,表示在待更新区间内 self.lg.info( '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'. format(goods_id, index)) try: tmp_data = self.zhe_800_spike._get_one_session_id_data( base_session_id=str(session_id)) except Exception: self.lg.error(msg='遇到错误:', exc_info=True) index += 1 self.goods_index = index return goods_id, res try: tmp_data = tmp_data.get('data', {}).get('blocks', []) assert tmp_data != [], '该session_id不存在,此处跳过' except AssertionError: # 说明这个sessionid没有数据, 就删除对应这个sessionid的限时秒杀商品 self.lg.error(msg='遇到错误:', exc_info=True) res = await self._update_is_delete(goods_id) self.lg.info( msg= '该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!' 
.format(goods_id, miaosha_begin_time)) index += 1 self.goods_index = index await async_sleep(1.2) return goods_id, res tmp_data = [item_s.get('deal', {}) for item_s in tmp_data] # pprint(tmp_data) try: miaosha_goods_list = await self._get_miaoshao_goods_info_list( data=tmp_data) # pprint(miaosha_goods_list) except ValueError: await async_sleep(2) index += 1 self.goods_index = index return goods_id, res # 该session_id中现有的所有zid的list miaosha_goods_all_goods_id = [ i.get('zid') for i in miaosha_goods_list ] if goods_id not in miaosha_goods_all_goods_id: # 内部已经下架的 res = await self._update_is_delete(goods_id) self.lg.info( '该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format( goods_id)) index += 1 self.goods_index = index return goods_id, res else: # 未下架的 res = await self._one_update( miaosha_goods_list=miaosha_goods_list, goods_id=goods_id) else: # 表示返回的data值为空值 self.lg.error('数据库连接失败,数据库可能关闭或者维护中') index += 1 self.goods_index = index collect() await async_sleep(1.5) return goods_id, res async def _one_update(self, **kwargs) -> bool: ''' 未下架的更新 :return: ''' miaosha_goods_list = kwargs.get('miaosha_goods_list') goods_id = kwargs.get('goods_id') zhe_800_miaosha = Zhe800Parse() res = False for item_1 in miaosha_goods_list: if item_1.get('zid', '') == goods_id: zhe_800_miaosha.get_goods_data(goods_id=goods_id) goods_data = zhe_800_miaosha.deal_with_data() if goods_data == {}: # 返回的data为空则跳过 break else: # 否则就解析并且插入 goods_data['stock_info'] = item_1.get('stock_info') goods_data['goods_id'] = str(item_1.get('zid')) if item_1.get('stock_info').get('activity_stock') > 0: # self.lg.info(item_1.get('price')) # self.lg.info(item_1.get('taobao_price')) goods_data['price'] = item_1.get('price') goods_data['taobao_price'] = item_1.get('taobao_price') else: self.lg.info('该商品参与活动的对应库存为0') await self._update_is_delete(goods_id=goods_id) break goods_data['sub_title'] = item_1.get('sub_title') goods_data['miaosha_time'] = item_1.get('miaosha_time') goods_data['miaosha_begin_time'], goods_data[ 'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time( miaosha_time=item_1.get('miaosha_time')) if goods_data.get('is_delete', 0) == 1: self.lg.info('该商品[{0}]已售罄...'.format(goods_id)) # self.lg.info(str(goods_data['stock_info'])) # self.lg.info(str(goods_data['miaosha_time'])) res = zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table( data=goods_data, pipeline=self.tmp_sql_server) break else: pass collect() return res async def _is_recent_time(self, timestamp) -> int: ''' 判断是否在指定的日期差内 :param timestamp: 时间戳 :return: 0: 已过期恢复原价的 1: 待更新区间内的 2: 未来时间的 ''' time_1 = int(timestamp) time_2 = datetime_to_timestamp(get_shanghai_time()) # 当前的时间戳 diff_time = time_1 - time_2 if diff_time < -259200: # (为了后台能同步下架)所以设置为 72个小时, 只需要更新过去48小时和对与当前时间的未来2小时的商品信息 # if diff_time < -172800: # (原先的时间)48个小时, 只需要跟新过去48小时和对与当前时间的未来2小时的商品信息 return 0 # 已过期恢复原价的 elif diff_time > -172800 and diff_time < 7200: return 1 # 表示是昨天跟今天的也就是待更新的 else: return 2 # 未来时间的暂时不用更新 async def _update_db(self): ''' 秒杀数据实时更新 :return: ''' while True: self.lg = await self._get_new_logger(logger_name=get_uuid1()) result = await self._get_db_old_data() if result is None: pass else: self.goods_index = 1 tasks_params_list = TasksParamsListObj( tasks_params_list=result, step=self.concurrency) self.zhe_800_spike = Zhe800Spike() index = 1 while True: try: slice_params_list = tasks_params_list.__next__() # self.lg.info(str(slice_params_list)) except AssertionError: # 全部提取完毕, 正常退出 break tasks = [] for item in slice_params_list: self.lg.info('创建 task goods_id: 
{}'.format(item[0])) tasks.append( self.loop.create_task( self._update_one_goods_info(item=item, index=index))) index += 1 await _get_async_task_result(tasks=tasks, logger=self.lg) self.lg.info('全部数据更新完毕'.center(100, '#')) if get_shanghai_time().hour == 0: # 0点以后不更新 await async_sleep(60 * 60 * 5.5) else: await async_sleep(2.5 * 60) try: del self.zhe_800_spike except: pass collect() async def _get_miaoshao_goods_info_list(self, data) -> list: ''' 得到秒杀商品有用信息 :param data: 待解析的data :return: 有用信息list ''' miaosha_goods_list = [] # pprint(data) for item in data: if item == {}: continue # pprint(item) tmp = {} tmp['miaosha_time'] = { 'miaosha_begin_time': timestamp_to_regulartime(int( str(item.get('begin_time'))[0:10])), 'miaosha_end_time': timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])), } # 折800商品地址 tmp['zid'] = item.get('zid') # 限时秒杀的库存信息 tmp['stock_info'] = { 'activity_stock': item.get('activity_stock', 0), # activity_stock为限时抢的剩余数量 'stock': item.get('stock', 0), # stock为限时秒杀的总库存 } # 原始价格 tmp['price'] = float(item.get('list_price')) # 秒杀的价格, float类型 tmp['taobao_price'] = float(item.get('price')) tmp['sub_title'] = item.get('description', '') miaosha_goods_list.append(tmp) # pprint(miaosha_goods_list) return miaosha_goods_list def __del__(self): try: del self.lg except: pass try: del self.loop except: pass try: del self.zhe_800_spike except: pass collect()
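# The _is_recent_time thresholds above are easier to read with the hours spelled out.
# A standalone sketch of the same bucketing, with time.time() standing in for
# datetime_to_timestamp(get_shanghai_time()); note that with these thresholds a
# begin time between 48 h and 72 h in the past falls through to bucket 2.
import time


def classify_miaosha_begin_time(begin_ts: int, now_ts: float = None) -> int:
    """0: expired (began more than 72 h ago), 1: in the update window
    (roughly the last 48 h up to 2 h from now), 2: everything else."""
    now_ts = time.time() if now_ts is None else now_ts
    diff = begin_ts - now_ts
    if diff < -72 * 3600:              # began over 72 hours ago -> mark deleted
        return 0
    if -48 * 3600 < diff < 2 * 3600:   # recent past through the next 2 hours -> update
        return 1
    return 2                           # future sessions, not updated yet


# e.g. an event that began 50 hours ago: diff == -180000 seconds, which is neither
# < -259200 nor > -172800, so it is classified 2 and skipped this round.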
def run_forever(self): ''' 实时更新数据 :return: ''' tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() sql_str = r'select goods_id, miaosha_time, fcid, page from dbo.mogujie_pintuan where site_id=23' try: result = list(tmp_sql_server._select_table(sql_str=sql_str)) except TypeError: print('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(result) print('--------------------------------------------------------') print('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) index = 1 self.my_phantomjs = MyPhantomjs() for item in result: # 实时更新数据 pintuan_end_time = json.loads(item[1]).get('end_time') pintuan_end_time = int( str( time.mktime( time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10]) # print(miaosha_end_time) data = {} mogujie_pintuan = MoGuJieParse() if index % 8 == 0: try: del self.my_phantomjs except: pass gc.collect() self.my_phantomjs = MyPhantomjs() if index % 50 == 0: # 每50次重连一次,避免单次长连无响应报错 print('正在重置,并与数据库建立新连接中...') tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() print('与数据库的新连接成功建立...') if tmp_sql_server.is_connect_success: if self.is_recent_time(pintuan_end_time) == 0: tmp_sql_server._delete_table( sql_str=self.delete_sql_str, params=(item[0])) print( '过期的goods_id为(%s)' % item[0], ', 拼团开始时间为(%s), 删除成功!' % json.loads(item[1]).get('begin_time')) elif self.is_recent_time(pintuan_end_time) == 2: # break # 跳出循环 pass # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的 else: # 返回1,表示在待更新区间内 print( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index)) data['goods_id'] = item[0] tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format( item[3], item[2]) # print(tmp_url) # requests请求不到数据,涉及证书认证,直接用phantomjs # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True) body = self.my_phantomjs.use_phantomjs_to_get_url_body( url=tmp_url) # print(body) if body == '': print('获取到的body为空值! 
此处跳过') else: try: body = re.compile( r'<pre.*?>(.*?)</pre>').findall(body)[0] tmp_data = json.loads(body) # pprint(tmp_data) except: print('json.loads转换body时出错, 请检查') tmp_data = {} if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []: print('得到的docs为[]!') print('该商品已被下架限时秒杀活动,此处将其删除') tmp_sql_server._delete_table( sql_str=self.delete_sql_str, params=(item[0])) print('下架的goods_id为(%s)' % item[0], ', 删除成功!') pass else: tmp_item_list = tmp_data.get('result', {}).get( 'wall', {}).get('docs', []) # print(tmp_item_list) # pprint(tmp_item_list) begin_time_timestamp = int( time.time()) # 开始拼团的时间戳 item_list = [{ 'goods_id': item.get('tradeItemId', ''), 'pintuan_time': { 'begin_time': timestamp_to_regulartime( timestamp=begin_time_timestamp), 'end_time': timestamp_to_regulartime( self.get_pintuan_end_time( begin_time_timestamp, item.get('leftTimeOrg', ''))), }, 'all_sell_count': str(item.get('salesVolume', 0)), } for item in tmp_item_list] # print(item_list) pintuan_goods_all_goods_id = [ item_1.get('goods_id', '') for item_1 in item_list ] # print(pintuan_goods_all_goods_id) ''' 内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间) ''' if item[0] not in pintuan_goods_all_goods_id: # print('该商品已被下架限时秒杀活动,此处将其删除') # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0])) # print('下架的goods_id为(%s)' % item[0], ', 删除成功!') # pass mogujie_pintuan.get_goods_data( goods_id=item[0]) goods_data = mogujie_pintuan.deal_with_data( ) if goods_data == {}: pass else: # 规范化 print('+++ 内部下架,其实还在售卖的商品更新') tmp_price_info_list = goods_data[ 'price_info_list'] price_info_list = [{ 'spec_value': item_4.get('spec_value'), 'pintuan_price': item_4.get('detail_price'), 'detail_price': '', 'normal_price': item_4.get('normal_price'), 'img_url': item_4.get('img_url'), 'rest_number': item_4.get('rest_number'), } for item_4 in tmp_price_info_list] goods_data['goods_id'] = item[0] goods_data[ 'price_info_list'] = price_info_list # pprint(goods_data) # print(goods_data) mogujie_pintuan.update_mogujie_pintuan_table_2( data=goods_data, pipeline=tmp_sql_server) sleep(MOGUJIE_SLEEP_TIME) # 放慢速度 else: # 未下架的 for item_2 in item_list: if item_2.get('goods_id', '') == item[0]: mogujie_pintuan.get_goods_data( goods_id=item[0]) goods_data = mogujie_pintuan.deal_with_data( ) if goods_data == {}: pass else: # 规范化 tmp_price_info_list = goods_data[ 'price_info_list'] price_info_list = [ { 'spec_value': item_4.get( 'spec_value'), 'pintuan_price': item_4.get( 'detail_price'), 'detail_price': '', 'normal_price': item_4.get( 'normal_price'), 'img_url': item_4.get('img_url'), 'rest_number': item_4.get( 'rest_number'), } for item_4 in tmp_price_info_list ] goods_data['goods_id'] = item[ 0] goods_data[ 'price_info_list'] = price_info_list goods_data[ 'pintuan_time'] = item_2.get( 'pintuan_time', {}) goods_data['pintuan_begin_time'], goods_data[ 'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time( pintuan_time=goods_data[ 'pintuan_time']) goods_data[ 'all_sell_count'] = item_2.get( 'all_sell_count', '') # pprint(goods_data) # print(goods_data) mogujie_pintuan.update_mogujie_pintuan_table( data=goods_data, pipeline=tmp_sql_server) sleep( MOGUJIE_SLEEP_TIME) # 放慢速度 else: pass else: print('数据库连接失败,此处跳过!') pass index += 1 gc.collect() print('全部数据更新完毕'.center(100, '#')) # sleep(60*60) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(5) gc.collect()
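# Both mogujie update loops above pull the pintuan JSON out of a <pre> tag before
# json.loads; a small reusable sketch of that extraction with the empty-body and
# bad-JSON cases handled (illustrative, not the project's helper):
import json
import re


def extract_pre_json(body: str) -> dict:
    if not body:
        return {}
    match = re.search(r'<pre.*?>(.*?)</pre>', body, flags=re.S)
    if match is None:
        return {}
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return {}


# docs = extract_pre_json(body).get('result', {}).get('wall', {}).get('docs', [])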
def get_spike_hour_goods_info(self): ''' 模拟构造得到data的url,得到近期所有的限时秒杀商品信息 :return: ''' all_miaosha_goods_list = self.get_all_miaosha_goods_list() try: self.driver.quit() except: pass gc.collect() pinduoduo = PinduoduoParse() my_pipeline = SqlServerMyPageInfoSaveItemPipeline() if my_pipeline.is_connect_success: if my_pipeline._select_table(sql_str=pd_select_str_3) is None: db_goods_id_list = [] else: db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=pd_select_str_3))] for item in all_miaosha_goods_list: ''' 注意: 明日8点半抓取到的是页面加载中返回的是空值 ''' if item.get('goods_id') != 'None': # 跳过goods_id为'None' if item.get('goods_id', '') in db_goods_id_list: print('该goods_id已经存在于数据库中, 此处跳过') pass else: tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + item.get('goods_id') pinduoduo.get_goods_data(goods_id=item.get('goods_id')) goods_data = pinduoduo.deal_with_data() # print(goods_data) if goods_data == {}: # 返回的data为空则跳过 print('得到的goods_data为空值,此处先跳过,下次遍历再进行处理') # sleep(3) pass else: # 否则就解析并插入 goods_data['stock_info'] = item.get('stock_info') goods_data['goods_id'] = item.get('goods_id') goods_data['spider_url'] = tmp_url goods_data['username'] = '******' goods_data['price'] = item.get('price') # 秒杀前的原特价 goods_data['taobao_price'] = item.get('taobao_price') # 秒杀价 goods_data['sub_title'] = item.get('sub_title', '') goods_data['miaosha_time'] = item.get('miaosha_time') goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time')) if item.get('stock_info').get('activity_stock') <= 2: # 实时秒杀库存小于等于2时就标记为 已售罄 print('该秒杀商品已售罄...') goods_data['is_delete'] = 1 pinduoduo.insert_into_pinduoduo_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline) sleep(PINDUODUO_SLEEP_TIME) else: print('该goods_id为"None", 此处跳过') pass sleep(5) else: pass try: del pinduoduo except: pass gc.collect()
def run_forever(self): ''' 实时更新数据 :return: ''' sql_cli = SqlServerMyPageInfoSaveItemPipeline() try: sql_cli._delete_table(sql_str=mg_delete_str_2) result = list(sql_cli._select_table(sql_str=mg_select_str_2)) except TypeError: print('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: _block_print_db_old_data(result=result) index = 1 self.my_phantomjs = BaseDriver( executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type) for item in result: # 实时更新数据 goods_id = item[0] pintuan_end_time = json.loads(item[1]).get('end_time') pintuan_end_time = int( str( time.mktime( time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10]) # print(miaosha_end_time) data = {} mogujie_pintuan = MoGuJieParse() if index % 8 == 0: try: del self.my_phantomjs except: pass gc.collect() self.my_phantomjs = BaseDriver( executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type) sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, remainder=50) if sql_cli.is_connect_success: if self.is_recent_time(pintuan_end_time) == 0: _handle_goods_shelves_in_auto_goods_table( goods_id=goods_id, update_sql_str=mg_update_str_5, sql_cli=sql_cli, ) print( '过期的goods_id为(%s)' % goods_id, ', 拼团开始时间为(%s), 逻辑删除成功!' % json.loads(item[1]).get('begin_time')) sleep(.3) elif self.is_recent_time(pintuan_end_time) == 2: # break # 跳出循环 pass # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的 else: # 返回1,表示在待更新区间内 print( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index)) data['goods_id'] = goods_id tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format( item[3], item[2]) # print(tmp_url) # requests请求不到数据,涉及证书认证,直接用phantomjs # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True) body = self.my_phantomjs.get_url_body(url=tmp_url) # print(body) if body == '': print('获取到的body为空值! 
此处跳过') else: try: body = re.compile( r'<pre.*?>(.*?)</pre>').findall(body)[0] tmp_data = json.loads(body) # pprint(tmp_data) except: print('json.loads转换body时出错, 请检查') tmp_data = {} if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []: print('得到的docs为[]!') _handle_goods_shelves_in_auto_goods_table( goods_id=goods_id, update_sql_str=mg_update_str_5, sql_cli=sql_cli, ) sleep(.3) else: tmp_item_list = tmp_data.get('result', {}).get( 'wall', {}).get('docs', []) # pprint(tmp_item_list) begin_time_timestamp = int( time.time()) # 开始拼团的时间戳 item_list = [{ 'goods_id': item.get('tradeItemId', ''), 'pintuan_time': { 'begin_time': timestamp_to_regulartime( timestamp=begin_time_timestamp), 'end_time': timestamp_to_regulartime( self.get_pintuan_end_time( begin_time_timestamp, item.get('leftTimeOrg', ''))), }, 'all_sell_count': str(item.get('salesVolume', 0)), } for item in tmp_item_list] # pprint(item_list) pintuan_goods_all_goods_id = [ item_1.get('goods_id', '') for item_1 in item_list ] # print(pintuan_goods_all_goods_id) ''' 内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间) ''' if goods_id not in pintuan_goods_all_goods_id: mogujie_pintuan.get_goods_data( goods_id=goods_id) goods_data = mogujie_pintuan.deal_with_data( ) if goods_data == {}: pass else: # 规范化 print('+++ 内部下架,其实还在售卖的商品更新') goods_data['goods_id'] = goods_id goods_data[ 'price_info_list'] = _get_mogujie_pintuan_price_info_list( goods_data['price_info_list']) # pprint(goods_data) mogujie_pintuan.update_mogujie_pintuan_table_2( data=goods_data, pipeline=sql_cli) sleep(MOGUJIE_SLEEP_TIME) # 放慢速度 else: # 未下架的 for item_2 in item_list: if item_2.get('goods_id', '') == goods_id: mogujie_pintuan.get_goods_data( goods_id=goods_id) goods_data = mogujie_pintuan.deal_with_data( ) if goods_data == {}: pass else: # 规范化 goods_data[ 'goods_id'] = goods_id goods_data[ 'price_info_list'] = _get_mogujie_pintuan_price_info_list( goods_data[ 'price_info_list']) goods_data[ 'pintuan_time'] = item_2.get( 'pintuan_time', {}) goods_data[ 'pintuan_begin_time'], goods_data[ 'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time( miaosha_time= goods_data[ 'pintuan_time'] ) goods_data[ 'all_sell_count'] = item_2.get( 'all_sell_count', '') # pprint(goods_data) mogujie_pintuan.update_mogujie_pintuan_table( data=goods_data, pipeline=sql_cli) sleep( MOGUJIE_SLEEP_TIME) # 放慢速度 else: pass else: print('数据库连接失败,此处跳过!') pass index += 1 gc.collect() print('全部数据更新完毕'.center(100, '#')) # sleep(60*60) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(10 * 60) gc.collect()
def run_forever(): while True: #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() sql_str = ''' select GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time from dbo.GoodsInfoAutoGet where SiteID=13 and MainGoodsID is not null''' try: result = list(tmp_sql_server._select_table(sql_str=sql_str)) except TypeError: print('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(result) print('--------------------------------------------------------') print('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) index = 1 for item in result: # 实时更新数据 # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放 pinduoduo = PinduoduoParse() if index % 50 == 0: # 每50次重连一次,避免单次长连无响应报错 print('正在重置,并与数据库建立新连接中...') # try: # del tmp_sql_server # except: # pass # gc.collect() tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() print('与数据库的新连接成功建立...') if tmp_sql_server.is_connect_success: print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index)) pinduoduo.get_goods_data(goods_id=item[0]) data = pinduoduo.deal_with_data() if data != {}: data['goods_id'] = item[0] data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time( tmp_data=data, is_delete=item[1], shelf_time=item[4], delete_time=item[5]) data['_is_price_change'], data['_price_change_info'] = _get_price_change_info( old_price=item[2], old_taobao_price=item[3], new_price=data['price'], new_taobao_price=data['taobao_price'] ) # print('------>>>| 爬取到的数据为: ', data) pinduoduo.to_right_and_update_data(data, pipeline=tmp_sql_server) else: # 表示返回的data值为空值 pass else: # 表示返回的data值为空值 print('数据库连接失败,数据库可能关闭或者维护中') pass index += 1 # try: # del pinduoudo # except: # pass gc.collect() # sleep(1) print('全部数据更新完毕'.center(100, '#')) # sleep(60*60) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60*60*5.5) else: sleep(5) # del pinduoduo gc.collect()
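# The pinduoduo loop above records price movement via _get_price_change_info; its
# real return shape is not shown in this file, so the following is only a plausible
# sketch of such a helper (hypothetical field names), returning a (changed, info)
# pair like the tuple unpacking at the call site expects:
def get_price_change_info(old_price, old_taobao_price, new_price, new_taobao_price):
    changed = (old_price != new_price) or (old_taobao_price != new_taobao_price)
    info = {}
    if changed:
        info = {
            'old_price': old_price,
            'new_price': new_price,
            'old_taobao_price': old_taobao_price,
            'new_taobao_price': new_taobao_price,
        }
    return changed, info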
def deal_with_data(self, *param): ''' 处理并存储相关秒杀商品的数据 :param param: 相关参数 :return: ''' print(60 * '*') event_time = param[0] item_list = param[1] print('秒杀开始时间:', timestamp_to_regulartime(event_time), '\t', '对应时间戳为: ', event_time) print(60 * '*') mogujie = MoGuJieMiaoShaParse() my_pipeline = SqlServerMyPageInfoSaveItemPipeline() if my_pipeline.is_connect_success: _ = list(my_pipeline._select_table(sql_str=mg_select_str_4)) db_goods_id_list = [item[0] for item in _] for item in item_list: goods_id = str(item.get('iid', '')) if goods_id in db_goods_id_list: print('该goods_id已经存在于数据库中, 此处跳过') pass else: tmp_url = item.get('link', '') # print(tmp_url) try: object_id = re.compile('objectId=(\w+)').findall( tmp_url)[0] except IndexError: # 表示匹配到的地址不是秒杀商品的地址 print('+++++++ 这个url不是秒杀的url: ', tmp_url) continue tmp_url = 'https://shop.mogujie.com/rushdetail/{0}?objectId={1}&type=rush'.format( goods_id, object_id) tmp_ = mogujie.get_goods_id_from_url(tmp_url) mogujie.get_goods_data(goods_id=tmp_) goods_data = mogujie.deal_with_data() if goods_data == {}: # 返回的data为空则跳过 pass else: # 否则就解析并且插入 goods_data['goods_url'] = tmp_url goods_data['goods_id'] = str(goods_id) # price设置为原价 try: tmp_price_list = sorted([ round(float(item_4.get('normal_price', '')), 2) for item_4 in goods_data['price_info_list'] ]) price = Decimal(tmp_price_list[-1]).__round__( 2) # 商品原价 goods_data['price'] = price except: print('设置price为原价时出错!请检查') sleep(MOGUJIE_SLEEP_TIME) # 放慢速度 continue goods_data['miaosha_time'] = { 'miaosha_begin_time': timestamp_to_regulartime( int(item.get('startTime', 0))), 'miaosha_end_time': timestamp_to_regulartime( int(item.get('endTime', 0))), } goods_data['miaosha_begin_time'], goods_data[ 'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time( miaosha_time=goods_data['miaosha_time']) goods_data['event_time'] = str(event_time) # pprint(goods_data) # print(goods_data) res = mogujie.insert_into_mogujie_xianshimiaosha_table( data=goods_data, pipeline=my_pipeline) if res: if goods_id not in db_goods_id_list: db_goods_id_list.append(goods_id) sleep(MOGUJIE_SLEEP_TIME) # 放慢速度 else: print('数据库连接失败,此处跳过!') pass try: del mogujie except: pass gc.collect()
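# Above, price is set to the highest normal_price across all specs (sort the rounded
# floats, take the last one, round again via Decimal). The same intent written more
# directly, assuming a non-empty price_info_list (sketch only):
from decimal import Decimal


def pick_original_price(price_info_list: list) -> Decimal:
    # Use the largest normal_price among the specs as the goods' original price.
    highest = max(float(spec.get('normal_price', 0)) for spec in price_info_list)
    return Decimal(str(highest)).quantize(Decimal('0.01'))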
def get_spike_hour_goods_info(self): ''' 模拟构造得到data的url,得到近期所有的限时秒杀商品信息 :return: ''' base_session_id = BASE_SESSION_ID while base_session_id < MAX_SESSION_ID: print('待抓取的session_id为: ', base_session_id) data = self._get_one_session_id_data(base_session_id=base_session_id) sleep(.3) if data.get('data', {}).get('blocks', []) == []: # session_id不存在 pass else: # 否则session_id存在 try: _ = str(data.get('data', {}).get('blocks', [])[0].get('deal', {}).get('begin_time', ''))[:10] if _ != '': pass elif data.get('data', {}).get('blocks', [])[0].get('showcase', {}) != {}: # 未来时间 print('*** 未来时间 ***') # pprint(data.get('data', {})) _ = str(data.get('data', {}).get('blocks', [])[1].get('deal', {}).get('begin_time', ''))[:10] else: raise Exception begin_times_timestamp = int(_) # 将如 "2017-09-28 10:00:00"的时间字符串转化为时间戳,然后再将时间戳取整 except Exception as e: print('遇到严重错误: ', e) base_session_id += 2 continue print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp)) if self.is_recent_time(timestamp=begin_times_timestamp): # 说明秒杀日期合法 try: data = [item_s.get('deal', {}) for item_s in data.get('data', {}).get('blocks', [])] except Exception as e: print('遇到严重错误: ', e) base_session_id += 2 continue # pprint(data) if data != []: # 否则说明里面有数据 miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data) # pprint(miaosha_goods_list) zhe_800 = Zhe800Parse() my_pipeline = SqlServerMyPageInfoSaveItemPipeline() if my_pipeline.is_connect_success: db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=z8_select_str_5))] for item in miaosha_goods_list: if item.get('zid', '') in db_goods_id_list: print('该goods_id已经存在于数据库中, 此处跳过') pass else: tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', '')) goods_id = zhe_800.get_goods_id_from_url(tmp_url) zhe_800.get_goods_data(goods_id=goods_id) goods_data = zhe_800.deal_with_data() if goods_data == {}: # 返回的data为空则跳过 pass else: # 否则就解析并且插入 goods_data['stock_info'] = item.get('stock_info') goods_data['goods_id'] = str(item.get('zid')) goods_data['spider_url'] = tmp_url goods_data['username'] = '******' goods_data['price'] = item.get('price') goods_data['taobao_price'] = item.get('taobao_price') goods_data['sub_title'] = item.get('sub_title') # goods_data['is_baoyou'] = item.get('is_baoyou') goods_data['miaosha_time'] = item.get('miaosha_time') goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time')) goods_data['session_id'] = str(base_session_id) # print(goods_data['miaosha_time']) # print(goods_data) zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline) sleep(ZHE_800_SPIKE_SLEEP_TIME) # 放慢速度 sleep(1) else: pass try: del zhe_800 except: pass gc.collect() else: # 说明这个sessionid没有数据 print('该sessionid没有相关key为jsons的数据') # return {} pass else: pass base_session_id += 2
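# The begin_time extraction above tries blocks[0]['deal'] first and falls back to
# blocks[1] when blocks[0] is only a 'showcase' entry. A compact sketch that
# generalizes the same idea to "first block that actually carries a begin_time",
# assuming begin_time is an epoch value whose first 10 digits are the seconds
# timestamp, as the [:10] truncation above implies:
def extract_begin_timestamp(data: dict) -> int:
    for block in data.get('data', {}).get('blocks', []):
        begin_time = str(block.get('deal', {}).get('begin_time', ''))[:10]
        if begin_time:
            return int(begin_time)    # 10-digit unix timestamp
    raise ValueError('no deal block with a begin_time found')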
def get_ali_1688_data(self, goods_id): if goods_id == '': self.result_data = {} return {} # 阿里1688手机版地址: https://m.1688.com/offer/559836312862.html wait_to_deal_with_url = 'https://m.1688.com/offer/' + str( goods_id) + '.html' print('------>>>| 待处理的阿里1688地址为: ', wait_to_deal_with_url) body = self.my_phantomjs.use_phantomjs_to_get_url_body( url=wait_to_deal_with_url, css_selector='div.d-content') # print(body) if body == '': print('获取到的body为空str!请检查!') self.result_data = {} return {} # ''' # 改用requests # ''' # body = MyRequests.get_url_body(url=wait_to_deal_with_url, headers=self.headers) # # print(body) # # if body == '': # return {} # print(body) tmp_body = body try: pull_off_shelves = Selector( text=body).css('div.d-content p.info::text').extract_first() except: pull_off_shelves = '' if pull_off_shelves == '该商品无法查看或已下架': # 表示商品已下架, 同样执行插入数据操作 # print('test') try: tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline() sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s' is_in_db = tmp_my_pipeline._select_table( sql_str=sql_str, params=(str(goods_id), )) # print(is_in_db) except Exception as e: print('遇到错误:', e) print('数据库连接失败!') self.result_data = {} return {} if is_in_db != []: # 表示该goods_id以前已被插入到db中, 于是只需要更改其is_delete的状态即可 sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s' tmp_my_pipeline._update_table(sql_str=sql_str, params=(goods_id)) print('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1') tmp_data_s = self.init_pull_off_shelves_goods() # 初始化下架商品的属性 tmp_data_s['before'] = True # 用来判断原先该goods是否在db中 self.result_data = {} return tmp_data_s else: # 表示该goods_id没存在于db中 print('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...') tmp_data_s = self.init_pull_off_shelves_goods() # 初始化下架商品的属性 tmp_data_s['before'] = False self.result_data = {} return tmp_data_s body = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall( body) if body != []: body = body[0] body = r'{"beginAmount"' + body # print(body) body = json.loads(body) # pprint(body) if body.get('discountPriceRanges') is not None: self.result_data = self._wash_discountPriceRanges(body=body) return self.result_data else: print('data为空!') self.result_data = {} # 重置下,避免存入时影响下面爬取的赋值 return {} else: print('解析ing..., 该商品正在参与火拼, 此处为火拼价, 为短期活动价格!') body = re.compile( r'{"activityId"(.*?)</script></div></div>').findall(tmp_body) if body != []: body = body[0] body = r'{"activityId"' + body # print(body) body = json.loads(body) # pprint(body) if body.get('discountPriceRanges') is not None: self.result_data = self._wash_discountPriceRanges( body=body) self.is_activity_goods = True return self.result_data else: print('data为空!') self.result_data = {} # 重置下,避免存入时影响下面爬取的赋值 return {} else: print('这个商品对应活动属性未知, 此处不解析, 设置为跳过!') self.result_data = {} # 重置下,避免存入时影响下面爬取的赋值 return {}
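# Minimal sketch of the two parsing paths above, with stand-in names: (1) detect the
# "already delisted" page text; (2) otherwise recover the embedded JSON blob that starts
# with {"beginAmount" and json.loads it. One related note: a single SQL parameter should
# be passed as a one-element tuple, e.g. params=(goods_id,); (goods_id) without the
# trailing comma is just the string itself, not a tuple.
import json
import re

OFF_SHELF_TEXT = '该商品无法查看或已下架'

def parse_m1688_body(body: str) -> dict:
    """Return the embedded price JSON of an m.1688.com offer page, {'is_delete': 1} if delisted, else {}."""
    if OFF_SHELF_TEXT in body:
        return {'is_delete': 1}
    m = re.search(r'{"beginAmount"(.*?)</script></div></div>', body)
    if m is None:
        return {}
    return json.loads('{"beginAmount"' + m.group(1))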
def run_forever(): while True: #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() sql_str = 'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=3 or SiteID=4 or SiteID=6 order by ID desc' sql_str_2 = 'select GoodsOutUrl, goods_id from db_k85u.dbo.goodsinfo where OutGoodsType<=13 and onoffshelf=1 and not exists (select maingoodsid from gather.dbo.GoodsInfoAutoGet c where c.maingoodsid=goodsinfo.goods_id)' try: result = list(tmp_sql_server._select_table(sql_str=sql_str)) result_2 = list( tmp_sql_server._select_table(sql_str=sql_str_2, params=None)) except TypeError as e: print('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None result_2 = [] if result is None: pass else: print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') print(result_2) print('--------------------------------------------------------') print('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) index = 1 # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放 tmall = TmallParse() for item in result_2: # 实时更新数据 data = {} if index % 5 == 0: tmall = TmallParse() gc.collect() if index % 50 == 0: # 每50次重连一次,避免单次长连无响应报错 print('正在重置,并与数据库建立新连接中...') # try: # del tmp_sql_server # except: # pass # gc.collect() tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() print('与数据库的新连接成功建立...') if tmp_sql_server.is_connect_success: goods_id = tmall.get_goods_id_from_url(item[0]) if goods_id == []: print('@@@ 原地址为: ', item[0]) continue else: print( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id[1], index)) data = tmall.get_goods_data(goods_id=goods_id) if isinstance(data, int): continue if data.get('is_delete') == 1: data['goods_id'] = goods_id[1] # 改进判断,根据传入数据判断是天猫,还是天猫超市,还是天猫国际 ##################################################### if goods_id[0] == 0: # [0, '1111'] wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[ 1] # 构造成标准干净的天猫商品地址 elif goods_id[0] == 1: # [1, '1111'] wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[ 1] elif goods_id[ 0] == 2: # [2, '1111', 'https://xxxxx'] wait_to_deal_with_url = str( goods_id[2]) + '?id=' + goods_id[1] else: continue data['goods_url'] = wait_to_deal_with_url data['username'] = '******' data['main_goods_id'] = item[1] # print('------>>>| 爬取到的数据为: ', data) result = tmall.old_tmall_goods_insert_into_new_table( data, pipeline=tmp_sql_server) if result is False: print('出错商品的地址为: ', item[0]) else: pass index += 1 gc.collect() sleep(1.2) continue else: pass data = tmall.deal_with_data() if data != {}: data['goods_id'] = goods_id[1] # 改进判断,根据传入数据判断是天猫,还是天猫超市,还是天猫国际 ##################################################### if goods_id[0] == 0: # [0, '1111'] wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[ 1] # 构造成标准干净的天猫商品地址 elif goods_id[0] == 1: # [1, '1111'] wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[ 1] elif goods_id[ 0] == 2: # [2, '1111', 'https://xxxxx'] wait_to_deal_with_url = str( goods_id[2]) + goods_id[1] else: continue data['goods_url'] = wait_to_deal_with_url data['username'] = '******' data['main_goods_id'] = item[1] # print('------>>>| 爬取到的数据为: ', data) tmall.old_tmall_goods_insert_into_new_table( data, pipeline=tmp_sql_server) else: # 表示返回的data值为空值 pass else: # 表示返回的data值为空值 print('数据库连接失败,数据库可能关闭或者维护中') pass index += 1 # try: # del tmall # except: # pass gc.collect() sleep(2) print('全部数据更新完毕'.center(100, '#')) # sleep(60*60) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(5) 
gc.collect()
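# Minimal sketch of the goods_id-type dispatch used in the updater above:
# get_goods_id_from_url is assumed to return [type, id] (or [2, id, base_url] for 天猫国际),
# and the clean goods_url is rebuilt from that. The '?id=' separator for type 2 follows the
# is_delete branch above; names here are illustrative.
def build_tmall_url(goods_id_info: list) -> str:
    """Map [type, id(, base_url)] back to a canonical Tmall product URL."""
    _type, _id = goods_id_info[0], goods_id_info[1]
    if _type == 0:      # 天猫
        return 'https://detail.tmall.com/item.htm?id=' + _id
    if _type == 1:      # 天猫超市
        return 'https://chaoshi.detail.tmall.com/item.htm?id=' + _id
    if _type == 2:      # 天猫国际, base_url carried in the third element
        return str(goods_id_info[2]) + '?id=' + _id
    return ''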
async def deal_with_all_goods_id(self): ''' 获取每个详细分类的商品信息 :return: None ''' sort_data = await self.get_all_goods_list() sql_cli = SqlServerMyPageInfoSaveItemPipeline() index = 1 if sql_cli.is_connect_success: # 普通sql_server连接(超过3000无返回结果集) self.lg.info('正在获取天天特价db原有goods_id, 请耐心等待...') db_ = list(sql_cli._select_table(sql_str=tb_select_str_6)) db_goods_id_list = [[item[0], item[2]] for item in db_] self.lg.info('获取完毕!!!') # print(db_goods_id_list) db_all_goods_id = [i[0] for i in db_goods_id_list] for item in sort_data: tejia_goods_list = await self.get_tiantiantejia_goods_list( data=item.get('data', [])) self.lg.info(str(tejia_goods_list)) for tmp_item in tejia_goods_list: if tmp_item.get( 'goods_id', '' ) in db_all_goods_id: # 处理如果该goods_id已经存在于数据库中的情况 tmp_end_time = '' try: tmp_end_time = [ i[1] for i in db_goods_id_list if tmp_item.get('goods_id', '') == i[0] ][0] # print(tmp_end_time) except: pass if tmp_end_time != '' \ and tmp_end_time < get_shanghai_time(): ''' * 处理由常规商品又转换为天天特价商品 * ''' self.lg.info('##### 该商品由常规商品又转换为天天特价商品! #####') # 先删除,再重新插入(原先已过期) _ = await sql_cli.delete_taobao_tiantiantejia_expired_goods_id( goods_id=tmp_item.get('goods_id', ''), logger=self.lg) if _ is False: continue index = await self.insert_into_table( tmp_item=tmp_item, category=item['category'], current_page=item['current_page'], sql_cli=sql_cli, index=index, ) else: self.lg.info('该goods_id已经存在于数据库中, 此处跳过') pass else: sql_cli = await _get_new_db_conn( db_obj=sql_cli, index=index, logger=self.lg, ) if sql_cli.is_connect_success: index = await self.insert_into_table( tmp_item=tmp_item, category=item['category'], current_page=item['current_page'], sql_cli=sql_cli, index=index, ) else: self.lg.error('数据库连接失败!') pass else: self.lg.error('数据库连接失败!') pass collect() # 休眠30分钟 self.lg.info('休眠30分钟, 避免特价数据量过大...') await async_sleep(60 * 30) return True
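# Minimal sketch of the decision made above for goods already present in the db
# (assumed types: each db row carries (goods_id, end_time) with end_time as a datetime):
# a row is deleted and re-inserted only when its stored end_time has already passed,
# i.e. a former 天天特价 goods that has become 特价 again.
from datetime import datetime

def should_reinsert(goods_id: str, db_goods: dict, now: datetime) -> bool:
    """db_goods maps goods_id -> end_time; True means delete the expired row and insert anew."""
    end_time = db_goods.get(goods_id)
    if end_time is None:
        return False          # not in db at all: handled by the normal insert branch
    return end_time < now     # expired row: the goods is on 天天特价 again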
def run_forever(): while True: # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中 my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR) #### 实时更新数据 tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() try: result = list( tmp_sql_server._select_table(sql_str=tm_select_str_3)) except TypeError: my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None if result is None: pass else: my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') my_lg.info(str(result)) my_lg.info('总计待更新个数: {0}'.format(len(result))) my_lg.info( '--------------------------------------------------------') my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) index = 1 # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放 tmall = TmallParse(logger=my_lg) for item in result: # 实时更新数据 if index % 5 == 0: try: del tmall except: pass tmall = TmallParse(logger=my_lg) gc.collect() if index % 10 == 0: # 每10次重连一次,避免单次长连无响应报错 my_lg.info('正在重置,并与数据库建立新连接中...') tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline() my_lg.info('与数据库的新连接成功建立...') if tmp_sql_server.is_connect_success: my_lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index))) tmp_item = [] if item[0] == 3: # 从数据库中取出时,先转换为对应的类型 tmp_item.append(0) elif item[0] == 4: tmp_item.append(1) elif item[0] == 6: tmp_item.append(2) tmp_item.append(item[1]) oo = tmall.get_goods_data(goods_id=tmp_item) oo_is_delete = oo.get('is_detele', 0) # 避免下面解析data错误休眠 if isinstance(oo, int): # 单独处理return 4041 index += 1 sleep(TMALL_REAL_TIMES_SLEEP_TIME) continue data = tmall.deal_with_data() if data != {}: data['goods_id'] = item[1] data['shelf_time'], data[ 'delete_time'] = get_shelf_time_and_delete_time( tmp_data=data, is_delete=item[2], shelf_time=item[5], delete_time=item[6]) data['_is_price_change'], data[ '_price_change_info'] = _get_price_change_info( old_price=item[3], old_taobao_price=item[4], new_price=data['price'], new_taobao_price=data['taobao_price']) site_id = tmall._from_tmall_type_get_site_id( type=data['type']) try: old_sku_info = format_price_info_list( price_info_list=json_2_dict(item[7]), site_id=site_id) except AttributeError: # 处理已被格式化过的 old_sku_info = item[7] data['_is_price_change'], data[ 'sku_info_trans_time'] = get_sku_info_trans_record( old_sku_info=old_sku_info, new_sku_info=format_price_info_list( data['price_info_list'], site_id=site_id), is_price_change=item[8] if item[8] is not None else 0) tmall.to_right_and_update_data(data, pipeline=tmp_sql_server) else: # 表示返回的data值为空值 if oo_is_delete == 1: pass else: my_lg.info('------>>>| 休眠8s中...') sleep(8) else: # 表示返回的data值为空值 my_lg.error('数据库连接失败,数据库可能关闭或者维护中') sleep(5) pass index += 1 gc.collect() sleep(TMALL_REAL_TIMES_SLEEP_TIME) my_lg.info('全部数据更新完毕'.center(100, '#')) # sleep(60*60) if get_shanghai_time().hour == 0: # 0点以后不更新 sleep(60 * 60 * 5.5) else: sleep(5) gc.collect()
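# Minimal sketch, with stand-in names, of the price-change bookkeeping above: flag a change
# when either price differs from the stored value and keep the old/new pair so it can be
# written back alongside the goods row. The real helper _get_price_change_info may record
# more fields; this only illustrates the comparison.
def price_change_info(old_price, old_taobao_price, new_price, new_taobao_price):
    """Return (is_price_change, change_info) for one goods row."""
    is_change = 1 if (old_price != new_price or old_taobao_price != new_taobao_price) else 0
    change_info = {
        'price': {'old': old_price, 'new': new_price},
        'taobao_price': {'old': old_taobao_price, 'new': new_taobao_price},
    }
    return is_change, change_info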
def deal_with_data(self, goods_list): ''' 处理并存储相关拼团商品的数据 :param goods_list: :return: ''' mia = MiaPintuanParse() my_pipeline = SqlServerMyPageInfoSaveItemPipeline() if my_pipeline.is_connect_success: _ = list(my_pipeline._select_table(sql_str=mia_select_str_1)) db_goods_id_list = [item[0] for item in _] # print(db_goods_id_list) for item in goods_list: if item.get('goods_id', '') in db_goods_id_list: print('该goods_id已经存在于数据库中, 此处跳过') pass else: goods_id = str(item.get('goods_id', '')) tmp_url = 'https://www.mia.com/item-' + str( goods_id) + '.html' mia.get_goods_data(goods_id=str(goods_id)) goods_data = mia.deal_with_data() if goods_data == {}: # 返回的data为空则跳过 pass else: # 否则就解析并且插入 goods_url = goods_data['goods_url'] if re.compile(r'://m.miyabaobei.hk/').findall( goods_url) != '': goods_url = 'https://www.miyabaobei.hk/item-' + str( goods_id) + '.html' else: goods_url = 'https://www.mia.com/item-' + str( goods_id) + '.html' goods_data['goods_url'] = goods_url goods_data['goods_id'] = str(goods_id) goods_data['sub_title'] = item.get('sub_title', '') goods_data['pintuan_begin_time'], goods_data[ 'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time( miaosha_time=goods_data['pintuan_time']) goods_data['pid'] = item.get('pid') # pprint(goods_data) _r = mia.insert_into_mia_pintuan_table( data=goods_data, pipeline=my_pipeline) if _r: # 更新 if goods_id not in db_goods_id_list: db_goods_id_list.append(goods_id) sleep(MIA_SPIKE_SLEEP_TIME) # 放慢速度 else: print('数据库连接失败,此处跳过!') pass try: del mia except: pass collect()
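# Minimal sketch of the goods_url domain fix above. Note that re.findall() returns a list,
# so the emptiness test should compare against [] (or simply test truthiness); comparing the
# result with '' is always True. Function name is illustrative.
import re

def normalize_mia_url(goods_url: str, goods_id: str) -> str:
    """Rebuild a clean item URL, keeping the miyabaobei.hk domain when present."""
    if re.findall(r'://m\.miyabaobei\.hk/', goods_url):
        return 'https://www.miyabaobei.hk/item-{0}.html'.format(goods_id)
    return 'https://www.mia.com/item-{0}.html'.format(goods_id)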
class GoodsKeywordsSpider(Crawler): def __init__(self): super(GoodsKeywordsSpider, self).__init__( ip_pool_type=IP_POOL_TYPE, log_print=True, logger=None, log_save_path=MY_SPIDER_LOGS_PATH + '/goods_keywords/_/', ) self.msg = '' self._init_debugging_api() self.debugging_api = self._init_debugging_api() self._set_func_name_dict() self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() # 插入数据到goods_id_and_keyword_middle_table表 self.add_keyword_id_for_goods_id_sql_str = kw_insert_str_1 def _init_debugging_api(self): ''' 用于设置crawl的关键字热销商品的site_id :return: dict ''' return { 1: True, # 淘宝 2: True, # 阿里1688 3: True, # 天猫 4: True, # 京东 } def _set_func_name_dict(self): self.func_name_dict = { 'taobao': 'self._taobao_keywords_spider(goods_id_list={0}, keyword_id={1})', 'ali': 'self._ali_keywords_spider(goods_id_list={0}, keyword_id={1})', 'tmall': 'self._tmall_keywords_spider(goods_id_list={0}, keyword_id={1})', 'jd': 'self._jd_keywords_spider(goods_id_list={0}, keyword_id={1})' } def _just_run(self): while True: # 获取原先goods_db的所有已存在的goods_id try: result = list( self.my_pipeline._select_table(sql_str=kw_select_str_1)) self.lg.info('正在获取db中已存在的goods_id...') result_2 = list( self.my_pipeline._select_table(sql_str=kw_select_str_2)) self.lg.info('db中已存在的goods_id获取成功!') except TypeError: self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)') result = None result_2 = None if result is not None and result_2 is not None: self.lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') self.lg.info(str(result)) self.lg.info( '--------------------------------------------------------') self.lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) self.add_goods_index = 0 # 用于定位增加商品的个数 self.db_existed_goods_id_list = [item[0] for item in result_2] # 即时释放资源 try: del result_2 except: pass gc.collect() for item in result: # 每个关键字在True的接口都抓完, 再进行下一次 self.lg.info('正在处理id为{0}, 关键字为 {1} ...'.format( item[0], item[1])) for type, type_value in self.debugging_api.items( ): # 遍历待抓取的电商分类 if type_value is False: self.lg.info('api为False, 跳过!') continue if self.add_goods_index % 20 == 0: self.lg.info('my_pipeline客户端重连中...') try: del self.my_pipeline except: pass self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline( ) self.lg.info('my_pipeline客户端重连完毕!') goods_id_list = self._get_keywords_goods_id_list( type=type, keyword=item) self.lg.info( '关键字为{0}, 获取到的goods_id_list 如下: {1}'.format( item[1], str(goods_id_list))) '''处理goods_id_list''' self._deal_with_goods_id_list( type=type, goods_id_list=goods_id_list, keyword_id=item[0]) sleep(3) def _get_keywords_goods_id_list(self, type, keyword): ''' 获取goods_id_list :param type: 电商种类 :param keyword: :return: ''' if type == 1: self.lg.info('下面是淘宝的关键字采集...') goods_id_list = self._get_taobao_goods_keywords_goods_id_list( keyword=keyword) elif type == 2: self.lg.info('下面是阿里1688的关键字采集...') goods_id_list = self._get_1688_goods_keywords_goods_id_list( keyword=keyword) elif type == 3: self.lg.info('下面是天猫的关键字采集...') goods_id_list = self._get_tmall_goods_keywords_goods_id_list( keyword=keyword) elif type == 4: self.lg.info('下面是京东的关键字采集...') goods_id_list = self._get_jd_goods_keywords_goods_id_list( keyword=keyword) else: goods_id_list = [] return goods_id_list def _deal_with_goods_id_list(self, **kwargs): ''' 分类执行代码 :param kwargs: :return: ''' type = kwargs.get('type', '') goods_id_list = kwargs.get('goods_id_list', []) keyword_id = kwargs.get('keyword_id', '') if type == 1: self._taobao_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id) elif type == 2: 
self._1688_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id) elif type == 3: self._tmall_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id) elif type == 4: self._jd_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id) else: pass return None def _get_taobao_goods_keywords_goods_id_list(self, keyword): ''' 获取该keywords的商品的goods_id_list :param keyword: (id, keyword) :return: a list ''' headers = { 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'user-agent': get_random_pc_ua(), 'accept': '*/*', # 'referer': 'https://s.taobao.com/search?q=%E8%BF%9E%E8%A1%A3%E8%A3%99%E5%A4%8F&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306', 'authority': 's.taobao.com', # 'cookie': 't=70c4fb481898a67a66d437321f7b5cdf; cna=nbRZExTgqWsCAXPCa6QA5B86; l=AkFBuFEM2rj4GbU8Mjl3KsFo0YZa/7Vg; thw=cn; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _cc_=UIHiLt3xSw%3D%3D; tg=0; enc=OFbfiyN19GGi1GicxsjVmrZoFzlt9plbuviK5OuthXYfocqTD%2BL079G%2BIt4OMg6ZrbV4veSg5SQEpzuMUgLe0w%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; miid=763730917900964122; mt=ci%3D-1_1; linezing_session=i72FGC0gr3GTls7K7lswxen2_1527664168714VAPN_1; cookie2=1cf9585e0c6d98c72c64beac41a68107; v=0; _tb_token_=5ee03e566b165; uc1=cookie14=UoTeOZOVOtrsVw%3D%3D; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=14984d833a4647c13d4207c86d0dbd97_1528036508423; _m_h5_tk_enc=a8709d79a833625dc5c42b778ee7f1ee; JSESSIONID=F57610F0B34140EDC9F242BEA0F4800A; isg=BLm5VsJ0xr4M-pvu-R_LcQkeyCNTbqwVe7qvs9vvJODVYtj0JBZ5Sd704WaUEkWw', } # 获取到的为淘宝关键字搜索按销量排名 params = ( ('data-key', 'sort'), ('data-value', 'sale-desc'), ('ajax', 'true'), # ('_ksTS', '1528171408340_395'), ('callback', 'jsonp396'), ('q', keyword[1]), ('imgfile', ''), ('commend', 'all'), ('ssid', 's5-e'), ('search_type', 'item'), ('sourceId', 'tb.index'), # ('spm', 'a21bo.2017.201856-taobao-item.1'), ('ie', 'utf8'), # ('initiative_id', 'tbindexz_20170306'), ) s_url = 'https://s.taobao.com/search' body = Requests.get_url_body(url=s_url, headers=headers, params=params, ip_pool_type=self.ip_pool_type) if body == '': return [] else: try: data = re.compile('\((.*)\)').findall(body)[0] except IndexError: self.lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1])) return [] data = json_2_dict(json_str=data, logger=self.lg) if data == {}: self.lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format( keyword[1])) return [] else: goods_id_list = data.get('mainInfo', {}).get( 'traceInfo', {}).get('traceData', {}).get('allNids', []) if goods_id_list is None or goods_id_list == []: self.lg.error('获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format( keyword[1])) return [] else: return goods_id_list def _get_1688_goods_keywords_goods_id_list(self, keyword): ''' 根据keyword获取1688销量靠前的商品信息 :param keyword: :return: a list eg: ['11111', ...] 
''' '''方案1: 从m.1688.com搜索页面进行抓取, 只取第一页的销量排名靠前的商品''' headers = { 'authority': 'm.1688.com', 'cache-control': 'max-age=0', 'upgrade-insecure-requests': '1', 'user-agent': get_random_pc_ua(), 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; ali_ab=113.215.180.118.1523857816418.4; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; _csrf_token=1528708263870; JSESSIONID=9L783sX92-8iXZBHLCgK4fJiFKG9-W66WeuQ-BRgo4; hng=CN%7Czh-CN%7CCNY%7C156; t=70c4fb481898a67a66d437321f7b5cdf; _tb_token_=5ee03e566b165; __cn_logon__=false; h_keys="aa#2018%u5973%u88c5t%u6064"; alicnweb=homeIdttS%3D38414563432175544705031886000168094537%7Ctouch_tb_at%3D1528767881872%7ChomeIdttSAction%3Dtrue; ctoken=YnzGSFi23yEECqVO988Gzealot; _m_h5_tk=1cdad4dba1f1502fb29f57b3f73f5610_1528770803659; _m_h5_tk_enc=64259ec4fe4c33bc4555166994ed7b4d; __cn_logon__.sig=i6UL1cVhdIpbPPA_02yGiEyKMeZR2hBfnaoYK1CcrF4; ali_apache_id=11.182.158.193.1528768195886.327406.1; XSRF-TOKEN=b84fcec8-8bdf-41a5-a5c1-f8d6bfc9f83e; _tmp_ck_0=IlQ2M6x9F5xTkEpGRay66FVl%2BBaIEY076xELE8UtaLcz%2BgR%2FJ2UZOfDeKILA7R2VgXEJ7VYCkEQjS1RcUCwfL%2Br8ZFi0vwyVwyNpQsD2QG0HaihwedkkF9Cp9Ww0Jr%2BZF4la9CTe0AY8d1E1lDF91tD7lMAKIGVSne3V95CfI8VzpiWJ415B1IA0cc9J6IpYzn0mT1xLYnXcBAkDq0gop74NaynWIxw%2BLqmnXr%2BYU2bkOyMxZOBVY9B%2Bb0FU82h3TC9HCM8dGLnK2kxlgR%2B5lyT%2BCCFhhIX%2FioEMtA0TvDpXvRSUKoDTQG%2FCeJiKfy3LxMXmcTs5TBuWkh31F8nDCpLf6%2FlYOGkqeV1WLJeYXVe3SBvZC2O2JcYBQaKHcesETe%2FwTJL1fyc%3D; ad_prefer="2018/06/12 10:18:21"; webp=1; isg=BJWVxP7WYsuzzEf8vnJ3nRJEpJdFFdP4_0ZTRxc4b4wzbrxg3ONSdf5sPHJY2WFc; ali-ss=eyJ1c2VySWQiOm51bGwsImxvZ2luSWQiOm51bGwsInNpZCI6bnVsbCwiZWNvZGUiOm51bGwsIm1lbWJlcklkIjpudWxsLCJzZWNyZXQiOiJ5V3I0UVJGelVSVGp4dWs4aUxPWGl4dDIiLCJfZXhwaXJlIjoxNTI4ODU3MDE5ODMzLCJfbWF4QWdlIjo4NjQwMDAwMH0=; ali-ss.sig=z0qrG8Cj9BhDL_CLwTzgBGcdjSOXtp6YLxgDdTQRcWE', } params = ( ('sortType', 'booked'), ('filtId', ''), ('keywords', keyword[1]), ('descendOrder', 'true'), ) url = 'https://m.1688.com/offer_search/-6161.html' body = Requests.get_url_body(url=url, headers=headers, params=params, ip_pool_type=self.ip_pool_type) # self.lg.info(str(body)) if body == '': return [] else: try: goods_id_list = Selector(text=body).css( 'div.list_group-item::attr("data-offer-id")').extract() # pprint(goods_id_list) except Exception as e: self.lg.exception(e) self.lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format( keyword[1])) goods_id_list = [] return goods_id_list def _get_tmall_goods_keywords_goods_id_list(self, keyword): ''' 根据keyword获取tmall销量靠前的商品 :param keyword: :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...] 
不是返回goods_id ''' '''方案: tmall m站的搜索''' # 搜索: 偶尔不稳定但是还是能用 headers = { 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'user-agent': get_random_pc_ua(), 'accept': '*/*', # 'referer': 'https://list.tmall.com/search_product.htm?q=%B0%A2%B5%CF%B4%EF%CB%B9&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=d', 'authority': 'list.tmall.com', # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; _med=dw:1280&dh:800&pw:2560&ph:1600&ist:0; cq=ccp%3D1; hng=CN%7Czh-CN%7CCNY%7C156; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; enc=zIc9Cy5z0iS95tACxeX82fUsJdrekjC6%2BomP3kNKji1Z9RKwOt%2Fysyyewwf8twcytUGt2yT9AlAh5ASUlds05g%3D%3D; t=70c4fb481898a67a66d437321f7b5cdf; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _tb_token_=5ee03e566b165; cookie2=1cf9585e0c6d98c72c64beac41a68107; tt=tmall-main; pnm_cku822=098%23E1hvHpvUvbpvUvCkvvvvvjiPPFcvsjYnn2dvljEUPmP9sj1HPFsWtj3EP25ptj3PiQhvCvvvpZptvpvhvvCvpvhCvvOv9hCvvvmtvpvIvvCvxQvvvUgvvhVXvvvCxvvvBZZvvUhpvvChiQvv9Opvvho5vvmC3UyCvvOCvhEC0nkivpvUvvCCEppK6NOEvpCWvKXQwCzE%2BFuTRogRD76fdigqb64B9C97%2Bul1B5c6%2Bu0OVC61D70O58TJOymQD40OeutYon29V3Q7%2B3%2Busj7J%2Bu0OaokQD40OeutYLpGCvvpvvPMM; res=scroll%3A990*6982-client%3A472*680-offset%3A472*6982-screen%3A1280*800; _m_h5_tk=69794695b8eeb690d3ef037f6780d514_1529036786907; _m_h5_tk_enc=3e31314740c37d1fb14a26989cdac03c; isg=BN_f5lvy-LULYv0VwEkGMp59bjVjxpc1-mcB0nEsew7VAP6CeRTDNl2Gx5Z-nAte', } params = { 'page_size': '20', 'page_no': '1', 'q': str(keyword[1]), 'type': 'p', 'spm': 'a220m.6910245.a2227oh.d100', 'from': 'mallfp..m_1_suggest', 'sort': 'd', } s_url = 'https://list.tmall.com/m/search_items.htm' body = Requests.get_url_body(url=s_url, headers=headers, params=params, ip_pool_type=self.ip_pool_type) # self.lg.info(str(body)) if body == '': return [] else: data = json_2_dict(json_str=body, logger=self.lg) if data == {}: self.lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format( keyword[1])) return [] else: _ = data.get('item', []) if _ is None or _ == []: self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format( keyword[1])) return [] try: goods_id_list = [str(item.get('url', '')) for item in _] except Exception as e: self.lg.exception(e) self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format( keyword[1])) return [] return goods_id_list def _get_jd_goods_keywords_goods_id_list(self, keyword): ''' 根据keyword获取京东销量靠前的商品 :param keyword: :return: [] or ['xxxx', ....] 
''' # 方案1: jd m站的搜索(基于搜索接口) headers = { 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'user-agent': get_random_pc_ua(), 'accept': '*/*', # 'referer': 'https://so.m.jd.com/ware/search.action?keyword=b&area_ids=1,72,2819&sort_type=sort_totalsales15_desc&qp_disable=no&fdesc=%E5%8C%97%E4%BA%AC&t1=1529934870416', 'authority': 'so.m.jd.com', # 'cookie': '3AB9D23F7A4B3C9B=SL4YPRE3Y4C627UCHFP4ROHI54TTYYJKLFSVROZQ57T7K3OUUKSYIVFUJKQHBAUPRANZOTPLCVC2TICTSJG6WEMUII; mba_muid=1523868445027-16c30fbc5f8c54c429; abtest=20180416164812814_35; visitkey=41587293677961039; shshshfpa=9e159581-c64f-e9f4-ad0c-8b6ced0d9f28-1525907842; shshshfpb=1a725fe3148b84c839f009c93fc261f2218f59c61e7f4e6c05af381826; retina=1; webp=1; TrackerID=GGwYSka4RvH3lm0ZwLoO2_qdMpBwRG39BvyBvQaJfzyN5cmdGt4lEMSqqJS-sbDqj4nAUX2HU4sVDGA8vl169D37w4EqceYcH6ysXv46kMVfvVdAPmSMV9LceeO3Cc6Z; whwswswws=; __jdc=122270672; subAbTest=20180604104024339_59; mobilev=html5; m_uuid_new=05C2D24B7D8FFDA8D4243A929A5C6234; intlIpLbsCountrySite=jd; mhome=1; cid=9; M_Identification=3721cafc2442fba2_42b6f64bb933019fdb27c9e124cfd67f; M_Identification_abtest=20180604104040270_32361722; M_Identification=3721cafc2442fba2_42b6f64bb933019fdb27c9e124cfd67f; so_eggsCount=1; warehistory="4764260,10658784927,"; wq_logid=1528080290.1936376147; __jdu=15238681432201722645210; __jda=122270672.15238681432201722645210.1523868143.1528255502.1529934182.18; __jdv=122270672|direct|-|none|-|1529934182053; cn=0; user-key=ecfc3673-cc54-43e2-96bd-fb7a7e700c32; ipLoc-djd=1-72-2799-0; shshshfp=a3b9323dfc6a675230170e6a43efcb81; USER_FLAG_CHECK=d9f73823a80c0305366f70a3b99b9ecb; sid=57ea016fe0ab4b04271e00f01d94d3b9; intlIpLbsCountryIp=60.177.32.78; autoOpenApp_downCloseDate_auto=1529934572240_21600000; wxa_level=1; PPRD_P=UUID.15238681432201722645210; sc_width=1280; wq_area=15_1213_0%7C3; __jdb=122270672.10.15238681432201722645210|18.1529934182; mba_sid=15299345705167145512031951538.7; __wga=1529934993217.1529934585585.1528080039013.1526716673573.6.3; shshshsID=7f3d94fa215b4e53b467f0d5e0563e9c_9_1529934993592', } params = ( ('keyword', keyword[1]), ('datatype', '1'), ('callback', 'jdSearchResultBkCbA'), ('page', '1'), ('pagesize', '10'), ('ext_attr', 'no'), ('brand_col', 'no'), ('price_col', 'no'), ('color_col', 'no'), ('size_col', 'no'), ('ext_attr_sort', 'no'), ('merge_sku', 'yes'), ('multi_suppliers', 'yes'), ('area_ids', '1,72,2819'), ('sort_type', 'sort_totalsales15_desc'), ('qp_disable', 'no'), ('fdesc', '\u5317\u4EAC'), # ('t1', '1529934992189'), ) s_url = 'https://so.m.jd.com/ware/search._m2wq_list' body = Requests.get_url_body(url=s_url, headers=headers, params=params, ip_pool_type=self.ip_pool_type) # self.lg.info(str(body)) if body == '': return [] else: try: data = re.compile('jdSearchResultBkCbA\((.*)\)').findall( body)[0] except IndexError: self.lg.error('获取jd的关键字数据时, IndexError! 出错关键字为{0}'.format( (keyword[1]))) return [] '''问题在于编码中是\xa0之类的,当遇到有些 不用转义的\http之类的,则会出现以上错误。''' data = deal_with_JSONDecodeError_about_value_invalid_escape( json_str=data) data = json_2_dict(json_str=data, logger=self.lg) if data == {}: self.lg.error('获取到的天猫搜索data为空dict! 
出错关键字为{0}'.format( keyword[1])) return [] else: # 注意拿到的数据如果是京东拼购则跳过 # pprint(data) data = data.get('data', {}).get('searchm', {}).get('Paragraph', []) # pingou中字段'bp'不为空即为拼购商品,抓取时不抓取拼购商品, 即'pingou_price': item.get('pinGou', {}).get('bp', '') == '' if data is not None and data != []: goods_id_list = [ item.get('wareid', '') for item in data if item.get('pinGou', {}).get('bp', '') == '' ] return goods_id_list else: self.lg.error('获取到的data为空list, 请检查!') return [] def _taobao_keywords_spider(self, **kwargs): ''' 抓取goods_id_list的数据,并存储 :param kwargs: :return: ''' goods_id_list = kwargs.get('goods_id_list') keyword_id = kwargs.get('keyword_id') goods_url_list = [ 'https://item.taobao.com/item.htm?id=' + item for item in goods_id_list ] self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...') for item in goods_url_list: # item为goods_url result = False # 用于判断某个goods是否被插入的参数 try: goods_id = re.compile(r'id=(\d+)').findall(item)[0] except IndexError: self.lg.error('re获取goods_id时出错, 请检查!') continue if goods_id in self.db_existed_goods_id_list: self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id)) result = True # 原先存在的情况 pass else: taobao = TaoBaoLoginAndParse(logger=self.lg) if self.add_goods_index % 20 == 0: # 每50次重连一次,避免单次长连无响应报错 self.lg.info('正在重置,并与数据库建立新连接中...') self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() self.lg.info('与数据库的新连接成功建立...') if self.my_pipeline.is_connect_success: goods_id = taobao.get_goods_id_from_url(item) if goods_id == '': self.lg.error('@@@ 原商品的地址为: {0}'.format(item)) continue else: self.lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index))) tt = taobao.get_goods_data(goods_id) data = taobao.deal_with_data(goods_id=goods_id) if data != {}: data['goods_id'] = goods_id data[ 'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str( goods_id) data['username'] = '******' data['main_goods_id'] = None # print('------>>>| 爬取到的数据为: ', data) result = taobao.old_taobao_goods_insert_into_new_table( data, pipeline=self.my_pipeline) else: pass else: # 表示返回的data值为空值 self.lg.info('数据库连接失败,数据库可能关闭或者维护中') pass self.add_goods_index += 1 gc.collect() sleep(TAOBAO_REAL_TIMES_SLEEP_TIME) if result: # 仅处理goods_id被插入或者原先已存在于db中 self._insert_into_goods_id_and_keyword_middle_table( goods_id=goods_id, keyword_id=keyword_id) else: pass self.lg.info('该关键字的商品已经抓取完毕!') return True def _1688_keywords_spider(self, **kwargs): ''' 1688对应关键字的商品信息抓取存储 :param kwargs: :return: ''' goods_id_list = kwargs.get('goods_id_list') keyword_id = kwargs.get('keyword_id') goods_url_list = [ 'https://detail.1688.com/offer/{0}.html'.format(item) for item in goods_id_list ] self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...') for item in goods_url_list: result = False # 每次重置 try: goods_id = re.compile('offer/(.*?).html').findall(item)[0] except IndexError: self.lg.error('re获取goods_id时出错, 请检查!') continue if goods_id in self.db_existed_goods_id_list: self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id)) result = True # 原先存在的情况 pass else: ali_1688 = ALi1688LoginAndParse(logger=self.lg) if self.add_goods_index % 20 == 0: # 每50次重连一次,避免单次长连无响应报错 self.lg.info('正在重置,并与数据库建立新连接中...') self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() self.lg.info('与数据库的新连接成功建立...') if self.my_pipeline.is_connect_success: goods_id = ali_1688.get_goods_id_from_url(item) if goods_id == '': self.lg.error('@@@ 原商品的地址为: {0}'.format(item)) continue else: self.lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index))) tt = ali_1688.get_ali_1688_data(goods_id) if 
tt.get('is_delete') == 1 and tt.get( 'before') is False: # 处理已下架的但是还是要插入的 # 下架的商品就pass continue data = ali_1688.deal_with_data() if data != {}: data['goods_id'] = goods_id data[ 'goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html' data['username'] = '******' data['main_goods_id'] = None result = ali_1688.old_ali_1688_goods_insert_into_new_table( data=data, pipeline=self.my_pipeline) else: pass else: # 表示返回的data值为空值 self.lg.info('数据库连接失败,数据库可能关闭或者维护中') pass self.add_goods_index += 1 try: del ali_1688 except: pass gc.collect() sleep(TAOBAO_REAL_TIMES_SLEEP_TIME) if result: # 仅处理goods_id被插入或者原先已存在于db中 self._insert_into_goods_id_and_keyword_middle_table( goods_id=goods_id, keyword_id=keyword_id) else: pass self.lg.info('该关键字的商品已经抓取完毕!') return True def _tmall_keywords_spider(self, **kwargs): ''' tmall对应关键字采集 :param kwargs: :return: ''' goods_id_list = kwargs.get('goods_id_list') keyword_id = kwargs.get('keyword_id') goods_url_list = [ 'https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list ] self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...') for item in goods_url_list: # item为goods_url result = False # 用于判断某个goods是否被插入的参数 try: goods_id = re.compile(r'id=(\d+)').findall(item)[0] except IndexError: self.lg.error('re获取goods_id时出错, 请检查!') continue if goods_id in self.db_existed_goods_id_list: self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id)) result = True # 原先存在的情况 pass else: tmall = TmallParse(logger=self.lg) if self.add_goods_index % 20 == 0: # 每20次重连一次,避免单次长连无响应报错 self.lg.info('正在重置,并与数据库建立新连接中...') self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() self.lg.info('与数据库的新连接成功建立...') if self.my_pipeline.is_connect_success: goods_id = tmall.get_goods_id_from_url(item) if goods_id == []: self.lg.error('@@@ 原商品的地址为: {0}'.format(item)) continue else: self.lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index))) tt = tmall.get_goods_data(goods_id) data = tmall.deal_with_data() goods_id = goods_id[1] if data != {}: data['goods_id'] = goods_id data['username'] = '******' data['main_goods_id'] = None data[ 'goods_url'] = tmall._from_tmall_type_get_tmall_url( type=data['type'], goods_id=goods_id) if data['goods_url'] == '': self.lg.error('该goods_url为空值! 
此处跳过!') continue result = tmall.old_tmall_goods_insert_into_new_table( data, pipeline=self.my_pipeline) else: pass else: self.lg.info('数据库连接失败,数据库可能关闭或者维护中') pass self.add_goods_index += 1 gc.collect() sleep(TAOBAO_REAL_TIMES_SLEEP_TIME) if result: # 仅处理goods_id被插入或者原先已存在于db中 self._insert_into_goods_id_and_keyword_middle_table( goods_id=goods_id, keyword_id=keyword_id) else: pass self.lg.info('该关键字的商品已经抓取完毕!') return True def _jd_keywords_spider(self, **kwargs): ''' jd对应关键字采集 :param kwargs: :return: ''' goods_id_list = kwargs.get('goods_id_list') keyword_id = kwargs.get('keyword_id') '''初始地址可以直接用这个[https://item.jd.com/xxxxx.html]因为jd会给你重定向到正确地址, 存也可以存这个地址''' # 所以这边jd就不分类存,一律存为常规商品site_id = 7 goods_url_list = [ 'https://item.jd.com/{0}.html'.format(str(item)) for item in goods_id_list ] self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...') for item in goods_url_list: # item为goods_url result = False # 用于判断某个goods是否被插入db的参数 try: goods_id = re.compile('\/(\d+)\.html').findall(item)[0] except IndexError: self.lg.error('re获取goods_id时出错, 请检查!') continue if goods_id in self.db_existed_goods_id_list: self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id)) result = True # 原先存在的情况 pass else: jd = JdParse(logger=self.lg) if self.add_goods_index % 20 == 0: # 每20次重连一次,避免单次长连无响应报错 self.lg.info('正在重置,并与数据库建立新连接中...') self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() self.lg.info('与数据库的新连接成功建立...') if self.my_pipeline.is_connect_success: goods_id = jd.get_goods_id_from_url(item) if goods_id == []: self.lg.error('@@@ 原商品的地址为: {0}'.format(item)) continue else: self.lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index))) tt = jd.get_goods_data(goods_id) data = jd.deal_with_data(goods_id) goods_id = goods_id[1] if data != {}: data['goods_id'] = goods_id data['username'] = '******' data['main_goods_id'] = None data['goods_url'] = item result = jd.old_jd_goods_insert_into_new_table( data, self.my_pipeline) else: pass else: self.lg.info('数据库连接失败,数据库可能关闭或者维护中') pass self.add_goods_index += 1 sleep(1) try: del jd except: pass gc.collect() if result: # 仅处理goods_id被插入或者原先已存在于db中 self._insert_into_goods_id_and_keyword_middle_table( goods_id=goods_id, keyword_id=keyword_id) else: pass self.lg.info('该关键字的商品已经抓取完毕!') return True def _insert_into_goods_id_and_keyword_middle_table(self, **kwargs): ''' 数据插入goods_id_and_keyword_middle_table :param kwargs: :return: ''' goods_id = str(kwargs['goods_id']) keyword_id = int(kwargs['keyword_id']) # self.lg.info(goods_id) # self.lg.info(keyword_id) result = False '''先判断中间表goods_id_and_keyword_middle_table是否已新增该关键字的id''' # 注意非完整sql语句不用r'', 而直接'' try: _ = self.my_pipeline._select_table(sql_str=kw_select_str_3, params=(goods_id, )) _ = [i[0] for i in _] # pprint(_) except Exception: self.lg.error( '执行中间表goods_id_and_keyword_middle_table是否已新增该关键字的id的sql语句时出错, 跳过给商品加keyword_id' ) return result if keyword_id not in _: params = ( goods_id, keyword_id, ) self.lg.info('------>>>| 正在插入keyword_id为{0}, goods_id为{1}'.format( params[1], params[0])) result = self.my_pipeline._insert_into_table_2( sql_str=self.add_keyword_id_for_goods_id_sql_str, params=params, logger=self.lg) return result def _add_keyword_2_db_from_excel_file(self): ''' 从excel插入新关键字到db :return: ''' excel_file_path = '/Users/afa/Desktop/2018-07-18-淘宝phone-top20万.xlsx' self.lg.info('正在读取{0}, 请耐心等待...'.format(excel_file_path)) try: excel_result = read_info_from_excel_file( excel_file_path=excel_file_path) except Exception: self.lg.error('遇到错误:', exc_info=True) return False 
self.lg.info('读取完毕!!') self.lg.info('正在读取db中原先的keyword...') db_keywords = self.my_pipeline._select_table(sql_str=kw_select_str_4) db_keywords = [i[0] for i in db_keywords] self.lg.info('db keywords 读取完毕!') for item in excel_result: keyword = item.get('关键词', None) if not keyword: continue if keyword in db_keywords: self.lg.info('该关键字{0}已经存在于db中...'.format(keyword)) continue self.lg.info('------>>>| 正在存储关键字 {0}'.format(keyword)) self.my_pipeline._insert_into_table_2(sql_str=kw_insert_str_2, params=(str(keyword), 0), logger=self.lg) self.lg.info('全部写入完毕!') return True def __del__(self): try: del self.lg del self.msg del self.my_pipeline except: pass try: del self.db_existed_goods_id_list except: pass gc.collect()
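# Minimal sketch of the JSONP handling used by the keyword spiders above: strip the callback
# wrapper (e.g. jsonp396(...) or jdSearchResultBkCbA(...)), json-decode the payload, and walk
# down to the goods-id list. The key path below follows the taobao branch; the jd branch
# differs only in the callback name and result keys.
import json
import re

def jsonp_to_dict(body: str) -> dict:
    """Return the dict inside a jsonp-style response, or {} when it cannot be parsed."""
    m = re.search(r'\((.*)\)', body, flags=re.S)
    if m is None:
        return {}
    try:
        return json.loads(m.group(1))
    except ValueError:
        return {}

def taobao_goods_id_list(body: str) -> list:
    """Return allNids from a taobao search response body, or []."""
    data = jsonp_to_dict(body)
    return data.get('mainInfo', {}).get('traceInfo', {}).get('traceData', {}).get('allNids', []) or []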
def get_spike_hour_goods_info(self): ''' 模拟构造得到data的url,得到近期所有的限时秒杀商品信息 :return: ''' tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33] # notice for tab_id in tab_id_list: for index in range(0, 50): tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format( str(tab_id), str(index)) print('待抓取的限时秒杀地址为: ', tmp_url) data = MyRequests.get_url_body(url=tmp_url, headers=self.headers) if data == '': break try: data = json.loads(data) data = data.get('data', {}) # print(data) except: break if data.get('goodslist') == []: print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format( tab_id, index)) break else: data = data.get('goodslist', []) # print(data) if data == []: print('goodslist为[], 此处跳过') pass else: miaosha_goods_list = self.get_miaoshao_goods_info_list( data=data) print(miaosha_goods_list) juanpi = JuanPiParse() my_pipeline = SqlServerMyPageInfoSaveItemPipeline() if my_pipeline.is_connect_success: sql_str = r'select goods_id, miaosha_time, tab_id, page from dbo.juanpi_xianshimiaosha where site_id=15' if my_pipeline._select_table( sql_str=sql_str) is None: db_goods_id_list = [] else: db_goods_id_list = [ item[0] for item in list( my_pipeline._select_table( sql_str=sql_str)) ] for item in miaosha_goods_list: if item.get('goods_id', '') in db_goods_id_list: print('该goods_id已经存在于数据库中, 此处跳过') pass else: tmp_url = 'http://shop.juanpi.com/deal/' + item.get( 'goods_id') juanpi.get_goods_data( goods_id=item.get('goods_id')) goods_data = juanpi.deal_with_data() if goods_data == {}: # 返回的data为空则跳过 pass else: # 否则就解析并插入 goods_data['stock_info'] = item.get( 'stock_info') goods_data['goods_id'] = item.get( 'goods_id') goods_data['spider_url'] = tmp_url goods_data['username'] = '******' goods_data['price'] = item.get( 'price') # 秒杀前的原特价 goods_data['taobao_price'] = item.get( 'taobao_price') # 秒杀价 goods_data['sub_title'] = item.get( 'sub_title', '') goods_data['miaosha_time'] = item.get( 'miaosha_time') goods_data[ 'miaosha_begin_time'], goods_data[ 'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time( miaosha_time=item.get( 'miaosha_time')) goods_data['tab_id'] = tab_id goods_data['page'] = index # print(goods_data) juanpi.insert_into_juanpi_xianshimiaosha_table( data=goods_data, pipeline=my_pipeline) sleep(.4) # 短暂sleep下避免出错跳出 sleep(.65) else: pass try: del juanpi except: pass gc.collect()
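# Minimal sketch of the paging rule above (response layout assumed as in the code): for each
# tab_id, walk pages until the endpoint returns an empty body or an empty goodslist, which
# marks the end of that tab. fetch() stands in for the project's request helper.
def iter_timebuy_pages(fetch, tab_id: int, max_page: int = 50):
    """fetch(url) -> dict or None; yields (page, goodslist) for each non-empty page."""
    for page in range(0, max_page):
        url = ('https://m.juanpi.com/act/timebuy-xrgoodslist'
               '?tab_id={0}&page={1}'.format(tab_id, page))
        data = fetch(url)
        if not data:
            break
        goods_list = data.get('data', {}).get('goodslist', [])
        if not goods_list:
            break
        yield page, goods_list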
class TaoBaoWeiTaoShareParse(AsyncCrawler): def __init__( self, logger=None, *params, **kwargs, ): AsyncCrawler.__init__( self, *params, **kwargs, logger=logger, ip_pool_type=IP_POOL_TYPE, log_print=True, log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/', ) self._set_headers() self.msg = '' self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() def _set_headers(self): self.headers = { 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'user-agent': get_random_pc_ua(), 'accept': '*/*', 'referer': 'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33¶ms=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome', 'authority': 'h5api.m.taobao.com', # cookie得注释掉, 否则为非法请求 # 'cookie': '' } async def _get_target_url_and_content_id_and_csid(self, taobao_short_url): ''' 根据给与的淘宝分享短链接, 得到target_url, content_id, csid :param taobao_short_url: :return: ''' if re.compile(r'contentId').findall(taobao_short_url) != []: # 先检查是否已为目标地址 target_url = taobao_short_url else: body = Requests.get_url_body( url=taobao_short_url, headers=self.headers, ip_pool_type=self.ip_pool_type, ) # self.lg.info(str(body)) if body == '': self.lg.error('获取到的body为空值, 出错短链接地址: {0}'.format( str(taobao_short_url))) return '', '', '' try: # 获取短连接的目标地址 target_url = re.compile('var url = \'(.*?)\';').findall( body)[0] self.lg.info('获取到原始连接: {}'.format(target_url)) except IndexError: self.lg.error('获取target_url的时候IndexError! 出错短链接地址: {0}'.format( str(taobao_short_url))) target_url = '' try: # 得到contentId content_id = re.compile('contentId=(\d+)').findall(target_url)[0] self.lg.info(content_id) except IndexError: self.lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format( str(taobao_short_url))) content_id = '' try: # 得到csid csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall( target_url)[0] # self.lg.info(csid) except IndexError: self.lg.info('此链接为无csid情况的链接...') # self.lg.error('获取csid时IndexError! 
出错短链接地址: {0}'.format(str(taobao_short_url))) csid = '' try: tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0] except IndexError: tag_name = '' try: tag = re.compile('tag=(.*?)&').findall(target_url)[0] except IndexError: tag = '' return target_url, content_id, csid, tag_name, tag async def _get_api_body(self, taobao_short_url): ''' 获取该页面api返回的文件 :param taobao_short_url: :return: body 类型 str ''' base_url = 'https://h5api.m.taobao.com/h5/mtop.taobao.beehive.detail.contentservicenewv2/1.0/' try: target_url, content_id, csid, tag_name, tag = await self._get_target_url_and_content_id_and_csid( taobao_short_url) except ValueError: self.lg.error('遇到ValueError!', exc_info=True) return '' if content_id == '' and csid == '': # 异常退出 return '' data = dumps({ 'businessSpm': '', 'business_spm': '', 'contentId': content_id, 'params': dumps({ "csid": csid, }) if csid != '' else '', # 没有csid时,就不传这个参数 'source': 'weitao_2017_nocover', 'tagName': tag_name, # 这个是我自己额外加的用于获取tags的api接口 'track_params': '', 'type': 'h5', }) params = { 'AntiCreep': 'true', 'AntiFlood': 'true', 'api': 'mtop.taobao.beehive.detail.contentservicenewv2', 'appKey': '12574478', 'callback': 'mtopjsonp1', # 'data': '{"contentId":"200668154273","source":"weitao_2017_nocover","type":"h5","params":"{\\"csid\\":\\"54a52aea54b7c29d289a0e36b2bf2f51\\"}","businessSpm":"","business_spm":"","track_params":""}', 'data': data, 'dataType': 'jsonp', 'data_2': '', 'jsv': '2.4.11', # 'sign': 'e8cb623e58bab0ceb10e9edffdacd5b2', # 't': '1527300457911', 'type': 'jsonp', 'v': '1.0' } # TODO 新版 # 必传参数(无cookies, sign正确也无结果!) # 而且登录后的cookies, 但是继续采集, tb会报: 亲,访问被拒绝了哦!请检查是否使用了代理软件或VPN哦~ result_1 = await get_taobao_sign_and_body( base_url=base_url, headers=self.headers, params=params, data=data, logger=self.lg, ip_pool_type=self.ip_pool_type) _m_h5_tk = result_1[0] if _m_h5_tk == '': self.lg.error( '获取到的_m_h5_tk为空str! 出错短链接地址: {0}'.format(taobao_short_url)) # 带上_m_h5_tk, 和之前请求返回的session再次请求得到需求的api数据 result_2 = await get_taobao_sign_and_body( base_url=base_url, headers=self.headers, params=params, data=data, _m_h5_tk=_m_h5_tk, session=result_1[1], logger=self.lg, ip_pool_type=self.ip_pool_type) body = result_2[2] return body async def _deal_with_api_info(self, taobao_short_url): ''' 处理api返回的信息, 并结构化存储 :param taobao_short_url: :return: ''' data = await self._get_api_body(taobao_short_url) if data == '': self.lg.error('获取到的api数据为空值!') return {} try: data = re.compile('mtopjsonp1\((.*)\)').findall(data)[0] except IndexError: self.lg.error( 're获取主信息失败, IndexError, 出错短链接地址:{0}'.format(taobao_short_url)) data = {} try: data = await self._wash_api_info(loads(data)) # pprint(data) except Exception as e: self.lg.error('出错短链接地址:{0}'.format(taobao_short_url)) self.lg.exception(e) return {} article = await self._get_article(data=data, taobao_short_url=taobao_short_url) pprint(article) if article != {} and article.get('share_id', '') != '': '''采集该文章推荐的商品''' await self._crawl_and_save_these_goods( goods_url_list=article.get('goods_url_list', [])) '''存储该文章info''' await self._save_this_article(article=article) return True else: self.lg.info('获取到的文章失败! 
article为空dict!') return False async def _crawl_and_save_these_goods(self, goods_url_list): ''' 采集该文章推荐的商品 :param goods_url_list: :return: ''' sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6' try: result = self.my_pipeline._select_table(sql_str=sql_str) except TypeError: result = [] self.lg.info('即将开始抓取该文章的goods, 请耐心等待...') index = 1 db_all_goods_id_list = [item[0] for item in result] for item in goods_url_list: try: goods_id = re.compile(r'id=(\d+)').findall( item.get('goods_url', ''))[0] except IndexError: self.lg.error('re获取goods_id时出错, 请检查!') continue if goods_id in db_all_goods_id_list: self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id)) continue else: taobao = TaoBaoLoginAndParse(logger=self.lg) if index % 50 == 0: # 每50次重连一次,避免单次长连无响应报错 self.lg.info('正在重置,并与数据库建立新连接中...') self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() self.lg.info('与数据库的新连接成功建立...') if self.my_pipeline.is_connect_success: goods_id = taobao.get_goods_id_from_url( item.get('goods_url', '')) if goods_id == '': self.lg.info('@@@ 原商品的地址为: {0}'.format( item.get('goods_url', ''))) continue else: self.lg.info( '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(index))) tt = taobao.get_goods_data(goods_id) data = taobao.deal_with_data(goods_id=goods_id) if data != {}: data['goods_id'] = goods_id data[ 'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str( goods_id) data['username'] = '******' data['main_goods_id'] = None # print('------>>>| 爬取到的数据为: ', data) taobao.old_taobao_goods_insert_into_new_table( data, pipeline=self.my_pipeline) else: pass else: # 表示返回的data值为空值 self.lg.info('数据库连接失败,数据库可能关闭或者维护中') pass index += 1 gc.collect() await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME) self.lg.info('该文章的商品已经抓取完毕!') return True async def _save_this_article(self, article): ''' 存储该文章info :param article: :return: ''' sql_str = 'select share_id from dbo.daren_recommend' db_share_id = [ j[0] for j in list(self.my_pipeline._select_table(sql_str=sql_str)) ] if article.get('share_id') in db_share_id: self.lg.info('该share_id({})已存在于数据库中, 此处跳过!'.format( article.get('share_id', ''))) return True else: self.lg.info('即将开始存储该文章...') if self.my_pipeline.is_connect_success: params = await self._get_db_insert_params(item=article) # pprint(params) sql_str = r'insert into dbo.daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_goods_base_info, div_body, create_time, site_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)' self.my_pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=self.lg) return True else: self.lg.error('db连接失败!存储失败! 出错article地址:{0}'.format( article.get('gather_url', ''))) return False async def _get_db_insert_params(self, item): params = ( item['nick_name'], item['head_url'], item['profile'], item['share_id'], item['gather_url'], item['title'], item['comment_content'], # dumps(item['share_img_url_list'], ensure_ascii=False), # dumps(item['goods_id_list'], ensure_ascii=False), dumps(item['share_goods_base_info'], ensure_ascii=False), item['div_body'], item['create_time'], item['site_id'], ) return params async def _get_article(self, data, taobao_short_url): ''' 得到该文章的需求信息 :param data: :return: ''' try: nick_name = data.get('data', {}).get('models', {}).get('account', {}).get('name', '') assert nick_name != '', '获取到的nick_name为空值!' 
head_url = await self._get_head_url(data=data) # 推荐人的简介或者个性签名 tmp_profile = data.get('data', {}).get('models', {}).get('account', {}).get('accountDesc', '') profile = tmp_profile if tmp_profile is not None else '' title = self._wash_sensitive_info( data.get('data', {}).get('models', {}).get('content', {}).get('title', '')) # self.lg.info(title) assert title != '', '获取到的title为空值!请检查!' # 达人的评论,可用于荐好首页的文字信息 comment_content = self._wash_sensitive_info( data.get('data', {}).get('models', {}).get('content', {}).get('summary', '')) '''微淘抓包的接口: 图片,商品依次对应''' tmp_goods_list = data.get('data', {}).get('models', {}).get( 'content', {}).get('drawerList', []) assert tmp_goods_list != [], '获取到的goods_id_list为空list! 请检查! 可能该文章推荐商品为空[]!' share_img_url_list = [{ 'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', '') } for item in tmp_goods_list] goods_id_list = [{ 'goods_id': item.get('itemId', '') } for item in tmp_goods_list] # 由于微淘的图片跟商品信息一一对应,so直接存一个字段, 清除重复的推荐商品(list去重,并保持原来的顺序) share_goods_base_info = list_duplicate_remove([{ 'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', ''), 'goods_id': item.get('itemId', ''), } for item in tmp_goods_list]) # div_body div_body = self._wash_sensitive_info( await self._get_div_body(rich_text=data.get('data', {}).get( 'models', {}).get('content', {}).get('richText', []))) # print(div_body) # 待抓取的商品地址, 统一格式为淘宝的,如果是tmall地址, 浏览器会重定向到天猫 goods_url_list = [{ 'goods_url': 'https://item.taobao.com/item.htm?id=' + item.get('goods_id', '') } for item in goods_id_list] _ = ( await self._get_target_url_and_content_id_and_csid(taobao_short_url)) gather_url = _[0] share_id = _[1] # 即content_id create_time = get_shanghai_time() site_id = 2 # 淘宝微淘 # tags 额外的文章地址 tags = await self._get_tags(data=data) # pprint(tags) except Exception as e: self.lg.error('出错短链接地址:{0}'.format(taobao_short_url)) self.lg.exception(e) return {} article = WellRecommendArticle() article['nick_name'] = nick_name article['head_url'] = head_url article['profile'] = profile article['share_id'] = share_id article['title'] = title article['comment_content'] = comment_content article['share_img_url_list'] = share_img_url_list article['goods_id_list'] = goods_id_list article['div_body'] = div_body article['gather_url'] = gather_url article['create_time'] = create_time article['site_id'] = site_id article['goods_url_list'] = goods_url_list article['tags'] = tags article['share_goods_base_info'] = share_goods_base_info return article async def _get_head_url(self, data): ''' 获取头像地址 :param data: :return: ''' tmp_head_url = data.get('data', {}).get('models', {}).get('account', {}).get('accountPic', {}).get('picUrl', '') if tmp_head_url != '': if re.compile('http').findall(tmp_head_url) == []: head_url = 'https:' + tmp_head_url else: head_url = tmp_head_url else: head_url = '' return head_url def _wash_sensitive_info(self, data): ''' 清洗敏感信息 :param data: :return: ''' data = re.compile('淘宝|天猫|taobao|tmall|TAOBAO|TMALL').sub('', data) return data async def _get_tags(self, data): ''' 获得额外文章的信息 :param data: :return: ''' tags = data.get('data', {}).get('models', {}).get('tags', []) tags = [{ 'url': unquote(item.get('url', '')), 'name': item.get('name', ''), } for item in tags] return tags async def _get_div_body(self, rich_text): ''' 处理得到目标文章 :param rich_text: 待处理的原文章 :return: ''' div_body = '' for item in rich_text: if item.get('resource') is None: continue for resource_item in item.get('resource', []): # 可能是多个 # resource = item.get('resource', [])[0] text = resource_item.get('text', '') # 介绍的文字 
picture = resource_item.get('picture', {}) # 介绍的图片 _goods = resource_item.get('item', {}) # 一个商品 if text != '': text = '<p style="height:auto;width:100%">' + text + '</p>' + '<br>' div_body += text continue if picture != {}: # 得到该图片的宽高,并得到图片的<img>标签 _ = r'<img src="{0}" style="height:{1}px;width:{2}px;"/>'.format( 'https:' + picture.get('picUrl', ''), picture.get('picHeight', ''), picture.get('picWidth', '')) _ = _ + '<br>' div_body += _ continue if _goods != {}: _hiden_goods_id = r'<p style="display:none;">此处有个商品[goods_id]: {0}</p>'.format( _goods.get('itemId', '')) + '<br>' div_body += _hiden_goods_id continue return '<div>' + div_body + '</div>' if div_body != '' else '' async def _wash_api_info(self, data): ''' 清洗接口 :param data: :return: ''' try: data['data']['assets'] = [] data['data']['models']['config'] = {} data['data']['modules'] = [] except Exception: pass return data def __del__(self): try: del self.lg del self.msg del self.my_pipeline except: pass gc.collect()
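# Minimal sketch of the share-link parsing done in the class above: given the resolved
# target_url of a 微淘 share, pull out contentId, csid, tagName and tag with the same
# regexes; any missing piece falls back to ''. Function name is illustrative.
import re

def parse_weitao_target_url(target_url: str) -> dict:
    """Extract the identifiers carried in a weitao share target URL."""
    def _first(pattern, default=''):
        found = re.findall(pattern, target_url)
        return found[0] if found else default
    return {
        'content_id': _first(r'contentId=(\d+)'),
        'csid': _first(r'csid%22%3A%22(.*?)%22%7D'),
        'tag_name': _first(r'tagName=(.*?)&'),
        'tag': _first(r'tag=(.*?)&'),
    }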
class CommentRealTimeUpdateSpider(object): def __init__(self): self._set_logger() self.msg = '' self.debugging_api = self._init_debugging_api() self._set_func_name_dict() self.sql_str = cm_update_str_1 if self._init_debugging_api().get(2): self.my_lg.info('初始化 1688 phantomjs中...') self.ali_1688 = ALi1688CommentParse(logger=self.my_lg) if self._init_debugging_api().get(3) is True \ or self._init_debugging_api().get(4) is True\ or self._init_debugging_api().get(6) is True: self.my_lg.info('初始化 天猫 phantomjs中...') self.tmall = TmallCommentParse(logger=self.my_lg) if self._init_debugging_api().get(7) is True \ or self._init_debugging_api().get(8) is True\ or self._init_debugging_api().get(9) is True\ or self._init_debugging_api().get(10) is True: self.my_lg.info('初始化 京东 phantomjs中...') self.jd = JdCommentParse(logger=self.my_lg) def _set_logger(self): self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/all_comment/实时更新/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR) def _init_debugging_api(self): ''' 用于设置待抓取的商品的site_id :return: dict ''' return { 1: True, 2: True, 3: True, 4: True, 6: True, 7: True, 8: True, 9: True, 10: True, 11: False, 12: False, 13: False, 25: False, } def _set_func_name_dict(self): self.func_name_dict = { 'taobao': 'self._update_taobao_comment({0}, {1}, {2})', 'ali': 'self._update_ali_1688_comment({0}, {1}, {2})', 'tmall': 'self._update_tmall_comment({0}, {1}, {2})', 'jd': 'self._update_jd_comment({0}, {1}, {2})', 'zhe_800': 'self._update_zhe_800_comment({0}, {1}, {2})', 'juanpi': 'self._update_juanpi_comment({0}, {1}, {2})', 'pinduoduo': 'self._update_pinduoduo_comment({0}, {1}, {2})', 'vip': 'self._update_vip_comment({0}, {1}, {2})', } def _just_run(self): while True: #### 更新数据 self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline() # and GETDATE()-a.modify_time>1 try: result = list( self._comment_pipeline._select_table( sql_str=cm_select_str_1, logger=self.my_lg)) except TypeError: self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)') continue self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------') self.my_lg.info(str(result)) self.my_lg.info( '--------------------------------------------------------') self.my_lg.info('待更新个数: {0}'.format(len(result))) self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#')) # 1.淘宝 2.阿里 3.天猫 4.天猫超市 5.聚划算 6.天猫国际 7.京东 8.京东超市 9.京东全球购 10.京东大药房 11.折800 12.卷皮 13.拼多多 14.折800秒杀 15.卷皮秒杀 16.拼多多秒杀 25.唯品会 for index, item in enumerate( result): # item: ('xxxx':goods_id, 'y':site_id) if not self.debugging_api.get(item[1]): self.my_lg.info('api为False, 跳过! 
索引值[%s]' % str(index)) continue if index % 20 == 0: try: del self._comment_pipeline except: pass self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline( ) switch = { 1: self.func_name_dict.get('taobao'), # 淘宝 2: self.func_name_dict.get('ali'), # 阿里1688 3: self.func_name_dict.get('tmall'), # 天猫 4: self.func_name_dict.get('tmall'), # 天猫超市 6: self.func_name_dict.get('tmall'), # 天猫国际 7: self.func_name_dict.get('jd'), # 京东 8: self.func_name_dict.get('jd'), # 京东超市 9: self.func_name_dict.get('jd'), # 京东全球购 10: self.func_name_dict.get('jd'), # 京东大药房 11: self.func_name_dict.get('zhe_800'), # 折800 12: self.func_name_dict.get('juanpi'), # 卷皮 13: self.func_name_dict.get('pinduoduo'), # 拼多多 25: self.func_name_dict.get('vip'), # 唯品会 } # 动态执行 exec_code = compile( switch[item[1]].format(index, item[0], item[1]), '', 'exec') exec(exec_code) sleep(1.1) def _update_taobao_comment(self, index, goods_id, site_id): ''' 处理淘宝的商品comment :param index: 索引 :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): self.my_lg.info('------>>>| 淘宝\t\t索引值(%s)' % str(index)) taobao = TaoBaoCommentParse(logger=self.my_lg) _r = taobao._get_comment_data(goods_id=str(goods_id)) if _r.get('_comment_list', []) != []: if self._comment_pipeline.is_connect_success: self._comment_pipeline._update_table_2( sql_str=self.sql_str, params=self._get_db_update_params(item=_r), logger=self.my_lg) else: self.my_lg.info('该商品_comment_list为空list! 此处跳过!') try: del taobao except: self.my_lg.info('del taobao失败!') gc.collect() else: pass def _update_ali_1688_comment(self, index, goods_id, site_id): ''' 处理阿里1688的商品comment :param index: 索引 :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): self.my_lg.info('------>>>| 阿里1688\t\t索引值(%s)' % str(index)) if index % 5 == 0: try: del self.ali_1688 except: self.my_lg.error('del ali_1688失败!') gc.collect() self.ali_1688 = ALi1688CommentParse(logger=self.my_lg) _r = self.ali_1688._get_comment_data(goods_id=goods_id) if _r.get('_comment_list', []) != []: if self._comment_pipeline.is_connect_success: self._comment_pipeline._update_table_2( sql_str=self.sql_str, params=self._get_db_update_params(item=_r), logger=self.my_lg) else: self.my_lg.info('该商品_comment_list为空list! 此处跳过!') else: pass def _update_tmall_comment(self, index, goods_id, site_id): ''' 处理tmall商品的comment :param index: :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): self.my_lg.info('------>>>| 天猫\t\t索引值(%s)' % str(index)) if site_id == 3: _type = 0 elif site_id == 4: _type = 1 elif site_id == 6: _type = 2 else: return None if index % 5 == 0: try: del self.tmall except: self.my_lg.info('del tmall失败!') gc.collect() self.tmall = TmallCommentParse(logger=self.my_lg) _r = self.tmall._get_comment_data(type=_type, goods_id=str(goods_id)) if _r.get('_comment_list', []) != []: if self._comment_pipeline.is_connect_success: self._comment_pipeline._update_table_2( sql_str=self.sql_str, params=self._get_db_update_params(item=_r), logger=self.my_lg) else: self.my_lg.info('该商品_comment_list为空list! 
此处跳过!') gc.collect() else: pass def _update_jd_comment(self, index, goods_id, site_id): ''' 处理京东商品的comment :param index: :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): self.my_lg.info('------>>>| 京东\t\t索引值(%s)' % str(index)) if index % 5 == 0: try: del self.jd except: self.my_lg.info('del jd失败!') gc.collect() self.jd = JdCommentParse(logger=self.my_lg) _r = self.jd._get_comment_data(goods_id=str(goods_id)) if _r.get('_comment_list', []) != []: if self._comment_pipeline.is_connect_success: self._comment_pipeline._update_table_2( sql_str=self.sql_str, params=self._get_db_update_params(item=_r), logger=self.my_lg) else: self.my_lg.info('该商品_comment_list为空list! 此处跳过!') else: pass def _update_zhe_800_comment(self, index, goods_id, site_id): ''' 处理折800商品的comment :param index: :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): pass else: pass def _update_juanpi_comment(self, index, goods_id, site_id): ''' 处理卷皮商品的comment :param index: :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): pass else: pass def _update_pinduoduo_comment(self, index, goods_id, site_id): ''' 处理拼多多的comment :param index: :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): pass else: pass def _update_vip_comment(self, index, goods_id, site_id): ''' 处理唯品会的comment :param index: :param goods_id: :param site_id: :return: ''' if self.debugging_api.get(site_id): pass else: pass def _get_db_update_params(self, item): return ( item['modify_time'], dumps(item['_comment_list'], ensure_ascii=False), item['goods_id'], ) def __del__(self): try: del self.my_lg del self.msg del self.debugging_api except: pass try: del self._comment_pipeline except: pass try: del self.tmall except: pass gc.collect()
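# The dispatcher above builds source strings per site_id and runs them through compile/exec.
# An equivalent minimal sketch using a plain dict of callables avoids string formatting and
# exec entirely; the handlers dict would be filled with the bound _update_*_comment methods
# of the spider instance (names assumed to match the methods above).
def dispatch_comment_update(handlers: dict, index: int, goods_id: str, site_id: int) -> None:
    """handlers maps site_id -> callable(index, goods_id, site_id); unknown site_ids are skipped."""
    handler = handlers.get(site_id)
    if handler is None:
        return
    handler(index, goods_id, site_id)

# usage sketch:
# handlers = {1: spider._update_taobao_comment, 3: spider._update_tmall_comment, ...}
# dispatch_comment_update(handlers, index, item[0], item[1])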