def _deal_with_data(self):
    """
    Parse and store the scraped pintuan (group-buy) goods data.

    Fetches candidate (zid, page) tuples, skips goods already present in
    the db table, parses each remaining goods page and inserts the result.
    :return: None
    """
    zid_list = self._get_pintuan_goods_info()
    zhe_800_pintuan = Zhe800PintuanParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        sql_str = r'select goods_id, is_delete from dbo.zhe_800_pintuan where site_id=17'
        db_goods_id_list = [
            item[0]
            for item in list(my_pipeline._select_table(sql_str=sql_str))
        ]
        for item in zid_list:
            if item[0] in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(item[0])
            goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)
            zhe_800_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = zhe_800_pintuan.deal_with_data()
            if goods_data == {}:
                # empty parse result -> skip this goods
                continue

            goods_data['goods_id'] = str(item[0])
            goods_data['spider_url'] = tmp_url
            goods_data['username'] = '******'
            goods_data['page'] = str(item[1])
            schedule = goods_data.get('schedule', [])
            if not schedule:
                # BUGFIX: the original indexed schedule[0] unconditionally and
                # raised IndexError for goods without schedule info
                continue
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                self.get_pintuan_begin_time_and_pintuan_end_time(schedule=schedule[0])
            # print(goods_data)
            _r = zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                data=goods_data, pipeline=my_pipeline)
            if _r:
                # remember the freshly inserted id so duplicates within
                # this run are skipped too
                db_goods_id_list.append(item[0])
                db_goods_id_list = list(set(db_goods_id_list))
            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            gc.collect()

    try:
        del zhe_800_pintuan
    except NameError:
        pass
    gc.collect()

    return None
def run_forever():
    """
    Endless real-time update loop for zhe800 pintuan goods.

    Each round: delete expired rows, fetch the goods ids that need a
    refresh, re-scrape each one and update/delete its row, then sleep
    until the next round (long sleep after midnight).
    """
    # compiled once (hoisted out of the loop): a body starting with 'ze'
    # marks a goods page that no longer exists — per the original check
    not_exist_pattern = re.compile(r'^ze')
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=z8_delete_str_1)
            result = list(tmp_sql_server._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # instantiated per goods and deleted afterwards to keep
                # long-run memory usage low
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:
                    # reconnect every 50 goods to avoid a stale
                    # long-lived connection going unresponsive
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=item[0])
                    # special-case: the goods page no longer exists
                    try:
                        if isinstance(tmp_tmp, str) and not_exist_pattern.findall(tmp_tmp) != []:
                            print('@@ 该商品的页面已经不存在!此处将其删除!')
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(item[0],))
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                    except Exception:
                        # best-effort check; never let it abort the loop
                        pass

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        if item[1] == 1:
                            # row flagged as expired -> delete instead of update
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(item[0],))
                            print('该goods_id[{0}]已过期,删除成功!'.format(item[0]))
                        else:
                            print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=tmp_sql_server)
                    # empty data -> nothing to update for this goods
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                try:
                    del zhe_800_pintuan
                except NameError:
                    pass
                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)

            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def get_pintuan_goods_info(self):
    """
    Walk the paged deals.json api, collect (zid, page) tuples for all
    current pintuan goods, then parse and insert every goods that is not
    already stored in the db.
    :return: None
    """
    zid_list = []
    for page in range(0, 100):
        tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(str(page))
        print('正在抓取的页面地址为: ', tmp_url)
        tmp_data = self.get_url_body(tmp_url=tmp_url)
        # print(tmp_data)
        if tmp_data == []:
            # an empty page means the listing is exhausted
            print('该tmp_url得到的object为空list, 此处跳过!')
            break
        tmp_zid_list = [
            (item.get('product', {}).get('zid', ''), page)
            for item in tmp_data
        ]
        # print(tmp_zid_list)
        for item in tmp_zid_list:
            # BUGFIX: the original compared the whole (zid, page) tuple
            # against '' (always true); check the zid element instead so
            # goods without a zid are actually filtered out
            if item[0] != '':
                zid_list.append(item)

    zid_list = list(set(zid_list))
    print('该zid_list的总个数为: ', len(zid_list))
    print(zid_list)

    zhe_800_pintuan = Zhe800PintuanParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        db_goods_id_list = [
            item[0]
            for item in list(my_pipeline.select_zhe_800_pintuan_all_goods_id())
        ]
        for item in zid_list:
            if item[0] in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(item[0])
            goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)
            zhe_800_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = zhe_800_pintuan.deal_with_data()
            if goods_data == {}:
                # empty parse result -> skip this goods
                continue

            goods_data['goods_id'] = str(item[0])
            goods_data['spider_url'] = tmp_url
            goods_data['username'] = '******'
            goods_data['page'] = str(item[1])
            # print(goods_data)
            zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                data=goods_data, pipeline=my_pipeline)
            sleep(.7)

    try:
        del zhe_800_pintuan
    except NameError:
        pass
    gc.collect()
def run_forever():
    """
    Endless real-time update loop for zhe800 pintuan goods (off-shelf variant).

    Each round: delete expired rows, fetch the goods ids that need a
    refresh, re-scrape each one and update it or take it off the shelves,
    then sleep until the next round (long sleep after midnight).
    """
    # compiled once (hoisted out of the loop): a body starting with 'ze'
    # marks a goods page that no longer exists — per the original check
    not_exist_pattern = re.compile(r'^ze')
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            index = 1
            for item in result:
                goods_id = item[0]
                db_is_delete = item[1]
                # instantiated per goods and deleted afterwards to keep
                # long-run memory usage low
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,)
                if index % 300 == 0:
                    # pause for 3 minutes every 300 updated goods
                    sleep_time = 3 * 60
                    # BUGFIX: the original printed this message only AFTER
                    # sleep() had already returned; announce first, then sleep
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # special-case: the goods page no longer exists
                    try:
                        if isinstance(tmp_tmp, str) and not_exist_pattern.findall(tmp_tmp) != []:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,)
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue
                    except Exception:
                        # best-effort check; never let it abort the loop
                        pass

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                        data['goods_id'] = goods_id
                        if db_is_delete == 1:
                            # row flagged as expired -> take it off the shelves
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,)
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)
                    # empty data -> nothing to update for this goods
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                try:
                    del zhe_800_pintuan
                except NameError:
                    pass
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
def run_forever():
    """
    Endless real-time update loop for zhe800 pintuan goods (simple variant).

    Each round: read every stored pintuan goods_id, re-scrape each goods
    and either update its row or delete it when flagged expired, then
    sleep until the next round (long sleep after midnight).
    """
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_zhe_800_pintuan_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # instantiated per goods and collected afterwards to keep
                # long-run memory usage low
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:
                    # reconnect every 50 goods to avoid a stale
                    # long-lived connection going unresponsive
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    zhe_800_pintuan.get_goods_data(goods_id=item[0])
                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        if item[1] == 1:
                            tmp_sql_server.delete_zhe_800_pintuan_expired_goods_id(
                                goods_id=item[0])
                            print('该goods_id[{0}]已过期,删除成功!'.format(item[0]))
                        else:
                            # BUGFIX: the original also called
                            # to_right_and_update_data on the row it had just
                            # deleted as expired; update only non-expired
                            # goods, consistent with the other run_forever
                            # variants in this file
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=tmp_sql_server)
                    # empty data -> nothing to update for this goods
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                gc.collect()
                sleep(.7)

            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:
            # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()