def run_forever():
    """Endlessly migrate 1688 (SiteID=2) goods from the old table into
    dbo.GoodsInfoAutoGet, then sleep and repeat.

    Each pass:
      1. Selects the new-table rows (SiteID=2) and all old-table goods.
      2. Builds the list of old-table goods whose goods_id is not yet in
         the new table.
      3. For each pending goods_id, re-checks the DB for duplicates,
         scrapes the goods data and inserts it via the pipeline.

    Never returns; relies on module-level SqlServerMyPageInfoSaveItemPipeline,
    ALi1688LoginAndParse, get_shanghai_time, sleep and gc.
    """
    while True:  # real-time data update loop
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=2 order by ID desc'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(tmp_sql_server.select_old_table_all_goods_id())
            # print(result_2)
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = []  # keep result_2 defined even when the first select fails

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # Declared inside the loop (and periodically recreated below) so the
            # parser object can be garbage-collected between passes.
            ali_1688 = ALi1688LoginAndParse()

            # goods_id values already present in the new table GoodsInfoAutoGet
            new_table_ali_1688_all_goods_id_list = list(set([item[0] for item in result]))
            print(new_table_ali_1688_all_goods_id_list)
            sleep(2)

            # Old-table rows still needing migration: [clean_url, main_goods_id, goods_id]
            old_table_ali_1688_all_goods_list = []
            for item in result_2:
                tmp_goods_id = ali_1688.get_goods_id_from_url(item[0])
                if tmp_goods_id != '' and tmp_goods_id not in new_table_ali_1688_all_goods_id_list:
                    old_table_ali_1688_all_goods_list.append([
                        'https://detail.1688.com/offer/' + tmp_goods_id + '.html',
                        item[1],
                        tmp_goods_id,
                    ])
                else:
                    print('@@@ 原地址为: ', item[0])
            # print(old_table_ali_1688_all_goods_list)
            print('老表待转数据个数为: ', len(old_table_ali_1688_all_goods_list))
            sleep(2)

            for item in old_table_ali_1688_all_goods_list:  # real-time update per goods
                if index % 10 == 0:
                    # Recreate the parser periodically to limit its memory footprint
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = str(item[2])
                    # print(goods_id)
                    if goods_id in new_table_ali_1688_all_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        index += 1
                        gc.collect()
                        continue  # skip the sleep at the bottom of the loop
                    else:
                        sql_str = r'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s'
                        try:
                            # Duplicates keep showing up, so do an explicit per-id check.
                            # BUGFIX: params must be a tuple — '(goods_id)' was just a
                            # parenthesized string, not a 1-tuple.
                            is_in_db = list(tmp_sql_server._select_table(
                                sql_str=sql_str, params=(goods_id,)))
                        except Exception:
                            # Best-effort: treat a failed check as "not in DB" and proceed
                            is_in_db = []

                        if is_in_db != []:
                            print('该goods_id已经存在于数据库中, 此处跳过!')
                            index += 1
                            gc.collect()
                            continue

                        print('------>>>| 正在插入的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get('before') is False:
                            # Already off-shelf but must still be inserted
                            tt['goods_id'] = goods_id
                            tt['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            tt['username'] = '******'
                            tt['main_goods_id'] = item[1]
                            ali_1688.old_ali_1688_goods_insert_into_new_table(
                                data=tt, pipeline=tmp_sql_server)
                            index += 1
                            gc.collect()
                            sleep(1.2)
                            continue
                        else:
                            pass

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]
                            ali_1688.old_ali_1688_goods_insert_into_new_table(
                                data=data, pipeline=tmp_sql_server)
                        else:
                            # parser returned no data for this goods_id
                            pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                gc.collect()
                sleep(2)
            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever():
    """Endlessly re-sync taobao goods from the old table into the new one.

    Each pass selects all known taobao goods ids (via the SqlPools
    connection pool) plus all old-table goods, then for every old-table
    item whose goods_id is not already known, scrapes the goods data
    with TaoBaoLoginAndParse and inserts it through the pipeline.

    Never returns; relies on module-level SqlPools,
    SqlServerMyPageInfoSaveItemPipeline, TaoBaoLoginAndParse,
    TAOBAO_REAL_TIMES_SLEEP_TIME, get_shanghai_time, sleep and gc.
    """
    while True:  # real-time data update loop
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # sqlalchemy-managed connection pool
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
            # print(result_2)
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = []  # keep result_2 defined even when the select fails

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # Set (was a list) so the per-item membership check below is O(1)
            # instead of O(n) per old-table row.
            new_table_ali_1688_all_goods_id_list = set(item[0] for item in result)

            for item in result_2:  # real-time update per goods
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ 原商品的地址为: ', item[0])
                        continue
                    else:
                        if goods_id in new_table_ali_1688_all_goods_id_list:
                            print('该goods_id已经存在于数据库中, 此处跳过!')
                            continue
                        else:
                            print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                            tt = taobao.get_goods_data(goods_id)
                            if tt.get('is_delete') == 1:
                                # Already off-shelf but must still be inserted
                                tt['goods_id'] = goods_id
                                tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                tt['username'] = '******'
                                tt['main_goods_id'] = item[1]
                                # print('------>>>| 爬取到的数据为: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data=tt, pipeline=tmp_sql_server_2)
                                index += 1
                                gc.collect()
                                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                                continue
                            else:
                                pass

                            data = taobao.deal_with_data(goods_id=goods_id)
                            if data != {}:
                                data['goods_id'] = goods_id
                                data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                data['username'] = '******'
                                data['main_goods_id'] = item[1]
                                # print('------>>>| 爬取到的数据为: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data, pipeline=tmp_sql_server_2)
                            else:
                                pass
                else:  # returned data was empty / connection down
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                gc.collect()
                # On an overseas server this can be shortened, even to 0s;
                # keep it paced so scraping doesn't collide with user requests.
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever():
    """Endlessly re-sync tmall goods (SiteID 3/4/6) from the old table.

    Each pass selects the relevant new-table rows plus all old-table
    goods, then for each old-table item parses the goods_id with
    TmallParse, scrapes the goods data and inserts it through the
    pipeline. get_goods_id_from_url returns a list here —
    [site_flag, goods_id] or [site_flag, goods_id, base_url] —
    where site_flag 0=tmall, 1=tmall supermarket, 2=tmall global.

    Never returns; relies on module-level
    SqlServerMyPageInfoSaveItemPipeline, TmallParse, get_shanghai_time,
    sleep and gc.
    """
    while True:  # real-time data update loop
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=3 or SiteID=4 or SiteID=6 order by ID desc'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(tmp_sql_server.select_old_table_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
            result_2 = []

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # Declared inside the loop (and periodically recreated below) so the
            # parser object can be garbage-collected between passes.
            tmall = TmallParse()

            for item in result_2:  # real-time update per goods
                data = {}
                if index % 5 == 0:
                    # Recreate the parser periodically to limit its memory footprint
                    tmall = TmallParse()
                    gc.collect()

                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = tmall.get_goods_id_from_url(item[0])
                    if goods_id == []:  # parse failure yields an empty list
                        print('@@@ 原地址为: ', item[0])
                        continue
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id[1], index))
                        data = tmall.get_goods_data(goods_id=goods_id)
                        if isinstance(data, int):  # int is an error code from the parser
                            continue

                        if data.get('is_delete') == 1:
                            # Already off-shelf but must still be inserted
                            data['goods_id'] = goods_id[1]
                            # Rebuild a clean canonical URL based on which tmall
                            # flavour the goods_id was parsed as.
                            if goods_id[0] == 0:        # [0, '1111'] -> plain tmall
                                wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 1:      # [1, '1111'] -> tmall supermarket
                                wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 2:      # [2, '1111', 'https://xxxxx'] -> tmall global
                                wait_to_deal_with_url = str(goods_id[2]) + '?id=' + goods_id[1]
                            else:
                                continue
                            data['goods_url'] = wait_to_deal_with_url
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]
                            # print('------>>>| 爬取到的数据为: ', data)
                            # Renamed from 'result' to avoid clobbering the outer
                            # DB result list.
                            insert_result = tmall.old_tmall_goods_insert_into_new_table(
                                data, pipeline=tmp_sql_server)
                            if insert_result is False:
                                print('出错商品的地址为: ', item[0])
                            else:
                                pass
                            index += 1
                            gc.collect()
                            sleep(1.2)
                            continue
                        else:
                            pass

                        data = tmall.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id[1]
                            if goods_id[0] == 0:        # [0, '1111'] -> plain tmall
                                wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 1:      # [1, '1111'] -> tmall supermarket
                                wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
                            elif goods_id[0] == 2:      # [2, '1111', 'https://xxxxx'] -> tmall global
                                # BUGFIX: this branch previously concatenated the base
                                # url and goods_id without '?id=', unlike the identical
                                # branch above; made consistent.
                                wait_to_deal_with_url = str(goods_id[2]) + '?id=' + goods_id[1]
                            else:
                                continue
                            data['goods_url'] = wait_to_deal_with_url
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]
                            # print('------>>>| 爬取到的数据为: ', data)
                            tmall.old_tmall_goods_insert_into_new_table(
                                data, pipeline=tmp_sql_server)
                        else:
                            # parser returned no data for this goods_id
                            pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                gc.collect()
                sleep(2)
            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()