def deal_with_data(self, goods_list):
    '''
    Parse and store the pintuan (group-buy) goods data.

    :param goods_list: iterable of dicts from the list API; each item is
        expected to carry 'goods_id', 'sub_title' and 'pid' keys.
    :return: None
    '''
    mia = MiaPintuanParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # goods_id values already stored, used to skip duplicate inserts.
        db_goods_id_list = [item[0] for item in list(my_pipeline.select_mia_pintuan_all_goods_id())]
        # print(db_goods_id_list)
        # Hoisted out of the loop (loop-invariant). Dots are escaped: the
        # old unescaped pattern let '.' match any character.
        hk_pattern = re.compile(r'://m\.miyabaobei\.hk/')
        for item in goods_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            goods_id = str(item.get('goods_id', ''))
            mia.get_goods_data(goods_id=goods_id)
            goods_data = mia.deal_with_data()
            if goods_data == {}:
                # Parser returned no data -> skip this goods.
                continue

            # BUGFIX: the original tested `findall(...) != ''`, comparing a
            # list with a string — always True — so *every* goods was given
            # a miyabaobei.hk url. Use list truthiness instead.
            goods_url = goods_data['goods_url']
            if hk_pattern.findall(goods_url):
                goods_url = 'https://www.miyabaobei.hk/item-' + goods_id + '.html'
            else:
                goods_url = 'https://www.mia.com/item-' + goods_id + '.html'
            goods_data['goods_url'] = goods_url
            goods_data['goods_id'] = goods_id
            goods_data['sub_title'] = item.get('sub_title', '')
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                self.get_pintuan_begin_time_and_pintuan_end_time(
                    pintuan_time=goods_data['pintuan_time'])
            goods_data['pid'] = item.get('pid')
            # pprint(goods_data)
            # print(goods_data)
            mia.insert_into_mia_pintuan_table(data=goods_data, pipeline=my_pipeline)
            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle requests
    else:
        print('数据库连接失败,此处跳过!')

    try:
        del mia
    except NameError:  # narrowed from bare except: only "name not bound" is expected
        pass
    gc.collect()
def run_forever(self):
    '''
    Continuously refresh the stored pintuan (group-buy) goods in real time.

    Selects every goods row for site_id=21, then per row: deletes it when
    the deal has expired (is_recent_time == 0), leaves it alone when it is
    outside the update window (== 2), and re-crawls/updates it otherwise.
    Finally sleeps and lets the caller loop again.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = 'select goods_id, miaosha_time, pid from dbo.mia_pintuan where site_id=21'
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:
            # item = (goods_id, miaosha_time json string, pid)
            pintuan_end_time = json.loads(item[1]).get('end_time')
            # Convert 'YYYY-mm-dd HH:MM:SS' into a 10-digit unix timestamp.
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            mia_pintuan = MiaPintuanParse()
            if index % 50 == 0:  # reconnect every 50 rows so one long-lived connection cannot hang unanswered
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # Expired deal: remove the row.
                    # NOTE(review): (item[0]) is NOT a 1-tuple — likely meant
                    # (item[0],); confirm _delete_table accepts a bare scalar.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 拼团开始时间为(%s), 删除成功!' %
                        json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # break  # leave the loop
                    # `pass`, not `break`: rows are not returned in order, so
                    # later rows may still fall in the update window.
                    pass
                else:  # returns 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    # print('------>>>| 爬取到的数据为: ', data)
                    tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
                        item[2]) + '/0/'
                    # print(tmp_url)
                    body = MyRequests.get_url_body(url=tmp_url,
                                                   headers=self.headers,
                                                   had_referer=True)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            tmp_data = json.loads(body)
                        except:
                            tmp_data = {}
                            print('json.loads转换body时出错, 此处跳过!')
                        if tmp_data.get('data_list', []) == []:
                            print('得到的data_list为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str,
                                params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            data_list = [{
                                'goods_id': item_2.get('sku', ''),
                                'sub_title': item_2.get('intro', ''),
                            } for item_2 in tmp_data.get('data_list', [])]
                            # pprint(data_list)
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in data_list
                            ]
                            # print(pintuan_goods_all_goods_id)
                            # Mia pintuan does NOT delete goods missing from
                            # the page api: every non-expired goods is updated,
                            # because matching on pid proved to mis-delete
                            # goods that were still in an active group-buy.
                            if item[0] not in pintuan_goods_all_goods_id:  # dropped from the page listing
                                # print('该商品已被下架限时秒杀活动,此处将其删除')
                                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                # pass
                                # Update unconditionally (see note above).
                                mia_pintuan.get_goods_data(
                                    goods_id=item[0])
                                goods_data = mia_pintuan.deal_with_data()
                                if goods_data == {}:  # empty data -> skip
                                    pass
                                else:
                                    goods_data['goods_id'] = str(item[0])
                                    if goods_data['pintuan_time'] == {}:
                                        # No pintuan time left means the
                                        # group-buy ended: store "now" for
                                        # both begin and end so the row is
                                        # treated as finished.
                                        now_time = get_shanghai_time()
                                        goods_data[
                                            'pintuan_begin_time'], goods_data[
                                                'pintuan_end_time'] = (
                                                    now_time, now_time)
                                    else:
                                        goods_data[
                                            'pintuan_begin_time'], goods_data[
                                                'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                                    pintuan_time=goods_data[
                                                        'pintuan_time'])
                                    # pprint(goods_data)
                                    # print(goods_data)
                                    mia_pintuan.update_mia_pintuan_table(
                                        data=goods_data,
                                        pipeline=tmp_sql_server)
                                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                            else:  # still listed on the page
                                for item_2 in data_list:
                                    if item_2.get('goods_id', '') == item[0]:
                                        mia_pintuan.get_goods_data(
                                            goods_id=item[0])
                                        goods_data = mia_pintuan.deal_with_data(
                                        )
                                        if goods_data == {}:  # empty data -> skip
                                            pass
                                        else:
                                            goods_data['goods_id'] = str(
                                                item[0])
                                            goods_data[
                                                'sub_title'] = item_2.get(
                                                    'sub_title', '')
                                            if goods_data[
                                                    'pintuan_time'] == {}:
                                                # Group-buy ended (see note
                                                # above): store "now" twice.
                                                now_time = get_shanghai_time(
                                                )
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = (
                                                            now_time,
                                                            now_time)
                                            else:
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                                            pintuan_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                            # pprint(goods_data)
                                            # print(goods_data)
                                            mia_pintuan.update_mia_pintuan_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                            sleep(MIA_SPIKE_SLEEP_TIME
                                                  )  # throttle
                                    else:
                                        pass
            else:  # db returned no data / connection failed
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updating after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever(self):
    '''
    Continuously refresh the stored pintuan (group-buy) goods in real time.

    Fetches the stale rows via self._get_db_old_data(), then per row:
    shelves the goods when the deal expired (is_recent_time == 0), skips it
    when outside the update window (== 2), and re-crawls/updates it
    otherwise. Sleeps between passes.
    :return: None
    '''
    result = self._get_db_old_data()
    if result is None:
        sleep_time = 20
        print('获取db数据失败, 休眠{}s ...'.format(sleep_time))
        sleep(sleep_time)
        return None

    index = 1
    for item in result:
        # item = (goods_id, miaosha_time json string, pid)
        goods_id = item[0]
        pid = item[2]
        # end_time looks like '2020-04-12 00:00:00'
        pintuan_end_time = json_2_dict(item[1]).get('end_time')
        pintuan_end_time = datetime_to_timestamp(
            string_to_datetime(pintuan_end_time))
        # print(pintuan_end_time)
        data = {}
        # Re-establish the db connection every 50 rows (remainder=50).
        self.sql_cli = _block_get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              remainder=50)
        if self.sql_cli.is_connect_success:
            is_recent_time = self.is_recent_time(pintuan_end_time)
            if is_recent_time == 0:
                # Price restored / deal expired: take the goods off the shelves.
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    update_sql_str=mia_update_str_7,
                    sql_cli=self.sql_cli)
                print('该goods拼团开始时间为({})'.format(
                    json.loads(item[1]).get('begin_time')))
                sleep(.4)
            elif is_recent_time == 2:
                # Expired but still in the waiting range: no deletion yet
                # (handled only once it is <= 24h away).
                pass
            else:  # returns 1: inside the update window
                print(
                    '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                    .format(goods_id, index))
                data['goods_id'] = goods_id
                try:
                    data_list = get_mia_pintuan_one_page_api_goods_info(
                        page_num=pid)
                except ResponseBodyIsNullStrException:
                    index += 1
                    sleep(.4)
                    continue

                # TODO Shelving on an empty data_list wrongly took down goods
                # still on sale, so the check is disabled and every goods is
                # updated unconditionally.
                # try:
                #     assert data_list != [], 'data_list不为空list!'
                # except AssertionError as e:
                #     print(e)
                #     _handle_goods_shelves_in_auto_goods_table(
                #         goods_id=goods_id,
                #         update_sql_str=mia_update_str_7,
                #         sql_cli=self.sql_cli)
                #     sleep(.4)
                #     index += 1
                #     continue

                pintuan_goods_all_goods_id = [
                    item_1.get('goods_id', '') for item_1 in data_list
                ]
                # print(pintuan_goods_all_goods_id)
                # Mia pintuan does NOT act on goods missing from the page api:
                # every non-expired goods is updated, because matching on pid
                # proved to mis-delete goods still in an active group-buy.
                mia_pt = MiaPintuanParse(is_real_times_update_call=True)
                if goods_id not in pintuan_goods_all_goods_id:  # dropped from the page listing
                    # Update unconditionally (see note above).
                    try:
                        goods_data = self._get_mia_pt_one_goods_info(
                            mia_pt_obj=mia_pt,
                            goods_id=goods_id,
                        )
                    except AssertionError:
                        # Empty data returned -> skip this goods.
                        index += 1
                        continue
                    # pprint(goods_data)
                    mia_pt.update_mia_pintuan_table(data=goods_data,
                                                    pipeline=self.sql_cli)
                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                else:  # still listed on the page
                    for item_2 in data_list:
                        if item_2.get('goods_id', '') == goods_id:
                            sub_title = item_2.get('sub_title', '')
                            try:
                                goods_data = self._get_mia_pt_one_goods_info(
                                    mia_pt_obj=mia_pt,
                                    goods_id=goods_id,
                                    sub_title=sub_title,
                                )
                            except AssertionError:
                                # Empty data -> skip this page entry only
                                # (continues the inner for loop).
                                continue
                            # pprint(goods_data)
                            mia_pt.update_mia_pintuan_table(
                                data=goods_data, pipeline=self.sql_cli)
                            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
                        else:
                            pass
                try:
                    del mia_pt
                except:
                    pass
        else:  # db returned no data / connection failed
            print('数据库连接失败,数据库可能关闭或者维护中')
            pass
        index += 1
        collect()

    print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updating after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(10 * 60)
    collect()
def deal_with_data(self, goods_list):
    '''
    Parse and store the pintuan (group-buy) goods data.

    :param goods_list: iterable of dicts from the list API; each item is
        expected to carry 'goods_id', 'sub_title' and 'pid' keys.
    :return: None
    '''
    mia = MiaPintuanParse(is_real_times_update_call=True)
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # goods_id values already stored, used to skip duplicate inserts.
        _ = list(my_pipeline._select_table(sql_str=mia_select_str_1))
        db_goods_id_list = [item[0] for item in _]
        # print(db_goods_id_list)
        # Hoisted out of the loop (loop-invariant). Dots are escaped: the
        # old unescaped pattern let '.' match any character.
        hk_pattern = re.compile(r'://m\.miyabaobei\.hk/')
        for item in goods_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            goods_id = str(item.get('goods_id', ''))
            mia.get_goods_data(goods_id=goods_id)
            goods_data = mia.deal_with_data()
            if goods_data == {}:
                # Parser returned no data -> skip this goods.
                continue

            # BUGFIX: the original tested `findall(...) != ''`, comparing a
            # list with a string — always True — so *every* goods was given
            # a miyabaobei.hk url. Use list truthiness instead.
            goods_url = goods_data['goods_url']
            if hk_pattern.findall(goods_url):
                # The address changed: m.miyabaobei.hk replaced the old
                # www.miyabaobei.hk item url.
                goods_url = 'https://m.miyabaobei.hk/item-{}.html'.format(
                    goods_id)
            else:
                goods_url = 'https://m.mia.com/item-' + str(
                    goods_id) + '.html'
            goods_data['goods_url'] = goods_url
            goods_data['goods_id'] = goods_id
            goods_data['sub_title'] = item.get('sub_title', '')
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])
            goods_data['pid'] = item.get('pid')
            # pprint(goods_data)
            _r = mia.insert_into_mia_pintuan_table(data=goods_data,
                                                   pipeline=my_pipeline)
            if _r and goods_id not in db_goods_id_list:
                # Remember the id so later duplicates in goods_list are skipped.
                db_goods_id_list.append(goods_id)
            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle requests
    else:
        print('数据库连接失败,此处跳过!')

    try:
        del mia
    except NameError:  # narrowed from bare except: only "name not bound" is expected
        pass
    collect()