def _set_logger(self, logger): if logger is None: self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/_/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR) else: self.my_lg = logger
def _set_logger(self, logger): if logger is None: self.my_lg = set_logger( log_file_name=MY_SPIDER_LOGS_PATH + '/聚美优品/拼团/' + self.get_log_file_name_from_time() + '.txt', console_log_level=INFO, file_log_level=ERROR) else: self.my_lg = logger
def _set_logger(self, logger): if logger is None: self.lg = set_logger(logger_name=get_uuid1(), log_file_name=MY_SPIDER_LOGS_PATH + '/聚美优品/拼团/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR) else: self.lg = logger
def _set_logger(self, logger): ''' 设置logger :param logger: :return: ''' if logger is None: self.my_lg = set_logger( log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/comment/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR ) else: self.my_lg = logger
async def run_forever():
    """Single realtime-update pass for 淘宝 天天特价 (Taobao daily-special) goods.

    Selects every candidate row, then per row either skips it (already
    flagged is_delete=1), demotes it (special-price window expired) or
    re-crawls and updates it.  Returns True on a completed pass, None
    when the initial DB query fails.
    """
    #### realtime data update
    # NOTE: the logger must be created per call (not as a global), otherwise
    # every day would keep appending to the same log file.
    my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' + str(get_shanghai_time())[0:10] + '.txt',
                       console_log_level=INFO,
                       file_log_level=ERROR)
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # off-shelf goods are not processed, so the query filters is_delete=0
    try:
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        # _select_table returned None (DB down / maintenance) -> list(None) raises
        my_lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None

    my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
    my_lg.info(str(result))
    my_lg.info('--------------------------------------------------------')
    my_lg.info('待更新的goods_id个数: {0}'.format(len(result)))
    my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
    index = 1
    for item in result:  # realtime update, one goods per row
        if index % 50 == 0:  # re-establish the DB connection every 50 items
            my_lg.info('正在重置,并与数据库建立新连接中...')
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            my_lg.info('与数据库的新连接成功建立...')
        if tmp_sql_server.is_connect_success:
            tejia_end_time = item[2]  # end of the special-price window
            if item[1] == 1:
                # row already flagged is_delete=1: log and skip, do NOT delete
                my_lg.info(
                    '&&&&&& 该商品({0})原先状态为is_delete=1, 不进行实际删除操作! 索引为({1})'.
                    format(item[0], str(index)))
                index += 1
                pass
            elif tejia_end_time < datetime.datetime.now():
                # expired window: demote to a normal hot-promo goods instead of deleting
                # NOTE(review): compares against naive local server time while the
                # rest of the file uses get_shanghai_time() -- confirm server tz.
                index = await update_expired_goods_to_normal_goods(
                    goods_id=item[0],
                    index=index,
                    tmp_sql_server=tmp_sql_server,
                    logger=my_lg)
                pass
            else:
                # still inside the special-price window: re-crawl and update
                ''' ** 天天特价 never pulls on-shelf goods early, so the
                special-price time window itself is NOT re-synced here. '''
                # (large block of commented-out legacy code removed: it re-checked
                # per-category early off-shelf status via TaoBaoTianTianTeJia api
                # calls; dropped since the platform never pulls goods early.)
                my_lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (item[0], str(index)))
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                taobao.get_goods_data(item[0])
                goods_data = taobao.deal_with_data(goods_id=item[0])
                if goods_data != {}:
                    goods_data['goods_id'] = item[0]
                    '''the shelf/off-shelf schedule is deliberately not updated'''
                    if goods_data.get('is_delete', 0) == 1:
                        my_lg.info('@该商品已下架...')
                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)
                else:
                    await asyncio.sleep(4)  # crawl failed: back off 4 seconds
                    pass
                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                gc.collect()
        else:
            # DB connection unavailable for this item
            my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        gc.collect()
    my_lg.info('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # after midnight: no further updates
        # sleep(60 * 60 * .5)
        pass
    else:
        sleep(5)
    gc.collect()
    return True
def run_forever():
    """Endless realtime-update loop for 天猫 goods (SiteID 3/4/6).

    Each pass: fresh daily logger, select all candidate rows via an
    inline SQL string, re-crawl every goods, push price / shelf-time
    changes back to the DB, then sleep (long sleep after midnight).
    """
    while True:
        # logger rebuilt per pass so each day gets its own file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )
        #### realtime data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>0.2
        sql_str = '''
        select SiteID, GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet
        where (SiteID=3 or SiteID=4 or SiteID=6) and MainGoodsID is not null
        order by ID desc'''
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            # _select_table returned None (db down / maintenance)
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # declared once and periodically re-created to release memory
            tmall = TmallParse(logger=my_lg)
            for item in result:  # realtime update, one goods per row
                if index % 5 == 0:  # recycle the parser every 5 items
                    try:
                        del tmall
                    except:
                        pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()
                if index % 10 == 0:  # reconnect every 10 items (avoid stale conns)
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    # map db SiteID -> the type index TmallParse expects
                    tmp_item = []
                    if item[0] == 3:
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    data = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(data, int):  # api returned a status code, eg. 4041
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue
                    if data.get('is_delete') == 1:  # deleted goods: short-circuit update
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        gc.collect()
                        continue
                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price']
                        )
                        # my_lg.info(str(data['_is_price_change']) + ' ' + str(data['_price_change_info']))
                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        # empty parse result: back off
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # after midnight: long sleep, no updates
            sleep(60*60*5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever():
    """Endless realtime-update loop for 蜜芽 (Mia) goods.

    Per pass: build a uniquely named daily logger, select candidate
    rows via mia_select_str_5, re-crawl each goods and write changes
    back; deleted goods are taken off-shelf directly.
    """
    while True:
        # logger rebuilt per pass so each day gets its own file
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )
        #### realtime data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            mia = MiaParse()
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:  # recycle the parser every 5 items
                    try:
                        del mia
                    except:
                        pass
                    mia = MiaParse()
                    collect()
                # helper reconnects every `remainder` items
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli, index=index, logger=my_lg, remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:  # deleted goods: take off-shelf
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): this `continue` skips the index += 1
                            # below, so the recycle cadence stalls on deleted goods.
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='mia',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # empty parse result: back off
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # after midnight: long sleep, no updates
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        try:
            del my_lg
        except:
            pass
        collect()
from api import IpPoolsObj
from fzutils.log_utils import set_logger
from fzutils.time_utils import get_shanghai_time
from fzutils.data.pickle_utils import (
    deserializate_pickle_object,
    serialize_obj_item_2_dict,
)
from fzutils.safe_utils import get_uuid3
from fzutils.sql_utils import BaseRedisCli
from fzutils.data.list_utils import list_remove_repeat_dict
from fzutils.linux_utils import _get_simulate_logger
from fzutils.ip_utils import get_local_external_network_ip

# daily module-level logger for the proxy-pool process
lg = set_logger(log_file_name=SPIDER_LOG_PATH + str(get_shanghai_time())[0:10] + '.log',
                console_log_level=INFO,
                file_log_level=ERROR)
redis_cli = BaseRedisCli()

# redis key holding the raw proxy_list
_key = get_uuid3(proxy_list_key_name)
# redis key holding the high-quality proxy list
_h_key = get_uuid3(high_proxy_list_key_name)
# this host's external network ip (filled in elsewhere)
local_ip = ''
# ip pool obj
ip_pools_obj = IpPoolsObj(_k=high_proxy_list_key_name)

def get_proxy_process_data():
    '''
    Crawl proxies and refresh the values stored in redis.
    (Body continues beyond this chunk.)
    :return:
    '''
async def run_forever():
    """Single realtime-update pass for 淘宝 天天特价 goods (newer revision).

    Expired special-price goods are taken off-shelf directly (instead
    of being demoted as in the older revision); active ones are
    re-crawled and updated.  Returns True on a completed pass, None
    when the initial DB query fails.
    """
    #### realtime data update
    # NOTE: the logger must be created per call, not as a global, otherwise
    # every day would keep appending to the same log file.
    lg = set_logger(logger_name=get_uuid1(),
                    log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' + str(get_shanghai_time())[0:10] + '.txt',
                    console_log_level=INFO,
                    file_log_level=ERROR)
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        # TODO: expired rows are deliberately NOT pre-deleted here -- the
        # backend does not sync off-shelf state, which would make them
        # unfindable afterwards.
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None
    await _print_db_old_data(
        result=result,
        logger=lg,
    )
    index = 1
    for item in result:
        goods_id = item[0]
        tejia_end_time = item[2]  # end of the special-price window
        # helper reconnects periodically based on index
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if tmp_sql_server.is_connect_success:
            if tejia_end_time < get_shanghai_time():
                # expired window: take the goods off-shelf directly
                # (the older "demote to normal promo goods" path is retired)
                lg.info('@@ 过期下架[goods_id: {}]'.format(goods_id))
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=lg,
                    update_sql_str=tb_update_str_5,
                )
                index += 1
            else:
                # still inside the special-price window: re-crawl and update
                ''' ** 天天特价 never pulls on-shelf goods early, so the
                special-price time window itself is NOT re-synced here. '''
                # (large block of commented-out legacy code removed: it
                # re-checked per-category early off-shelf status via
                # TaoBaoTianTianTeJia api calls.)
                lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (goods_id, str(index)))
                taobao = TaoBaoLoginAndParse(
                    logger=lg,
                    is_real_times_update_call=is_real_times_update_call)
                taobao.get_goods_data(goods_id)
                goods_data = taobao.deal_with_data(goods_id=goods_id)
                if goods_data != {}:
                    goods_data['goods_id'] = goods_id
                    if goods_data.get('is_delete', 0) == 1:
                        lg.info('@该商品已下架...')
                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)
                else:
                    await async_sleep(4)  # crawl failed: back off 4 seconds
                await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                collect()
        else:
            lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        collect()
    lg.info('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # after midnight: long pause, no updates
        # sleep(60 * 60 * .5)
        await async_sleep(5 * 60)
    else:
        await async_sleep(60 * 1)
    collect()
    return True
def _set_logger(self):
    """Create ``self.my_lg`` logging to today's 折800/秒杀实时更新 file.

    Note: ``file_log_level`` is INFO here, more verbose than the ERROR
    level used by the sibling spiders.
    """
    today = str(get_shanghai_time())[0:10]
    self.my_lg = set_logger(
        log_file_name=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/' + today + '.txt',
        console_log_level=INFO,
        file_log_level=INFO,
    )
import requests
from pprint import pprint
from json import loads, dumps
import re
import asyncio
from logging import INFO, ERROR
from fzutils.log_utils import set_logger
from fzutils.time_utils import get_shanghai_time
from fzutils.cp_utils import get_taobao_sign_and_body

# base directory holding all spider logs for this project
MY_SPIDER_LOGS_PATH = '/Users/afa/myFiles/my_spider_logs/电商项目'

# daily module-level logger (淘宝/微淘)
my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/' + str(get_shanghai_time())[0:10] + '.txt',
                   console_log_level=INFO,
                   file_log_level=ERROR)

# request headers for the weitao h5 api
# NOTE(review): the '¶ms' in the referer below looks like a mangled '&params'
# from a copy/paste -- confirm against a live capture before relying on it.
headers = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'accept': '*/*',
    'referer': 'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33¶ms=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome',
    'authority': 'h5api.m.taobao.com',
    # the cookie header MUST stay commented out, otherwise the request is
    # treated as illegal by the api.  (A long captured cookie value that was
    # kept here as a comment has been removed.)
}
def run_forever():
    """Endless realtime-update loop for 天猫 goods (Tmall / TmallHK / TmallGJ).

    Per pass: build the daily logger, select all candidate rows via
    tm_select_str_3, re-crawl each goods, recompute price / sku change
    records and write them back, then sleep (long sleep after midnight).

    Fixes vs. previous revision:
      * the api result was read with ``oo.get(...)`` BEFORE the
        ``isinstance(oo, int)`` guard, so a 4041 int status code crashed
        with AttributeError;
      * the key was misspelled ``'is_detele'``, which made
        ``oo_is_delete`` always 0 and forced a pointless 8s sleep for
        deleted goods.
    """
    while True:
        # logger rebuilt per pass so each day gets its own file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### realtime data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=tm_select_str_3))
        except TypeError:
            # _select_table returned None (db down / maintenance)
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # declared once and periodically re-created to release memory
            tmall = TmallParse(logger=my_lg)
            for item in result:  # realtime update, one goods per row
                if index % 5 == 0:  # recycle the parser every 5 items
                    try:
                        del tmall
                    except:
                        pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()
                if index % 10 == 0:  # reconnect every 10 items (avoid stale conns)
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    # map db SiteID -> the type index TmallParse expects
                    tmp_item = []
                    if item[0] == 3:
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])

                    oo = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(oo, int):  # eg. 4041: must be checked before any dict access
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue
                    # deleted goods parse to {} below; remember the flag so we
                    # don't burn 8s sleeping on an expected-empty result
                    oo_is_delete = oo.get('is_delete', 0)

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        site_id = tmall._from_tmall_type_get_site_id(type=data['type'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[7]),
                                site_id=site_id)
                        except AttributeError:  # already formatted on an earlier pass
                            old_sku_info = item[7]
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['price_info_list'], site_id=site_id),
                            is_price_change=item[8] if item[8] is not None else 0)
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        if oo_is_delete == 1:
                            pass  # expected empty parse for a deleted goods
                        else:
                            my_lg.info('------>>>| 休眠8s中...')
                            sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # after midnight: long sleep, no updates
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever():
    """Endless realtime-update loop for 网易考拉 (Kaola) goods.

    Per pass: daily logger, select candidate rows via kl_select_str_1,
    re-crawl each goods, apply change records and write back; deleted
    goods are taken off-shelf.

    Fix vs. previous revision: ``my_lg.error(exc_info=True)`` omitted
    the required ``msg`` positional of ``Logger.error`` and therefore
    raised TypeError itself instead of logging the traceback.
    """
    while True:
        # logger rebuilt per pass so each day gets its own file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### realtime data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # declared once and periodically re-created to release memory
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            for item in result:  # realtime update, one goods per row
                goods_id = item[1]
                if index % 5 == 0:  # recycle the parser every 5 items
                    try:
                        del kaola
                    except:
                        pass
                    kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
                    collect()
                # helper reconnects every `remainder` items
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item, logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:  # deleted goods: short-circuit update
                        data['goods_id'] = goods_id
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=db_goods_info_obj.is_delete,
                            shelf_time=db_goods_info_obj.shelf_time,
                            delete_time=db_goods_info_obj.delete_time,
                        )
                        try:
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                        except Exception:
                            # fix: Logger.error() requires a msg argument;
                            # exc_info alone raised TypeError here before
                            my_lg.error('遇到错误:', exc_info=True)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): this `continue` skips the index += 1
                            # below, stalling the recycle cadence on runs of
                            # deleted goods -- preserved as-is.
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='kl',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # empty parse result: back off briefly
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # after midnight: long sleep, no updates
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def run_forever():
    """Endless realtime-update loop for 网易严选 (YanXuan) goods (newer revision).

    Per pass: daily logger, select candidate rows via yx_select_str_1,
    re-crawl each goods and write change records back; deleted goods are
    marked via yx_update_str_2.
    """
    while True:
        # logger rebuilt per pass so each day gets its own file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )
        #### realtime data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # declared once and periodically re-created to release memory
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # realtime update, one goods per row
                if index % 5 == 0:  # recycle the parser every 5 items
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()
                # helper reconnects every `remainder` items
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])
                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:  # deleted goods: mark off-shelf
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): skips the index += 1 below
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='yx',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                            yanxuan.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # empty parse result: back off
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # after midnight: long sleep, no updates
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def run_forever():
    """Endless realtime-update loop for 网易严选 goods (older revision).

    Unlike the newer revision this computes the price / sku change
    records inline (site_id 30) instead of delegating to
    get_goods_info_change_data().
    """
    while True:
        # logger rebuilt per pass so each day gets its own file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )
        #### realtime data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # declared once and periodically re-created to release memory
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # realtime update, one goods per row
                if index % 5 == 0:  # recycle the parser every 5 items
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    gc.collect()
                if index % 10 == 0:  # reconnect every 10 items (avoid stale conns)
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])
                    data = yanxuan._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        if data.get('is_delete') == 1:  # deleted goods: mark off-shelf
                            my_lg.info('@@@ 该商品已下架...')
                            tmp_sql_server._update_table_2(sql_str=yx_update_str_2,
                                                           params=(item[1],),
                                                           logger=my_lg)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): skips the index += 1 below
                            continue
                        else:
                            data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price']
                            )
                            try:
                                old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[7]),
                                                                      site_id=30)
                            except AttributeError:  # already formatted on an earlier pass
                                old_sku_info = item[7]
                            data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(data['price_info_list'], site_id=30),
                                is_price_change=item[8] if item[8] is not None else 0
                            )
                            yanxuan.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        # empty parse result: back off
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # after midnight: long sleep, no updates
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
def run_forever():
    """Endless realtime-update loop for 1688 goods.

    Per pass: daily logger, select candidate rows via al_select_str_6,
    re-crawl each goods (site_id 2), recompute price / sku change
    records and write them back.
    """
    while True:
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/1688/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### realtime data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=al_select_str_6))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # declared once and periodically re-created to release memory
            ali_1688 = ALi1688LoginAndParse(logger=my_lg)
            for item in result:  # realtime update, one goods per row
                if index % 5 == 0:  # recycle the parser every 5 items
                    ali_1688 = ALi1688LoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 items (avoid stale conns)
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'
                        .format(item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:  # api returned a status code, eg. 4041
                        continue
                    else:
                        pass
                    if data.get('is_delete') == 1:
                        # row that was INSERTED in the off-shelf state: update shelf
                        # times directly and move on
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(1.5)  # don't hammer the server
                        index += 1
                        gc.collect()
                        continue
                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        '''price/taobao_price must stay fixed after the first crawl;
                        later changes are recorded in the _price_change_info field'''
                        # business rule: when the backend's modify_time is newer than
                        # the conversion time and is_price_change=1, staff compare
                        # the recorded change before re-pricing.
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=2)
                        except AttributeError:  # already formatted on an earlier pass
                            old_sku_info = item[6]
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['sku_map'], site_id=2),
                            is_price_change=item[7] if item[7] is not None else 0)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(.3)  # don't hammer the server
                    else:
                        # empty parse result: nothing to write
                        pass
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(2.2)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # after midnight: long sleep, no updates
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
app = Flask(__name__, root_path=getcwd()) # 存储tb user_id tb_shop_id_list = [] tb_shop_info_list = [] company_id_bloom_filter = BloomFilter(capacity=500000, error_rate=.00001) # 获取db company_id company_id_bloom_filter = _get_db_company_unique_id_list_by_site_id( site_id=13, bloom_filter=company_id_bloom_filter, )[1] lg = set_logger( logger_name=get_uuid1(), log_file_name='/Users/afa/myFiles/my_spider_logs/fz_server/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=INFO, ) @app.route('/', methods=['GET', 'POST']) def home(): return '欢迎来到 fzhook_server 主页!' @app.route('/dy', methods=['GET', 'POST']) def dy(): """ 获取dy 无水印视频 :return:
    flowfilter,
    io,
)
import random
import typing
from logging import INFO, ERROR
from fzutils.log_utils import set_logger
from fzutils.spider.app_utils import get_mitm_flow_request_headers_user_agent
from fzutils.spider.async_always import *

# directory for mitmproxy tb capture logs
LOG_SAVE_PATH = '/Users/afa/myFiles/my_spider_logs/mitmproxy/tb/'
# mitmproxy's own log facility
logger = ctx.log
# daily file logger (no console level configured here)
lg = set_logger(
    logger_name=get_uuid1(),
    log_file_name=LOG_SAVE_PATH + str(get_shanghai_time())[0:10] + '.txt',
    file_log_level=INFO,
)

class Writer:
    # mitmproxy addon: records a random sample of response flows to *path*.
    def __init__(self, path: str) -> None:
        self.f: typing.IO[bytes] = open(path, "wb")
        self.w = io.FlowWriter(self.f)

    def response(self, flow: http.HTTPFlow) -> None:
        # called by mitmproxy for every completed response
        response = flow.response
        # keep roughly half of all flows (coin flip)
        if random.choice([True, False]):
            self.w.add(flow)
def run_forever():
    """Endless real-time refresh loop for taobao goods stored in the DB.

    Each pass re-creates the logger (one log file per day), pulls the
    candidate goods via raw SQL through a sqlalchemy connection pool,
    re-crawls every goods id and writes the merged result back.  The whole
    process restarts itself at the end of every pass via restart_program().
    """
    while True:
        # Re-created inside the loop on purpose: a module-level logger would
        # keep appending to the first day's file forever.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        sql_str = ''' select GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time from dbo.GoodsInfoAutoGet where SiteID=1 and MainGoodsID is not null order by ID desc'''
        # sqlalchemy-managed connection pool (instead of the plain pipeline).
        tmp_sql_server = SqlPools()
        try:
            result = tmp_sql_server._select_table(sql_str=sql_str, )
        except TypeError:
            # A dead DB connection surfaces here as a TypeError.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # Rebuild the pool every 50 items so one stale long-lived
                    # connection cannot hang the whole pass.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])
                    if data.get('is_delete') == 1:
                        # Item that was already off-shelf when first inserted:
                        # refresh only its shelf/delete timestamps, then skip
                        # the normal parse path.
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # keep the update rate low
                        index += 1
                        gc.collect()
                        continue
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)
                else:
                    # DB went away mid-pass: back off before the next item.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                index += 1
                gc.collect()
                # Spread the updates out so they don't collide with
                # user-facing requests (can be shortened abroad).
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # After midnight: pause updates for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
def run_forever():
    """Endless real-time refresh loop for jd goods stored in the DB.

    Each pass builds a fresh daily logger, selects candidate goods via
    jd_select_str_1, re-crawls each item with a periodically recycled
    JdParse instance, recomputes price/sku change records, and writes the
    result back through the pipeline.
    """
    while True:
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/jd/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        #### real-time update pass
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>1 and IsDelete=0
        try:
            result = list(tmp_sql_server._select_table(sql_str=jd_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            continue
        my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        my_lg.info(str(result))
        my_lg.info('--------------------------------------------------------')
        my_lg.info('总计待更新个数:{}'.format(len(result)))
        my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # Declared once per pass and recycled below to cap memory growth.
        jd = JdParse(logger=my_lg)
        for item in result:
            if index % 10 == 0:
                # Recycle the parser every 10 items to release its memory.
                try:
                    del jd
                except:
                    pass
                gc.collect()
                jd = JdParse(logger=my_lg)
            if index % 50 == 0:
                # Reconnect every 50 items so a single stale long-lived
                # connection cannot stall the pass.
                my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                my_lg.info('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                my_lg.info('------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'.format(item[1], index))
                # Translate the stored site type into JdParse's internal code:
                # 7/8 -> 0, 9 -> 1, 10 -> 2 (anything else gets no type entry).
                type_code_map = {7: 0, 8: 0, 9: 1, 10: 2}
                tmp_item = []
                if item[0] in type_code_map:
                    tmp_item.append(type_code_map[item[0]])
                tmp_item.append(item[1])
                jd.get_goods_data(goods_id=tmp_item)
                data = jd.deal_with_data(goods_id=tmp_item)
                if data != {}:
                    data['goods_id'] = item[1]
                    data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[2],
                        shelf_time=item[5],
                        delete_time=item[6])
                    my_lg.info('上架时间: {0}, 下架时间: {1}'.format(data['shelf_time'], data['delete_time']))
                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[3],
                        old_taobao_price=item[4],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price'])
                    site_id = jd._from_jd_type_get_site_id_value(jd_type=data['jd_type'])
                    try:
                        old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[7]), site_id=site_id)
                    except AttributeError:
                        # Already formatted before being stored.
                        old_sku_info = item[7]
                    data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                        old_sku_info=old_sku_info,
                        new_sku_info=format_price_info_list(data['price_info_list'], site_id=site_id),
                        is_price_change=item[8] if item[8] is not None else 0)
                    jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:
                    # Empty parse result: nothing to write back.
                    pass
            else:
                my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
            sleep(1.2)
        my_lg.info('全部数据更新完毕'.center(100, '#'))
        try:
            del jd
        except:
            pass
        if get_shanghai_time().hour == 0:
            # After midnight: pause updates for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def _set_logger(self):
    """Attach a daily file logger (INFO to console, ERROR to file) to self.my_lg."""
    log_file = MY_SPIDER_LOGS_PATH + '/all_comment/_/' + str(get_shanghai_time())[0:10] + '.txt'
    self.my_lg = set_logger(
        log_file_name=log_file,
        console_log_level=INFO,
        file_log_level=ERROR,
    )
def run_forever():
    """Endless real-time refresh loop for taobao goods (sku-aware variant).

    Like the plain taobao updater, but selects via tb_select_str_3 and also
    maintains the sku change record (get_sku_info_trans_record).  Restarts
    the whole process at the end of every pass via restart_program().
    """
    while True:
        # Re-created inside the loop on purpose: a module-level logger would
        # keep writing to the first day's file forever.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        # sqlalchemy-managed connection pool (instead of the plain pipeline).
        tmp_sql_server = SqlPools()
        try:
            result = tmp_sql_server._select_table(sql_str=tb_select_str_3, )
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # Rebuild the pool every 50 items so one stale long-lived
                    # connection cannot hang the whole pass.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    raw_goods = taobao.get_goods_data(item[0])
                    # Remember the raw is_delete flag so an empty parse result
                    # for an off-shelf item does not trigger the back-off sleep.
                    raw_is_delete = raw_goods.get('is_delete', 0)
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        try:
                            old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[6]), site_id=1)
                        except AttributeError:
                            # Already formatted before being stored.
                            old_sku_info = item[6]
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['price_info_list'], site_id=1),
                            is_price_change=item[7] if item[7] is not None else 0)
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    elif raw_is_delete != 1:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(4)
                else:
                    # DB went away mid-pass: back off before the next item.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                index += 1
                gc.collect()
                # Spread updates out so they don't collide with user-facing
                # requests (can be shortened abroad).
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # After midnight: pause updates for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()