def __init__(self, logger=None):
    """Initialize the parser.

    :param logger: optional pre-configured logger; when None a new file
                   logger is created under MY_SPIDER_LOGS_PATH/阿里1688/comment/.
    """
    super().__init__()
    self.result_data = {}   # last parsed result (dict); reset on every failure path
    self.msg = ''
    if logger is None:
        # Log file named by today's Shanghai date (first 10 chars of the timestamp).
        self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/阿里1688/comment/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR)
    else:
        self.my_lg = logger
    self.my_phantomjs = MyPhantomjs()
    # Code executed dynamically inside the phantomjs driver: click the 2nd
    # filter tab, then scroll down to force lazy content to load.
    # NOTE: this is a runtime payload string — do not reformat/translate.
    self._exec_code = '''
self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click()
sleep(1.5)
# 向下滚动10000像素
js = 'document.body.scrollTop=10000'
self.driver.execute_script(js)
sleep(3)
'''
    # Default request headers; UA drawn uniformly from the HEADERS pool.
    self.headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': HEADERS[randint(0, len(HEADERS) - 1)],
        'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'referer': 'https://detail.1688.com/offer/45579899125.html',
    }
    self.page_size = '30'   # comments requested per page (string, used as a URL param)
def __init__(self, logger=None):
    """Initialize: result holders, logger, headers, phantomjs driver and cookies.

    :param logger: optional logger passed through to self._set_logger.
    """
    self.result_data = {}   # last parsed result (dict)
    self.msg = ''
    self._set_logger(logger)
    self._set_headers()
    self.comment_page_switch_sleep_time = 1.2  # sleep time between comment pages
    self.my_phantomjs = MyPhantomjs()
    self._add_headers_cookies()
def __init__(self, logger=None):
    """Initialize: result holders, logger, headers, phantomjs driver and scratch state.

    :param logger: optional logger passed through to self._set_logger.
    """
    self.result_data = {}   # last parsed result (dict)
    self.msg = ''
    self._set_logger(logger)
    self._set_headers()
    self.page_size = '10'   # comments per page (string, used as a URL param)
    self.comment_page_switch_sleep_time = 1.5  # sleep time between comment pages
    self.my_phantomjs = MyPhantomjs()
    self.g_data = {}                 # scratch data
    self.random_sku_info_list = []   # scratch data (all specs of the current goods)
def __init__(self):
    """Initialize default zhe800.com request headers and the phantomjs driver."""
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Encoding:': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'zhe800.com',
        # Random UA. Derive the upper bound from HEADERS instead of the
        # hard-coded 34 so the index can never go out of range if the pool
        # shrinks, and new entries are actually used (matches the style
        # already used elsewhere in this file).
        'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
    }
    self.my_phantomjs = MyPhantomjs()
def __init__(self):
    """Initialize default mobile.yangkeduo.com request headers and the phantomjs driver."""
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Encoding:': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'mobile.yangkeduo.com',
        # Random UA. Bound derived from HEADERS rather than the magic 34 so
        # the index stays valid for any pool size (consistent with the
        # len(HEADERS)-1 form used elsewhere in this file).
        'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
        # 'Cookie': 'api_uid=rBQh+FoXerAjQWaAEOcpAg==;',  # analysis found this cookie is needed
    }
    self.result_data = {}   # last parsed result (dict)
    # self.set_cookies_key_api_uid()  # set the api_uid value in the cookie
    self.my_phantomjs = MyPhantomjs()
def __init__(self):
    """Initialize default 1688.com request headers, state flags and the phantomjs driver."""
    super().__init__()
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Encoding:': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': '1688.com',
        # Random UA. Use len(HEADERS)-1 instead of the hard-coded 34 so the
        # index is always in range and the whole pool is sampled (consistent
        # with the other initializers in this file).
        'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
    }
    self.result_data = {}          # last parsed result (dict)
    self.is_activity_goods = False # whether the current goods is an activity item
    self.my_phantomjs = MyPhantomjs()
async def get_pintuan_goods_info(self):
    '''
    Build the data urls and collect all recent flash group-buy goods.

    Iterates every category in self.tab_dict, fetches up to 19 pages per
    category, and de-duplicates items by goods_id before appending.
    :return: list of goods dicts (deduped by goods_id)
    '''
    s_time = time.time()
    goods_list = []
    my_phantomjs = MyPhantomjs()
    for key in self.tab_dict:
        self.msg = '正在抓取的分类为: ' + key
        self.my_lg.info(self.msg)
        for index in range(1, 20):
            item_list = await self.get_one_page_goods_list(my_phantomjs=my_phantomjs, key=key, tab=self.tab_dict[key], index=index)
            # Snapshot of goods_ids already collected — used to skip duplicates.
            all_goods_id = list(set([s.get('goods_id', '') for s in goods_list]))
            for item in item_list:
                if item.get('goods_id', '') not in all_goods_id:
                    goods_list.append(item)
            # await asyncio.sleep(.5)
    # Release the shared phantomjs driver as soon as crawling is done.
    try:
        del my_phantomjs
    except:
        pass
    self.my_lg.info(str(goods_list))
    self.my_lg.info('本次抓到所有拼团商品个数为: ' + str(len(goods_list)))
    e_time = time.time()
    self.my_lg.info('总用时:' + str(e_time-s_time))
    await asyncio.sleep(3)
    return goods_list
def __init__(self, logger=None):
    """Initialize: result holders, headers, logger, phantomjs driver and the
    dynamic driver payload.

    :param logger: optional logger passed through to self._set_logger.
    """
    super().__init__()
    self.result_data = {}   # last parsed result (dict)
    self.msg = ''
    self._set_headers()
    self._set_logger(logger)
    self.my_phantomjs = MyPhantomjs()
    # Code executed dynamically inside the phantomjs driver: click the 2nd
    # filter tab, assert it is non-empty (assertion failure skips the rest),
    # then scroll down to force lazy content to load.
    # NOTE: this is a runtime payload string — do not reformat/translate.
    self._exec_code = '''
self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click()
_text = str(self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').text)
print(_text)
# if _text == '四五星(0)':
assert _text != '四五星(0)', 'my assert error!'  # 通过断言来跳过执行下面的代码
sleep(2.5)
# 向下滚动10000像素
js = 'document.body.scrollTop=10000'
self.driver.execute_script(js)
sleep(4)
'''
    self._page_sleep_time = 1.2   # sleep between comment pages
class PinduoduoParse(object):
    """Parser for Pinduoduo goods pages (mobile.yangkeduo.com): fetches the
    page via phantomjs, extracts window.rawData, and reshapes it for the
    SQL Server pipeline."""

    def __init__(self):
        # Default request headers; UA drawn at random from the HEADERS pool.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'mobile.yangkeduo.com',
            'User-Agent': HEADERS[randint(0, 34)],  # pick a random request header
            # 'Cookie': 'api_uid=rBQh+FoXerAjQWaAEOcpAg==;',  # analysis found this cookie is needed
        }
        self.result_data = {}   # last parsed result; reset on every failure path
        # self.set_cookies_key_api_uid()  # set the api_uid value in the cookie
        self.my_phantomjs = MyPhantomjs()

    def get_goods_data(self, goods_id):
        '''
        Build the mobile goods url and fetch/parse its embedded data.

        :param goods_id: Pinduoduo goods id (str); '' short-circuits to {}
        :return: dict — the cleaned rawData payload, also cached in
                 self.result_data ({} on any failure)
        '''
        if goods_id == '':
            self.result_data = {}  # reset so a stale value is not stored later
            return {}
        else:
            tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + str(goods_id)
            print('------>>>| 得到的商品手机版地址为: ', tmp_url)

            ''' 1.采用requests,由于经常返回错误的body(即requests.get返回的为空的html), So pass '''
            # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
            ''' 2.采用phantomjs来获取 '''
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            if body == '':
                print('body中re匹配到的data为空!')
                self.result_data = {}  # reset so a stale value is not stored later
                return {}

            # Extract the JSON blob embedded as window.rawData in the page.
            data = re.compile(r'window.rawData= (.*?);</script>').findall(body)  # match all occurrences
            if data != []:
                data = data[0]
                try:
                    data = json.loads(data)
                except Exception:
                    self.result_data = {}  # reset so a stale value is not stored later
                    return {}
                # pprint(data)

                # Drop bulky sub-objects we never use downstream.
                try:
                    data['goods'].pop('localGroups')
                    data['goods'].pop('mallService')
                    data.pop('reviews')  # review info and related stats
                except:
                    pass
                # pprint(data)

                ''' 处理detailGallery转换成能被html显示页面信息 '''
                # Convert the detail image gallery into an HTML <div> of <img> tags.
                detail_data = data.get('goods', {}).get('detailGallery', [])
                tmp_div_desc = ''
                if detail_data != []:
                    for index in range(0, len(detail_data)):
                        if index == 0:
                            # skip the first image (Pinduoduo's own banner/notice)
                            pass
                        else:
                            tmp = ''
                            tmp_img_url = detail_data[index].get('url')
                            tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(tmp_img_url)
                            tmp_div_desc += tmp
                    detail_data = '<div>' + tmp_div_desc + '</div>'
                else:
                    detail_data = ''
                # print(detail_data)
                try:
                    data['goods'].pop('detailGallery')  # raw gallery no longer needed
                except:
                    pass
                data['div_desc'] = detail_data
                # pprint(data)
                self.result_data = data

                return data
            else:
                print('data为空!')
                self.result_data = {}  # reset so a stale value is not stored later
                return {}

    def deal_with_data(self):
        '''
        Reshape self.result_data into the flat record consumed by the pipeline.

        :return: dict with shop/title/price/sku fields, or {} when
                 result_data is empty or no price info could be built
        '''
        data = self.result_data
        if data != {}:
            # shop name
            if data.get('mall') is not None:
                shop_name = data.get('mall', {}).get('mallName', '')
            else:
                shop_name = ''
            # shopkeeper (not available on Pinduoduo)
            account = ''
            # goods title
            title = data.get('goods', {}).get('goodsName', '')
            # sub title
            sub_title = ''
            # goods stock
            # spec (attribute) names of the goods
            if data.get('goods', {}).get('skus', []) == []:
                detail_name_list = []
            else:
                if data.get('goods', {}).get('skus', [])[0].get('specs') == []:
                    detail_name_list = []
                else:
                    detail_name_list = [{'spec_name': item.get('spec_key')} for item in data.get('goods', {}).get('skus', [])[0].get('specs')]
            # print(detail_name_list)

            # Per-spec price and stock records to store.
            skus = data.get('goods', {}).get('skus', [])
            # pprint(skus)
            price_info_list = []
            if skus != []:  # NOTE: even single-spec goods have a non-empty skus list
                for index in range(0, len(skus)):
                    tmp = {}
                    price = skus[index].get('groupPrice', '')       # group-buy price
                    normal_price = skus[index].get('normalPrice', '')  # buy-alone price
                    spec_value = [item.get('spec_value') for item in data.get('goods', {}).get('skus', [])[index].get('specs')]
                    spec_value = '|'.join(spec_value)
                    img_url = skus[index].get('thumbUrl', '')
                    rest_number = skus[index].get('quantity', 0)    # remaining stock
                    is_on_sale = skus[index].get('isOnSale', 0)     # 1: special price, 0: normal_price
                    tmp['spec_value'] = spec_value
                    tmp['detail_price'] = price
                    tmp['normal_price'] = normal_price
                    tmp['img_url'] = img_url
                    if rest_number <= 0:
                        tmp['rest_number'] = 0
                    else:
                        tmp['rest_number'] = rest_number
                    tmp['is_on_sale'] = is_on_sale
                    price_info_list.append(tmp)

            if price_info_list == []:
                print('price_info_list为空值')
                return {}

            # Highest price becomes 'price', lowest becomes 'taobao_price'.
            tmp_price_list = sorted([round(float(item.get('detail_price', '')), 2) for item in price_info_list])
            price = tmp_price_list[-1]        # goods price (max)
            taobao_price = tmp_price_list[0]  # taobao price (min)

            if detail_name_list == []:
                print('## detail_name_list为空值 ##')
                price_info_list = []
            # print('最高价为: ', price)
            # print('最低价为: ', taobao_price)
            # print(len(price_info_list))
            # pprint(price_info_list)

            # all sample image urls
            all_img_url = [{'img_url': item} for item in data.get('goods', {}).get('topGallery', [])]
            # print(all_img_url)

            # detail attribute name/value pairs (whitespace stripped)
            tmp_p_value = re.compile(r'\n').sub('', data.get('goods', {}).get('goodsDesc', ''))
            tmp_p_value = re.compile(r'\t').sub('', tmp_p_value)
            tmp_p_value = re.compile(r' ').sub('', tmp_p_value)
            p_info = [{'p_name': '商品描述', 'p_value': tmp_p_value}]
            # print(p_info)

            # total sales
            all_sell_count = data.get('goods', {}).get('sales', 0)

            # div_desc
            div_desc = data.get('div_desc', '')

            # sale time window (from the first groupTypes entry)
            schedule = [{
                'begin_time': self.timestamp_to_regulartime(data.get('goods', {}).get('groupTypes', [])[0].get('startTime')),
                'end_time': self.timestamp_to_regulartime(data.get('goods', {}).get('groupTypes', [])[0].get('endTime')),
            }]
            # pprint(schedule)

            # flag used to mark goods already taken off the shelves
            is_delete = 0

            result = {
                'shop_name': shop_name,                # shop name
                'account': account,                    # shopkeeper
                'title': title,                        # goods title
                'sub_title': sub_title,                # sub title
                # 'shop_name_url': shop_name_url,      # shop home url
                'price': price,                        # goods price (max)
                'taobao_price': taobao_price,          # taobao price (min)
                # 'goods_stock': goods_stock,          # goods stock
                'detail_name_list': detail_name_list,  # spec attribute names
                # 'detail_value_list': detail_value_list,  # spec attribute values
                'price_info_list': price_info_list,    # per-spec price and stock
                'all_img_url': all_img_url,            # all sample image urls
                'p_info': p_info,                      # detail attribute pairs
                'div_desc': div_desc,                  # div_desc
                'schedule': schedule,                  # sale begin/end times
                'all_sell_count': all_sell_count,      # total sales
                'is_delete': is_delete                 # off-shelf flag
            }
            # pprint(result)
            # print(result)
            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # print(json_data)
            return result
        else:
            print('待处理的data为空的dict, 该商品可能已经转移或者下架')
            return {}

    def to_right_and_update_data(self, data, pipeline):
        """Map a parsed goods record onto the update schema and persist it.

        :param data: dict produced by deal_with_data (plus goods_id etc.)
        :param pipeline: object exposing update_pinduoduo_table(item=...)
        """
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']  # official goods id
        '''
        时区处理,时间处理到上海时间
        '''
        tz = pytz.timezone('Asia/Shanghai')  # timezone object
        now_time = datetime.datetime.now(tz)
        # Truncate to second precision and drop the timezone info.
        now_time = re.compile(r'\..*').sub('', str(now_time))
        # Convert the string back to a datetime.
        now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
        tmp['modfiy_time'] = now_time  # modification time (field name as in the DB schema)
        tmp['shop_name'] = data_list['shop_name']  # company name
        tmp['title'] = data_list['title']          # goods title
        tmp['sub_title'] = data_list['sub_title']  # goods sub title
        tmp['link_name'] = ''                      # seller name
        tmp['account'] = data_list['account']      # shopkeeper name
        # Set max price 'price' and min price 'taobao_price'.
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['price_info'] = []  # price info
        tmp['detail_name_list'] = data_list['detail_name_list']  # spec attribute names
        """
        得到sku_map
        """
        tmp['price_info_list'] = data_list.get('price_info_list')  # per-spec price and stock
        tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
        tmp['p_info'] = data_list.get('p_info')            # detail info
        tmp['div_desc'] = data_list.get('div_desc')        # bottom div
        tmp['schedule'] = data_list.get('schedule')
        # source site
        # tmp['site_id'] = 13  # source site id (regular Pinduoduo goods)
        tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 alive, 1 deleted
        tmp['my_shelf_and_down_time'] = data_list.get('my_shelf_and_down_time')
        tmp['delete_time'] = data_list.get('delete_time')
        tmp['all_sell_count'] = str(data_list.get('all_sell_count'))
        pipeline.update_pinduoduo_table(item=tmp)

    def insert_into_pinduoduo_xianshimiaosha_table(self, data, pipeline):
        """Map a parsed flash-sale record onto the insert schema and persist it.

        :param data: dict produced upstream (goods + miaosha fields)
        :param pipeline: object exposing insert_into_pinduoduo_xianshimiaosha_table(item=...)
        """
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']      # official goods id
        tmp['spider_url'] = data_list['spider_url']  # goods url
        tmp['username'] = data_list['username']      # operator username
        '''
        时区处理,时间处理到上海时间
        '''
        tz = pytz.timezone('Asia/Shanghai')  # timezone object
        now_time = datetime.datetime.now(tz)
        # Truncate to second precision and drop the timezone info.
        now_time = re.compile(r'\..*').sub('', str(now_time))
        # Convert the string back to a datetime.
        now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
        tmp['deal_with_time'] = now_time  # operation time
        tmp['modfiy_time'] = now_time     # modification time
        tmp['shop_name'] = data_list['shop_name']  # company name
        tmp['title'] = data_list['title']          # goods title
        tmp['sub_title'] = data_list['sub_title']  # goods sub title
        # Set max price 'price' and min price 'taobao_price'.
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['detail_name_list'] = data_list['detail_name_list']  # spec attribute names
        """
        得到sku_map
        """
        tmp['price_info_list'] = data_list.get('price_info_list')  # per-spec price and stock
        tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
        tmp['p_info'] = data_list.get('p_info')            # detail info
        tmp['div_desc'] = data_list.get('div_desc')        # bottom div
        tmp['schedule'] = data_list.get('schedule')
        tmp['stock_info'] = data_list.get('stock_info')
        tmp['miaosha_time'] = data_list.get('miaosha_time')
        tmp['miaosha_begin_time'] = data_list.get('miaosha_begin_time')
        tmp['miaosha_end_time'] = data_list.get('miaosha_end_time')
        # source site
        tmp['site_id'] = 16  # source site id (original comment said Juanpi flash-sale — likely a copy/paste slip; verify)
        tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 alive, 1 deleted
        # print('is_delete=', tmp['is_delete'])
        # print('------>>>| 待存储的数据信息为: |', tmp)
        print('------>>>| 待存储的数据信息为: ', tmp.get('goods_id'))
        pipeline.insert_into_pinduoduo_xianshimiaosha_table(item=tmp)

    def to_update_pinduoduo_xianshimiaosha_table(self, data, pipeline):
        """Map a parsed flash-sale record onto the update schema and persist it.

        :param data: dict produced upstream (goods + miaosha fields)
        :param pipeline: object exposing update_pinduoduo_xianshimiaosha_table(...)
        """
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']  # official goods id
        '''
        时区处理,时间处理到上海时间
        '''
        tz = pytz.timezone('Asia/Shanghai')  # timezone object
        now_time = datetime.datetime.now(tz)
        # Truncate to second precision and drop the timezone info.
        now_time = re.compile(r'\..*').sub('', str(now_time))
        # Convert the string back to a datetime.
        now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
        tmp['modfiy_time'] = now_time  # modification time
        tmp['shop_name'] = data_list['shop_name']  # company name
        tmp['title'] = data_list['title']          # goods title
        tmp['sub_title'] = data_list['sub_title']
        # Set max price 'price' and min price 'taobao_price'.
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['detail_name_list'] = data_list['detail_name_list']  # spec attribute names
        """
        得到sku_map
        """
        tmp['price_info_list'] = data_list.get('price_info_list')  # per-spec price and stock
        tmp['all_img_url'] = data_list.get('all_img_url')  # all sample image urls
        tmp['p_info'] = data_list.get('p_info')            # detail info
        tmp['div_desc'] = data_list.get('div_desc')        # bottom div
        tmp['schedule'] = data_list.get('schedule')
        tmp['stock_info'] = data_list.get('stock_info')
        tmp['miaosha_time'] = data_list.get('miaosha_time')
        tmp['miaosha_begin_time'] = data_list.get('miaosha_begin_time')
        tmp['miaosha_end_time'] = data_list.get('miaosha_end_time')
        tmp['is_delete'] = data_list.get('is_delete')  # logical delete: 0 alive, 1 deleted
        # print('is_delete=', tmp['is_delete'])
        # print('------>>> | 待存储的数据信息为: |', tmp)
        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
        pipeline.update_pinduoduo_xianshimiaosha_table(tmp)

    def set_cookies_key_api_uid(self):
        '''
        Add a cookie named api_uid to self.headers by hitting the host once.
        :return: None (best-effort; on timeout the headers are left unchanged)
        '''
        # Choose a random proxy ip from the pool.
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies) - 1)]

        tmp_proxies = {
            'http': self.proxy,
        }

        # Fetch the api_uid cookie value from the host.
        host_url = 'http://mobile.yangkeduo.com'
        try:
            response = requests.get(host_url, headers=self.headers, proxies=tmp_proxies, timeout=10)
            api_uid = response.cookies.get('api_uid')
            # print(response.cookies.items())
            # if api_uid is None:
            #     api_uid = 'rBQh+FoXerAjQWaAEOcpAg=='
            self.headers['Cookie'] = 'api_uid=' + str(api_uid) + ';'
            # print(api_uid)
        except Exception:
            print('requests.get()请求超时....')
            pass

    def timestamp_to_regulartime(self, timestamp):
        '''
        Convert a unix timestamp to 'YYYY-MM-DD HH:MM:SS' (local time).
        '''
        # localtime() turns the timestamp into a struct_time,
        # strftime() reformats it, e.g. 2016-05-05 20:28:54.
        time_local = time.localtime(timestamp)
        dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
        return dt

    def get_goods_id_from_url(self, pinduoduo_url):
        '''
        Extract the goods_id from a Pinduoduo goods url.
        :param pinduoduo_url:
        :return: goods_id (str); '' for a malformed url
        '''
        is_pinduoduo_url = re.compile(r'http://mobile.yangkeduo.com/goods.html.*?').findall(pinduoduo_url)
        if is_pinduoduo_url != []:
            if re.compile(r'http://mobile.yangkeduo.com/goods.html\?.*?goods_id=(\d+).*?').findall(pinduoduo_url) != []:
                tmp_pinduoduo_url = re.compile(r'http://mobile.yangkeduo.com/goods.html\?.*?goods_id=(\d+).*?').findall(pinduoduo_url)[0]
                if tmp_pinduoduo_url != '':
                    goods_id = tmp_pinduoduo_url
                else:
                    # only needed when testing in pycharm; optional
                    pinduoduo_url = re.compile(r';').sub('', pinduoduo_url)
                    goods_id = re.compile(r'http://mobile.yangkeduo.com/goods.html\?.*?goods_id=(\d+).*?').findall(pinduoduo_url)[0]
                print('------>>>| 得到的拼多多商品id为:', goods_id)
                return goods_id
            else:
                pass
        else:
            print('拼多多商品url错误, 非正规的url, 请参照格式(http://mobile.yangkeduo.com/goods.html)开头的...')
            return ''

    def __del__(self):
        # Best-effort cleanup of the phantomjs driver.
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
class ALi1688CommentParse(object):
    """Comment scraper for 1688 goods: drives phantomjs against the mobile
    remark page and normalizes up to 25 comments per goods."""

    def __init__(self, logger=None):
        """:param logger: optional pre-configured logger; when None a new file
        logger is created under MY_SPIDER_LOGS_PATH/阿里1688/comment/."""
        super().__init__()
        self.result_data = {}   # last parsed result; reset on every failure path
        self.msg = ''
        if logger is None:
            # Log file named by today's Shanghai date (first 10 chars of the timestamp).
            self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/阿里1688/comment/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR)
        else:
            self.my_lg = logger
        self.my_phantomjs = MyPhantomjs()
        # Code executed dynamically inside the phantomjs driver: click the 2nd
        # filter tab, then scroll down to force lazy content to load.
        # NOTE: this is a runtime payload string — do not reformat/translate.
        self._exec_code = '''
self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click()
sleep(1.5)
# 向下滚动10000像素
js = 'document.body.scrollTop=10000'
self.driver.execute_script(js)
sleep(3)
'''
        # Default request headers; UA drawn uniformly from the HEADERS pool.
        self.headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS) - 1)],
            'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'referer': 'https://detail.1688.com/offer/45579899125.html',
        }
        self.page_size = '30'   # comments per page (string, used as a URL param)

    def _get_comment_data(self, goods_id):
        """Fetch and parse comments for one goods.

        :param goods_id: 1688 offer id (str); '' short-circuits to {}
        :return: dict {'goods_id', 'modify_time', '_comment_list'} — also
                 cached in self.result_data ({} on any failure)
        """
        if goods_id == '':
            self.result_data = {}
            return {}

        '''
        原先采用phantomjs, 改用pc端抓包到的接口
        '''
        tmp_url = 'https://m.1688.com/page/offerRemark.htm?offerId=' + str(goods_id)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url, exec_code=self._exec_code)
        # self.my_lg.info(str(body))
        if body == '':
            self.result_data = {}
            self.my_lg.error('该地址的body为空值, 出错地址: ' + tmp_url)
            return {}

        # One 'div.remark-item' per comment on the page.
        _html_comment_list = list(Selector(text=body).css('div.remark-item').extract())
        if _html_comment_list != []:
            _comment_list = []
            for index, item in enumerate(_html_comment_list):
                if index > 25:  # keep only the first 25 comments
                    break
                buyer_name = str(Selector(text=item).css('span.member::text').extract_first())
                quantify = str(Selector(text=item).css('span.amount::text').extract_first())
                try:
                    quantify = int(re.compile(r'\d+').findall(quantify)[0])
                except IndexError:
                    self.my_lg.error('获取quantify时索引异常! 出错地址: ' + tmp_url)
                    self.result_data = {}
                    return {}

                comment_date = str(Selector(text=item).css('div.date span::text').extract_first())
                # Append a random hh:mm:ss to the bare date.
                comment_date = self._get_comment_date(comment_date)  # str '2017-01-25 17:06:00'
                tmp_sku_info = str(Selector(text=item).css('div.date::text').extract_first())

                comment = [{
                    'comment': str(Selector(text=item).css('div.bd::text').extract_first()),
                    'comment_date': comment_date,  # comment creation date
                    'sku_info': re.compile(r'<span.*?</span>').sub('', tmp_sku_info),  # purchased spec
                    'img_url_list': [],
                    'star_level': randint(3, 5),   # star rating (randomized 3-5, not scraped)
                    'video': '',
                }]
                _ = {
                    'buyer_name': buyer_name,  # buyer nickname
                    'comment': comment,        # comment content
                    'quantify': quantify,      # purchase quantity
                    'head_img': '',            # buyer avatar
                }
                _comment_list.append(_)

            self.result_data = {
                'goods_id': str(goods_id),
                'modify_time': datetime.datetime.now(),
                '_comment_list': _comment_list,
            }
            pprint(self.result_data)
            return self.result_data
        else:
            self.my_lg.error('该商品的comment为空list! 出错地址: ' + tmp_url)
            self.result_data = {}
            return {}

        # 下面是模拟pc端接口的
        # tmp_url = 'https://rate.1688.com/remark/offerDetail/rates.json'
        # _params = self._set_params(goods_id=goods_id)
        #
        # # 常规requests获取不到数据改用phantomjs
        # # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, params=_params)
        #
        # _url = self._set_url(url=tmp_url, params=_params)
        # print(_url)
        # body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=_url)
        # self.my_lg.info(str(body))
        # if body == '':
        #     self.result_data = {}
        #     self.my_lg.error('该地址的body为空值, 出错goods_id: ' + goods_id)
        #     return {}
        # try:
        #     body = re.compile('<pre.*?>(.*)</pre>').findall(body)[0]
        # except IndexError:
        #     self.result_data = {}
        #     self.my_lg.error('re筛选body为空[], 出错goods_id: ' + goods_id)
        #     return {}
        #
        # data = self.json_str_2_dict(json_str=body).get('data', {}).get('rates', [])
        # # pprint(data)
        # _comment_list = []
        # try:
        #     for item in data:
        #         buyer_name = item.get('member', '')
        #         comment = [{
        #             'comment': i.get('remarkContent', ''),
        #             'comment_date': string_to_datetime(i.get('remarkTime', '')),  # 评论日期
        #             'star_level': i.get('starLevel', 5),
        #             'sku_info': '',  # 购买的商品规格(pc端1688商品没有规格)
        #             'img_url_list': [],
        #         } for i in item.get('rateItem', [])]
        #         quantify = item.get('quantity', 1)  # 购买数量
        #
        #         _ = {
        #             'buyer_name': buyer_name,  # 买家昵称
        #             'comment': comment,  # 评论内容
        #             'quantify': quantify  # 购买数量
        #         }
        #         _comment_list.append(_)
        #
        # except Exception as e:
        #     self.result_data = {}
        #     self.my_lg.error('出错商品goods_id: ' + goods_id)
        #     self.my_lg.exception(e)
        #     return {}
        #
        # self.result_data = {
        #     'goods_id': str(goods_id),
        #     'modify_time': datetime.datetime.now(),
        #     '_comment_list': _comment_list,
        # }
        # pprint(self.result_data)
        # return self.result_data

    def _set_url(self, url, params):
        '''
        Build the api url to fetch.
        :param url: base url
        :param params: tuple of (key, value) pairs
        :return: str url with the query string appended
        '''
        _ = [item[0] + '=' + str(item[1]) for item in params]
        return url + '?' + '&'.join(_)

    def _set_params(self, goods_id):
        '''
        Build the query params for the pc rates api.
        :param goods_id:
        :return: tuple of (key, value) pairs
        '''
        params = (
            ('_input_charset', 'GBK'),
            ('offerId', goods_id),
            ('page', '1'),
            ('pageSize', self.page_size),  # comments returned per page
            ('starLevel', '7'),
            ('orderBy', 'date'),
            # ('semanticId', ''),
            ('showStat', '0'),
            ('content', '1'),
            # ('t', '1523264528741'),
            ('memberId', 'zhangchenghao2009'),
            # ('callback', 'jQuery1720041881430222992844_1523264353082'),
        )
        return params

    def json_str_2_dict(self, json_str):
        '''
        Convert a json string to a dict ({} on parse failure).
        :param json_str:
        :return: dict
        '''
        try:
            data = json.loads(json_str)
        except:
            self.my_lg.error('json.loads转换json_str时出错!请检查!')
            data = {}
        return data

    def _get_comment_date(self, comment_date):
        '''
        Append a random zero-padded hh:mm:ss to a bare date string.
        :param comment_date: e.g. '2017-12-04'
        :return: str 'YYYY-MM-DD HH:MM:SS'
        '''
        _ = str(randint(0, 23))
        if len(_) == 1:
            _hour = '0' + _
        else:
            _hour = _
        _ = str(randint(0, 59))
        if len(_) == 1:
            _min = '0' + _
        else:
            _min = _
        _ = str(randint(0, 59))
        if len(_) == 1:
            _s = '0' + _
        else:
            _s = _
        comment_date = comment_date + ' ' + _hour + ':' + _min + ':' + _s
        return comment_date

    def __del__(self):
        # Best-effort cleanup of driver and logger references.
        try:
            del self.my_phantomjs
            del self.my_lg
            del self.msg
        except:
            pass
        gc.collect()
class Zhe800Spike(object):
    """Crawler for Zhe800 flash-sale (限时秒杀) sessions: walks session ids,
    filters deals by time window, and stores new goods via Zhe800Parse and
    the SQL Server pipeline."""

    def __init__(self):
        # Default request headers; UA drawn at random from the HEADERS pool.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'zhe800.com',
            'User-Agent': HEADERS[randint(0, 34)]  # pick a random request header
        }
        self.my_phantomjs = MyPhantomjs()

    def get_spike_hour_goods_info(self):
        '''
        Build the data urls and collect all recent flash-sale goods.

        Walks session ids from BASE_SESSION_ID to MAX_SESSION_ID in steps of
        2; for each valid session inside the allowed time window, parses each
        deal and inserts goods not already present in the database.
        :return: None (side effect: rows inserted through the pipeline)
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            print('待抓取的session_id为: ', base_session_id)
            tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page=1&per_page=1000'.format(
                str(base_session_id)
            )
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            # print(body)
            # The api response is wrapped in a <pre> tag by the browser.
            body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)
            if body_1 != []:
                data = body_1[0]
                data = json.loads(data)
                # pprint(data)
                if data.get('data', {}).get('blocks', []) == []:
                    # session_id does not exist
                    print('该session_id不存在,此处跳过')
                    pass
                else:
                    # session_id exists
                    # begin_times_timestamp = int(time.mktime(time.strptime(begin_times, '%Y-%m-%d %H:%M:%S')))
                    # Take the first 10 digits of begin_time as a unix timestamp.
                    try:
                        begin_times_timestamp = int(str(data.get('data', {}).get('blocks', [])[0].get('deal', {}).get('begin_time', ''))[:10])
                    except Exception as e:
                        print('遇到严重错误: ', e)
                        continue
                    print('秒杀时间为: ', self.timestamp_to_regulartime(begin_times_timestamp))
                    if self.is_recent_time(timestamp=begin_times_timestamp):
                        # the flash-sale date is inside the allowed window
                        try:
                            data = [item_s.get('deal', {}) for item_s in data.get('data', {}).get('blocks', [])]
                        except Exception as e:
                            print('遇到严重错误: ', e)
                            continue
                        # pprint(data)
                        if data != []:
                            # there are deals to process
                            miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
                            # pprint(miaosha_goods_list)
                            zhe_800 = Zhe800Parse()
                            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                            if my_pipeline.is_connect_success:
                                # goods_ids already stored, to avoid duplicates
                                db_goods_id_list = [item[0] for item in list(my_pipeline.select_zhe_800_xianshimiaosha_all_goods_id())]
                                for item in miaosha_goods_list:
                                    if item.get('zid', '') in db_goods_id_list:
                                        print('该goods_id已经存在于数据库中, 此处跳过')
                                        pass
                                    else:
                                        tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
                                        goods_id = zhe_800.get_goods_id_from_url(tmp_url)
                                        zhe_800.get_goods_data(goods_id=goods_id)
                                        goods_data = zhe_800.deal_with_data()
                                        if goods_data == {}:
                                            # empty data: skip
                                            pass
                                        else:
                                            # merge flash-sale fields and insert
                                            goods_data['stock_info'] = item.get('stock_info')
                                            goods_data['goods_id'] = str(item.get('zid'))
                                            goods_data['spider_url'] = tmp_url
                                            goods_data['username'] = '******'
                                            goods_data['price'] = item.get('price')
                                            goods_data['taobao_price'] = item.get('taobao_price')
                                            goods_data['sub_title'] = item.get('sub_title')
                                            # goods_data['is_baoyou'] = item.get('is_baoyou')
                                            goods_data['miaosha_time'] = item.get('miaosha_time')
                                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                                            goods_data['session_id'] = str(base_session_id)
                                            # print(goods_data)
                                            zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                                            sleep(ZHE_800_SPIKE_SLEEP_TIME)  # slow down
                                        # sleep(2)
                            else:
                                pass
                            try:
                                del zhe_800
                            except:
                                pass
                            gc.collect()
                        else:
                            # this session id has no data
                            print('该sessionid没有相关key为jsons的数据')
                            # return {}
                            pass
                    else:
                        pass
            else:
                print('获取到的data为空!')
                # return {}
                pass
            base_session_id += 2

    def timestamp_to_regulartime(self, timestamp):
        '''
        Convert a unix timestamp to 'YYYY-MM-DD HH:MM:SS' (local time).
        '''
        # localtime() turns the timestamp into a struct_time,
        # strftime() reformats it, e.g. 2016-05-05 20:28:54.
        time_local = time.localtime(int(timestamp))
        # print(time_local)
        dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
        return dt

    def get_miaosha_begin_time_and_miaosha_end_time(self, miaosha_time):
        '''
        Return the flash-sale begin and end times as datetimes.
        :param miaosha_time: dict with 'miaosha_begin_time'/'miaosha_end_time' strings
        :return: tuple (miaosha_begin_time, miaosha_end_time)
        '''
        miaosha_begin_time = miaosha_time.get('miaosha_begin_time')
        miaosha_end_time = miaosha_time.get('miaosha_end_time')
        # Convert the strings to datetime objects.
        miaosha_begin_time = datetime.datetime.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')
        miaosha_end_time = datetime.datetime.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')
        return miaosha_begin_time, miaosha_end_time

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful fields of each flash-sale deal.
        :param data: list of raw deal dicts
        :return: list of normalized dicts
        '''
        miaosha_goods_list = []
        for item in data:
            # pprint(item)
            tmp = {}
            # flash-sale begin and end times
            tmp['miaosha_time'] = {
                'miaosha_begin_time': self.timestamp_to_regulartime(int(str(item.get('begin_time'))[0:10])),
                'miaosha_end_time': self.timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])),
            }
            # zhe800 goods id
            tmp['zid'] = item.get('zid')
            # free shipping flag
            # tmp['is_baoyou'] = item.get('is_baoyou', 0)
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock', 0),  # remaining flash-sale quantity
                'stock': item.get('stock', 0),                    # total flash-sale stock
            }
            # original price
            tmp['price'] = float(item.get('list_price'))
            # flash-sale price, float
            tmp['taobao_price'] = float(item.get('price'))
            # sub title
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
        # pprint(miaosha_goods_list)
        return miaosha_goods_list

    def is_recent_time(self, timestamp):
        '''
        Check whether the timestamp falls inside the allowed crawl window
        (recent days, between SPIDER_START_HOUR and SPIDER_END_HOUR).
        :param timestamp: unix timestamp
        :return: True or False
        '''
        time_1 = int(timestamp)
        time_2 = time.time()  # current timestamp

        time_1 = time.localtime(time_1)
        time_2 = time.localtime(time_2)
        if time_1.tm_year > time_2.tm_year:
            print('** 该年份为未来时间年份 **')
            # Only accept hours within [SPIDER_START_HOUR, SPIDER_END_HOUR].
            if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                print('合法时间')
                # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                return True
            else:
                print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR, SPIDER_END_HOUR))
                return False
        if time_1.tm_year == time_2.tm_year:
            if time_1.tm_mon > time_2.tm_mon:
                # target month is in the future
                print('** 该月份为未来时间月份 **')
                if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                    print('合法时间')
                    # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                    return True
                else:
                    print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR, SPIDER_END_HOUR))
                    return False
            if time_1.tm_mon >= time_2.tm_mon:
                # target month >= current month (current or later this year)
                if time_1.tm_mday >= time_2.tm_mday-2:
                    # also accepts the two days before today
                    if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                        print('合法时间')
                        # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                        return True
                    else:
                        print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR, SPIDER_END_HOUR))
                        return False
                else:
                    print('该日时间已过期, 此处跳过')
                    return False
            else:
                # month expired
                print('该月份时间已过期,此处跳过')
                return False
        else:
            print('非本年度的限时秒杀时间,此处跳过')
            return False

    def __del__(self):
        # Best-effort cleanup of the phantomjs driver.
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
class JuanPiParse(object):
    '''Parser for juanpi.com goods pages (regular, 限时秒杀 and 拼团 goods).'''

    def __init__(self):
        super(JuanPiParse, self).__init__()
        self._set_headers()
        self.result_data = {}  # last parsed goods data; reset to {} on any failure
        self.my_phantomjs = MyPhantomjs()

    def _set_headers(self):
        '''Build the base request headers for web.juanpi.com.'''
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'web.juanpi.com',
            # FIX: index the UA pool by its actual length (matches the other
            # parsers in this file) instead of the hard-coded randint(0, 34),
            # which raises IndexError if HEADERS ever shrinks.
            'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
        }

    def get_goods_data(self, goods_id):
        '''
        Fetch the mobile goods page and its sku data, then merge them.
        :param goods_id: goods id; '' is rejected
        :return: merged data dict ({} on any failure)
        '''
        if goods_id == '':
            self.result_data = {}  # reset so stale data is never persisted
            return {}

        tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        print('------>>>| 得到的商品手机版的地址为: ', tmp_url)
        # NOTE: a plain requests.get() used to work but the site now answers
        # "Not Found" without a real browser, so phantomjs is used instead.
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=tmp_url,
            css_selector='div.sc-kgoBCf.bTQvTk')  # css of the mobile title block
        if body == '':
            print('获取到的body为空str!请检查!')
            self.result_data = {}
            return {}
        data = re.compile(
            r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)

        # sku data endpoint (the older /api/getOtherInfo endpoint was dropped upstream)
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(goods_id)
        # FIX: copy the headers — the original aliased self.headers, so the
        # Host update below silently rewrote the base page headers as well.
        self.skudata_headers = dict(self.headers)
        self.skudata_headers.update({'Host': 'webservice.juanpi.com'})
        skudata_body = MyRequests.get_url_body(
            url=skudata_url, headers=self.skudata_headers)
        if skudata_body == '':
            print('获取到的skudata_body为空str!请检查!')
            self.result_data = {}
            return {}

        skudata = re.compile(r'(.*)').findall(skudata_body)
        if skudata != []:
            skudata = skudata[0]
            try:
                skudata = json.loads(skudata)
            except Exception:
                self.result_data = {}
                return {}
            skudata = skudata.get('skudata', {})
            # pprint(skudata)
            try:
                if skudata.get('info') is not None:
                    pass  # valid skudata
                else:
                    print('skudata中info的key为None, 返回空dict')
                    self.result_data = {}
                    return {}
            except AttributeError as e:
                print('遇到错误如下(先跳过!): ', e)
                self.result_data = {}
                return {}
        else:
            print('skudata为空!')
            self.result_data = {}
            return {}

        if data != []:
            main_data = data[0]
            try:
                main_data = json.loads(main_data)
            except Exception:
                self.result_data = {}
                return {}
            if main_data.get('detail') is not None:
                main_data = self._wash_main_data(main_data.get('detail', {}))
                main_data['skudata'] = skudata
                main_data['goods_id'] = goods_id
                self.result_data = main_data
                return main_data
            else:
                print('data中detail的key为None, 返回空dict')
                self.result_data = {}
                return {}
        else:
            print('data为空!')
            self.result_data = {}
            return {}

    def deal_with_data(self):
        '''
        Normalize self.result_data into the common goods schema.
        :return: normalized dict ({} when there is nothing to parse)
        '''
        data = self.result_data
        if data != {}:
            shop_name = self._get_shop_name(data=data)
            account = ''  # shopkeeper (not available)
            title = data.get('baseInfo', {}).get('title', '')
            sub_title = ''
            # spec (sku) label names; a str return encodes the off-shelf case
            detail_name_list = self._get_detail_name_list(data=data)
            if isinstance(detail_name_list, str):
                if detail_name_list == 'is_delete=1':
                    print('该商品已下架...')
                    sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s'
                    params = (self.result_data.get('goods_id', ''), )
                    _ = SqlServerMyPageInfoSaveItemPipeline()
                    result = _._update_table(sql_str=sql_str, params=params)
                    if result:
                        print('### 该商品已经is_delete=1 ###')
                    else:
                        print('is_delete=1标记失败!')
            if detail_name_list == {}:
                self.result_data = {}
                return {}
            # per-spec price/stock plus overall max (price) and min (taobao_price)
            price_info_list, price, taobao_price = \
                self._get_price_info_list_and_price_and_taobao_price(data=data)
            # all sample image urls
            all_img_url = [{
                'img_url': item
            } for item in data.get('goodImages')]
            p_info = self._get_p_info(data=data)
            div_desc = self._get_div_desc(data=data)
            schedule = self._get_goods_schedule(data=data)
            is_delete = self._get_is_delete(data=data, schedule=schedule)
            if price == 0 or taobao_price == 0:
                # no price could be read -> the goods is off the shelf
                is_delete = 1
            result = {
                'shop_name': shop_name,              # shop name
                'account': account,                  # shopkeeper
                'title': title,                      # goods name
                'sub_title': sub_title,              # sub title
                'price': price,                      # highest price
                'taobao_price': taobao_price,        # lowest price
                'detail_name_list': detail_name_list,  # spec label names
                'price_info_list': price_info_list,  # per-spec price & stock
                'all_img_url': all_img_url,          # sample image urls
                'p_info': p_info,                    # property name/value pairs
                'div_desc': div_desc,                # description html
                'is_delete': is_delete,              # off-shelf flag
                'schedule': schedule,                # sale time window
            }
            gc.collect()
            return result
        else:
            print('待处理的data为空的dict')
            return {}

    def to_right_and_update_data(self, data, pipeline):
        '''Normalize `data` into a GoodsItem and update dbo.GoodsInfoAutoGet (prices untouched).'''
        data_list = data
        tmp = GoodsItem()
        tmp['goods_id'] = data_list['goods_id']
        now_time = get_shanghai_time()
        tmp['modify_time'] = now_time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        tmp['link_name'] = ''  # seller name (not available)
        tmp['account'] = data_list['account']
        # highest price / lowest price
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['price_info'] = []
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get('price_info_list')
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['is_delete'] = data_list.get('is_delete')  # soft delete: 0 live, 1 deleted
        tmp['my_shelf_and_down_time'] = data_list.get('my_shelf_and_down_time')
        tmp['delete_time'] = data_list.get('delete_time')
        tmp['is_price_change'] = data_list.get('_is_price_change')
        tmp['price_change_info'] = data_list.get('_price_change_info')
        params = self._get_db_update_params(item=tmp)
        # price-changing variant (kept for reference):
        # sql_str = r'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, Price=%s, TaoBaoPrice=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, MyShelfAndDownTime=%s, delete_time=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s where GoodsID = %s'
        # price-preserving update statement (price columns deliberately skipped)
        sql_str = r'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, MyShelfAndDownTime=%s, delete_time=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s where GoodsID = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_juanpi_xianshimiaosha_table(self, data, pipeline):
        '''Insert one flash-sale (miaosha) goods row into dbo.juanpi_xianshimiaosha.'''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']
        tmp['spider_url'] = data_list['spider_url']
        tmp['username'] = data_list['username']
        now_time = get_shanghai_time()
        tmp['deal_with_time'] = now_time
        tmp['modfiy_time'] = now_time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # highest price / lowest price
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get('price_info_list')
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['stock_info'] = data_list.get('stock_info')
        tmp['miaosha_time'] = data_list.get('miaosha_time')
        tmp['miaosha_begin_time'] = data_list.get('miaosha_begin_time')
        tmp['miaosha_end_time'] = data_list.get('miaosha_end_time')
        tmp['tab_id'] = data_list.get('tab_id')
        tmp['page'] = data_list.get('page')
        tmp['site_id'] = 15  # source site id (juanpi flash-sale goods)
        tmp['is_delete'] = data_list.get('is_delete')  # soft delete: 0 live, 1 deleted
        print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_insert_miaosha_params(item=tmp)
        sql_str = r'insert into dbo.juanpi_xianshimiaosha(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, property_info, detail_info, schedule, stock_info, miaosha_time, miaosha_begin_time, miaosha_end_time, tab_id, page, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        pipeline._insert_into_table(sql_str=sql_str, params=params)

    def to_update_juanpi_xianshimiaosha_table(self, data, pipeline):
        '''Update one flash-sale goods row in dbo.juanpi_xianshimiaosha.'''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']
        now_time = get_shanghai_time()
        tmp['modfiy_time'] = now_time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # highest price / lowest price
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get('price_info_list')
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['stock_info'] = data_list.get('stock_info')
        tmp['miaosha_time'] = data_list.get('miaosha_time')
        tmp['miaosha_begin_time'] = data_list.get('miaosha_begin_time')
        tmp['miaosha_end_time'] = data_list.get('miaosha_end_time')
        tmp['is_delete'] = data_list.get('is_delete')  # soft delete: 0 live, 1 deleted
        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_update_miaosha_params(item=tmp)
        sql_str = r'update dbo.juanpi_xianshimiaosha set modfiy_time = %s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_info=%s, all_image_url=%s, property_info=%s, detail_info=%s, is_delete=%s, schedule=%s, stock_info=%s, miaosha_time=%s, miaosha_begin_time=%s, miaosha_end_time=%s where goods_id = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_juuanpi_pintuan_table(self, data, pipeline):
        '''
        Insert one group-buy (pintuan) goods row into dbo.juanpi_pintuan.
        NOTE: the misspelled method name ("juuanpi") is kept — callers depend on it.
        :return: pipeline insert result, or None when the record is a coupon
        '''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']
        tmp['spider_url'] = data_list['spider_url']
        tmp['username'] = data_list['username']
        now_time = get_shanghai_time()
        tmp['deal_with_time'] = now_time
        tmp['modfiy_time'] = now_time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # highest price / lowest price; a coupon record has no usable price
        try:
            tmp['price'] = Decimal(data_list['price']).__round__(2)
            tmp['taobao_price'] = Decimal(
                data_list['taobao_price']).__round__(2)
        except Exception:
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get('price_info_list')
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['all_sell_count'] = data_list.get('all_sell_count')  # total sales
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['pintuan_begin_time'] = data_list.get('pintuan_begin_time')
        tmp['pintuan_end_time'] = data_list.get('pintuan_end_time')
        tmp['page'] = data_list.get('page')
        tmp['site_id'] = 18  # source site id (juanpi group-buy goods)
        tmp['is_delete'] = data_list.get('is_delete')  # soft delete: 0 live, 1 deleted
        print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_insert_pintuan_params(item=tmp)
        sql_str = r'insert into dbo.juanpi_pintuan(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, all_sell_count, property_info, detail_info, schedule, miaosha_begin_time, miaosha_end_time, page, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        _r = pipeline._insert_into_table(sql_str=sql_str, params=params)
        return _r

    def to_right_and_update_pintuan_data(self, data, pipeline):
        '''Update one group-buy goods row in dbo.juanpi_pintuan.'''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']
        now_time = get_shanghai_time()
        tmp['modfiy_time'] = now_time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # highest price / lowest price; a coupon record has no usable price
        try:
            tmp['price'] = Decimal(data_list['price']).__round__(2)
            tmp['taobao_price'] = Decimal(
                data_list['taobao_price']).__round__(2)
        except Exception:
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get('price_info_list')
        tmp['all_img_url'] = data_list.get('all_img_url')
        # tmp['all_sell_count'] = data_list.get('all_sell_count')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        # tmp['site_id'] = 18
        tmp['is_delete'] = data_list.get('is_delete')  # soft delete: 0 live, 1 deleted
        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_update_pintuan_params(item=tmp)
        sql_str = r'update dbo.juanpi_pintuan set modfiy_time=%s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_Info=%s, all_image_url=%s, property_info=%s, detail_info=%s, schedule=%s, is_delete=%s where goods_id = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def _get_shop_name(self, data):
        '''
        Get the shop name, preferring brand_info over schedule_info.
        :param data: parsed goods data
        :return: shop name str ('' when absent)
        '''
        if data.get('brand_info') is not None:
            shop_name = data.get('brand_info', {}).get('title', '')
        else:
            shop_name = data.get('schedule_info', {}).get('brand_title', '')
        return shop_name

    def _get_detail_name_list(self, data):
        '''
        Get the spec (sku) label names.
        :param data: parsed goods data
        :return: list when ok | {} on error | 'is_delete=1' when off the shelf
        '''
        sku = data.get('skudata', {}).get('sku', [])
        # pprint(sku)
        detail_name_list = []
        if sku != []:
            try:
                if sku[0].get('av_fvalue', '') == '':
                    fav_name = ''
                    pass
                else:
                    tmp = {}
                    fav_name = data.get('skudata', {}).get('info', {}).get('fav_name', '')
                    tmp['spec_name'] = fav_name
                    detail_name_list.append(tmp)
            except IndexError:
                print('IndexError错误,此处跳过!')
                # print(sku)
                if isinstance(sku, str):
                    # an empty-str sku encodes the off-shelf case
                    if sku == '':
                        return 'is_delete=1'
                return {}
            if sku[0].get('av_zvalue', '') == '':
                zav_name = ''
            else:
                tmp = {}
                zav_name = data.get('skudata', {}).get('info', {}).get('zav_name', '')
                tmp['spec_name'] = zav_name
                detail_name_list.append(tmp)
        return detail_name_list

    def _get_price_info_list_and_price_and_taobao_price(self, data):
        '''
        Get (price_info_list, price, taobao_price).
        :param data: parsed goods data
        :return: tuple (list, highest price, lowest price)
        '''
        sku = data.get('skudata', {}).get('sku', [])  # analysis shows sku is never []
        # pprint(sku)
        price_info_list = []
        if len(sku) == 1 and sku[0].get(
                'av_fvalue', '') == '' and sku[0].get('av_zvalue') == '':
            # single default spec ({}): price == taobao_price
            price = round(float(sku[0].get('cprice')), 2)
            taobao_price = price
        else:
            # With specs: 'stock'=='1' means in stock, '0' means sold out.
            # juanpi returns no stock counts, so 'stock_tips'=='库存紧张' is
            # mapped to 10 remaining, otherwise a default of 50 is assumed.
            for item in sku:
                tmp = {}
                tmp_1 = []
                if item.get('av_fvalue', '') == '':
                    pass
                else:
                    tmp_1.append(item.get('av_fvalue'))
                if item.get('av_zvalue', '') == '':
                    pass
                else:
                    tmp_1.append(item.get('av_zvalue'))
                tmp_1 = '|'.join(tmp_1)
                if item.get('av_origin_zpic', '') != '':
                    tmp['img_url'] = item.get('av_origin_zpic', '')
                else:
                    tmp['img_url'] = ''
                if item.get('cprice', '') != '':
                    tmp['pintuan_price'] = item.get('cprice')
                    tmp['detail_price'] = item.get('sprice', '')
                    tmp['normal_price'] = item.get('price')
                else:
                    tmp['pintuan_price'] = item.get('price')
                    if item.get('sprice', '') != '':
                        tmp['detail_price'] = item.get('sprice', '')
                    else:
                        tmp['detail_price'] = item.get('price')
                    tmp['normal_price'] = item.get('price')
                if item.get('stock') == '0':
                    rest_number = '0'
                else:
                    rest_number = '50'
                    if item.get('stock_tips', '') != '' and item.get(
                            'stock_tips', '') == '库存紧张':
                        rest_number = '10'
                tmp['spec_value'] = tmp_1
                tmp['rest_number'] = rest_number
                price_info_list.append(tmp)
            # highest/lowest price across all specs
            tmp_price_list = sorted([
                round(float(item.get('pintuan_price', '')), 2)
                for item in price_info_list
            ])
            # print(tmp_price_list)
            if tmp_price_list == []:
                price = 0
                taobao_price = 0
            else:
                price = tmp_price_list[-1]
                taobao_price = tmp_price_list[0]
        return price_info_list, price, taobao_price

    def _get_p_info(self, data):
        '''
        Get the property name/value pairs.
        :param data: parsed goods data
        :return: list of {'p_name', 'p_value'} dicts
        '''
        p_info = []
        attr = data.get('goodsDetail', {}).get('attr', [])
        # print(attr)
        if attr != []:
            # str items are skipped
            p_info = [{
                'p_name': item.get('st_key'),
                'p_value': item.get('st_value')
            } for item in attr if isinstance(item, dict)]
            for item in p_info:
                if item.get('p_name') == '运费':
                    # drop the embedded html in the shipping value
                    item['p_value'] = '全国包邮(偏远地区除外)'
                # wash non-breaking spaces
                tmp_p_value = item.get('p_value', '')
                tmp_p_value = re.compile(r'\xa0').sub(' ', tmp_p_value)
                item['p_value'] = tmp_p_value
        return p_info

    def _get_div_desc(self, data):
        '''
        Build the description div html from the detail images.
        :param data: parsed goods data
        :return: html str
        '''
        div_images_list = data.get('goodsDetail', {}).get('images', [])
        tmp_div_desc = ''
        for item in div_images_list:
            tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                item)
            tmp_div_desc += tmp
        return '<div>' + tmp_div_desc + '</div>'

    def _get_goods_schedule(self, data):
        '''
        Get the sale time window of the goods.
        :param data: parsed goods data
        :return: list with one {'begin_time', 'end_time'} dict, or []
        '''
        # skudata.info holds the real sale window; baseInfo's is unreliable
        begin_time = data.get('skudata', {}).get('info', {}).get('start_time')
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        if begin_time is None or end_time is None:
            schedule = []
        else:
            schedule = [{
                'begin_time': timestamp_to_regulartime(begin_time),
                'end_time': timestamp_to_regulartime(end_time),
            }]
        return schedule

    def _get_is_delete(self, data, schedule):
        '''
        Derive the off-shelf flag.
        :param data: parsed goods data
        :param schedule: sale window from _get_goods_schedule
        :return: 0 live, 1 off the shelf
        '''
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        is_delete = 0
        # An expired end timestamp means the goods can no longer be bought.
        if schedule != []:
            if data.get('baseInfo', {}).get('end_time') is not None:
                # baseInfo end_time == '0' would also mean off-shelf, but this
                # check proved unreliable and stays disabled.
                pass
            if float(end_time) < time.time():
                # sale window already over
                is_delete = 1
        # extra off-shelf check added 2018-5-12: 'gstatus' '1' = on sale, '2' = off
        if data.get('skudata', {}).get('info', {}).get('gstatus', '1') == '2':
            is_delete = 1
        return is_delete

    def _wash_main_data(self, main_data):
        '''
        Strip bulky/unused sections from main_data.
        :param main_data: the 'detail' dict
        :return: washed dict
        '''
        try:
            main_data['commitments'] = ''
            main_data.get('discount', {})['coupon'] = ''
            main_data.get('discount', {})['coupon_index'] = ''
            main_data.get('discount', {})['vip_info'] = ''
            main_data['topbanner'] = ''
        except Exception:
            pass
        try:
            main_data.get('brand_info')['sub_goods'] = ''
        except Exception:
            pass
        return main_data

    def _get_db_update_params(self, item):
        '''Build the param tuple for the GoodsInfoAutoGet update statement.'''
        params = (
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            dumps(item['my_shelf_and_down_time'], ensure_ascii=False),
            item['delete_time'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['goods_id'],
        )
        return params

    def _get_db_insert_miaosha_params(self, item):
        '''Build the param tuple for the juanpi_xianshimiaosha insert statement.'''
        params = (
            item['goods_id'],
            item['spider_url'],
            item['username'],
            item['deal_with_time'],
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            # lists must be dumped to json (ensure_ascii=False) before insert
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # -> PropertyInfo
            item['div_desc'],                           # -> DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['tab_id'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )
        return params

    def _get_db_update_miaosha_params(self, item):
        '''Build the param tuple for the juanpi_xianshimiaosha update statement.'''
        params = (
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['goods_id'],
        )
        return params

    def _get_db_insert_pintuan_params(self, item):
        '''Build the param tuple for the juanpi_pintuan insert statement.'''
        params = (
            item['goods_id'],
            item['spider_url'],
            item['username'],
            item['deal_with_time'],
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            # lists must be dumped to json (ensure_ascii=False) before insert
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['all_sell_count'],
            dumps(item['p_info'], ensure_ascii=False),  # -> PropertyInfo
            item['div_desc'],                           # -> DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            item['pintuan_begin_time'],
            item['pintuan_end_time'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )
        return params

    def _get_db_update_pintuan_params(self, item):
        '''Build the param tuple for the juanpi_pintuan update statement.'''
        params = (
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            # item['all_sell_count'],
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            dumps(item['schedule'], ensure_ascii=False),
            item['is_delete'],
            item['goods_id'])
        return params

    def get_goods_id_from_url(self, juanpi_url):
        '''
        Extract the goods_id from a juanpi goods url.
        :param juanpi_url: url like http://shop.juanpi.com/deal/<id>...
        :return: goods_id str ('' when the url is not a juanpi deal url)
        '''
        is_juanpi_url = re.compile(r'http://shop.juanpi.com/deal/.*?').findall(
            juanpi_url)
        if is_juanpi_url != []:
            if re.compile(r'http://shop.juanpi.com/deal/(\d+).*?').findall(
                    juanpi_url) != []:
                tmp_juanpi_url = re.compile(
                    r'http://shop.juanpi.com/deal/(\d+).*?').findall(
                        juanpi_url)[0]
                if tmp_juanpi_url != '':
                    goods_id = tmp_juanpi_url
                else:
                    # kept only so pycharm does not open the url; unreachable in practice
                    juanpi_url = re.compile(r';').sub('', juanpi_url)
                    goods_id = re.compile(
                        r'http://shop.juanpi.com/deal/(\d+).*?').findall(
                            juanpi_url)[0]
                print('------>>>| 得到的卷皮商品的地址为:', goods_id)
                return goods_id
        else:
            print(
                '卷皮商品url错误, 非正规的url, 请参照格式(http://shop.juanpi.com/deal/)开头的...'
            )
            return ''

    def __del__(self):
        # Best-effort cleanup of the phantomjs driver and cached data.
        try:
            del self.my_phantomjs
            del self.result_data
        except Exception:
            pass
        gc.collect()
class TmallCommentParse(object):
    '''Crawler/parser for Tmall goods comments (rate.tmall.com list_detail_rate api).'''

    def __init__(self, logger=None):
        self.result_data = {}
        self.msg = ''
        self._set_logger(logger)
        self._set_headers()
        self.page_size = '10'
        self.comment_page_switch_sleep_time = 1.5  # sleep between comment pages
        self.my_phantomjs = MyPhantomjs()
        self.g_data = {}                 # scratch: raw goods data of the current goods
        self.random_sku_info_list = []   # scratch: every spec of the current goods

    def _get_comment_data(self, type: int, goods_id):
        '''
        Crawl the first 3 comment pages of one goods and build a CommentItem.
        :param type: platform type flag forwarded to TmallParse
        :param goods_id: goods id; '' is rejected
        :return: CommentItem ({} on failure)
        '''
        if goods_id == '' or type == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
        # First resolve the sellerId of the goods.
        try:
            seller_id = self._get_seller_id(type=type, goods_id=goods_id)
        # FIX: the original `except AssertionError or IndexError` evaluated to
        # `except AssertionError` only — IndexError was never caught.
        except (AssertionError, IndexError) as e:
            self.my_lg.error('出错goods_id: %s' % goods_id)
            self.my_lg.error(e.args[0])
            self.result_data = {}
            self.random_sku_info_list = []
            return {}
        # Then collect every spec string so a random one can be attached per comment.
        try:
            self.random_sku_info_list = self._get_random_sku_info_list()
            # self.my_lg.info(self.random_sku_info_list)
        except Exception as e:
            self.my_lg.error('出错goods_id: %s' % str(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            self.random_sku_info_list = []
            return {}

        _tmp_comment_list = []
        for current_page in range(1, 4):
            self.my_lg.info('------>>>| 正在抓取第 {0} 页的评论...'.format(
                str(current_page)))
            _url = 'https://rate.tmall.com/list_detail_rate.htm'
            params = self._set_params(goods_id=goods_id,
                                      seller_id=seller_id,
                                      current_page=current_page)
            self.headers.update({
                'referer': 'https://detail.m.tmall.com/item.htm?id=' + goods_id
            })
            # Plain (proxied) requests return no data here because the api
            # needs cookies, so the url is fetched through phantomjs instead.
            _url = _get_url_contain_params(url=_url, params=params)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=_url)
            if body == '':
                self.my_lg.error('获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))
                self.result_data = {}
                return {}
            try:
                # FIX: raw string for the jsonp-unwrapping regex (the original
                # '\(' in a plain string is an invalid escape sequence)
                _ = re.compile(r'\((.*)\)').findall(body)[0]
            except IndexError:
                _ = {}
                self.my_lg.error('索引异常! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))
            try:
                data = json.loads(_).get('rateDetail', {}).get('rateList', [])
                # pprint(data)
            except Exception:
                data = []
                self.my_lg.error(
                    'json.loads转换_出错! 出错type:{0}, goods_id:{1}'.format(
                        str(type), goods_id))
            _tmp_comment_list += data
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错type:{0}, goods_id:{1}'.format(
                str(type), goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}
        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)
        return self.result_data

    def _get_comment_list(self, _tmp_comment_list):
        '''
        Convert raw rateList entries into the target comment schema.
        :param _tmp_comment_list: raw entries from the api
        :return: list of normalized comment dicts
        :raises AssertionError: when a required field is empty
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = item.get('rateDate', '')
            assert _comment_date != '', '得到的_comment_date为空str!请检查!'
            # The api exposes no sku_info per comment, so a random spec of the
            # goods is attached instead.
            if self.random_sku_info_list == []:
                self.random_sku_info_list = ['']
            sku_info = str(choice(self.random_sku_info_list))
            _comment_content = item.get('rateContent', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)
            buyer_name = item.get('displayUserNick', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'
            # Tmall returns neither purchase quantity nor avatar; use defaults.
            quantify = 1
            head_img = ''
            # first-round comment pictures
            _comment_img_list = item.get(
                'pics', []) if item.get('pics', '') != '' else []
            if _comment_img_list != []:
                _comment_img_list = [{
                    'img_url': 'https:' + img
                } for img in _comment_img_list]
            # appended (follow-up) comment and its pictures
            _tmp_append_comment = item.get(
                'appendComment',
                {}) if item.get('appendComment', '') != '' else {}
            _append_comment_img_list = _tmp_append_comment.get(
                'pics', []) if _tmp_append_comment.get('pics',
                                                       '') != '' else []
            if _append_comment_img_list != []:
                _append_comment_img_list = [{
                    'img_url': 'https:' + img
                } for img in _append_comment_img_list]
            if _tmp_append_comment != {}:
                append_comment = {
                    'comment_date': _tmp_append_comment.get('commentTime', ''),
                    'comment': self._wash_comment(_tmp_append_comment.get('content', '')),
                    'img_url_list': _append_comment_img_list,
                }
            else:
                append_comment = {}
            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': randint(4, 5),
                'video': '',
            }]
            _ = {
                'buyer_name': buyer_name,        # buyer nickname
                'comment': comment,              # comment content
                'quantify': quantify,            # purchase quantity
                'head_img': head_img,            # avatar
                'append_comment': append_comment,  # follow-up comment
            }
            _comment_list.append(_)
        return _comment_list

    def _get_seller_id(self, type, goods_id):
        '''
        Resolve the sellerId of the goods (also caches the goods data in g_data).
        :param type: platform type flag
        :param goods_id: goods id
        :return: seller id str
        :raises AssertionError: when no seller id could be resolved
        '''
        _ = TmallParse(logger=self.my_lg)
        _g = [type, goods_id]
        self.g_data = _.get_goods_data(goods_id=_g)
        seller_id = str(self.g_data.get('seller', {}).get('userId', 0))
        # self.my_lg.info('获取到的seller_id: ' + seller_id)
        try:
            del _
        except Exception:
            pass
        # FIX: seller_id is a str, so the original `!= 0` could never fire;
        # compare against the string form of the 0 default.
        assert seller_id != '0', '获取到的seller_id为0!'
        return seller_id

    def _get_random_sku_info_list(self):
        '''
        Collect every spec string of the goods, for random per-comment picks.
        :return: de-duplicated list of spec_value strings
        '''
        assert self.g_data != {}, 'g_data为空dict'
        _t = TaoBaoLoginAndParse(logger=self.my_lg)
        # price/stock per spec label value
        price_info_list = _t._get_price_info_list(
            data=self.g_data,
            detail_value_list=_t._get_detail_name_and_value_list(
                data=self.g_data)[1])
        try:
            del _t
        except Exception:
            pass
        return list(set([_i.get('spec_value', '') for _i in price_info_list]))

    def _set_logger(self, logger):
        '''Use the given logger or create a dated file logger.'''
        if logger is None:
            self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                    '/天猫/comment/' +
                                    str(get_shanghai_time())[0:10] + '.txt',
                                    console_log_level=INFO,
                                    file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''Build the base request headers for the comment api.'''
        self.headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS) - 1)],
            'accept': '*/*',
            'referer': 'https://detail.m.tmall.com/item.htm?id=524718632348',
        }

    def _wash_comment(self, comment):
        '''
        Strip platform branding words from a comment.
        :param comment: raw comment str
        :return: washed comment str
        '''
        comment = re.compile('天猫超市|天猫国际|天猫全球购|天猫大药房|某淘|某宝').sub('', comment)
        comment = comment.replace('天猫', '').replace('淘宝', '')
        comment = re.compile('tmall|Tmall|TMALL|TAOBAO|taobao').sub(
            '', comment)
        return comment

    def _set_params(self, **kwargs):
        '''
        Build the query params for list_detail_rate.htm.
        :param kwargs: goods_id, seller_id, current_page
        :return: tuple of (key, value) pairs
        '''
        goods_id = kwargs.get('goods_id')
        seller_id = kwargs.get('seller_id')
        current_page = kwargs.get('current_page')
        callback = '_DLP_2519_der_3_currentPage_{0}_pageSize_{1}_'.format(
            str(current_page), self.page_size)
        _params = (
            ('itemId', goods_id),
            ('sellerId', seller_id),
            ('order', '3'),
            ('currentPage', str(current_page)),
            ('pageSize', self.page_size),
            ('callback', callback),
        )
        return _params

    def __del__(self):
        # Best-effort cleanup of logger, phantomjs driver and cached data.
        try:
            del self.my_lg
            del self.my_phantomjs
            del self.g_data
        except Exception:
            pass
        gc.collect()
    def run_forever(self):
        '''
        Continuously refresh mogujie pintuan (group-buy) goods already stored
        in the DB: delete expired ones, re-scrape and update the rest.
        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_mogujie_pintuan_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            self.my_phantomjs = MyPhantomjs()

            for item in result:  # row: (goods_id, pintuan_time_json, fcid, page)
                # Convert the stored 'end_time' string into a unix timestamp
                # (truncated to 10 digits / seconds).
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mogujie_pintuan = MoGuJieParse()
                # Recycle the phantomjs driver every 8 items to avoid leaks.
                if index % 8 == 0:
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs()

                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived DB connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    # is_recent_time(): 0 = expired, 2 = out of window, 1 = needs update
                    if self.is_recent_time(pintuan_end_time) == 0:
                        tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(
                            goods_id=item[0])
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 拼团开始时间为(%s), 删除成功!' % json.loads(item[1]).get('begin_time'))
                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break
                        # Must be pass, not break: goods_ids from the DB are
                        # not guaranteed to be in chronological order.
                        pass
                    else:  # returned 1: within the update window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests cannot fetch this (certificate issues), so
                        # phantomjs is used directly.
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # print(body)
                        if body == '':
                            print('获取到的body为空值! 此处跳过')
                        else:
                            try:
                                # Payload is JSON wrapped in a <pre> tag.
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}

                            if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                                # Empty docs => the goods left the campaign;
                                # remove it from the DB.
                                print('得到的docs为[]!')
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(
                                    goods_id=item[0])
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass
                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # print(tmp_item_list)
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # pintuan begin timestamp (now)
                                # NOTE: the comprehension variable 'item' only
                                # shadows the outer loop variable inside the
                                # comprehension scope (py3), outer 'item' is safe.
                                item_list = [{
                                    'goods_id': item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        self.timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        self.timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count': str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # print(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)

                                '''
                                Internally delisted goods: they are actually still on
                                sale, so update the goods data but not the on/off-shelf
                                times.
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:
                                    # print('该商品已被下架限时秒杀活动,此处将其删除')
                                    # tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(goods_id=item[0])
                                    # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                    # pass
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data()
                                    if goods_data == {}:
                                        pass
                                    else:  # normalize price fields
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        tmp_price_info_list = goods_data[
                                            'price_info_list']
                                        price_info_list = [{
                                            'spec_value': item_4.get('spec_value'),
                                            'pintuan_price': item_4.get('detail_price'),
                                            'normal_price': item_4.get('normal_price'),
                                            'img_url': item_4.get('img_url'),
                                            'rest_number': item_4.get('rest_number'),
                                        } for item_4 in tmp_price_info_list]

                                        goods_data['goods_id'] = item[0]
                                        goods_data[
                                            'price_info_list'] = price_info_list

                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle

                                else:  # still listed in the campaign
                                    for item_2 in item_list:
                                        if item_2.get('goods_id', '') == item[0]:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=item[0])
                                            goods_data = mogujie_pintuan.deal_with_data()
                                            if goods_data == {}:
                                                pass
                                            else:  # normalize price fields + pintuan times
                                                tmp_price_info_list = goods_data[
                                                    'price_info_list']
                                                price_info_list = [{
                                                    'spec_value': item_4.get('spec_value'),
                                                    'pintuan_price': item_4.get('detail_price'),
                                                    'normal_price': item_4.get('normal_price'),
                                                    'img_url': item_4.get('img_url'),
                                                    'rest_number': item_4.get('rest_number'),
                                                } for item_4 in tmp_price_info_list]

                                                goods_data['goods_id'] = item[0]
                                                goods_data[
                                                    'price_info_list'] = price_info_list
                                                goods_data['pintuan_time'] = item_2.get(
                                                    'pintuan_time', {})
                                                goods_data['pintuan_begin_time'], goods_data[
                                                    'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                                        pintuan_time=goods_data['pintuan_time'])
                                                goods_data['all_sell_count'] = item_2.get(
                                                    'all_sell_count', '')

                                                # pprint(goods_data)
                                                # print(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)
                                                sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                        else:
                                            pass

                else:  # DB connection failed for this item; skip it
                    print('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
    def get_goods_data(self, goods_id):
        '''
        Fetch and parse one chuchujie goods page (PC/wx version), filling
        self.result_data with the parsed dict.
        :param goods_id: goods id string; '' short-circuits to {}
        :return: parsed data dict, or {} on any failure
        '''
        if goods_id == '':
            self.result_data = {}
            return {}

        print('------>>>| 对应的手机端地址为: ',
              'https://m.chuchujie.com/details/detail.html?id=' + goods_id)

        # NOTE(history): a first approach POSTed the mobile API
        # (api-product.chuchujie.com api.php?method=product_detail) through a
        # proxy pool, but it always returned "bad request parameters" and was
        # abandoned. The dead code was removed; see VCS history if needed.

        '''
        2. Parse the PC goods page instead.
        '''
        tmp_url = 'http://wx.chuchujie.com/index.php?s=/WebProduct/product_detail/product_id/' + str(
            goods_id)

        # Plain requests worked at first, then stopped returning data; use
        # phantomjs instead.
        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        my_phantomjs = MyPhantomjs()
        body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        try:
            del my_phantomjs
        except:
            pass
        # print(body)
        if body == '':
            print('获取到的body为空str!')
            self.result_data = {}
            return {}

        data = {}
        try:
            data['title'] = Selector(
                text=body).css('div.zy_info_rt h3::text').extract_first()
            # NOTE(review): extract_first() returns None (not '') on a miss,
            # so this check may pass on None — confirm selector always matches.
            if data['title'] == '':
                print('title为空!')
                raise Exception

            data['sub_title'] = ''

            data['shop_name'] = Selector(text=body).css(
                'div.other.ft14.clearfix label b::text').extract_first()
            # print(data['shop_name'])

            # All sample images.
            all_img_url = [{
                'img_url': item
            } for item in list(
                Selector(
                    text=body).css('p.s_img label img::attr("src")').extract())
            ]
            # pprint(all_img_url)
            data['all_img_url'] = all_img_url

            '''p_info'''
            # The PC page has no p_info equivalent.
            data['p_info'] = []

            '''div_desc (goods description html)'''
            div_desc = Selector(text=body).css('div.s_two').extract_first()
            # print(div_desc)
            if div_desc == '':
                print('div_desc为空!请检查!')
                raise Exception
            data['div_desc'] = div_desc

            '''detail_name_list (spec label names; last <dt> is dropped)'''
            detail_name_list = Selector(text=body).css(
                'div.info-wd.bd-red dl.detail dt::text').extract()
            if len(detail_name_list) <= 1:
                detail_name_list = []
            else:
                detail_name_list = [{
                    'spec_name': item
                } for item in detail_name_list[:-1]]
            # print(detail_name_list)
            data['detail_name_list'] = detail_name_list

            # Original price and the discounted ("taobao") price.
            taobao_price = Selector(
                text=body).css('dl.detail p.price b::text').extract_first()
            price = Selector(text=body).css(
                'dl.detail dd em.yjprice::text').extract_first()
            # print(taobao_price)
            # print(price)
            try:
                # The trailing '*' in the pattern handles integer prices that
                # are not floats.
                taobao_price = re.compile(r'(\d+\.{0,1}\d*)').findall(
                    taobao_price)[0]
                price = re.compile(r'(\d+\.{0,1}\d*)').findall(price)[0]
            except IndexError:
                print('获取price失败,请检查!')
                raise IndexError

            if taobao_price == '' or price == '':
                print('获取到的taobao_price或者price为空值出错, 请检查!')
                raise Exception

            taobao_price = Decimal(taobao_price).__round__(2)
            price = Decimal(price).__round__(2)
            # print('商品促销价为: ', taobao_price, ' 商品原价为: ', price)
            data['price'] = price
            data['taobao_price'] = taobao_price

            '''price/spec/stock per variant'''
            price_info_list = self.get_price_info_list(detail_name_list, body,
                                                       price, taobao_price)
            # pprint(price_info_list)
            if price_info_list == '':
                raise Exception
            else:
                data['price_info_list'] = price_info_list

            '''sold out? (is_delete = 1 when total stock is 0)'''
            all_stock = int(
                Selector(text=body).css(
                    'dl.detail dd label em::text').extract_first())
            if all_stock == 0:
                is_delete = 1
            else:
                is_delete = 0
            data['is_delete'] = is_delete

        except Exception as e:
            # Any parse failure above funnels here; return empty result.
            print('遇到错误: ', e)
            self.result_data = {}
            return {}

        if data != {}:
            # pprint(data)
            self.result_data = data
            return data
        else:
            print('data为空!')
            self.result_data = {}  # reset so a failed fetch cannot leak into later saves
            return {}
class Zhe800Spike(object):
    '''Scraper for zhe800 flash-sale (限时秒杀) sessions.'''

    def __init__(self):
        self._set_headers()
        self.my_phantomjs = MyPhantomjs()

    def _set_headers(self):
        '''Build default request headers with a randomly chosen user-agent.'''
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'zhe800.com',
            'User-Agent': HEADERS[randint(0, 34)]  # random user-agent
        }

    def get_spike_hour_goods_info(self):
        '''
        Walk session ids from BASE_SESSION_ID to MAX_SESSION_ID (step 2),
        collect all recent flash-sale goods and insert new ones into the DB.
        :return: None
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            print('待抓取的session_id为: ', base_session_id)
            data = self._get_one_session_id_data(
                base_session_id=base_session_id)
            sleep(.2)
            if data.get('data', {}).get('blocks', []) == []:  # session id does not exist
                pass
            else:  # session id exists
                try:
                    # begin_time of the first block, truncated to a 10-digit
                    # unix timestamp string.
                    _ = str(
                        data.get('data', {}).get('blocks', [])[0].get(
                            'deal', {}).get('begin_time', ''))[:10]
                    if _ != '':
                        pass
                    elif data.get('data', {}).get('blocks', [])[0].get(
                            'showcase', {}) != {}:
                        # Future session: first block is a showcase, so read
                        # begin_time from the second block instead.
                        print('*** 未来时间 ***')
                        # pprint(data.get('data', {}))
                        _ = str(
                            data.get('data', {}).get('blocks', [])[1].get(
                                'deal', {}).get('begin_time', ''))[:10]
                    else:
                        raise Exception
                    begin_times_timestamp = int(_)  # e.g. "2017-09-28 10:00:00" already converted to an int timestamp
                except Exception as e:
                    print('遇到严重错误: ', e)
                    continue

                print('秒杀时间为: ',
                      timestamp_to_regulartime(begin_times_timestamp))
                if self.is_recent_time(
                        timestamp=begin_times_timestamp):  # sale date is valid
                    try:
                        data = [
                            item_s.get('deal', {}) for item_s in data.get(
                                'data', {}).get('blocks', [])
                        ]
                    except Exception as e:
                        print('遇到严重错误: ', e)
                        continue
                    # pprint(data)

                    if data != []:  # there is deal data
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(
                            data=data)
                        # pprint(miaosha_goods_list)

                        zhe_800 = Zhe800Parse()
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        if my_pipeline.is_connect_success:
                            # Goods already stored for this site.
                            sql_str = r'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
                            db_goods_id_list = [
                                item[0] for item in list(
                                    my_pipeline._select_table(sql_str=sql_str))
                            ]

                            for item in miaosha_goods_list:
                                if item.get('zid', '') in db_goods_id_list:
                                    print('该goods_id已经存在于数据库中, 此处跳过')
                                    pass
                                else:
                                    tmp_url = 'https://shop.zhe800.com/products/' + str(
                                        item.get('zid', ''))
                                    goods_id = zhe_800.get_goods_id_from_url(
                                        tmp_url)

                                    zhe_800.get_goods_data(goods_id=goods_id)
                                    goods_data = zhe_800.deal_with_data()
                                    if goods_data == {}:  # empty parse result: skip
                                        pass
                                    else:  # enrich with sale info and insert
                                        goods_data['stock_info'] = item.get('stock_info')
                                        goods_data['goods_id'] = str(item.get('zid'))
                                        goods_data['spider_url'] = tmp_url
                                        goods_data['username'] = '******'
                                        goods_data['price'] = item.get('price')
                                        goods_data['taobao_price'] = item.get('taobao_price')
                                        goods_data['sub_title'] = item.get('sub_title')
                                        # goods_data['is_baoyou'] = item.get('is_baoyou')
                                        goods_data['miaosha_time'] = item.get('miaosha_time')
                                        goods_data['miaosha_begin_time'], goods_data[
                                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=item.get('miaosha_time'))
                                        goods_data['session_id'] = str(base_session_id)
                                        # print(goods_data['miaosha_time'])
                                        # print(goods_data)
                                        zhe_800.insert_into_zhe_800_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=my_pipeline)
                                        sleep(ZHE_800_SPIKE_SLEEP_TIME)  # throttle
                                        # sleep(2)
                        else:
                            pass
                        try:
                            del zhe_800
                        except:
                            pass
                        gc.collect()

                    else:  # this session id has no 'jsons' data
                        print('该sessionid没有相关key为jsons的数据')
                        # return {}
                        pass
                else:
                    pass
            base_session_id += 2

    def _get_one_session_id_data(self, base_session_id):
        '''
        Fetch all pages of one session id and aggregate the blocks.
        :param base_session_id: session id to fetch
        :return: dict shaped like {'data': {'blocks': [...]}}
        '''
        _data = []
        for _page in range(1, 20):
            # per_page must stay 20; other values return no data.
            tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page={1}&per_page=20'.format(
                str(base_session_id), _page)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            # print(body)
            # JSON payload arrives wrapped in a <pre> tag.
            body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)
            if body_1 != []:
                data = body_1[0]
                data = json.loads(data)
                # pprint(data)
                # print(type(data.get('data', {}).get('has_next')))
                if data.get('msg', '') == '无效场次':  # invalid session
                    print('该session_id不存在,此处跳过')
                    break
                if not data.get('data', {}).get('has_next', True):
                    print('该session_id没有下页了!!')
                    break
                else:
                    print('正在抓取该session_id的第 {0} 页...'.format(_page))
                    for _i in data.get('data', {}).get('blocks', []):
                        _data.append(_i)
            sleep(.2)

        return {
            'data': {
                'blocks': _data,
            }
        }

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful flash-sale fields from raw deal dicts.
        :param data: list of deal dicts to parse
        :return: list of normalized dicts
        '''
        miaosha_goods_list = []
        for item in data:
            # pprint(item)
            tmp = {}
            # Sale start / end times (10-digit unix timestamps in source).
            tmp['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(str(item.get('begin_time'))[0:10])),
                'miaosha_end_time':
                timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])),
            }
            # zhe800 goods id
            tmp['zid'] = item.get('zid')
            # free shipping flag (currently unused)
            # tmp['is_baoyou'] = item.get('is_baoyou', 0)
            # Flash-sale stock: activity_stock = remaining, stock = total.
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock', 0),
                'stock': item.get('stock', 0),
            }
            # Original price / sale price, as floats.
            tmp['price'] = float(item.get('list_price'))
            tmp['taobao_price'] = float(item.get('price'))
            # Subtitle text.
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
        # pprint(miaosha_goods_list)

        return miaosha_goods_list

    def is_recent_time(self, timestamp):
        '''
        Decide whether a sale timestamp falls in the crawl window:
        hour within [SPIDER_START_HOUR, SPIDER_END_HOUR], date not older
        than two days, same or future month/year handled specially.
        :param timestamp: unix timestamp (int or str of digits)
        :return: True or False
        '''
        time_1 = int(timestamp)
        time_2 = time.time()  # current timestamp

        time_1 = time.localtime(time_1)
        time_2 = time.localtime(time_2)
        if time_1.tm_year > time_2.tm_year:
            print('** 该年份为未来时间年份 **')
            if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:  # restrict to the configured hour window
                print('合法时间')
                # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                return True
            else:
                print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR,
                                                       SPIDER_END_HOUR))
                return False

        if time_1.tm_year == time_2.tm_year:
            if time_1.tm_mon > time_2.tm_mon:  # future month this year
                print('** 该月份为未来时间月份 **')
                if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:  # hour window check
                    print('合法时间')
                    # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                    return True
                else:
                    print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(
                        SPIDER_START_HOUR, SPIDER_END_HOUR))
                    return False

            if time_1.tm_mon >= time_2.tm_mon:  # current-or-later month (month valid)
                if time_1.tm_mday >= time_2.tm_mday - 2:  # allow up to two days in the past
                    if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:  # hour window check
                        print('合法时间')
                        # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                        return True
                    else:
                        print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(
                            SPIDER_START_HOUR, SPIDER_END_HOUR))
                        return False
                else:
                    print('该日时间已过期, 此处跳过')
                    return False
            else:  # month already past
                print('该月份时间已过期,此处跳过')
                return False
        else:
            print('非本年度的限时秒杀时间,此处跳过')
            return False

    def __del__(self):
        # Best-effort cleanup of the phantomjs driver.
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
    async def run_forever(self):
        '''
        Continuously refresh jumeiyoupin pintuan goods stored in the DB:
        delete expired ones, re-scrape and update the rest. Caches per-tab
        goods listings in self.api_all_goods_id to avoid repeated fetches.
        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(
                logger=self.my_lg)
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(result)
            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            for item in result:  # row: (goods_id, pintuan_time_json, tab, index, url)
                # Convert stored 'end_time' string into a unix timestamp
                # (truncated to 10 digits / seconds).
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived DB connection
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    # is_recent_time(): 0 = expired, 2 = out of window, 1 = needs update
                    time_number = await self.is_recent_time(pintuan_end_time)
                    if time_number == 0:
                        await tmp_sql_server.delete_jumeiyoupin_pintuan_expired_goods_id(
                            goods_id=item[0], logger=self.my_lg)
                        # NOTE(review): message says 结束时间 (end time) but the
                        # value logged is begin_time — confirm intended.
                        self.msg = '过期的goods_id为(%s)' % item[
                            0] + ', 拼团结束时间为(%s), 删除成功!' % str(
                                json.loads(item[1]).get('begin_time'))
                        self.my_lg.info(self.msg)

                    elif time_number == 2:
                        # Must be pass, not break: goods_ids from the DB are
                        # not guaranteed to be in chronological order.
                        pass

                    else:  # returned 1: within the update window
                        self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            item[0], str(index))
                        self.my_lg.info(self.msg)
                        data['goods_id'] = item[0]

                        jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)
                        _ = item[2] + '-' + str(
                            item[3])  # cache key, e.g. 'coutuan_baby-1'
                        item_list = self.api_all_goods_id.get(
                            _, [])  # tab+index listing, cached in self.api_all_goods_id
                        if item_list == []:
                            my_phantomjs = MyPhantomjs()
                            item_list = await jumeiyoupin_2.get_one_page_goods_list(
                                my_phantomjs=my_phantomjs,
                                tab=item[2],
                                index=item[3])
                            try:
                                del my_phantomjs
                            except:
                                pass

                        if item_list == []:
                            self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                            pass
                        else:
                            # Populate the cache on first successful fetch.
                            if self.api_all_goods_id.get(_) is None:
                                self.api_all_goods_id[_] = item_list

                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in item_list
                            ]
                            jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                                logger=self.my_lg)
                            # Internally delisted goods (observed: the site does
                            # not actually delist campaign goods early).
                            if item[0] not in pintuan_goods_all_goods_id:
                                await self.update_data_2(
                                    jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                    jumei_pintuan_url=item[4],
                                    goods_id=item[0],
                                    pipeline=tmp_sql_server)
                            else:  # still listed
                                await self.update_data_1(
                                    jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                    jumeiyoupin_2=jumeiyoupin_2,
                                    jumei_pintuan_url=item[4],
                                    goods_id=item[0],
                                    item_list=item_list,
                                    pipeline=tmp_sql_server)

                else:  # DB connection failed for this item; skip it
                    self.my_lg.error('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()
            self.my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

        return None
def __init__(self): self._set_headers() self.result_data = {} # self.set_cookies_key_api_uid() # 设置cookie中的api_uid的值 self.my_phantomjs = MyPhantomjs()
class ALi1688CommentParse(object):
    '''
    Comment scraper/parser for 1688 (alibaba wholesale) goods.
    '''
    def __init__(self, logger=None):
        super().__init__()
        self.result_data = {}
        self.msg = ''
        self._set_headers()
        self._set_logger(logger)
        self.my_phantomjs = MyPhantomjs()
        # Code executed dynamically inside the phantomjs driver: click the
        # 4-5 star filter tab, assert it is non-empty, then scroll down to
        # force lazy content to load. (String content is exec'd — unchanged.)
        self._exec_code = '''
self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click()
_text = str(self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').text)
print(_text)
# if _text == '四五星(0)':
assert _text != '四五星(0)', 'my assert error!'      # 通过断言来跳过执行下面的代码
sleep(2.5)
# 向下滚动10000像素
js = 'document.body.scrollTop=10000'
self.driver.execute_script(js)
sleep(4)
'''
        self._page_sleep_time = 1.2

    def _get_comment_data(self, goods_id):
        '''
        Fetch the mobile comment page for one goods id, parse up to the first
        25 comments and store a CommentItem in self.result_data.
        :param goods_id: goods id string; '' short-circuits to {}
        :return: CommentItem on success, {} on any failure
        '''
        if goods_id == '':
            self.result_data = {}
            return {}

        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
        # Originally used the PC captured API (too slow — abandoned); now the
        # mobile remark page is rendered via phantomjs.
        tmp_url = 'https://m.1688.com/page/offerRemark.htm?offerId=' + str(
            goods_id)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=tmp_url, exec_code=self._exec_code)
        # self.my_lg.info(str(body))
        if body == '':
            self.result_data = {}
            self.my_lg.error('该地址的body为空值, 出错地址: ' + tmp_url)
            return {}

        _html_comment_list = list(
            Selector(text=body).css('div.remark-item').extract())
        if _html_comment_list != []:
            _comment_list = []
            for index, item in enumerate(_html_comment_list):
                if index > 25:  # keep only the first 25 comments
                    break
                buyer_name = str(
                    Selector(
                        text=item).css('span.member::text').extract_first())
                quantify = str(
                    Selector(
                        text=item).css('span.amount::text').extract_first())
                try:
                    quantify = int(re.compile(r'\d+').findall(quantify)[0])
                except IndexError:
                    self.my_lg.error('获取quantify时索引异常! 出错地址: ' + tmp_url)
                    self.result_data = {}
                    return {}

                comment_date = str(
                    Selector(
                        text=item).css('div.date span::text').extract_first())
                # Append a random time-of-day to the bare date string,
                # yielding e.g. '2017-01-25 17:06:00'.
                comment_date = self._get_comment_date(comment_date)

                tmp_sku_info = str(
                    Selector(text=item).css('div.date::text').extract_first())
                comment = [{
                    'comment': self._wash_comment(
                        str(
                            Selector(text=item).css(
                                'div.bd::text').extract_first())),
                    'comment_date': comment_date,  # comment creation date
                    'sku_info': re.compile(r'<span.*?</span>').sub(
                        '', tmp_sku_info),  # purchased spec (span tags stripped)
                    'img_url_list': [],
                    'star_level': randint(3, 5),  # star rating (randomized)
                    'video': '',
                }]
                _ = {
                    'buyer_name': buyer_name,    # buyer nickname
                    'comment': comment,          # comment content
                    'quantify': quantify,        # purchase quantity
                    'head_img': '',              # avatar (not available)
                    'append_comment': {},        # follow-up comment (none)
                }
                _comment_list.append(_)

            _t = datetime.datetime.now()
            _r = CommentItem()
            _r['goods_id'] = str(goods_id)
            _r['create_time'] = _t
            _r['modify_time'] = _t
            _r['_comment_list'] = _comment_list
            self.result_data = _r
            # pprint(self.result_data)

            return self.result_data

        else:
            self.my_lg.error('该商品的comment为空list! 出错地址: ' + tmp_url)
            self.result_data = {}
            return {}

        # NOTE(history): a commented-out alternative implementation that hit
        # the mobile positive-review API page by page (using _set_params /
        # _set_url below) was removed here; see VCS history if needed.

    def _wash_comment(self, comment: str):
        '''
        Strip platform-identifying words (1688/alibaba references) from a
        comment string.
        :param comment: raw comment text
        :return: cleaned comment text
        '''
        comment = re.compile('阿里巴巴').sub('', comment)
        comment = re.compile('1688|合作|阿里').sub('', comment)

        return comment

    def _set_headers(self):
        '''Build default request headers; the ali-ss cookie is required.'''
        self.headers = {
            # ali-ss below is a required field
            'cookie': 'ali-ss=eyJ1c2VySWQiOm51bGwsImxvZ2luSWQiOm51bGwsInNpZCI6bnVsbCwiZWNvZGUiOm51bGwsIm1lbWJlcklkIjpudWxsLCJzZWNyZXQiOiI5WmZucV96VDl6NDhTOTg4WkNsaFpxSEwiLCJfZXhwaXJlIjoxNTI0MTE5MzI3NDQ5LCJfbWF4QWdlIjo4NjQwMDAwMH0=; ',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS) - 1)],
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'referer': 'https://m.1688.com/page/offerRemark.htm?offerId=42735065607',
            'x-requested-with': 'XMLHttpRequest',
        }

    def _set_logger(self, logger):
        '''
        Use the caller-supplied logger, or create a per-day file logger
        under the 1688 comment log directory.
        :param logger: existing logger instance or None
        '''
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/阿里1688/comment/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_url(self, url, params):
        '''
        Assemble the full API url from a base url and (name, value) pairs.
        :param url: base url
        :param params: iterable of (name, value) pairs
        :return: str full url with query string
        '''
        _ = [item[0] + '=' + str(item[1]) for item in params]

        return url + '?' + '&'.join(_)

    def _set_params(self, goods_id, page_num: int):
        '''
        Build the query parameters for the mobile remark-list API.
        :param goods_id: goods id string
        :param page_num: 1-based page number
        :return: tuple of (name, value) pairs
        '''
        data = json.dumps({
            'data': {
                'offerId': goods_id,
                # 'receiveUserId': 2318703732,
                'starLevel': 7,
                'itemId': int(goods_id),
                'bizType': 'trade',
                'page': page_num,
                'pageSize': 5,
            }
        })
        params = (
            ('_csrf', 'xMrEnTz7-VByOlidz0AzkXFg_ifMZBv6bCA0'),
            ('__wing_navigate_type', 'view'),
            ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
            # ('__wing_navigate_options', '{"data":{"offerId":"42735065607","receiveUserId":2318703732,"starLevel":7,"itemId":42735065607,"bizType":"trade","page":1,"pageSize":5}}'),
            ('__wing_navigate_options', data),
            # cache-buster: whole-second timestamp + 3 random digits
            ('_', str(time.time().__round__()) + str(randint(100, 999))),
        )

        return params

    def json_str_2_dict(self, json_str):
        '''
        Parse a JSON string, returning {} (and logging) on failure.
        :param json_str: JSON text
        :return: parsed dict, or {} on error
        '''
        try:
            data = json.loads(json_str)
        except:
            self.my_lg.error('json.loads转换json_str时出错!请检查!')
            data = {}

        return data

    def _get_comment_date(self, comment_date):
        '''
        Append a random zero-padded HH:MM:SS to a bare date string.
        :param comment_date: e.g. '2017-12-04'
        :return: e.g. '2017-12-04 07:05:09'
        '''
        _ = str(randint(0, 23))
        if len(_) == 1:
            _hour = '0' + _
        else:
            _hour = _
        _ = str(randint(0, 59))
        if len(_) == 1:
            _min = '0' + _
        else:
            _min = _
        _ = str(randint(0, 59))
        if len(_) == 1:
            _s = '0' + _
        else:
            _s = _
        comment_date = comment_date + ' ' + _hour + ':' + _min + ':' + _s

        return comment_date

    def __del__(self):
        # Best-effort cleanup; attributes may already be gone at interpreter
        # shutdown, hence the blanket except.
        try:
            del self.my_phantomjs
            del self.my_lg
            del self.msg
        except:
            pass
        gc.collect()
    def get_pintuan_goods_info(self):
        '''
        Crawl the recent mogujie pintuan (group-buy) goods lists for every
        configured category (self.fcid_dict) and hand the collected items to
        self.deal_with_data().
        :return: None
        '''
        goods_list = []

        # NOTE(history): "method one" — the mobile pintuan list API — was
        # abandoned because its mw-sign request signature could not be
        # reproduced. The dead code was removed here; see VCS history.

        '''
        方法二: fetch the pintuan goods lists from the PC site instead.
        '''
        self.my_phantomjs = MyPhantomjs()
        for key in self.fcid_dict:
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                # Recycle the phantomjs driver every 5 pages to avoid leaks.
                if index % 5 == 0:
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs()

                fcid = self.fcid_dict[key]
                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)

                # requests worked initially but got filtered; use phantomjs.
                # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)
                # print(body)

                try:
                    # JSON payload arrives wrapped in a <pre> tag.
                    body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                    tmp_data = json.loads(body)
                except:
                    print('json.loads转换body时出错, 请检查')
                    continue

                if tmp_data.get('result', {}).get('wall', {}).get('docs',
                                                                  []) == []:
                    # Empty docs: no more pintuan data for this category.
                    break

                # pprint(tmp_data)
                # print(tmp_data)

                tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get(
                    'docs', [])
                # print(tmp_item_list)
                # pprint(tmp_item_list)

                begin_time_timestamp = int(time.time())  # pintuan begin timestamp (now)
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time':
                        self.timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time':
                        self.timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp,
                                item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)

                for item_1 in item_list:
                    goods_list.append(item_1)
                sleep(MOGUJIE_SLEEP_TIME)

        # Hand off the collected goods for processing/storage.
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)
class MoGuJiePinTuan(object):
    """Crawler for mogujie.com time-limited group-buy ("pintuan") goods:
    pages the PC list-search API per category and stores item details."""

    def __init__(self):
        # Desktop request headers (kept even though PhantomJS does the fetching).
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'api.mogujie.com',
            'Referer': 'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92',
            'User-Agent': HEADERS[randint(0, 34)],  # pick a random user-agent
        }
        # Category name -> fcid parameter of the list-search API.
        self.fcid_dict = {
            '女装': 10053171,
            # '精选': 10053172,
            '男友': 10053173,
            '内衣': 10053174,
            '女鞋': 10053175,
            '包包': 10053176,
            '美妆': 10053177,
            '生活': 10053178,
            '配饰': 10053179,
            '母婴': 10053180,
            '食品': 10053181,
        }

    def get_pintuan_goods_info(self):
        '''
        Build the list-search URLs and collect every recent time-limited
        group-buy item for each category, then hand the collected list
        to self.deal_with_data().

        :return: None
        '''
        goods_list = []
        '''
        方法一: 蘑菇街手机版拼团商品列表获取签名无法破解,所以不用手机端的方法来获取数据
        '''
        # NOTE(review): the original commented-out draft of approach one — a
        # signed POST against api.mogujie.com/h5/mwp.darwin.get/3/ with
        # mw-appkey/mw-t/mw-uuid/mw-ttid and a proxy from MyIpPools — was
        # abandoned because the mobile mw-sign signature could not be
        # reproduced; the PC list API below is used instead.
        '''
        方法二: 通过pc端来获取拼团商品列表
        '''
        self.my_phantomjs = MyPhantomjs()
        for key in self.fcid_dict:
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                # Recycle the PhantomJS driver every 5 pages to keep its
                # memory footprint from growing unbounded.
                if index % 5 == 0:
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs()

                fcid = self.fcid_dict[key]
                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)

                # Plain requests started being filtered (it worked at first),
                # so PhantomJS is used instead.
                # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)
                # print(body)
                try:
                    # The JSON payload is rendered inside a <pre> tag.
                    body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                    tmp_data = json.loads(body)
                except:
                    print('json.loads转换body时出错, 请检查')
                    continue

                if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                    # Empty docs means no more pintuan pages for this category.
                    break

                # pprint(tmp_data)
                # print(tmp_data)
                tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                # print(tmp_item_list)
                # pprint(tmp_item_list)

                begin_time_timestamp = int(time.time())  # "now" is used as the pintuan begin timestamp
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time': self.timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time': self.timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp, item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]

                print(item_list)
                for item_1 in item_list:
                    goods_list.append(item_1)
                sleep(MOGUJIE_SLEEP_TIME)

        # Persist the collected goods_list.
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)

    def deal_with_data(self, *params):
        '''
        Parse each collected pintuan item's detail page and store it,
        skipping goods_ids already present in the database.

        :param params: params[0] is the goods_list built by get_pintuan_goods_info()
        :return: None
        '''
        goods_list = params[0]

        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            # goods_ids already stored, used to skip duplicates.
            db_goods_id_list = [
                item[0] for item in list(
                    my_pipeline.select_mogujie_pintuan_all_goods_id())
            ]
            print(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://shop.mogujie.com/detail/' + str(
                        goods_id)

                    mogujie.get_goods_data(goods_id=str(goods_id))
                    goods_data = mogujie.deal_with_data()

                    if goods_data == {}:  # parser returned no data: skip
                        pass
                    else:
                        # Normalise price_info_list to the storage schema.
                        tmp_price_info_list = goods_data['price_info_list']
                        price_info_list = [{
                            'spec_value': item_4.get('spec_value'),
                            'pintuan_price': item_4.get('detail_price'),
                            'normal_price': item_4.get('normal_price'),
                            'img_url': item_4.get('img_url'),
                            'rest_number': item_4.get('rest_number'),
                        } for item_4 in tmp_price_info_list]

                        goods_data['price_info_list'] = price_info_list
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['pintuan_time'] = item.get(
                            'pintuan_time', {})
                        goods_data['pintuan_begin_time'], goods_data[
                            'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                pintuan_time=item.get('pintuan_time', {}))
                        goods_data['all_sell_count'] = item.get(
                            'all_sell_count', '')
                        goods_data['fcid'] = str(item.get('fcid'))
                        goods_data['page'] = str(item.get('page'))
                        goods_data['sort'] = str(item.get('sort', ''))

                        # pprint(goods_data)
                        # print(goods_data)
                        mogujie.insert_into_mogujie_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)

                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
        else:
            print('数据库连接失败,此处跳过!')
            pass

        try:
            del mogujie
        except:
            pass
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Convert a human-readable remaining-time string into an absolute
        end timestamp.

        :param begin_time: begin timestamp in seconds (int)
        :param left_time: remaining-time string, e.g. '6天13小时' / '13小时57分' / '36分'
        :return: end timestamp in seconds (int)
        '''
        # Observed input shapes:
        # 'leftTimeOrg': '6天13小时'
        # 'leftTimeOrg': '13小时57分'
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        had_min = re.compile(r'分').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        if had_day != [] and had_hour != []:
            # left_time shaped like '6天13小时'
            day, hour, min = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:
            # left_time shaped like '13小时57分'
            day, hour, min = 0, int(tmp[0]), int(tmp[1])
        elif had_day == [] and had_hour == []:
            # left_time shaped like '36分'
            print('left_time = ', left_time)
            day, hour, min = 0, 0, int(tmp[0])
        else:
            # no day/hour/minute components at all
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, min = 0, 0, 0

        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            min * 60
        return begin_time + left_end_time_timestamp

    def timestamp_to_regulartime(self, timestamp):
        '''
        Format a unix timestamp as a local-time string.
        '''
        # Convert to local struct_time, then re-format with strftime.
        time_local = time.localtime(int(timestamp))
        # print(time_local)
        # New format, e.g. 2016-05-05 20:28:54
        dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
        return dt

    def get_pintuan_begin_time_and_pintuan_end_time(self, pintuan_time):
        '''
        Return the pintuan begin and end times.

        :param pintuan_time: dict with 'begin_time' / 'end_time' strings
        :return: tuple (pintuan_begin_time, pintuan_end_time) as datetime objects
        '''
        pintuan_begin_time = pintuan_time.get('begin_time')
        pintuan_end_time = pintuan_time.get('end_time')
        # Parse the '%Y-%m-%d %H:%M:%S' strings into datetime objects.
        pintuan_begin_time = datetime.datetime.strptime(
            pintuan_begin_time, '%Y-%m-%d %H:%M:%S')
        pintuan_end_time = datetime.datetime.strptime(pintuan_end_time,
                                                      '%Y-%m-%d %H:%M:%S')
        return pintuan_begin_time, pintuan_end_time

    def __del__(self):
        # Best-effort cleanup of the PhantomJS driver.
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
def __init__(self):
    """Initialise the JuanPi parser: base-class setup, request headers,
    result store, and a PhantomJS driver."""
    # Zero-arg super() (Python 3 idiom), consistent with the other
    # parser __init__ in this file that already uses super().__init__().
    super().__init__()
    self._set_headers()
    # Parsed goods data for the item currently being processed.
    self.result_data = {}
    self.my_phantomjs = MyPhantomjs()
class Zhe_800_Miaosha_Real_Time_Update(object):
    """Real-time updater for stored zhe800.com flash-sale ("miaosha") goods:
    deletes expired/off-shelf rows and refreshes price/stock for items
    inside the update window."""

    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'zhe800.com',
            'User-Agent': HEADERS[randint(0, 34)]  # pick a random user-agent
        }
        self.my_phantomjs = MyPhantomjs()

    def run_forever(self):
        '''
        Single update pass: only items that started within the last two days
        or start within the next two hours are refreshed; items further in
        the future (still at original price) are left untouched.

        :return: None
        '''
        #### real-time update pass
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server.select_zhe_800_xianshimiaosha_all_goods_id())
        except TypeError as e:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # item layout (from the select): [goods_id, miaosha_time_json, session_id]
                miaosha_begin_time = json.loads(
                    item[1]).get('miaosha_begin_time')
                # Convert the '%Y-%m-%d %H:%M:%S' string to a 10-digit unix timestamp.
                miaosha_begin_time = int(
                    str(
                        time.mktime(
                            time.strptime(miaosha_begin_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_begin_time)
                data = {}  # declared (and later discarded) per item to keep memory low
                zhe_800_miaosha = Zhe800Parse()
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale long-lived connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(miaosha_begin_time) == 0:
                        # Expired: remove from the database.
                        tmp_sql_server.delete_zhe_800_expired_goods_id(
                            goods_id=item[0])
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀开始时间为(%s), 删除成功!'
                            % json.loads(item[1]).get('miaosha_begin_time'))
                    elif self.is_recent_time(miaosha_begin_time) == 2:
                        # break
                        # pass (not break): goods_ids returned by the db are not ordered.
                        pass
                    else:
                        # is_recent_time == 1: inside the update window.
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]
                        # print('------>>>| 爬取到的数据为: ', data)

                        tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page=1&per_page=1000'.format(
                            str(item[2]))

                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # The JSON payload is rendered inside a <pre> tag.
                        body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(
                            body)
                        if body_1 != []:
                            tmp_data = body_1[0]
                            tmp_data = json.loads(tmp_data)
                            # pprint(tmp_data)
                            if tmp_data.get('data', {}).get('blocks',
                                                            []) == []:
                                # session_id no longer exists.
                                print('该session_id不存在,此处跳过')
                                pass
                            else:
                                tmp_data = [
                                    item_s.get('deal', {})
                                    for item_s in tmp_data.get('data', {}).get(
                                        'blocks', [])
                                ]
                                if tmp_data != []:
                                    # Non-empty: the session still has deals.
                                    miaosha_goods_list = self.get_miaoshao_goods_info_list(
                                        data=tmp_data)
                                    # pprint(miaosha_goods_list)

                                    # All zids currently present in this session.
                                    miaosha_goods_all_goods_id = [
                                        i.get('zid')
                                        for i in miaosha_goods_list
                                    ]

                                    if item[0] not in miaosha_goods_all_goods_id:
                                        # Item was taken off this flash sale: delete.
                                        print('该商品已被下架限时秒杀活动,此处将其删除')
                                        tmp_sql_server.delete_zhe_800_expired_goods_id(
                                            goods_id=item[0])
                                        print('下架的goods_id为(%s)' % item[0],
                                              ', 删除成功!')
                                        pass
                                    else:
                                        # Still on sale: refresh its data.
                                        for item_1 in miaosha_goods_list:
                                            if item_1.get('zid',
                                                          '') == item[0]:
                                                zhe_800_miaosha.get_goods_data(
                                                    goods_id=item[0])
                                                goods_data = zhe_800_miaosha.deal_with_data(
                                                )
                                                if goods_data == {}:
                                                    # Parser returned no data: skip.
                                                    pass
                                                else:
                                                    goods_data[
                                                        'stock_info'] = item_1.get(
                                                            'stock_info')
                                                    goods_data[
                                                        'goods_id'] = str(
                                                            item_1.get('zid'))
                                                    # goods_data['username'] = '******'
                                                    # Only overwrite prices while flash-sale stock remains.
                                                    if item_1.get(
                                                            'stock_info'
                                                    ).get('activity_stock'
                                                          ) > 0:
                                                        goods_data[
                                                            'price'] = item_1.get(
                                                                'price')
                                                        goods_data[
                                                            'taobao_price'] = item_1.get(
                                                                'taobao_price')
                                                    else:
                                                        pass
                                                    goods_data[
                                                        'sub_title'] = item_1.get(
                                                            'sub_title')
                                                    goods_data[
                                                        'miaosha_time'] = item_1.get(
                                                            'miaosha_time')
                                                    goods_data[
                                                        'miaosha_begin_time'], goods_data[
                                                            'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                                                                miaosha_time=
                                                                item_1.get(
                                                                    'miaosha_time'
                                                                ))
                                                    # print(goods_data['stock_info'])
                                                    # print(goods_data['miaosha_time'])
                                                    zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
                                                        data=goods_data,
                                                        pipeline=tmp_sql_server
                                                    )
                                            else:
                                                pass
                                else:
                                    # The session has no deal data: delete its item.
                                    print('该sessionid没有相关key为jsons的数据')
                                    # return {}
                                    tmp_sql_server.delete_zhe_800_expired_goods_id(
                                        goods_id=item[0])
                                    print(
                                        '过期的goods_id为(%s)' % item[0],
                                        ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(
                                            item[1]).get('miaosha_begin_time'))
                                    pass
                        else:
                            print('获取到的data为空!')
                            # return {}
                            pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del tmall
                # except:
                #     pass
                # sleep(.8)
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        # Past midnight, back off for 5.5 hours; otherwise pause briefly.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()

    def get_miaosha_begin_time_and_miaosha_end_time(self, miaosha_time):
        '''
        Return the flash-sale begin and end times.

        :param miaosha_time: dict with 'miaosha_begin_time' / 'miaosha_end_time' strings
        :return: tuple (miaosha_begin_time, miaosha_end_time) as datetime objects
        '''
        miaosha_begin_time = miaosha_time.get('miaosha_begin_time')
        miaosha_end_time = miaosha_time.get('miaosha_end_time')
        # Parse the '%Y-%m-%d %H:%M:%S' strings into datetime objects.
        miaosha_begin_time = datetime.datetime.strptime(
            miaosha_begin_time, '%Y-%m-%d %H:%M:%S')
        miaosha_end_time = datetime.datetime.strptime(miaosha_end_time,
                                                      '%Y-%m-%d %H:%M:%S')
        return miaosha_begin_time, miaosha_end_time

    def is_recent_time(self, timestamp):
        '''
        Classify a flash-sale begin timestamp relative to now.

        :param timestamp: unix timestamp
        :return: 0: expired (price restored)  1: inside the update window  2: in the future
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp
        diff_time = time_1 - time_2
        # 72h grace window so the backend can take items down in sync;
        # only the past 48h plus the next 2h actually get updated.
        if diff_time < -259200:
            # if diff_time < -172800:  # previous window: 48 hours
            return 0  # expired, price restored
        elif diff_time > -172800 and diff_time < 7200:
            return 1  # yesterday/today: needs updating
        else:
            return 2  # future: no update needed yet

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful fields from raw flash-sale deal dicts.

        :param data: raw deal dicts to parse
        :return: list of normalised dicts
        '''
        miaosha_goods_list = []
        for item in data:
            # pprint(item)
            tmp = {}
            # Flash-sale begin and end time (timestamps truncated to 10 digits).
            tmp['miaosha_time'] = {
                'miaosha_begin_time':
                self.timestamp_to_regulartime(
                    int(str(item.get('begin_time'))[0:10])),
                'miaosha_end_time':
                self.timestamp_to_regulartime(
                    int(str(item.get('end_time'))[0:10])),
            }
            # zhe800 item id.
            tmp['zid'] = item.get('zid')
            # Free shipping flag (unused).
            # tmp['is_baoyou'] = item.get('is_baoyou', 0)
            # Flash-sale stock info.
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock', 0),  # remaining flash-sale units
                'stock': item.get('stock', 0),  # total flash-sale stock
            }
            # Original price.
            tmp['price'] = float(item.get('list_price'))
            # Flash-sale price, float.
            tmp['taobao_price'] = float(item.get('price'))
            # Subtitle.
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
        # pprint(miaosha_goods_list)
        return miaosha_goods_list

    def timestamp_to_regulartime(self, timestamp):
        '''
        Format a unix timestamp as a local-time string.
        '''
        # Convert to local struct_time, then re-format with strftime.
        time_local = time.localtime(timestamp)
        # New format, e.g. 2016-05-05 20:28:54
        dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
        return dt

    def __del__(self):
        # Best-effort cleanup of the PhantomJS driver.
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
class JuMeiYouPinSpike(object):
    """Crawler for Jumei Youpin (h5.jumei.com) time-limited flash-sale goods:
    pages the mobile AJAX list API (on-sale + pre-sale) and stores details."""

    def __init__(self):
        self._set_headers()

    def _set_headers(self):
        # Mobile-site AJAX headers; a session cookie is appended later
        # in get_spike_hour_goods_info().
        self.headers = {
            'Accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'h5.jumei.com',
            'Referer': 'https://h5.jumei.com/',
            'Cache-Control': 'max-age=0',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': HEADERS[randint(0, 34)],  # pick a random user-agent
        }

    def get_spike_hour_goods_info(self):
        '''
        Collect all current flash-sale items (on-sale then pre-sale) from the
        mobile AJAX list API and hand them to deal_with_data().

        :return: False when the session cookie could not be obtained, True otherwise
        '''
        all_goods_list = []

        # A PhantomJS session is used only to obtain valid cookies.
        self.my_phantomjs = MyPhantomjs()
        cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        try:
            del self.my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        print('开始抓取在售商品...')
        for page in range(1, 50):  # pages start at 1
            tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(str(page))
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            # print(body)
            try:
                json_body = json.loads(body)
                # print(json_body)
            except:
                print('json.loads转换body时出错!请检查')
                json_body = {}
                pass
            this_page_item_list = json_body.get('item_list', [])
            if this_page_item_list == []:
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break

            for item in this_page_item_list:
                # De-duplicate by item_id before appending.
                if item.get('item_id', '') not in [item_1.get('item_id', '') for item_1 in all_goods_list]:
                    item['page'] = page
                    all_goods_list.append(item)
            sleep(.5)

        print('开始抓取预售商品...')
        for page in range(1, 50):  # pages start at 1
            tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=pre&page_key=1521858480'.format(str(page))
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            # print(body)
            try:
                json_body = json.loads(body)
                # print(json_body)
            except:
                print('json.loads转换body时出错!请检查')
                json_body = {}
                pass
            this_page_item_list = json_body.get('item_list', [])
            if this_page_item_list == []:
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break

            for item in this_page_item_list:
                if item.get('item_id', '') not in [item_1.get('item_id', '') for item_1 in all_goods_list]:
                    item['page'] = page
                    all_goods_list.append(item)
            sleep(.5)

        # Reduce raw items to the fields needed downstream.
        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', all_goods_list.__len__())

        self.deal_with_data(all_goods_list)

        return True

    def deal_with_data(self, *params):
        '''
        Parse each collected flash-sale item's detail and store it,
        skipping goods_ids already present in the database.

        :param params: params[0] is the item list from get_spike_hour_goods_info()
        :return: None
        '''
        item_list = params[0]

        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, page, goods_url from dbo.jumeiyoupin_xianshimiaosha where site_id=26'
            db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=sql_str))]
            # print(db_goods_id_list)

            for item in item_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    jumei = JuMeiYouPinParse()
                    goods_id = item.get('goods_id', '')
                    type = item.get('type', '')
                    tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(goods_id, type)
                    jumei.get_goods_data(goods_id=[goods_id, type])
                    goods_data = jumei.deal_with_data()

                    if goods_data == {}:
                        # Parser returned no data: skip.
                        pass
                    elif goods_data.get('is_delete', 0) == 1:
                        # Sold out.
                        print('------>>>| 该商品库存为0,已被抢光!')
                        pass
                    else:
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                        goods_data['page'] = item.get('page')

                        # pprint(goods_data)
                        # print(goods_data)
                        jumei.insert_into_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)

                    # Throttle; PhantomJS init is slow enough that no extra sleep is needed.
                    sleep(JUMEIYOUPIN_SLEEP_TIME)
                    try:
                        del jumei
                    except:
                        pass
        else:
            print('数据库连接失败,此处跳过!')
            pass

        gc.collect()

    def get_miaosha_begin_time_and_miaosha_end_time(self, miaosha_time):
        '''
        Return the flash-sale begin and end times.

        :param miaosha_time: dict with 'miaosha_begin_time' / 'miaosha_end_time' strings
        :return: tuple (miaosha_begin_time, miaosha_end_time) as datetime objects
        '''
        miaosha_begin_time = miaosha_time.get('miaosha_begin_time')
        miaosha_end_time = miaosha_time.get('miaosha_end_time')
        # Parse the '%Y-%m-%d %H:%M:%S' strings into datetime objects.
        miaosha_begin_time = datetime.datetime.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')
        miaosha_end_time = datetime.datetime.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')
        return miaosha_begin_time, miaosha_end_time

    def __del__(self):
        gc.collect()
class MoGuJiePinTuanRealTimesUpdate(object):
    """Real-time updater for stored mogujie pintuan goods: deletes expired
    rows and refreshes price/stock/time data for items still on sale."""

    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'list.mogujie.com',
            # 'Referer': 'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92',
            'User-Agent': HEADERS[randint(0, 34)],  # pick a random user-agent
        }

    def run_forever(self):
        '''
        Single real-time update pass over every stored pintuan goods_id.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_mogujie_pintuan_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            self.my_phantomjs = MyPhantomjs()
            for item in result:
                # item layout (from the select): [goods_id, pintuan_time_json, fcid, page]
                pintuan_end_time = json.loads(item[1]).get('end_time')
                # Convert the '%Y-%m-%d %H:%M:%S' string to a 10-digit unix timestamp.
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)
                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    # Recycle the PhantomJS driver every 8 items.
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs()
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale long-lived connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        # Expired: remove from the database.
                        tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(
                            goods_id=item[0])
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 拼团开始时间为(%s), 删除成功!'
                            % json.loads(item[1]).get('begin_time'))
                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break
                        # pass (not break): goods_ids returned by the db are not ordered.
                        pass
                    else:
                        # is_recent_time == 1: inside the update window.
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests cannot fetch this (certificate checks); use PhantomJS.
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # print(body)

                        if body == '':
                            print('获取到的body为空值! 此处跳过')
                        else:
                            try:
                                # The JSON payload is rendered inside a <pre> tag.
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}

                            if tmp_data.get('result', {}).get('wall',
                                                              {}).get('docs', []) == []:
                                # Item no longer listed: delete it.
                                print('得到的docs为[]!')
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(
                                    goods_id=item[0])
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass
                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # print(tmp_item_list)
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # "now" as the pintuan begin timestamp
                                item_list = [{
                                    'goods_id': item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time': self.timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time': self.timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count': str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # print(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:
                                    # Internally delisted, but actually still on sale:
                                    # refresh its data instead of deleting it.
                                    # print('该商品已被下架限时秒杀活动,此处将其删除')
                                    # tmp_sql_server.delete_mogujie_pintuan_expired_goods_id(goods_id=item[0])
                                    # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                    # pass
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )
                                    if goods_data == {}:
                                        pass
                                    else:
                                        # Normalise price_info_list to the storage schema.
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        tmp_price_info_list = goods_data[
                                            'price_info_list']
                                        price_info_list = [{
                                            'spec_value':
                                            item_4.get('spec_value'),
                                            'pintuan_price':
                                            item_4.get('detail_price'),
                                            'normal_price':
                                            item_4.get('normal_price'),
                                            'img_url':
                                            item_4.get('img_url'),
                                            'rest_number':
                                            item_4.get('rest_number'),
                                        } for item_4 in tmp_price_info_list]

                                        goods_data['goods_id'] = item[0]
                                        goods_data[
                                            'price_info_list'] = price_info_list

                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                else:
                                    # Still on the list: full update including times.
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == item[0]:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=item[0])
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )
                                            if goods_data == {}:
                                                pass
                                            else:
                                                # Normalise price_info_list to the storage schema.
                                                tmp_price_info_list = goods_data[
                                                    'price_info_list']
                                                price_info_list = [
                                                    {
                                                        'spec_value':
                                                        item_4.get(
                                                            'spec_value'),
                                                        'pintuan_price':
                                                        item_4.get(
                                                            'detail_price'),
                                                        'normal_price':
                                                        item_4.get(
                                                            'normal_price'),
                                                        'img_url':
                                                        item_4.get('img_url'),
                                                        'rest_number':
                                                        item_4.get(
                                                            'rest_number'),
                                                    }
                                                    for item_4 in tmp_price_info_list
                                                ]

                                                goods_data['goods_id'] = item[
                                                    0]
                                                goods_data[
                                                    'price_info_list'] = price_info_list
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data['pintuan_begin_time'], goods_data[
                                                    'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                                        pintuan_time=goods_data[
                                                            'pintuan_time'])
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                # pprint(goods_data)
                                                # print(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # throttle
                                        else:
                                            pass
                else:
                    print('数据库连接失败,此处跳过!')
                    pass
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        # Past midnight, back off for 5.5 hours; otherwise pause briefly.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def get_pintuan_begin_time_and_pintuan_end_time(self, pintuan_time):
        '''
        Return the pintuan begin and end times.

        :param pintuan_time: dict with 'begin_time' / 'end_time' strings
        :return: tuple (pintuan_begin_time, pintuan_end_time) as datetime objects
        '''
        pintuan_begin_time = pintuan_time.get('begin_time', '')
        pintuan_end_time = pintuan_time.get('end_time', '')
        # Parse the '%Y-%m-%d %H:%M:%S' strings into datetime objects.
        pintuan_begin_time = datetime.datetime.strptime(
            pintuan_begin_time, '%Y-%m-%d %H:%M:%S')
        pintuan_end_time = datetime.datetime.strptime(pintuan_end_time,
                                                      '%Y-%m-%d %H:%M:%S')
        return pintuan_begin_time, pintuan_end_time

    def timestamp_to_regulartime(self, timestamp):
        '''
        Format a unix timestamp as a local-time string.
        '''
        # Convert to local struct_time, then re-format with strftime.
        time_local = time.localtime(int(timestamp))
        # print(time_local)
        # New format, e.g. 2016-05-05 20:28:54
        dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
        return dt

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Convert a human-readable remaining-time string into an absolute
        end timestamp.

        :param begin_time: begin timestamp in seconds (int)
        :param left_time: remaining-time string, e.g. '6天13小时' / '13小时57分' / '36分'
        :return: end timestamp in seconds (int)
        '''
        # Observed input shapes:
        # 'leftTimeOrg': '6天13小时'
        # 'leftTimeOrg': '13小时57分'
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        had_min = re.compile(r'分').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        if had_day != [] and had_hour != []:
            # left_time shaped like '6天13小时'
            day, hour, min = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:
            # left_time shaped like '13小时57分'
            day, hour, min = 0, int(tmp[0]), int(tmp[1])
        elif had_day == [] and had_hour == []:
            # left_time shaped like '36分'
            print('left_time = ', left_time)
            day, hour, min = 0, 0, int(tmp[0])
        else:
            # no day/hour/minute components at all
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, min = 0, 0, 0

        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            min * 60
        return begin_time + left_end_time_timestamp

    def is_recent_time(self, timestamp):
        '''
        Classify a pintuan end timestamp relative to now.

        :param timestamp: unix timestamp
        :return: 0: expired (delete)  1: needs updating  2: recently expired, kept until the 24h mark
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp
        diff_time = time_1 - time_2
        # 24h grace window so the backend can take items down in sync.
        if diff_time < -86400:
            # if diff_time < 0:  # previous rule: end time already passed
            return 0  # expired, price restored
        elif diff_time > 0:
            return 1  # still running: needs updating
        else:
            # Expired less than 24h ago: keep (no delete) until the 24h mark.
            return 2

    def __del__(self):
        # Best-effort cleanup of the PhantomJS driver.
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
def get_spike_hour_goods_info(self):
    '''
    Collect all current flash-sale items (on-sale then pre-sale) from the
    mobile AJAX list API and hand them to self.deal_with_data().

    :return: False when the session cookie could not be obtained, True otherwise
    '''
    all_goods_list = []

    # A PhantomJS session is used only to obtain valid cookies.
    self.my_phantomjs = MyPhantomjs()
    cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
    try:
        del self.my_phantomjs
    except:
        pass
    if cookies == '':
        print('!!! 获取cookies失败 !!!')
        return False

    print('获取cookies成功!')
    self.headers.update(Cookie=cookies)

    print('开始抓取在售商品...')
    for page in range(1, 50):  # pages start at 1
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(str(page))
        print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        # print(body)
        try:
            json_body = json.loads(body)
            # print(json_body)
        except:
            print('json.loads转换body时出错!请检查')
            json_body = {}
            pass
        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            print('@@@@@@ 所有接口数据抓取完毕 !')
            break

        for item in this_page_item_list:
            # De-duplicate by item_id before appending.
            if item.get('item_id', '') not in [item_1.get('item_id', '') for item_1 in all_goods_list]:
                item['page'] = page
                all_goods_list.append(item)
        sleep(.5)

    print('开始抓取预售商品...')
    for page in range(1, 50):  # pages start at 1
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=pre&page_key=1521858480'.format(str(page))
        print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        # print(body)
        try:
            json_body = json.loads(body)
            # print(json_body)
        except:
            print('json.loads转换body时出错!请检查')
            json_body = {}
            pass
        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            print('@@@@@@ 所有接口数据抓取完毕 !')
            break

        for item in this_page_item_list:
            if item.get('item_id', '') not in [item_1.get('item_id', '') for item_1 in all_goods_list]:
                item['page'] = page
                all_goods_list.append(item)
        sleep(.5)

    # Reduce raw items to the fields needed downstream.
    all_goods_list = [{
        'goods_id': str(item.get('item_id', '')),
        'type': item.get('type', ''),
        'page': item.get('page')
    } for item in all_goods_list if item.get('item_id') is not None]
    print(all_goods_list)
    print('本次抓取到共有限时商品个数为: ', all_goods_list.__len__())

    self.deal_with_data(all_goods_list)

    return True
def deal_with_data(self, *params):
    '''
    Parse each collected flash-sale item's detail page and store it,
    skipping goods_ids already present in the database.

    :param params: params[0] is the item list to process
    :return: None
    '''
    item_list = params[0]

    chuchujie = ChuChuJie_9_9_Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # goods_ids already stored, used to skip duplicates.
        db_goods_id_list = [
            item[0] for item in list(
                my_pipeline.select_chuchujie_xianshimiaosha_all_goods_id())
        ]
        # print(db_goods_id_list)
        # my_phantomjs = MyPhantomjs()
        # my_phantomjs.init_phantomjs()
        # index = 1
        for item in item_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                goods_id = item.get('goods_id', '')
                tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(
                    goods_id)
                chuchujie.get_goods_data(goods_id=goods_id)
                goods_data = chuchujie.deal_with_data()

                if goods_data == {}:
                    # Parser returned no data: skip.
                    pass
                elif goods_data.get('is_delete', 0) == 1:
                    # is_delete=1 (stock is 0): skip.
                    print('------>>>| 该商品库存为0,已被抢光!')
                    pass
                else:
                    # A fresh PhantomJS instance per item, used only to read
                    # the countdown element from the mobile detail page.
                    my_phantomjs = MyPhantomjs()
                    my_phantomjs.init_phantomjs()

                    # Fetch the remaining-time element.
                    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                        url=tmp_url, css_selector='p#activityTime span')
                    # print(tmp_body)

                    try:
                        del my_phantomjs
                    except:
                        pass
                    gc.collect()

                    if tmp_body == '':
                        # Failed to fetch the full mobile page html.
                        sleep(.4)
                        pass
                    else:
                        # p#activityTime span
                        _t = Selector(text=tmp_body).css(
                            'p#activityTime span::text').extract_first()
                        _t = re.compile(r'剩余').sub('', _t)
                        # print(_t)
                        if _t == '' or _t is None:
                            print('获取到的_t为空值, 严重错误! 请检查!')

                        miaosha_end_time = self.get_miaosha_end_time(_t)

                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['sub_title'] = item.get('sub_title', '')
                        # Flash-sale window: begin is "now", end comes from the countdown.
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            self.timestamp_to_regulartime(int(
                                time.time())),
                            'miaosha_end_time':
                            self.timestamp_to_regulartime(
                                int(miaosha_end_time)),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        goods_data['gender'] = str(item.get('gender', '0'))
                        goods_data['page'] = item.get('page')

                        # pprint(goods_data)
                        # print(goods_data)
                        chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                            data=goods_data, pipeline=my_pipeline)

                        # sleep(CHUCHUJIE_SLEEP_TIME)  # throttle skipped: PhantomJS init is slow enough
            # index += 1
    else:
        print('数据库连接失败,此处跳过!')
        pass

    try:
        del chuchujie
    except:
        pass
    gc.collect()
class JdCommentParse(object):
    """Fetch goods comments from JD's mobile site and normalize them
    into a CommentItem (stored in self.result_data)."""

    def __init__(self, logger=None):
        self.result_data = {}   # last normalized result (CommentItem or {})
        self.msg = ''
        self._set_logger(logger)
        self._set_headers()
        self.comment_page_switch_sleep_time = 1.2   # seconds between comment pages
        self.my_phantomjs = MyPhantomjs()
        self._add_headers_cookies()

    def _get_comment_data(self, goods_id):
        """
        Fetch up to 2 pages of comments for goods_id and return the
        normalized result ({} on any failure).
        :param goods_id: JD ware id (str)
        :return: CommentItem | {}
        """
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
        self.goods_id = goods_id
        self.headers.update({
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # Collect raw comment dicts from the JD mobile comment endpoint (pages 1-2).
        _tmp_comment_list = []
        for current_page in range(1, 3):
            _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
            params = self._set_params(goods_id=goods_id, current_page=current_page)
            body = MyRequests.get_url_body(url=_url, headers=self.headers, params=params)
            # self.my_lg.info(str(body))
            _data = self._json_2_dict(body).get('wareDetailComment', {}).get('commentInfoList', [])
            _tmp_comment_list += _data
            sleep(self.comment_page_switch_sleep_time)
        # pprint(_tmp_comment_list)

        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)
        return self.result_data

    def _get_comment_list(self, _tmp_comment_list):
        '''
        Convert the raw endpoint dicts into the required result set.
        Raises AssertionError for any record missing a mandatory field
        (caught and logged by the caller).
        :param _tmp_comment_list: raw comment dicts
        :return: list of normalized comment dicts
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = item.get('commentDate', '')
            assert _comment_date != '', '得到的_comment_date为空str!请检查!'

            # sku_info: some comments carry no spec attributes, so an
            # empty string is acceptable here (no assert).
            ware_attributes = item.get('wareAttributes', [])
            # self.my_lg.info(str(ware_attributes))
            sku_info = ' '.join([
                i.get('key', '') + ':' + i.get('value', '')
                for i in ware_attributes
            ])
            # assert sku_info != '', '得到的sku_info为空str!请检查!'

            _comment_content = item.get('commentData', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = item.get('userNickName', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            # JD does not expose purchase quantity — default to 1.
            quantify = 1

            head_img = item.get('userImgURL', '')
            assert head_img != '', '得到的用户头像为空值!请检查!'
            # NOTE(review): assumes userImgURL comes scheme-less — confirm upstream.
            head_img = 'https://' + head_img

            # Pictures attached to the first comment, if any.
            _comment_img_list = item.get('pictureInfoList', [])
            if _comment_img_list != []:
                _comment_img_list = [{
                    'img_url': img.get('largePicURL', '')
                } for img in _comment_img_list]

            # Follow-up (append) comments are not provided by this endpoint.
            append_comment = {}

            star_level = int(item.get('commentScore', '5'))
            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': star_level,
                'video': '',
            }]
            _comment_list.append({
                'buyer_name': buyer_name,           # buyer nickname
                'comment': comment,                 # comment payload
                'quantify': quantify,               # purchase quantity
                'head_img': head_img,               # avatar url
                'append_comment': append_comment,   # follow-up comment
            })
        return _comment_list

    def _add_headers_cookies(self):
        # The endpoint requires cookies (in particular the sid value),
        # so bootstrap a session via phantomjs first.
        _cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://item.m.jd.com/')
        # self.my_lg.info(str(_cookies))
        self.headers.update({
            'cookie': _cookies,
        })
        return None

    def _set_logger(self, logger):
        # Use the provided logger, or fall back to a daily file logger.
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/京东/comment/'
                + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        # Base request headers; 'referer' and 'cookie' are filled in later.
        self.headers = {
            'origin': 'https://item.m.jd.com',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS) - 1)],
            'content-type': 'application/x-www-form-urlencoded',
            'accept': 'application/json',
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=5025518',
            'x-requested-with': 'XMLHttpRequest',
        }

    def _wash_comment(self, comment):
        '''
        Clean a comment: strip brand markers and newlines, rebrand "京东".
        :param comment: raw comment text
        :return: cleaned comment text
        '''
        comment = re.compile(r'jd|\n|Jd|JD').sub('', comment)
        comment = re.compile('京东').sub('优秀网', comment)
        return comment

    def _json_2_dict(self, json_str):
        '''
        Parse a json string, returning {} (and logging) on failure.
        NOTE: the error path reads self.goods_id, so this must only be
        called after _get_comment_data has set it.
        :param json_str: response body
        :return: dict
        '''
        try:
            _ = json.loads(json_str)
        except Exception:   # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
            self.my_lg.error('json.loads转换json_str时出错! 出错goods_id: ' + self.goods_id)
            return {}
        return _

    def _set_params(self, goods_id, current_page):
        '''
        Build the request params for the comment endpoint.
        :param goods_id: JD ware id
        :param current_page: 1-based page number
        :return: list of (key, value) tuples
        '''
        _params = [
            ('wareId', goods_id),
            ('offset', str(current_page)),
            ('num', '10'),
            ('checkParam', 'LUIPPTP'),
            ('category', '670_671_1105'),
            ('isUseMobile', 'true'),
            ('evokeType', ''),
            ('type', '3'),   # '0' all comments | '3' positive only
            ('isCurrentSku', 'false'),
        ]
        return _params

    def __del__(self):
        # Best-effort cleanup; attributes may be missing if __init__ failed.
        try:
            del self.my_lg
            del self.my_phantomjs
            del self.headers
        except Exception:
            pass
        gc.collect()
def run_forever(self):
    '''
    Continuously refresh the jumeiyoupin flash-sale rows already stored
    in the DB: delete expired / delisted goods, re-parse and update the
    rest, then sleep before the caller's next round.
    :return: False if the cookie bootstrap fails, otherwise None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_jumeiyoupin_xianshimiaosha_all_goods_id())
    except TypeError:
        # A TypeError here means the DB connection failed (possibly maintenance).
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        jumeiyoupin_spike = JuMeiYouPinSpike()
        # Bootstrap session cookies via a throwaway phantomjs instance.
        my_phantomjs = MyPhantomjs()
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        for item in result:
            # item layout (from the select): [0] goods_id, [1] miaosha_time json,
            # [2] page, [3] goods url — presumed from usage; verify against the query.
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            # Convert 'YYYY-mm-dd HH:MM:SS' to a unix timestamp (seconds).
            miaosha_end_time = int(str(time.mktime(time.strptime(
                miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            # Declared (and re-created) inside the loop on purpose, to release
            # memory each iteration rather than holding one big parser object.
            data = {}
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:
                # Reconnect every 50 items to avoid a stale long-lived connection.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # Expired flash sale — delete the row.
                    tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                        goods_id=item[0])
                    print('过期的goods_id为(%s)' % item[0],
                          ', 限时秒杀结束时间为(%s), 删除成功!'
                          % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # Must be pass, NOT break: goods_ids from the DB are not
                    # guaranteed to be ordered by end time.
                    pass
                else:
                    # Return value 1: within the update window.
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    data['goods_id'] = item[0]
                    this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])
                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        continue
                    elif this_page_all_goods_list == []:
                        # Page no longer lists this goods — it was delisted.
                        print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(item[0])
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        """
                        由于不会内部提前下架,所以在售卖时间内的全部进行相关更新
                        """
                        # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in this_page_all_goods_list]
                        #
                        # if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                        #     print('该商品已被下架限时秒杀活动,此处将其删除')
                        #     tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(goods_id=item[0])
                        #     print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        #     pass
                        #
                        # else:  # 未下架的
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()
                        if goods_data == {}:
                            # Empty parse result — skip.
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time':
                                    goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time':
                                    goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                jumeiyoupin_spike.get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])
                            # print(goods_data)
                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                            sleep(JUMEIYOUPIN_SLEEP_TIME)
            else:
                # DB connection failed (possibly closed or under maintenance).
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:
        # After midnight: no updates — sleep 5.5 hours.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def __init__(self):
    # Build the default request headers first, then start the PhantomJS
    # helper used to fetch JS-rendered pages.
    self._set_headers()
    self.my_phantomjs = MyPhantomjs()