예제 #1
0
def goods_name_check_and_do_something():
    """
    Detect contraband goods by name and delist them.

    Selects every row of ``dbo.GoodsInfoAutoGet`` whose ``GoodsName`` contains
    one of the keywords in ``CONTRABAND_GOODS_KEY_TUPLE`` and sets
    ``IsDelete=1`` on it, stamping ``ModfiyTime`` and ``delete_time`` with the
    current Shanghai time.

    :return: None
    :raises AssertionError: when the select query yields ``None``
    """
    if not CONTRABAND_GOODS_KEY_TUPLE:
        # Without keywords the statement would end in a bare "where",
        # which is not valid SQL.
        print('CONTRABAND_GOODS_KEY_TUPLE is empty, nothing to check!')
        return None

    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    try:
        sql_str0 = '''
    -- select count(*)
    select GoodsID, GoodsName, ConvertTime, IsDelete, delete_time, MainGoodsID
    from dbo.GoodsInfoAutoGet
    where '''
        sql_str1 = 'update dbo.GoodsInfoAutoGet set IsDelete=1, ModfiyTime=%s, delete_time=%s where GoodsID=%s'
        # One "like" placeholder per keyword, or-ed together.
        sql_str0 += ' or '.join(
            ['GoodsName like %s'] * len(CONTRABAND_GOODS_KEY_TUPLE))
        print(sql_str0)
        params = ['%{}%'.format(item) for item in CONTRABAND_GOODS_KEY_TUPLE]
        pprint(params)
        res = sql_cli._select_table(
            sql_str=sql_str0,
            params=params, )
        pprint(res)

        # Delist every matching row.
        if res is None:
            # Explicit raise instead of ``assert`` so the check survives -O.
            raise AssertionError(
                'select from dbo.GoodsInfoAutoGet returned None')
        now_time = get_shanghai_time()
        for item in res:
            goods_id = item[0]
            print('goods_id: {}'.format(goods_id))
            sql_cli._update_table(
                sql_str=sql_str1,
                params=(now_time, now_time, goods_id)
            )
    finally:
        # Drop our reference to the client even when a query failed.
        del sql_cli
예제 #2
0
    def deal_with_data(self):
        '''
        Parse ``self.result_data`` and collect the fields that get stored.

        :return: dict of parsed goods fields; ``{}`` when ``self.result_data``
            is an empty dict; the value of ``self._data_error_init()`` when
            the detail name list comes back as ``{}``
        '''
        raw = self.result_data
        if raw == {}:
            print('待处理的data为空的dict')
            return {}

        shop_name = self._get_shop_name(data=raw)
        title = self._get_title(data=raw)
        detail_name_list = self._get_detail_name_list(data=raw)

        # Special case: the goods is delisted, so flag it in the database.
        if isinstance(detail_name_list, str) \
                and detail_name_list == 'is_delete=1':
            print('该商品已下架...')
            update_sql = jp_update_str_1
            update_params = (self.result_data.get('goods_id', ''), )
            pipeline = SqlServerMyPageInfoSaveItemPipeline()
            marked = pipeline._update_table(
                sql_str=update_sql, params=update_params)
            print('### 该商品已经is_delete=1 ###' if marked
                  else 'is_delete=1标记失败!')

        if detail_name_list == {}:
            return self._data_error_init()

        price_info_list, price, taobao_price = \
            self._get_price_info_list_and_price_and_taobao_price(data=raw)
        all_img_url = self._get_all_img_url(data=raw)
        p_info = self._get_p_info(data=raw)
        div_desc = self._get_div_desc(data=raw)
        # Time window in which the goods is on sale.
        schedule = self._get_goods_schedule(data=raw)

        is_delete = self._get_is_delete(data=raw, schedule=schedule)
        # No price could be read: the goods is treated as delisted.
        if price == 0 or taobao_price == 0:
            is_delete = 1

        parsed = {
            'shop_name': shop_name,                # shop name
            'account': '',                         # shop keeper (always empty here)
            'title': title,                        # goods name
            'sub_title': '',                       # sub title (always empty here)
            'price': price,                        # goods price
            'taobao_price': taobao_price,          # taobao price
            'detail_name_list': detail_name_list,  # names of the spec attributes
            'price_info_list': price_info_list,    # price and stock per spec
            'all_img_url': all_img_url,            # all sample image urls
            'p_info': p_info,                      # detail attribute name/value pairs
            'div_desc': div_desc,                  # div_desc
            'is_delete': is_delete,                # delisted flag
            'schedule': schedule,                  # sale time window
            'parent_dir': raw.get('parent_dir', ''),
        }
        gc.collect()
        return parsed
예제 #3
0
    def get_goods_data(self, goods_id):
        '''
        Fetch the page data and the sku data of one juanpi goods.

        The phone page is rendered through ``self.driver`` and the
        ``__PRELOADED_STATE__`` JSON is cut out of it; the sku data comes from
        the webservice endpoint. On success the merged dict is also stored in
        ``self.result_data``.

        :param goods_id: goods id; '' is rejected
        :return: dict holding the washed ``detail`` data plus the keys
            'skudata', 'goods_id' and 'parent_dir'; on any failure the value
            of ``self._data_error_init()``
        '''
        if goods_id == '':
            return self._data_error_init()

        tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        print('------>>>| 得到的商品手机版的地址为: ', tmp_url)

        # History (from the original notes): plain requests worked for about a
        # month and then started to get "not Found", so the page is fetched
        # with phantomjs instead. The notes also say not to run this behind a
        # VPN/proxy that bypasses the firewall.
        body = self.driver.use_phantomjs_to_get_url_body(url=tmp_url)

        if re.compile(r'<span id="t-index">页面丢失ing</span>').findall(body) != []:
            # The page is gone: flag the goods as deleted when the database is
            # reachable, and stop in every case. Before, an unreachable
            # database let the lost page fall through to the parsing below.
            pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if pipeline.is_connect_success:
                pipeline._update_table(sql_str=jp_update_str_1,
                                       params=(goods_id, ))
                print('@@@ 逻辑删除该商品[{0}] is_delete = 1'.format(goods_id))
            del pipeline
            return self._data_error_init()

        if body == '':
            print('获取到的body为空str!请检查!')
            return self._data_error_init()

        # Greedy match of the preloaded state JSON.
        data = re.compile(
            r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)

        # Current skudata endpoint. The former one,
        # https://webservice.juanpi.com/api/getOtherInfo?goods_id=..., was
        # abandoned by the site.
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
            goods_id)

        # Copy the headers: updating self.headers in place would leave the
        # webservice 'Host' value on every later request that uses them.
        self.skudata_headers = dict(self.headers)
        self.skudata_headers.update({'Host': 'webservice.juanpi.com'})
        skudata_body = Requests.get_url_body(
            url=skudata_url,
            headers=self.skudata_headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if skudata_body == '':
            print('获取到的skudata_body为空str!请检查!')
            return self._data_error_init()

        # '.' does not cross newlines, so the first hit is the first line.
        skudata = re.compile(r'(.*)').findall(skudata_body)
        if skudata == []:
            print('skudata为空!')
            return self._data_error_init()

        skudata = json_2_dict(json_str=skudata[0]).get('skudata', {})
        if skudata == {}:
            return self._data_error_init()

        try:
            has_info = skudata.get('info') is not None
        except AttributeError as e:
            # skudata is not dict-like.
            print('遇到错误如下(先跳过!): ', e)
            return self._data_error_init()
        if not has_info:
            print('skudata中info的key为None, 返回空dict')
            return self._data_error_init()

        if data == []:
            print('data为空!')
            return self._data_error_init()

        main_data = json_2_dict(json_str=data[0])
        if main_data == {}:
            return self._data_error_init()

        if main_data.get('detail') is None:
            print('data中detail的key为None, 返回空dict')
            return self._data_error_init()

        main_data = self._wash_main_data(main_data.get('detail', {}))
        main_data['skudata'] = skudata
        main_data['goods_id'] = goods_id
        main_data['parent_dir'] = _jp_get_parent_dir(
            phantomjs=self.driver, goods_id=goods_id)
        self.result_data = main_data

        return main_data
예제 #4
0
    def get_goods_data(self, goods_id: str) -> dict:
        '''
        Override of the data-fetching method: build the data of one mia goods.

        Downloads the phone page of the goods and collects title, sub title,
        p_info, div_desc, per-spec price info, group-buy time, sell count,
        images and the final goods url. ``self.result_data`` is set to the
        returned dict on success and reset to ``{}`` on every failure, so a
        failed crawl cannot leak stale data into the next store step.

        :param goods_id: goods id; '' is rejected
        :return: the data dict, or ``{}`` on any failure
        '''
        if goods_id == '':
            self.result_data = {}
            return {}

        data = {}
        # Regular phone-side address of the goods (the pc address would be
        # https://www.mia.com/item-<goods_id>.html).
        goods_url = 'https://m.mia.com/item-' + str(goods_id) + '.html'
        print('------>>>| 待抓取的地址为: ', goods_url)

        body = MyRequests.get_url_body(url=goods_url,
                                       headers=self.headers,
                                       had_referer=True)
        if body == '':
            self.result_data = {}
            return {}

        is_mia_mian_page = Selector(
            text=body).css('div.item-center::text').extract_first()
        if isinstance(is_mia_mian_page, str) \
                and is_mia_mian_page == '进口母婴正品特卖':
            # A delisted group-buy goods is redirected to the phone home page:
            # flag it as deleted.
            print('++++++ 该拼团商品已下架,被定向到蜜芽主页, 此处将其逻辑删除!')
            self.result_data = {}
            tmp_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            sql_str = r'update dbo.mia_pintuan set is_delete=1 where goods_id = %s'
            # params must be a tuple; "(goods_id)" is just the bare string.
            tmp_pipeline._update_table(sql_str=sql_str, params=(goods_id, ))
            print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
            gc.collect()
            return {}

        # Follow a possible redirect: yields the body to parse, the redirect
        # url and is_hk (tells whether this is a global-purchase goods).
        body, sign_direct_url, is_hk = self.get_jump_to_url_and_is_hk(
            body=body)

        try:
            data['title'], data['sub_title'] = self.get_title_and_sub_title(
                body=body)

            # All sample images.
            all_img_url = self.get_all_img_url(goods_id=goods_id, is_hk=is_hk)
            if all_img_url == '':
                self.result_data = {}
                return {}

            # p_info. extract_first() gives None when nothing matches, so
            # test for "empty" rather than for ''.
            tmp_p_info = Selector(
                text=body).css('div.showblock div p').extract_first()
            if not tmp_p_info:
                print('获取到的tmp_p_info为空值, 请检查!')
                self.result_data = {}
                return {}

            tmp_p_info = re.compile('<p>|</p>').sub('', tmp_p_info)
            tmp_p_info = re.compile(r'<!--思源品牌,隐藏品牌-->').sub(
                '', tmp_p_info)
            p_info = []
            for item in tmp_p_info.split('<br>'):
                if item == '':
                    continue
                # A line without ':' raises IndexError, handled below.
                parts = item.split(':')
                p_info.append({'p_name': parts[0], 'p_value': parts[1]})
            data['p_info'] = p_info

            # div_desc of the goods.
            div_desc = self.get_goods_div_desc(body=body)
            if div_desc == '':
                print('获取到的div_desc为空值! 请检查')
                self.result_data = {}
                return {}
            data['div_desc'] = div_desc

            # goods_id, spec name and img_url of every spec, used below.
            # Original note: the page itself only offers low-resolution
            # thumbnails, so the phone address of each spec's goods_id is
            # used to get the high-resolution spec images.
            sku_info = self.get_tmp_sku_info(body, goods_id,
                                             sign_direct_url, is_hk)
            if sku_info == {}:
                self.result_data = {}
                return {}

            # Price, spec and stock per spec; fetched once and reused.
            true_sku_result = self.get_true_sku_info(sku_info=sku_info)
            if true_sku_result == {}:  # the helper signals an error with {}
                self.result_data = {}
                return {}
            true_sku_info, i_s, pintuan_time, all_sell_count = true_sku_result
            data['price_info_list'] = true_sku_info
            data['pintuan_time'] = pintuan_time
            data['all_sell_count'] = all_sell_count

            data['detail_name_list'] = self.get_detail_name_list(i_s=i_s)

            # No sample image at all: fall back to the first spec's image.
            if all_img_url == []:
                all_img_url = [{
                    'img_url': true_sku_info[0].get('img_url')
                }]
            data['all_img_url'] = all_img_url

            # Prefer the redirect target as the goods url when there is one.
            if sign_direct_url != '':
                goods_url = sign_direct_url
            data['goods_url'] = goods_url

        except Exception as e:
            print('遇到错误如下: ', e)
            self.result_data = {}
            return {}

        # data always holds at least the title here.
        self.result_data = data
        return data
예제 #5
0
    def get_goods_data(self, goods_id: str) -> dict:
        '''
        Override of the data-fetching method: build the data of one mogujie
        flash-sale goods.

        ``goods_id`` is expected to be the pc "rushdetail" url; the real goods
        id is cut out of it. ``self.result_data`` is set to the returned dict
        on success and reset to ``{}`` on every failure.

        :param goods_id: rushdetail url of the goods; '' is rejected
        :return: the data dict, or ``{}`` on any failure
        '''
        if goods_id == '':
            self.result_data = {}
            return {}

        if re.compile(r'/rushdetail/').findall(goods_id) == []:
            print('获取到的蘑菇街买哦啥地址错误!请检查')
            self.result_data = {}
            return {}

        tmp_url = goods_id
        print('------>>>| 原pc地址为: ', tmp_url)

        matched_ids = re.compile(
            r'https://shop.mogujie.com/rushdetail/(.*?)\?.*?').findall(goods_id)
        if matched_ids == []:
            # The url carries no "?..." part, so the id cannot be cut out.
            print('获取到的蘑菇街买哦啥地址错误!请检查')
            self.result_data = {}
            return {}
        goods_id = matched_ids[0]
        print('------>>>| 得到的蘑菇街商品id为:', goods_id)

        data = {}

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        if body == '':
            print('获取到的body为空str!')
            self.result_data = {}
            return {}

        try:
            goods_info = re.compile(r'var detailInfo = (.*?);</script>').findall(body)[0]
            item_info = re.compile(r'itemInfo:(.*?) ,priceRuleImg').findall(goods_info)[0]
            sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(goods_info)[0]
            shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(goods_info)[0]

            item_info = json_2_dict(json_str=item_info)
            sku_info = json_2_dict(json_str=sku_info)
            shop_info = json_2_dict(json_str=shop_info)

            data['title'] = item_info.get('title', '')
            if data['title'] == '':
                print('title为空!')
                raise Exception

            data['sub_title'] = ''
            data['shop_name'] = shop_info.get('name', '')

            # All sample images.
            data['all_img_url'] = [
                {'img_url': item} for item in item_info.get('topImages', [])]

            # p_info comes from a separate ajax endpoint.
            p_info_api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(goods_id)
            tmp_p_info_body = MyRequests.get_url_body(url=p_info_api_url, headers=self.headers, had_referer=True)
            if tmp_p_info_body == '':
                print('获取到的tmp_p_info_body为空值, 请检查!')
                raise Exception

            # An empty p_info is accepted on purpose: some goods have none.
            data['p_info'] = self.get_goods_p_info(tmp_p_info_body=tmp_p_info_body)

            # div_desc of the goods.
            div_desc = self.get_goods_div_desc(tmp_p_info_body=tmp_p_info_body)
            if div_desc == '':
                print('获取到的div_desc为空str, 请检查!')
                self.result_data = {}
                return {}
            data['div_desc'] = div_desc

            # detail_name_list ('' signals an error).
            detail_name_list = self.get_goods_detail_name_list(sku_info=sku_info)
            if detail_name_list == '':
                print('获取detail_name_list出错, 请检查!')
                self.result_data = {}
                return {}
            data['detail_name_list'] = detail_name_list

            # Price, spec and stock per spec ('' signals an error).
            price_info_list = self.get_price_info_list(sku_info=sku_info)
            if price_info_list == '':
                raise Exception
            data['price_info_list'] = price_info_list

            if price_info_list == []:
                # Sold out: flag the goods as deleted.
                print('该商品已售完,此处将商品状态改为1')
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                try:
                    sql_str = r'update dbo.mogujie_xianshimiaosha set is_delete=1 where goods_id = %s'
                    # params must be a tuple; "(goods_id)" is just the string.
                    my_pipeline._update_table(sql_str=sql_str, params=(goods_id, ))
                except Exception:
                    print('将该商品逻辑删除时出错!')
                else:
                    print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
                self.result_data = {}
                return {}

            # Goods price (highest spec price) and taobao price (lowest).
            try:
                tmp_price_list = sorted([round(float(item.get('detail_price', '')), 2) for item in data['price_info_list']])
                price = round(Decimal(tmp_price_list[-1]), 2)
                taobao_price = round(Decimal(tmp_price_list[0]), 2)
            except IndexError:
                print('获取price和taobao_price时出错! 请检查')
                raise Exception

            data['price'] = price
            data['taobao_price'] = taobao_price

        except Exception as e:
            print('遇到错误: ', e)
            self.result_data = {}
            return {}

        # data always holds at least the title here.
        self.result_data = data
        return data
예제 #6
0
class DbTimingScript(AsyncCrawler):
    """Periodic database maintenance script.

    Each round runs a fixed sequence of "select rows, then update every row"
    jobs and afterwards sleeps ``self.sleep_time`` seconds.
    """
    def __init__(self):
        AsyncCrawler.__init__(self, )
        # Pause between two rounds, in seconds.
        self.sleep_time = 2. * 60
        self.init_sql_str()

    def init_sql_str(self):
        """Define the select/update statements used by the jobs."""
        # Abnormal rows: delisted, listed again, yet still flagged as delisted
        # (IsDelete=1 while delete_time < shelf_time). Stated reason: the
        # backend cannot update them.
        self.sql_str0 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not NULL
        and IsDelete=1
        and delete_time < shelf_time
        '''
        self.sql_str1 = '''
        update dbo.GoodsInfoAutoGet
        set ModfiyTime=%s, delete_time=%s
        where GoodsID=%s
        '''
        # Rows that are delisted but have no delete_time. Stated reason: the
        # backend cannot switch them from listed to delisted.
        self.sql_str2 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=1
        and delete_time is null
        '''
        self.sql_str3 = '''
        update dbo.GoodsInfoAutoGet 
        set delete_time=%s
        where GoodsID=%s
        '''
        # Listed rows whose delete_time is later than shelf_time. Stated
        # reason: the backend cannot update delisted -> listed.
        self.sql_str4 = '''
        select top 100 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not NUll
        and IsDelete=0
        and shelf_time < delete_time
        '''
        self.sql_str5 = '''
        update dbo.GoodsInfoAutoGet 
        set ModfiyTime=%s, shelf_time=%s
        where GoodsID=%s
        '''
        # tb tiantiantejia: expired, to be delisted.
        self.sql_str6 = '''
        select top 500 goods_id, site_id
        from dbo.taobao_tiantiantejia
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time < GETDATE()
        '''
        # zhe800 flash sale: to be flagged as delisted.
        self.sql_str7 = '''
        select top 500 goods_id, site_id
        from dbo.zhe_800_xianshimiaosha
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # zhe800 group buy: expired, to be delisted.
        self.sql_str8 = '''
        select top 500 goods_id, site_id
        from dbo.zhe_800_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # mia group buy.
        self.sql_str9 = '''
        select top 500 goods_id, site_id
        from dbo.mia_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # jumeiyoupin group buy.
        self.sql_str10 = '''
        select top 500 goods_id, site_id
        from dbo.jumeiyoupin_pintuan
        where MainGoodsID is not null
        and is_delete=0
        and miaosha_end_time <= GETDATE()
        '''
        # Periodically flag the most recently updated goods as having a spec
        # change and a price change.
        self.sql_str11 = '''
        select top 200 GoodsID 
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=0
        ORDER BY ModfiyTime desc
        '''
        self.sql_str12 = '''
        update dbo.GoodsInfoAutoGet 
        set is_spec_change=1, spec_trans_time=%s, 
        ModfiyTime=%s, 
        IsPriceChange=1, sku_info_trans_time=%s, PriceChangeInfo=SKUInfo
        where GoodsID=%s
        '''

    async def _fck_run(self):
        """Run all jobs forever, sleeping ``self.sleep_time`` between rounds.

        A fresh sql client is created for every round. Any exception raised
        during a round is printed and ends that round; the loop then sleeps
        and starts over.
        """
        def pending_jobs():
            # Yielded lazily, so the arguments of a job are only looked up
            # right before that job runs.
            yield self.sql_str0, self.sql_str1, self.get_params0
            yield self.sql_str2, self.sql_str3, self.get_params1
            yield self.sql_str4, self.sql_str5, self.get_params0
            # tb tiantiantejia
            yield self.sql_str6, tb_update_str_5, self.get_params2
            # zhe800 flash sale
            yield self.sql_str7, z8_update_str_6, self.get_params2
            # zhe800 group buy
            yield self.sql_str8, z8_update_str_4, self.get_params2
            # mia group buy
            yield self.sql_str9, mia_update_str_7, self.get_params2
            # jumeiyoupin group buy
            yield self.sql_str10, jm_update_str_5, self.get_params2
            # flag recently updated goods as spec/price changed
            yield self.sql_str11, self.sql_str12, self.get_params3

        while True:
            try:
                print('now_time: {}'.format(get_shanghai_time()))
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                if not self.sql_cli.is_connect_success:
                    raise SqlServerConnectionException

                for select_sql, update_sql, params_builder in pending_jobs():
                    await self.db_script0(
                        select_sql_str=select_sql,
                        update_sql_str=update_sql,
                        func_get_params=params_builder,
                    )
            except Exception as e:
                print(e)
            finally:
                print('休眠{}s ...'.format(self.sleep_time))
                await async_sleep(self.sleep_time)

    async def db_script0(
        self,
        select_sql_str: str,
        update_sql_str: str,
        func_get_params,
    ):
        """Select rows and run one update per selected row.

        :param select_sql_str: statement that yields the rows to fix
        :param update_sql_str: statement executed once per row
        :param func_get_params: callable taking the row as keyword ``k`` and
            returning the params of the update statement
        :return: None
        """
        get_current_func_info_by_traceback(self=self)
        rows = self.sql_cli._select_table(sql_str=select_sql_str, )
        if rows is None or rows == []:
            print('目标db_res为空list! 跳过此次!')
            return None

        for row in rows:
            self.sql_cli._update_table(
                sql_str=update_sql_str,
                params=func_get_params(k=row),
            )

        return None

    def get_params0(self, k) -> tuple:
        """Params ``(now, now, goods_id)`` for a row ``(goods_id, site_id)``."""
        stamp = str(get_shanghai_time())
        goods_id, site_id = k[0], k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))

        return (stamp, stamp, goods_id)

    def get_params1(self, k) -> tuple:
        """Params ``(now, goods_id)`` for a row ``(goods_id, site_id)``."""
        stamp = str(get_shanghai_time())
        goods_id, site_id = k[0], k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))

        return (stamp, goods_id)

    def get_params2(self, k) -> tuple:
        """Params ``(now, goods_id)`` for a row ``(goods_id, site_id)``."""
        stamp = str(get_shanghai_time())
        goods_id, site_id = k[0], k[1]
        print('goods_id: {}, site_id: {}'.format(goods_id, site_id))

        return (stamp, goods_id)

    def get_params3(self, k) -> tuple:
        """Params ``(now, now, now, goods_id)`` for a row ``(goods_id, ...)``."""
        stamp = str(get_shanghai_time())
        goods_id = k[0]
        print('goods_id: {}'.format(goods_id))

        return (stamp, stamp, stamp, goods_id)

    def __del__(self):
        collect()
"""

import sys, json, re
sys.path.append('..')
from pprint import pprint

from my_pipeline import SqlServerMyPageInfoSaveItemPipeline

# One-off repair script: for every SiteID=6 row whose GoodsUrl has no "?id="
# part, insert "?id=" right after the last "htm" of the url, then write the
# url of every selected row back.
_ = SqlServerMyPageInfoSaveItemPipeline()
sql_str = r'select GoodsID, SiteID, GoodsUrl from dbo.GoodsInfoAutoGet where SiteID=6 order by ID desc'
_s = _._select_table(sql_str=sql_str)
# print(_s)

import re
# Compiled once, as raw strings so that "\?" is a regex escape and not a
# string escape.
has_id_pattern = re.compile(r'\?id=')
# Group 1: everything up to and including the last "htm"; group 2: the rest.
split_pattern = re.compile(r'(.*htm)(.*)')

# A failed select presumably yields None (TODO confirm): treat it as no rows.
tmp = [list(item) for item in (_s or [])]
for item in tmp:
    if has_id_pattern.search(item[2]) is None:
        matched = split_pattern.search(item[2])
        if matched is None:
            # No "htm" in the url: leave it unchanged instead of crashing.
            print('skip goods_id {}: no "htm" in url {}'.format(item[0], item[2]))
            continue
        item[2] = matched.group(1) + '?id=' + matched.group(2)

# print(tmp)
tmp = [{'goods_id': item[0], 'goods_url': item[2]} for item in tmp]
# print(tmp)

sql_str = r'update dbo.GoodsInfoAutoGet set GoodsUrl=%s where GoodsID = %s'
for item in tmp:
    _._update_table(sql_str=sql_str, params=(item['goods_url'], item['goods_id']))
예제 #8
0
class Z8Updater(AsyncCrawler):
    '''
    Keeps the zhe800 flash-sale ("miaosha") rows stored in the db up to date.

    Rows are loaded from the db, classified by their sale start time, and then
    either logically deleted, skipped, or refreshed from the live session data.
    '''
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None
        self.goods_index = 1
        self.concurrency = 8  # number of rows handed to one batch of tasks
        self.delete_sql_str = z8_delete_str_3

    async def _get_db_old_data(self):
        '''
        Open a fresh db pipeline, run z8_delete_str_4, wait 5s, then load the
        rows selected by z8_select_str_4.

        :return: list of rows, or None when list() raised TypeError (which the
                 code treats as a failed db connection)
        '''
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=z8_delete_str_4,
                                              params=None)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=z8_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        '''
        Extract 'miaosha_begin_time' ('%Y-%m-%d %H:%M:%S') from the serialized
        miaosha_time and convert it to an int timestamp via time.mktime.

        :param miaosha_time: value parsed with json_2_dict
        :return: first 10 characters of the mktime result, as int
        '''
        miaosha_begin_time = json_2_dict(miaosha_time).get(
            'miaosha_begin_time')
        miaosha_begin_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_begin_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_begin_time

    async def _get_new_z8_obj(self, index):
        '''
        Replace self.zhe_800_spike with a new Zhe800Spike every 10th index.

        :param index: running item index
        '''
        if index % 10 == 0:  # one object must not be shared for too long, otherwise driver access misbehaves
            try:
                del self.zhe_800_spike
            except AttributeError:
                # attribute was never set (or already removed)
                pass
            collect()
            self.zhe_800_spike = Zhe800Spike()

    async def _update_is_delete(self, goods_id) -> bool:
        '''
        Logically delete a delisted goods row (is_delete=1).

        :param goods_id:
        :return: result of the pipeline's _update_table call
        '''
        delete_str = 'update dbo.zhe_800_xianshimiaosha set is_delete=1 where goods_id=%s'
        res = self.tmp_sql_server._update_table(sql_str=delete_str,
                                                params=(goods_id, ))
        await async_sleep(.3)

        return res

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single row.

        :param item: db row; item[0] is goods_id, item[1] the serialized
                     miaosha_time, item[2] the session_id
        :param index: running item index (drives object/connection renewal)
        :return: (goods_id, res) where res is the outcome of the db operation,
                 or False when nothing was written
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        session_id = item[2]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        # self.lg.info(str(miaosha_begin_time))
        await self._get_new_z8_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # Expired: logically delete. The real result of the delete is
                # returned (it used to be overwritten with True), matching the
                # other delete branches below.
                res = await self._update_is_delete(goods_id=goods_id)
                self.lg.info(
                    '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                        goods_id,
                        json.loads(item[1]).get('miaosha_begin_time')))
                index += 1
                self.goods_index = index
                await async_sleep(.3)

                return goods_id, res

            elif is_recent_time == 2:
                # may also include some already-expired rows
                self.lg.info('未来时间暂时不更新! {}'.format(
                    timestamp_to_regulartime(miaosha_begin_time)))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:  # 1: inside the to-be-updated window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                try:
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(
                        base_session_id=str(session_id))
                except Exception:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                try:
                    tmp_data = tmp_data.get('data', {}).get('blocks', [])
                    assert tmp_data != [], '该session_id不存在,此处跳过'
                except AssertionError:  # this session id has no data: delete the goods tied to it
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    res = await self._update_is_delete(goods_id)
                    self.lg.info(
                        msg=
                        '该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'
                        .format(goods_id, miaosha_begin_time))
                    index += 1
                    self.goods_index = index
                    await async_sleep(1.2)

                    return goods_id, res

                tmp_data = [item_s.get('deal', {}) for item_s in tmp_data]
                # pprint(tmp_data)
                try:
                    miaosha_goods_list = await self._get_miaoshao_goods_info_list(
                        data=tmp_data)
                    # pprint(miaosha_goods_list)
                except (ValueError, TypeError):
                    # int()/float() on a missing (None) or malformed field:
                    # skip this item instead of failing the whole task
                    await async_sleep(2)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                # every zid currently present in this session
                miaosha_goods_all_goods_id = [
                    i.get('zid') for i in miaosha_goods_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:  # already removed from the sale
                    res = await self._update_is_delete(goods_id)
                    self.lg.info(
                        '该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(
                            goods_id))
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:  # still on sale
                    res = await self._one_update(
                        miaosha_goods_list=miaosha_goods_list,
                        goods_id=goods_id)

        else:  # db connection is not available
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.5)

        return goods_id, res

    async def _one_update(self, **kwargs) -> bool:
        '''
        Refresh a goods row that is still part of the flash sale.

        :param kwargs: miaosha_goods_list (list of dicts built by
                       _get_miaoshao_goods_info_list) and goods_id
        :return: result of to_update_zhe_800_xianshimiaosha_table, or False
                 when no update was performed
        '''
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        zhe_800_miaosha = Zhe800Parse()
        res = False
        for item_1 in miaosha_goods_list:
            if item_1.get('zid', '') == goods_id:
                zhe_800_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800_miaosha.deal_with_data()
                if goods_data == {}:  # nothing parsed: skip
                    break

                else:  # merge the session info into the parsed data and store it
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = str(item_1.get('zid'))
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        # self.lg.info(item_1.get('price'))
                        # self.lg.info(item_1.get('taobao_price'))
                        goods_data['price'] = item_1.get('price')
                        goods_data['taobao_price'] = item_1.get('taobao_price')
                    else:
                        self.lg.info('该商品参与活动的对应库存为0')
                        await self._update_is_delete(goods_id=goods_id)
                        break

                    goods_data['sub_title'] = item_1.get('sub_title')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item_1.get('miaosha_time'))

                    if goods_data.get('is_delete', 0) == 1:
                        self.lg.info('该商品[{0}]已售罄...'.format(goods_id))

                    # self.lg.info(str(goods_data['stock_info']))
                    # self.lg.info(str(goods_data['miaosha_time']))
                    res = zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    break
        collect()

        return res

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a sale start time relative to now.

        With diff = timestamp - now (seconds):
          * diff < -259200 (older than 72h)          -> 0
          * -172800 < diff < 7200 (last 48h .. +2h)  -> 1
          * anything else                            -> 2
        Note that 2 therefore also covers starts between 72h and 48h ago.

        :param timestamp: sale start timestamp
        :return: 0: expired, 1: inside the update window, 2: not updated for now
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -259200:  # 72h (was 48h) so the backend has time to sync the delisting
            return 0  # expired
        elif -172800 < diff_time < 7200:
            return 1  # yesterday/today: to be updated
        else:
            return 2  # not updated for now

    async def _update_db(self):
        '''
        Endless loop that refreshes the flash-sale data in batches.

        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.zhe_800_spike = Zhe800Spike()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # everything consumed: normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # hour 0: pause updates for 5.5 hours
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)

            try:
                del self.zhe_800_spike
            except AttributeError:
                pass
            collect()

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful fields of every flash-sale goods entry.

        :param data: list of 'deal' dicts ({} entries are skipped)
        :return: list of dicts with miaosha_time, zid, stock_info, price,
                 taobao_price and sub_title
        :raises ValueError, TypeError: when a time/price field is malformed or
                 missing (int()/float() conversion fails)
        '''
        miaosha_goods_list = []
        # pprint(data)
        for item in data:
            if item == {}:
                continue
            # pprint(item)
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(
                    str(item.get('begin_time'))[0:10])),
                'miaosha_end_time':
                timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])),
            }

            # zhe800 goods id
            tmp['zid'] = item.get('zid')
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock',
                                           0),  # remaining flash-sale quantity
                'stock': item.get('stock', 0),  # total flash-sale stock
            }
            # original price
            tmp['price'] = float(item.get('list_price'))
            # flash-sale price, float
            tmp['taobao_price'] = float(item.get('price'))
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
            # pprint(miaosha_goods_list)

        return miaosha_goods_list

    def __del__(self):
        # drop whichever of these attributes exist, then trigger collect()
        try:
            del self.lg
        except AttributeError:
            pass
        try:
            del self.loop
        except AttributeError:
            pass
        try:
            del self.zhe_800_spike
        except AttributeError:
            pass
        collect()
예제 #9
0
    def deal_with_data(self):
        '''
        Turn self.result_data into the normalized goods dict.

        :return: dict with the parsed goods fields; {} when result_data is
                 empty or the detail name list comes back as {}
        '''
        data = self.result_data
        if data == {}:
            print('待处理的data为空的dict')
            return {}

        shop_name = self._get_shop_name(data=data)
        account = ''        # shop keeper (not collected)
        title = data.get('baseInfo', {}).get('title', '')
        sub_title = ''

        # names of the goods tag attributes
        detail_name_list = self._get_detail_name_list(data=data)
        if isinstance(detail_name_list, str) and detail_name_list == 'is_delete=1':
            # delisted goods are flagged in the db right away
            print('该商品已下架...')
            pipeline = SqlServerMyPageInfoSaveItemPipeline()
            marked = pipeline._update_table(
                sql_str='update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s',
                params=(self.result_data.get('goods_id', ''), ))
            if marked:
                print('### 该商品已经is_delete=1 ###')
            else:
                print('is_delete=1标记失败!')

        if detail_name_list == {}:
            self.result_data = {}
            return {}

        # price and stock per specification, plus highest / lowest price
        price_info_list, price, taobao_price = self._get_price_info_list_and_price_and_taobao_price(
            data=data)

        # all sample image urls
        all_img_url = [{'img_url': img} for img in data.get('goodImages')]

        # attributes shown on the detail tab
        p_info = self._get_p_info(data=data)
        div_desc = self._get_div_desc(data=data)

        # sale time window
        schedule = self._get_goods_schedule(data=data)

        is_delete = self._get_is_delete(data=data, schedule=schedule)
        if price == 0 or taobao_price == 0:
            # no price could be read, so the goods is treated as delisted
            is_delete = 1

        result = {
            'shop_name': shop_name,
            'account': account,
            'title': title,
            'sub_title': sub_title,
            'price': price,
            'taobao_price': taobao_price,
            'detail_name_list': detail_name_list,
            'price_info_list': price_info_list,
            'all_img_url': all_img_url,
            'p_info': p_info,
            'div_desc': div_desc,
            'is_delete': is_delete,
            'schedule': schedule,
        }
        gc.collect()
        return result
예제 #10
0
    def get_goods_data(self, goods_id: str) -> dict:
        '''
        Fetch a mogujie rush-sale page and extract the goods data from it.

        :param goods_id: despite the name, the full pc url of the goods
                         (must contain '/rushdetail/<id>?...')
        :return: data dict (also stored in self.result_data), or the value of
                 self._data_error() on any failure
        '''
        if goods_id == '':
            return self._data_error()

        if re.compile(r'/rushdetail/').findall(goods_id) == []:
            print('获取到的蘑菇街买哦啥地址错误!请检查')
            return self._data_error()

        tmp_url = goods_id
        print('------>>>| 原pc地址为: ', tmp_url)

        found_ids = re.compile(
            r'https://shop.mogujie.com/rushdetail/(.*?)\?.*?').findall(tmp_url)
        if found_ids == []:
            # url has '/rushdetail/' but not the expected 'id?query' shape;
            # this used to escape as an uncaught IndexError
            print('获取到的蘑菇街买哦啥地址错误!请检查')
            return self._data_error()
        goods_id = found_ids[0]
        print('------>>>| 得到的蘑菇街商品id为:', goods_id)

        data = {}
        body = Requests.get_url_body(url=tmp_url,
                                     headers=self.headers,
                                     had_referer=True,
                                     ip_pool_type=self.ip_pool_type)
        # print(body)
        if body == '':
            print('获取到的body为空str!')
            return self._data_error()

        try:
            goods_info = re.compile(
                r'var detailInfo = (.*?);</script>').findall(body)[0]
            # print(goods_info)

            item_info = re.compile(r'itemInfo:(.*?) ,priceRuleImg').findall(
                goods_info)[0]
            # print(item_info)

            sku_info = re.compile(r'skuInfo:(.*?),pinTuanInfo').findall(
                goods_info)[0]
            # print(sku_info)

            shop_info = re.compile(r'shopInfo:(.*?),skuInfo').findall(
                goods_info)[0]
            # print(shop_info)

            item_info = json_2_dict(json_str=item_info)
            sku_info = json_2_dict(json_str=sku_info)
            shop_info = json_2_dict(json_str=shop_info)
            # pprint(item_info)
            # pprint(sku_info)
            # pprint(shop_info)

            title = self._get_title(item_info=item_info)
            # print(title)
            data['title'] = title
            data['sub_title'] = ''
            data['shop_name'] = self._get_shop_name(shop_info=shop_info)
            data['all_img_url'] = self._get_all_img_url(item_info=item_info)
            data['p_info'], tmp_p_info_body = self._get_p_info(
                goods_id=goods_id)
            data['div_desc'] = self._get_div_desc(tmp_p_info_body)
            data['detail_name_list'] = self._get_detail_name_list(sku_info)

            # price, specification and stock for every sku
            price_info_list = self.get_price_info_list(sku_info=sku_info)
            assert price_info_list != '', 'price_info_list为空值!'
            # pprint(price_info_list)
            data['price_info_list'] = price_info_list
            if price_info_list == []:
                print('该商品已售完, 此处将商品状态改为1')
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                try:
                    # params must be a 1-tuple, like every other call site
                    my_pipeline._update_table(sql_str=mg_update_str_1,
                                              params=(goods_id, ))
                except Exception:
                    print('将该商品逻辑删除时出错!')
                else:
                    # only announce the logical delete when no error was raised
                    print('| +++ 该商品状态已被逻辑is_delete = 1 +++ |')
                return self._data_error()

            # highest price -> price, lowest price -> taobao_price
            try:
                tmp_price_list = sorted([
                    round(float(item.get('detail_price', '')), 2)
                    for item in data['price_info_list']
                ])
                price = Decimal(tmp_price_list[-1]).__round__(2)
                taobao_price = Decimal(tmp_price_list[0]).__round__(2)
                # print('商品的最高价: ', price, ' 最低价: ', taobao_price)
            except IndexError:
                print('获取price和taobao_price时出错! 请检查')
                raise Exception     # handled by the outer except below
            data['price'] = price
            data['taobao_price'] = taobao_price

        except Exception as e:
            print('遇到错误: ', e)
            return self._data_error()

        self.result_data = data

        return data
예제 #11
0
class JPUpdater(AsyncCrawler):
    '''
    Keeps the juanpi flash-sale ("miaosha") rows stored in the db up to date.

    Rows are loaded from the db, classified by their sale start time, and then
    either logically deleted, skipped, or refreshed from the live goods list.
    '''
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None
        self.concurrency = 8    # number of rows handed to one batch of tasks
        self.goods_index = 1
        self.delete_sql_str = jp_delete_str_3

    async def _get_pc_headers(self) -> dict:
        '''
        :return: request headers for m.juanpi.com with a random PC user agent
        '''
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random user agent per call
        }

    async def _get_db_old_data(self) -> (None, list):
        '''
        Open a fresh db pipeline, run jp_delete_str_4, wait 5s, then load the
        rows selected by jp_select_str_4.

        :return: list of rows, or None when list() raised TypeError (which the
                 code treats as a failed db connection)
        '''
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            result = list(self.tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        '''
        Extract 'miaosha_begin_time' ('%Y-%m-%d %H:%M:%S') from the serialized
        miaosha_time and convert it to an int timestamp via time.mktime.

        :param miaosha_time: value parsed with json_2_dict
        :return: first 10 characters of the mktime result, as int
        '''
        miaosha_begin_time = json_2_dict(miaosha_time).get('miaosha_begin_time')
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_begin_time

    async def _get_new_jp_obj(self, index):
        '''
        Replace self.juanpi_miaosha with a new JuanPiParse every 10th index.

        :param index: running item index
        '''
        if index % 10 == 0:         # one object must not be shared for too long, otherwise driver access misbehaves
            try:
                del self.juanpi_miaosha
            except AttributeError:
                # attribute was never set (or already removed)
                pass
            collect()
            self.juanpi_miaosha = JuanPiParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single row.

        :param item: db row; item[0] is goods_id, item[1] the serialized
                     miaosha_time, item[2] the tab_id, item[3] the page
        :param index: running item index (drives object/connection renewal)
        :return: (goods_id, res) where res is the outcome of the db operation,
                 or False when nothing was written
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        tab_id = item[2]
        page = item[3]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        # self.lg.info(str(miaosha_begin_time))
        await self._get_new_jp_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(db_obj=self.tmp_sql_server, index=index, logger=self.lg, remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # expired: logical delete
                res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(goods_id, miaosha_begin_time))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:  # 1: inside the to-be-updated window
                self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(page),
                )
                # self.lg.info('待爬取的tab_id, page地址为: {}'.format(tmp_url))
                body = Requests.get_url_body(url=tmp_url, headers=await self._get_pc_headers(), ip_pool_type=self.ip_pool_type)
                try:
                    data = json_2_dict(body, default_res={}).get('data', {})
                    assert data != {}, 'data为空dict!'
                    data = data.get('goodslist', [])
                    assert data != [], 'tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(tab_id, page)
                except AssertionError:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=data)
                # self.lg.info(str(miaosha_goods_list))
                # every goods_id currently present on this tab_id/page
                miaosha_goods_all_goods_id = [i.get('goods_id') for i in miaosha_goods_list]
                self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    if miaosha_goods_all_goods_id != []:
                        # testing showed: a non-empty list means the goods is not delisted, so skip it
                        self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
                    else:
                        # this tab_id/page no longer lists the goods_id
                        res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                        self.lg.info('该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(goods_id))

                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                else:  # still on sale
                    res = await self._one_update(miaosha_goods_list=miaosha_goods_list, goods_id=goods_id)

        else:  # db connection is not available
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        await async_sleep(1.2)

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Endless loop that refreshes the flash-sale data in batches.

        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.juanpi_miaosha = JuanPiParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # everything consumed: normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # hour 0: pause updates for 5.5 hours
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.juanpi_miaosha
            except AttributeError:
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Refresh a goods row that is still part of the flash sale.

        :param kwargs: miaosha_goods_list (list of dicts built by
                       _get_miaoshao_goods_info_list) and goods_id
        :return: result of to_update_juanpi_xianshimiaosha_table, or False
                 when no update was performed
        '''
        res = False
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') == goods_id:
                self.juanpi_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.juanpi_miaosha.deal_with_data()
                if goods_data == {}:  # nothing parsed: skip
                    break
                else:  # merge the list info into the parsed data and store it
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = item_1.get('goods_id')
                    # goods_data['username'] = '******'
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        goods_data['price'] = item_1.get('price')  # special price before the flash sale
                        goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
                    goods_data['sub_title'] = item_1.get('sub_title', '')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item_1.get('miaosha_time'))

                    res = self.juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                        data=goods_data,
                        pipeline=self.tmp_sql_server)
                    await async_sleep(.3)  # throttle
                    break

        return res

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Extract the useful fields of every flash-sale goods entry.

        :param data: list of goods dicts from the 'goodslist' response field
        :return: list of dicts with miaosha_time, goods_id, stock_info, price
                 and taobao_price
        '''
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(item.get('start_time'))),
                'miaosha_end_time': timestamp_to_regulartime(int(item.get('end_time'))),
            }
            stock = item.get('stock', 0)
            tmp['goods_id'] = item.get('goods_id')
            # flash-sale stock info; activity_stock is stock scaled by rate percent
            tmp['stock_info'] = {
                'activity_stock': int(stock*(item.get('rate', 0)/100)),
                'stock': stock,
            }
            # original price / flash-sale price
            tmp['price'] = round(float(item.get('oprice', '0')), 2)
            tmp['taobao_price'] = round(float(item.get('cprice', '0')), 2)
            miaosha_goods_list.append(tmp)

        return miaosha_goods_list

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a sale start time relative to now.

        With diff = timestamp - now (seconds):
          * diff < -259200 (older than 72h)            -> 0
          * -172800 < diff < 50400 (last 48h .. +14h)  -> 1
          * anything else                              -> 2
        Note that 2 therefore also covers starts between 72h and 48h ago.

        :param timestamp: sale start timestamp
        :return: 0: expired, 1: inside the update window, 2: not updated for now
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -259200:     # 72h (was 48h) so the backend has time to sync the delisting
            return 0    # expired
        elif -172800 < diff_time < 50400:   # 14h ahead covers 20:00 until 10:00 the next day
            return 1    # yesterday/today: to be updated
        else:
            return 2    # not updated for now

    def __del__(self):
        # drop whichever of these attributes exist, then trigger collect()
        try:
            del self.lg
        except AttributeError:
            pass
        try:
            del self.loop
        except AttributeError:
            pass
        collect()
예제 #12
0
    def get_ali_1688_data(self, goods_id):
        '''
        Load the 1688 mobile page of goods_id and extract its embedded goods json.

        :param goods_id: 1688 offer id
        :return: the washed data dict (also stored in self.result_data);
                 the init_pull_off_shelves_goods() dict with an extra 'before'
                 flag when the page reports the goods as delisted;
                 {} on any failure (self.result_data is reset to {} as well)
        '''
        if goods_id == '':
            self.result_data = {}
            return {}

        # 1688 mobile url, e.g. https://m.1688.com/offer/559836312862.html
        wait_to_deal_with_url = 'https://m.1688.com/offer/' + str(goods_id) + '.html'
        print('------>>>| 待处理的阿里1688地址为: ', wait_to_deal_with_url)

        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=wait_to_deal_with_url, css_selector='div.d-content')
        # print(body)
        if body == '':
            print('获取到的body为空str!请检查!')
            self.result_data = {}
            return {}

        tmp_body = body

        try:
            pull_off_shelves = Selector(text=body).css('div.d-content p.info::text').extract_first()
        except Exception:
            pull_off_shelves = ''
        if pull_off_shelves == '该商品无法查看或已下架':   # goods is delisted; data is still handed back for storing
            try:
                tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s'
                is_in_db = tmp_my_pipeline._select_table(sql_str=sql_str, params=(str(goods_id),))
                # print(is_in_db)
            except Exception as e:
                print('遇到错误:', e)
                print('数据库连接失败!')
                self.result_data = {}
                return {}

            if is_in_db is None:
                # NOTE(review): other callers treat a None select result as a
                # failed db access -- handle it like the exception case instead
                # of assuming the goods is already stored. Confirm.
                print('数据库连接失败!')
                self.result_data = {}
                return {}

            tmp_data_s = self.init_pull_off_shelves_goods() if False else None
            if is_in_db != []:        # goods_id is already stored: only flip its is_delete flag
                sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s'
                # params must be a 1-tuple, like the select above
                tmp_my_pipeline._update_table(sql_str=sql_str, params=(goods_id,))
                print('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
                tmp_data_s = self.init_pull_off_shelves_goods()  # default attributes of a delisted goods
                tmp_data_s['before'] = True     # tells the caller the goods was already in the db
                self.result_data = {}

                return tmp_data_s

            else:       # goods_id is not stored yet
                print('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')
                tmp_data_s = self.init_pull_off_shelves_goods()      # default attributes of a delisted goods
                tmp_data_s['before'] = False
                self.result_data = {}

                return tmp_data_s

        body = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall(body)
        if body != []:
            body = body[0]
            body = r'{"beginAmount"' + body
            # print(body)
            try:
                body = json.loads(body)
            except ValueError as e:
                # malformed embedded json: fail like every other error path
                print('遇到错误:', e)
                self.result_data = {}
                return {}
            # pprint(body)

            if body.get('discountPriceRanges') is not None:
                self.result_data = self._wash_discountPriceRanges(body=body)
                return self.result_data
            else:
                print('data为空!')
                self.result_data = {}  # reset so a later crawl is not polluted by stale data
                return {}

        else:
            # no regular price block: try the promotion ("huopin") price block instead
            print('解析ing..., 该商品正在参与火拼, 此处为火拼价, 为短期活动价格!')
            body = re.compile(r'{"activityId"(.*?)</script></div></div>').findall(tmp_body)
            if body != []:
                body = body[0]
                body = r'{"activityId"' + body
                # print(body)
                try:
                    body = json.loads(body)
                except ValueError as e:
                    print('遇到错误:', e)
                    self.result_data = {}
                    return {}
                # pprint(body)

                if body.get('discountPriceRanges') is not None:
                    self.result_data = self._wash_discountPriceRanges(body=body)
                    self.is_activity_goods = True
                    return self.result_data
                else:
                    print('data为空!')
                    self.result_data = {}  # reset so a later crawl is not polluted by stale data
                    return {}
            else:
                print('这个商品对应活动属性未知, 此处不解析, 设置为跳过!')
                self.result_data = {}  # reset so a later crawl is not polluted by stale data
                return {}
예제 #13
0
class CCUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        """Initialise the AsyncCrawler base and this updater's own state.

        Positional and keyword arguments are forwarded unchanged to
        ``AsyncCrawler.__init__``; log printing, the log directory and the
        ip pool type are fixed here.
        """
        log_dir = MY_SPIDER_LOGS_PATH + '/楚楚街/秒杀实时更新/'
        AsyncCrawler.__init__(
            self, *params, **kwargs,
            log_print=True, log_save_path=log_dir, ip_pool_type=IP_POOL_TYPE)
        # SQL statement kept on the instance for delete operations.
        self.delete_sql_str = cc_delete_str_1
        # Index of the goods currently being processed.
        self.goods_index = 1
        # Concurrency: number of goods handed to one batch of tasks.
        self.concurrency = 8
        # Database pipeline; assigned in _get_db_old_data().
        self.tmp_sql_server = None

    async def _get_pc_headers(self):
        """Build the request headers used for api.chuchujie.com.

        :return: dict mapping header names to values; the User-Agent comes
            from get_random_pc_ua()
        """
        header_pairs = (
            ('Accept', 'application/json,text/javascript,*/*;q=0.01'),
            # ('Accept-Encoding', 'gzip, deflate, br'),  # disabled
            ('Accept-Language', 'zh-CN,zh;q=0.9'),
            ('Connection', 'keep-alive'),
            ('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8'),
            ('Host', 'api.chuchujie.com'),
            ('Referer', 'https://m.chuchujie.com/?module=99'),
            ('Cache-Control', 'max-age=0'),
            ('User-Agent', get_random_pc_ua()),
        )

        return dict(header_pairs)

    async def _get_db_old_data(self) -> (list, None):
        """Load the rows that are waiting to be updated.

        A new pipeline is stored on ``self.tmp_sql_server``, the
        ``cc_delete_str_2`` statement is run, and after a 5 second pause the
        rows selected by ``cc_select_str_1`` are read.

        :return: list of rows, or None when a TypeError was raised (logged as
            a database connection failure)
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.tmp_sql_server._delete_table(sql_str=cc_delete_str_2)
            await async_sleep(5)
            rows = self.tmp_sql_server._select_table(sql_str=cc_select_str_1)
            old_data = list(rows)
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            old_data = None

        await _print_db_old_data(logger=self.lg, result=old_data)

        return old_data

    async def _get_miaosha_end_time(self, miaosha_time) -> int:
        """Return the flash-sale end time as a Unix timestamp in whole seconds.

        :param miaosha_time: JSON text containing a 'miaosha_end_time' entry
            formatted as '%Y-%m-%d %H:%M:%S'
        :return: seconds since the epoch as computed by time.mktime(), which
            interprets the parsed value as local time
        """
        end_time_str = json_2_dict(miaosha_time).get('miaosha_end_time')
        end_struct = time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')

        # Truncate numerically. Cutting str(float) down to ten characters is
        # only right for 10-digit timestamps: a 9-digit one keeps the '.' and
        # makes int() raise, an 11-digit one silently loses a digit.
        return int(time.mktime(end_struct))

    async def _get_new_cc_obj(self, index):
        if index % 10 == 0:  # 不能共享一个对象了, 否则驱动访问会异常!
            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()
            self.chuchujie_miaosha = ChuChuJie_9_9_Parse()

        return

    async def _update_is_delete(self, goods_id) -> bool:
        """Logically delete one goods row.

        :param goods_id: id bound as the single parameter of cc_update_str_2
        :return: the value returned by the pipeline's _update_table()
        """
        return self.tmp_sql_server._update_table(
            sql_str=cc_update_str_2,
            params=(goods_id, ),
        )

    async def _update_one_goods_info(self, item, index):
        '''
        Update a single flash-sale goods row.

        The sale end time is classified with _is_recent_time() and the row is
        then logically deleted (0), left untouched (2) or refreshed from the
        site (any other value).

        :param item: db row; item[0] is the goods_id, item[1] the miaosha_time
            JSON text, item[2] the gender and item[3] the page
        :param index: running task index; passed to _get_new_cc_obj() and
            _get_new_db_conn(), and stored (incremented) in self.goods_index
        :return: tuple (goods_id, res); res stays False unless a delete or
            update call returned something else
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        gender = item[2]
        page = item[3]
        miaosha_end_time = await self._get_miaosha_end_time(miaosha_time)
        # self.lg.info(str(miaosha_end_time))
        await self._get_new_cc_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=25)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # Sale ended long enough ago: logically delete the row.
                res = await self._update_is_delete(goods_id=goods_id)
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, miaosha_end_time))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                # Sale has ended but is still inside the waiting window:
                # nothing is changed for this row.
                index += 1
                self.goods_index = index

                return goods_id, res

            else:  # returned 1: the row is inside the to-be-updated window
                # Original note: to keep memory usage down, objects are
                # created here and released afterwards rather than being
                # declared outside.
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                body = await self._get_one_page_goods_info(gender, page)
                if body == '':
                    # Empty response body: skip this row for now.
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                json_body = json_2_dict(body, default_res={})
                try:
                    this_page_total_count = json_body.get('data', {}).get(
                        'groupList', [])[0].get('totalCount', 0)
                except IndexError:
                    # groupList is empty: treat the page as having no goods.
                    self.lg.error('获取this_page_total_count时出错, 请检查!')
                    this_page_total_count = 0

                item_list = await self._get_item_list(
                    this_page_total_count=this_page_total_count,
                    json_body=json_body)
                if item_list == []:
                    # No goods on the page: the goods is treated as removed
                    # from the flash sale and is logically deleted.
                    self.lg.info(
                        '#### 该gender, page对应得到的item_list为空[]!\n该商品已被下架限时秒杀活动,此处将其删除'
                    )
                    res = await self._update_is_delete(goods_id=item[0])
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    await async_sleep(.3)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:
                    res = await self._one_update(goods_id=goods_id,
                                                 item_list=item_list)

        else:  # is_connect_success is falsy: the database is not reachable
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass

        # Common tail for the refresh path and the connection-failure path.
        index += 1
        self.goods_index = index
        collect()
        await async_sleep(CHUCHUJIE_SLEEP_TIME)

        return goods_id, res

    async def _update_db(self) -> None:
        """Run the flash-sale refresh loop forever.

        Each round creates a new logger, loads the pending rows, processes
        them in batches of ``self.concurrency`` tasks, then sleeps before the
        next round.

        :return: never returns normally
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            old_rows = await self._get_db_old_data()
            if old_rows is not None:
                self.goods_index = 1
                batches = TasksParamsListObj(
                    tasks_params_list=old_rows, step=self.concurrency)
                self.chuchujie_miaosha = ChuChuJie_9_9_Parse()
                task_index = 1
                while True:
                    try:
                        batch = batches.__next__()
                        # self.lg.info(str(batch))
                    except AssertionError:
                        # Everything has been handed out: normal exit.
                        break

                    pending = []
                    for row in batch:
                        self.lg.info('创建 task goods_id: {}'.format(row[0]))
                        coro = self._update_one_goods_info(item=row,
                                                           index=task_index)
                        pending.append(self.loop.create_task(coro))
                        task_index += 1

                    await _get_async_task_result(tasks=pending, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            # During hour 0 (Shanghai time) pause for 5.5 hours, otherwise
            # wait 2.5 minutes before the next round.
            pause = 60 * 60 * 5.5 if get_shanghai_time().hour == 0 else 2.5 * 60
            await async_sleep(pause)
            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()

    async def _get_item_list(self, **kwargs) -> list:
        '''
        获取对应gender, page的商品list
        :return:
        '''
        this_page_total_count = kwargs.get('this_page_total_count')
        json_body = kwargs.get('json_body')
        tmp_goods_list = json_body.get('data',
                                       {}).get('groupList',
                                               [])[0].get('dataList', [])

        item_list = [{
            'goods_id': str(item_s.get('chuchuId', '')),
            'sub_title': item_s.get('description', ''),
        } for item_s in tmp_goods_list] if this_page_total_count != 0 else []

        return item_list

    async def _one_update(self, **kwargs):
        '''
        未下架的更新
        :param kwargs:
        :return:
        '''
        res = False
        goods_id = kwargs.get('goods_id')
        item_list = kwargs.get('item_list')

        # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
        # 由于不会内部提前下架,所以在售卖时间内的全部进行相关更新
        # if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
        #     self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
        #     tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(goods_id))
        #     self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
        #     pass

        # else:  # 未下架的
        # 不更新秒杀时间和sub_title, 只更新其他相关数据
        # for item_2 in item_list:
        #     if item_2.get('goods_id', '') == goods_id:
        self.chuchujie_miaosha.get_goods_data(goods_id=goods_id)
        goods_data = self.chuchujie_miaosha.deal_with_data()
        if goods_data == {}:  # 返回的data为空则跳过
            pass
        else:
            goods_data['goods_id'] = str(goods_id)
            # goods_data['sub_title'] = item_2.get('sub_title', '')
            # print(goods_data)
            res = self.chuchujie_miaosha.update_chuchujie_xianshimiaosha_table(
                data=goods_data, pipeline=self.tmp_sql_server)

        return res

    async def _get_one_page_goods_info(self, *params) -> str:
        """Request the listing response for one (gender, page) pair.

        :param params: exactly two positional values: gender and page
        :return: whatever Requests.get_url_body() returns for the request
        """
        gender, page = params

        client_json = json.dumps({
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,  # '0' -> female | '1' -> male
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        })
        query_json = json.dumps({
            "group": 4,
            "module": "99",
            "page": page,
            "tab": "all"
        })

        # Original note: query string parameters can simply be encoded and
        # sent like this; data that has to be POSTed needs the post method.
        query_params = {
            'client': client_json,
            'query': query_json,
            'page': page
        }

        return Requests.get_url_body(
            url='https://api.chuchujie.com/api/',
            headers=self.headers,
            params=query_params,
            ip_pool_type=self.ip_pool_type)

    async def _is_recent_time(self, timestamp) -> int:
        """Classify a sale end time relative to the current Shanghai time.

        :param timestamp: sale end time as a Unix timestamp (any value that
            int() accepts)
        :return: 0 -- ended more than 100000 seconds ago;
                 1 -- the end time is still ahead of now;
                 2 -- ended, but no more than 100000 seconds ago
        """
        end_ts = int(timestamp)
        now_ts = datetime_to_timestamp(get_shanghai_time())  # current timestamp
        remaining = end_ts - now_ts

        if remaining > 0:
            return 1

        # The threshold is kept larger than 24 hours (86400) so that goods
        # still being sold are not taken down.
        if remaining < -100000:
            return 0

        return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.chuchujie_miaosha
        except:
            pass
        collect()
예제 #14
0
class JMYPUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        """Initialise the AsyncCrawler base and this updater's own state.

        Positional and keyword arguments are forwarded unchanged to
        ``AsyncCrawler.__init__``; log printing, the log directory and the
        ip pool type are fixed here.
        """
        log_dir = MY_SPIDER_LOGS_PATH + '/聚美优品/秒杀实时更新/'
        AsyncCrawler.__init__(
            self, *params, **kwargs,
            log_print=True, log_save_path=log_dir, ip_pool_type=IP_POOL_TYPE)
        # Concurrency: number of goods handed to one batch of tasks.
        self.concurrency = 10
        # Index of the goods currently being processed.
        self.goods_index = 1
        # SQL statement kept on the instance for delete operations.
        self.delete_sql_str = jm_delete_str_1
        # Database pipeline; assigned in _get_db_old_data().
        self.tmp_sql_server = None

    async def _get_pc_headers(self):
        """Build the request headers used for h5.jumei.com.

        :return: dict mapping header names to values; the User-Agent comes
            from get_random_pc_ua()
        """
        header_pairs = (
            ('Accept', 'application/json,text/javascript,text/plain,*/*;q=0.01'),
            # ('Accept-Encoding', 'gzip, deflate, br'),  # disabled
            ('Accept-Language', 'zh-CN,zh;q=0.9'),
            ('Connection', 'keep-alive'),
            # ('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8'),  # disabled
            ('Host', 'h5.jumei.com'),
            ('Referer', 'https://h5.jumei.com/'),
            ('Cache-Control', 'max-age=0'),
            ('X-Requested-With', 'XMLHttpRequest'),
            ('User-Agent', get_random_pc_ua()),
        )

        return dict(header_pairs)

    async def _get_db_old_data(self) -> (list, None):
        """Load the rows that are waiting to be updated.

        A new pipeline is stored on ``self.tmp_sql_server``, the
        ``jm_delete_str_2`` statement is run, and after a 5 second pause the
        rows selected by ``jm_select_str_1`` are read.

        :return: list of rows, or None when a TypeError was raised (logged as
            a database connection failure)
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
            await async_sleep(5)
            rows = self.tmp_sql_server._select_table(sql_str=jm_select_str_1)
            old_data = list(rows)
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            old_data = None

        await _print_db_old_data(logger=self.lg, result=old_data)

        return old_data

    async def _get_cookies(self) -> str:
        """Obtain the cookies needed for requests to h5.jumei.com.

        A BaseDriver is created with PHANTOMJS_DRIVER_PATH, asked for the
        cookies of 'https://h5.jumei.com/', and released again.

        :return: the cookie string from the driver; '' is treated as failure
            and logged as an error (it is still returned)
        """
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                  ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        # Drop the driver reference as soon as the cookies are in hand.
        del my_phantomjs

        # Report exactly one outcome: the success message used to be logged
        # unconditionally, even right after the failure message.
        if cookies == '':
            self.lg.error('!!! 获取cookies失败 !!!')
        else:
            self.lg.info('获取cookies成功!')

        return cookies

    async def _get_miaosha_end_time(self, miaosha_time) -> int:
        '''
        获取秒杀结束时间
        :return:
        '''
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_end_time

    async def _get_new_jumei_obj(self, index):
        if index % 10 == 0:  # 不能共享一个对象了, 否则驱动访问会异常!
            try:
                del self.jumeiyoupin_miaosha
            except:
                pass
            collect()
            self.jumeiyoupin_miaosha = JuMeiYouPinParse()

    async def _update_is_delete(self, goods_id):
        """Logically delete one goods row.

        :param goods_id: id bound as the single parameter of jm_update_str_4
        :return: the value returned by the pipeline's _update_table()
        """
        return self.tmp_sql_server._update_table(
            sql_str=jm_update_str_4,
            params=(goods_id, ),
        )

    async def _get_one_page_all_goods_list(self, *params) -> (list, str):
        """Fetch every goods entry listed on one page of the deal list.

        :param params: params[0] is the page number to request
        :return: the string '网络错误!' when the response decodes to {};
            otherwise a list of {'goods_id': str, 'type': ..., 'page': page}
            dicts, one per distinct item_id ([] when the page has no items)
        """
        page = params[0]
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        # print('fetching page:', page, ', api url: ', tmp_url)
        json_body = json_2_dict(Requests.get_url_body(
            url=tmp_url, headers=self.headers, ip_pool_type=self.ip_pool_type),
                                default_res={},
                                logger=self.lg)
        if json_body == {}:
            return '网络错误!'

        # `not` also covers a null item_list, which would otherwise make the
        # loop below raise.
        this_page_item_list = json_body.get('item_list', [])
        if not this_page_item_list:
            return []

        # Keep the first entry per item_id. A set of seen ids replaces the
        # per-item rebuild of an id list (quadratic in the page size).
        seen_item_ids = set()
        all_goods_list = []
        for item in this_page_item_list:
            item_id = item.get('item_id', '')
            if item_id in seen_item_ids:
                continue
            seen_item_ids.add(item_id)

            # Entries without a usable item_id still occupy their slot in
            # the seen set (as before) but are not returned.
            if item.get('item_id') is None:
                continue

            all_goods_list.append({
                'goods_id': str(item_id),
                'type': item.get('type', ''),
                'page': page,
            })

        return all_goods_list

    async def _update_one_goods_info(self, item, index):
        """
        Update a single flash-sale goods row.

        The sale end time is classified with _is_recent_time() and the row is
        then logically deleted (0), left untouched (2) or refreshed from the
        site (any other value).

        :param item: db row; item[0] is the goods_id, item[1] the miaosha_time
            JSON text, item[2] the page and item[3] the goods url
        :param index: running task index; passed to _get_new_jumei_obj() and
            _get_new_db_conn(), and stored (incremented) in self.goods_index
        :return: [goods_id, res] on every path; res stays False unless a
            delete or update call returned something else
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        page = item[2]
        goods_url = item[3]
        miaosha_end_time = await self._get_miaosha_end_time(miaosha_time)
        # self.lg.info(str(miaosha_end_time))
        await self._get_new_jumei_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
        )

        if self.tmp_sql_server.is_connect_success:
            is_recent_time_res = await self._is_recent_time(miaosha_end_time)
            if is_recent_time_res == 0:
                # Sale ended long enough ago: logically delete the row.
                res = await self._update_is_delete(goods_id)
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id,
                    json.loads(miaosha_time).get('miaosha_end_time')))
                await async_sleep(.3)

            elif is_recent_time_res == 2:
                # Sale has ended but is still inside the waiting window:
                # nothing is changed for this row.
                pass

            else:  # returned 1: the row is inside the to-be-updated window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                this_page_all_goods_list = await self._get_one_page_all_goods_list(
                    page)
                if isinstance(this_page_all_goods_list, str):
                    # A string result is the network-error sentinel. Return
                    # the same [goods_id, res] shape as every other path
                    # instead of a bare bool.
                    self.lg.error('网络错误!先跳过')
                    await async_sleep(1.5)
                    return [goods_id, res]

                elif this_page_all_goods_list == []:
                    # Nothing listed on the page: the goods is treated as
                    # removed from the flash sale and is logically deleted.
                    res = await self._update_is_delete(goods_id=goods_id)
                    self.lg.error(
                        '#### 该page对应得到的this_page_all_goods_list为空[]!')
                    self.lg.error(
                        '** 该商品已被下架限时秒杀活动, 此处将其逻辑删除, goods_id:{}'.format(
                            goods_id))
                    await async_sleep(.3)

                else:
                    # Goods are not delisted early inside the sale, so every
                    # goods within its selling period is refreshed; the page
                    # listing is not searched for this goods_id.
                    tmp_r = self.jumeiyoupin_miaosha.get_goods_id_from_url(
                        goods_url)
                    self.jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                    goods_data = self.jumeiyoupin_miaosha.deal_with_data()
                    if goods_data == {}:  # empty data: skip this goods
                        pass
                    else:
                        goods_data['goods_id'] = goods_id
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                            goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        res = self.jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=self.tmp_sql_server)

        else:  # is_connect_success is falsy: the database is not reachable
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(JUMEIYOUPIN_SLEEP_TIME)

        return [goods_id, res]

    async def _update_db(self):
        """Run the flash-sale refresh loop forever.

        Each round creates a new logger, loads the pending rows, prepares the
        request headers (including cookies), processes the rows in batches of
        ``self.concurrency`` tasks, then sleeps before the next round.

        :return: never returns normally
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            old_rows = await self._get_db_old_data()
            if old_rows is not None:
                self.goods_index = 1
                batches = TasksParamsListObj(
                    tasks_params_list=old_rows, step=self.concurrency)
                cookies = await self._get_cookies()
                headers = await self._get_pc_headers()
                headers['Cookie'] = cookies
                self.headers = headers
                self.jumeiyoupin_miaosha = JuMeiYouPinParse()
                task_index = 1
                while True:
                    try:
                        batch = batches.__next__()
                        # self.lg.info(str(batch))
                    except AssertionError:
                        # Everything has been handed out: normal exit.
                        break

                    pending = []
                    for row in batch:
                        self.lg.info('创建 task goods_id: {}'.format(row[0]))
                        coro = self._update_one_goods_info(item=row,
                                                           index=task_index)
                        pending.append(self.loop.create_task(coro))
                        task_index += 1

                    await _get_async_task_result(tasks=pending, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            # During hour 0 (Shanghai time) pause for 5.5 hours, otherwise
            # wait 10 seconds before the next round.
            pause = 60 * 60 * 5.5 if get_shanghai_time().hour == 0 else 10
            await async_sleep(pause)
            try:
                del self.jumeiyoupin_miaosha
            except:
                pass
            collect()

    async def _is_recent_time(self, timestamp):
        """Classify a sale end time relative to the current Shanghai time.

        :param timestamp: sale end time as a Unix timestamp (any value that
            int() accepts)
        :return: 0 -- ended more than 86400 seconds (24 hours) ago;
                 1 -- the end time is still ahead of now;
                 2 -- ended, but no more than 86400 seconds ago
        """
        end_ts = int(timestamp)
        now_ts = int(datetime_to_timestamp(get_shanghai_time()))
        remaining = end_ts - now_ts

        if remaining > 0:
            return 1

        # 24 hours of slack so that the backend can take the goods down in
        # step before the row is logically deleted.
        if remaining < -86400:
            return 0

        return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.jumeiyoupin_miaosha
        except:
            pass
        collect()
예제 #15
0
    def run(self):
        global coupon_queue, goods_id_and_coupon_url_list, unique_coupon_id_list

        while True:
            sql_cli = None
            try:
                if coupon_queue.qsize() >= 1:
                    # todo 有些领券url 为预付定金商品, 此处不处理
                    coupon_item = coupon_queue.get()
                    ori_coupon_list = json_2_dict(
                        json_str=re.compile('\((.*)\)').findall(coupon_item)
                        [0],
                        default_res={},
                    ).get('data', {}).get('resultList', [])
                    assert ori_coupon_list != []
                    # pprint(ori_coupon_list)

                    # todo: 测试发现, 返回数据中, 若有多买几件的优惠券在字段'nCouponInfoMap'中
                    # 现只支持1件, 不支持多件的券
                    coupon_list = []
                    for item in ori_coupon_list:
                        try:
                            goods_id = str(item.get('itemId', ''))
                            assert goods_id != ''
                            # 一个账户优惠券只能使用一次
                            # 优惠券展示名称, eg: '优惠券'
                            coupon_display_name = '优惠券'
                            # 优惠券的值, 即优惠几元
                            ori_coupon_value = item.get('couponAmount', '')
                            assert ori_coupon_value != ''
                            coupon_value = str(
                                float(ori_coupon_value).__round__(2))
                            # 使用门槛
                            ori_thresold = item.get('couponStartFee', '')
                            assert ori_thresold != ''
                            threshold = str(float(ori_thresold).__round__(2))
                            begin_time = str(
                                timestamp_to_regulartime(
                                    int(
                                        item.get('couponEffectiveStartTime',
                                                 '')[0:10])))
                            end_time = str(
                                timestamp_to_regulartime(
                                    int(
                                        item.get('couponEffectiveEndTime',
                                                 '')[0:10])))
                            # 使用方法
                            use_method = '满{}元, 减{}元'.format(
                                threshold, coupon_value)

                            if string_to_datetime(
                                    end_time) <= get_shanghai_time():
                                print('该券已过期[goods_id: {}]'.format(goods_id))
                                # 已过期的
                                continue

                            if datetime_to_timestamp(string_to_datetime(end_time)) - datetime_to_timestamp(string_to_datetime(begin_time)) \
                                    <= 60 * 60 * 36:
                                print('该券小于1.5天[goods_id: {}], pass'.format(
                                    goods_id))
                                continue

                            # todo 测试发现, 同一商品可能存在不同活动时间段的同一优惠券(但是活动时间不同), 导致一个商品有多个优惠券
                            #  所以取值时, 按结束时间最大那个来取值
                            # 上面还是会有问题, 导致价格重复减, 所以生成唯一id, 所以在一次转换价格后要把所有的该goods_id券都标记为1
                            # 生成唯一id
                            # unique_id = str(get_uuid3(
                            #     target_str=goods_id \
                            #                + coupon_value \
                            #                + threshold \
                            #                + str(datetime_to_timestamp(string_to_datetime(begin_time)))[0:10]\
                            #                + str(datetime_to_timestamp(string_to_datetime(end_time)))[0:10]))

                            # todo 根据上诉存在多张券导致价格被多次修改的情况,故表中一个goods_id,只允许存一张券, 就不会出现价格被多次修改的情况
                            # 解释就说: 只存储优惠力度最大的券
                            unique_id = str(get_uuid3(target_str=goods_id))

                            # 领券地址
                            # pprint(goods_id_and_coupon_url_list)
                            coupon_url = ''
                            for j in goods_id_and_coupon_url_list:
                                tmp_goods_id = j['goods_id']
                                tmp_coupon_url = j['coupon_url']
                                if goods_id == tmp_goods_id:
                                    print('@@@ 成功匹配到goods_id: {} 的领券地址: {}!!'.
                                          format(goods_id, tmp_coupon_url))
                                    coupon_url = tmp_coupon_url
                                    break
                                else:
                                    continue
                            assert coupon_url != ''

                            coupon_list.append({
                                'unique_id': unique_id,
                                'goods_id': goods_id,
                                'coupon_url': coupon_url,
                                'coupon_display_name': coupon_display_name,
                                'coupon_value': coupon_value,
                                'threshold': threshold,
                                'begin_time': begin_time,
                                'end_time': end_time,
                                'use_method': use_method,
                            })

                        except Exception as e:
                            print(e)
                            continue

                    # pprint(coupon_list)
                    if coupon_list != []:
                        # 存储
                        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                        if not sql_cli.is_connect_success:
                            raise SqlServerConnectionException

                        for item in coupon_list:
                            unique_id = item['unique_id']
                            goods_id = item['goods_id']
                            if unique_id not in unique_coupon_id_list:
                                save_res = sql_cli._insert_into_table(
                                    sql_str=
                                    'insert into dbo.coupon_info(unique_id, create_time, goods_id, coupon_url, coupon_display_name, coupon_value, threshold, begin_time, end_time, use_method) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                                    params=(
                                        unique_id,
                                        str(get_shanghai_time()),
                                        goods_id,
                                        item['coupon_url'],
                                        item['coupon_display_name'],
                                        Decimal(
                                            item['coupon_value']).__round__(2),
                                        Decimal(
                                            item['threshold']).__round__(2),
                                        item['begin_time'],
                                        item['end_time'],
                                        item['use_method'],
                                    ),
                                    repeat_insert_default_res=False,  # 避免重复改价
                                )
                                if save_res:
                                    # todo 只更新一次价格, 避免重复更新导致价格错误
                                    # 去重
                                    unique_coupon_id_list.append(unique_id)
                                    # 更新常规表中的商品价格变动
                                    sql_str = '''
                                    select top 1 Price, TaoBaoPrice, SKUInfo
                                    from dbo.GoodsInfoAutoGet
                                    where GoodsID=%s
                                    '''
                                    db_res = []
                                    try:
                                        db_res = list(
                                            sql_cli._select_table(
                                                sql_str=sql_str,
                                                params=(goods_id, ),
                                            ))
                                    except Exception as e:
                                        print(e)

                                    if db_res != []:
                                        # 标记常规商品由于优惠券带来的价格变动
                                        try:
                                            # 减去优惠券的价格
                                            coupon_value = float(
                                                item['coupon_value'])
                                            threshold = float(
                                                item['threshold'])
                                            # 还原为原始价格
                                            db_price = float(
                                                db_res[0][0]) * (1 - CP_PROFIT)
                                            db_taobao_price = float(
                                                db_res[0][1]) * (1 - CP_PROFIT)
                                            # 减去优惠券价, 并且加上CP_PROFIT, 得到最终待存储价格
                                            new_price = (
                                                (db_price - coupon_value
                                                 if db_price >= threshold else
                                                 db_price) *
                                                (1 + CP_PROFIT)).__round__(2)
                                            new_taobao_price = (
                                                (db_taobao_price -
                                                 coupon_value if
                                                 db_taobao_price >= threshold
                                                 else db_taobao_price) *
                                                (1 + CP_PROFIT)).__round__(2)

                                            new_sku_info = get_new_sku_info_from_old_sku_info_subtract_coupon_and_add_cp_profit(
                                                old_sku_info=json_2_dict(
                                                    json_str=db_res[0][2],
                                                    default_res=[],
                                                ),
                                                threshold=threshold,
                                                coupon_value=coupon_value,
                                            )

                                            sql_str2 = '''
                                            update dbo.GoodsInfoAutoGet
                                            set Price=%s, TaoBaoPrice=%s, SKUInfo=%s, ModfiyTime=%s, sku_info_trans_time=%s, IsPriceChange=1, PriceChangeInfo=SKUInfo
                                            where GoodsID=%s 
                                            '''
                                            now_time = get_shanghai_time()
                                            sql_cli._update_table(
                                                sql_str=sql_str2,
                                                params=(
                                                    Decimal(new_price
                                                            ).__round__(2),
                                                    Decimal(new_taobao_price).
                                                    __round__(2),
                                                    dumps(new_sku_info,
                                                          ensure_ascii=False),
                                                    now_time,
                                                    now_time,
                                                    goods_id,
                                                ),
                                            )
                                        except Exception as e:
                                            print(e)
                                    else:
                                        pass
                                else:
                                    continue

                            else:
                                continue

                else:
                    continue

            except IndexError:
                # 跳过相同接口得索引异常
                continue
            except Exception as e:
                print(e)
            finally:
                try:
                    del sql_cli
                except:
                    pass
예제 #16
0
    def get_goods_data(self, goods_id):
        '''
        Fetch the raw data of one juanpi pintuan product and merge it into a dict.

        Two sources are combined: the ``__PRELOADED_STATE__`` JSON embedded in
        the mobile product page (fetched through ``self.driver``) and the
        ``skudata`` entry of the getMemberAboutInfo web-service response.
        On success the merged dict is also stored in ``self.result_data``.

        :param goods_id: product id; an empty str yields the error result at once
        :return: the washed ``detail`` data extended with the ``skudata``,
            ``goods_id`` and ``parent_dir`` keys, or the value of
            ``self._data_error_init()`` on any failure
            (presumably an empty dict -- confirm against that helper)
        '''
        if goods_id == '':
            return self._data_error_init()

        tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        print('------>>>| 得到的商品手机版的地址为: ', tmp_url)

        # The page is rendered through the driver (phantomjs according to the
        # original note, which also warns not to use a proxy/VPN while doing so).
        body = self.driver.get_url_body(url=tmp_url)
        if body == '':
            print('获取到的body为空str!请检查!')
            return self._data_error_init()

        if re.compile(r'<span id="t-index">页面丢失ing</span>').findall(
                body) != []:
            # "Page lost" placeholder: the product is gone. Flag it in the
            # database when the connection is usable, and stop here either way;
            # there is nothing to parse, so no further request is made.
            pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if pipeline.is_connect_success:
                pipeline._update_table(sql_str=jp_update_str_1,
                                       params=(goods_id, ))
                print('@@@ 逻辑删除该商品[{0}] is_delete = 1'.format(goods_id))
            return self._data_error_init()

        # Greedy match: everything between the assignment and the closing tag.
        data = re.compile(
            r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)

        # skudata endpoint currently in use (the original note says the older
        # getOtherInfo endpoint was dropped by the site).
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
            goods_id)
        headers = get_random_headers(upgrade_insecure_requests=False, )
        headers.update({'Host': 'webservice.juanpi.com'})
        skudata_body = Requests.get_url_body(
            url=skudata_url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
        )
        if skudata_body == '':
            print('获取到的skudata_body为空str!请检查!')
            return self._data_error_init()

        # '.' does not match a newline, so the first match is the first line
        # of the response.
        skudata_lines = re.compile(r'(.*)').findall(skudata_body)
        if skudata_lines == []:
            print('skudata为空!')
            return self._data_error_init()

        try:
            # A payload that is not a dict at either level has no .get();
            # both lookups are guarded so such a response is skipped.
            skudata = json_2_dict(json_str=skudata_lines[0]).get('skudata', {})
            sku_info = skudata.get('info') if skudata != {} else None
        except AttributeError as e:
            print('遇到错误如下(先跳过!): ', e)
            return self._data_error_init()

        if skudata == {}:
            return self._data_error_init()
        if sku_info is None:
            print('skudata中info的key为None, 返回空dict')
            return self._data_error_init()

        if data == []:
            print('data为空!')
            return self._data_error_init()

        main_data = json_2_dict(json_str=data[0])
        if main_data == {}:
            return self._data_error_init()
        if main_data.get('detail') is None:
            print('data中detail的key为None, 返回空dict')
            return self._data_error_init()

        main_data = self._wash_main_data(main_data.get('detail', {}))
        main_data['skudata'] = skudata
        main_data['goods_id'] = goods_id
        main_data['parent_dir'] = _jp_get_parent_dir(
            phantomjs=self.driver, goods_id=goods_id)
        self.result_data = main_data

        return main_data
예제 #17
0
    def run_forever(self):
        '''
        Run one refresh pass over the mogujie flash-sale (miaosha) goods rows.

        Rows come from ``mg_select_str_3`` after ``mg_delete_str_4`` has been
        executed. As used below, each row is read as:
        ``item[0]`` goods_id, ``item[1]`` JSON str holding
        ``miaosha_begin_time`` / ``miaosha_end_time``, ``item[2]`` event time,
        ``item[3]`` the value handed to the parser as ``goods_id``.

        Per row, ``self.is_recent_time`` decides what happens:
        0 -> the row is deleted, 2 -> the row is left alone, anything else ->
        the goods are re-crawled and the table is updated, or the row is
        logically deleted when the goods no longer appear in the event's list.

        The method itself does not loop forever; it finishes with a sleep
        (5.5 hours when the Shanghai hour is 0, otherwise 5 seconds).
        NOTE(review): presumably the caller invokes it repeatedly -- confirm.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_3))
        except TypeError:
            # NOTE(review): presumably the select returns None when the
            # database is unreachable, which makes list() raise -- confirm.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                miaosha_info = json.loads(item[1])
                # int() truncates the float returned by mktime; slicing its
                # str form only works for 10-digit epoch values.
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(miaosha_info.get('miaosha_end_time'),
                                      '%Y-%m-%d %H:%M:%S')))

                # A fresh parser per row: per the original note, keeping one
                # long-lived instance outside the loop used too much memory.
                mogujie_miaosha = MoGuJieMiaoShaParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows to avoid a stale long-lived
                    # connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    # Evaluated once so both checks see the same answer.
                    recent_state = self.is_recent_time(miaosha_end_time)
                    if recent_state == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0], ))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            miaosha_info.get('miaosha_begin_time'))

                    elif recent_state == 2:
                        # Deliberately skip rather than stop: rows are not
                        # returned in order, so later rows may still qualify.
                        pass

                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))

                        item_list = self.get_item_list(event_time=str(item[2]))
                        if item_list == '':
                            # Possibly a network problem; skip this row.
                            pass

                        elif item[0] not in [
                                listed.get('iid', '') for listed in item_list
                        ]:
                            # Covers an empty list as well: the goods are no
                            # longer part of the event, so flag the row.
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            tmp_sql_server._update_table(
                                sql_str=mg_update_str_1, params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')

                        else:
                            for item_2 in item_list:
                                if item_2.get('iid', '') != item[0]:
                                    continue

                                mogujie_miaosha.get_goods_data(
                                    goods_id=item[3])
                                goods_data = mogujie_miaosha.deal_with_data()
                                if goods_data == {}:
                                    # Nothing could be parsed; skip.
                                    continue

                                goods_data['goods_id'] = str(item[0])

                                # price is set to the highest normal_price.
                                try:
                                    goods_data['price'] = Decimal(
                                        max(
                                            round(
                                                float(
                                                    price_info.get(
                                                        'normal_price', '')),
                                                2) for price_info in
                                            goods_data['price_info_list'])
                                    ).__round__(2)
                                except Exception:
                                    print('设置price为原价时出错!请检查')
                                    continue

                                goods_data['miaosha_time'] = {
                                    'miaosha_begin_time':
                                    timestamp_to_regulartime(
                                        int(item_2.get('startTime', 0))),
                                    'miaosha_end_time':
                                    timestamp_to_regulartime(
                                        int(item_2.get('endTime', 0))),
                                }
                                goods_data['miaosha_begin_time'], goods_data[
                                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=goods_data[
                                            'miaosha_time'])

                                mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                    data=goods_data, pipeline=tmp_sql_server)
                                sleep(MOGUJIE_SLEEP_TIME)  # throttle requests

                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates during the hours after midnight (Shanghai time).
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
예제 #18
0
def run_forever():
    """
    Endlessly refresh the juanpi pintuan (group-buy) goods rows in the database.

    Every pass executes ``jp_delete_str_1``, selects rows with
    ``jp_select_str_2`` and walks them. As used below, each row is read as:
    ``item[0]`` goods_id, ``item[1]`` JSON str of a list whose first element
    carries ``end_time``, ``item[2]`` a flag where 1 makes the row eligible
    for deletion.

    Per row: a row without a schedule entry is flagged via ``jp_update_str_5``;
    a row whose flag is 1 or whose end time has passed is deleted; any other
    row is re-crawled and updated. After the pass the function sleeps
    (5.5 hours when the Shanghai hour is 0, otherwise 5 seconds) and starts
    over. It never returns.
    """
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=jp_delete_str_1)
            result = list(
                tmp_sql_server._select_table(sql_str=jp_select_str_2))
        except TypeError:
            # NOTE(review): presumably the select returns None when the
            # database is unreachable, which makes list() raise -- confirm.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            juanpi_pintuan = JuanPiParse()
            # enumerate drives the counter, so the early `continue` below can
            # no longer leave it stuck on a multiple of 6 or 50.
            for index, item in enumerate(result, start=1):
                if index % 6 == 0:
                    # Recreate the parser every 6 rows; per the original note
                    # a single long-lived instance used too much memory.
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:
                    # Reconnect every 50 rows to avoid a stale long-lived
                    # connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        # Empty schedule list: flag the row and move on
                        # (this path skips the per-row gc and sleep).
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            item[0]))
                        print('此处将其标记为is_delete=1')
                        tmp_sql_server._update_table(sql_str=jp_update_str_5,
                                                     params=(item[0], ))
                        continue

                    # int() truncates the float returned by mktime; slicing
                    # its str form only works for 10-digit epoch values.
                    pintuan_end_time = int(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))

                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        tmp_sql_server._delete_table(sql_str=jp_delete_str_2,
                                                     params=(item[0], ))
                        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()

                        # An empty dict means nothing could be parsed.
                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(
                                data=data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates during the hours after midnight (Shanghai time).
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()