def run_forever(self):
        '''
        Run one real-time update pass, then block until the next pass is due.

        Original design note: only goods listed for the current day / the next
        two hours are refreshed; prices further in the future (all still at the
        original price) are not refreshed for now.

        Steps: select the stored rows with ``z8_select_str_4``, purge with
        ``z8_delete_str_4``, hand the rows to ``self._update_old_goods_info``,
        then sleep 5.5 hours when ``get_shanghai_time().hour`` is 0 and
        10 minutes otherwise.

        NOTE(review): this is a blocking, synchronous entry point (it uses
        ``sleep``); it is assumed not to be called from inside a running
        event loop -- confirm against the caller.

        :return: None
        '''
        import asyncio  # local import: only needed to drive the coroutine below

        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=z8_select_str_4))
            tmp_sql_server._delete_table(sql_str=z8_delete_str_4, params=None)
        except TypeError:
            # list() over a non-iterable select result lands here.
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(str(result))
            print('--------------------------------------------------------')

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            pending = self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result)
            # When _update_old_goods_info is a coroutine function, calling it
            # only creates a coroutine object; it has to be driven to
            # completion or no row is ever updated.
            if asyncio.iscoroutine(pending):
                loop = asyncio.new_event_loop()
                try:
                    loop.run_until_complete(pending)
                finally:
                    loop.close()

        if get_shanghai_time().hour == 0:   # no updates after 0 o'clock
            sleep(60*60*5.5)
        else:
            sleep(10*60)

        return
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        更新old goods 数据
        :param result:
        :return:
        '''
        index = 1
        for item in result:  # 实时更新数据
            miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
            miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # self.my_lg.info(str(miaosha_begin_time))

            tmall = TmallParse(logger=self.my_lg)
            if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if await self.is_recent_time(miaosha_begin_time) == 0:
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                    self.my_lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))

                else:   # 返回1, 表示在待更新的区间内
                    self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))

                    '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                    goods_id = tmall.get_goods_id_from_url(item[2])

                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()

                    if goods_data != {}:
                        # self.my_lg.info(str(item))
                        goods_data['goods_id'] = item[0]

                        await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                        await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        await asyncio.sleep(5)

                index += 1

            try: del tmall
            except: pass

        gc.collect()

        return
    def run_forever(self):
        '''
        Perform one real-time refresh pass over dbo.zhe_800_xianshimiaosha,
        then block until the next pass is due.

        Original design note: only goods listed for the current day / the next
        two hours are refreshed; prices further in the future (all still at the
        original price) are not refreshed for now.

        NOTE(review): the result of ``self._update_old_goods_info`` is
        discarded; if that method is a coroutine function in this class it is
        never awaited here -- confirm.

        :return: None
        '''
        pipeline = SqlServerMyPageInfoSaveItemPipeline()
        # Rows for site_id 14 that are not flagged as deleted.
        select_sql = '''
        select goods_id, miaosha_time, session_id 
        from dbo.zhe_800_xianshimiaosha 
        where site_id=14 and is_delete = 0
        '''
        # Purge rows whose flash sale ended more than 2 days ago.
        purge_sql = 'delete from dbo.zhe_800_xianshimiaosha where GETDATE()-miaosha_end_time>2'

        try:
            rows = list(pipeline._select_table(sql_str=select_sql))
            pipeline._delete_table(sql_str=purge_sql, params=None)
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            rows = None

        if rows is not None:
            banner = (
                '------>>> 下面是数据库返回的所有符合条件的goods_id <<<------',
                str(rows),
                '--------------------------------------------------------',
            )
            for banner_line in banner:
                print(banner_line)

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            self._update_old_goods_info(tmp_sql_server=pipeline, result=rows)

        # No updates after 0 o'clock: long pause then, short pause otherwise.
        pause_seconds = 60 * 60 * 5.5 if get_shanghai_time().hour == 0 else 10 * 60
        sleep(pause_seconds)

        return
# Example #4
# 0
def clear_db():
    """
    Delete stale rows from ``dbo.mia_pintuan`` one goods_id at a time.

    Rows are selected when either
    * MainGoodsID is null and miaosha_begin_time < GETDATE()-60, or
    * MainGoodsID is not null, is_delete=1 and ConvertTime > modfiy_time.

    Progress is printed per row ('+' when ``_delete_table`` returned a truthy
    value, '-' otherwise). When the select returns ``None`` nothing is deleted
    and a notice is printed instead of raising.

    :return: None
    """
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    # clean up mia_pintuan
    who_is = 'mia_pintuan'
    print('获取 {} 目标数据中...'.format(who_is))
    # Schema-qualified table name; a stray '.' directly after FROM was dropped.
    target_rows = sql_cli._select_table(sql_str='''
        select goods_id
        from dbo.mia_pintuan
        where 
        (MainGoodsID is null and miaosha_begin_time < GETDATE()-60)
        -- 清掉已下架的且被后台转换的data
        or (MainGoodsID is not null and is_delete=1 and ConvertTime > modfiy_time)
        ''')

    if target_rows is None:
        # len(None) would raise TypeError; report and skip instead.
        print('[-] {} select returned no result set (db connection failed?), nothing cleared'.format(who_is))
    else:
        remaining = len(target_rows)
        print('Got {} target_data len: {}'.format(who_is, remaining))
        for row in target_rows:
            goods_id = row[0]
            res = sql_cli._delete_table(
                sql_str='delete from dbo.mia_pintuan where goods_id=%s',
                params=(goods_id, ))
            print('[{}] [{}, rest_num: {}] deleting row where goods_id: {} ...'.
                  format(
                      '+' if res else '-',
                      who_is,
                      remaining,
                      goods_id,
                  ))
            remaining -= 1
        print('clear {} over!'.format(who_is))
        sleep(2.)

    # Both names are always bound here, so del cannot fail.
    del sql_cli
    del target_rows
    collect()
def run_forever():
    """
    Endlessly refresh the stored group-buy ("pintuan") rows.

    Each pass: purge with ``jp_delete_str_1``, select with
    ``jp_select_str_2``, then for every row either hand it to
    ``_handle_goods_shelves_in_auto_goods_table`` (``item[2] == 1`` or the
    parsed end time is already past) or re-fetch it through ``JuanPiParse``
    and write it back. After the pass, sleep 5.5 hours when
    ``get_shanghai_time().hour`` is 0 and 5 minutes otherwise.

    The row counter comes from ``enumerate`` so it advances for every row,
    including rows that bail out early with ``continue``; a hand-maintained
    counter stalled on those rows and made the every-6 parser reset and the
    every-50 reconnect fire again on the following row.

    :return: never returns
    """
    while True:
        #### real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            # Memory note from the original author: keep the parser short-lived
            # by re-creating it periodically instead of holding one forever.
            juanpi_pintuan = JuanPiParse()
            for index, item in enumerate(result, start=1):  # real-time update
                goods_id = item[0]
                if index % 6 == 0:
                    # juanpi_pintuan is always bound here; drop it, collect,
                    # and start over with a fresh parser.
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        end_time_str = json.loads(item[1])[0].get('end_time')
                    except IndexError:
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        continue
                    # int() truncates the float epoch; no string slicing needed.
                    pintuan_end_time = int(
                        time.mktime(
                            time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

                    now_ts = int(datetime_to_timestamp(get_shanghai_time()))
                    if item[2] == 1 or pintuan_end_time < now_ts:
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        if data == {}:
                            continue

                        data['goods_id'] = goods_id
                        juanpi_pintuan.to_right_and_update_pintuan_data(
                            data=data, pipeline=sql_cli)

                else:  # no usable DB connection for this row
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # no updates after 0 o'clock
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
    async def run_forever(self):
        '''
        Run one real-time update pass over the stored group-buy rows.

        Purges with ``jm_delete_str_3``, selects with ``jm_select_str_3`` and,
        per row, asks ``self.is_recent_time`` about the parsed ``end_time``:
        0 -> mark the row through ``jm_update_str_5``; 2 -> leave it alone;
        anything else -> refresh it via ``update_data_1`` / ``update_data_2``
        depending on whether the goods_id still appears in its tab page.
        After a completed pass it sleeps 5.5 hours when
        ``get_shanghai_time().hour`` is 0 and 10 minutes otherwise; when the
        select yields ``None`` it returns immediately.

        Rows are read as: ``item[0]`` goods_id, ``item[1]`` JSON string with
        ``end_time``, ``item[2]`` tab, ``item[3]`` page index, ``item[4]`` the
        url passed on as ``jumei_pintuan_url``.

        :return: None
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jm_delete_str_3, )
            await async_sleep(5)
            result = sql_cli._select_table(sql_str=jm_select_str_3,
                                           logger=self.lg)
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            return None

        await _print_db_old_data(result=result, logger=self.lg)
        index = 1
        for item in result:
            end_time_str = json.loads(item[1]).get('end_time')
            # int() truncates the float epoch; no string slicing needed.
            pintuan_end_time = int(
                time.mktime(time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

            sql_cli = await _get_new_db_conn(db_obj=sql_cli,
                                             index=index,
                                             logger=self.lg,
                                             remainder=50)
            if sql_cli.is_connect_success:
                time_number = await self.is_recent_time(pintuan_end_time)
                if time_number == 0:
                    await sql_cli._update_table_3(
                        sql_str=jm_update_str_5,
                        params=(str(get_shanghai_time()), item[0]),
                        logger=self.lg)
                    await async_sleep(.5)
                    # The message announces the group-buy END time, so log the
                    # end time (it previously printed 'begin_time').
                    self.msg = '过期的goods_id为(%s)' % item[
                        0] + ', 拼团结束时间为(%s), 删除成功!' % str(end_time_str)
                    self.lg.info(self.msg)

                elif time_number == 2:
                    # Deliberately not a break: the rows returned by the DB
                    # are not guaranteed to be ordered.
                    pass

                else:  # per the original note: 1 means "inside the window to update"
                    self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                        item[0], str(index))
                    self.lg.info(self.msg)
                    jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.lg)

                    # Cache key format: 'coutuan_baby-1'
                    tab_key = item[2] + '-' + str(item[3])
                    # Reuse the page list when this tab/index was fetched before.
                    item_list = self.api_all_goods_id.get(tab_key, [])

                    if not item_list:
                        driver = BaseDriver(
                            executable_path=PHANTOMJS_DRIVER_PATH,
                            ip_pool_type=self.ip_pool_type)
                        item_list = await jumeiyoupin_2.get_one_page_goods_list(
                            driver=driver, tab=item[2], index=item[3])
                        # driver is always bound here, so no guard is needed.
                        del driver

                    if not item_list:
                        # Empty/None page list: treated as a network problem, skip.
                        self.lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                    else:
                        if self.api_all_goods_id.get(tab_key) is None:
                            self.api_all_goods_id[tab_key] = item_list

                        pintuan_goods_all_goods_id = [
                            page_item.get('goods_id', '')
                            for page_item in item_list
                        ]

                        jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                            logger=self.lg)
                        if item[0] not in pintuan_goods_all_goods_id:
                            # No longer on its page (original note: testing
                            # showed the site does not delist early).
                            await self.update_data_2(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                pipeline=sql_cli)
                        else:  # still listed on its page
                            await self.update_data_1(
                                jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                jumeiyoupin_2=jumeiyoupin_2,
                                jumei_pintuan_url=item[4],
                                goods_id=item[0],
                                item_list=item_list,
                                pipeline=sql_cli)

            else:
                self.lg.error('数据库连接失败,此处跳过!')

            index += 1
            gc.collect()

        self.lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after 0 o'clock
            await async_sleep(60 * 60 * 5.5)
        else:
            await async_sleep(10 * 60)
        gc.collect()

        return None
# Example #7
# 0
    def run_forever(self):
        '''
        Run one real-time update pass over the stored flash-sale rows, then
        block until the next pass is due.

        Purges with ``cc_delete_str_2``, selects with ``cc_select_str_1`` and
        processes each row through ``_refresh_chuchujie_goods``. Afterwards
        sleeps 5.5 hours when ``get_shanghai_time().hour`` is 0 and 5 minutes
        otherwise.

        Rows are read as: ``item[0]`` goods_id, ``item[1]`` JSON string with
        ``miaosha_end_time``, ``item[2]``/``item[3]`` the two arguments of
        ``self.get_one_page_goods_info``.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=cc_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=cc_select_str_1))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):  # real-time update
                end_time_str = json.loads(item[1]).get('miaosha_end_time')
                # int() truncates the float epoch; no string slicing needed.
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

                if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    self._refresh_chuchujie_goods(
                        item=item,
                        index=index,
                        end_time_str=end_time_str,
                        miaosha_end_time=miaosha_end_time,
                        pipeline=tmp_sql_server)
                else:  # no usable DB connection for this row
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # no updates after 0 o'clock
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()

    def _refresh_chuchujie_goods(self, item, index, end_time_str,
                                 miaosha_end_time, pipeline):
        '''
        Delete, skip or refresh a single stored flash-sale row.

        :param item: DB row (see ``run_forever`` for the column layout)
        :param index: 1-based position of the row, used for logging only
        :param end_time_str: the row's ``miaosha_end_time`` as stored
        :param miaosha_end_time: the same instant as an integer epoch
        :param pipeline: DB pipeline used for deletes and updates
        :return: None
        '''
        # Evaluate once so both comparisons see the same answer.
        time_state = self.is_recent_time(miaosha_end_time)
        if time_state == 0:
            # (item[0],) is a real one-element tuple; (item[0]) is just the scalar.
            pipeline._delete_table(sql_str=self.delete_sql_str,
                                   params=(item[0],))
            print('过期的goods_id为(%s)' % item[0],
                  ', 限时秒杀结束时间为(%s), 删除成功!' % end_time_str)
            return
        if time_state == 2:
            # Deliberately skipped rather than aborting the pass: the rows
            # returned by the DB are not guaranteed to be ordered.
            return

        # Per the original note: 1 means "inside the window to update".
        # The parser is created per row and dropped afterwards to cap memory.
        chuchujie_miaosha = ChuChuJie_9_9_Parse()
        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
              (item[0], index))

        body = self.get_one_page_goods_info(item[2], item[3])
        if body == '{}':
            # Possibly a network problem; skip for now.
            return

        try:
            json_body = json.loads(body)
        except (TypeError, ValueError):
            # An unparseable body says nothing about the goods being delisted,
            # so treat it like the network-failure case above instead of
            # falling through to the delete branch.
            print('json.loads转换body时出错!请检查')
            return

        try:
            first_group = json_body.get('data', {}).get('groupList', [])[0]
        except IndexError:
            print('获取this_page_total_count时出错, 请检查!')
            first_group = {}

        this_page_total_count = first_group.get('totalCount', 0)
        # Goods listed for this gender/page; empty when the count is 0.
        if this_page_total_count == 0:
            page_goods = []
        else:
            page_goods = first_group.get('dataList', [])

        if not page_goods:
            print('#### 该gender, page对应得到的item_list为空[]!')
            print('该商品已被下架限时秒杀活动,此处将其删除')
            pipeline._delete_table(sql_str=self.delete_sql_str,
                                   params=(item[0],))
            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
            return

        # Original note: goods are not delisted early, so everything still
        # inside the selling window is refreshed. The flash-sale time and
        # sub_title are left untouched; only the other goods data is updated.
        chuchujie_miaosha.get_goods_data(goods_id=item[0])
        goods_data = chuchujie_miaosha.deal_with_data()
        if goods_data == {}:  # nothing parsed, skip
            return

        goods_data['goods_id'] = str(item[0])
        chuchujie_miaosha.update_chuchujie_xianshimiaosha_table(
            data=goods_data, pipeline=pipeline)
        sleep(CHUCHUJIE_SLEEP_TIME)
    def run_forever(self):
        '''
        Run one real-time update pass over dbo.juanpi_xianshimiaosha
        (site_id=15).

        Original design note: only goods listed for the current day / the next
        14 hours are refreshed; prices further in the future (all still at the
        original price) are not refreshed for now.

        Each row is processed through ``_refresh_juanpi_goods``; the pass stops
        early when that helper reports an unusable page response. Afterwards
        sleeps 5.5 hours when ``get_shanghai_time().hour`` is 0; otherwise
        returns without sleeping.

        Rows are read as: ``item[0]`` goods_id, ``item[1]`` JSON string with
        ``miaosha_begin_time``, ``item[2]`` tab_id, ``item[3]`` page.

        :return: None
        '''
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, tab_id, page from dbo.juanpi_xianshimiaosha where site_id=15'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # Memory note from the original author: the parser is re-created
            # periodically (see below) instead of being held for the whole pass.
            juanpi_miaosha = JuanPiParse()

            for item in result:  # real-time update
                begin_time_str = json.loads(item[1]).get('miaosha_begin_time')
                # int() truncates the float epoch; no string slicing needed.
                miaosha_begin_time = int(
                    time.mktime(
                        time.strptime(begin_time_str, '%Y-%m-%d %H:%M:%S')))

                if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not tmp_sql_server.is_connect_success:
                    # The counter is intentionally not advanced for these rows.
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    continue

                keep_going = self._refresh_juanpi_goods(
                    item=item,
                    index=index,
                    begin_time_str=begin_time_str,
                    miaosha_begin_time=miaosha_begin_time,
                    juanpi_miaosha=juanpi_miaosha,
                    pipeline=tmp_sql_server)
                if not keep_going:
                    break

                if index % 10 == 0:
                    # Fresh parser every few rows: faster than one per row,
                    # and still keeps memory bounded.
                    juanpi_miaosha = JuanPiParse()
                    gc.collect()

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # no updates after 0 o'clock
            sleep(60 * 60 * 5.5)
        # otherwise: no pause (a sleep(5) used to be here, disabled by the author)
        gc.collect()

    def _refresh_juanpi_goods(self, item, index, begin_time_str,
                              miaosha_begin_time, juanpi_miaosha, pipeline):
        '''
        Delete, skip or refresh a single stored flash-sale row.

        :param item: DB row (see ``run_forever`` for the column layout)
        :param index: current row counter, used for logging only
        :param begin_time_str: the row's ``miaosha_begin_time`` as stored
        :param miaosha_begin_time: the same instant as an integer epoch
        :param juanpi_miaosha: ``JuanPiParse`` instance used to fetch goods data
        :param pipeline: DB pipeline used for deletes and updates
        :return: ``False`` when the page request came back empty or
            unparseable and the caller should stop the pass, else ``True``
        '''
        # Evaluate once so both comparisons see the same answer.
        time_state = self.is_recent_time(miaosha_begin_time)
        if time_state == 0:
            # (item[0],) is a real one-element tuple; (item[0]) is just the scalar.
            pipeline._delete_table(sql_str=self.delete_sql_str,
                                   params=(item[0],))
            print('过期的goods_id为(%s)' % item[0],
                  ', 限时秒杀开始时间为(%s), 删除成功!' % begin_time_str)
            return True
        if time_state == 2:
            # Deliberately skipped rather than aborting the pass: the rows
            # returned by the DB are not guaranteed to be ordered.
            return True

        # Per the original note: 1 means "inside the window to update".
        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
              (item[0], index))

        tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
            str(item[2]),
            str(item[3]),
        )
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if body == '':
            return False

        try:
            data = json.loads(body).get('data', {})
        except (TypeError, ValueError, AttributeError):
            # Not JSON, or not a JSON object: stop the pass as before, but
            # without swallowing unrelated exceptions such as KeyboardInterrupt.
            return False

        if data.get('goodslist') == []:
            print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                item[2], item[3]))
            return True

        goods_list = data.get('goodslist')
        if not goods_list:
            # Key missing (or null): nothing to compare against, skip.
            print('goodslist为[], 此处跳过')
            return True

        miaosha_goods_list = self.get_miaoshao_goods_info_list(data=goods_list)
        # Every goods_id currently present on this tab_id/page.
        miaosha_goods_all_goods_id = [
            page_goods.get('goods_id') for page_goods in miaosha_goods_list
        ]

        if item[0] not in miaosha_goods_all_goods_id:
            # The goods_id is no longer on this tab_id/page: delisted.
            pipeline._delete_table(sql_str=self.delete_sql_str,
                                   params=(item[0],))
            print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
            return True

        # Still listed: refresh from every matching page entry.
        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') != item[0]:
                continue

            juanpi_miaosha.get_goods_data(goods_id=item[0])
            goods_data = juanpi_miaosha.deal_with_data()
            if goods_data == {}:  # nothing parsed, skip
                continue

            goods_data['stock_info'] = item_1.get('stock_info')
            goods_data['goods_id'] = item_1.get('goods_id')
            if item_1.get('stock_info').get('activity_stock') > 0:
                # Only override prices while flash-sale stock remains.
                goods_data['price'] = item_1.get('price')  # special price before the flash sale
                goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
            goods_data['sub_title'] = item_1.get('sub_title', '')
            goods_data['miaosha_time'] = item_1.get('miaosha_time')
            (goods_data['miaosha_begin_time'],
             goods_data['miaosha_end_time']) = get_miaosha_begin_time_and_miaosha_end_time(
                 miaosha_time=item_1.get('miaosha_time'))

            juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                data=goods_data, pipeline=pipeline)

            sleep(.2)  # avoid hammering the site

        return True
# Example #9
# 0
class GoodsCouponSpider(AsyncCrawler):
    def __init__(self):
        """
        Configure the base crawler, the concurrency limits and the DB client.
        """
        base_options = dict(
            user_agent_type=PHONE,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/coupon/_/',
            headless=True,
        )
        AsyncCrawler.__init__(self, **base_options)

        # Request settings used when querying the coupon site.
        self.req_num_retries = 7
        self.proxy_type = PROXY_TYPE_HTTPS
        self.driver_load_images = DRIVER_LOAD_IMAGES

        # Slice size for coupon-url lookups; keep it small, otherwise the
        # target site notices the crawler.
        self.concurrency = 10
        # Number of browser pages opened at once; keep it small, otherwise
        # the machine freezes.
        self.concurrency2 = 3
        # Long runs in thread mode ended with "too many open files", hence 0.
        # NOTE(review): 0 presumably selects the non-thread mode -- confirm.
        self.concurrent_type = 0

        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self.init_sql_str()

    async def _fck_run(self):
        """
        Main crawl loop; never returns.

        Each pass loads the candidate goods rows from the DB, filters out the
        ones whose coupon is already known, resolves coupon urls in slices of
        ``self.concurrency`` goods and finally opens every coupon url in a
        browser page, at most ``self.concurrency2`` pages at a time.

        Any exception raised during a pass is logged, followed by a 30s pause;
        every pass (successful or not) ends with a 6s pause.

        :return: None (infinite loop)
        """
        while True:
            try:
                if get_shanghai_time().hour == 0:
                    # No crawling while get_shanghai_time() reports hour 0.
                    await async_sleep(60 * 60 * 3.5)
                    continue

                self.db_res = await self.get_db_res()

                all_tasks_params_list_obj = await self.get_all_tasks_params_list_obj()
                tasks_params_list_obj = TasksParamsListObj(
                    tasks_params_list=all_tasks_params_list_obj,
                    step=self.concurrency,
                    slice_start_index=0,
                )
                while True:
                    try:
                        slice_params_list = tasks_params_list_obj.__next__()
                    except AssertionError:
                        # AssertionError is treated as "no more slices".
                        break

                    coupon_url_list = await self.get_coupon_url_list_by_goods_id_list(
                        slice_params_list=slice_params_list)

                    # Debugging aid: to exercise a single known coupon page,
                    # replace coupon_url_list with a fixed url and put the
                    # matching {'goods_id': ..., 'coupon_url': ...} dict on
                    # goods_id_and_coupon_url_queue.

                    if not coupon_url_list:
                        # Reclaim memory here as well.
                        collect()
                        self.lg.info('coupon_url_list为空list, 跳过!')
                        random_sleep_time = random_uniform(3., 6.)
                        self.lg.info('休眠{}s ...'.format(random_sleep_time))
                        await async_sleep(random_sleep_time)
                        continue

                    # Split coupon_url_list so that too many simultaneously
                    # open pages do not exhaust memory.
                    tasks_params_list_obj2 = TasksParamsListObj(
                        tasks_params_list=coupon_url_list,
                        step=self.concurrency2,
                        slice_start_index=0,
                    )
                    while True:
                        try:
                            slice_params_list2 = tasks_params_list_obj2.__next__()
                        except AssertionError:
                            break

                        tasks = []
                        for coupon_url in slice_params_list2:
                            self.lg.info(
                                'create task[where coupon_url: {}] ...'.format(
                                    coupon_url))
                            tasks.append(
                                self.loop.create_task(
                                    self.intercept_target_api(
                                        coupon_url=coupon_url)))

                        try:
                            one_res = await wait_for(
                                fut=async_wait_tasks_finished(tasks=tasks),
                                timeout=60 * 2,
                            )
                        except AsyncTimeoutError:
                            self.lg.error('遇到错误:', exc_info=True)
                            continue

                        # Number of pages that loaded successfully.
                        success_count = sum(1 for item in one_res if item)
                        # The last slice may hold fewer than concurrency2
                        # tasks, so the rate is relative to the tasks that
                        # were actually started.
                        task_count = len(tasks)
                        success_rate = success_count / task_count if task_count else 0.
                        self.lg.info('成功个数: {}, 成功概率: {:.3f}'.format(
                            success_count, success_rate))
                        collect()

                    collect()

                self.lg.info('一次大循环结束!!')

            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                await async_sleep(30)

            finally:
                self.lg.info('休眠6s...')
                await async_sleep(6.)
                collect()

    async def get_all_tasks_params_list_obj(self) -> list:
        """
        Build the task parameter list from the rows held in ``self.db_res``.

        Rows whose coupon id (uuid3 of the goods id) is already present in the
        module-level ``unique_coupon_id_list`` are skipped.

        :return: list of ``{'goods_id': ..., 'site_id': ...}`` dicts
        """
        pending_params = []
        for row in self.db_res:
            goods_id = row[0]
            # Only single-item coupons are collected at the moment (multi-item
            # ones are not handled), so goods that already have one are dropped.
            coupon_unique_id = str(get_uuid3(target_str=goods_id))
            already_known = coupon_unique_id in unique_coupon_id_list
            if not already_known:
                pending_params.append({
                    'goods_id': goods_id,
                    'site_id': row[1],
                })
            else:
                skip_msg = 'coupon_info 表中已存在coupon_unique_id: {}, goods_id: {}, pass'
                self.lg.info(skip_msg.format(coupon_unique_id, goods_id))

        return pending_params

    async def get_coupon_url_list_by_goods_id_list(self,
                                                   slice_params_list) -> list:
        """
        Resolve the coupon urls for one slice of goods.

        Every goods id in ``slice_params_list`` is looked up through
        ``get_tm_coupon_url_from_lq5u``; afterwards ``coupon_check_time`` of
        each goods row is set to the current time, whether or not a coupon
        was found.

        :param slice_params_list: dicts carrying 'goods_id' and 'site_id'
        :return: the non-empty coupon urls that were found
        """
        def get_create_task_msg(k) -> str:
            template = 'create task[where goods_id: {}, site_id: {}] ...'
            return template.format(k['goods_id'], k['site_id'])

        def get_now_args(k) -> list:
            return [k['goods_id']]

        all_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=slice_params_list,
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self.get_tm_coupon_url_from_lq5u,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
            one_default_res='',
            step=self.concurrency,
            logger=self.lg,
            concurrent_type=self.concurrent_type,
            func_timeout=25,
        )

        # '' is the placeholder for "no coupon found"; keep real urls only.
        res = [one_url for one_url in all_res if one_url != '']

        # Stamp coupon_check_time on every goods id of this slice.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        update_sql = 'update dbo.GoodsInfoAutoGet set coupon_check_time=%s where GoodsID=%s'
        for task_params in slice_params_list:
            goods_id = task_params['goods_id']
            is_updated = False
            try:
                is_updated = sql_cli._update_table_2(
                    sql_str=update_sql,
                    params=(get_shanghai_time(), goods_id),
                    logger=self.lg,
                )
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            flag = '+' if is_updated else '-'
            self.lg.info(
                '[{}] update goods_id: {} coupon_check_time'.format(flag, goods_id))

        # Drop the temporary connection and the raw results before collecting.
        del sql_cli
        del all_res
        collect()

        return res

    async def get_db_res(self) -> list:
        """
        Purge stale coupons, then fetch the goods rows that still need a check.

        On any DB error the error is logged and ``self.sql_cli`` is replaced
        by a fresh connection object.

        :return: non-empty list of rows selected by ``self.sql_tr0``
        :raises AssertionError: when no rows could be fetched
        """
        get_current_func_info_by_traceback(self=self, logger=self.lg)
        purge_sql = 'delete from dbo.coupon_info where GETDATE()-end_time >= 3'
        rows = []
        try:
            self.lg.info('清除过期优惠券ing ...')
            # Remove expired coupons first.
            self.sql_cli._delete_table(sql_str=purge_sql, params=None)
            self.lg.info('休眠15s ...')
            await async_sleep(15)
            self.lg.info('获取新待检测的goods数据ing...')
            selected = self.sql_cli._select_table(sql_str=self.sql_tr0)
            rows = list(selected)
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()

        assert rows != []
        self.lg.info('db_res_len: {}'.format(len(rows)))

        return rows

    async def intercept_target_api(self, coupon_url: str):
        """
        Load ``coupon_url`` in a Chromium page with network interception on.

        Request, response and request-failed events are routed to a
        ``NetworkInterceptorTest`` instance.
        NOTE(review): the interceptor presumably records the target API
        responses; its handlers are not visible here -- confirm.

        The browser is closed in a ``finally`` block, so it is also released
        when page setup fails or when this coroutine is cancelled (for
        example by the caller's timeout). Previously those paths left the
        browser open.

        :param coupon_url: address of the coupon claim page
        :return: True when the page loaded and its washed html is non-empty,
            otherwise False
        """
        chromium_puppeteer = ChromiumPuppeteer(
            load_images=self.driver_load_images,
            executable_path=PYPPETEER_CHROMIUM_DRIVER_PATH,
            ip_pool_type=self.ip_pool_type,
            headless=self.headless,
            user_agent_type=self.user_agent_type,
        )
        driver = await chromium_puppeteer.create_chromium_puppeteer_browser()

        res = False
        try:
            page = await driver.newPage()
            await bypass_chrome_spiders_detection(page=page)

            # Intercepting traffic requires hooking both requests and
            # responses. Puppeteer event reference (class Page):
            # https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md
            await page.setRequestInterception(True)
            network_interceptor = NetworkInterceptorTest()
            page.on(event='request', f=network_interceptor.intercept_request)
            page.on(event='response', f=network_interceptor.intercept_response)
            page.on(event='requestfailed', f=network_interceptor.request_failed)

            try:
                await goto_plus(
                    page=page,
                    url=coupon_url,
                    options={
                        'timeout': 1000 * 45,  # unit: ms
                        # Page finished loading, or no network connections left.
                        'waitUntil': [
                            'domcontentloaded',
                            'networkidle0',
                        ]
                    },
                    num_retries=2,
                )

                # Debugging aids: page.screenshot({... 'fullPage': True}) takes
                # a full-page capture; content inside <iframe> tags is not
                # part of page.content() and has to be read via page.frames.
                body = Requests._wash_html(await page.content())
                res = body != ''

            except (WebsocketsConnectionClosed, InvalidStateError):
                # Connection to the browser went away; report failure quietly.
                pass
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

        finally:
            # Best-effort close, attempted twice like the original code.
            for _ in range(2):
                try:
                    await driver.close()
                    break
                except Exception:
                    continue
            collect()

        return res

    @catch_exceptions_with_class_logger(default_res='')
    def get_tm_coupon_url_from_lq5u(
        self,
        goods_id='',
        goods_name_or_m_url: str = '',
    ) -> str:
        """
        Look up the coupon claim url of ``goods_id`` on m.quanyoubuy.com.

        The item page ``https://m.quanyoubuy.com/item/index/iid/<goods_id>.html``
        is fetched and the claim link is read from it. Only links that contain
        ``uland.taobao.com`` are accepted. When a coupon is found, a
        ``{'goods_id', 'coupon_url'}`` dict is also put on the module-level
        ``goods_id_and_coupon_url_queue``.

        History (the method name still refers to the first source): earlier
        revisions searched http://www.lq5u.com by goods name, read the per-item
        page ``/item/index/iid/<goods_id>.html`` on lq5u.com / www.i075.com, or
        posted ``iid`` to ``/item/ajax_get_auction_code.html`` on
        quan.mmfad.com and read ``data['coupon_click_url']``. On the PC page
        of quanyoubuy the link is the ``text=`` part of ``img#qrcode``'s src.
        Sample ids used for testing: tm 562016826663, 565122084412;
        tb 573406377569.

        :param goods_id: goods id to look up (recommended way to query)
        :param goods_name_or_m_url: goods name or goods address; not used by
            the current implementation, kept for interface compatibility
        :return: coupon claim url, or '' when the goods has no coupon
        :raises AssertionError: when the fetched page body is empty
            (NOTE(review): presumably turned into the default '' by the
            decorator -- confirm)
        """
        # Source: quanyoubuy (https://www.quanyoubuy.com), mobile site.
        headers = get_random_headers(
            user_agent_type=1,
            connection_status_keep_alive=False,
            cache_control='',
        )
        headers.update({
            'authority': 'm.quanyoubuy.com',
        })
        url = 'https://m.quanyoubuy.com/item/index/iid/{}.html'.format(
            goods_id)
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            proxy_type=self.proxy_type,
            num_retries=self.req_num_retries,
        )
        assert body != ''

        # Selector for the mobile page layout.
        coupon_url_sel = {
            'method': 'css',
            'selector': 'div.goods_quan  a.getGoodsLink ::attr("href")',
        }
        coupon_url = parse_field(
            parser=coupon_url_sel,
            target_obj=body,
            logger=self.lg,
            is_print_error=False,
        )

        if 'uland.taobao.com' not in coupon_url:
            # Only addresses containing the host above are claim addresses.
            coupon_url = ''

        if coupon_url != '':
            self.lg.info('[+] 该goods_id: {} 含 有优惠券, coupon领取地址: {}'.format(
                goods_id,
                coupon_url,
            ))
            # Hand the pair over through the queue.
            goods_id_and_coupon_url_queue.put({
                'goods_id': goods_id,
                'coupon_url': coupon_url,
            })
        else:
            self.lg.info('[-] 该goods_id: {} 不含 有优惠券'.format(goods_id))

        # Release the page html before collecting.
        del body
        collect()

        return coupon_url

    def init_sql_str(self):
        """
        Define the SQL text used by this spider.

        ``self.sql_tr0`` selects up to 800 goods (GoodsID, SiteID) that are
        not deleted, belong to site 1, 3, 4 or 6, have no row in
        ``dbo.coupon_info`` yet, ordered by ``coupon_check_time`` ascending.
        """
        goods_to_check_sql = '''
        select top 800 GoodsID, SiteID
        from dbo.GoodsInfoAutoGet
        where MainGoodsID is not null
        and IsDelete=0
        and (SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6)
        and GoodsID not in (select goods_id from dbo.coupon_info)
        -- and MainGoodsID=143509
        -- and GoodsID='18773718545'
        order by coupon_check_time asc
        '''
        self.sql_tr0 = goods_to_check_sql

    def __del__(self):
        """
        Drop the ``concurrency`` and ``loop`` attributes, then collect garbage.

        Any error during removal stops the removal and is ignored.
        """
        try:
            for attr_name in ('concurrency', 'loop'):
                delattr(self, attr_name)
        except BaseException:
            pass
        collect()
예제 #10
0
class MiaPintuanRealTimeUpdate(object):
    def __init__(self):
        """Start without a DB client; ``_get_db_old_data`` creates it."""
        self.sql_cli = None
        self.ip_pool_type = IP_POOL_TYPE

    def run_forever(self):
        '''
        Run one refresh pass over the pintuan (group-buy) goods stored in the db.

        Rows come from ``_get_db_old_data``; when that returns None the method
        sleeps 20s and returns. Every row is classified by ``is_recent_time``
        applied to its pintuan end time:
            0 -> handed to ``_handle_goods_shelves_in_auto_goods_table``
            2 -> left untouched for now
            1 -> goods info is fetched again and written back to the table
        After the pass the method sleeps 5.5h when the current hour is 0,
        otherwise 10 minutes.

        :return: None
        '''
        result = self._get_db_old_data()
        if result is None:
            sleep_time = 20
            print('获取db数据失败, 休眠{}s ...'.format(sleep_time))
            sleep(sleep_time)

            return None

        index = 1
        for item in result:  # refresh every stored row in turn
            goods_id = item[0]
            pid = item[2]
            # end_time looks like: 2020-04-12 00:00:00
            pintuan_end_time = json_2_dict(item[1]).get('end_time')
            pintuan_end_time = datetime_to_timestamp(
                string_to_datetime(pintuan_end_time))
            # print(pintuan_end_time)

            data = {}
            # NOTE(review): presumably reconnects every `remainder` items -- confirm
            self.sql_cli = _block_get_new_db_conn(db_obj=self.sql_cli,
                                                  index=index,
                                                  remainder=50)
            if self.sql_cli.is_connect_success:
                is_recent_time = self.is_recent_time(pintuan_end_time)
                if is_recent_time == 0:
                    # Expired for more than 24h: already back at the original price.
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        update_sql_str=mia_update_str_7,
                        sql_cli=self.sql_cli)
                    print('该goods拼团开始时间为({})'.format(
                        json.loads(item[1]).get('begin_time')))
                    sleep(.4)

                elif is_recent_time == 2:
                    # Expired but still inside the 24h waiting window: nothing is
                    # removed yet (handled once the row is older than 24 hours).
                    pass

                else:  # returned 1: inside the to-be-updated window
                    print(
                        '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'
                        .format(goods_id, index))
                    data['goods_id'] = goods_id
                    try:
                        data_list = get_mia_pintuan_one_page_api_goods_info(
                            page_num=pid)
                    except ResponseBodyIsNullStrException:
                        # Empty response body: skip this row (index is advanced
                        # by hand because `continue` bypasses the loop tail).
                        index += 1
                        sleep(.4)
                        continue

                    # TODO the check below caused on-sale goods to be delisted by
                    #  mistake, so it is disabled and every row is updated instead.
                    # try:
                    #     assert data_list != [], 'data_list不为空list!'
                    # except AssertionError as e:
                    #     print(e)
                    #     _handle_goods_shelves_in_auto_goods_table(
                    #         goods_id=goods_id,
                    #         update_sql_str=mia_update_str_7,
                    #         sql_cli=self.sql_cli)
                    #     sleep(.4)
                    #     index += 1
                    #     continue

                    pintuan_goods_all_goods_id = [
                        item_1.get('goods_id', '') for item_1 in data_list
                    ]
                    # print(pintuan_goods_all_goods_id)
                    # The bare string below is an inline note (in Chinese): goods that
                    # mia delisted internally are not removed; every non-expired goods
                    # is updated, because updating by pid often deleted goods that were
                    # in fact still in pintuan.
                    '''
                    蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品 (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                    '''
                    mia_pt = MiaPintuanParse(is_real_times_update_call=True)
                    if goods_id not in pintuan_goods_all_goods_id:
                        # Not on the fetched page (delisted internally):
                        # updated all the same.
                        try:
                            goods_data = self._get_mia_pt_one_goods_info(
                                mia_pt_obj=mia_pt,
                                goods_id=goods_id,
                            )
                        except AssertionError:
                            # Parsed data is empty: skip this row.
                            index += 1
                            continue

                        # pprint(goods_data)
                        mia_pt.update_mia_pintuan_table(data=goods_data,
                                                        pipeline=self.sql_cli)
                        sleep(MIA_SPIKE_SLEEP_TIME)  # slow down

                    else:
                        # Still listed on the fetched page.
                        for item_2 in data_list:
                            if item_2.get('goods_id', '') == goods_id:
                                sub_title = item_2.get('sub_title', '')
                                try:
                                    goods_data = self._get_mia_pt_one_goods_info(
                                        mia_pt_obj=mia_pt,
                                        goods_id=goods_id,
                                        sub_title=sub_title,
                                    )
                                except AssertionError:
                                    # Parsed data is empty: try the next page entry.
                                    continue

                                # pprint(goods_data)
                                mia_pt.update_mia_pintuan_table(
                                    data=goods_data, pipeline=self.sql_cli)
                                sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
                            else:
                                pass

                    # Release the parser object before the next row.
                    try:
                        del mia_pt
                    except:
                        pass

            else:  # the db connection was not established
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass

            index += 1
            collect()

        print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates during hour 0
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()

    def _get_mia_pt_one_goods_info(self,
                                   mia_pt_obj,
                                   goods_id,
                                   sub_title='') -> dict:
        """
        获取mia单个goods info
        :return:
        """
        mia_pt_obj.get_goods_data(goods_id=goods_id)
        goods_data = mia_pt_obj.deal_with_data()
        assert goods_data != {}, 'goods_data不为空dict'

        goods_data['goods_id'] = str(goods_id)
        goods_data['sub_title'] = sub_title
        if goods_data['pintuan_time'] == {}:  # 当没有拼团时间时,就表示已下架拼团
            now_time = get_shanghai_time()
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = (now_time, now_time)
        else:
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])

        return goods_data

    def _get_db_old_data(self) -> (list, None):
        """
        Open a fresh db connection and load the rows that need refreshing.

        ``mia_delete_str_2`` is executed first, then the rows are selected
        with ``mia_select_str_2``.

        :return: list of rows, or None when a TypeError was raised (treated
            as a failed db connection)
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.sql_cli._delete_table(sql_str=mia_delete_str_2)
            selected = self.sql_cli._select_table(sql_str=mia_select_str_2)
            rows = list(selected)
        except TypeError:
            rows = None
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')

        _block_print_db_old_data(result=rows)

        return rows

    def is_recent_time(self, timestamp) -> int:
        """
        Classify a timestamp relative to the current time.

        :param timestamp: timestamp in seconds (the pintuan end time)
        :return: 1 when the timestamp is still in the future (to be updated),
            0 when it lies more than 24 hours in the past (expired),
            2 otherwise (expired within the last 24 hours, kept waiting)
        """
        target_ts = int(timestamp)
        now_ts = int(datetime_to_timestamp(get_shanghai_time()))  # current timestamp
        seconds_left = target_ts - now_ts

        if seconds_left > 0:
            # Not ended yet: inside the to-be-updated window.
            return 1

        # 24 hours of grace so that the backend can sync the delisting first.
        # (The earlier rule treated every seconds_left < 0 as expired.)
        if seconds_left < -86400:
            return 0

        # Expired, but still waiting inside the 24h grace period.
        return 2

    def __del__(self):
        """Trigger a garbage collection pass when the instance is destroyed."""
        collect()
class Z8Updater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        """
        Initialise the AsyncCrawler base and this updater's bookkeeping state.

        :param params: positional arguments forwarded unchanged to AsyncCrawler
        :param kwargs: keyword arguments forwarded unchanged to AsyncCrawler;
                       ``log_print``, ``log_save_path`` and ``ip_pool_type``
                       are supplied here, so passing them again raises TypeError
        """
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        # Database client; replaced with a fresh one in _get_db_old_data.
        self.sql_cli = None
        # Index written back by _update_one_goods_info after each goods item.
        self.goods_index = 1
        self.concurrency = 8  # batch size (number of concurrent tasks)

    async def _get_db_old_data(self):
        """
        Fetch the flash-sale rows that still need to be refreshed.

        Stores a fresh ``SqlServerMyPageInfoSaveItemPipeline`` on
        ``self.sql_cli``, runs ``z8_delete_str_4``, waits 5 seconds and then
        materialises the rows selected by ``z8_select_str_4``. The outcome is
        always handed to ``_print_db_old_data``.

        :return: list of rows, or None when a TypeError was raised (treated
                 here as a failed database connection)
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.sql_cli._delete_table(sql_str=z8_delete_str_4, params=None)
            await async_sleep(5)
            rows = list(self.sql_cli._select_table(sql_str=z8_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            rows = None

        await _print_db_old_data(logger=self.lg, result=rows)
        return rows

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Refresh, or logically delete, a single flash-sale goods row.

        The row is classified with ``_is_recent_time`` on its begin time:
        0 -> logically deleted, 2 -> logically deleted only when its end time
        has already passed, 1 -> its session is re-fetched and the goods is
        either logically deleted (session empty / goods no longer listed) or
        updated through ``_one_update``.

        :param item: db row; item[0] is the goods_id, item[1] the stored
                     miaosha_time and item[2] the session_id
        :param index: running index of this goods; ``index + 1`` is written
                      to ``self.goods_index`` before returning
        :return: (goods_id, res) where res is False unless a delete/update
                 helper returned something else
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        session_id = item[2]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        # Recreates self.zhe_800_spike whenever index is a multiple of 10.
        await self._get_new_z8_obj(index=index)
        # NOTE(review): presumably reconnects every `remainder` items --
        # confirm against _get_new_db_conn.
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg,
                                              remainder=30)

        if self.sql_cli.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=z8_update_str_6,
                    sql_cli=self.sql_cli,
                )
                self.lg.info(
                    '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                        goods_id,
                        timestamp_to_regulartime(miaosha_begin_time)))
                index += 1
                self.goods_index = index
                await async_sleep(.3)

                return goods_id, res

            elif is_recent_time == 2:
                # This class may also contain goods that have already expired
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    # Logically delete the expired goods
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({0}), 限时秒杀开始时间为({1}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))
                else:
                    self.lg.info(
                        '未来时间暂时不更新! miaosha_begin_time: {}, miaosha_end_time: {}'
                        .format(
                            timestamp_to_regulartime(miaosha_begin_time),
                            timestamp_to_regulartime(miaosha_end_time),
                        ))

                index += 1
                self.goods_index = index

                return goods_id, res

            else:
                # _is_recent_time returned 1: inside the to-be-updated window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                try:
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(
                        base_session_id=str(session_id))
                except Exception:
                    # Fetch failed: log it and skip this goods for this round
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                try:
                    tmp_data = tmp_data.get('data', {}).get('blocks', [])
                    assert tmp_data != [], '该session_id不存在,此处跳过'
                except AssertionError:
                    # This session_id has no data, so the flash-sale goods
                    # belonging to it is logically deleted
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        msg=
                        '该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'
                        .format(goods_id, miaosha_begin_time))
                    index += 1
                    self.goods_index = index
                    await async_sleep(1.2)

                    return goods_id, res

                # Keep only the 'deal' payload of every block
                tmp_data = [item_s.get('deal', {}) for item_s in tmp_data]
                # pprint(tmp_data)
                try:
                    miaosha_goods_list = await self._get_miaoshao_goods_info_list(
                        data=tmp_data)
                    # pprint(miaosha_goods_list)
                except ValueError:
                    # Parsing the session data failed: skip this goods
                    await async_sleep(2)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                # All zids currently present in this session_id
                miaosha_goods_all_goods_id = [
                    i.get('zid') for i in miaosha_goods_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:
                    # Already taken off the flash sale by the platform
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(
                            goods_id))
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:
                    # Still listed: refresh its data
                    res = await self._one_update(
                        miaosha_goods_list=miaosha_goods_list,
                        goods_id=goods_id)

        else:  # is_connect_success is falsy: no usable database connection
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        # Shared tail for the update path and the failed-connection path
        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.5)

        return goods_id, res

    async def _one_update(self, **kwargs) -> bool:
        '''
        Refresh a goods item that is still listed in its flash-sale session.

        Only the first entry of ``miaosha_goods_list`` whose ``zid`` equals
        ``goods_id`` is processed. When its ``activity_stock`` is not positive
        the goods is logically deleted instead of updated.

        :param kwargs: ``miaosha_goods_list`` (list of parsed session entries)
                       and ``goods_id``
        :return: False when nothing was done (no match, or empty goods data),
                 otherwise whatever the delete/update helper returned
        '''
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        parser = Zhe800Parse()
        res = False
        matched = next(
            (entry for entry in miaosha_goods_list
             if entry.get('zid', '') == goods_id),
            None,
        )

        if matched is not None:
            parser.get_goods_data(goods_id=goods_id)
            goods_data = parser.deal_with_data()
            if goods_data != {}:  # empty data means: skip this goods
                stock_info = matched.get('stock_info')
                goods_data['stock_info'] = stock_info
                goods_data['goods_id'] = str(matched.get('zid'))

                if stock_info.get('activity_stock') > 0:
                    goods_data['price'] = matched.get('price')
                    goods_data['taobao_price'] = matched.get('taobao_price')
                    goods_data['sub_title'] = matched.get('sub_title')
                    goods_data['miaosha_time'] = matched.get('miaosha_time')
                    begin_time, end_time = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=matched.get('miaosha_time'))
                    goods_data['miaosha_begin_time'] = begin_time
                    goods_data['miaosha_end_time'] = end_time

                    if goods_data.get('is_delete', 0) == 1:
                        self.lg.info('该商品[{0}]已售罄...'.format(goods_id))

                    res = parser.to_update_zhe_800_xianshimiaosha_table(
                        data=goods_data, pipeline=self.sql_cli)
                else:
                    # No activity stock left: take the goods off the shelf
                    self.lg.info('该商品参与活动的对应库存为0')
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=z8_update_str_6,
                        sql_cli=self.sql_cli,
                    )

        collect()

        return res

    async def _get_new_z8_obj(self, index):
        if index % 10 == 0:  # 不能共享一个对象了, 否则驱动访问会异常!
            try:
                del self.zhe_800_spike
            except:
                pass
            collect()
            self.zhe_800_spike = Zhe800Spike()

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a timestamp by its distance from the current Shanghai time.

        :param timestamp: unix timestamp (anything ``int()`` accepts)
        :return: 0 -> more than 72 hours (259200 s) in the past
                 1 -> less than 48 hours in the past and less than 2 hours
                      in the future (both bounds exclusive)
                 2 -> everything else: 2 hours or more in the future, or
                      between 48 and 72 hours in the past
        '''
        target_ts = int(timestamp)
        now_ts = datetime_to_timestamp(get_shanghai_time())
        offset = target_ts - now_ts

        # 72 h (instead of the earlier 48 h cut-off of -172800) gives the
        # backend time to take the goods off the shelf in sync.
        if offset < -259200:
            return 0

        if -172800 < offset < 7200:
            return 1

        return 2

    async def _update_db(self):
        '''
        Endless refresh loop for the flash-sale table.

        Each round: create a new logger, load the rows to refresh, update
        them in batches of ``self.concurrency`` concurrent tasks, then sleep
        (5.5 hours when the Shanghai hour is 0, otherwise 2.5 minutes) and
        drop ``self.zhe_800_spike``.

        :return: never returns
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            db_rows = await self._get_db_old_data()
            if db_rows is not None:
                self.goods_index = 1
                batches = TasksParamsListObj(
                    tasks_params_list=db_rows, step=self.concurrency)
                self.zhe_800_spike = Zhe800Spike()
                task_index = 1
                while True:
                    try:
                        batch = batches.__next__()
                    except AssertionError:
                        # Raised once every row has been handed out
                        break

                    pending = []
                    for row in batch:
                        self.lg.info('创建 task goods_id: {}'.format(row[0]))
                        coro = self._update_one_goods_info(item=row,
                                                           index=task_index)
                        pending.append(self.loop.create_task(coro))
                        task_index += 1

                    await _get_async_task_result(tasks=pending, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            # No updates after midnight: take a long pause during hour 0
            pause_seconds = (60 * 60 * 5.5
                             if get_shanghai_time().hour == 0
                             else 2.5 * 60)
            await async_sleep(pause_seconds)

            try:
                del self.zhe_800_spike
            except:
                pass
            collect()

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        得到秒杀商品有用信息
        :param data: 待解析的data
        :return: 有用信息list
        '''
        miaosha_goods_list = []
        # pprint(data)
        for item in data:
            if item == {}:
                continue
            # pprint(item)
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time':
                timestamp_to_regulartime(int(
                    str(item.get('begin_time'))[0:10])),
                'miaosha_end_time':
                timestamp_to_regulartime(int(str(item.get('end_time'))[0:10])),
            }

            # 折800商品地址
            tmp['zid'] = item.get('zid')
            # 限时秒杀的库存信息
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock',
                                           0),  # activity_stock为限时抢的剩余数量
                'stock': item.get('stock', 0),  # stock为限时秒杀的总库存
            }
            # 原始价格
            tmp['price'] = float(item.get('list_price'))
            # 秒杀的价格, float类型
            tmp['taobao_price'] = float(item.get('price'))
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
            # pprint(miaosha_goods_list)

        return miaosha_goods_list

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.zhe_800_spike
        except:
            pass
        collect()
# Example #12
# 0
class CCUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        """
        Initialise the AsyncCrawler base and this updater's bookkeeping state.

        :param params: positional arguments forwarded unchanged to AsyncCrawler
        :param kwargs: keyword arguments forwarded unchanged to AsyncCrawler;
                       ``log_print``, ``log_save_path`` and ``ip_pool_type``
                       are supplied here, so passing them again raises TypeError
        """
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/楚楚街/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        # Database client; replaced with a fresh one in _get_db_old_data.
        self.sql_cli = None
        self.concurrency = 8  # batch size (number of concurrent tasks)
        # Index written back by _update_one_goods_info after each goods item.
        self.goods_index = 1
        self.delete_sql_str = cc_delete_str_1

    async def _get_pc_headers(self):
        """
        Build the request headers for the chuchujie API.

        Starts from ``async_get_random_headers`` and overrides the
        ``accept``, ``Content-Type``, ``Host`` and ``referer`` entries.

        :return: the updated headers object
        """
        api_overrides = {
            'accept': 'application/json,text/javascript,*/*;q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'api.chuchujie.com',
            'referer': 'https://m.chuchujie.com/?module=99',
        }
        base_headers = await async_get_random_headers(
            upgrade_insecure_requests=False)
        base_headers.update(api_overrides)

        return base_headers

    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the flash-sale rows that still need to be refreshed.

        Stores a fresh ``SqlServerMyPageInfoSaveItemPipeline`` on
        ``self.sql_cli``, runs ``cc_delete_str_2``, waits 5 seconds and then
        materialises the rows selected by ``cc_select_str_1``. The outcome is
        always handed to ``_print_db_old_data``.

        :return: list of rows, or None when a TypeError was raised (treated
                 here as a failed database connection)
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.sql_cli._delete_table(sql_str=cc_delete_str_2)
            await async_sleep(5)
            rows = list(self.sql_cli._select_table(sql_str=cc_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            rows = None

        await _print_db_old_data(logger=self.lg, result=rows)
        return rows

    async def _get_new_cc_obj(self, index):
        if index % 10 == 0:  # 不能共享一个对象了, 否则驱动访问会异常!
            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()
            self.chuchujie_miaosha = ChuChuJie_9_9_Parse()

        return

    async def _update_one_goods_info(self, item, index):
        '''
        Refresh, or logically delete, a single flash-sale goods row.

        The row is classified with ``_is_recent_time`` on its end time:
        0 -> logically deleted, 2 -> logically deleted only when its end time
        has already passed, 1 -> its listing page is re-fetched and the goods
        is either logically deleted (page has no items) or updated through
        ``_one_update``.

        :param item: db row; item[0] is the goods_id, item[1] the stored
                     miaosha_time, item[2] the gender and item[3] the page
        :param index: running index of this goods; ``index + 1`` is written
                      to ``self.goods_index`` before returning
        :return: (goods_id, res) where res is False unless a delete/update
                 helper returned something else
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        gender = item[2]
        page = item[3]

        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        # Recreates self.chuchujie_miaosha whenever index is a multiple of 10.
        await self._get_new_cc_obj(index=index)
        # NOTE(review): presumably reconnects every `remainder` items --
        # confirm against _get_new_db_conn.
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            remainder=25,
        )

        if self.sql_cli.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=cc_update_str_2,
                    sql_cli=self.sql_cli,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_end_time)))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                # Logically delete only when the sale has really ended
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=cc_update_str_2,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_end_time)))

                else:
                    pass

                index += 1
                self.goods_index = index

                return goods_id, res

            else:  # _is_recent_time returned 1: inside the to-be-updated window
                # Original author's note: objects are created late and
                # released afterwards to keep memory usage down
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                body = await self._get_one_page_goods_info(gender, page)
                if body == '':
                    # Empty response: skip this goods for this round
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                json_body = json_2_dict(body, default_res={})
                try:
                    this_page_total_count = json_body.get('data', {}).get(
                        'groupList', [])[0].get('totalCount', 0)
                except IndexError:
                    # groupList is empty: treat the page as having no items
                    self.lg.error('获取this_page_total_count时出错, 请检查!')
                    this_page_total_count = 0

                item_list = await self._get_item_list(
                    this_page_total_count=this_page_total_count,
                    json_body=json_body)
                if item_list == []:
                    # No items for this gender/page: the goods was removed
                    # from the flash sale, so it is logically deleted
                    self.lg.info(
                        '#### 该gender, page对应得到的item_list为空[]!\n该商品已被下架限时秒杀活动,此处将其删除'
                    )
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=item[0],
                        logger=self.lg,
                        update_sql_str=cc_update_str_2,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    await async_sleep(.3)
                    index += 1
                    self.goods_index = index

                    return goods_id, res

                else:
                    res = await self._one_update(goods_id=goods_id,
                                                 item_list=item_list)

        else:  # is_connect_success is falsy: no usable database connection
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            pass

        # Shared tail for the update path and the failed-connection path
        index += 1
        self.goods_index = index
        collect()
        await async_sleep(CHUCHUJIE_SLEEP_TIME)

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Endless refresh loop for the flash-sale table.

        Each round: create a new logger, load the rows to refresh, update
        them in batches of ``self.concurrency`` concurrent tasks, then sleep
        (5.5 hours when the Shanghai hour is 0, otherwise 2.5 minutes) and
        drop ``self.chuchujie_miaosha``.

        :return: never returns
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            db_rows = await self._get_db_old_data()
            if db_rows is not None:
                self.goods_index = 1
                batches = TasksParamsListObj(
                    tasks_params_list=db_rows, step=self.concurrency)
                self.chuchujie_miaosha = ChuChuJie_9_9_Parse()
                task_index = 1
                while True:
                    try:
                        batch = batches.__next__()
                    except AssertionError:
                        # Raised once every row has been handed out
                        break

                    pending = []
                    for row in batch:
                        self.lg.info('创建 task goods_id: {}'.format(row[0]))
                        coro = self._update_one_goods_info(item=row,
                                                           index=task_index)
                        pending.append(self.loop.create_task(coro))
                        task_index += 1

                    await _get_async_task_result(tasks=pending, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            # No updates after midnight: take a long pause during hour 0
            pause_seconds = (60 * 60 * 5.5
                             if get_shanghai_time().hour == 0
                             else 2.5 * 60)
            await async_sleep(pause_seconds)

            try:
                del self.chuchujie_miaosha
            except:
                pass
            collect()

    async def _get_item_list(self, **kwargs) -> list:
        '''
        获取对应gender, page的商品list
        :return:
        '''
        this_page_total_count = kwargs.get('this_page_total_count')
        json_body = kwargs.get('json_body')
        tmp_goods_list = json_body.get('data',
                                       {}).get('groupList',
                                               [])[0].get('dataList', [])

        item_list = [{
            'goods_id': str(item_s.get('chuchuId', '')),
            'sub_title': item_s.get('description', ''),
        } for item_s in tmp_goods_list] if this_page_total_count != 0 else []

        return item_list

    async def _one_update(self, **kwargs):
        '''
        Refresh a goods item that is still inside its sale window.

        The goods data is re-fetched through ``self.chuchujie_miaosha`` and
        written back with ``update_chuchujie_xianshimiaosha_table``. The
        flash-sale time and sub_title are deliberately left untouched.
        According to the original author's notes, goods are not taken down
        early on this platform, so every goods inside its sale window is
        simply updated.

        :param kwargs: ``goods_id``; ``item_list`` is accepted but not used
        :return: False when the fetched goods data is empty, otherwise the
                 result of the table update
        '''
        goods_id = kwargs.get('goods_id')
        parser = self.chuchujie_miaosha

        parser.get_goods_data(goods_id=goods_id)
        goods_data = parser.deal_with_data()
        if goods_data == {}:
            # Nothing could be parsed: skip the update
            return False

        goods_data['goods_id'] = str(goods_id)
        return parser.update_chuchujie_xianshimiaosha_table(
            data=goods_data, pipeline=self.sql_cli)

    async def _get_one_page_goods_info(self, *params) -> str:
        '''
        Request one listing page from the chuchujie API.

        The ``client`` and ``query`` dicts are JSON-encoded and sent as query
        string parameters (a plain GET, not a POST body).

        :param params: exactly two values: (gender, page)
        :return: whatever ``Requests.get_url_body`` returns; callers treat
                 an empty string as a failed request
        '''
        gender, page = params

        client_info = {
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,  # '0' -> female | '1' -> male
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        }
        query_info = {"group": 4, "module": "99", "page": page, "tab": "all"}
        request_params = {
            'client': json.dumps(client_info),
            'query': json.dumps(query_info),
            'page': page,
        }

        return Requests.get_url_body(url='https://api.chuchujie.com/api/',
                                     headers=self.headers,
                                     params=request_params,
                                     ip_pool_type=self.ip_pool_type)

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a timestamp by its distance from the current Shanghai time.

        :param timestamp: unix timestamp (anything ``int()`` accepts)
        :return: 0 -> more than 100000 s (about 27.8 hours) in the past
                 1 -> strictly in the future
                 2 -> already passed, but by no more than 100000 s
        '''
        target_ts = int(timestamp)
        now_ts = datetime_to_timestamp(get_shanghai_time())
        offset = target_ts - now_ts

        # Deliberately larger than 24 h (-86400) so goods that are still
        # being sold are not taken down; the very first version used
        # "offset < 0".
        if offset < -100000:
            return 0

        if offset > 0:
            return 1

        # Passed, but still inside the grace period: nothing is deleted yet.
        return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.chuchujie_miaosha
        except:
            pass
        collect()
def run_forever():
    """
    Endless refresh loop for the Zhe800 group-buy (pintuan) table.

    Each round runs ``z8_delete_str_1``, loads the rows selected by
    ``z8_select_str_2`` and processes them one by one: a goods whose page
    no longer exists (``get_goods_data`` returns a string starting with
    'ze') or whose db ``is_delete`` flag is 1 is taken off the shelf,
    every other goods is updated. After a round the loop sleeps 5.5 hours
    when the Shanghai hour is 0 and 10 minutes otherwise.

    The row index now advances on every row. Previously the early
    ``continue`` skipped the increment, so the every-300 pause and the
    every-50 reconnect could fire again on consecutive rows.

    :return: never returns
    """
    while True:
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            # enumerate() advances the index for every row, including rows
            # that leave the iteration early through `continue`.
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                db_is_delete = item[1]
                # Created per row and released at the end of the iteration
                # to keep memory usage low.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:  # pause 3 minutes every 300 rows
                    sleep_time = 3 * 60
                    # Announce the pause before it starts, not afterwards.
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # A string starting with 'ze' marks a goods page that
                    # no longer exists.
                    if isinstance(tmp_tmp, str) and tmp_tmp.startswith('ze'):
                        try:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        except Exception as e:
                            # Best effort, as before: on failure fall through
                            # to the regular update path, but report it.
                            print('goods_id({}) off-shelf handling failed: {!r}'
                                  .format(goods_id, e))
                        else:
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        if db_is_delete == 1:
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)

                else:  # no usable database connection
                    print('数据库连接失败,数据库可能关闭或者维护中')

                del zhe_800_pintuan
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
class MIUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        '''
        Initialise the crawler base class and this updater's bookkeeping.

        All positional and keyword arguments are forwarded to
        ``AsyncCrawler.__init__``; console logging, the log directory and the
        proxy-pool type are fixed here.
        '''
        log_dir = MY_SPIDER_LOGS_PATH + '/蜜芽/秒杀实时更新/'
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=log_dir,
            ip_pool_type=IP_POOL_TYPE,
        )
        # The DB connection is created later, in _get_db_old_data().
        self.tmp_sql_server = None
        self.delete_sql_str = mia_delete_str_3
        # Number of goods rows processed concurrently per batch.
        self.concurrency = 8
        self.goods_index = 1

    async def _get_pc_headers(self) -> dict:
        '''
        Build request headers for m.mia.com.

        :return: the headers produced by ``async_get_random_headers`` with the
            ``Host`` entry set to ``m.mia.com``
        '''
        pc_headers = await async_get_random_headers(
            upgrade_insecure_requests=False)
        pc_headers['Host'] = 'm.mia.com'

        return pc_headers

    async def _get_db_old_data(self):
        '''
        Open a fresh DB connection, purge stale rows and load the rows to update.

        Runs ``mia_delete_str_4``, waits 5 seconds, then runs
        ``mia_select_str_3``.

        :return: list of selected rows, or ``None`` when a ``TypeError`` was
            raised (logged as a failed DB connection)
        '''
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
            await async_sleep(5)
            rows = self.tmp_sql_server._select_table(sql_str=mia_select_str_3)
            result = list(rows)
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_end_time(self, miaosha_time):
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        miaosha_end_time = int(
            str(
                time.mktime(
                    time.strptime(miaosha_end_time,
                                  '%Y-%m-%d %H:%M:%S')))[0:10])

        return miaosha_end_time

    async def _get_new_mia_obj(self, index):
        '''
        Replace ``self.mia_miaosha`` with a fresh ``MiaParse`` on every 10th index.

        The original author's note: one parser object must not be shared
        indefinitely, otherwise driver access misbehaves.

        :param index: running task index; nothing happens unless it is a
            multiple of 10
        '''
        if index % 10 != 0:
            return

        try:
            del self.mia_miaosha
        except AttributeError:
            # No parser had been attached yet; nothing to release.
            pass
        collect()
        self.mia_miaosha = MiaParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update, or take off the shelves, a single flash-sale goods row.

        :param item: row from the old-data query; ``item[0]`` is the goods_id,
            ``item[1]`` the stored miaosha_time and ``item[2]`` the promotion
            id (pid)
        :param index: running task index, passed to ``_get_new_mia_obj`` and
            ``_get_new_db_conn`` and used to advance ``self.goods_index``
        :return: ``(goods_id, res)``; ``res`` stays ``False`` unless one of the
            DB helpers was called and returned something else
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        pid = item[2]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_mia_obj(index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30,
        )

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # Ended more than 24 hours ago: take the goods down.
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=mia_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_begin_time)))
                await async_sleep(.5)
                self.goods_index = index + 1

                return goods_id, res

            elif is_recent_time == 2:
                # Ended within the last 24 hours: take the goods down only if
                # the current time is strictly past the end time.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))

                self.goods_index = index + 1

                return goods_id, res

            else:  # 1: the sale has not ended yet, refresh it
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                    pid)
                # NOTE(review): self.headers is not assigned in this class;
                # presumably provided by AsyncCrawler -- confirm.
                body = Requests.get_url_body(url=tmp_url,
                                             headers=self.headers,
                                             had_referer=True,
                                             ip_pool_type=self.ip_pool_type)
                body = '' if body == '' or body == '[]' else body
                try:
                    tmp_data = json_2_dict(
                        json_str=body,
                        default_res={},
                        logger=self.lg,
                    )
                    if tmp_data == {}:
                        # Raised explicitly instead of via `assert`: asserts
                        # are stripped under `python -O`, and an empty
                        # response would then fall through to the "removed
                        # from the promotion" branch and wrongly take the
                        # goods down.
                        raise AssertionError('tmp_data为空dict!')
                except AssertionError:
                    self.lg.error('遇到错误:', exc_info=True)
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                item_list = tmp_data.get('item_list', [])
                # Every goods_id currently listed under this pid.
                miaosha_goods_all_goods_id = [
                    item_1.get('item_id', '') for item_1 in item_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:
                    # No longer part of the promotion: take it down.
                    self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                else:  # still listed: refresh its data
                    res = await self._one_update(
                        item_list=item_list,
                        goods_id=goods_id,
                        tmp_data=tmp_data,
                    )

        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        await async_sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
        self.goods_index = index + 1
        collect()

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Refresh the flash-sale rows in an endless loop.

        Each cycle gets a new logger, reloads the rows via
        ``_get_db_old_data`` and processes them in slices of
        ``self.concurrency`` concurrent tasks. It then sleeps for 5.5 hours
        when ``get_shanghai_time().hour`` is 0, otherwise 2.5 minutes, and
        finally drops the parser object. This coroutine never returns.
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.mia_miaosha = MiaParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        # Per the original note: everything has been
                        # consumed, this is the normal exit.
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                pause = 60 * 60 * 5.5
            else:
                pause = 2.5 * 60
            await async_sleep(pause)

            try:
                del self.mia_miaosha
            except AttributeError:
                # The parser is only created when rows were loaded.
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        未下架的更新
        :param kwargs:
        :return:
        '''
        res = False
        item_list = kwargs.get('item_list')
        goods_id = kwargs.get('goods_id')
        tmp_data = kwargs.get('tmp_data')

        begin_time, end_time = await self._get_begin_time_and_end_time(tmp_data
                                                                       )
        for item_2 in item_list:
            if item_2.get('item_id', '') == goods_id:
                self.mia_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.mia_miaosha.deal_with_data()
                if goods_data == {}:  # 返回的data为空则跳过
                    pass
                else:
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['price'] = item_2.get('active_price')
                    goods_data['taobao_price'] = item_2.get('active_price')
                    goods_data['sub_title'] = item_2.get('short_info', '')
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time':
                        timestamp_to_regulartime(begin_time),
                        'miaosha_end_time': timestamp_to_regulartime(end_time),
                    }
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data['miaosha_time'])

                    res = self.mia_miaosha.update_mia_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    break
            else:
                pass

        return res

    async def _get_begin_time_and_end_time(self, tmp_data) -> tuple:
        begin_time = tmp_data.get('p_info', {}).get('start_time', '')
        end_time = tmp_data.get('p_info', {}).get('end_time', '')
        # 把str字符串类型转换为时间戳的形式
        begin_time = int(
            time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
        end_time = int(
            time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))

        return begin_time, end_time

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a timestamp relative to the current time.

        :param timestamp: timestamp to classify (callers pass the sale's end
            time); anything accepted by ``int()``
        :return: 0 when it lies more than 24 hours (86400 s) in the past,
            1 when it is still in the future,
            2 when it is in the past by at most 24 hours
        '''
        target_ts = int(timestamp)
        now_ts = datetime_to_timestamp(get_shanghai_time())
        delta = target_ts - now_ts

        if delta > 0:
            return 1
        # The 24-hour grace period exists so the backend can sync the
        # take-down (per the original note).
        if delta < -86400:
            return 0

        return 2

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        try:
            del self.mia_miaosha
        except:
            pass
        collect()
예제 #15
0
    def run_forever(self):
        '''
        Run one pass of the flash-sale refresh, then sleep.

        Despite the name this method does not loop by itself: it loads every
        ``site_id=20`` row of ``dbo.mia_xianshimiaosha``, processes each row
        once, and finally sleeps for 5.5 hours when
        ``get_shanghai_time().hour`` is 0, otherwise 5 seconds.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_xianshimiaosha where site_id=20'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            for index, item in enumerate(result, start=1):
                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection
                    # does not go unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                self._refresh_one_mia_goods(
                    item=item,
                    index=index,
                    tmp_sql_server=tmp_sql_server,
                )
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_one_mia_goods(self, item, index, tmp_sql_server):
        '''
        Delete, skip or refresh one ``(goods_id, miaosha_time, pid)`` row.

        :param item: DB row; ``item[1]`` is a JSON string with
            ``miaosha_begin_time`` / ``miaosha_end_time``
        :param index: 1-based row position, used only for the progress output
        :param tmp_sql_server: DB pipeline used for deletes and updates
        :return: None
        '''
        goods_id = item[0]
        stored_time = json.loads(item[1])
        # int() on the float replaces the former str(...)[0:10] slice, which
        # only worked for 10-digit timestamps.
        miaosha_end_time = int(
            time.mktime(
                time.strptime(stored_time.get('miaosha_end_time'),
                              '%Y-%m-%d %H:%M:%S')))

        # Created per row on purpose: the original author found that a single
        # long-lived parser holds on to a lot of memory.
        mia_miaosha = MiaParse()

        if not tmp_sql_server.is_connect_success:
            print('数据库连接失败,数据库可能关闭或者维护中')
            return

        recent_flag = self.is_recent_time(miaosha_end_time)
        if recent_flag == 0:
            # params must be a tuple; `(x)` without the comma is just `x`.
            tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                         params=(goods_id,))
            print(
                '过期的goods_id为(%s)' % goods_id,
                ', 限时秒杀开始时间为(%s), 删除成功!' %
                stored_time.get('miaosha_begin_time'))
            return

        if recent_flag == 2:
            # Skip only this row: the rows are not ordered, so later rows may
            # still need an update.
            return

        # recent_flag == 1: the row is inside the update window.
        print(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
            % (goods_id, index))
        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
            item[2])
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)
        if body == '' or body == '[]':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            tmp_data = json.loads(body)
        except (TypeError, ValueError):
            # Really skip the row. Previously execution continued with an
            # empty dict and crashed in strptime('').
            print('json.loads转换body时出错, 此处跳过!')
            return

        try:
            p_info = tmp_data.get('p_info', {})
            begin_time = int(
                time.mktime(
                    time.strptime(p_info.get('start_time', ''),
                                  '%Y/%m/%d %H:%M:%S')))
            end_time = int(
                time.mktime(
                    time.strptime(p_info.get('end_time', ''),
                                  '%Y/%m/%d %H:%M:%S')))
        except (AttributeError, TypeError, ValueError):
            # A malformed response must neither abort the whole pass nor be
            # mistaken for "goods removed from the promotion".
            print('解析p_info中的秒杀时间出错, 此处跳过!')
            return

        item_list = tmp_data.get('item_list', [])
        # Every goods_id currently listed under this pid.
        miaosha_goods_all_goods_id = [
            item_1.get('item_id', '') for item_1 in item_list
        ]
        if goods_id not in miaosha_goods_all_goods_id:
            # The goods left the promotion: delete its row.
            print('该商品已被下架限时秒杀活动,此处将其删除')
            tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                         params=(goods_id,))
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        for item_2 in item_list:
            if item_2.get('item_id', '') != goods_id:
                continue

            mia_miaosha.get_goods_data(goods_id=goods_id)
            goods_data = mia_miaosha.deal_with_data()
            if goods_data == {}:
                continue

            goods_data['goods_id'] = str(goods_id)
            goods_data['price'] = item_2.get('active_price')
            goods_data['taobao_price'] = item_2.get('active_price')
            goods_data['sub_title'] = item_2.get('short_info', '')
            goods_data['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                'miaosha_end_time': timestamp_to_regulartime(end_time),
            }
            goods_data['miaosha_begin_time'], goods_data[
                'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['miaosha_time'])

            mia_miaosha.update_mia_xianshimiaosha_table(
                data=goods_data, pipeline=tmp_sql_server)
            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
예제 #16
0
class JPUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        '''
        Initialise the crawler base class and this updater's bookkeeping.

        All positional and keyword arguments are forwarded to
        ``AsyncCrawler.__init__``; console logging, the log directory and the
        proxy-pool type are fixed here.
        '''
        log_dir = MY_SPIDER_LOGS_PATH + '/卷皮/秒杀实时更新/'
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=log_dir,
            ip_pool_type=IP_POOL_TYPE,
        )
        self.delete_sql_str = jp_delete_str_3
        # Number of goods rows processed concurrently per batch.
        self.concurrency = 8
        self.goods_index = 1
        # The DB connection is created later, in _get_db_old_data().
        self.tmp_sql_server = None

    async def _get_pc_headers(self) -> dict:
        '''
        Build the request headers for m.juanpi.com.

        :return: a new header dict whose ``User-Agent`` comes from
            ``get_random_pc_ua()``
        '''
        pc_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
        }
        # A random user agent is picked on every call.
        pc_headers['User-Agent'] = get_random_pc_ua()

        return pc_headers

    async def _get_db_old_data(self) -> (None, list):
        '''
        Open a fresh DB connection, purge stale rows and load the rows to update.

        Runs ``jp_delete_str_4``, waits 5 seconds, then runs
        ``jp_select_str_4``.

        :return: list of selected rows, or ``None`` when a ``TypeError`` was
            raised (logged as a failed DB connection)
        '''
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            rows = self.tmp_sql_server._select_table(sql_str=jp_select_str_4)
            result = list(rows)
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        '''
        Extract the flash-sale begin time as an integer Unix timestamp.

        :param miaosha_time: value accepted by ``json_2_dict`` that yields a
            ``miaosha_begin_time`` field formatted as ``%Y-%m-%d %H:%M:%S``
        :return: seconds since the epoch as computed by ``time.mktime``
        '''
        begin_time_str = json_2_dict(miaosha_time).get('miaosha_begin_time')
        # int() truncates the float directly. The previous str(...)[0:10]
        # slice only worked for timestamps with exactly 10 integer digits.
        return int(time.mktime(time.strptime(begin_time_str, '%Y-%m-%d %H:%M:%S')))

    async def _get_new_jp_obj(self, index):
        '''
        Replace ``self.juanpi_miaosha`` with a fresh ``JuanPiParse`` on every 10th index.

        The original author's note: one parser object must not be shared
        indefinitely, otherwise driver access misbehaves.

        :param index: running task index; nothing happens unless it is a
            multiple of 10
        '''
        if index % 10 != 0:
            return

        try:
            del self.juanpi_miaosha
        except AttributeError:
            # No parser had been attached yet; nothing to release.
            pass
        collect()
        self.juanpi_miaosha = JuanPiParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update, logically delete or skip a single flash-sale goods row.

        :param item: row from the old-data query; ``item[0]`` is the goods_id,
            ``item[1]`` the stored miaosha_time, ``item[2]`` the tab_id and
            ``item[3]`` the page the goods was found on
        :param index: running task index, passed to ``_get_new_jp_obj`` and
            ``_get_new_db_conn`` and used to advance ``self.goods_index``
        :return: ``(goods_id, res)``; ``res`` stays ``False`` unless a DB
            operation was performed and returned something else
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        tab_id = item[2]
        page = item[3]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        await self._get_new_jp_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(db_obj=self.tmp_sql_server, index=index, logger=self.lg, remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # Began more than 72 hours ago: logical delete.
                res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(goods_id, miaosha_begin_time))
                await async_sleep(.3)
                self.goods_index = index + 1

                return goods_id, res

            elif is_recent_time == 2:
                # Outside the update window (see _is_recent_time): skip.
                self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
                self.goods_index = index + 1

                return goods_id, res

            else:  # 1: inside the update window
                self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(page),
                )
                body = Requests.get_url_body(url=tmp_url, headers=await self._get_pc_headers(), ip_pool_type=self.ip_pool_type)
                try:
                    # The checks raise explicitly instead of using `assert`:
                    # asserts are stripped under `python -O`, and an empty
                    # response would then reach the logical-delete branch
                    # below and wrongly delete the goods.
                    data = json_2_dict(body, default_res={}).get('data', {})
                    if data == {}:
                        raise AssertionError('data为空dict!')
                    data = data.get('goodslist', [])
                    if data == []:
                        raise AssertionError('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(tab_id, page))
                except AssertionError:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=data)
                # Every goods_id currently listed on this tab_id/page.
                miaosha_goods_all_goods_id = [i.get('goods_id') for i in miaosha_goods_list]
                self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    if miaosha_goods_all_goods_id != []:
                        # Original author's finding: a non-empty listing means
                        # the goods is not really off the shelves, so skip it.
                        self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
                    else:
                        # NOTE(review): this looks unreachable, because the
                        # goodslist check above already rejects an empty
                        # list -- confirm before relying on it.
                        res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                        self.lg.info('该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(goods_id))

                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                else:  # still listed: refresh its data
                    res = await self._one_update(miaosha_goods_list=miaosha_goods_list, goods_id=goods_id)

        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        self.goods_index = index + 1
        await async_sleep(1.2)

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Refresh the flash-sale rows in an endless loop.

        Each cycle gets a new logger, reloads the rows via
        ``_get_db_old_data`` and processes them in slices of
        ``self.concurrency`` concurrent tasks. It then sleeps for 5.5 hours
        when ``get_shanghai_time().hour`` is 0, otherwise 2.5 minutes, and
        finally drops the parser object. This coroutine never returns.
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.juanpi_miaosha = JuanPiParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        # Per the original note: everything has been
                        # consumed, this is the normal exit.
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                pause = 60 * 60 * 5.5
            else:
                pause = 2.5 * 60
            await async_sleep(pause)

            try:
                del self.juanpi_miaosha
            except AttributeError:
                # The parser is only created when rows were loaded.
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        未下架的更新
        :param kwargs:
        :return:
        '''
        res = False
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') == goods_id:
                self.juanpi_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.juanpi_miaosha.deal_with_data()
                if goods_data == {}:  # 返回的data为空则跳过
                    break
                else:  # 否则就解析并且插入
                    goods_data['stock_info'] = item_1.get('stock_info')
                    goods_data['goods_id'] = item_1.get('goods_id')
                    # goods_data['username'] = '******'
                    if item_1.get('stock_info').get('activity_stock') > 0:
                        goods_data['price'] = item_1.get('price')  # 秒杀前的原特价
                        goods_data['taobao_price'] = item_1.get('taobao_price')  # 秒杀价
                    else:
                        pass
                    goods_data['sub_title'] = item_1.get('sub_title', '')
                    goods_data['miaosha_time'] = item_1.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item_1.get('miaosha_time'))

                    res = self.juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                        data=goods_data,
                        pipeline=self.tmp_sql_server)
                    await async_sleep(.3)  # 避免太快
                    break
            else:
                pass

        return res

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        得到秒杀商品有用信息
        :param data: 待解析的data
        :return: 有用信息list
        '''
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(item.get('start_time'))),
                'miaosha_end_time': timestamp_to_regulartime(int(item.get('end_time'))),
            }
            stock = item.get('stock', 0)
            tmp['goods_id'] = item.get('goods_id')
            # 限时秒杀库存信息
            tmp['stock_info'] = {
                'activity_stock': int(item.get('stock', 0)*(item.get('rate', 0)/100)),
                'stock': item.get('stock', 0),
            }
            # 原始价格
            tmp['price'] = round(float(item.get('oprice', '0')), 2)
            tmp['taobao_price'] = round(float(item.get('cprice', '0')), 2)
            miaosha_goods_list.append(tmp)

        return miaosha_goods_list

    async def _is_recent_time(self, timestamp) -> int:
        '''
        判断是否在指定的日期差内
        :param timestamp: 时间戳
        :return: 0: 已过期恢复原价的 1: 待更新区间内的 2: 未来时间的
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # 当前的时间戳

        diff_time = time_1 - time_2
        if diff_time < -259200:     # (为了后台能同步下架)所以设置为 72个小时, 只需要更新过去48小时和对与当前时间的未来2小时的商品信息
        # if diff_time < -172800:     # (原先的时间)48个小时, 只需要跟新过去48小时和对与当前时间的未来14小时的商品信息(20点到第二天10点时间间隔为14小时)
            return 0    # 已过期恢复原价的
        elif diff_time > -172800 and diff_time < 50400:
            return 1    # 表示是昨天跟今天的也就是待更新的
        else:
            return 2    # 未来时间的暂时不用更新

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
예제 #17
0
def run_forever():
    '''
    Endlessly refresh the JuanPi group-buy rows stored in ``dbo.juanpi_pintuan``.

    Every pass selects ``goods_id, schedule, is_delete`` for ``site_id=18``, deletes
    rows whose ``is_delete`` is 1 or whose first schedule entry has an ``end_time`` in
    the past, and re-crawls every other row. Between passes it sleeps 5.5 hours when
    the Shanghai hour is 0, otherwise 5 seconds.

    :return: never returns
    '''
    select_sql = r'select goods_id, schedule, is_delete from dbo.juanpi_pintuan where site_id=18'
    delete_sql = 'delete from dbo.juanpi_pintuan where goods_id=%s'

    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=select_sql))
        except TypeError:
            # The select result could not be turned into a list; treated as "DB unavailable".
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            juanpi_pintuan = JuanPiParse()
            for index, item in enumerate(result, start=1):
                if index % 6 == 0:
                    # Release the old parser before building a new one to keep memory down.
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection does not go stale.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    end_time_str = json.loads(item[1])[0].get('end_time')
                    # int() truncates the float epoch; unlike slicing the string form to
                    # 10 characters it also works for timestamps with fewer digits.
                    pintuan_end_time = int(time.mktime(time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

                    if item[2] == 1 or pintuan_end_time < int(time.time()):
                        tmp_sql_server._delete_table(sql_str=delete_sql, params=(item[0],))
                        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        juanpi_pintuan.get_goods_data(goods_id=item[0])
                        data = juanpi_pintuan.deal_with_data()

                        # An empty dict means nothing usable was parsed; skip the update.
                        if data != {}:
                            data['goods_id'] = item[0]
                            juanpi_pintuan.to_right_and_update_pintuan_data(data=data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
예제 #18
0
    def run_forever(self):
        '''
        Run one refresh pass over the Mia group-buy rows (``dbo.mia_pintuan``, ``site_id=21``).

        For every ``(goods_id, miaosha_time, pid)`` row the stored ``end_time`` is classified
        with ``self.is_recent_time``: 0 deletes the row, 2 leaves it untouched, anything else
        re-crawls it. Afterwards the method sleeps 5.5 hours when the Shanghai hour is 0,
        otherwise 5 seconds.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'select goods_id, miaosha_time, pid from dbo.mia_pintuan where site_id=21'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            # The select result could not be turned into a list; treated as "DB unavailable".
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                time_info = json.loads(item[1])
                # int() truncates the float epoch; unlike slicing the string form to
                # 10 characters it also works for timestamps with fewer digits.
                pintuan_end_time = int(time.mktime(
                    time.strptime(time_info.get('end_time'), '%Y-%m-%d %H:%M:%S')))

                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection does not go stale.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    recent_state = self.is_recent_time(pintuan_end_time)
                    if recent_state == 0:
                        # params must be a one-element tuple, hence the trailing comma.
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0],))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 拼团开始时间为(%s), 删除成功!' % time_info.get('begin_time'))
                    elif recent_state == 2:
                        # Deliberately not a break: the rows are not returned in time order.
                        pass
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        self._refresh_mia_pintuan_goods(
                            goods_id=item[0], pid=item[2], pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_mia_pintuan_goods(self, goods_id, pid, pipeline):
        '''
        Fetch the group-buy list page for ``pid`` and delete or re-crawl ``goods_id``.

        :param goods_id: goods id of the row being refreshed
        :param pid: value used to build the list-page url
        :param pipeline: DB pipeline used for the delete / update
        :return: None
        '''
        tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(pid) + '/0/'
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        if body == '':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            tmp_data = json.loads(body)
        except (TypeError, ValueError):
            # Really skip on a parse failure; falling through with an empty dict would
            # take the "empty data_list" branch below and delete the row.
            print('json.loads转换body时出错, 此处跳过!')
            return

        raw_list = tmp_data.get('data_list', [])
        if raw_list == []:
            print('得到的data_list为[]!')
            print('该商品已被下架限时秒杀活动,此处将其删除')
            pipeline._delete_table(sql_str=self.delete_sql_str, params=(goods_id,))
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        # Sub titles (``intro``) of the list entries whose ``sku`` equals this goods id.
        matching_sub_titles = [
            entry.get('intro', '')
            for entry in raw_list
            if entry.get('sku', '') == goods_id
        ]

        mia_pintuan = MiaPintuanParse()
        if not matching_sub_titles:
            # Not on the list page any more. Mia rows are still updated rather than deleted,
            # because deleting by pid was found to remove goods that were still on sale.
            self._crawl_and_store_mia_pintuan(mia_pintuan, goods_id, pipeline)
        else:
            for sub_title in matching_sub_titles:
                self._crawl_and_store_mia_pintuan(
                    mia_pintuan, goods_id, pipeline, sub_title=sub_title)

    def _crawl_and_store_mia_pintuan(self, parser, goods_id, pipeline, sub_title=None):
        '''
        Crawl one goods page with ``parser`` and write the parsed data through ``pipeline``.

        :param parser: ``MiaPintuanParse`` instance used for crawling and storing
        :param goods_id: goods id to crawl
        :param pipeline: DB pipeline passed to ``update_mia_pintuan_table``
        :param sub_title: when not None, stored as ``goods_data['sub_title']``
        :return: None
        '''
        parser.get_goods_data(goods_id=goods_id)
        goods_data = parser.deal_with_data()
        if goods_data == {}:
            # Nothing usable was parsed; skip this one.
            return

        goods_data['goods_id'] = str(goods_id)
        if sub_title is not None:
            goods_data['sub_title'] = sub_title

        if goods_data['pintuan_time'] == {}:
            # No group-buy time means the group-buy was taken down; begin and end are
            # both set to the current time.
            now_time = get_shanghai_time()
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = (now_time, now_time)
        else:
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                self.get_pintuan_begin_time_and_pintuan_end_time(
                    pintuan_time=goods_data['pintuan_time'])

        parser.update_mia_pintuan_table(data=goods_data, pipeline=pipeline)
        sleep(MIA_SPIKE_SLEEP_TIME)  # slow down between requests
    def run_forever(self):
        '''
        Run one refresh pass over the JuMei flash-sale rows returned by ``jm_select_str_1``.

        ``jm_delete_str_2`` is executed first, then cookies for https://h5.jumei.com/ are
        fetched through a PhantomJS driver and stored in ``self.headers``. For every row the
        stored ``miaosha_end_time`` is classified with ``self.is_recent_time``: 0 deletes the
        row, 2 leaves it untouched, anything else re-crawls it. Afterwards the method sleeps
        5.5 hours when the Shanghai hour is 0, otherwise 5 seconds.

        :return: ``False`` when no cookies could be obtained (returns before the final
            sleep), otherwise ``None``
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
            result = list(tmp_sql_server._select_table(sql_str=jm_select_str_1))
        except TypeError:
            # The select result could not be turned into a list; treated as "DB unavailable".
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            # Fetch the cookies, then drop the driver right away.
            my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
            cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
            del my_phantomjs
            if cookies == '':
                print('!!! 获取cookies失败 !!!')
                return False

            print('获取cookies成功!')
            self.headers.update(Cookie=cookies)

            # enumerate keeps the index advancing for every row, including rows skipped
            # because of a network error.
            for index, item in enumerate(result, start=1):
                time_info = json.loads(item[1])
                # int() truncates the float epoch; unlike slicing the string form to
                # 10 characters it also works for timestamps with fewer digits.
                miaosha_end_time = int(time.mktime(
                    time.strptime(time_info.get('miaosha_end_time'), '%Y-%m-%d %H:%M:%S')))

                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection does not go stale.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    recent_state = self.is_recent_time(miaosha_end_time)
                    if recent_state == 0:
                        # params must be a one-element tuple, hence the trailing comma.
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                        print('过期的goods_id为(%s)' % item[0], ', 限时秒杀结束时间为(%s), 删除成功!' % time_info.get('miaosha_end_time'))

                    elif recent_state == 2:
                        # Deliberately not a break: the rows are not returned in time order.
                        pass

                    else:
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])

                        if this_page_all_goods_list == '网络错误!':
                            print('网络错误!先跳过')

                        elif this_page_all_goods_list == []:
                            print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                            print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                            tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')

                        else:
                            # Goods are not taken down early on this site, so everything
                            # inside the sale window is simply updated.
                            jumeiyoupin_miaosha = JuMeiYouPinParse()
                            tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                            jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                            goods_data = jumeiyoupin_miaosha.deal_with_data()

                            # An empty dict means nothing usable was parsed; skip the update.
                            if goods_data != {}:
                                goods_data['goods_id'] = str(item[0])
                                goods_data['miaosha_time'] = {
                                    'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                                    'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                                }
                                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])

                                jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                sleep(JUMEIYOUPIN_SLEEP_TIME)

                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
예제 #20
0
def run_forever():
    '''
    Endlessly refresh the Zhe800 group-buy rows returned by ``z8_select_str_2``.

    Every pass first executes ``z8_delete_str_1``, then crawls each selected row: rows
    whose page no longer exists or whose second column equals 1 are deleted with
    ``z8_delete_str_2``, all others are updated. Between passes it sleeps 5.5 hours when
    the Shanghai hour is 0, otherwise 5 seconds.

    :return: never returns
    '''
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=z8_delete_str_1)
            result = list(tmp_sql_server._select_table(sql_str=z8_select_str_2))
        except TypeError:
            # The select result could not be turned into a list; treated as "DB unavailable".
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            # enumerate keeps the index advancing for every row, including rows that
            # are deleted because their page is gone.
            for index, item in enumerate(result, start=1):
                # A fresh parser per row, released again at the end of the iteration,
                # to keep memory usage down.
                zhe_800_pintuan = Zhe800PintuanParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection does not go stale.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=item[0])

                    # A string result starting with 'ze' signals that the goods page no
                    # longer exists; such rows are deleted instead of updated.
                    page_gone = False
                    if isinstance(tmp_tmp, str) and tmp_tmp.startswith('ze'):
                        print('@@ 该商品的页面已经不存在!此处将其删除!')
                        try:
                            tmp_sql_server._delete_table(
                                sql_str=z8_delete_str_2, params=(item[0],))
                            page_gone = True
                        except Exception:
                            # Best effort: when the delete fails, fall through to the
                            # normal update path. Ctrl-C is no longer swallowed here.
                            pass

                    if not page_gone:
                        data = zhe_800_pintuan.deal_with_data()
                        # An empty dict means nothing usable was parsed; skip this row.
                        if data != {}:
                            data['goods_id'] = item[0]

                            if item[1] == 1:
                                tmp_sql_server._delete_table(
                                    sql_str=z8_delete_str_2, params=(item[0],))
                                print('该goods_id[{0}]已过期,删除成功!'.format(item[0]))
                            else:
                                print(
                                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                                    % (item[0], index))
                                zhe_800_pintuan.to_right_and_update_data(
                                    data=data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                del zhe_800_pintuan
                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
예제 #21
0
    def run_forever(self):
        '''
        Run one refresh pass over the Pinduoduo flash-sale rows returned by ``pd_select_str_2``.

        The currently listed flash-sale goods are fetched once through
        ``self.get_all_miaosha_goods_list``. For every DB row the stored ``miaosha_end_time``
        is classified with ``self.is_recent_time``: 0 deletes the row, 2 leaves it untouched,
        anything else deletes it when it is no longer listed or re-crawls it when it is.
        Afterwards the method sleeps 5.5 hours when the Shanghai hour is 0, otherwise 3 minutes.

        :return: None
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=pd_select_str_2))
        except TypeError:
            # The select result could not be turned into a list; treated as "DB unavailable".
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            index = 1
            pinduoduo_miaosha = PinduoduoParse()

            all_miaosha_goods_list = self.get_all_miaosha_goods_list()
            # goods_id of every currently listed flash-sale entry
            miaosha_goods_all_goods_id = [
                i.get('goods_id') for i in all_miaosha_goods_list
            ]

            for item in result:
                time_info = json.loads(item[1])
                # int() truncates the float epoch; unlike slicing the string form to
                # 10 characters it also works for timestamps with fewer digits.
                miaosha_end_time = int(time.mktime(
                    time.strptime(time_info.get('miaosha_end_time'), '%Y-%m-%d %H:%M:%S')))

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if not sql_cli.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    # NOTE(review): index is intentionally left unchanged here, as in the
                    # original flow - presumably so the reconnect helper is retried with
                    # the same index on the next row; confirm against
                    # _block_get_new_db_conn.
                    continue

                recent_state = self.is_recent_time(miaosha_end_time)
                if recent_state == 0:
                    # params must be a one-element tuple, hence the trailing comma.
                    sql_cli._delete_table(sql_str=self.delete_sql_str,
                                          params=(item[0],))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!' %
                        time_info.get('miaosha_end_time'))
                    sleep(.3)

                elif recent_state == 2:
                    # Deliberately not a break: the rows are not returned in time order.
                    pass

                else:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))

                    if item[0] not in miaosha_goods_all_goods_id:
                        # No longer part of the listed flash-sale goods: remove the row.
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(item[0],))
                        print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                              item[0])
                        sleep(.3)

                    else:
                        for item_1 in all_miaosha_goods_list:
                            if item_1.get('goods_id', '') != item[0]:
                                continue

                            pinduoduo_miaosha.get_goods_data(goods_id=item[0])
                            goods_data = pinduoduo_miaosha.deal_with_data()

                            # An empty dict means nothing usable was parsed; skip the update.
                            if goods_data != {}:
                                stock_info = item_1.get('stock_info')
                                activity_stock = stock_info.get('activity_stock')

                                goods_data['stock_info'] = stock_info
                                goods_data['goods_id'] = item_1.get('goods_id')
                                if activity_stock > 0:
                                    # Only overwrite prices while activity stock remains.
                                    goods_data['price'] = item_1.get('price')  # pre-sale price
                                    goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
                                goods_data['sub_title'] = item_1.get('sub_title', '')
                                goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                    get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=item_1.get('miaosha_time'))

                                if activity_stock <= 1:
                                    # Activity stock of 1 or less is flagged as sold out.
                                    print('该秒杀商品已售罄...')
                                    goods_data['is_delete'] = 1

                                pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                    data=goods_data, pipeline=sql_cli)
                            sleep(PINDUODUO_SLEEP_TIME)

                index += 1
                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(3 * 60)
        gc.collect()
예제 #22
0
class JMYPUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        '''
        Initialise the crawler base class and this updater's own state.

        :param params: positional arguments forwarded to AsyncCrawler.__init__
        :param kwargs: keyword arguments forwarded to AsyncCrawler.__init__
        '''
        log_dir = MY_SPIDER_LOGS_PATH + '/聚美优品/秒杀实时更新/'
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=log_dir,
            ip_pool_type=IP_POOL_TYPE,
        )
        # Number of goods handed to the event loop per batch (see _update_db)
        self.concurrency = 10
        self.goods_index = 1
        self.delete_sql_str = jm_delete_str_1
        # The db pipeline is created later, in _get_db_old_data
        self.sql_cli = None

    async def _get_pc_headers(self):
        '''
        Build the request headers used against h5.jumei.com.

        :return: the headers produced by async_get_random_headers, updated
            with the jumei-specific entries below
        '''
        jumei_headers = {
            'accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            'Host': 'h5.jumei.com',
            'referer': 'https://h5.jumei.com/',
            'X-Requested-With': 'XMLHttpRequest',
        }
        base_headers = await async_get_random_headers(
            upgrade_insecure_requests=False, )
        base_headers.update(jumei_headers)

        return base_headers

    async def _get_db_old_data(self) -> (list, None):
        '''
        Open a new db pipeline and load the rows that are due for an update.

        Runs the jm_delete_str_2 statement first, waits 5 seconds, then
        selects with jm_select_str_1.

        :return: list of selected rows, or None when a TypeError was raised
            by the db calls
        '''
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.sql_cli._delete_table(sql_str=jm_delete_str_2)
            await async_sleep(5)
            rows = self.sql_cli._select_table(sql_str=jm_select_str_1)
            result = list(rows)
        except TypeError:
            # The original author attributes this to a failed db connection
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_cookies(self) -> str:
        '''
        Fetch the cookies needed for requests by visiting the jumei h5 home page
        through a phantomjs driver.

        :return: the cookie string returned by the driver ('' is logged as a
            failure but still returned to the caller)
        '''
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                  ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        # Drop the only reference to the driver as soon as it is no longer
        # needed; deleting a bound local cannot fail, so no try/except.
        del my_phantomjs

        # Log exactly one outcome: previously the success message was emitted
        # even when the cookie string was empty.
        if cookies == '':
            self.lg.error('!!! 获取cookies失败 !!!')
        else:
            self.lg.info('获取cookies成功!')

        return cookies

    async def _get_new_jumei_obj(self, index):
        '''
        Replace the shared JuMeiYouPinParse instance on every 10th index.

        :param index: running task counter; nothing happens unless it is a
            multiple of 10
        :return: None
        '''
        if index % 10 != 0:
            return

        # One parser object must not be shared for too long, otherwise the
        # driver access misbehaves (per the original author's note).
        try:
            del self.jumeiyoupin_miaosha
        except AttributeError:
            # No parser has been created yet
            pass
        collect()
        self.jumeiyoupin_miaosha = JuMeiYouPinParse()

    async def _get_one_page_all_goods_list(self, *params) -> (list, str):
        '''
        Fetch one page of the jumei flash-sale listing api and return its goods.

        :param params: positional arguments; only params[0] (the page number)
            is read
        :return: the str '网络错误!' when the response could not be decoded
            into a non-empty dict, otherwise a list (possibly empty) of
            {'goods_id', 'type', 'page'} dicts, one per distinct item_id in
            first-seen order
        '''
        page = params[0]
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        json_body = json_2_dict(Requests.get_url_body(
            url=tmp_url, headers=self.headers, ip_pool_type=self.ip_pool_type),
                                default_res={},
                                logger=self.lg)
        if json_body == {}:
            return '网络错误!'

        # Single pass with a seen-set instead of rebuilding the list of known
        # ids for every item (which was quadratic in the page size).
        seen_item_ids = set()
        all_goods_list = []
        for item in json_body.get('item_list', []):
            item_id = item.get('item_id')
            if item_id is None or item_id in seen_item_ids:
                continue
            seen_item_ids.add(item_id)
            all_goods_list.append({
                'goods_id': str(item_id),
                'type': item.get('type', ''),
                'page': page,
            })

        return all_goods_list

    async def _update_one_goods_info(self, item, index):
        '''
        Update, or logically delete, a single flash-sale goods row.

        Depending on what _is_recent_time says about the sale's end time the
        goods is taken off the shelves (expired), or re-crawled and written
        back to the db (still running).

        :param item: db row; item[0] goods_id, item[1] miaosha_time,
            item[2] page, item[3] goods_url
        :param index: running task counter, used to periodically renew the
            parser object and the db connection
        :return: [goods_id, res]; res stays False when nothing was written,
            otherwise it is whatever the shelf/update helper returned
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        page = item[2]
        goods_url = item[3]
        # Only the end time is needed below
        _, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_jumei_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
        )

        if self.sql_cli.is_connect_success:
            is_recent_time_res = await self._is_recent_time(miaosha_end_time)
            if is_recent_time_res == 0:
                # Ended more than 24h ago
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=jm_update_str_4,
                    sql_cli=self.sql_cli,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_end_time)))
                await async_sleep(.3)

            elif is_recent_time_res == 2:
                # Ended within the last 24h: re-check against the current
                # time before taking it off the shelves
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=jm_update_str_4,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_end_time)))

            else:  # 1: the sale has not ended yet, refresh the goods
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                this_page_all_goods_list = await self._get_one_page_all_goods_list(
                    page)
                if isinstance(this_page_all_goods_list, str):
                    self.lg.error('网络错误!先跳过')
                    await async_sleep(1.5)
                    # Same [goods_id, res] shape as every other exit, so the
                    # gathered task results are uniform (this path used to
                    # return the bare bool).
                    return [goods_id, res]

                elif this_page_all_goods_list == []:
                    # The page no longer lists anything: treat the goods as
                    # removed from the flash sale
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=jm_update_str_4,
                        sql_cli=self.sql_cli,
                    )
                    self.lg.error(
                        '#### 该page对应得到的this_page_all_goods_list为空[]!')
                    self.lg.error(
                        '** 该商品已被下架限时秒杀活动, 此处将其逻辑删除, goods_id:{}'.format(
                            goods_id))
                    await async_sleep(.3)

                else:
                    # Goods are not pulled early by the platform (per the
                    # original author), so everything still inside its sale
                    # window is simply re-crawled; no per-page membership
                    # check is done.
                    tmp_r = self.jumeiyoupin_miaosha.get_goods_id_from_url(
                        goods_url)
                    self.jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                    goods_data = self.jumeiyoupin_miaosha.deal_with_data()
                    if goods_data != {}:  # empty data means nothing to write
                        goods_data['goods_id'] = goods_id
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                            goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        res = self.jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=self.sql_cli)

        else:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(JUMEIYOUPIN_SLEEP_TIME)

        return [goods_id, res]

    async def _update_db(self):
        '''
        Endless update loop: load the rows to refresh, process them in
        batches of self.concurrency tasks, sleep, repeat.

        :return: never returns normally
        '''
        while True:
            # New logger on every pass
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                cookies = await self._get_cookies()
                self.headers = await self._get_pc_headers()
                self.headers.update({
                    'Cookie': cookies,
                })
                self.jumeiyoupin_miaosha = JuMeiYouPinParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        # Treated as "everything has been handed out"
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:  # no updates right after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(10)

            try:
                del self.jumeiyoupin_miaosha
            except AttributeError:
                # No parser was created on this pass (result was None)
                pass
            collect()

    async def _is_recent_time(self, timestamp):
        '''
        Classify a timestamp relative to the current Shanghai time.

        :param timestamp: unix timestamp (anything int() accepts)
        :return: 0 when the timestamp lies more than 86400 seconds in the past,
            1 when it lies in the future,
            2 otherwise (at or before now, but no more than 86400 seconds ago)
        '''
        target_ts = int(timestamp)
        now_ts = int(datetime_to_timestamp(get_shanghai_time()))
        seconds_left = target_ts - now_ts

        if seconds_left > 0:
            # Not reached yet: this is the "to be updated" range
            return 1
        if seconds_left < -86400:
            # Older than 24 hours; the 24h margin lets the backend sync the
            # off-shelf state first (per the original author's note)
            return 0
        # Passed, but still inside the 24h waiting window
        return 2

    def __del__(self):
        '''
        Drop the logger, event loop and parser references, then force a gc pass.
        '''
        for attr_name in ('lg', 'loop', 'jumeiyoupin_miaosha'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                # The attribute was never set or has already been removed
                pass
        collect()
예제 #23
0
    def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        更新old goods info
        :param result:
        :return:
        '''
        index = 1
        for item in result:  # 实时更新数据
            miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
            miaosha_begin_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_begin_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_begin_time)

            data = {}
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            zhe_800_miaosha = Zhe800Parse()
            if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_begin_time) == 0:
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                                 params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!' %
                        json.loads(item[1]).get('miaosha_begin_time'))

                elif self.is_recent_time(miaosha_begin_time) == 2:
                    # break       # 跳出循环
                    pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                else:  # 返回1,表示在待更新区间内
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]

                    try:
                        tmp_data = self.zhe_800_spike._get_one_session_id_data(
                            base_session_id=str(item[2]))
                    except Exception as e:
                        print(e)
                        continue

                    if tmp_data.get('data', {}).get('blocks',
                                                    []) == []:  # session_id不存在
                        print('该session_id不存在,此处跳过')
                        pass

                    else:
                        tmp_data = [
                            item_s.get('deal', {}) for item_s in tmp_data.get(
                                'data', {}).get('blocks', [])
                        ]
                        if tmp_data != []:  # 否则说明里面有数据
                            miaosha_goods_list = self.get_miaoshao_goods_info_list(
                                data=tmp_data)
                            # pprint(miaosha_goods_list)

                            # 该session_id中现有的所有zid的list
                            miaosha_goods_all_goods_id = [
                                i.get('zid') for i in miaosha_goods_list
                            ]

                            if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0]))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass

                            else:  # 未下架的
                                for item_1 in miaosha_goods_list:
                                    if item_1.get('zid', '') == item[0]:
                                        zhe_800_miaosha.get_goods_data(
                                            goods_id=item[0])
                                        goods_data = zhe_800_miaosha.deal_with_data(
                                        )

                                        if goods_data == {}:  # 返回的data为空则跳过
                                            pass
                                        else:  # 否则就解析并且插入
                                            goods_data[
                                                'stock_info'] = item_1.get(
                                                    'stock_info')
                                            goods_data['goods_id'] = str(
                                                item_1.get('zid'))
                                            # goods_data['username'] = '******'
                                            if item_1.get('stock_info').get(
                                                    'activity_stock') > 0:
                                                goods_data[
                                                    'price'] = item_1.get(
                                                        'price')
                                                goods_data[
                                                    'taobao_price'] = item_1.get(
                                                        'taobao_price')
                                            else:
                                                pass
                                            goods_data[
                                                'sub_title'] = item_1.get(
                                                    'sub_title')
                                            goods_data[
                                                'miaosha_time'] = item_1.get(
                                                    'miaosha_time')
                                            goods_data[
                                                'miaosha_begin_time'], goods_data[
                                                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                        miaosha_time=item_1.
                                                        get('miaosha_time'))

                                            # print(goods_data['stock_info'])
                                            # print(goods_data['miaosha_time'])
                                            zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                    else:
                                        pass

                        else:  # 说明这个sessionid没有数据, 就删除对应这个sessionid的限时秒杀商品
                            print('该sessionid没有相关key为jsons的数据')
                            # return {}
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0]))
                            print(
                                '过期的goods_id为(%s)' % item[0],
                                ', 限时秒杀开始时间为(%s), 删除成功!' %
                                json.loads(item[1]).get('miaosha_begin_time'))
                            pass

            else:  # 表示返回的data值为空值
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            # try:
            #     del tmall
            # except:
            #     pass
            # sleep(.8)
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        gc.collect()

        return
async def run_forever():
    '''
    Run one real-time refresh pass over the stored taobao "天天特价" goods.

    Rows whose tejia end time is already in the past are taken off the
    shelves; all other rows are re-crawled and written back. The in-session
    "was it pulled early" check is intentionally not performed: according to
    the original author, goods already listed are not pulled early by the
    platform.

    :return: None when the initial db calls raise TypeError, True otherwise
    '''
    # The logger is created per call (not as a global) so that each day gets
    # its own log file.
    logger_name = get_uuid1()
    log_file_name = MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' + str(
        get_shanghai_time())[0:10] + '.txt'
    lg = set_logger(logger_name=logger_name,
                    log_file_name=log_file_name,
                    console_log_level=INFO,
                    file_log_level=ERROR)

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        # Expired rows have to be cleaned here; this is the only place that
        # does it (per the original author's note).
        tmp_sql_server._delete_table(sql_str=tb_delete_str_2, params=None)
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None

    await _print_db_old_data(
        result=result,
        logger=lg,
    )

    index = 1
    for row in result:
        goods_id = row[0]
        tejia_end_time = row[2]

        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if not tmp_sql_server.is_connect_success:
            lg.error('数据库连接失败,数据库可能关闭或者维护中')
            collect()
            continue

        if tejia_end_time < get_shanghai_time():
            # Expired: take it off the shelves directly
            lg.info('@@ 过期下架[goods_id: {}]'.format(goods_id))
            _handle_goods_shelves_in_auto_goods_table(
                goods_id=goods_id,
                logger=lg,
                update_sql_str=tb_update_str_5,
            )
            index += 1
            collect()
            continue

        # Still running: refresh the goods info
        lg.info(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' %
            (goods_id, str(index)))
        taobao = TaoBaoLoginAndParse(
            logger=lg,
            is_real_times_update_call=is_real_times_update_call)
        taobao.get_goods_data(goods_id)
        goods_data = taobao.deal_with_data(goods_id=goods_id)
        if goods_data == {}:
            await async_sleep(4)
        else:
            goods_data['goods_id'] = goods_id
            if goods_data.get('is_delete', 0) == 1:
                lg.info('@该商品已下架...')

            await taobao.update_taobao_tiantiantejia_table(
                data=goods_data, pipeline=tmp_sql_server)

        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        index += 1
        collect()
        collect()

    lg.info('全部数据更新完毕'.center(100, '#'))
    # Longer pause during the first hour after midnight
    pause_seconds = 5 * 60 if get_shanghai_time().hour == 0 else 60 * 1
    await async_sleep(pause_seconds)
    collect()

    return True
예제 #25
0
    def run_forever(self):
        '''
        Run one real-time refresh pass over the stored mogujie flash-sale goods.

        Per row, depending on self.is_recent_time(end time):
        0 -> the row is deleted, 2 -> nothing is done, anything else -> the
        row's event is re-fetched and the goods is either logically deleted
        (no longer listed) or re-crawled and written back.
        Rows are read as: row[0] goods_id, row[1] json string with
        'miaosha_begin_time'/'miaosha_end_time', row[2] event time,
        row[3] spider url.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_3))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            for item in result:
                goods_id = item[0]
                db_miaosha_time = json.loads(item[1])
                # Whole-second unix timestamp of the sale end
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(db_miaosha_time.get('miaosha_end_time'),
                                      '%Y-%m-%d %H:%M:%S')))

                # Created per row rather than once outside the loop, which
                # the original author found to keep memory usage down
                mogujie_miaosha = MoGuJieMiaoShaParse()
                if index % 50 == 0:  # reconnect every 50 rows to avoid stale long-lived connections
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not tmp_sql_server.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                else:
                    # Evaluated once so every branch sees the same answer
                    recent_state = self.is_recent_time(miaosha_end_time)
                    if recent_state == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(goods_id, ))
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            db_miaosha_time.get('miaosha_begin_time'))

                    elif recent_state == 2:
                        # Deliberately skip instead of breaking out of the
                        # loop: the rows are not guaranteed to be ordered
                        pass

                    else:  # inside the update window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))

                        item_list = self.get_item_list(event_time=str(item[2]))
                        if item_list == '':
                            # Possibly a network problem; skip for now
                            pass

                        elif item_list == [] or goods_id not in [
                                item_1.get('iid', '') for item_1 in item_list
                        ]:
                            # The event is empty or no longer lists this
                            # goods: logically delete it
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            tmp_sql_server._update_table(
                                sql_str=mg_update_str_1, params=(goods_id, ))
                            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')

                        else:  # still listed
                            for item_2 in item_list:
                                if item_2.get('iid', '') != goods_id:
                                    continue

                                spider_url = item[3]
                                mogujie_miaosha.get_goods_data(
                                    goods_id=spider_url)
                                goods_data = mogujie_miaosha.deal_with_data()
                                if goods_data == {}:  # nothing parsed, skip
                                    continue

                                goods_data['goods_id'] = str(goods_id)

                                # price is set to the highest normal price
                                try:
                                    normal_prices = [
                                        round(
                                            float(
                                                item_4.get('normal_price',
                                                           '')), 2)
                                        for item_4 in goods_data[
                                            'price_info_list']
                                    ]
                                    goods_data['price'] = round(
                                        Decimal(max(normal_prices)), 2)
                                except Exception:
                                    print('设置price为原价时出错!请检查')
                                    continue

                                goods_data['miaosha_time'] = {
                                    'miaosha_begin_time':
                                    timestamp_to_regulartime(
                                        int(item_2.get('startTime', 0))),
                                    'miaosha_end_time':
                                    timestamp_to_regulartime(
                                        int(item_2.get('endTime', 0))),
                                }
                                goods_data['miaosha_begin_time'], goods_data[
                                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=goods_data[
                                            'miaosha_time'])

                                mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                    data=goods_data,
                                    pipeline=tmp_sql_server)
                                sleep(MOGUJIE_SLEEP_TIME)  # throttle

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates right after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
예제 #26
0
    def run_forever(self):
        '''
        Run one refresh pass over the Mogujie group-buy (pintuan) rows stored in the database.

        Steps:
            1. Run the cleanup statement ``mg_delete_str_2`` and select the candidate rows
               with ``mg_select_str_2``.
            2. Refresh each row through ``_refresh_one_pintuan_item`` while periodically
               restarting PhantomJS (every 8 rows) and reconnecting to the database
               (every 50 rows).
            3. Sleep before returning: 5.5 hours when the Shanghai hour is 0, otherwise 5 seconds.

        The sleep in step 3 is executed even when the database read failed, so a caller
        that invokes this method in a loop does not spin against an unavailable database.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_2))
        except TypeError:
            # A failed connection surfaces here as a TypeError (see the message below).
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            self.my_phantomjs = MyPhantomjs(
                executable_path=PHANTOMJS_DRIVER_PATH)
            for index, item in enumerate(result, start=1):
                if index % 8 == 0:
                    # Recycle the PhantomJS driver every 8 rows.
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)

                if index % 50 == 0:
                    # Reconnect every 50 rows to avoid errors from one long-lived,
                    # unresponsive connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    self._refresh_one_pintuan_item(
                        item=item,
                        index=index,
                        tmp_sql_server=tmp_sql_server)
                else:
                    print('数据库连接失败,此处跳过!')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        # Back off before the caller's next pass; no updates are done during hour 0.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_one_pintuan_item(self, item, index, tmp_sql_server):
        '''
        Refresh (or delete) a single group-buy row.

        :param item: database row; item[0] is the goods_id, item[1] a JSON string holding
                     'begin_time' / 'end_time', item[2] the fcid and item[3] the page number
                     used to build the listing url
        :param index: 1-based position of the row in the current pass (only used for logging)
        :param tmp_sql_server: connected pipeline used for deletes and updates
        :return: None
        '''
        goods_id = item[0]
        stored_time = json.loads(item[1])
        pintuan_end_time = int(
            time.mktime(
                time.strptime(stored_time.get('end_time'),
                              '%Y-%m-%d %H:%M:%S')))

        # Evaluate once so both checks below see the same answer.
        recent_state = self.is_recent_time(pintuan_end_time)
        if recent_state == 0:
            # NOTE(review): a bare goods_id (not a 1-tuple) is passed as params, exactly
            # as before -- confirm _delete_table expects this.
            tmp_sql_server._delete_table(
                sql_str=self.delete_sql_str, params=goods_id)
            print(
                '过期的goods_id为(%s)' % goods_id,
                ', 拼团开始时间为(%s), 删除成功!' % stored_time.get('begin_time'))
            return
        if recent_state == 2:
            # Skip only this row: rows are not returned in time order, so the
            # caller's loop must keep going.
            return

        # Any other state (1) means the row is inside the update window.
        print(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
            % (goods_id, index))

        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
            item[3], item[2])

        # Plain requests cannot fetch this page (certificate checks), so PhantomJS is used.
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        if body == '':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
            tmp_data = json.loads(body)
        except (IndexError, ValueError, TypeError):
            # An unreadable page is not proof that the goods were taken off the shelf,
            # so the row is left alone instead of being deleted; expired rows are
            # still removed by the time check above on a later pass.
            print('json.loads转换body时出错, 请检查')
            return

        docs = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
        if not docs:
            print('得到的docs为[]!')
            print('该商品已被下架限时秒杀活动,此处将其删除')
            tmp_sql_server._delete_table(
                sql_str=self.delete_sql_str, params=goods_id)
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        begin_time_timestamp = int(time.time())  # group-buy begin timestamp
        item_list = [{
            'goods_id': doc.get('tradeItemId', ''),
            'pintuan_time': {
                'begin_time':
                timestamp_to_regulartime(timestamp=begin_time_timestamp),
                'end_time':
                timestamp_to_regulartime(
                    self.get_pintuan_end_time(begin_time_timestamp,
                                              doc.get('leftTimeOrg', ''))),
            },
            'all_sell_count': str(doc.get('salesVolume', 0)),
        } for doc in docs]
        matching_entries = [
            entry for entry in item_list
            if entry.get('goods_id', '') == goods_id
        ]

        mogujie_pintuan = MoGuJieParse()
        if not matching_entries:
            # Missing from the listing: such goods are still on sale, so only the
            # goods info is refreshed and the on/off-shelf times are left untouched.
            mogujie_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = mogujie_pintuan.deal_with_data()
            if not goods_data:
                return

            print('+++ 内部下架,其实还在售卖的商品更新')
            goods_data['goods_id'] = goods_id
            goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                goods_data['price_info_list'])
            mogujie_pintuan.update_mogujie_pintuan_table_2(
                data=goods_data, pipeline=tmp_sql_server)
            sleep(MOGUJIE_SLEEP_TIME)  # throttle requests
            return

        # Still listed: refresh goods info together with the group-buy times and sales count.
        for entry in matching_entries:
            mogujie_pintuan.get_goods_data(goods_id=goods_id)
            goods_data = mogujie_pintuan.deal_with_data()
            if not goods_data:
                continue

            goods_data['goods_id'] = goods_id
            goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                goods_data['price_info_list'])
            goods_data['pintuan_time'] = entry.get('pintuan_time', {})
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])
            goods_data['all_sell_count'] = entry.get('all_sell_count', '')

            mogujie_pintuan.update_mogujie_pintuan_table(
                data=goods_data, pipeline=tmp_sql_server)
            sleep(MOGUJIE_SLEEP_TIME)  # throttle requests