Exemplo n.º 1
0
 async def _get_new_jumei_obj(self, index):
     """Replace ``self.jumeiyoupin_miaosha`` with a fresh parser on every 10th index.

     Original author's note: a single parser object must not be shared
     indefinitely, otherwise access through its driver misbehaves.

     :param index: running task counter; nothing happens unless it is a
         multiple of 10
     :return: None
     """
     if index % 10 != 0:
         return

     try:
         del self.jumeiyoupin_miaosha
     except AttributeError:
         # No parser is attached yet, so there is nothing to release.
         pass
     collect()
     self.jumeiyoupin_miaosha = JuMeiYouPinParse()
Exemplo n.º 2
0
    async def _update_db(self):
        """Run the update loop forever: load rows, update them in batches, sleep.

        Each round creates a new logger, reloads the rows to update, refreshes
        cookies and headers, and schedules one ``_update_one_goods_info`` task
        per row in slices of ``self.concurrency``. After a round it sleeps for
        5.5 hours when the Shanghai-time hour is 0, otherwise for 10 seconds,
        and then releases the parser object.

        :return: never returns normally (infinite loop)
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                cookies = await self._get_cookies()
                self.headers = await self._get_pc_headers()
                self.headers.update({
                    'Cookie': cookies,
                })
                self.jumeiyoupin_miaosha = JuMeiYouPinParse()
                index = 1
                while True:
                    try:
                        slice_params_list = next(tasks_params_list)
                    except AssertionError:
                        # The slicer signals exhaustion with AssertionError:
                        # every row has been handed out, leave normally.
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # No updates after midnight: pause for 5.5 hours.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(10)

            try:
                del self.jumeiyoupin_miaosha
            except AttributeError:
                # The parser was never created in this round (no DB result).
                pass
            collect()
Exemplo n.º 3
0
    def deal_with_data(self, *params):
        """Crawl and store flash-sale goods that are not yet in the database.

        For every entry of ``params[0]`` whose ``goods_id`` is not already
        stored, the goods page is parsed and, unless the parse result is empty
        or flagged ``is_delete == 1``, a row is inserted into the flash-sale
        table.

        :param params: positional arguments; only ``params[0]`` is read. It is
            iterated as dicts from which ``goods_id``, ``type`` and ``page``
            are taken.
        :return: None
        """
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if not my_pipeline.is_connect_success:
            print('数据库连接失败,此处跳过!')
            gc.collect()
            return

        sql_str = r'select goods_id, miaosha_time, page, goods_url from dbo.jumeiyoupin_xianshimiaosha where site_id=26'
        # A set keeps the per-item "already stored?" check O(1).
        db_goods_ids = {
            row[0] for row in my_pipeline._select_table(sql_str=sql_str)
        }

        for item in item_list:
            goods_id = item.get('goods_id', '')
            if goods_id in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            goods_type = item.get('type', '')
            tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(goods_id, goods_type)

            # A fresh parser per goods; it is dropped again right after use.
            jumei = JuMeiYouPinParse()
            jumei.get_goods_data(goods_id=[goods_id, goods_type])
            goods_data = jumei.deal_with_data()

            if goods_data == {}:
                # Nothing could be parsed for this goods: skip it.
                pass

            elif goods_data.get('is_delete', 0) == 1:
                print('------>>>| 该商品库存为0,已被抢光!')

            else:
                # Complete the parsed record and insert it.
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['miaosha_time'] = {
                    'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                    'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                }
                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                    self.get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['page'] = item.get('page')

                jumei.insert_into_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                # Throttle between inserts.
                sleep(JUMEIYOUPIN_SLEEP_TIME)

            del jumei

        gc.collect()
    def run_forever(self):
        """Run one pass of the real-time flash-sale update, then sleep.

        Loads every stored flash-sale row, obtains cookies through a PhantomJS
        session and then handles each row according to ``self.is_recent_time``:

        * ``0`` - the row is deleted as expired;
        * ``2`` - the row is left untouched;
        * anything else - the row's page is re-fetched; the row is deleted
          when the page lists no goods, otherwise the goods are re-parsed and
          the row is updated.

        Rows are read positionally: ``item[0]`` is the goods id, ``item[1]`` a
        JSON string holding ``miaosha_end_time``, ``item[2]`` is passed to
        ``get_one_page_all_goods_list`` and ``item[3]`` to
        ``get_goods_id_from_url``.

        After the pass the method sleeps 5.5 hours when the Shanghai-time hour
        is 0, otherwise 5 seconds.

        :return: ``False`` when no cookies could be obtained, otherwise ``None``
        """
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.
                          select_jumeiyoupin_xianshimiaosha_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            jumeiyoupin_spike = JuMeiYouPinSpike()
            # Fetch the cookies, then drop the driver reference straight away.
            my_phantomjs = MyPhantomjs()
            cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
                url='https://h5.jumei.com/')
            del my_phantomjs
            if cookies == '':
                print('!!! 获取cookies失败 !!!')
                return False

            print('获取cookies成功!')
            self.headers.update(Cookie=cookies)

            # enumerate() advances the counter on every row, so the
            # "reconnect every 50 rows" cadence cannot stall on skipped rows.
            for index, item in enumerate(result, start=1):
                end_time_str = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

                if index % 50 == 0:
                    # Reconnect every 50 rows so a long-lived connection does
                    # not go unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not tmp_sql_server.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    gc.collect()
                    continue

                # Evaluate once so every branch sees the same classification.
                recent_state = self.is_recent_time(miaosha_end_time)

                if recent_state == 0:
                    tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                        goods_id=item[0])
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!' % end_time_str)

                elif recent_state == 2:
                    # Skip rather than stop: the rows returned by the database
                    # are not guaranteed to be ordered.
                    pass

                else:  # 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))

                    this_page_all_goods_list = self.get_one_page_all_goods_list(
                        item[2])

                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')

                    elif this_page_all_goods_list == []:
                        print(
                            '#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                            item[0])
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')

                    else:
                        # Goods are not pulled from the activity early, so
                        # everything still inside its sale period is updated.
                        # The parser is created only here, where it is needed.
                        jumeiyoupin_miaosha = JuMeiYouPinParse()
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(
                            item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()

                        if goods_data != {}:  # an empty result is skipped
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time':
                                goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time':
                                goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data[
                                'miaosha_end_time'] = jumeiyoupin_spike.get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])

                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                            sleep(JUMEIYOUPIN_SLEEP_TIME)

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Exemplo n.º 5
0
class JMYPUpdater(AsyncCrawler):
    """Asynchronous real-time updater for JuMeiYouPin flash-sale goods rows.

    ``_update_db`` is the driver loop: it loads the rows to update, fetches
    cookies through a PhantomJS driver, and updates the rows concurrently in
    slices of ``self.concurrency`` via ``_update_one_goods_info``.
    """

    def __init__(self, *params, **kwargs):
        """Initialise the base crawler with this updater's log/proxy settings."""
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/聚美优品/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.sql_cli = None
        self.delete_sql_str = jm_delete_str_1
        self.goods_index = 1
        self.concurrency = 10  # number of goods updated per concurrent slice

    async def _get_pc_headers(self):
        """Build random request headers extended with the h5.jumei.com fields.

        :return: the headers mapping
        """
        headers = await async_get_random_headers(
            upgrade_insecure_requests=False, )
        headers.update({
            'accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            'Host': 'h5.jumei.com',
            'referer': 'https://h5.jumei.com/',
            'X-Requested-With': 'XMLHttpRequest',
        })

        return headers

    async def _get_db_old_data(self) -> (list, None):
        """Open a new DB client, run the cleanup statement and load the rows.

        :return: the selected rows as a list, or ``None`` when the query
            raised ``TypeError`` (treated here as a failed DB connection)
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.sql_cli._delete_table(sql_str=jm_delete_str_2)
            await async_sleep(5)
            result = list(self.sql_cli._select_table(sql_str=jm_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_cookies(self) -> str:
        """Fetch the cookies needed for requests through a PhantomJS driver.

        :return: the cookie string; an empty string means the fetch failed
            (an error is logged in that case)
        """
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH,
                                  ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        # Drop the driver reference as soon as the cookies are in hand.
        del my_phantomjs

        if cookies == '':
            self.lg.error('!!! 获取cookies失败 !!!')
        else:
            # Only report success when cookies were actually obtained.
            self.lg.info('获取cookies成功!')

        return cookies

    async def _get_new_jumei_obj(self, index):
        """Replace ``self.jumeiyoupin_miaosha`` with a fresh parser on every 10th index.

        Original author's note: a single parser object must not be shared
        indefinitely, otherwise access through its driver misbehaves.

        :param index: running task counter; nothing happens unless it is a
            multiple of 10
        :return: None
        """
        if index % 10 != 0:
            return

        try:
            del self.jumeiyoupin_miaosha
        except AttributeError:
            # No parser is attached yet, so there is nothing to release.
            pass
        collect()
        self.jumeiyoupin_miaosha = JuMeiYouPinParse()

    async def _get_one_page_all_goods_list(self, *params) -> (list, str):
        """Fetch all goods listed on one page of the flash-sale activity.

        :param params: positional arguments; only ``params[0]`` (the page
            number) is read
        :return: the string ``'网络错误!'`` when the response body decodes to an
            empty dict, otherwise a list of dicts with the keys ``goods_id``,
            ``type`` and ``page`` (possibly empty)
        """
        page = params[0]
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        json_body = json_2_dict(Requests.get_url_body(
            url=tmp_url, headers=self.headers, ip_pool_type=self.ip_pool_type),
                                default_res={},
                                logger=self.lg)
        if json_body == {}:
            return '网络错误!'

        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            return []

        all_goods_list = []
        # Seen-set de-duplication: first occurrence of each item_id wins.
        seen_item_ids = set()
        for item in this_page_item_list:
            item_id = item.get('item_id', '')
            if item_id in seen_item_ids:
                continue
            seen_item_ids.add(item_id)

            if item.get('item_id') is None:
                # Entries without a usable item_id are not reported.
                continue

            all_goods_list.append({
                'goods_id': str(item_id),
                'type': item.get('type', ''),
                'page': page,
            })

        return all_goods_list

    def _mark_goods_off_shelf(self, goods_id):
        """Logically delete one goods row through the shared off-shelf handler.

        :param goods_id: id of the goods row to mark
        :return: whatever ``_handle_goods_shelves_in_auto_goods_table`` returns
        """
        return _handle_goods_shelves_in_auto_goods_table(
            goods_id=goods_id,
            logger=self.lg,
            update_sql_str=jm_update_str_4,
            sql_cli=self.sql_cli,
        )

    async def _update_one_goods_info(self, item, index):
        """Update (or logically delete) a single flash-sale goods row.

        :param item: DB row read positionally - ``item[0]`` goods id,
            ``item[1]`` flash-sale time, ``item[2]`` page, ``item[3]`` goods url
        :param index: running task counter, used for periodic parser and
            DB-connection renewal and for logging
        :return: ``[goods_id, res]`` where ``res`` is the result of the delete
            or update call, or ``False`` when nothing was written
        """
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        page = item[2]
        goods_url = item[3]
        _, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_jumei_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
        )

        if not self.sql_cli.is_connect_success:
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        else:
            is_recent_time_res = await self._is_recent_time(miaosha_end_time)

            if is_recent_time_res == 0:
                res = self._mark_goods_off_shelf(goods_id)
                self.lg.info('过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_end_time)))
                await async_sleep(.3)

            elif is_recent_time_res == 2:
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = self._mark_goods_off_shelf(goods_id)
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀结束时间为({}), 逻辑删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_end_time)))

            else:  # 1: inside the update window
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                this_page_all_goods_list = await self._get_one_page_all_goods_list(
                    page)

                if isinstance(this_page_all_goods_list, str):
                    self.lg.error('网络错误!先跳过')
                    await async_sleep(1.5)
                    # Same [goods_id, res] shape as the normal exit below.
                    return [goods_id, res]

                elif this_page_all_goods_list == []:
                    res = self._mark_goods_off_shelf(goods_id)
                    self.lg.error(
                        '#### 该page对应得到的this_page_all_goods_list为空[]!')
                    self.lg.error(
                        '** 该商品已被下架限时秒杀活动, 此处将其逻辑删除, goods_id:{}'.format(
                            goods_id))
                    await async_sleep(.3)

                else:
                    # Goods are not pulled from the activity early, so
                    # everything still inside its sale period is updated.
                    tmp_r = self.jumeiyoupin_miaosha.get_goods_id_from_url(
                        goods_url)
                    self.jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                    goods_data = self.jumeiyoupin_miaosha.deal_with_data()
                    if goods_data != {}:  # an empty result is skipped
                        goods_data['goods_id'] = goods_id
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                            goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        res = self.jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=self.sql_cli)

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(JUMEIYOUPIN_SLEEP_TIME)

        return [goods_id, res]

    async def _update_db(self):
        """Run the update loop forever: load rows, update them in batches, sleep.

        Each round creates a new logger, reloads the rows to update, refreshes
        cookies and headers, and schedules one ``_update_one_goods_info`` task
        per row in slices of ``self.concurrency``. After a round it sleeps for
        5.5 hours when the Shanghai-time hour is 0, otherwise for 10 seconds,
        and then releases the parser object.

        :return: never returns normally (infinite loop)
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                cookies = await self._get_cookies()
                self.headers = await self._get_pc_headers()
                self.headers.update({
                    'Cookie': cookies,
                })
                self.jumeiyoupin_miaosha = JuMeiYouPinParse()
                index = 1
                while True:
                    try:
                        slice_params_list = next(tasks_params_list)
                    except AssertionError:
                        # The slicer signals exhaustion with AssertionError:
                        # every row has been handed out, leave normally.
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # No updates after midnight: pause for 5.5 hours.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(10)

            try:
                del self.jumeiyoupin_miaosha
            except AttributeError:
                # The parser was never created in this round (no DB result).
                pass
            collect()

    async def _is_recent_time(self, timestamp):
        """Classify a flash-sale end time relative to the current Shanghai time.

        :param timestamp: flash-sale end time as a Unix timestamp
        :return: ``0`` when the sale ended more than 24 hours ago,
            ``1`` when the end time is still in the future (row should be
            updated), ``2`` when the sale ended within the last 24 hours
        """
        time_1 = int(timestamp)
        time_2 = int(datetime_to_timestamp(get_shanghai_time()))

        diff_time = time_1 - time_2
        if diff_time < -86400:
            # The 24-hour margin exists so the backend has time to sync the
            # off-shelf state.
            return 0
        elif diff_time > 0:
            return 1
        else:
            # Ended, but still inside the 24-hour margin.
            return 2

    def __del__(self):
        """Drop the logger, loop and parser references, then collect garbage."""
        for attr_name in ('lg', 'loop', 'jumeiyoupin_miaosha'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                # The attribute was never set or is already gone.
                pass
        collect()