示例#1
0
def get_one_mia_data(**kwargs):
    '''
    Crawl the data of a single Mia goods page.

    :param kwargs:
        username -- stored with the result (defaults to DEFAULT_USERNAME)
        wait_to_deal_with_url -- goods page url to crawl (defaults to '')
        my_lg -- logger used for info/error messages
    :return: {'goods_id': ''} when no goods_id could be extracted from the url,
        {'goods_id': ..., 'msg': ...} when the crawl returned nothing,
        otherwise the value returned by add_base_info_2_processed_data
    '''
    username = kwargs.get('username', DEFAULT_USERNAME)
    source_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    parser = MiaParse()
    goods_id = parser.get_goods_id_from_url(source_url)
    if goods_id == '':
        # Failure 1: the url did not yield a goods_id.
        my_lg.info('获取到的goods_id为空!')
        del parser  # drop the parser before forcing a collection
        gc.collect()
        return {'goods_id': ''}

    fetch_result = parser.get_goods_data(goods_id=goods_id)
    data = parser.deal_with_data()  # expected to be a dict on success
    if data == {} or fetch_result == {}:
        # Failure 2: the crawl itself failed.
        my_lg.error('获取到的data为空!出错地址: {0}'.format(source_url))
        del parser
        gc.collect()
        return {'goods_id': goods_id, 'msg': 'data为空!'}

    # From here on the canonical goods url is used instead of the input url.
    canonical_url = 'https://www.mia.com/item-{}.html'.format(goods_id)
    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=canonical_url,
        username=username,
        goods_id=goods_id)
    del parser

    return wait_to_save_data
示例#2
0
    def deal_with_data(self, *param):
        '''
        Crawl and store the flash-sale (miaosha) goods of one promotion.

        Goods whose item_id is already returned by ``mia_select_str_4`` are
        skipped; every other goods is crawled, enriched with the flash-sale
        price/time information and inserted through
        ``insert_into_mia_xianshimiaosha_table``.

        :param param: positional values, in this order:
            param[0] -- promotion id, stored as ``pid``
            param[1] -- begin time as a '%Y/%m/%d %H:%M:%S' string
            param[2] -- end time as a '%Y/%m/%d %H:%M:%S' string
            param[3] -- list of item dicts (the keys 'item_id',
                        'active_price' and 'short_info' are read)
        :return: None
        '''
        pid = param[0]
        # Convert the time strings into integer timestamps.
        begin_time = int(time.mktime(time.strptime(param[1], '%Y/%m/%d %H:%M:%S')))
        end_time = int(time.mktime(time.strptime(param[2], '%Y/%m/%d %H:%M:%S')))
        item_list = param[3]

        mia = MiaParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                row[0] for row in my_pipeline._select_table(sql_str=mia_select_str_4)
            ]

            for item in item_list:
                if item.get('item_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                goods_id = str(item.get('item_id', ''))
                mia.get_goods_data(goods_id=goods_id)
                goods_data = mia.deal_with_data()
                if goods_data == {}:  # nothing crawled -> skip this goods
                    continue

                # Choose the PC url that matches the site the goods was crawled
                # from. The previous check compared the list returned by
                # ``findall`` with '' and was therefore always true, which
                # rewrote every goods to the miyabaobei.hk domain.
                if re.search(r'://m\.miyabaobei\.hk/', goods_data['goods_url']):
                    goods_url = 'https://www.miyabaobei.hk/item-' + goods_id + '.html'
                else:
                    goods_url = 'https://www.mia.com/item-' + goods_id + '.html'

                goods_data['goods_url'] = goods_url
                goods_data['goods_id'] = goods_id
                goods_data['price'] = item.get('active_price')
                goods_data['taobao_price'] = item.get('active_price')  # lowest flash-sale price
                goods_data['sub_title'] = item.get('short_info', '')
                goods_data['miaosha_time'] = {
                    'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                    'miaosha_end_time': timestamp_to_regulartime(end_time),
                }
                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                    get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['pid'] = str(pid)

                mia.insert_into_mia_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                sleep(MIA_SPIKE_SLEEP_TIME)  # throttle the crawl
        else:
            print('数据库连接失败,此处跳过!')

        # Drop the parser before forcing a collection.
        del mia
        gc.collect()
def run_forever():
    """Endlessly refresh the Mia goods returned by ``mia_select_str_5``.

    Each round: build a new logger, load the rows to update, re-crawl every
    goods and write the result back, then sleep (5.5 hours when the Shanghai
    hour is 0, otherwise 5 minutes) before the next round.

    :return: never returns
    """
    while True:
        # The logger must be created inside the loop (not once as a global),
        # otherwise every round would keep logging into the same file.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        # Load the rows that have to be refreshed.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            mia = MiaParse()
            # ``enumerate`` keeps the index advancing for every row. The old
            # manual counter was skipped by the ``continue`` on the delisted
            # path, so the index stalled and the periodic recycling below was
            # triggered again and again for consecutive delisted goods.
            for index, item in enumerate(result, start=1):
                goods_id = item[1]
                if index % 5 == 0:
                    # Recycle the parser object every 5 goods.
                    del mia
                    mia = MiaParse()
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item,
                                                          logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Delisted goods are handled separately.
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='mia',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # The crawl returned nothing: back off for a while.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    # The database connection is not usable.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)

        # Drop this round's logger before the next round creates a new one.
        del my_lg
        collect()
class MIUpdater(AsyncCrawler):
    """Periodic updater for the Mia flash-sale (miaosha) goods stored in the DB.

    ``_update_db`` is the main loop: it loads the stored rows, updates them in
    batches of ``self.concurrency`` concurrently scheduled tasks and then
    sleeps before the next round.

    NOTE(review): ``self.loop``, ``self.headers``, ``self.ip_pool_type`` and
    ``self._get_new_logger`` are not defined in this class; presumably they
    are provided by ``AsyncCrawler`` -- confirm.
    """

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/蜜芽/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.delete_sql_str = mia_delete_str_3
        self.concurrency = 8  # number of goods handled per batch of tasks
        self.tmp_sql_server = None
        self.goods_index = 1

    async def _get_pc_headers(self) -> dict:
        """Return random request headers with ``Host`` set to m.mia.com."""
        headers = await async_get_random_headers(
            upgrade_insecure_requests=False, )
        headers.update({
            'Host': 'm.mia.com',
        })

        return headers

    async def _get_db_old_data(self):
        """Open a new DB connection, clean up and load the rows to update.

        Runs ``mia_delete_str_4``, waits 5 seconds, then selects with
        ``mia_select_str_3``.

        :return: list of rows, or None when a TypeError was raised (logged as
            a failed database connection)
        """
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
            await async_sleep(5)
            result = list(
                self.tmp_sql_server._select_table(sql_str=mia_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_end_time(self, miaosha_time):
        """Return the flash-sale end time as an integer timestamp.

        :param miaosha_time: JSON string whose 'miaosha_end_time' value is
            formatted as '%Y-%m-%d %H:%M:%S'
        """
        miaosha_end_time = json.loads(miaosha_time).get('miaosha_end_time')
        # ``int`` truncates the float directly; the previous slicing of the
        # string form only worked for timestamps with exactly 10 digits.
        miaosha_end_time = int(
            time.mktime(
                time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))

        return miaosha_end_time

    async def _get_new_mia_obj(self, index):
        """Replace ``self.mia_miaosha`` with a new parser every 10th index."""
        if index % 10 == 0:  # one object cannot be shared forever, driver access would fail
            try:
                del self.mia_miaosha
            except AttributeError:
                # No parser has been created yet.
                pass
            collect()
            self.mia_miaosha = MiaParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single goods.

        :param item: DB row; item[0] is the goods_id, item[1] the miaosha_time
            value and item[2] the promotion id (pid)
        :param index: 1-based position of the goods in the current round
        :return: (goods_id, res); res stays False unless one of the
            delist/update calls returned something else
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        pid = item[2]
        miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
            miaosha_time=miaosha_time,
            logger=self.lg,
        )
        await self._get_new_mia_obj(index)
        self.tmp_sql_server = await _get_new_db_conn(
            db_obj=self.tmp_sql_server,
            index=index,
            logger=self.lg,
            remainder=30,
        )

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_end_time)
            if is_recent_time == 0:
                # Ended more than 24 hours ago: take the goods off the shelf.
                res = _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=self.lg,
                    update_sql_str=mia_update_str_6,
                    sql_cli=self.tmp_sql_server,
                )
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                    goods_id, timestamp_to_regulartime(miaosha_begin_time)))
                await async_sleep(.5)
                self.goods_index = index + 1

                return goods_id, res

            elif is_recent_time == 2:
                # Ended within the last 24 hours: delist once the end time
                # is strictly in the past.
                if datetime_to_timestamp(
                        get_shanghai_time()) > miaosha_end_time:
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info(
                        '过期的goods_id为({}), 限时秒杀开始时间为({}), 删除成功!'.format(
                            goods_id,
                            timestamp_to_regulartime(miaosha_begin_time)))

                self.goods_index = index + 1

                return goods_id, res

            else:  # 1: the flash sale has not ended yet, so refresh it
                self.lg.info(
                    '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                    format(goods_id, index))
                tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                    pid)
                body = Requests.get_url_body(url=tmp_url,
                                             headers=self.headers,
                                             had_referer=True,
                                             ip_pool_type=self.ip_pool_type)
                body = '' if body == '' or body == '[]' else body
                tmp_data = json_2_dict(
                    json_str=body,
                    default_res={},
                    logger=self.lg,
                )
                # Explicit check instead of ``assert``: assertions are removed
                # under ``python -O``, and an empty response would then fall
                # through and get the goods delisted below.
                if not tmp_data:
                    self.lg.error('遇到错误: tmp_data为空dict!')
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                item_list = tmp_data.get('item_list', [])
                # All goods ids currently listed for this pid.
                miaosha_goods_all_goods_id = [
                    item_1.get('item_id', '') for item_1 in item_list
                ]
                if goods_id not in miaosha_goods_all_goods_id:
                    # The goods was removed from the promotion by the site.
                    self.lg.info('该商品已被下架限时秒杀活动,此处将其删除')
                    res = _handle_goods_shelves_in_auto_goods_table(
                        goods_id=goods_id,
                        logger=self.lg,
                        update_sql_str=mia_update_str_6,
                        sql_cli=self.tmp_sql_server,
                    )
                    self.lg.info('下架的goods_id为({}), 删除成功!'.format(goods_id))
                    self.goods_index = index + 1
                    await async_sleep(.3)

                    return goods_id, res

                else:  # still part of the promotion
                    res = await self._one_update(
                        item_list=item_list,
                        goods_id=goods_id,
                        tmp_data=tmp_data,
                    )

        else:  # the database connection is not usable
            self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

        await async_sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
        self.goods_index = index + 1
        collect()

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Main loop: refresh the flash-sale goods round after round.

        :return: never returns
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.mia_miaosha = MiaParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:  # everything consumed: normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(item=item,
                                                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.mia_miaosha
            except AttributeError:
                # No parser was created in this round (result was None).
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Re-crawl and store a goods that is still part of its promotion.

        :param kwargs:
            item_list -- item dicts of the promotion
            goods_id -- id of the goods to update
            tmp_data -- promotion dict; its 'p_info' holds the time range
        :return: result of update_mia_xianshimiaosha_table, or False when
            nothing was updated
        '''
        res = False
        item_list = kwargs.get('item_list')
        goods_id = kwargs.get('goods_id')
        tmp_data = kwargs.get('tmp_data')

        begin_time, end_time = await self._get_begin_time_and_end_time(tmp_data
                                                                       )
        for item_2 in item_list:
            if item_2.get('item_id', '') == goods_id:
                self.mia_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.mia_miaosha.deal_with_data()
                if goods_data == {}:  # nothing crawled: try the next match
                    pass
                else:
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['price'] = item_2.get('active_price')
                    goods_data['taobao_price'] = item_2.get('active_price')
                    goods_data['sub_title'] = item_2.get('short_info', '')
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time':
                        timestamp_to_regulartime(begin_time),
                        'miaosha_end_time': timestamp_to_regulartime(end_time),
                    }
                    goods_data['miaosha_begin_time'], goods_data[
                        'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data['miaosha_time'])

                    res = self.mia_miaosha.update_mia_xianshimiaosha_table(
                        data=goods_data, pipeline=self.tmp_sql_server)
                    break

        return res

    async def _get_begin_time_and_end_time(self, tmp_data) -> tuple:
        """Return (begin, end) of the promotion as integer timestamps.

        Reads 'start_time' and 'end_time' from ``tmp_data['p_info']``; both
        must be formatted as '%Y/%m/%d %H:%M:%S', otherwise ``time.strptime``
        raises ValueError.
        """
        begin_time = tmp_data.get('p_info', {}).get('start_time', '')
        end_time = tmp_data.get('p_info', {}).get('end_time', '')
        # Convert the time strings into timestamps.
        begin_time = int(
            time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
        end_time = int(
            time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))

        return begin_time, end_time

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a flash-sale end time relative to the current Shanghai time.

        :param timestamp: end time as a timestamp
        :return: 0 -- ended more than 24 hours (86400 s) ago
                 1 -- not ended yet (end time is in the future)
                 2 -- ended within the last 24 hours (or ends exactly now)
        '''
        time_1 = int(timestamp)
        time_2 = datetime_to_timestamp(get_shanghai_time())  # current timestamp

        diff_time = time_1 - time_2
        # The 24 hour margin gives the backend time to sync the delisting.
        if diff_time < -86400:
            return 0
        elif diff_time > 0:
            return 1
        else:
            return 2

    def __del__(self):
        # Each attribute may be missing; only that case is ignored.
        try:
            del self.lg
        except AttributeError:
            pass
        try:
            del self.loop
        except AttributeError:
            pass
        try:
            del self.mia_miaosha
        except AttributeError:
            pass
        collect()
示例#5
0
    def run_forever(self):
        '''
        Run one pass over the stored flash-sale (miaosha) goods and refresh them.

        For every row of dbo.mia_xianshimiaosha with site_id=20 the result of
        ``self.is_recent_time`` decides what happens: 0 deletes the row, 2
        leaves it untouched, anything else re-crawls and updates the goods.
        Afterwards the method sleeps (5.5 hours when the Shanghai hour is 0,
        otherwise 5 seconds) and returns.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_xianshimiaosha where site_id=20'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))

                # A new parser per goods: keeping one object alive for the
                # whole pass was found to use a lot of memory.
                mia_miaosha = MiaParse()
                if index % 50 == 0:
                    # Reconnect every 50 goods so that a single long-lived
                    # connection does not stop responding.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    recent_flag = self.is_recent_time(miaosha_end_time)
                    if recent_flag == 0:
                        # ``(goods_id,)`` is a real one-element tuple; the
                        # previous ``(item[0])`` was just the bare value.
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(goods_id,))
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_begin_time'))

                    elif recent_flag == 2:
                        # Skip instead of stopping the loop: the rows are not
                        # returned in any particular order.
                        pass

                    else:  # presumably 1: inside the update window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        self._refresh_goods_in_update_window(
                            item=item,
                            mia_miaosha=mia_miaosha,
                            tmp_sql_server=tmp_sql_server,
                        )

                else:  # the database connection is not usable
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_goods_in_update_window(self, item, mia_miaosha, tmp_sql_server):
        '''
        Re-crawl one goods of a running promotion and store the result.

        :param item: DB row; item[0] is the goods_id and item[2] the pid
        :param mia_miaosha: parser object used to crawl and to write the update
        :param tmp_sql_server: DB pipeline used for deletes and updates
        :return: None
        '''
        goods_id = item[0]
        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
            item[2])
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)
        if body == '' or body == '[]':
            print('获取到的body为空值! 此处跳过')
            return

        try:
            tmp_data = json.loads(body)
        except (TypeError, ValueError):
            tmp_data = None
        if not isinstance(tmp_data, dict):
            # Really skip here: continuing with an empty dict used to crash
            # in ``time.strptime('')`` a few lines further down.
            print('json.loads转换body时出错, 此处跳过!')
            return

        p_info = tmp_data.get('p_info', {})
        try:
            # Convert the time strings into timestamps.
            begin_time = int(
                time.mktime(
                    time.strptime(p_info.get('start_time', ''),
                                  '%Y/%m/%d %H:%M:%S')))
            end_time = int(
                time.mktime(
                    time.strptime(p_info.get('end_time', ''),
                                  '%Y/%m/%d %H:%M:%S')))
        except (TypeError, ValueError):
            # Missing or malformed promotion times: skip this goods rather
            # than aborting the whole pass.
            print('解析秒杀起止时间出错, 此处跳过!')
            return

        item_list = tmp_data.get('item_list', [])
        # All goods ids currently listed for this pid.
        miaosha_goods_all_goods_id = [
            item_1.get('item_id', '') for item_1 in item_list
        ]
        if goods_id not in miaosha_goods_all_goods_id:
            # The goods was removed from the promotion by the site.
            print('该商品已被下架限时秒杀活动,此处将其删除')
            tmp_sql_server._delete_table(sql_str=self.delete_sql_str,
                                         params=(goods_id,))
            print('下架的goods_id为(%s)' % goods_id, ', 删除成功!')
            return

        for item_2 in item_list:
            if item_2.get('item_id', '') != goods_id:
                continue

            mia_miaosha.get_goods_data(goods_id=goods_id)
            goods_data = mia_miaosha.deal_with_data()
            if goods_data == {}:  # nothing crawled: skip
                continue

            goods_data['goods_id'] = str(goods_id)
            goods_data['price'] = item_2.get('active_price')
            goods_data['taobao_price'] = item_2.get('active_price')
            goods_data['sub_title'] = item_2.get('short_info', '')
            goods_data['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                'miaosha_end_time': timestamp_to_regulartime(end_time),
            }
            goods_data['miaosha_begin_time'], goods_data[
                'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['miaosha_time'])

            mia_miaosha.update_mia_xianshimiaosha_table(
                data=goods_data, pipeline=tmp_sql_server)
            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle