Пример #1
0
    def _get_db_old_data(self) -> (list, None):
        """
        Fetch the db rows that are waiting to be updated.

        A fresh ``SqlServerMyPageInfoSaveItemPipeline`` is stored on
        ``self.sql_cli``; the ``mia_delete_str_2`` statement is executed first
        and the rows are then selected with ``mia_select_str_2``.

        :return: the selected rows as a list, or None when a TypeError is
            raised while querying (reported as a failed db connection)
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            self.sql_cli._delete_table(sql_str=mia_delete_str_2)
            rows = list(self.sql_cli._select_table(sql_str=mia_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            # same outcome as the pre-initialised default: nothing to update
            rows = None

        # the summary helper is invoked even when nothing could be fetched
        _block_print_db_old_data(result=rows)

        return rows
Пример #2
0
def run_forever():
    """
    Endlessly refresh the goods rows selected by ``mia_select_str_5``.

    Every pass builds a fresh logger, selects the rows to refresh, re-crawls
    each row with ``MiaParse`` and writes the outcome back through the
    pipeline, then sleeps before the next pass. This function never returns.
    """
    while True:
        # Build the logger inside the loop (not as a module-level global),
        # otherwise every pass would keep logging into the same file.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        # real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            mia = MiaParse()
            # enumerate() keeps the index advancing even when an iteration
            # leaves early through `continue`; the previous manual counter
            # stalled on delisted goods.
            for index, item in enumerate(result, start=1):
                goods_id = item[1]
                if index % 5 == 0:
                    # drop the old parser before building its replacement
                    del mia
                    mia = MiaParse()
                    collect()

                # NOTE(review): presumably hands back a (possibly refreshed)
                # connection keyed on index/remainder -- confirm in the helper.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item,
                                                          logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # delisted goods are handled on their own
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='mia',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # the parser returned no data for this goods_id
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    # no usable db connection for this row
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight: pause for 5.5 hours
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        # my_lg is always bound here, so a plain del is sufficient
        del my_lg
        collect()
def run_forever():
    """
    Endlessly refresh the group-buy rows selected by ``jp_select_str_2``.

    Every pass first runs ``jp_delete_str_1``, selects the rows to refresh,
    then for each row either marks it through
    ``_handle_goods_shelves_in_auto_goods_table`` (missing end time, flagged
    row, or end time already passed) or re-crawls it with ``JuanPiParse`` and
    writes the result back. This function never returns.
    """
    while True:
        # real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            # The parser is re-created periodically inside the loop so that
            # its memory can be released.
            juanpi_pintuan = JuanPiParse()
            # enumerate() keeps the index advancing even when an iteration
            # leaves early through `continue`; the previous manual counter
            # stalled on those paths.
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                if index % 6 == 0:
                    # drop the old parser before building its replacement
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                # NOTE(review): presumably hands back a (possibly refreshed)
                # connection keyed on index/remainder -- confirm in the helper.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        continue
                    # whole-second unix timestamp of the group-buy end time;
                    # int() truncates the float directly instead of slicing
                    # its string form to 10 characters
                    pintuan_end_time = int(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))

                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        if data == {}:
                            # nothing parsed for this goods_id; skip it
                            continue

                        data['goods_id'] = goods_id
                        juanpi_pintuan.to_right_and_update_pintuan_data(
                            data=data, pipeline=sql_cli)

                else:
                    # no usable db connection for this row
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight: pause for 5.5 hours
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
Пример #4
0
def run_forever():
    """
    Endlessly refresh the goods rows selected by ``kl_select_str_1``.

    Every pass builds a fresh logger, selects the rows to refresh, re-crawls
    each row with ``KaoLaParse`` and writes the outcome back through the
    pipeline, then sleeps before the next pass. This function never returns.
    """
    while True:
        # Build the logger inside the loop (not as a module-level global),
        # otherwise every pass would keep logging into the same file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        # real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            # The parser is re-created periodically inside the loop so that
            # its memory can be released.
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            # enumerate() advances the index on every row, including the ones
            # that leave early through `continue`; previously only one of the
            # two early exits incremented the manual counter.
            for index, item in enumerate(result, start=1):
                goods_id = item[1]
                if index % 5 == 0:
                    # drop the old parser before building its replacement
                    del kaola
                    kaola = KaoLaParse(logger=my_lg,
                                       is_real_times_update_call=True)
                    collect()

                # NOTE(review): presumably hands back a (possibly refreshed)
                # connection keyed on index/remainder -- confirm in the helper.
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # goods already reported as delisted by the raw fetch
                        # are written back directly
                        data['goods_id'] = goods_id
                        (data['shelf_time'],
                         data['delete_time']) = get_shelf_time_and_delete_time(
                             tmp_data=data,
                             is_delete=db_goods_info_obj.is_delete,
                             shelf_time=db_goods_info_obj.shelf_time,
                             delete_time=db_goods_info_obj.delete_time,
                         )

                        try:
                            kaola.to_right_and_update_data(data,
                                                           pipeline=sql_cli)
                        except Exception:
                            # Logger.error needs a message argument; calling
                            # it with only exc_info raised a TypeError here.
                            my_lg.error('遇到错误:', exc_info=True)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='kl',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        kaola.to_right_and_update_data(data, pipeline=sql_cli)

                    else:
                        # the parser returned no data for this goods_id
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)

                else:
                    # no usable db connection for this row
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight: pause for 5.5 hours
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
Пример #5
0
def run_forever():
    """
    Endlessly refresh the goods rows selected by ``vip_select_str_1``.

    Every pass selects the rows to refresh, re-crawls each row with a fresh
    ``VipParse``, derives the price / spec / stock change records against the
    values stored in the row and writes the result back through the
    pipeline. This function never returns.

    Row columns as they are used below: 0 goods_id, 1 is_delete, 2 price,
    3 taobao_price, 4 shelf_time, 5 delete_time, 6 stored price_info_list
    (JSON), 7 is_price_change, 8 is_spec_change, 9 price change info (JSON),
    10 is_stock_change, 11 stock change info (JSON), 12/13/14 previous
    price / spec / stock transition times.
    """
    while True:
        # real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=vip_select_str_1))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            # Fall through to the regular end-of-pass pause instead of
            # retrying immediately, which used to spin in a tight loop while
            # the database was unavailable.
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            for index, item in enumerate(result, start=1):
                # A parser is created per row and deleted afterwards so that
                # its memory can be released.
                vip = VipParse()
                # NOTE(review): presumably hands back a (possibly refreshed)
                # connection keyed on index/remainder -- confirm in the helper.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' %
                        (item[0], index))
                    vip.get_goods_data(goods_id=[0, item[0]])
                    data = vip.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        (data['shelf_time'],
                         data['delete_time']) = get_shelf_time_and_delete_time(
                             tmp_data=data,
                             is_delete=item[1],
                             shelf_time=item[4],
                             delete_time=item[5])
                        price_info_list = old_sku_info = json_2_dict(
                            item[6], default_res=[])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=price_info_list, site_id=25)
                        except AttributeError:
                            # stored value was already formatted; use it as is
                            pass
                        new_sku_info = format_price_info_list(
                            data['price_info_list'], site_id=25)
                        (data['_is_price_change'],
                         data['sku_info_trans_time'],
                         price_change_info) = _get_sku_price_trans_record(
                             old_sku_info=old_sku_info,
                             new_sku_info=new_sku_info,
                             is_price_change=item[7]
                             if item[7] is not None else 0,
                             db_price_change_info=json_2_dict(item[9],
                                                              default_res=[]),
                             old_price_trans_time=item[12],
                         )
                        (data['_is_price_change'],
                         data['_price_change_info']) = _get_price_change_info(
                             old_price=item[2],
                             old_taobao_price=item[3],
                             new_price=data['price'],
                             new_taobao_price=data['taobao_price'],
                             is_price_change=data['_is_price_change'],
                             price_change_info=price_change_info,
                         )
                        # track pure spec changes
                        (data['is_spec_change'],
                         data['spec_trans_time']) = _get_spec_trans_record(
                             old_sku_info=old_sku_info,
                             new_sku_info=new_sku_info,
                             is_spec_change=item[8]
                             if item[8] is not None else 0,
                             old_spec_trans_time=item[13],
                         )

                        # track pure stock changes
                        (data['is_stock_change'],
                         data['stock_trans_time'],
                         data['stock_change_info']) = _get_stock_trans_record(
                             old_sku_info=old_sku_info,
                             new_sku_info=new_sku_info,
                             is_stock_change=item[10]
                             if item[10] is not None else 0,
                             db_stock_change_info=json_2_dict(item[11],
                                                              default_res=[]),
                             old_stock_trans_time=item[14],
                         )

                        vip.to_right_and_update_data(data=data,
                                                     pipeline=sql_cli)
                    # an empty data dict means nothing was parsed: skip row
                else:
                    # no usable db connection for this row
                    print('数据库连接失败,数据库可能关闭或者维护中')

                # vip is always bound here, so a plain del is sufficient
                del vip
                gc.collect()
                sleep(VIP_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight: pause for 5.5 hours
            sleep(60 * 60 * 5.5)
        else:
            sleep(30)
        gc.collect()
Пример #6
0
def run_forever():
    """
    Endlessly refresh the goods rows selected by ``yx_select_str_1``.

    Every pass builds a fresh logger, selects the rows to refresh, re-crawls
    each row with ``YanXuanParse`` and writes the outcome back through the
    pipeline, then sleeps before the next pass. This function never returns.
    """
    while True:
        # Build the logger inside the loop (not as a module-level global),
        # otherwise every pass would keep logging into the same file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        # real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            # The parser is re-created periodically inside the loop so that
            # its memory can be released.
            yanxuan = YanXuanParse(logger=my_lg)
            # enumerate() keeps the index advancing even when an iteration
            # leaves early through `continue`; the previous manual counter
            # stalled on delisted goods.
            for index, item in enumerate(result, start=1):
                if index % 5 == 0:
                    # drop the old parser before building its replacement
                    del yanxuan
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()

                # NOTE(review): presumably hands back a (possibly refreshed)
                # connection keyed on index/remainder -- confirm in the helper.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # delisted goods are handled on their own
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='yx',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        yanxuan.to_right_and_update_data(data,
                                                         pipeline=sql_cli)
                    else:
                        # the parser returned no data for this goods_id
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    # no usable db connection for this row
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight: pause for 5.5 hours
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
Пример #7
0
    def run_forever(self):
        '''
        Run one real-time update pass over the rows selected by
        ``pd_select_str_2``, then sleep.

        Original author's note: the idea is to refresh only the goods that go
        on sale within the next 2 hours of the current day; goods further in
        the future (all still at their original price) are not refreshed yet.

        For every row, ``self.is_recent_time`` is asked about the flash-sale
        end time: 0 deletes the row as expired, 2 leaves it untouched, and
        any other value (1 per the original comments: inside the update
        window) refreshes the row, or deletes it when the goods_id is no
        longer in ``self.get_all_miaosha_goods_list()``.

        :return: None
        '''
        # real-time data update
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=pd_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            index = 1
            pinduoduo_miaosha = PinduoduoParse()

            all_miaosha_goods_list = self.get_all_miaosha_goods_list()

            # every goods_id present in the current flash-sale list
            miaosha_goods_all_goods_id = [
                i.get('goods_id') for i in all_miaosha_goods_list
            ]

            for item in result:
                goods_id = item[0]
                # read the flash-sale end time of this row first
                miaosha_end_time_str = json.loads(
                    item[1]).get('miaosha_end_time')
                # whole-second unix timestamp; int() truncates the float
                # directly instead of slicing its string form to 10 chars
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(miaosha_end_time_str,
                                      '%Y-%m-%d %H:%M:%S')))

                # NOTE(review): presumably hands back a (possibly refreshed)
                # connection keyed on index/remainder -- confirm in the helper.
                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    # evaluated once so both branches see the same answer
                    recent_state = self.is_recent_time(miaosha_end_time)
                    if recent_state == 0:
                        # params must be a one-element tuple; `(x)` without
                        # the trailing comma is just the bare value
                        sql_cli._delete_table(sql_str=self.delete_sql_str,
                                              params=(goods_id, ))
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 限时秒杀结束时间为(%s), 删除成功!' %
                            miaosha_end_time_str)
                        sleep(.3)

                    elif recent_state == 2:
                        # skip rather than break: the rows returned by the db
                        # are not necessarily ordered
                        pass

                    else:
                        # inside the update window
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))

                        if goods_id not in miaosha_goods_all_goods_id:
                            # the goods_id is no longer part of the flash
                            # sale list, so the row is removed
                            sql_cli._delete_table(sql_str=self.delete_sql_str,
                                                  params=(goods_id, ))
                            print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                                  goods_id)
                            sleep(.3)

                        else:
                            # still listed: refresh from every matching entry
                            for item_1 in all_miaosha_goods_list:
                                if item_1.get('goods_id', '') != goods_id:
                                    continue

                                pinduoduo_miaosha.get_goods_data(
                                    goods_id=goods_id)
                                goods_data = pinduoduo_miaosha.deal_with_data()

                                # an empty dict means nothing was parsed
                                if goods_data != {}:
                                    goods_data['stock_info'] = item_1.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item_1.get(
                                        'goods_id')
                                    activity_stock = item_1.get(
                                        'stock_info').get('activity_stock')
                                    if activity_stock > 0:
                                        # special price before the flash sale
                                        goods_data['price'] = item_1.get(
                                            'price')
                                        # flash-sale price
                                        goods_data['taobao_price'] = item_1.get(
                                            'taobao_price')
                                    goods_data['sub_title'] = item_1.get(
                                        'sub_title', '')
                                    goods_data['miaosha_time'] = item_1.get(
                                        'miaosha_time')
                                    (goods_data['miaosha_begin_time'],
                                     goods_data['miaosha_end_time']
                                     ) = get_miaosha_begin_time_and_miaosha_end_time(
                                         miaosha_time=item_1.get(
                                             'miaosha_time'))

                                    if activity_stock <= 1:
                                        # live flash-sale stock of 1 or less
                                        # is flagged as sold out
                                        print('该秒杀商品已售罄...')
                                        goods_data['is_delete'] = 1

                                    pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                        data=goods_data, pipeline=sql_cli)
                                sleep(PINDUODUO_SLEEP_TIME)

                    # the index only advances for rows handled with a live
                    # connection (unchanged from the original behaviour)
                    index += 1
                    gc.collect()

                else:
                    # no usable db connection for this row
                    print('数据库连接失败,数据库可能关闭或者维护中')
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates after midnight: pause for 5.5 hours
            sleep(60 * 60 * 5.5)
        else:
            sleep(3 * 60)
        gc.collect()
Пример #8
0
    def run_forever(self):
        '''
        实时更新数据
        :return:
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=mg_delete_str_2)
            result = list(sql_cli._select_table(sql_str=mg_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result)
            index = 1
            self.my_phantomjs = BaseDriver(
                executable_path=PHANTOMJS_DRIVER_PATH,
                ip_pool_type=self.ip_pool_type)
            for item in result:  # 实时更新数据
                goods_id = item[0]
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = BaseDriver(
                        executable_path=PHANTOMJS_DRIVER_PATH,
                        ip_pool_type=self.ip_pool_type)

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            update_sql_str=mg_update_str_5,
                            sql_cli=sql_cli,
                        )
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 拼团开始时间为(%s), 逻辑删除成功!' %
                            json.loads(item[1]).get('begin_time'))
                        sleep(.3)

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # 跳出循环
                        pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:  # 返回1,表示在待更新区间内
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests请求不到数据,涉及证书认证,直接用phantomjs
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # print(body)

                        if body == '':
                            print('获取到的body为空值! 此处跳过')

                        else:
                            try:
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}

                            if tmp_data.get('result',
                                            {}).get('wall',
                                                    {}).get('docs', []) == []:
                                print('得到的docs为[]!')
                                _handle_goods_shelves_in_auto_goods_table(
                                    goods_id=goods_id,
                                    update_sql_str=mg_update_str_5,
                                    sql_cli=sql_cli,
                                )
                                sleep(.3)

                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # 开始拼团的时间戳
                                item_list = [{
                                    'goods_id':
                                    item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count':
                                    str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # pprint(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                                '''
                                if goods_id not in pintuan_goods_all_goods_id:
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=goods_id)
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )

                                    if goods_data == {}:
                                        pass
                                    else:
                                        # 规范化
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        goods_data['goods_id'] = goods_id
                                        goods_data[
                                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])

                                        # pprint(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data, pipeline=sql_cli)
                                        sleep(MOGUJIE_SLEEP_TIME)  # 放慢速度

                                else:  # 未下架的
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == goods_id:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=goods_id)
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}: pass
                                            else:
                                                # 规范化
                                                goods_data[
                                                    'goods_id'] = goods_id
                                                goods_data[
                                                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                        goods_data[
                                                            'price_info_list'])
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                # pprint(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=sql_cli)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # 放慢速度

                                        else:
                                            pass

                else:
                    print('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        gc.collect()
def run_forever():
    """
    Endlessly refresh the goods rows selected by ``z8_select_str_2``.

    Every round:

    1. open a new ``SqlServerMyPageInfoSaveItemPipeline``, run
       ``z8_delete_str_1`` and load the rows to refresh;
    2. for each row (``item[0]`` is the goods_id, ``item[1]`` the delete
       flag) crawl the goods with a fresh ``Zhe800PintuanParse`` and either
       write the new data back or mark the goods as off the shelves;
    3. sleep 5.5 hours when the Shanghai hour is 0, otherwise 10 minutes.

    A ``TypeError`` raised while querying the database is treated as
    "connection failed" and the whole round is skipped.

    :return: never returns
    """
    while True:
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=z8_delete_str_1)
            result = list(sql_cli._select_table(sql_str=z8_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            # enumerate() owns the counter, so the early ``continue`` below
            # can no longer leave ``index`` stuck on the same value (which
            # used to repeat the 3-minute pause on a multiple of 300).
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                db_is_delete = item[1]
                # A parser is created per row and dropped at the end of the
                # iteration to keep memory usage low.
                zhe_800_pintuan = Zhe800PintuanParse()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    remainder=50,
                )
                if index % 300 == 0:
                    # Pause for 3 minutes after every 300 rows; announce the
                    # pause before it starts rather than after it ended.
                    sleep_time = 3 * 60
                    print('休眠{}s中...'.format(sleep_time))
                    sleep(sleep_time)

                if sql_cli.is_connect_success:
                    tmp_tmp = zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    # Per the original author's note, a str result starting
                    # with 'ze' means the goods page no longer exists.
                    if isinstance(tmp_tmp, str) and tmp_tmp.startswith('ze'):
                        try:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        except Exception as e:
                            # Best effort: report the failure and fall
                            # through to the normal update path below.
                            print('下架处理goods_id({})时出错: {}'.format(
                                goods_id, e))
                        else:
                            sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                            continue

                    data = zhe_800_pintuan.deal_with_data()
                    if data != {}:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        data['goods_id'] = goods_id

                        if db_is_delete == 1:
                            # The row is already flagged as deleted in the
                            # db: take it off the shelves instead of
                            # updating it.
                            print('该goods_id[{0}]已过期!'.format(goods_id))
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                update_sql_str=z8_update_str_4,
                                sql_cli=sql_cli,
                            )
                        else:
                            zhe_800_pintuan.to_right_and_update_data(
                                data=data, pipeline=sql_cli)
                    # An empty dict means nothing usable was crawled: skip.

                else:
                    # The database connection could not be established.
                    print('数据库连接失败,数据库可能关闭或者维护中')

                del zhe_800_pintuan
                collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates during the night: long pause when the hour is 0.
            sleep(60 * 60 * 5.5)
        else:
            sleep(10 * 60)
        collect()
Пример #10
0
    def run_forever(self):
        """
        Run one refresh pass over the rows selected by ``mg_select_str_3``.

        The pass first runs ``mg_delete_str_4``, waits 5 seconds and loads
        the rows. Each row is read as ``item[0]`` = goods_id, ``item[1]`` =
        JSON text holding ``miaosha_begin_time`` / ``miaosha_end_time``,
        ``item[2]`` = event time, ``item[3]`` = value passed to the parser's
        ``get_goods_data``. Depending on ``self.is_recent_time`` the row is

        * ``0``: logically deleted through ``mg_update_str_1`` (expired);
        * ``2``: left untouched;
        * anything else: looked up in ``self.get_item_list``; rows missing
          from that list are logically deleted, the others are re-crawled
          and written back.

        A ``TypeError`` raised while querying the database is treated as
        "connection failed" and the pass is skipped. Afterwards the method
        sleeps 5.5 hours when the Shanghai hour is 0, otherwise 5 seconds.

        :return: None
        """
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
            sleep(5)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_3))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                # Parse the stored time window once; it is needed for the
                # end timestamp and for the "expired" log message.
                db_miaosha_time = json.loads(item[1])
                # mktime() already yields whole seconds, so a plain int()
                # replaces the old "first 10 characters" string slice that
                # only worked for 10-digit timestamps.
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(
                            db_miaosha_time.get('miaosha_end_time'),
                            '%Y-%m-%d %H:%M:%S')))

                # A fresh parser per row (kept local to limit memory use).
                mogujie_miaosha = MoGuJieMiaoShaParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows so a single long-lived
                    # connection does not stop responding.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    # Evaluate once so both branch tests see the same answer.
                    recent_state = self.is_recent_time(miaosha_end_time)
                    if recent_state == 0:
                        # Expired: logically delete the row.
                        tmp_sql_server._update_table(sql_str=mg_update_str_1,
                                                     params=(goods_id, ))
                        print(
                            '过期的goods_id为(%s)' % goods_id,
                            ', 限时秒杀开始时间为(%s), 删除成功!' %
                            db_miaosha_time.get('miaosha_begin_time'))
                        sleep(.5)

                    elif recent_state == 2:
                        # Deliberately skip instead of breaking out: the
                        # rows returned by the db are not ordered.
                        pass

                    else:
                        # Inside the window that should be refreshed.
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))

                        item_list = self.get_item_list(event_time=str(item[2]))
                        if item_list == '':
                            # Possibly a network problem: skip for now.
                            pass
                        else:
                            # Entries of this event that belong to the
                            # current goods_id. An empty event list and a
                            # list that no longer contains the goods are
                            # handled identically: logical delete.
                            matched = [
                                item_2 for item_2 in item_list
                                if item_2.get('iid', '') == goods_id
                            ]
                            if not matched:
                                print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                                tmp_sql_server._update_table(
                                    sql_str=mg_update_str_1,
                                    params=(goods_id, ))
                                print('下架的goods_id为(%s)' % goods_id,
                                      ', 删除成功!')
                                sleep(.4)  # avoid db deadlocks

                            for item_2 in matched:
                                mogujie_miaosha.get_goods_data(
                                    goods_id=item[3])
                                goods_data = mogujie_miaosha.deal_with_data()
                                if goods_data != {}:
                                    goods_data['goods_id'] = str(goods_id)

                                    # ``price`` is set to the highest
                                    # normal (non-sale) price.
                                    try:
                                        normal_prices = [
                                            round(
                                                float(
                                                    item_4.get(
                                                        'normal_price', '')),
                                                2) for item_4 in
                                            goods_data['price_info_list']
                                        ]
                                        goods_data['price'] = round(
                                            Decimal(max(normal_prices)), 2)
                                    except Exception:
                                        # Not a bare ``except``: Ctrl-C and
                                        # SystemExit must still stop the
                                        # crawler.
                                        print('设置price为原价时出错!请检查')
                                        sleep(MOGUJIE_SLEEP_TIME)
                                        continue

                                    goods_data['miaosha_time'] = {
                                        'miaosha_begin_time':
                                        timestamp_to_regulartime(
                                            int(item_2.get('startTime', 0))),
                                        'miaosha_end_time':
                                        timestamp_to_regulartime(
                                            int(item_2.get('endTime', 0))),
                                    }
                                    (goods_data['miaosha_begin_time'],
                                     goods_data['miaosha_end_time']
                                     ) = get_miaosha_begin_time_and_miaosha_end_time(
                                         miaosha_time=goods_data[
                                             'miaosha_time'])

                                    mogujie_miaosha.update_mogujie_xianshimiaosha_table(
                                        data=goods_data,
                                        pipeline=tmp_sql_server)

                                sleep(MOGUJIE_SLEEP_TIME)  # slow down

                else:
                    # The database connection could not be established.
                    print('数据库连接失败,数据库可能关闭或者维护中')

                collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates during the night: long pause when the hour is 0.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        collect()