예제 #1
0
    async def _deal_with_all_goods_id(self):
        '''
        Crawl and store every flash-sale goods that is not in the db yet.

        The goods_id values already stored for site_id=28 are loaded once.
        Then every goods list returned by ``_get_all_goods_list`` is expanded
        with ``_get_taoqianggou_goods_list``; each goods_id that is not stored
        yet is fetched through ``TmallParse`` and inserted into the
        taoqianggou flash-sale table. Nothing is done when the initial db
        connection fails.

        :return: None
        '''
        _data = await self._get_all_goods_list()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if not my_pipeline.is_connect_success:
            return

        self.my_lg.info('正在获取淘抢购db原有goods_id, 请耐心等待...')
        sql_str = r'select goods_id from dbo.tao_qianggou_xianshimiaosha where site_id=28'
        # A set keeps the per-goods membership test below O(1).
        db_all_goods_id = {row[0] for row in my_pipeline._select_table(sql_str=sql_str)}
        self.my_lg.info('获取完毕!!!')

        # Counts the goods handled so far; it drives the periodic reconnect.
        index = 1
        for item in _data:
            miaosha_goods_list = await self._get_taoqianggou_goods_list(data=item.get('data', []))

            for tmp_item in miaosha_goods_list:
                if tmp_item.get('goods_id', '') in db_all_goods_id:
                    # This goods_id is already stored: nothing to crawl.
                    self.my_lg.info('该goods_id[%s]已存在db中' % tmp_item.get('goods_id', ''))
                    continue

                if index % 50 == 0:
                    # Reconnect every 50 goods so that a single long-lived
                    # connection does not stop responding.
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if not my_pipeline.is_connect_success:
                    # index is left untouched, so the next goods retries the
                    # reconnect above.
                    continue

                tmall = TmallParse(logger=self.my_lg)
                tmp_url = 'https://detail.tmall.com/item.htm?id={0}'.format(tmp_item.get('goods_id'))
                goods_id = tmall.get_goods_id_from_url(tmp_url)

                tmall.get_goods_data(goods_id=goods_id)
                goods_data = tmall.deal_with_data()

                if goods_data != {}:
                    miaosha_time = tmp_item.get('miaosha_time')
                    goods_data['goods_id'] = tmp_item.get('goods_id')
                    goods_data['spider_url'] = tmp_url
                    goods_data['miaosha_time'] = miaosha_time
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                        get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=miaosha_time)
                    goods_data['page'] = tmp_item.get('page')
                    goods_data['spider_time'] = tmp_item.get('spider_time')

                    tmall.insert_into_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                    await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                else:
                    # Empty parse result: back off before the next goods.
                    await asyncio.sleep(5)

                # The original never advanced index, so the reconnect branch
                # above could not run.
                index += 1

                # Drop the parser before collecting so its memory can be freed.
                del tmall
                gc.collect()
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        Refresh the goods that are already stored in the flash-sale table.

        Rows whose flash sale is reported as expired by ``is_recent_time``
        (return value 0) are deleted; all other rows are crawled again through
        ``TmallParse`` and updated.

        :param tmp_sql_server: pipeline object used for the delete/update
            statements; replaced by a fresh connection every 50 handled rows
        :param result: iterable of rows; the code reads row[0] as the
            goods_id, row[1] as a JSON string holding 'miaosha_begin_time'
            ('%Y-%m-%d %H:%M:%S') and row[2] as the goods url
        :return: None
        '''
        index = 1
        for item in result:
            begin_time_str = json.loads(item[1]).get('miaosha_begin_time')
            # mktime returns a float; int() truncates it to whole seconds.
            miaosha_begin_time = int(time.mktime(time.strptime(begin_time_str, '%Y-%m-%d %H:%M:%S')))

            tmall = TmallParse(logger=self.my_lg)
            if index % 50 == 0:
                # Reconnect every 50 rows so that a single long-lived
                # connection does not stop responding.
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if await self.is_recent_time(miaosha_begin_time) == 0:
                    # (item[0]) is just item[0]; the trailing comma is what
                    # builds the one-element params tuple.
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                    self.my_lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % begin_time_str)

                else:
                    # Any other return value: the goods is inside the window
                    # that still has to be updated.
                    self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))

                    # NOTICE: all rows are same-day data, so shelf/off-shelf
                    # times are not touched here, only the goods data.
                    goods_id = tmall.get_goods_id_from_url(item[2])

                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()

                    if goods_data != {}:
                        goods_data['goods_id'] = item[0]

                        await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                        await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        # Empty parse result: back off before the next row.
                        await asyncio.sleep(5)

                index += 1

            del tmall

        gc.collect()

        return
예제 #3
0
    def block_get_tm_one_goods_info_task(self, goods_id: list, index: int):
        """
        Fetch the data of a single tm goods in a blocking way.

        :param goods_id: two-element sequence unpacked as (site_id, goods_id);
            it is also handed unchanged to ``TmallParse.get_goods_data``
        :param index: caller-supplied index, returned untouched
        :return: tuple (site_id, goods_id, index, data before parsing,
            data after parsing)
        """
        parser = TmallParse(logger=self.lg)
        site_id, _goods_id = goods_id
        before_goods_data = parser.get_goods_data(goods_id=goods_id)
        end_goods_data = parser.deal_with_data()

        # The goods counts as deleted when either the raw or the parsed data
        # flags it.
        flagged_deleted = (
            before_goods_data.get('is_delete', 0) == 1
            or end_goods_data.get('is_delete', 0) == 1
        )
        is_delete = 1 if flagged_deleted else 0

        # '-' only when parsing produced nothing and the goods is not deleted.
        if end_goods_data == {} and is_delete != 1:
            _label = '-'
        else:
            _label = '+'

        message = '[{}] goods_id: {}, site_id: {}, is_delete: {}'.format(
            _label, _goods_id, site_id, is_delete)
        self.lg.info(message)

        # Drop the parser before collecting so its memory can be freed.
        del parser
        collect()

        return (site_id, _goods_id, index, before_goods_data, end_goods_data)
예제 #4
0
파일: tm.py 프로젝트: yfeng2018/python-1
def get_one_tm_data(**kwargs):
    '''
    Crawl the data behind one tm url.

    :param kwargs: reads 'username' (defaults to DEFAULT_USERNAME),
        'wait_to_deal_with_url' (defaults to '') and 'my_lg' (logger)
    :return: the dict built by ``add_base_info_2_processed_data`` on success;
        {'goods_id': ''} when no goods_id can be extracted from the url;
        {'goods_id': ..., 'msg': ...} when the crawled data is empty
    '''
    my_lg = kwargs.get('my_lg')
    username = kwargs.get('username', DEFAULT_USERNAME)
    target_url = kwargs.get('wait_to_deal_with_url', '')

    parser = TmallParse(logger=my_lg)
    # get_goods_id_from_url returns a list: [type, goods_id(, base_url)].
    goods_id = parser.get_goods_id_from_url(target_url)
    if goods_id == []:
        # No goods_id could be extracted: report the error to the caller.
        my_lg.info('获取到的goods_id为空!')
        del parser
        gc.collect()

        return {'goods_id': ''}

    # Rebuild a clean, canonical url according to the goods type:
    # tmall, tmall supermarket or tmall global.
    goods_type = goods_id[0]
    if goods_type == 0:      # [0, '1111']
        target_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_type == 1:    # [1, '1111']
        target_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_type == 2:    # [2, '1111', 'https://xxxxx']
        target_url = str(goods_id[2]) + '?id=' + goods_id[1]

    raw_result = parser.get_goods_data(goods_id=goods_id)
    # deal_with_data returns a dict when the goods was fetched successfully.
    data = parser.deal_with_data()

    # Original note: on the server this delay can be lowered to .5s.
    sleep(TMALL_SLEEP_TIME)
    if data == {} or raw_result == {}:
        my_lg.info('获取到的data为空!')
        del parser
        gc.collect()

        return {'goods_id': goods_id[1], 'msg': 'data为空!'}

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=target_url,
        username=username,
        goods_id=goods_id[1])
    del parser

    return wait_to_save_data
예제 #5
0
def _get_tm_one_goods_info_task(self, goods_id:list, index:int) -> tuple:
    """
    Fetch the data of a single tmall goods.

    :param self: not read by this function
    :param goods_id: two-element sequence unpacked as (site_id, goods_id);
        it is also handed unchanged to ``TmallParse.get_goods_data``
    :param index: caller-supplied index, returned untouched
    :return: tuple (site_id, goods_id, index, data before parsing,
        data after parsing)
    """
    parser = TmallParse(logger=lg)
    site_id, _goods_id = goods_id
    raw_data = parser.get_goods_data(goods_id=goods_id)
    parsed_data = parser.deal_with_data()

    # Drop the parser before collecting so its memory can be freed.
    del parser
    collect()

    return (site_id, _goods_id, index, raw_data, parsed_data)
예제 #6
0
def test_tm_m():
    """Manual check: crawl one hard-coded tmall goods and pretty-print it."""
    # todo: some goods are on pre-sale and cannot be bought yet; they are not
    # updated now and will be updated once their state is back to normal
    item_id = '575090086713'
    pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(item_id)
    phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(item_id)
    print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

    parser = TmallParse(is_real_times_update_call=True)
    parsed_goods_id = parser.get_goods_id_from_url(tmall_url=pc_url)
    # The raw return value is not needed; the call fetches the data that
    # deal_with_data() processes next.
    parser.get_goods_data(goods_id=parsed_goods_id)
    pprint(parser.deal_with_data())

    del parser
예제 #7
0
    def _get_seller_id(self, type, goods_id):
        '''
        Get the seller_id of a goods.

        The raw goods data is stored on ``self.g_data`` as a side effect.

        :param type: type code, placed first in the [type, goods_id] list
            handed to ``TmallParse.get_goods_data``
        :param goods_id: goods id, placed second in that list
        :return: seller_id as a str
        :raises AssertionError: when the data carries no seller userId
            (seller_id falls back to '0')
        '''
        parser = TmallParse(logger=self.lg)
        self.g_data = parser.get_goods_data(goods_id=[type, goods_id])
        del parser

        seller_id = str(self.g_data.get('seller', {}).get('userId', 0))
        # seller_id is a str, so it must be compared with '0': the original
        # `seller_id != 0` was always true and the check could never fire.
        # Raising explicitly also keeps the check alive under `python -O`.
        if seller_id == '0':
            raise AssertionError('获取到的seller_id为0!')

        return seller_id
예제 #8
0
def run_forever():
    """
    Endlessly refresh the goods selected from dbo.GoodsInfoAutoGet.

    Each pass of the outer loop creates a dated logger, selects the rows with
    SiteID 3, 4 or 6, and re-crawls every row through ``TmallParse``, writing
    the result back with ``to_right_and_update_data``. After a pass the
    function sleeps 5.5 hours when the Shanghai-time hour is 0, otherwise
    5 seconds. It never returns.
    """
    while True:
        # ** The logger must not be a global created once outside the loop,
        # otherwise everything keeps being written to the same log file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

        #### Real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        #  and GETDATE()-ModfiyTime>0.2
        sql_str = '''
        select SiteID, GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet 
        where (SiteID=3 or SiteID=4 or SiteID=6) and MainGoodsID is not null 
        order by ID desc'''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            # list() raised TypeError; the log message attributes this to a
            # failed db connection (possibly under maintenance).
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Memory note from the author: a parser declared once outside uses
            # a lot of memory, so it is declared here and periodically deleted
            # and re-created to release it.
            tmall = TmallParse(logger=my_lg)
            for item in result:  # real-time update of every row
                if index % 5 == 0:
                    # Replace the parser every 5 rows (see the memory note).
                    try:del tmall
                    except: pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:    # reconnect every 10 rows so a long-lived connection does not stop responding
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    tmp_item = []
                    if item[0] == 3:        # values read from the db are first converted to the matching type code
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    data = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(data, int):       # special case: an int was returned (author's note: "return 4041")
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue

                    if data.get('is_delete') == 1:  # goods that went off the shelf are handled separately
                        data['goods_id'] = item[1]

                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])

                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        gc.collect()
                        continue

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price']
                        )
                        # my_lg.info(str(data['_is_price_change']) + ' ' +str(data['_price_change_info']))

                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # the parsed data is empty
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # the db connection is not available
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:   # no updates after midnight: sleep 5.5 hours during hour 0
            sleep(60*60*5.5)
        else:
            sleep(5)
        gc.collect()
예제 #9
0
    def _tmall_keywords_spider(self, **kwargs):
        '''
        Crawl the tmall goods collected for one keyword.

        Every url whose goods_id is already known, or whose goods was inserted
        successfully, is linked to the keyword through
        ``_insert_into_goods_id_and_keyword_middle_table``.

        :param kwargs: reads 'goods_id_list' (iterable of goods urls without
            the 'https:' scheme) and 'keyword_id'
        :return: True once every url has been handled
        '''
        keyword_id = kwargs.get('keyword_id')
        raw_url_list = kwargs.get('goods_id_list')

        sku_suffix = re.compile('&skuId=.*')
        id_pattern = re.compile(r'id=(\d+)')
        goods_url_list = []
        for raw_url in raw_url_list:
            goods_url_list.append('https:' + sku_suffix.sub('', raw_url))

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for goods_url in goods_url_list:
            # Tells whether this goods was inserted (or already existed).
            linked = False
            found_ids = id_pattern.findall(goods_url)
            if not found_ids:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue
            goods_id = found_ids[0]

            if goods_id in self.db_existed_goods_id_list:
                # The goods already exists in the db.
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                linked = True
            else:
                tmall = TmallParse(logger=self.my_lg)
                if self.add_goods_index % 20 == 0:
                    # Reconnect every 20 goods so that a single long-lived
                    # connection does not stop responding.
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if not self.my_pipeline.is_connect_success:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                else:
                    goods_id = tmall.get_goods_id_from_url(goods_url)
                    if goods_id == []:
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                        continue

                    self.my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id[1], str(self.add_goods_index)))
                    tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    # From here on goods_id is the plain id, not the list.
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                            type=data['type'], goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.my_lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        linked = tmall.old_tmall_goods_insert_into_new_table(
                            data, pipeline=self.my_pipeline)

                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

            if linked:
                # Only goods that were inserted or already existed in the db.
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True
예제 #10
0
    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Update a single goods.

        :param db_goods_info_obj: object whose ``goods_id`` and ``site_id``
            attributes are read here; it is also passed on to
            ``get_goods_info_change_data``
        :param index: position of the goods, used for logging and passed to
            ``_get_new_db_conn``
        :return: [goods_id, res] where res is the result of the update, or
            True when the goods turned out to be off the shelf
        """
        res = False

        parser = TmallParse(logger=self.lg)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            remainder=50,
        )
        if not self.sql_cli.is_connect_success:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)
        else:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'.
                format(db_goods_info_obj.goods_id, index))
            tmp_item = self._get_tmp_item(
                site_id=db_goods_info_obj.site_id,
                goods_id=db_goods_info_obj.goods_id)

            # Both parser calls below run in a blocking way.
            raw_data = parser.get_goods_data(goods_id=tmp_item)
            # Remembered so that an off-shelf goods does not trigger the
            # back-off sleep when parsing yields nothing.
            before_goods_data_is_delete = raw_data.get('is_delete', 0)
            data = parser.deal_with_data()

            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tm',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = to_right_and_update_tm_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg)
            elif before_goods_data_is_delete == 1:
                # The goods was found to be off the shelf: counts as success.
                res = True
            else:
                self.lg.info('------>>>| 阻塞休眠7s中...')
                # Author's note: switching this to a blocking sleep makes the
                # machine hang.
                await async_sleep(delay=7., loop=self.loop)

        # Drop the parser before collecting so its memory can be freed.
        del parser
        collect()
        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)

        return [
            db_goods_info_obj.goods_id,
            res,
        ]
예제 #11
0
# coding:utf-8
'''
@author = super_fazai
@File    : test_tm_m.py
@connect : [email protected]
'''
"""
测试tm m
"""

from sys import path as sys_path
sys_path.append('..')

from multiplex_code import get_tm_m_body_data
from tmall_parse_2 import TmallParse
from pprint import pprint

# Manual check: crawl one hard-coded tmall goods and pretty-print the result.
goods_id = '589363967773'
pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id)
phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id)
url_report = 'pc_url: {}, phone_url: {}'.format(pc_url, phone_url)
print(url_report)

tm = TmallParse(is_real_times_update_call=True)
# From here on goods_id holds whatever get_goods_id_from_url returns.
goods_id = tm.get_goods_id_from_url(tmall_url=pc_url)
tm.get_goods_data(goods_id=goods_id)
data = tm.deal_with_data()
pprint(data)
예제 #12
0
    def _tmall_keywords_spider(self, **kwargs):
        """
        Crawl the tmall goods collected for one keyword.

        Every url whose goods_id is already known, or whose goods was inserted
        successfully, is linked to the keyword through
        ``_insert_into_goods_id_and_keyword_middle_table``.

        :param kwargs: reads 'goods_id_list' (iterable of goods urls without
            the 'https:' scheme) and 'keyword_id'
        :return: True once every url has been handled; False as soon as one
            goods fails ``check_target_data_is_legal``
        """
        keyword_id = kwargs.get('keyword_id')
        raw_url_list = kwargs.get('goods_id_list')

        sku_suffix = re.compile('&skuId=.*')
        id_pattern = re.compile(r'id=(\d+)')
        goods_url_list = []
        for raw_url in raw_url_list:
            goods_url_list.append('https:' + sku_suffix.sub('', raw_url))

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for goods_url in goods_url_list:
            # Tells whether this goods was inserted (or already existed).
            linked = False
            found_ids = id_pattern.findall(goods_url)
            if not found_ids:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue
            goods_id = found_ids[0]

            if goods_id in self.db_existed_goods_id_list:
                # The goods already exists in the db.
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                linked = True
            else:
                tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20, )
                if not self.sql_cli.is_connect_success:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                else:
                    goods_id = tmall.get_goods_id_from_url(goods_url)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                        continue

                    self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                    tmall.get_goods_data(goods_id)
                    data = tmall.deal_with_data()
                    # From here on goods_id is the plain id, not the list.
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                            type=data['type'],
                            goods_id=goods_id)
                        if data['goods_url'] == '':
                            self.lg.error('该goods_url为空值! 此处跳过!')
                            continue

                        if not self.check_target_data_is_legal(target_data=data):
                            # NOTE(review): this leaves the whole method, so
                            # the remaining urls are never handled - confirm
                            # that skipping only this goods was not intended.
                            return False

                        linked = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.sql_cli)

                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

            if linked:
                # Only goods that were inserted or already existed in the db.
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True
예제 #13
0
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        Refresh the goods that are already stored in the flash-sale table.

        Rows whose flash sale is reported as expired by ``is_recent_time``
        (return value 0) are updated with ``tb_update_str_4``; all other rows
        are crawled again through ``TmallParse`` and updated.

        :param tmp_sql_server: db client; re-assigned on every row from
            ``_get_new_db_conn``
        :param result: iterable of rows; the code reads row[0] as the
            goods_id, row[1] as a JSON string holding 'miaosha_begin_time'
            ('%Y-%m-%d %H:%M:%S') and row[2] as the goods url
        :return: None
        '''
        index = 1
        for row in result:
            begin_time_text = json_2_dict(
                json_str=row[1],
                logger=self.lg,
            ).get('miaosha_begin_time')
            begin_struct = time.strptime(begin_time_text, '%Y-%m-%d %H:%M:%S')
            # First 10 characters of the float timestamp, read as an int.
            miaosha_begin_time = int(str(time.mktime(begin_struct))[0:10])

            parser = TmallParse(logger=self.lg)
            tmp_sql_server = await _get_new_db_conn(
                db_obj=tmp_sql_server,
                index=index,
                logger=self.lg,
                remainder=20,
            )

            if tmp_sql_server.is_connect_success:
                recent_flag = await self.is_recent_time(miaosha_begin_time)
                if recent_flag == 0:
                    tmp_sql_server._update_table(
                        sql_str=tb_update_str_4,
                        params=(row[0], ))
                    expired_part = '过期的goods_id为(%s)' % row[0]
                    time_part = ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time
                    self.lg.info(expired_part + time_part)
                    await async_sleep(.3)

                else:
                    # Any other return value: the goods is inside the window
                    # that still has to be updated.
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (row[0], str(index)))
                    # NOTICE: all rows are same-day data, so shelf/off-shelf
                    # times are not touched here, only the goods data.
                    goods_id = parser.get_goods_id_from_url(row[2])

                    parser.get_goods_data(goods_id=goods_id)
                    goods_data = parser.deal_with_data()

                    if goods_data == {}:
                        await async_sleep(5)
                    else:
                        goods_data['goods_id'] = row[0]

                        await parser._update_taoqianggou_xianshimiaosha_table(
                            data=goods_data, pipeline=tmp_sql_server)
                        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)

                index += 1

            del parser

        collect()

        return
    async def deal_with_tmcs_goods_id_list(self):
        """
        Crawl every pending tmcs goods_id and store the ones not in the db.

        Walks ``self.db_wait_2_save_goods_id_list``; each id that is missing
        from ``self.db_existed_goods_id_list`` is fetched through
        ``TmallParse`` and inserted, and successfully inserted ids are
        appended to ``self.db_existed_goods_id_list``.

        NOTE(review): ``sleep`` is called without ``await`` inside this
        coroutine - presumably the blocking ``time.sleep``; confirm that
        stalling the event loop here is acceptable.

        :return: True once every id has been handled; False as soon as one
            goods has at most one main image
        """
        self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
        for pending_id in self.db_wait_2_save_goods_id_list:
            # eg: '61864164616'
            goods_id = pending_id

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue

            parser = TmallParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=self.sql_cli_remainder,
            )
            if not self.sql_cli.is_connect_success:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            else:
                goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(
                    goods_id)
                # Below, goods_id becomes a list of the type plus the goods id.
                goods_id = parser.get_goods_id_from_url(goods_url)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                    continue

                self.lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (goods_id[1], str(self.add_goods_index)))
                parser.get_goods_data(goods_id)
                data = parser.deal_with_data()
                # From here on goods_id is the plain id again.
                goods_id = goods_id[1]
                if data != {}:
                    data['goods_id'] = goods_id
                    data['username'] = '******'
                    data['main_goods_id'] = None
                    data['goods_url'] = parser._from_tmall_type_get_tmall_url(
                        type=data['type'],
                        goods_id=goods_id,
                    )
                    if data['goods_url'] == '':
                        self.lg.error('该goods_url为空值! 此处跳过!')
                        continue

                    if len(data['all_img_url']) <= 1:
                        self.lg.info(
                            '[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                        # NOTE(review): the log text says "pass", yet this
                        # leaves the whole method and drops the remaining
                        # ids - confirm whether `continue` was intended.
                        return False

                    inserted = parser.old_tmall_goods_insert_into_new_table(
                        data=data, pipeline=self.sql_cli)
                    if inserted:
                        # Avoid crawling the same goods again later on.
                        self.db_existed_goods_id_list.append(goods_id)

            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.lg.info('tmcs已经抓取完毕!')

        return True
예제 #15
0
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        Refresh the goods that are already stored in the flash-sale table.

        Rows whose flash sale is reported as expired by ``is_recent_time``
        (return value 0) are handed to
        ``_handle_goods_shelves_in_auto_goods_table``; all other rows are
        crawled again through ``TmallParse`` and updated.

        :param tmp_sql_server: db client; re-assigned on every row from
            ``_get_new_db_conn``
        :param result: iterable of rows; the code reads row[0] as the
            goods_id, row[1] as the miaosha_time value and row[2] as the
            goods url
        :return: None
        '''
        index = 1
        for row in result:
            _goods_id, miaosha_time = row[0], row[1]
            miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
                miaosha_time=miaosha_time,
                logger=self.lg,
            )

            parser = TmallParse(logger=self.lg)
            tmp_sql_server = await _get_new_db_conn(
                db_obj=tmp_sql_server,
                index=index,
                logger=self.lg,
                remainder=20,
            )

            if tmp_sql_server.is_connect_success:
                recent_flag = await self.is_recent_time(miaosha_begin_time)
                if recent_flag == 0:
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=_goods_id,
                        logger=self.lg,
                        update_sql_str=tb_update_str_4,
                        sql_cli=tmp_sql_server,
                    )
                    expired_part = '过期的goods_id为(%s)' % _goods_id
                    time_part = ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time
                    self.lg.info(expired_part + time_part)
                    await async_sleep(.3)

                else:
                    # Any other return value: the goods is inside the window
                    # that still has to be updated.
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (_goods_id, str(index)))
                    # NOTICE: all rows are same-day data, so shelf/off-shelf
                    # times are not touched here, only the goods data.
                    goods_id = parser.get_goods_id_from_url(row[2])

                    parser.get_goods_data(goods_id=goods_id)
                    goods_data = parser.deal_with_data()

                    if goods_data == {}:
                        await async_sleep(5)
                    else:
                        goods_data['goods_id'] = _goods_id

                        await parser._update_taoqianggou_xianshimiaosha_table(
                            data=goods_data, pipeline=tmp_sql_server)
                        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)

                index += 1

            del parser

        collect()

        return
예제 #16
0
def run_forever():
    """Endlessly refresh the goods rows returned by ``tm_select_str_3``.

    Each pass of the outer loop:

    1. builds a logger whose file name contains the current Shanghai date,
    2. selects the rows to refresh from the database,
    3. re-crawls every row with ``TmallParse`` and writes the refreshed data
       back through ``to_right_and_update_data``,
    4. sleeps (5.5 hours when the current hour is 0, otherwise 5 seconds).

    Row columns as they are used below: ``item[0]`` stored code that is
    mapped to the parser's type (3 -> 0, 4 -> 1, 6 -> 2), ``item[1]``
    goods_id, ``item[2]`` is_delete, ``item[3]`` / ``item[4]`` old price /
    old taobao_price, ``item[5]`` / ``item[6]`` shelf_time / delete_time,
    ``item[7]`` old sku info, ``item[8]`` previous is_price_change flag.

    This function never returns.
    """
    # Stored code in item[0] -> type value placed first in the goods_id list
    # handed to TmallParse.get_goods_data.
    stored_code_to_type = {3: 0, 4: 1, 6: 2}

    while True:
        # Build the logger inside the loop rather than as a module-level
        # global; otherwise every pass would keep writing to the same file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # ---- real-time data update ----
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=tm_select_str_3))
        except TypeError:
            # list() got a non-iterable back from the select; the original
            # treats this as a failed database connection.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # Memory optimisation: the parser is created here and then
            # periodically dropped and recreated inside the loop.
            tmall = TmallParse(logger=my_lg)
            for item in result:  # refresh each row
                if index % 5 == 0:
                    # Drop the old parser before creating the new one so the
                    # old instance can be reclaimed. `tmall` is always bound
                    # here, so no exception guard is needed around `del`.
                    del tmall
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:
                    # Reconnect every 10 rows to avoid errors from a single
                    # long-lived connection going unresponsive.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))

                    # Convert the stored code to the parser's type; unknown
                    # codes contribute nothing, exactly as the original
                    # if/elif chain did.
                    tmp_item = []
                    goods_type = stored_code_to_type.get(item[0])
                    if goods_type is not None:
                        tmp_item.append(goods_type)
                    tmp_item.append(item[1])

                    oo = tmall.get_goods_data(goods_id=tmp_item)
                    # get_goods_data may return a bare int (the original
                    # comment mentions 4041). That case must be handled
                    # BEFORE calling .get() on the result, otherwise the
                    # int raises AttributeError and aborts the whole run.
                    if isinstance(oo, int):
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue

                    # Remembered so an empty parse of a deleted item does not
                    # trigger the extra 8s back-off below.
                    oo_is_delete = oo.get('is_detele', 0)

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[2],
                                shelf_time=item[5],
                                delete_time=item[6])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        site_id = tmall._from_tmall_type_get_site_id(
                            type=data['type'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[7]),
                                site_id=site_id)
                        except AttributeError:
                            # The stored value was already formatted.
                            old_sku_info = item[7]
                        # NOTE: this overwrites the '_is_price_change' value
                        # assigned from _get_price_change_info above.
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['price_info_list'], site_id=site_id),
                                is_price_change=item[8]
                                if item[8] is not None else 0)

                        tmall.to_right_and_update_data(data,
                                                       pipeline=tmp_sql_server)
                    elif oo_is_delete != 1:
                        # Parsed data is empty and the item is not flagged as
                        # deleted: back off before the next request.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    # Database connection is not usable for this row.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()