Пример #1
0
def get_one_tm_data(**kwargs):
    '''
    Fetch and parse the data of a single Tmall item url.

    :param kwargs: username (str, defaults to DEFAULT_USERNAME),
                   wait_to_deal_with_url (str, the raw item url),
                   my_lg (logger).
    :return: {'goods_id': ''} when no goods_id could be extracted,
             {'goods_id': ..., 'msg': ...} when the parsed data is empty,
             otherwise the processed goods data dict enriched with base info.
    '''
    username = kwargs.get('username', DEFAULT_USERNAME)
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    login_tmall = TmallParse(logger=my_lg)
    # get_goods_id_from_url returns a list, eg: [platform_type, goods_id];
    # empty list on failure
    goods_id = login_tmall.get_goods_id_from_url(wait_to_deal_with_url)
    if not goods_id:  # no goods_id could be extracted -> return an error marker
        my_lg.info('获取到的goods_id为空!')
        # `login_tmall` is always bound here, so the original
        # try/except around `del` was redundant
        del login_tmall
        gc.collect()

        return {'goods_id': ''}

    # Rebuild a clean canonical item url depending on the platform flag:
    # 0 -> tmall, 1 -> tmall supermarket (chaoshi), 2 -> tmall international
    #####################################################
    if goods_id[0] == 0:    # [0, '1111']
        wait_to_deal_with_url = 'https://detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 1:  # [1, '1111']
        wait_to_deal_with_url = 'https://chaoshi.detail.tmall.com/item.htm?id=' + goods_id[1]
    elif goods_id[0] == 2:  # [2, '1111', 'https://xxxxx']
        wait_to_deal_with_url = str(goods_id[2]) + '?id=' + goods_id[1]

    tmp_result = login_tmall.get_goods_data(goods_id=goods_id)
    data = login_tmall.deal_with_data()  # dict with the parsed goods data on success

    sleep(TMALL_SLEEP_TIME)  # may be shortened to .5s when running on the server
    if data == {} or tmp_result == {}:
        my_lg.info('获取到的data为空!')
        del login_tmall
        gc.collect()

        return {'goods_id': goods_id[1], 'msg': 'data为空!'}

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id[1])
    del login_tmall

    return wait_to_save_data
Пример #2
0
    async def _deal_with_all_goods_id(self):
        '''
        Fetch the goods info of every detailed category and insert the goods
        that are not yet in the db.
        :return: None
        '''
        _data = await self._get_all_goods_list()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        index = 1
        if my_pipeline.is_connect_success:
            self.my_lg.info('正在获取淘抢购db原有goods_id, 请耐心等待...')
            sql_str = r'select goods_id from dbo.tao_qianggou_xianshimiaosha where site_id=28'
            db_ = list(my_pipeline._select_table(sql_str=sql_str))
            db_all_goods_id = [item[0] for item in db_]
            self.my_lg.info('获取完毕!!!')

            for item in _data:
                miaosha_goods_list = await self._get_taoqianggou_goods_list(data=item.get('data', []))

                for tmp_item in miaosha_goods_list:
                    if tmp_item.get('goods_id', '') in db_all_goods_id:
                        # the goods is already stored in the db -> skip it
                        self.my_lg.info('该goods_id[%s]已存在db中' % tmp_item.get('goods_id', ''))
                        continue

                    if index % 50 == 0:  # reconnect every 50 goods to avoid an unresponsive long-lived connection
                        self.my_lg.info('正在重置,并与数据库建立新连接中...')
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        self.my_lg.info('与数据库的新连接成功建立...')

                    if my_pipeline.is_connect_success:
                        tmall = TmallParse(logger=self.my_lg)
                        tmp_url = 'https://detail.tmall.com/item.htm?id={0}'.format(tmp_item.get('goods_id'))
                        goods_id = tmall.get_goods_id_from_url(tmp_url)

                        tmall.get_goods_data(goods_id=goods_id)
                        goods_data = tmall.deal_with_data()

                        if goods_data != {}:
                            goods_data['goods_id'] = tmp_item.get('goods_id')
                            goods_data['spider_url'] = tmp_url
                            goods_data['miaosha_time'] = tmp_item.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=tmp_item.get('miaosha_time'))
                            goods_data['page'] = tmp_item.get('page')
                            goods_data['spider_time'] = tmp_item.get('spider_time')

                            tmall.insert_into_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                            await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)

                        else:
                            await asyncio.sleep(5)

                        # `tmall` is always bound in this branch, so a plain
                        # del (without try/except) is safe
                        del tmall
                        gc.collect()

                    # BUG FIX: `index` was never incremented, so the periodic
                    # db reconnect above could never trigger (compare with
                    # _update_old_goods_info, which does increment it).
                    index += 1
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        Update old goods data, deleting goods whose miaosha window has expired.
        :param tmp_sql_server: db pipeline (replaced by a fresh one every 50 rows)
        :param result: iterable of rows (goods_id, miaosha json str, spider url)
        :return: None
        '''
        index = 1
        for item in result:  # real-time update of each row
            miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
            # seconds-resolution unix timestamp of the miaosha begin time
            miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])

            tmall = TmallParse(logger=self.my_lg)
            if index % 50 == 0:  # reconnect every 50 rows to avoid an unresponsive long-lived connection
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if await self.is_recent_time(miaosha_begin_time) == 0:
                    # BUG FIX: params must be a sequence -- `(item[0])` is just
                    # item[0] (the parentheses are grouping, not a tuple).
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                    self.my_lg.info('过期的goods_id为(%s)' % item[0] + ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))

                else:   # 1 means the goods is inside the to-be-updated window
                    self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))

                    # NOTICE: all rows are from the current day, so only the
                    # goods data is updated here, not the shelf times.
                    goods_id = tmall.get_goods_id_from_url(item[2])

                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()

                    if goods_data != {}:
                        goods_data['goods_id'] = item[0]

                        await tmall._update_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                        await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        await asyncio.sleep(5)

                index += 1

            # `tmall` is created every iteration and always bound here
            del tmall

        gc.collect()

        return
Пример #4
0
def test_tm_m():
    """Manually exercise the Tmall parser on one sample goods id."""
    # TODO: some goods are on presale and cannot currently be bought; they are
    # not updated until their status is back to normal.
    sample_id = '575090086713'
    pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(sample_id)
    phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(sample_id)
    print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

    parser = TmallParse(is_real_times_update_call=True)
    parsed_goods_id = parser.get_goods_id_from_url(tmall_url=pc_url)
    ori_data = parser.get_goods_data(goods_id=parsed_goods_id)
    parsed = parser.deal_with_data()
    pprint(parsed)

    try:
        del parser
    except:
        pass
Пример #5
0
    def _tmall_keywords_spider(self, **kwargs):
        '''
        Crawl tmall goods for one keyword and link each handled goods to the
        keyword in the middle table.

        :param kwargs: goods_id_list (list of goods urls, possibly with a
                       trailing '&skuId=...' part), keyword_id (db id of the
                       keyword)
        :return: True once the keyword has been fully processed
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # drop any trailing '&skuId=...' query part and restore the scheme
        goods_url_list = [
            'https:' + re.compile('&skuId=.*').sub('', item)
            for item in goods_id_list
        ]

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # whether this goods was inserted (or already in db)
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True  # already stored in the db
                pass
            else:
                tmall = TmallParse(logger=self.my_lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 goods to avoid an unresponsive long-lived connection
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    # NOTE: goods_id is rebound here to the [type, id] list
                    # returned by get_goods_id_from_url, then back to the
                    # plain id string a few lines below
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                            % (goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data[
                                'goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                    type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.my_lg.error('该goods_url为空值! 此处跳过!')
                                continue

                            result = tmall.old_tmall_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # only link goods that were inserted or already existed
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True
Пример #6
0
# coding:utf-8
'''
@author = super_fazai
@File    : test_tm_m.py
@connect : [email protected]
'''
"""
测试tm m
"""

from sys import path as sys_path
sys_path.append('..')

from multiplex_code import get_tm_m_body_data
from tmall_parse_2 import TmallParse
from pprint import pprint

# Sample goods id used to exercise the Tmall parser.
goods_id = '589363967773'
# data = get_tm_m_body_data(goods_id=goods_id)
# pprint(data)
# Build the desktop and mobile item urls for the sample goods id.
pc_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id)
phone_url = 'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id)
print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

# NOTE(review): is_real_times_update_call presumably switches TmallParse into
# its real-time-update code path -- confirm in tmall_parse_2.
tm = TmallParse(is_real_times_update_call=True)
# get_goods_id_from_url returns a [type, goods_id] list parsed from the url.
goods_id = tm.get_goods_id_from_url(tmall_url=pc_url)
tm.get_goods_data(goods_id=goods_id)
data = tm.deal_with_data()  # parsed goods data dict (empty dict on failure)
pprint(data)
Пример #7
0
    def _tmall_keywords_spider(self, **kwargs):
        """
        Crawl tmall goods for one keyword and link each handled goods to the
        keyword in the middle table.

        :param kwargs: goods_id_list (list of goods urls, possibly with a
                       trailing '&skuId=...' part), keyword_id (db id of the
                       keyword)
        :return: True once the keyword has been processed; False as soon as
                 one goods fails the legality check
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # drop any trailing '&skuId=...' query part and restore the scheme
        goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:
            # item is a goods_url
            # whether this goods was inserted (or already present in the db)
            result = False
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True   # already stored in the db
                pass
            else:
                tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
                # periodically swap in a fresh db connection (every `remainder` goods)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20, )
                if self.sql_cli.is_connect_success:
                    # NOTE: goods_id is rebound here to the [type, id] list
                    # returned by get_goods_id_from_url, then back to the
                    # plain id string a few lines below
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = tmall._from_tmall_type_get_tmall_url(type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.lg.error('该goods_url为空值! 此处跳过!')
                                continue

                            if not self.check_target_data_is_legal(target_data=data):
                                # NOTE(review): this aborts the whole keyword
                                # on a single illegal goods -- confirm that
                                # `continue` was not intended instead.
                                return False

                            result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.sql_cli)
                        else:
                            pass

                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:
                # only link goods that were inserted or already existed
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)
            else:
                pass

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True
Пример #8
0
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        Update old goods data, taking goods whose miaosha window has expired
        off the shelves.
        :param tmp_sql_server: db pipeline (may be replaced by a fresh one)
        :param result: iterable of rows (goods_id, miaosha json str, spider url)
        :return: None
        '''
        index = 1
        for item in result:  # real-time update of each row
            miaosha_begin_time = json_2_dict(
                json_str=item[1],
                logger=self.lg,
            ).get('miaosha_begin_time')
            # seconds-resolution unix timestamp of the miaosha begin time
            miaosha_begin_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_begin_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # self.lg.info(str(miaosha_begin_time))

            tmall = TmallParse(logger=self.lg)
            # periodically swap in a fresh db connection (every `remainder` rows)
            tmp_sql_server = await _get_new_db_conn(
                db_obj=tmp_sql_server,
                index=index,
                logger=self.lg,
                remainder=20,
            )

            if tmp_sql_server.is_connect_success:
                if await self.is_recent_time(miaosha_begin_time) == 0:
                    # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                    # NOTE(review): the row is now soft-removed via an update
                    # (tb_update_str_4) instead of a delete, but the log text
                    # below still says "deleted" -- confirm wording is ok.
                    tmp_sql_server._update_table(sql_str=tb_update_str_4,
                                                 params=(item[0], ))
                    self.lg.info('过期的goods_id为(%s)' % item[0] +
                                 ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time)
                    await async_sleep(.3)

                else:  # 1 means the goods is inside the to-be-updated window
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                    goods_id = tmall.get_goods_id_from_url(item[2])

                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()

                    if goods_data != {}:
                        # self.lg.info(str(item))
                        goods_data['goods_id'] = item[0]

                        await tmall._update_taoqianggou_xianshimiaosha_table(
                            data=goods_data, pipeline=tmp_sql_server)
                        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        await async_sleep(5)

                index += 1

            try:
                del tmall
            except:
                pass

        collect()

        return
    async def deal_with_tmcs_goods_id_list(self):
        '''
        Crawl every goods id waiting in self.db_wait_2_save_goods_id_list and
        insert the goods that are not yet in the db.
        :return: True once the whole wait list has been processed
        '''
        self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
        for item in self.db_wait_2_save_goods_id_list:
            # eg: '61864164616'
            goods_id = item

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue

            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            # periodically swap in a fresh db connection
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=self.sql_cli_remainder,
            )
            if self.sql_cli.is_connect_success:
                # a 'spm' query arg would also be accepted by
                # get_goods_id_from_url; the bare id url is enough here
                goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(
                    goods_id)
                # goods_id is rebound to the [type, goods_id] list returned by
                # get_goods_id_from_url, then back to the plain id below
                goods_id = tmall.get_goods_id_from_url(goods_url)
                if not goods_id:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                    continue

                self.lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (goods_id[1], str(self.add_goods_index)))
                tmall.get_goods_data(goods_id)
                data = tmall.deal_with_data()
                goods_id = goods_id[1]
                if data != {}:
                    data['goods_id'] = goods_id
                    data['username'] = '******'
                    data['main_goods_id'] = None
                    data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                        type=data['type'],
                        goods_id=goods_id,
                    )
                    if data['goods_url'] == '':
                        self.lg.error('该goods_url为空值! 此处跳过!')
                        continue

                    if len(data['all_img_url']) <= 1:
                        self.lg.info(
                            '[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                        # BUG FIX: was `return False`, which aborted the whole
                        # remaining wait list on a single bad goods; the log
                        # message says "pass", so just skip this goods.
                        continue

                    result = tmall.old_tmall_goods_insert_into_new_table(
                        data=data, pipeline=self.sql_cli)
                    if result:
                        # remember it so it is not re-crawled later
                        self.db_existed_goods_id_list.append(goods_id)

            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.lg.info('tmcs已经抓取完毕!')

        return True
Пример #10
0
    async def _update_old_goods_info(self, tmp_sql_server, result):
        '''
        Update old goods data, taking goods whose miaosha window has expired
        off the shelves.
        :param tmp_sql_server: db pipeline (may be replaced by a fresh one)
        :param result: iterable of rows (goods_id, miaosha_time, spider url)
        :return: None
        '''
        index = 1
        for item in result:  # real-time update of each row
            _goods_id = item[0]
            miaosha_time = item[1]
            miaosha_begin_time, miaosha_end_time = await async_get_ms_begin_time_and_miaos_end_time_from_ms_time(
                miaosha_time=miaosha_time,
                logger=self.lg,
            )

            tmall = TmallParse(logger=self.lg)
            # periodically swap in a fresh db connection (every `remainder` rows)
            tmp_sql_server = await _get_new_db_conn(
                db_obj=tmp_sql_server,
                index=index,
                logger=self.lg,
                remainder=20,
            )

            if tmp_sql_server.is_connect_success:
                if await self.is_recent_time(miaosha_begin_time) == 0:
                    # expired: take the goods off the shelves in the db
                    _handle_goods_shelves_in_auto_goods_table(
                        goods_id=_goods_id,
                        logger=self.lg,
                        update_sql_str=tb_update_str_4,
                        sql_cli=tmp_sql_server,
                    )
                    self.lg.info('过期的goods_id为(%s)' % _goods_id +
                                 ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_begin_time)
                    await async_sleep(.3)

                else:  # 1 means the goods is inside the to-be-updated window
                    self.lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (_goods_id, str(index)))
                    '''NOTICE: 由于都是当天数据, 此处不更新上下架时间,就更新商品数据'''
                    goods_id = tmall.get_goods_id_from_url(item[2])

                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()

                    if goods_data != {}:
                        # self.lg.info(str(item))
                        goods_data['goods_id'] = _goods_id

                        await tmall._update_taoqianggou_xianshimiaosha_table(
                            data=goods_data, pipeline=tmp_sql_server)
                        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:
                        await async_sleep(5)

                index += 1

            try:
                del tmall
            except:
                pass

        collect()

        return