Exemplo n.º 1
0
 def __init__(self, logger=None):
     """Set up request headers, result storage, the logger and the driver.

     :param logger: optional logger, passed straight to ``_set_logger``
     """
     self._set_headers()
     self._set_pc_headers()
     self.result_data = dict()
     self._set_logger(logger)
     # The driver receives ``self.my_lg`` (presumably assigned by
     # ``_set_logger``), so it is created last.
     driver_options = {
         'executable_path': PHANTOMJS_DRIVER_PATH,
         'logger': self.my_lg,
     }
     self.my_phantomjs = MyPhantomjs(**driver_options)
Exemplo n.º 2
0
 def __init__(self):
     """Set up headers, the two sleep-time settings, the driver and the
     base path used for QR codes."""
     self._set_headers()
     self.page_sleep_time, self.phantomjs_sleep_time = 1.2, 2
     # Images are only loaded when load_images is True.
     driver_options = {'load_images': True}
     self.my_phantomjs = MyPhantomjs(**driver_options)
     self.qrcode_base_path = '/Users/afa/myFiles/tmp/外卖券qrcode/'
Exemplo n.º 3
0
 def __init__(self, logger=None):
     """Run the parent initialiser, then set up headers, state, logger
     and the driver.

     :param logger: optional logger, passed straight to ``_set_logger``
     """
     super(ALi1688LoginAndParse, self).__init__()
     self._set_headers()
     self.result_data, self.is_activity_goods = dict(), False
     self._set_logger(logger)
     # The driver receives ``self.my_lg`` (presumably assigned by
     # ``_set_logger``), so it is created last.
     driver_options = {
         'executable_path': PHANTOMJS_DRIVER_PATH,
         'logger': self.my_lg,
     }
     self.my_phantomjs = MyPhantomjs(**driver_options)
Exemplo n.º 4
0
 def __init__(self, logger=None):
     """Set up result storage, logger, headers, the driver and cookies.

     :param logger: optional logger, passed straight to ``_set_logger``
     """
     self.result_data, self.msg = dict(), ''
     self._set_logger(logger)
     self._set_headers()
     # Sleep time used when moving to the next comment page.
     self.comment_page_switch_sleep_time = 1.2
     driver_options = {'executable_path': PHANTOMJS_DRIVER_PATH}
     self.my_phantomjs = MyPhantomjs(**driver_options)
     self._add_headers_cookies()
Exemplo n.º 5
0
 def __init__(self, logger=None):
     """Set up sort-type names, logger, DB pipeline, update SQL, the
     driver and the bookkeeping fields.

     :param logger: optional logger, passed straight to ``_set_logger``
     """
     self._set_sort_type_name()
     self._set_logger(logger)
     self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
     # Parameterised statement: head_img_url, modify_time, id.
     self.update_sql = (
         'update dbo.sina_weibo '
         'set head_img_url=%s, modify_time=%s '
         'where id=%s'
     )
     driver_options = {
         'executable_path': PHANTOMJS_DRIVER_PATH,
         'logger': self.my_lg,
     }
     self.phantomjs = MyPhantomjs(**driver_options)
     self.id_list, self.update_index = list(), 0
Exemplo n.º 6
0
 def __init__(self, logger=None):
     """Set up result storage, logger, headers, paging settings, the
     driver and the temporary data holders.

     :param logger: optional logger, passed straight to ``_set_logger``
     """
     self.result_data, self.msg = dict(), ''
     self._set_logger(logger)
     self._set_headers()
     self.page_size = '10'
     # Sleep time used when moving to the next comment page.
     self.comment_page_switch_sleep_time = 1.5
     driver_options = {'executable_path': PHANTOMJS_DRIVER_PATH}
     self.my_phantomjs = MyPhantomjs(**driver_options)
     # Temporary data.
     self.g_data = dict()
     # Temporary data (holds every specification of the current goods).
     self.random_sku_info_list = list()
 def __init__(self, base_path='/Users/afa/myFiles/tmp/基金/伪好基/'):
     """Set up paging limits, crawl delay, plot holder, storage path and
     the driver.

     :param base_path: path under which fund images are stored
     """
     # Start page of the open-fund ranking and (presumably) its last page.
     self.page_num_start, self.page_num_end = 1, 3
     # Sleep time applied while crawling each fund.
     self.CRAWL_FUND_TIME = 1.5
     self.plot_pic = None
     self.base_path = base_path
     driver_options = {'executable_path': PHANTOMJS_PATH}
     self.my_phantomjs = MyPhantomjs(**driver_options)
Exemplo n.º 8
0
    async def get_pintuan_goods_info(self):
        """Collect the current limited-time group-buy goods of every tab.

        For each key of ``self.tab_dict`` pages 1..19 are requested through
        ``get_one_page_goods_list`` and every goods dict is kept once per
        ``goods_id``.

        :return: list of goods dicts, unique by ``goods_id``
        """
        s_time = time.time()
        goods_list = []
        # Ids already collected. Kept in sync with goods_list so that
        # duplicates inside a single page are dropped as well, and so the
        # lookup no longer rebuilds a list for every page.
        seen_goods_ids = set()
        my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH,
                                   logger=self.my_lg)
        try:
            for key in self.tab_dict:
                self.msg = '正在抓取的分类为: ' + key
                self.my_lg.info(self.msg)
                for index in range(1, 20):
                    item_list = await self.get_one_page_goods_list(
                        my_phantomjs=my_phantomjs,
                        key=key,
                        tab=self.tab_dict[key],
                        index=index)

                    for item in item_list:
                        goods_id = item.get('goods_id', '')
                        if goods_id in seen_goods_ids:
                            continue
                        seen_goods_ids.add(goods_id)
                        goods_list.append(item)
        finally:
            # Drop the local reference even when a page request raised.
            del my_phantomjs

        self.my_lg.info(str(goods_list))
        self.my_lg.info('本次抓到所有拼团商品个数为: ' + str(len(goods_list)))
        e_time = time.time()
        self.my_lg.info('总用时:' + str(e_time - s_time))
        await asyncio.sleep(3)

        return goods_list
Exemplo n.º 9
0
class JdParse(object):
    def __init__(self, logger=None):
        """Set up both header sets, result storage, the logger and the driver.

        :param logger: optional logger; ``None`` makes ``_set_logger`` create one
        """
        self._set_headers()
        self._set_pc_headers()
        self.result_data = dict()
        self._set_logger(logger)
        # ``_set_logger`` assigns ``self.my_lg``, which the driver receives,
        # so the driver is created last.
        driver_options = {
            'executable_path': PHANTOMJS_DRIVER_PATH,
            'logger': self.my_lg,
        }
        self.my_phantomjs = MyPhantomjs(**driver_options)

    def _set_logger(self, logger):
        if logger is None:
            self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                    '/jd/_/' + str(get_shanghai_time())[0:10] +
                                    '.txt',
                                    console_log_level=INFO,
                                    file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        """Build ``self.headers`` with a randomly chosen PC user agent."""
        # 'Accept-Encoding' is left out (it was disabled in the original code).
        header_pairs = [
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Language', 'zh-CN,zh;q=0.8'),
            ('Cache-Control', 'max-age=0'),
            ('Connection', 'keep-alive'),
            ('Host', 'jd.com;jd.hk'),
            ('User-Agent', get_random_pc_ua()),  # random user agent
        ]
        self.headers = dict(header_pairs)

    def _set_pc_headers(self):
        """Build ``self.pc_headers`` with a randomly chosen PC user agent."""
        # PC headers; per the original note only lower-case names are
        # recognised. 'accept-encoding' is left out (disabled originally).
        header_pairs = [
            ('accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
            ('accept-language', 'zh-CN,zh;q=0.9'),
            ('cache-control', 'max-age=0'),
            ('connection', 'keep-alive'),
            ('user-agent', get_random_pc_ua()),
        ]
        self.pc_headers = dict(header_pairs)

    def get_goods_data(self, goods_id):
        '''
        模拟构造得到data的url
        :param goods_id:
        :return: data   类型dict
        '''
        if goods_id == []:
            self.my_lg.error('goods_id为空list')
            return self._data_error_init()

        if isinstance(self._get_need_url(goods_id=goods_id), dict):  # 即返回{}
            return self._data_error_init()

        self.error_record = '出错goods_id:{0}'.format(goods_id[1])

        phone_url, tmp_url, comment_url = self._get_need_url(goods_id=goods_id)
        self.my_lg.info('------>>>| 得到的移动端地址为: {0}'.format(phone_url))

        # self.my_lg.info(str(tmp_url))
        if goods_id[0] == 1:  # ** 注意: 先预加载让driver获取到sid **
            # 研究分析发现京东全球购,大药房商品访问需要cookies中的sid值
            self.my_phantomjs.use_phantomjs_to_get_url_body(
                url='https://mitem.jd.hk/cart/cartNum.json')
        elif goods_id[0] == 2:
            # 研究分析发现京东全球购,大药房商品访问需要cookies中的sid值
            self.my_phantomjs.use_phantomjs_to_get_url_body(
                url='https://m.yiyaojd.com/cart/cartNum.json')

        # 得到总销售量
        comment_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=comment_url)
        if comment_body == '':  # 网络问题或者ip切换出错
            return self._data_error_init()

        comment_body = self._wash_url_body(body=comment_body)
        # self.my_lg.info(str(comment_body))
        comment_body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(
            comment_body)
        if comment_body_1 != []:
            comment_data = comment_body_1[0]
            comment_data = json_2_dict(json_str=comment_data)
            # pprint(comment_data)
            all_sell_count = comment_data.get('wareDetailComment',
                                              {}).get('allCnt', '0')

        else:
            self.my_lg.error('获取到的comment的销售量data为空!' + self.error_record)
            return self._data_error_init()

        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        if body == '':
            return self._data_error_init()

        body = self._wash_url_body(body=body)
        # self.my_lg.info(str(body))

        body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)

        ## ** 起初是拿phantomjs来进行url请求的,本来想着用requests来优化,但是改动有点大,就先暂时不改动 **
        # body_1 = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        # if body_1 == '':
        #     body_1 = []
        # else:
        # # self.my_lg.info(str(body_1[0]))

        if body_1 != []:
            data = body_1[0]
            data = json_2_dict(json_str=data)
            if data == {}:
                self.my_lg.error(r'此处直接返回data为{}' + self.error_record)
                return self._data_error_init()

            # pprint(data)
            wdis = data.get('wdis', '')  # 图文描述
            data = data.get('ware', {})
            try:
                data.pop('wdisHtml')
                data.get('wi', {})['afterServiceList'] = []
            except Exception:
                pass

            # 处理'wi' 'code'
            if data.get('wi') is not None:
                # 用于获取p_info
                code = data.get('wi', {}).get('code', '')
                # self.my_lg.info('wi,code的为: {}'.format(code))
                if code != '':
                    code = json_2_dict(json_str=code)
                    try:
                        data.get('wi', {})['code'] = code
                    except Exception as e:  # 对应p_info解析错误的, 换方法解析
                        self.my_lg.info('wi中的code对应json解析错误, 为:', e)
                        code = data.get('wi', {}).get('wareQD', '')
                        data.get('wi', {})['code'] = code
            else:
                data['wi'] = {'code': []}

            # 处理wdis
            data['wdis'] = wdis

            # 商品总销售量
            data['all_sell_count'] = all_sell_count

            if data != {}:
                self.result_data = data
                # pprint(data)
                return data
            else:
                self.my_lg.error('获取到的data的key值ware为空!' + self.error_record)
                return self._data_error_init()

        else:
            self.my_lg.error('获取到的data为空!' + self.error_record)
            return self._data_error_init()

    def deal_with_data(self, goods_id):
        '''
        处理result_data, 返回需要的信息
        :return: 字典类型
        '''
        data = self.result_data
        if data != {}:
            shop_name = self._get_shop_name(data=data)
            account = ''
            title = data.get('wname', '')
            sub_title = ''
            detail_name_list = self._get_detail_name_list(data=data)
            '''
            要存储的每个标签对应规格的价格及其库存(京东无库存抓取, 只有对应规格商品是否可买)
            '''
            price_info_list = self.get_price_info_list(goods_id,
                                                       detail_name_list, data)
            # pprint(price_info_list)

            # 获取is_delete, price, taobao_price
            _ = self._get_price_and_taobao_price_and_is_delete(
                detail_name_list=detail_name_list,
                price_info_list=price_info_list,
                goods_id=goods_id)
            if _ == [0, '', '']:  # 异常退出
                return self._data_error_init()
            else:
                is_delete, price, taobao_price = _
            # self.my_lg.info('最高价: {0}, 最低价: {1}'.format(price, taobao_price))

            # 所有示例图片地址
            '''
            新增: 由于手机版获取到的jd示例图片数据有京东的水印,所以单独先通过pc端来获取图片,pc获取失败就用phone端的
            '''
            all_img_url = self.get_pc_no_watermark_picture(goods_id=goods_id)
            if all_img_url == {}:  # 意外退出
                return self._data_error_init()

            if all_img_url == []:  # 获取pc端失败, 即获取phone示例图
                if data.get('images') is not None:
                    all_img_url = [{
                        'img_url': item.get('bigpath')
                    } for item in data.get('images')]
                else:
                    all_img_url = []
            else:
                pass
            # pprint(all_img_url)

            p_info = self.get_p_info(data=data)
            # pprint(p_info)      # 爬取是手机端的所以没有第一行的,就是手机端的规格
            div_desc = self.get_right_div_desc(data=data)
            # self.my_lg.info(str(div_desc))
            jd_type = self._get_jd_type(is_jd_market=data.get('isJdMarket'),
                                        type=goods_id[0])
            # self.my_lg.info('jd_type为: {0}'.format(jd_type))

            # 商品总销售量
            all_sell_count = str(data.get('all_sell_count', '0'))
            if is_delete == 1:
                self.my_lg.info('**** 该商品已下架...')

            result = {
                'shop_name': shop_name,  # 店铺名称
                'account': account,  # 掌柜
                'title': title,  # 商品名称
                'sub_title': sub_title,  # 子标题
                'price': price,  # 商品价格
                'taobao_price': taobao_price,  # 淘宝价
                # 'goods_stock': goods_stock,             # 商品库存
                'detail_name_list': detail_name_list,  # 商品标签属性名称
                # 'detail_value_list': detail_value_list, # 商品标签属性对应的值
                'price_info_list':
                price_info_list,  # 要存储的每个标签对应规格的价格及其库存(京东隐藏库存无法爬取,只能能买或不能买)
                'all_img_url': all_img_url,  # 所有示例图片地址
                'p_info': p_info,  # 详细信息标签名对应属性
                # 'pc_div_url': pc_div_url,               # pc端描述地址
                'div_desc': div_desc,  # div_desc
                'is_delete': is_delete,  # 是否下架判断
                'jd_type': jd_type,  # 京东类型,(京东常规商品为7,京东超市为8)
                'all_sell_count': all_sell_count,  # 商品总销售量
            }
            # pprint(result)
            # self.my_lg.info(str(result))
            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # self.my_lg.info(str(json_data))
            gc.collect()
            return result

        else:
            self.my_lg.info('待处理的data为空的dict' + self.error_record)
            return {}

    def _data_error_init(self):
        '''
        错误初始化
        :return:
        '''
        self.result_data = {}

        return {}

    def _get_jd_type(self, is_jd_market, type):
        '''
        判断是否是京东商品类型
        '''
        # self.my_lg.info(str(data.get('isJdMarket')))
        if is_jd_market:  # False不是京东超市
            self.my_lg.info('该链接为京东超市')
            jd_type = 8  # 7为京东常规商品, 8表示京东超市, 9表示京东全球购, 10表示京东大药房
        elif type == 1:
            self.my_lg.info('该链接为京东全球购')
            jd_type = 9
        elif type == 2:
            self.my_lg.info('该链接为京东大药房')
            jd_type = 10
        else:
            jd_type = 7

        return jd_type

    def _get_price_and_taobao_price_and_is_delete(self, **kwargs):
        '''
        获取is_delete, price, taobao_price
        :return: [0, '', ''] 表示异常退出 | [x, xx, xx] 表示成功
        '''
        detail_name_list = kwargs.get('detail_name_list', [])
        price_info_list = kwargs.get('price_info_list', [])
        goods_id = kwargs.get('goods_id', [])
        # 是否下架判断
        is_delete = 0

        # 商品价格
        '''
        最高价和最低价处理  从已经获取到的规格对应价格中筛选最高价和最低价即可
        '''
        if detail_name_list == []:  # 说明没有规格,所有价格只能根据当前的goods_id来获取
            if self.from_ware_id_get_price_info(ware_id=goods_id)[0] == '暂无报价':
                is_delete = 1  # 说明已经下架
                price, taobao_price = (
                    0,
                    0,
                )
            else:
                try:
                    # self.my_lg.info(str(self.from_ware_id_get_price_info(ware_id=goods_id)[0]))
                    price = round(
                        float(
                            self.from_ware_id_get_price_info(
                                ware_id=goods_id)[0]), 2)
                    taobao_price = price
                except TypeError:
                    is_delete = 1  # 说明该商品暂无报价
                    price, taobao_price = (
                        0,
                        0,
                    )
        else:
            try:
                tmp_price_list = sorted([
                    round(float(item.get('detail_price', '')), 2)
                    for item in price_info_list
                ])
            except ValueError:
                self.my_lg.error('tmp_price_list的ValueError,此处设置为跳过' +
                                 self.error_record)
                return [0, '', '']

            # self.my_lg.info(str(tmp_price_list))
            if tmp_price_list != []:
                price = tmp_price_list[-1]
                taobao_price = tmp_price_list[0]
            else:
                self.my_lg.error('获取最高价最低价时错误' + self.error_record)
                return [0, '', '']

        return [is_delete, price, taobao_price]

    def _get_need_url(self, goods_id):
        '''
        获取需求的url
        :param goods_id:
        :return:
        '''
        phone_url = ''
        tmp_url = ''
        comment_url = ''
        if goods_id[0] == 0:  # 表示为京东常规商品
            phone_url = 'https://item.m.jd.com/ware/view.action?wareId=' + str(
                goods_id[1])
            # 用于得到常规信息
            tmp_url = 'https://item.m.jd.com/ware/detail.json?wareId=' + str(
                goods_id[1])
            comment_url = 'https://item.m.jd.com/ware/getDetailCommentList.json?wareId=' + str(
                goods_id[1])

        elif goods_id[0] == 1:  # 表示为京东全球购商品 (此处由于进口关税无法计算先不处理京东全球购)
            phone_url = 'https://mitem.jd.hk/ware/view.action?wareId=' + str(
                goods_id[1])
            tmp_url = 'https://mitem.jd.hk/ware/detail.json?wareId=' + str(
                goods_id[1])
            comment_url = 'https://mitem.jd.hk/ware/getDetailCommentList.json?wareId=' + str(
                goods_id[1])

            self.my_lg.info('此商品为京东全球购商品,由于进口关税无法计算,先不处理京东全球购')
            return {}

        elif goods_id[0] == 2:  # 表示京东大药房商品
            phone_url = 'https://m.yiyaojd.com/ware/view.action?wareId=' + str(
                goods_id[1])
            tmp_url = 'https://m.yiyaojd.com/ware/detail.json?wareId=' + str(
                goods_id[1])
            comment_url = 'https://m.yiyaojd.com/ware/getDetailCommentList.json?wareId=' + str(
                goods_id[1])

        return phone_url, tmp_url, comment_url

    def from_ware_id_get_price_info(self, ware_id):
        '''
        得到价格信息,由于过滤了requests所以用phantomjs
        '''
        price_url = ''
        if ware_id[0] == 0:  # 表示为京东常规商品
            price_url = 'https://item.m.jd.com/ware/getSpecInfo.json?wareId=' + str(
                ware_id[1])

        elif ware_id[0] == 1:  # 表示为京东全球购商品
            price_url = 'https://mitem.jd.hk/ware/getSpecInfo.json?wareId=' + str(
                ware_id[1])

        elif ware_id[0] == 2:  # 表示京东大药房商品
            price_url = 'https://m.yiyaojd.com/ware/getSpecInfo.json?wareId=' + str(
                ware_id[1])

        # self.my_lg.info(str(price_url))
        price_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=price_url)

        price_body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(price_body)
        if price_body_1 != []:
            price_data = json_2_dict(json_str=price_body_1[0])
            try:
                price_data.pop('defaultAddress')
                price_data.pop('commonConfigJson')
            except Exception:
                pass
            try:
                price_data.pop('newYanBaoInfo')
            except Exception:
                pass

            # 处理newYanBaoInfo
            new_yan_bao_info = price_data.get('newYanBaoInfo')
            if new_yan_bao_info is not None:
                new_yan_bao_info = json_2_dict(json_str=new_yan_bao_info)
                price_data['newYanBaoInfo'] = new_yan_bao_info

            # 处理allColorSet
            all_color_set = price_data.get('allColorSet')
            if all_color_set is not None:
                all_color_set = json_2_dict(json_str=all_color_set)
                price_data['allColorSet'] = all_color_set

            # 处理allSpecSet
            all_spec_set = price_data.get('allSpecSet')
            if all_spec_set is not None:
                all_spec_set = json_2_dict(json_str=all_spec_set)
                price_data['allSpecSet'] = all_spec_set

            # 处理allSizeSet
            all_size_set = price_data.get('allSizeSet')
            if all_size_set is not None:
                all_size_set = json_2_dict(json_str=all_size_set)
                price_data['allSizeSet'] = all_size_set

            # pprint(price_data)
            if price_data.get('wareMainImageUrl') is not None:
                main_image_url = price_data.get('wareMainImageUrl')
            else:
                main_image_url = ''

            return [
                price_data.get('warePrice', ''),  # 价格
                main_image_url,  # 主图地址
            ]

        else:
            # self.my_lg.error('获取到的price_data为空!')
            return []

    def _get_shop_name(self, data):
        '''
        获取shop_name
        :param data:
        :return:
        '''
        return data.get('shopInfo', {}).get('shop', {}).get('name', '') \
            if data.get('shopInfo', {}).get('shop') is not None \
            else ''

    def _get_detail_name_list(self, data):
        '''
        获取detail_name_list
        :param data:
        :return:
        '''
        detail_name_list = []
        color_size_title = data.get('skuColorSize',
                                    {}).get('colorSizeTitle', {})
        # pprint(data.get('skuColorSize', {}))
        # pprint(color_size_title)
        if color_size_title != {}:
            for key, value in color_size_title.items():
                img_here = 0
                if key == 'colorName':
                    if value is not None:
                        if value != '':  # 不为空则说明有图
                            img_here = 1

                detail_name_list.append({
                    'spec_name': value,
                    'img_here': img_here,
                })

        return detail_name_list

    def get_price_info_list(self, *params):
        '''
        得到规范的price_info_list
        :param *params:
        :return:
        '''
        goods_id = params[0]
        detail_name_list = params[1]
        data = params[2]
        # tmp_price_info_list = data.get('skuColorSize', {}).get('colorSize')
        # pprint(tmp_price_info_list)

        price_info_list = []
        if detail_name_list != []:  # 有规格
            tmp_price_info_list = data.get('skuColorSize', {}).get('colorSize')
            # pprint(tmp_price_info_list)
            if tmp_price_info_list is not None:
                for item in tmp_price_info_list:
                    tmp = {}
                    tmp_spec_value = []
                    if item.get('color') != '*':
                        tmp_spec_value.append(item.get('color'))

                    if item.get('size') != '*':
                        tmp_spec_value.append(item.get('size'))

                    if item.get('spec') != '*':
                        tmp_spec_value.append(item.get('spec'))

                    tmp_spec_value = '|'.join(tmp_spec_value)  # 具体规格
                    # self.my_lg.info(str(tmp_spec_value))

                    sku_id = item.get('skuId')
                    # 对每个sku_id就行一次请求,来获得对应sku_id的价格数据
                    if goods_id[0] == 0:
                        sku_id = [0, sku_id]
                    elif goods_id[0] == 1:
                        sku_id = [1, sku_id]
                    elif goods_id[0] == 2:
                        sku_id = [2, sku_id]
                    ware_price_and_main_img_url_list = self.from_ware_id_get_price_info(
                        ware_id=sku_id)

                    tmp['spec_value'] = tmp_spec_value
                    if ware_price_and_main_img_url_list != []:
                        tmp['detail_price'] = ware_price_and_main_img_url_list[
                            0]
                        tmp['img'] = ware_price_and_main_img_url_list[1]
                    else:
                        tmp['detail_price'] = ''
                        tmp['img'] = ''

                    tmp['rest_number'] = ''
                    if tmp.get(
                            'detail_price') is None:  # detail_price为None的跳过!
                        continue

                    price_info_list.append(tmp)
                    # pprint(price_info_list)

        return price_info_list

    def get_right_div_desc(self, data):
        '''
        得到处理后的div_desc
        :param data:
        :return:
        '''
        wdis = ''
        # 特殊处理script动态生成的
        if data.get('popWareDetailWebViewMap') is not None:
            if data.get('popWareDetailWebViewMap').get(
                    'cssContent') is not None:
                wdis = data.get('popWareDetailWebViewMap',
                                {}).get('cssContent', '')
                wdis = self._wash_div_desc(wdis=wdis)

        wdis = wdis + data.get('wdis', '')  # 如果获取到script就与wdis重组
        div_desc = self._wash_div_desc(wdis=wdis)

        return div_desc

    def _wash_div_desc(self, wdis):
        '''
        清洗div_desc
        :param wdis:
        :return:
        '''
        wdis = re.compile(r'&lt;').sub(
            '<', wdis
        )  # self.driver.page_source转码成字符串时'<','>'都被替代成&gt;&lt;此外还有其他也类似被替换
        wdis = re.compile(r'&gt;').sub('>', wdis)
        wdis = re.compile(r'&amp;').sub('&', wdis)
        wdis = re.compile(r'&nbsp;').sub(' ', wdis)
        wdis = re.compile(r'\n').sub('', wdis)
        wdis = re.compile(r'src=\"https:').sub('src=\"', wdis)  # 先替换部分带有https的
        wdis = re.compile(r'src="').sub('src=\"https:', wdis)  # 再把所欲的换成https的

        wdis = re.compile(r'<html>|</html>').sub('', wdis)
        wdis = re.compile(r'<head.*?>.*?</head>').sub('', wdis)
        wdis = re.compile(r'<body>|</body>').sub('', wdis)

        return wdis

    def get_p_info(self, data):
        '''
        得到p_info
        :param data:
        :return: list
        '''
        tmp_p_info = data.get('wi', {}).get('code')
        # pprint(tmp_p_info)
        p_info = []
        if tmp_p_info is not None:
            if isinstance(tmp_p_info, str):
                p_info = [{'p_name': '规格和包装', 'p_value': tmp_p_info}]
            elif isinstance(tmp_p_info, list):
                for item in tmp_p_info:
                    tmp = {}
                    tmp['p_name'] = list(item.keys())[0]
                    tmp_p_value = list(item.values())[0]
                    tmp_p_value_2 = []
                    if isinstance(tmp_p_value, list):
                        for i in tmp_p_value:
                            tmp_2 = {}
                            tmp_2['name'] = list(i.keys())[0]
                            tmp_2['value'] = list(i.values())[0]
                            tmp_p_value_2.append(tmp_2)
                        tmp['p_value'] = tmp_p_value_2
                    else:
                        tmp['p_value'] = tmp_p_value

                    p_info.append(tmp)
            else:
                pass

        return p_info

    def to_right_and_update_data(self, data, pipeline):
        """Update the stored record of a goods.

        :param data: processed goods dict (must carry ``jd_type``)
        :param pipeline: object providing ``_update_table_2``
        :return: whatever ``pipeline._update_table_2`` returns
        """
        site_id = self._from_jd_type_get_site_id_value(
            jd_type=data.get('jd_type'))
        tmp = _get_right_model_data(data=data, site_id=site_id)
        params = self.get_db_update_params(item=tmp)

        # Only the time columns that carry a value go into the statement;
        # get_db_update_params adds the matching parameters.
        if tmp['delete_time'] == '':
            time_columns = ('shelf_time=%s', '')
        elif tmp['shelf_time'] == '':
            time_columns = ('delete_time=%s', '')
        else:
            time_columns = ('shelf_time=%s,', 'delete_time=%s')
        sql_str = jd_update_str_1.format(*time_columns)

        return pipeline._update_table_2(sql_str=sql_str,
                                        params=params,
                                        logger=self.my_lg)

    def insert_into_jd_table(self, data, pipeline):
        site_id = self._from_jd_type_get_site_id_value(
            jd_type=data.get('jd_type'))
        if site_id == 0:
            self.my_lg.error('site_id获取异常, 请检查!')
            return False

        tmp = _get_right_model_data(data=data, site_id=site_id)

        self.my_lg.info('------>>>| 待存储的数据信息为:{0}'.format(tmp.get('goods_id')))

        pipeline.insert_into_jd_table(item=tmp)

        return True

    def old_jd_goods_insert_into_new_table(self, data, pipeline):
        '''
        老数据转到新表
        :param data:
        :param pipeline:
        :return:
        '''
        site_id = self._from_jd_type_get_site_id_value(
            jd_type=data.get('jd_type'))
        if site_id == 0:
            self.my_lg.error('site_id获取异常, 请检查!')
            return False

        tmp = _get_right_model_data(data=data, site_id=site_id)
        self.my_lg.info('------>>>| 待存储的数据信息为: {0}'.format(
            tmp.get('goods_id')))

        params = self._get_db_insert_params(item=tmp)
        if tmp.get('main_goods_id') is not None:
            sql_str = jd_insert_str_1

        else:
            sql_str = jd_insert_str_2

        result = pipeline._insert_into_table_2(sql_str=sql_str,
                                               params=params,
                                               logger=self.my_lg)

        return result

    def _get_db_insert_params(self, item):
        '''
        初始化存储参数
        :param item:
        :return:
        '''
        params = [
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            item['price'],
            item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False
                  ),  # 把list转换为json才能正常插入数据(并设置ensure_ascii=False)
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # 存入到PropertyInfo
            item['div_desc'],  # 存入到DetailInfo
            item['all_sell_count'],
            item['site_id'],
            item['is_delete'],
        ]

        if item.get('main_goods_id') is not None:
            params.append(item.get('main_goods_id'))

        return tuple(params)

    def get_db_update_params(self, item):
        '''
        得到db待更新参数
        :param item:
        :return:
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['all_sell_count'],
            # item['delete_time'],
            item['is_delete'],
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['sku_info_trans_time'],
            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])

        return tuple(params)

    def _wash_url_body(self, body):
        '''
        清洗body
        :param body:
        :return:
        '''
        body = re.compile('\n|\t|  ').sub('', body)

        return body

    def _from_jd_type_get_site_id_value(self, jd_type):
        '''
        根据jd_type来获取对应的site_id的值
        :param jd_type:
        :return: a int object
        '''
        # 采集的来源地
        if jd_type == 7:
            site_id = 7  # 采集来源地(京东)
        elif jd_type == 8:
            site_id = 8  # 采集来源地(京东超市)
        elif jd_type == 9:
            site_id = 9  # 采集来源地(京东全球购)
        elif jd_type == 10:
            site_id = 10  # 采集来源地(京东大药房)
        else:
            site_id = 0  # 表示错误

        return site_id

    def get_goods_id_from_url(self, jd_url):
        '''
        Extract the goods id and the JD site type from a goods url.

        Note: https://item.jd.com/xxxxx.html can be used as the initial
        address because JD redirects it to the correct one.
        :param jd_url: url to inspect
        :return: [site_type, goods_id] where site_type is 0 (regular JD goods,
                 including JD supermarket / JD selected), 1 (JD global
                 shopping) or 2 (JD pharmacy); [] when the url is not a
                 recognised goods url
        '''
        # (escaped host, site type, log template), checked in this order.
        known_sites = (
            (r'item\.jd\.com', 0, '------>>>| 得到的京东商品id为:{0}'),
            (r'item\.jd\.hk', 1, '------>>>| 得到的京东全球购商品id为:{0}'),
            (r'item\.yiyaojd\.com', 2, '------>>>| 得到的京东大药房商品id为:{}'),
        )
        for host, site_type, log_template in known_sites:
            # A single search validates the host and captures the id, so a
            # url without ".html" is rejected instead of raising IndexError.
            matched = re.search(r'https://' + host + r'/(.*?)\.html', jd_url)
            if matched is not None:
                goods_id = matched.group(1)
                self.my_lg.info(log_template.format(goods_id))
                return [site_type, goods_id]

        self.my_lg.info(
            '京东商品url错误, 非正规的url, 请参照格式(https://item.jd.com/)或者(https://item.jd.hk/)开头的...'
        )
        return []

    def get_pc_no_watermark_picture(self, goods_id):
        '''
        Fetch the watermark-free sample pictures from the pc goods page.
        :param goods_id: [site_type, goods_id], eg: [0, '111111']
        :return: {} when goods_id is empty or has an unknown site type |
                 [] when no picture could be obtained |
                 [{'img_url': 'xxxxx'}, ...] on success
        '''
        if goods_id == []:
            return {}
        elif goods_id[0] == 0:  # regular JD goods, JD supermarket
            pc_host = 'https://item.jd.com/'
        elif goods_id[0] == 1:  # JD global shopping
            pc_host = 'https://item.jd.hk/'
        elif goods_id[0] == 2:  # JD pharmacy
            pc_host = 'https://item.yiyaojd.com/'
        else:
            return {}
        tmp_pc_url = pc_host + str(goods_id[1]) + '.html'

        # Plain requests get redirected to the JD home page, so the page is
        # fetched through the phantomjs helper. The css selector targets the
        # sample pictures.
        tmp_pc_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=tmp_pc_url,
            css_selector='div#spec-list ul.lh li img')
        if tmp_pc_body == '':
            self.my_lg.info('#### 获取该商品的无水印示例图片失败! 导致原因: tmp_pc_body为空str!')
            return []

        thumb_path = re.compile(r'/n5.*?jfs/')
        try:
            raw_img_urls = list(
                Selector(text=tmp_pc_body).css(
                    'div#spec-list ul.lh li img::attr("src")').extract())
            all_img_url = []
            for raw_img_url in raw_img_urls:
                # Only scheme-less urls get the "https:" prefix; urls that
                # already carry a scheme are kept rather than dropped.
                if raw_img_url.startswith('http'):
                    img_url = raw_img_url
                else:
                    img_url = 'https:' + raw_img_url
                # Rewrite the /n5.../jfs/ path segment to /n1/jfs/.
                img_url = thumb_path.sub('/n1/jfs/', img_url)
                all_img_url.append({'img_url': img_url})
        except Exception:
            # exc_info attaches the traceback; passing the exception as a
            # positional arg without a placeholder breaks log formatting.
            self.my_lg.error('获取商品pc版无水印示例图片时出错: ', exc_info=True)
            all_img_url = []

        return all_img_url

    def __del__(self):
        try:
            del self.my_phantomjs
            del self.my_lg
        except:
            pass
        gc.collect()
Exemplo n.º 10
0
 def __init__(self):
     '''
     Call _set_headers(), start with an empty result_data dict and create
     the MyPhantomjs helper stored on self.my_phantomjs.
     '''
     self._set_headers()
     self.result_data = {}
     # self.set_cookies_key_api_uid()  # set the api_uid value in the cookie
     self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
Exemplo n.º 11
0
class ALi1688LoginAndParse(object):
    def __init__(self, logger=None):
        '''
        Initialise parser state, request headers, logger and browser helper.
        :param logger: logger to reuse; when None, _set_logger creates one
        '''
        super(ALi1688LoginAndParse, self).__init__()
        self.result_data = {}
        self.is_activity_goods = False
        self._set_headers()
        self._set_logger(logger)
        # The helper receives this parser's logger, so it is created last.
        self.my_phantomjs = MyPhantomjs(
            logger=self.my_lg,
            executable_path=PHANTOMJS_DRIVER_PATH,
        )

    def _set_logger(self, logger):
        if logger is None:
            self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                    '/1688/_/' +
                                    str(get_shanghai_time())[0:10] + '.txt',
                                    console_log_level=INFO,
                                    file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''
        Build the request headers stored on self.headers; the User-Agent is
        taken from get_random_pc_ua() on every call.
        '''
        header_pairs = (
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Language', 'zh-CN,zh;q=0.8'),
            ('Cache-Control', 'max-age=0'),
            ('Connection', 'keep-alive'),
            ('Host', '1688.com'),
            ('User-Agent', get_random_pc_ua()),  # a random request UA
        )
        self.headers = dict(header_pairs)

    def get_ali_1688_data(self, goods_id):
        if goods_id == '':
            return self._data_error_init()

        wait_to_deal_with_url = 'https://m.1688.com/offer/' + str(
            goods_id) + '.html'
        self.my_lg.info(
            '------>>>| 待处理的阿里1688地址为: {0}'.format(wait_to_deal_with_url))

        self.error_base_record = '出错goods_id:{0}'.format(goods_id)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=wait_to_deal_with_url, css_selector='div.d-content')
        # self.my_lg.info(str(body))
        if body == '':
            self.my_lg.error('获取到的body为空str!请检查!' + self.error_base_record)
            return self._data_error_init()

        tmp_body = body
        try:
            pull_off_shelves = Selector(
                text=body).css('div.d-content p.info::text').extract_first()
        except:
            pull_off_shelves = ''
        if pull_off_shelves == '该商品无法查看或已下架':  # 表示商品已下架, 同样执行插入数据操作
            try:
                tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                is_in_db = tmp_my_pipeline._select_table(
                    sql_str=al_select_str_1, params=(str(goods_id), ))
                # self.my_lg.info(str(is_in_db))
            except Exception:
                self.my_lg.error('数据库连接失败!' + self.error_base_record,
                                 exc_info=True)
                return self._data_error_init()

            if is_in_db != []:  # 表示该goods_id以前已被插入到db中, 于是只需要更改其is_delete的状态即可
                tmp_my_pipeline._update_table_2(sql_str=al_update_str_1,
                                                params=(goods_id),
                                                logger=self.my_lg)
                self.my_lg.info('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
                tmp_data_s = self.init_pull_off_shelves_goods()  # 初始化下架商品的属性
                tmp_data_s['before'] = True  # 用来判断原先该goods是否在db中
                self.result_data = {}

                return tmp_data_s

            else:  # 表示该goods_id没存在于db中
                self.my_lg.info('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')
                tmp_data_s = self.init_pull_off_shelves_goods()  # 初始化下架商品的属性
                tmp_data_s['before'] = False
                self.result_data = {}

                return tmp_data_s

        body = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall(
            body)
        if body != []:
            body = body[0]
            body = r'{"beginAmount"' + body
            # self.my_lg.info(str(body))
            body = json_2_dict(json_str=body)
            # pprint(body)

            if body.get('discountPriceRanges') is not None:
                self.result_data = self._wash_discountPriceRanges(body=body)
                return self.result_data
            else:
                self.my_lg.error('data为空!' + self.error_base_record)
                return self._data_error_init()

        else:
            self.my_lg.info('解析ing..., 该商品正在参与火拼, 此处为火拼价, 为短期活动价格!')
            body = re.compile(
                r'{"activityId"(.*?)</script></div></div>').findall(tmp_body)
            if body != []:
                body = body[0]
                body = r'{"activityId"' + body
                # self.my_lg.info(str(body))
                body = json_2_dict(json_str=body)
                # pprint(body)

                if body.get('discountPriceRanges') is not None:
                    self.result_data = self._wash_discountPriceRanges(
                        body=body)
                    self.is_activity_goods = True
                    return self.result_data
                else:
                    self.my_lg.error('data为空!' + self.error_base_record)
                    return self._data_error_init()
            else:
                self.my_lg.error('这个商品对应活动属性未知, 此处不解析, 设置为跳过!' +
                                 self.error_base_record)
                return self._data_error_init()

    def deal_with_data(self):
        '''
        处理返回的result_data, 并返回需要的信息
        :return: 字典类型
        '''
        data = self.result_data
        # pprint(data)

        if data != {}:
            company_name = data.get('companyName', '')
            title = self._wash_sensitive_words(data.get('subject', ''))
            link_name = ''

            # 商品价格信息, 及其对应起批量   [{'price': '119.00', 'begin': '3'}, ...]
            price_info = self._get_price_info(data=data)
            # self.my_lg.info(str(price_info))

            # 标签属性名称及其对应的值
            # (可能有图片(url), 无图(imageUrl=None))    [{'value': [{'imageUrl': 'https://cbu01.alicdn.com/img/ibank/2017/520/684/4707486025_608602289.jpg', 'name': '白色'}, {'imageUrl': 'https://cbu01.alicdn.com/img/ibank/2017/554/084/4707480455_608602289.jpg', 'name': '卡其色'}, {'imageUrl': 'https://cbu01.alicdn.com/img/ibank/2017/539/381/4705183935_608602289.jpg', 'name': '黑色'}], 'prop': '颜色'}, {'value': [{'imageUrl': None, 'name': 'L'}, {'imageUrl': None, 'name': 'XL'}, {'imageUrl': None, 'name': '2XL'}], 'prop': '尺码'}]
            sku_props = self._get_sku_props(data=data)
            # self.my_lg.info(str(sku_props))

            # 每个规格对应价格, 及其库存量
            try:
                sku_map = self._get_sku_map(data=data,
                                            price_info=price_info,
                                            detail_name_list=sku_props)
                # pprint(sku_map)
            except Exception:
                self.my_lg.error('获取sku_map时, 遇到错误!' + self.error_base_record,
                                 exc_info=True)
                self.is_activity_goods = False
                return self._data_error_init()

            price, taobao_price = self._get_price(price_info=price_info)

            all_img_url = self._get_all_img_url(data=data)

            # 即: p_info
            property_info = self._get_p_info(data=data)

            # 即: div_desc
            detail_info_url = data.get('detailUrl')
            if detail_info_url is not None:
                # self.my_lg.info(str(detail_info_url))
                detail_info = self.get_detail_info_url_div(detail_info_url)
            else:
                detail_info = ''
            # self.my_lg.info(str(detail_info))

            is_delete = self._get_is_delete(title=title)

            result = {
                'company_name': company_name,  # 公司名称
                'title': title,  # 商品名称
                'link_name': link_name,  # 卖家姓名
                'price_info': price_info,  # 商品价格信息, 及其对应起批量
                'price': price,  # 起批的最高价
                'taobao_price': taobao_price,  # 起批的最低价
                'sku_props':
                sku_props,  # 标签属性名称及其对应的值  (可能有图片(url), 无图(imageUrl=None))
                'sku_map': sku_map,  # 每个规格对应价格, 及其库存量
                'all_img_url': all_img_url,  # 所有示例图片地址
                'property_info': property_info,  # 详细信息的标签名, 及其对应的值
                'detail_info': detail_info,  # 下方详细div块
                'is_delete': is_delete,  # 判断是否下架
            }
            # pprint(result)
            # self.my_lg.info(str(result))

            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # self.my_lg.info(str(json_data))

            # 重置self.is_activity_goods = False
            self.is_activity_goods = False

            return result
        else:
            self.my_lg.error('待处理的data为空值!' + self.error_base_record)
            self.is_activity_goods = False

            return {}

    def _data_error_init(self):
        self.result_data = {}

        return {}

    def to_right_and_update_data(self, data, pipeline):
        '''
        Normalise data with _get_right_model_data (site_id=2) and update the
        stored goods through pipeline._update_table_2.
        :param data: goods info to store
        :param pipeline: object providing _update_table_2
        '''
        goods = _get_right_model_data(data=data, site_id=2)
        update_params = self._get_db_update_params(item=goods)

        # sql statement that does not change the price; its two placeholders
        # receive the time column assignment(s) that carry a value.
        sql_template = al_update_str_2
        if goods['delete_time'] == '':
            time_columns = ('shelf_time=%s', '')
        elif goods['shelf_time'] == '':
            time_columns = ('delete_time=%s', '')
        else:
            time_columns = ('shelf_time=%s,', 'delete_time=%s')

        pipeline._update_table_2(sql_str=sql_template.format(*time_columns),
                                 params=update_params,
                                 logger=self.my_lg)

    def _get_sku_props(self, **kwargs):
        '''
        得到sku_props
        :param kwargs:
        :return:
        '''
        data = kwargs.get('data', {})

        sku_props = data.get('skuProps')
        # self.my_lg.info(str(sku_props))
        if sku_props is not None:  # 这里还是保留unit为单位值
            for i in sku_props:
                value = i.get('value', [])
                i.update({'img_here': 0})  # 用于判断有示例图放在哪个属性
                if value != []:
                    for j in value:
                        if j.get('imageUrl') is not None:
                            i.update({'img_here': 1})
                        else:
                            pass
        else:
            sku_props = []  # 存在没有规格属性的

        return sku_props

    def _get_price_info(self, **kwargs):
        '''
        得到price_info
        :return:
        '''
        data = kwargs.get('data', {})

        # 商品价格信息, 及其对应起批量   [{'price': '119.00', 'begin': '3'}, ...]
        price_info = []
        if self.is_activity_goods:  # 火拼商品处理
            tmp = {}
            tmp_price = data.get('ltPromotionPriceDisplay')
            tmp_trade_number = data.get('beginAmount')
            tmp['begin'] = tmp_trade_number
            tmp['price'] = tmp_price
            price_info.append(tmp)
        else:  # 常规商品处理
            if data.get(
                    'isLimitedTimePromotion', 'true'
            ) == 'false':  # isLimitedTimePromotion 限时优惠, 'true'表示限时优惠价, 'flase'表示非限时优惠
                price_info = data.get('discountPriceRanges')
                for item in price_info:
                    try:
                        item.pop('convertPrice')
                    except KeyError:
                        pass
                        # self.my_lg.info(str(price_info))
            else:  # 限时优惠
                tmp = {
                    'begin': data.get('beginAmount', ''),
                    'price': data.get('skuDiscountPrice', '')
                }
                price_info.append(tmp)

        return price_info

    def _get_sku_map(self, **kwargs):
        '''
        得到sku_map
        :param kwargs:
        :return:
        '''
        # 每个规格对应价格, 及其库存量
        '''skuMap == SKUInfo'''
        data = kwargs.get('data', {})
        price_info = kwargs.get('price_info', [])
        detail_name_list = kwargs.get('detail_name_list', [])

        tmp_sku_map = data.get('skuMap')
        # pprint(tmp_sku_map)
        if tmp_sku_map is not None:
            sku_map = []
            for key, value in tmp_sku_map.items():
                tmp = {}
                # 处理key得到需要的值
                key = re.compile(r'&gt;').sub('|', key)
                tmp['spec_type'] = key

                # 处理value得到需要的值
                # pprint(price_info)
                if value.get('discountPrice') is None:  # 如果没有折扣价, 价格就为起批价
                    try:
                        value['discountPrice'] = price_info[0].get('price')
                    except IndexError:
                        self.my_lg.error('获取价格失败, 此处跳过!')
                        raise IndexError

                else:
                    if self.is_activity_goods:
                        pass
                    else:
                        if data.get('isLimitedTimePromotion') == 'false':
                            if float(value.get('discountPrice')) < float(
                                    price_info[0].get('price')):
                                value['discountPrice'] = price_info[0].get(
                                    'price')
                            else:
                                pass
                        else:
                            pass

                tmp['spec_value'] = self._wash_sku_value(value=value)
                sku_map.append(tmp)

        else:
            sku_map = []  # 存在没有规格时的情况

        # 添加示例图
        if sku_map != []:
            img_url_list = []
            for i in detail_name_list:
                if i.get('img_here', 0) == 1:
                    img_url_list = i.get('value', [])

            # self.my_lg.info(str(img_url_list))
            for i in img_url_list:
                img_url = i.get('imageUrl', '')
                name = i.get('name', '')
                for j in sku_map:
                    if name in j.get('spec_type', ''):
                        j.update({
                            'img_url': img_url,
                        })
                    else:
                        pass

        return sku_map

    def _get_all_img_url(self, **kwargs):
        '''
        得到all_img_url
        :param kwargs:
        :return:
        '''
        data = kwargs.get('data', {})

        tmp_all_img_url = data.get('imageList')
        if tmp_all_img_url is not None:
            all_img_url = []
            for item in tmp_all_img_url:
                tmp = {}
                try:
                    item.pop('size310x310URL')
                except KeyError:
                    # self.my_lg.info('KeyError, [size310x310URL], 此处设置为跳过')
                    pass
                tmp['img_url'] = item['originalImageURI']
                all_img_url.append(tmp)
        else:
            all_img_url = []

        return all_img_url

    def _get_p_info(self, **kwargs):
        '''
        得到p_info
        :param kwargs:
        :return:
        '''
        data = kwargs.get('data', {})

        property_info = []
        tmp_property_info = data.get('productFeatureList')
        if tmp_property_info is not None:
            for item in tmp_property_info:
                try:
                    item.pop('unit')
                except KeyError:
                    # self.my_lg.info('KeyError, [unit], 此处设置为跳过')
                    pass
                item['id'] = '0'

            property_info = tmp_property_info
        else:
            pass

        return property_info

    def _get_is_delete(self, **kwargs):
        '''
        得到is_delete
        :param kwargs:
        :return:
        '''
        title = kwargs.get('title')

        is_delete = 0
        if re.compile(r'下架').findall(title) != []:
            if re.compile(r'待下架').findall(title) != []:
                pass
            else:
                is_delete = 1
        else:
            pass

        return is_delete

    def _wash_sku_value(self, value):
        '''
        清洗value
        :param value:
        :return:
        '''
        try:
            value.pop('skuId')
        except KeyError:
            pass
        try:
            value.pop('specId')
        except KeyError:
            pass
        try:
            value.pop('saleCount')
        except KeyError:
            pass
        try:
            value.pop('discountStandardPrice')
        except KeyError:
            pass
        try:
            value.pop('price')
        except KeyError:
            pass
        try:
            value.pop('retailPrice')
        except KeyError:
            pass
        try:
            value.pop('standardPrice')
        except KeyError:
            # self.my_lg.info('KeyError, [skuId, specId, saleCount]错误, 此处跳过')
            pass

        return value

    def _wash_sensitive_words(self, word):
        '''
        清洗敏感字眼
        :param word:
        :return:
        '''
        word = re.compile(r'淘宝网').sub('', word)

        return word

    def _wash_discountPriceRanges(self, body):
        '''
        清洗discountPriceRanges
        :param body:
        :return:
        '''
        # 过滤无用属性
        try:
            body.pop('action')
            body.pop('offerSign')
            body.pop('rateDsrItems')
            body.pop('rateStarLevelMapOfMerge')
            body.pop('wirelessVideoInfo')
            body.pop('freightCost')
        except KeyError:
            # self.my_lg.info('KeyError错误, 此处跳过!')
            pass

        return body

    def _get_db_update_params(self, item):
        '''
        得到待存储的params
        :param item:
        :return: tuple
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['div_desc'],
            dumps(item['p_info'], ensure_ascii=False),
            # item['delete_time'],
            item['is_delete'],
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['sku_info_trans_time'],
            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])

        return tuple(params)

    def _get_price(self, price_info):
        '''
        获取商品的最高价跟最低价
        :param price_info:
        :return: price, taobao_price type float
        '''
        # 设置最高价price, 最低价taobao_price
        if len(price_info) > 1:
            tmp_ali_price = []
            for item in price_info:
                tmp_ali_price.append(float(item.get('price')))

            if tmp_ali_price == []:
                price = Decimal(0).__round__(2)
                taobao_price = Decimal(0).__round__(2)

            else:
                price = Decimal(sorted(tmp_ali_price)[-1]).__round__(
                    2)  # 得到最大值并转换为精度为2的decimal类型
                taobao_price = Decimal(sorted(tmp_ali_price)[0]).__round__(2)

        elif len(
                price_info
        ) == 1:  # 由于可能是促销价, 只有一组然后价格 类似[{'begin': '1', 'price': '485.46-555.06'}]
            if re.compile(r'-').findall(price_info[0].get('price')) != []:
                tmp_price_range = price_info[0].get('price')
                tmp_price_range = tmp_price_range.split('-')
                price = tmp_price_range[1]
                taobao_price = tmp_price_range[0]

            else:
                price = Decimal(price_info[0].get('price')).__round__(
                    2)  # 得到最大值并转换为精度为2的decimal类型
                taobao_price = price

        else:  # 少于1
            price = Decimal(0).__round__(2)
            taobao_price = Decimal(0).__round__(2)

        return float(price), float(taobao_price)

    def init_pull_off_shelves_goods(self):
        '''
        初始化原先就下架的商品信息
        :return:
        '''
        is_delete = 1
        result = {
            'company_name': '',  # 公司名称
            'title': '',  # 商品名称
            'link_name': '',  # 卖家姓名
            'price_info': [],  # 商品价格信息, 及其对应起批量
            'price': 0,
            'taobao_price': 0,
            'sku_props': [],  # 标签属性名称及其对应的值  (可能有图片(url), 无图(imageUrl=None))
            'sku_map': [],  # 每个规格对应价格, 及其库存量
            'all_img_url': [],  # 所有示例图片地址
            'property_info': [],  # 详细信息的标签名, 及其对应的值
            'detail_info': '',  # 下方详细div块
            'is_delete': is_delete,  # 判断是否下架
        }

        return result

    def old_ali_1688_goods_insert_into_new_table(self, data, pipeline):
        '''
        Normalise data with _get_right_model_data (site_id=2) and insert it
        through pipeline._insert_into_table_2.
        :param data: goods info to store
        :param pipeline: object providing _insert_into_table_2
        :return: whatever pipeline._insert_into_table_2 returns
        '''
        goods = _get_right_model_data(data=data, site_id=2)
        insert_params = self._get_db_insert_params(item=goods)

        # al_insert_str_1 is used when a main_goods_id is present (it is
        # then appended to the params), al_insert_str_2 otherwise.
        has_main_goods_id = goods.get('main_goods_id') is not None
        sql_str = al_insert_str_1 if has_main_goods_id else al_insert_str_2

        return pipeline._insert_into_table_2(sql_str=sql_str,
                                             params=insert_params,
                                             logger=self.my_lg)

    def _get_db_insert_params(self, item):
        params = [
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['link_name'],
            item['price'],
            item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False
                  ),  # 把list转换为json才能正常插入数据(并设置ensure_ascii=False)
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['div_desc'],  # 存入到DetailInfo
            dumps(item['p_info'], ensure_ascii=False),  # 存入到PropertyInfo
            item['site_id'],
            item['is_delete'],
        ]

        if item.get('main_goods_id') is not None:
            params.append(item.get('main_goods_id'))

        return tuple(params)

    def get_detail_info_url_div(self, detail_info_url):
        '''
        此处过滤得到data_tfs_url的div块
        :return:
        '''
        # self.my_lg.info(str(detail_info_url))
        if re.compile(r'https').findall(detail_info_url) == []:
            detail_info_url = 'https:' + detail_info_url
            # self.my_lg.info(str(detail_info_url))
        else:
            pass
        # data_tfs_url_response = requests.get(detail_info_url, headers=self.headers)
        # data_tfs_url_body = data_tfs_url_response.content.decode('gbk')

        data_tfs_url_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=detail_info_url)

        # '''
        # 改用requests
        # '''
        # body = MyRequests.get_url_body(url=detail_info_url, headers=self.headers)
        # self.my_lg.info(str(body))
        # if  body == '':
        #     detail_info = ''
        #
        # data_tfs_url_body = body

        is_offer_details = re.compile(r'offer_details').findall(
            data_tfs_url_body)
        detail_info = ''

        if is_offer_details != []:
            data_tfs_url_body = re.compile(r'.*?{"content":"(.*?)"};').findall(
                data_tfs_url_body)
            # self.my_lg.info(str(body))
            if data_tfs_url_body != []:
                detail_info = data_tfs_url_body[0]
                detail_info = re.compile(r'\\').sub('', detail_info)
                detail_info = self._wash_div_desc(detail_info=detail_info)

        else:
            is_desc = re.compile(r'var desc=').findall(data_tfs_url_body)
            if is_desc != []:
                desc = re.compile(r'var desc=\'(.*)\';').findall(
                    data_tfs_url_body)
                if desc != []:
                    detail_info = desc[0]
                    detail_info = self._wash_div_desc(detail_info=detail_info)
                    detail_info = re.compile(r'src=\"https:').sub(
                        'src=\"', detail_info)  # 先替换部分带有https的
                    detail_info = re.compile(r'src="').sub(
                        'src=\"https:', detail_info)  # 再把所欲的换成https的

        # self.my_lg.info(str(detail_info))

        return detail_info

    def _wash_div_desc(self, detail_info):
        '''
        清洗detail_info
        :param detail_info:
        :return:
        '''
        detail_info = re.compile(r'&lt;').sub(
            '<', detail_info
        )  # self.driver.page_source转码成字符串时'<','>'都被替代成&gt;&lt;此外还有其他也类似被替换
        detail_info = re.compile(r'&gt;').sub('>', detail_info)
        detail_info = re.compile(r'&amp;').sub('&', detail_info)
        detail_info = re.compile(r'&nbsp;').sub(' ', detail_info)

        return detail_info

    def get_goods_id_from_url(self, ali_1688_url):
        # https://detail.1688.com/offer/559526148757.html?spm=b26110380.sw1688.mof001.28.sBWF6s
        is_ali_1688_url = re.compile(
            r'https://detail.1688.com/offer/.*?').findall(ali_1688_url)
        if is_ali_1688_url != []:
            ali_1688_url = re.compile(
                r'https://detail.1688.com/offer/(.*?).html.*?').findall(
                    ali_1688_url)[0]
            self.my_lg.info(
                '------>>>| 得到的阿里1688商品id为:{0}'.format(ali_1688_url))

            return ali_1688_url
        else:
            self.my_lg.info(
                '阿里1688商品url错误, 非正规的url, 请参照格式(https://detail.1688.com/offer/)开头的...'
            )

            return ''

    def __del__(self):
        try:
            del self.my_phantomjs
            del self.my_lg
        except Exception:
            self.my_lg.error("self.my_phantomjs释放失败!")
            pass
        gc.collect()
Exemplo n.º 12
0
class Zhe800Spike(object):
    def __init__(self):
        '''
        Build the request headers, then create the MyPhantomjs instance that
        _get_one_session_id_data uses to fetch pages.
        '''
        self._set_headers()
        driver = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        self.my_phantomjs = driver

    def _set_headers(self):
        '''
        Store the default request headers for zhe800.com in ``self.headers``.

        The User-Agent value comes from get_random_pc_ua(). No
        Accept-Encoding header is set (that entry was commented out in the
        original code).
        '''
        header_pairs = (
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Language', 'zh-CN,zh;q=0.8'),
            ('Cache-Control', 'max-age=0'),
            ('Connection', 'keep-alive'),
            ('Host', 'zhe800.com'),
            ('User-Agent', get_random_pc_ua()),
        )
        self.headers = dict(header_pairs)

    def get_spike_hour_goods_info(self):
        '''
        Crawl every flash-sale session in the configured id range and store
        the goods that are not in the database yet.

        Session ids are visited from BASE_SESSION_ID up to (but excluding)
        MAX_SESSION_ID in steps of 2. A session is skipped when it has no
        blocks, when its start time cannot be read, or when is_recent_time()
        rejects that start time.

        :return: None
        '''
        base_session_id = BASE_SESSION_ID
        while base_session_id < MAX_SESSION_ID:
            print('待抓取的session_id为: ', base_session_id)
            data = self._get_one_session_id_data(
                base_session_id=base_session_id)
            sleep(.3)

            blocks = data.get('data', {}).get('blocks', [])
            if blocks:  # no blocks: the session id does not exist
                self._crawl_one_session(
                    session_id=base_session_id, blocks=blocks)

            base_session_id += 2

    def _session_begin_timestamp(self, blocks):
        '''
        Return the start time of a session as an int built from the first 10
        characters of ``deal.begin_time``.

        The first block is used. When it has no begin_time but does carry a
        ``showcase`` entry (the original marks this as a future session), the
        second block is used instead.
        NOTE(review): begin_time is presumably a 13-digit millisecond
        timestamp, so the 10-character prefix is seconds -- confirm.

        :param blocks: non-empty list of block dicts of one session
        :return: int timestamp
        :raises ValueError: no usable begin_time could be found
        :raises IndexError: the fallback block does not exist
        '''
        first_block = blocks[0]
        begin_time = str(
            first_block.get('deal', {}).get('begin_time', ''))[:10]
        if begin_time == '':
            if first_block.get('showcase', {}) == {}:
                # Formerly a bare `raise Exception`, which printed an empty
                # error message.
                raise ValueError('blocks[0]中既没有deal.begin_time, 也没有showcase')
            print('*** 未来时间 ***')
            begin_time = str(
                blocks[1].get('deal', {}).get('begin_time', ''))[:10]

        return int(begin_time)

    def _crawl_one_session(self, session_id, blocks):
        '''
        Validate one session's start time and store its goods that are new.

        :param session_id: id of the session being crawled
        :param blocks: non-empty list of block dicts of that session
        :return: None
        '''
        try:
            begin_times_timestamp = self._session_begin_timestamp(blocks)
        except (AttributeError, IndexError, TypeError, ValueError) as e:
            print('遇到严重错误: ', e)
            return

        print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))
        if not self.is_recent_time(timestamp=begin_times_timestamp):
            return

        try:
            deals = [block.get('deal', {}) for block in blocks]
        except (AttributeError, TypeError) as e:
            print('遇到严重错误: ', e)
            return

        # `deals` has one entry per block and `blocks` is non-empty here, so
        # the former "no data" branch could never run and was removed.
        miaosha_goods_list = self.get_miaoshao_goods_info_list(data=deals)

        zhe_800 = Zhe800Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            sql_str = 'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
            # A set gives O(1) membership tests inside the loop below.
            known_goods_ids = {
                row[0] for row in my_pipeline._select_table(sql_str=sql_str)
            }
            for item in miaosha_goods_list:
                if item.get('zid', '') in known_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue
                self._store_one_goods(
                    zhe_800=zhe_800,
                    my_pipeline=my_pipeline,
                    item=item,
                    session_id=session_id)

        # Drop the parser explicitly and collect right away, as the original
        # did after each session.
        del zhe_800
        gc.collect()

    def _store_one_goods(self, zhe_800, my_pipeline, item, session_id):
        '''
        Fetch the detail data of one goods item, merge in the flash-sale
        fields from the session listing and insert the result.

        :param zhe_800: Zhe800Parse instance used for fetching and inserting
        :param my_pipeline: connected database pipeline
        :param item: one entry produced by get_miaoshao_goods_info_list
        :param session_id: id of the session the item belongs to
        :return: None
        '''
        tmp_url = 'https://shop.zhe800.com/products/' + str(
            item.get('zid', ''))
        goods_id = zhe_800.get_goods_id_from_url(tmp_url)

        zhe_800.get_goods_data(goods_id=goods_id)
        goods_data = zhe_800.deal_with_data()
        if goods_data == {}:  # nothing came back for this goods id
            return

        miaosha_time = item.get('miaosha_time')
        goods_data['stock_info'] = item.get('stock_info')
        goods_data['goods_id'] = str(item.get('zid'))
        goods_data['spider_url'] = tmp_url
        goods_data['username'] = '******'
        goods_data['price'] = item.get('price')
        goods_data['taobao_price'] = item.get('taobao_price')
        goods_data['sub_title'] = item.get('sub_title')
        goods_data['miaosha_time'] = miaosha_time
        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
            get_miaosha_begin_time_and_miaosha_end_time(
                miaosha_time=miaosha_time)
        goods_data['session_id'] = str(session_id)

        zhe_800.insert_into_zhe_800_xianshimiaosha_table(
            data=goods_data, pipeline=my_pipeline)
        sleep(ZHE_800_SPIKE_SLEEP_TIME)  # throttle between goods

    def _get_one_session_id_data(self, base_session_id):
        '''
        得到一个session_id的data
        :param base_session_id:
        :return:
        '''
        _data = []
        for _page in range(1, 20):
            '''per_page为20固定,其他不返回数据'''
            tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page={1}&per_page=20'.format(
                str(base_session_id), _page)

            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            # print(body)

            body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)
            if body_1 != []:
                data = body_1[0]
                data = json.loads(data)
                # pprint(data)

                # print(type(data.get('data', {}).get('has_next')))
                if data.get('msg', '') == '无效场次':
                    print('该session_id不存在,此处跳过')
                    break

                if not data.get('data', {}).get('has_next', True):
                    print('该session_id没有下页了!!')
                    break
                else:
                    print('正在抓取该session_id的第 {0} 页...'.format(_page))

                for _i in data.get('data', {}).get('blocks', []):
                    _data.append(_i)

            sleep(.3)

        return {
            'data': {
                'blocks': _data,
            }
        }

    def get_miaoshao_goods_info_list(self, data):
        '''
        Reduce raw flash-sale deal dicts to the fields the crawler stores.

        Items whose begin/end time or prices cannot be parsed are skipped.

        :param data: iterable of deal dicts to parse
        :return: list of dicts with the keys miaosha_time, zid, stock_info,
                 price, taobao_price and sub_title
        '''
        miaosha_goods_list = []
        for item in data:
            # Flash-sale begin and end time.
            try:
                miaosha_time = {
                    'miaosha_begin_time':
                    timestamp_to_regulartime(
                        int(str(item.get('begin_time'))[:10])),
                    'miaosha_end_time':
                    timestamp_to_regulartime(
                        int(str(item.get('end_time'))[:10])),
                }
            except ValueError:
                continue

            # Prices get the same skip-on-bad-data treatment as the times:
            # float(None) or float('') used to abort the whole run.
            try:
                list_price = float(item.get('list_price'))
                sale_price = float(item.get('price'))
            except (TypeError, ValueError):
                continue

            miaosha_goods_list.append({
                'miaosha_time': miaosha_time,
                # zhe800 goods id; the caller builds the product URL from it
                'zid': item.get('zid'),
                'stock_info': {
                    # remaining quantity of the flash sale
                    'activity_stock': item.get('activity_stock', 0),
                    # total stock of the flash sale
                    'stock': item.get('stock', 0),
                },
                # original price
                'price': list_price,
                # flash-sale price
                'taobao_price': sale_price,
                # sub title
                'sub_title': item.get('description', ''),
            })

        return miaosha_goods_list

    def is_recent_time(self, timestamp):
        '''
        Decide whether a flash-sale start time should be crawled.

        A time qualifies when both conditions hold:
        * its local calendar date is at most two days before today (any
          later date, including future ones, passes this check);
        * its local hour lies within
          [SPIDER_START_HOUR, SPIDER_END_HOUR].

        The date check uses real calendar arithmetic. The previous version
        compared day-of-month numbers inside the same month only, so a sale
        from the last days of the previous month or year was reported as
        expired even when it was one day old.

        :param timestamp: Unix timestamp in seconds (anything int() accepts)
        :return: True or False
        '''
        import datetime  # local import keeps this method self-contained

        target = time.localtime(int(timestamp))
        now = time.localtime(time.time())  # current local time
        target_date = datetime.date(
            target.tm_year, target.tm_mon, target.tm_mday)
        today = datetime.date(now.tm_year, now.tm_mon, now.tm_mday)

        if target_date < today - datetime.timedelta(days=2):
            # Expired: keep the original wording for each granularity.
            if target.tm_year != now.tm_year:
                print('非本年度的限时秒杀时间,此处跳过')
            elif target.tm_mon != now.tm_mon:
                print('该月份时间已过期,此处跳过')
            else:
                print('该日时间已过期, 此处跳过')
            return False

        if target.tm_year > now.tm_year:
            print('** 该年份为未来时间年份 **')
        elif target.tm_year == now.tm_year and target.tm_mon > now.tm_mon:
            print('** 该月份为未来时间月份 **')

        # Only goods starting between SPIDER_START_HOUR and SPIDER_END_HOUR
        # (inclusive) are crawled.
        if SPIDER_START_HOUR <= target.tm_hour <= SPIDER_END_HOUR:
            print('合法时间')
            return True

        print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR,
                                               SPIDER_END_HOUR))
        return False

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
Exemplo n.º 13
0
    def get_pintuan_goods_info(self):
        '''
        Collect the group-buy ("pintuan") goods of every category in
        ``self.fcid_dict`` and hand the combined list to deal_with_data().

        Approach 1 (abandoned, per the original author's note): the mobile
        API at api.mogujie.com requires a request signature that could not
        be reproduced, so it is not used. The commented-out experiment for it
        was removed from this method.
        Approach 2 (implemented): page through the PC listing endpoint with
        PhantomJS; the original notes that plain ``requests`` calls started
        being filtered.

        :return: None
        '''
        goods_list = []
        # Compiled once instead of on every page.
        pre_pattern = re.compile(r'<pre.*?>(.*?)</pre>')

        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        for key, fcid in self.fcid_dict.items():
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                if index % 5 == 0:
                    # Every 5th page: drop the current PhantomJS wrapper and
                    # create a fresh one.
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)

                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)

                try:
                    tmp_data = json.loads(pre_pattern.findall(body)[0])
                except (IndexError, TypeError, ValueError):
                    # IndexError: no <pre> payload; TypeError: body is not a
                    # string; ValueError: the payload is not valid JSON.
                    print('json.loads转换body时出错, 请检查')
                    continue

                tmp_item_list = tmp_data.get('result',
                                             {}).get('wall',
                                                     {}).get('docs', [])
                if not tmp_item_list:
                    # No group-buy data on this page: stop paging this category.
                    break

                # The crawl time is recorded as the group-buy begin time.
                begin_time_timestamp = int(time.time())
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time':
                        timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time':
                        timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp,
                                item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)

                goods_list.extend(item_list)

                sleep(MOGUJIE_SLEEP_TIME)

        # Process and store everything that was collected.
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)
Exemplo n.º 14
0
class MoGuJiePinTuan(object):
    def __init__(self):
        '''
        Run the two setup steps in order: request headers first, then the
        category-name -> fcid table.
        '''
        for setup_step in (self._set_headers, self._set_fcid_dict):
            setup_step()

    def _set_headers(self):
        '''
        Store the default request headers for api.mogujie.com in
        ``self.headers``.

        The User-Agent value comes from get_random_pc_ua(). No
        Accept-Encoding header is set (that entry was commented out in the
        original code).
        '''
        header_pairs = (
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Language', 'zh-CN,zh;q=0.8'),
            ('Cache-Control', 'max-age=0'),
            ('Connection', 'keep-alive'),
            ('Host', 'api.mogujie.com'),
            ('Referer',
             'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92'),
            ('User-Agent', get_random_pc_ua()),
        )
        self.headers = dict(header_pairs)

    def _set_fcid_dict(self):
        self.fcid_dict = {
            '女装': 10053171,
            # '精选': 10053172,
            '男友': 10053173,
            '内衣': 10053174,
            '女鞋': 10053175,
            '包包': 10053176,
            '美妆': 10053177,
            '生活': 10053178,
            '配饰': 10053179,
            '母婴': 10053180,
            '食品': 10053181,
        }

    def get_pintuan_goods_info(self):
        '''
        Collect the group-buy ("pintuan") goods of every category in
        ``self.fcid_dict`` and hand the combined list to deal_with_data().

        Approach 1 (abandoned, per the original author's note): the mobile
        API at api.mogujie.com requires a request signature that could not
        be reproduced, so it is not used. The commented-out experiment for it
        was removed from this method.
        Approach 2 (implemented): page through the PC listing endpoint with
        PhantomJS; the original notes that plain ``requests`` calls started
        being filtered.

        :return: None
        '''
        goods_list = []
        # Compiled once instead of on every page.
        pre_pattern = re.compile(r'<pre.*?>(.*?)</pre>')

        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        for key, fcid in self.fcid_dict.items():
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                if index % 5 == 0:
                    # Every 5th page: drop the current PhantomJS wrapper and
                    # create a fresh one.
                    try:
                        del self.my_phantomjs
                    except AttributeError:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)

                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)

                try:
                    tmp_data = json.loads(pre_pattern.findall(body)[0])
                except (IndexError, TypeError, ValueError):
                    # IndexError: no <pre> payload; TypeError: body is not a
                    # string; ValueError: the payload is not valid JSON.
                    print('json.loads转换body时出错, 请检查')
                    continue

                tmp_item_list = tmp_data.get('result',
                                             {}).get('wall',
                                                     {}).get('docs', [])
                if not tmp_item_list:
                    # No group-buy data on this page: stop paging this category.
                    break

                # The crawl time is recorded as the group-buy begin time.
                begin_time_timestamp = int(time.time())
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time':
                        timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time':
                        timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp,
                                item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)

                goods_list.extend(item_list)

                sleep(MOGUJIE_SLEEP_TIME)

        # Process and store everything that was collected.
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)

    def deal_with_data(self, *params):
        '''
        Fetch, normalise and store the group-buy goods that are not in the
        database yet.

        :param params: positional arguments; only params[0], the goods list
                       built by get_pintuan_goods_info, is used
        :return: None
        '''
        goods_list = params[0]

        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if not my_pipeline.is_connect_success:
            print('数据库连接失败,此处跳过!')
        else:
            sql_str = r'select goods_id, miaosha_time, fcid, page from dbo.mogujie_pintuan where site_id=23'
            db_goods_id_list = [
                row[0]
                for row in my_pipeline._select_table(sql_str=sql_str)
            ]
            print(db_goods_id_list)
            # A set replaces the list that was rebuilt with list(set(...))
            # after every insert and scanned linearly for every item.
            known_goods_ids = set(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in known_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                goods_id = str(item.get('goods_id', ''))
                tmp_url = 'https://shop.mogujie.com/detail/' + goods_id

                mogujie.get_goods_data(goods_id=goods_id)
                goods_data = mogujie.deal_with_data()
                if goods_data == {}:  # nothing came back for this goods id
                    continue

                # Normalise the price entries: detail_price is stored as the
                # group-buy price.
                goods_data['price_info_list'] = [{
                    'spec_value': spec.get('spec_value'),
                    'pintuan_price': spec.get('detail_price'),
                    'normal_price': spec.get('normal_price'),
                    'img_url': spec.get('img_url'),
                    'rest_number': spec.get('rest_number'),
                } for spec in goods_data['price_info_list']]

                pintuan_time = item.get('pintuan_time', {})
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = goods_id
                goods_data['pintuan_time'] = pintuan_time
                goods_data['pintuan_begin_time'], goods_data[
                    'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                        pintuan_time=pintuan_time)
                goods_data['all_sell_count'] = item.get('all_sell_count', '')
                goods_data['fcid'] = str(item.get('fcid'))
                goods_data['page'] = str(item.get('page'))
                goods_data['sort'] = str(item.get('sort', ''))

                inserted = mogujie.insert_into_mogujie_pintuan_table(
                    data=goods_data, pipeline=my_pipeline)
                if inserted:
                    # Remember the id so a repeat later in goods_list is skipped.
                    known_goods_ids.add(goods_id)

                sleep(MOGUJIE_SLEEP_TIME)  # throttle between goods

        # `mogujie` is always bound here, so the former try/bare-except
        # around this `del` was unnecessary.
        del mogujie
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the group-buy end timestamp from the remaining-time text.

        Formats noted in the original source: '6天13小时' (days + hours),
        '13小时57分' (hours + minutes) and '36分' (minutes only). Each unit is
        parsed on its own, so any subset of the three may be present.

        The previous positional parsing raised IndexError for '' (the default
        the caller passes when the field is missing) and for a single-unit
        text such as '13小时'. It also returned zero for '6天'.

        :param begin_time: begin timestamp in seconds
        :param left_time: remaining-time text; None or '' count as no time
        :return: end timestamp, i.e. begin_time plus the parsed duration
        '''
        left_time = left_time or ''
        seconds_per_unit = (
            ('天', 24 * 60 * 60),
            ('小时', 60 * 60),
            ('分', 60),
        )

        left_seconds = 0
        found_unit = False
        for unit, unit_seconds in seconds_per_unit:
            matched = re.search(r'(\d+)\s*' + unit, left_time)
            if matched:
                found_unit = True
                left_seconds += int(matched.group(1)) * unit_seconds

        if not found_unit:  # no day, hour or minute part at all
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)

        return begin_time + left_seconds

    def get_pintuan_begin_time_and_pintuan_end_time(self, pintuan_time):
        '''
        Convert the textual begin/end time of a group buy to datetime objects.

        :param pintuan_time: dict with 'begin_time' and 'end_time' strings in
                             'YYYY-mm-dd HH:MM:SS' form
        :return: tuple (pintuan_begin_time, pintuan_end_time) of datetime
        '''
        time_format = '%Y-%m-%d %H:%M:%S'
        begin_text = pintuan_time.get('begin_time')
        end_text = pintuan_time.get('end_time')

        # Parse the begin time first, then the end time.
        parse = datetime.datetime.strptime
        begin_parsed = parse(begin_text, time_format)
        end_parsed = parse(end_text, time_format)

        return begin_parsed, end_parsed

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
    async def run_forever(self):
        '''
        实时更新数据
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(
                logger=self.my_lg)
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(result)

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            for item in result:
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}

                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    time_number = await self.is_recent_time(pintuan_end_time)
                    if time_number == 0:
                        await tmp_sql_server.delete_jumeiyoupin_pintuan_expired_goods_id(
                            goods_id=item[0], logger=self.my_lg)
                        self.msg = '过期的goods_id为(%s)' % item[
                            0] + ', 拼团结束时间为(%s), 删除成功!' % str(
                                json.loads(item[1]).get('begin_time'))
                        self.my_lg.info(self.msg)

                    elif time_number == 2:
                        pass  # 此处应该是pass,而不是break,因为数据库传回的goods_id不都是按照顺序的

                    else:  # 返回1,表示在待更新区间内
                        self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            item[0], str(index))
                        self.my_lg.info(self.msg)
                        data['goods_id'] = item[0]
                        jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)

                        _ = item[2] + '-' + str(
                            item[3])  # 格式: 'coutuan_baby-1'
                        item_list = self.api_all_goods_id.get(
                            _, [])  # 用于判断tab, index已在self.api_all_goods_id中

                        if item_list == []:
                            my_phantomjs = MyPhantomjs(
                                executable_path=PHANTOMJS_DRIVER_PATH)
                            item_list = await jumeiyoupin_2.get_one_page_goods_list(
                                my_phantomjs=my_phantomjs,
                                tab=item[2],
                                index=item[3])
                            try:
                                del my_phantomjs
                            except:
                                pass

                        if item_list == []:
                            self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                            pass
                        else:
                            if self.api_all_goods_id.get(_) is None:
                                self.api_all_goods_id[_] = item_list

                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in item_list
                            ]

                            jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                                logger=self.my_lg)
                            # 内部已经下架的(测试发现官方不会提前下架活动商品)
                            if item[0] not in pintuan_goods_all_goods_id:
                                await self.update_data_2(
                                    jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                    jumei_pintuan_url=item[4],
                                    goods_id=item[0],
                                    pipeline=tmp_sql_server)

                            else:  # 未内部下架
                                await self.update_data_1(
                                    jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                    jumeiyoupin_2=jumeiyoupin_2,
                                    jumei_pintuan_url=item[4],
                                    goods_id=item[0],
                                    item_list=item_list,
                                    pipeline=tmp_sql_server)

                else:
                    self.my_lg.error('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()
            self.my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
            if get_shanghai_time().hour == 0:  # 0点以后不更新
                sleep(60 * 60 * 5.5)
            else:
                sleep(5)
            gc.collect()

        return None
Exemplo n.º 16
0
    def run_forever(self):
        '''
        Refresh the flash-sale (miaosha) goods rows returned by the database.

        Flow: run jm_delete_str_2, select the candidate rows with
        jm_select_str_1, obtain session cookies through PhantomJS, then walk
        the rows one by one. Each row is either deleted (is_recent_time
        returns 0), left alone (returns 2) or re-crawled and written back
        (any other value). The method ends with a sleep: 5.5 hours when the
        Shanghai hour is 0, otherwise 5 seconds.

        Rows are read positionally: item[0] is the goods_id, item[1] is JSON
        text holding 'miaosha_end_time', item[2] is passed to
        get_one_page_all_goods_list and item[3] to get_goods_id_from_url.

        :return: False when no cookies could be obtained, otherwise None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=jm_select_str_1))
        except TypeError:
            # list(None) raises TypeError; treated here as "database unavailable"
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # Fetch cookies with a short-lived PhantomJS instance, then drop
            # the only reference to it straight away.
            my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
            cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
                url='https://h5.jumei.com/')
            del my_phantomjs
            if cookies == '':
                print('!!! 获取cookies失败 !!!')
                return False

            print('获取cookies成功!')
            self.headers.update(Cookie=cookies)
            for item in result:
                end_time_text = json.loads(item[1]).get('miaosha_end_time')
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(end_time_text, '%Y-%m-%d %H:%M:%S')))

                # Created per row on purpose: the original author notes that a
                # single long-lived parser used much more memory.
                jumeiyoupin_miaosha = JuMeiYouPinParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    # Evaluate once: the result depends on the current clock,
                    # so two calls could disagree at a boundary.
                    time_state = self.is_recent_time(miaosha_end_time)
                    if time_state == 0:
                        # NOTE(review): (item[0]) is a parenthesised scalar, not
                        # a 1-tuple; confirm _delete_table accepts a scalar.
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀结束时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('miaosha_end_time'))

                    elif time_state == 2:
                        # Skip rather than break: rows are not guaranteed to
                        # come back from the database in order.
                        pass

                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))

                        this_page_all_goods_list = self.get_one_page_all_goods_list(
                            item[2])

                        if this_page_all_goods_list == '网络错误!':
                            # Fall through (no `continue`) so the row counter
                            # below still advances for this row.
                            print('网络错误!先跳过')

                        elif this_page_all_goods_list == []:
                            print(
                                '#### 该page对应得到的this_page_all_goods_list为空[]!')
                            print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')

                        else:
                            # Goods are not delisted early, so every row that is
                            # still inside its sale window is refreshed.
                            tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(
                                item[3])
                            jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                            goods_data = jumeiyoupin_miaosha.deal_with_data()

                            if goods_data != {}:  # empty result: nothing to store
                                goods_data['goods_id'] = str(item[0])
                                goods_data['miaosha_time'] = {
                                    'miaosha_begin_time':
                                    goods_data['schedule'].get(
                                        'begin_time', ''),
                                    'miaosha_end_time':
                                    goods_data['schedule'].get('end_time', ''),
                                }
                                goods_data['miaosha_begin_time'], goods_data[
                                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=goods_data['miaosha_time']
                                    )

                                jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                    data=goods_data, pipeline=tmp_sql_server)
                                sleep(JUMEIYOUPIN_SLEEP_TIME)

                else:
                    # The pipeline reports that it is not connected
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        # No updates during hour 0 (Shanghai time): sleep 5.5 hours instead
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Exemplo n.º 17
0
class JdCommentParse(object):
    '''
    Fetch buyer comments for a JD goods_id from the mobile comment endpoint
    and convert them into the project's comment structure.
    '''

    def __init__(self, logger=None):
        '''
        :param logger: logger to use; when None a file/console logger is created
        '''
        self.result_data = {}
        self.msg = ''
        self._set_logger(logger)
        self._set_headers()
        self.comment_page_switch_sleep_time = 1.2  # pause between comment pages
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        self._add_headers_cookies()

    def _get_comment_data(self, goods_id):
        '''
        Download comment pages 1 and 2 for goods_id and build the result item.

        :param goods_id: goods id; an empty string yields an empty result
        :return: the CommentItem stored in self.result_data, or {} when
                 goods_id is '' or the comments could not be converted
        '''
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        self.goods_id = goods_id
        self.headers.update({
            'referer':
            'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # Comments come from the JD mobile comment endpoint
        _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        _tmp_comment_list = []
        for current_page in range(1, 3):
            params = self._set_params(goods_id=goods_id,
                                      current_page=current_page)
            body = MyRequests.get_url_body(url=_url,
                                           headers=self.headers,
                                           params=params)

            _page = self._json_2_dict(body)
            if not isinstance(_page, dict):
                _page = {}
            # `or` guards against JSON null for either key
            _detail = _page.get('wareDetailComment') or {}
            _tmp_comment_list += _detail.get('commentInfoList') or []

            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data

    @staticmethod
    def _require(condition, message):
        '''
        Raise AssertionError(message) when condition is false.

        Used instead of the `assert` statement so the check still runs under
        `python -O`, while keeping the same exception type.
        '''
        if not condition:
            raise AssertionError(message)

    def _get_comment_list(self, _tmp_comment_list):
        '''
        Convert raw comment entries into the required result list.

        Entries rejected by filter_invalid_comment_content are skipped.

        :param _tmp_comment_list: list of dicts collected from 'commentInfoList'
        :return: list of dicts with the keys buyer_name, comment, quantify,
                 head_img and append_comment
        :raises AssertionError: when commentDate, commentData, userNickName or
                 userImgURL is missing or an empty string
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = item.get('commentDate', '')
            self._require(_comment_date != '', '得到的_comment_date为空str!请检查!')

            # Some comments carry no sku attributes, so an empty sku_info is
            # accepted without a check.
            ware_attributes = item.get('wareAttributes') or []
            sku_info = ' '.join([
                i.get('key', '') + ':' + i.get('value', '')
                for i in ware_attributes
            ])

            _comment_content = item.get('commentData', '')
            self._require(_comment_content != '', '得到的评论内容为空str!请检查!')
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = item.get('userNickName', '')
            self._require(buyer_name != '', '得到的用户昵称为空值!请检查!')

            # JD comments: purchase quantity defaults to 1
            quantify = 1

            head_img = item.get('userImgURL', '')
            self._require(head_img != '', '得到的用户头像为空值!请检查!')
            head_img = 'https://' + head_img

            # Pictures attached to the first comment
            _comment_img_list = [{
                'img_url': img.get('largePicURL', '')
            } for img in (item.get('pictureInfoList') or [])]

            # Follow-up comment: always empty here
            append_comment = {}

            star_level = int(item.get('commentScore', '5'))

            if not filter_invalid_comment_content(_comment_content):
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': star_level,
                'video': '',
            }]

            _comment_list.append({
                'buyer_name': buyer_name,  # buyer nickname
                'comment': comment,  # comment content
                'quantify': quantify,  # purchase quantity
                'head_img': head_img,  # avatar url
                'append_comment': append_comment,  # follow-up comment
            })

        return _comment_list

    def _add_headers_cookies(self):
        '''
        Fetch cookies through PhantomJS and store them in self.headers.

        The original author notes the endpoint needs cookies (specifically the
        sid field) to answer.
        '''
        _cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://item.m.jd.com/')
        self.headers.update({
            'cookie': _cookies,
        })

        return None

    def _set_logger(self, logger):
        '''
        Use the given logger, or create a dated log file logger when None.
        '''
        if logger is None:
            self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                    '/京东/comment/' +
                                    str(get_shanghai_time())[0:10] + '.txt',
                                    console_log_level=INFO,
                                    file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''
        Initialise self.headers with the request headers for the comment endpoint.
        '''
        self.headers = {
            'origin': 'https://item.m.jd.com',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'content-type': 'application/x-www-form-urlencoded',
            'accept': 'application/json',
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=5025518',
            'x-requested-with': 'XMLHttpRequest',
        }

    def _wash_comment(self, comment):
        '''
        Clean a comment: strip 'jd'/'Jd'/'JD' and newlines, then replace the
        site name.

        :param comment: raw comment text
        :return: cleaned comment text
        '''
        comment = re.compile(r'jd|\n|Jd|JD').sub('', comment)
        comment = re.compile('京东').sub('优秀网', comment)

        return comment

    def _json_2_dict(self, json_str):
        '''
        Parse a JSON string, logging and returning {} when it cannot be parsed.

        :param json_str: JSON text
        :return: the parsed value, or {} on failure
        '''
        try:
            _ = json.loads(json_str)
        except (TypeError, ValueError):
            # goods_id may be a non-str or not set yet, so never concatenate it raw
            self.my_lg.error('json.loads转换json_str时出错! 出错goods_id: ' +
                             str(getattr(self, 'goods_id', '')))
            return {}

        return _

    def _set_params(self, goods_id, current_page):
        '''
        Build the query parameters for one comment page.

        :param goods_id: goods id sent as 'wareId'
        :param current_page: page number sent (as str) in 'offset'
        :return: list of (name, value) tuples
        '''
        _params = [
            ('wareId', goods_id),
            ('offset', str(current_page)),
            ('num', '10'),
            ('checkParam', 'LUIPPTP'),
            ('category', '670_671_1105'),
            ('isUseMobile', 'true'),
            ('evokeType', ''),
            ('type', '3'),  # '0' all comments | '3' positive comments
            ('isCurrentSku', 'false'),
        ]

        return _params

    def __del__(self):
        # Drop each attribute independently so one missing attribute (e.g.
        # after a failed __init__) does not stop the others being released.
        for _name in ('my_lg', 'my_phantomjs', 'headers'):
            try:
                delattr(self, _name)
            except AttributeError:
                pass
        gc.collect()
Exemplo n.º 18
0
class MoGuJiePinTuanRealTimesUpdate(object):
    '''
    Refresh the Mogujie group-buy (pintuan) goods rows stored in the database.
    '''

    def __init__(self):
        self._set_headers()
        self.delete_sql_str = mg_delete_str_1

    def _set_headers(self):
        '''
        Initialise self.headers with a random PC user agent.
        '''
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'list.mogujie.com',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    def run_forever(self):
        '''
        Walk the rows returned by mg_select_str_2 and refresh each one.

        For every row is_recent_time decides the action: 0 deletes the row,
        2 leaves it alone, anything else re-crawls the listing page through
        PhantomJS and writes the goods back. Nothing happens (and no sleep is
        done) when the initial query fails with TypeError. Otherwise the method
        ends with a sleep: 5.5 hours when the Shanghai hour is 0, else 5 seconds.

        Rows are read positionally: item[0] is the goods_id, item[1] is JSON
        text holding 'end_time'/'begin_time', item[2] fills the url's fcid and
        item[3] its page.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_2))
        except TypeError:
            # list(None) raises TypeError; treated here as "database unavailable"
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            return

        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1

        self.my_phantomjs = MyPhantomjs(
            executable_path=PHANTOMJS_DRIVER_PATH)
        for item in result:
            pintuan_end_time = int(
                time.mktime(
                    time.strptime(
                        json.loads(item[1]).get('end_time'),
                        '%Y-%m-%d %H:%M:%S')))

            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # Replace the PhantomJS instance every 8 rows
                try:
                    del self.my_phantomjs
                except AttributeError:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs(
                    executable_path=PHANTOMJS_DRIVER_PATH)

            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                # Evaluate once: the result depends on the current clock, so
                # two calls could disagree at a boundary.
                time_state = self.is_recent_time(pintuan_end_time)
                if time_state == 0:
                    # NOTE(review): (item[0]) is a parenthesised scalar, not a
                    # 1-tuple; confirm _delete_table accepts a scalar.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 拼团开始时间为(%s), 删除成功!' %
                        json.loads(item[1]).get('begin_time'))

                elif time_state == 2:
                    # Skip rather than break: rows are not guaranteed to come
                    # back from the database in order.
                    pass

                else:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))

                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                        item[3], item[2])

                    # Plain requests cannot fetch this page (certificate
                    # checks), so PhantomJS is used instead.
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                        url=tmp_url)

                    if body == '':
                        print('获取到的body为空值! 此处跳过')

                    else:
                        try:
                            body = re.compile(
                                r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                        except (IndexError, TypeError, ValueError):
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}

                        tmp_item_list = tmp_data.get('result', {}).get(
                            'wall', {}).get('docs', [])

                        if tmp_item_list == []:
                            # NOTE(review): a parse failure above also lands
                            # here and deletes the row; confirm that is wanted.
                            print('得到的docs为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str,
                                params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')

                        else:
                            item_list = self._build_page_item_list(
                                tmp_item_list)
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in item_list
                            ]

                            if item[0] not in pintuan_goods_all_goods_id:
                                # Missing from the listing page but (per the
                                # original author) still on sale: refresh the
                                # goods data, leave the pintuan times untouched.
                                mogujie_pintuan.get_goods_data(
                                    goods_id=item[0])
                                goods_data = mogujie_pintuan.deal_with_data()

                                if goods_data != {}:
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = item[0]
                                    goods_data[
                                        'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                            goods_data['price_info_list'])

                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data,
                                        pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # slow down

                            else:
                                # Still listed: refresh with the page's timing
                                # and sales data for every matching entry.
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') != item[0]:
                                        continue

                                    mogujie_pintuan.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data()

                                    if goods_data == {}:
                                        continue

                                    goods_data['goods_id'] = item[0]
                                    goods_data[
                                        'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                            goods_data['price_info_list'])
                                    goods_data['pintuan_time'] = item_2.get(
                                        'pintuan_time', {})
                                    goods_data['pintuan_begin_time'], goods_data[
                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                            miaosha_time=goods_data[
                                                'pintuan_time'])
                                    goods_data[
                                        'all_sell_count'] = item_2.get(
                                            'all_sell_count', '')

                                    mogujie_pintuan.update_mogujie_pintuan_table(
                                        data=goods_data,
                                        pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # slow down

            else:
                print('数据库连接失败,此处跳过!')

            index += 1
            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates during hour 0
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _build_page_item_list(self, docs):
        '''
        Convert the listing page's 'docs' entries into goods dicts.

        The current time is used as every entry's begin_time; end_time is
        derived from the entry's 'leftTimeOrg' text.

        :param docs: list of dicts from result -> wall -> docs
        :return: list of dicts with goods_id, pintuan_time and all_sell_count
        '''
        begin_time_timestamp = int(time.time())
        return [{
            'goods_id':
            doc.get('tradeItemId', ''),
            'pintuan_time': {
                'begin_time':
                timestamp_to_regulartime(timestamp=begin_time_timestamp),
                'end_time':
                timestamp_to_regulartime(
                    self.get_pintuan_end_time(begin_time_timestamp,
                                              doc.get('leftTimeOrg', ''))),
            },
            'all_sell_count':
            str(doc.get('salesVolume', 0)),
        } for doc in docs]

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the group-buy end timestamp from the remaining-time text.

        Each unit is read independently, so any combination of days ('天'),
        hours ('小时') and minutes ('分') works, e.g. '6天13小时',
        '13小时57分', '36分', '6天'. Text without any of these units falls back
        to treating its first number (if any) as minutes; empty text adds 0.

        :param begin_time: start timestamp in seconds
        :param left_time: remaining-time text such as '6天13小时'
        :return: end timestamp in seconds (begin_time + remaining seconds)
        '''
        text = str(left_time or '')

        remaining_seconds = 0
        matched_any_unit = False
        for unit, unit_seconds in (('天', 86400), ('小时', 3600), ('分', 60)):
            found = re.search(r'(\d+)\s*' + unit, text)
            if found is not None:
                remaining_seconds += int(found.group(1)) * unit_seconds
                matched_any_unit = True

        if not matched_any_unit:
            print('left_time = ', left_time)
            bare_number = re.search(r'\d+', text)
            if bare_number is not None:
                remaining_seconds = int(bare_number.group()) * 60

        return begin_time + remaining_seconds

    def is_recent_time(self, timestamp):
        '''
        Classify an end timestamp relative to the current Shanghai time.

        :param timestamp: end timestamp in seconds
        :return: 0: ended more than 24 hours ago (to be deleted)
                 1: end time is still in the future (to be updated)
                 2: ended within the last 24 hours (kept, not updated)
        '''
        time_1 = int(timestamp)
        time_2 = int(datetime_to_timestamp(get_shanghai_time()))  # current timestamp

        diff_time = time_1 - time_2
        if diff_time < -86400:
            # 24 hours of slack so the back office can take the goods down first
            return 0

        elif diff_time > 0:
            return 1

        else:
            # Expired, but still inside the 24-hour waiting window
            return 2

    def __del__(self):
        try:
            del self.my_phantomjs
        except AttributeError:
            pass
        gc.collect()
Exemplo n.º 19
0
    def run_forever(self):
        '''
        Run one refresh pass over the MoGuJie group-buy (pintuan) goods
        stored in the database.

        Rows are read with mg_select_str_2 (after running mg_delete_str_2).
        As used below, each row is indexed as:
            item[0] -> goods_id
            item[1] -> JSON string holding 'begin_time' / 'end_time'
            item[2] -> value sent as the listing URL's fcid parameter
            item[3] -> value sent as the listing URL's page parameter

        For every row the record is deleted, skipped or refreshed depending
        on self.is_recent_time(end_time). After the pass the method sleeps
        (5.5 hours when the Shanghai hour is 0, otherwise 5 seconds).

        NOTE(review): despite the name this method contains no outer loop;
        presumably the caller invokes it repeatedly - confirm.
        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
            result = list(
                tmp_sql_server._select_table(sql_str=mg_select_str_2))
        except TypeError:
            # The printed message attributes a TypeError here to a failed
            # database connection (possibly under maintenance).
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            # 1-based counter of processed rows; drives the periodic resets below.
            index = 1

            self.my_phantomjs = MyPhantomjs(
                executable_path=PHANTOMJS_DRIVER_PATH)
            for item in result:  # refresh the rows one by one
                # Convert the stored 'end_time' string into an integer timestamp
                # (first 10 characters of the float returned by time.mktime).
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                # Only receives 'goods_id' further down; it is not read again
                # inside this method.
                data = {}
                # A fresh parser object is created for every row.
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:
                    # Every 8th row: drop the current PhantomJS wrapper and
                    # create a new one.
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)

                if index % 50 == 0:  # reconnect every 50 rows so one long-lived connection does not stop responding
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        # 0: the printed message calls the goods expired; the
                        # record is deleted.
                        # NOTE(review): (item[0]) is a parenthesised value, not
                        # a 1-tuple - confirm _delete_table accepts a bare value.
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 拼团开始时间为(%s), 删除成功!' %
                            json.loads(item[1]).get('begin_time'))

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # leave the loop
                        pass  # must be pass, not break: the goods_ids returned by the database are not all in order

                    else:  # any other value (the original note says 1): within the window that should be refreshed
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]

                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)

                        # requests cannot fetch this data (certificate
                        # verification is involved), so PhantomJS is used directly
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                            url=tmp_url)
                        # print(body)

                        if body == '':
                            print('获取到的body为空值! 此处跳过')

                        else:
                            try:
                                # The JSON payload is taken from the first
                                # <pre>...</pre> element of the rendered page.
                                body = re.compile(
                                    r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                # NOTE(review): any failure here leaves tmp_data
                                # empty, which takes the "docs is empty" branch
                                # below and deletes the record.
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}

                            if tmp_data.get('result',
                                            {}).get('wall',
                                                    {}).get('docs', []) == []:
                                # No docs in the listing: the goods is treated
                                # as removed from the activity and deleted.
                                print('得到的docs为[]!')
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0]))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass

                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                # pprint(tmp_item_list)

                                begin_time_timestamp = int(
                                    time.time())  # "now" is used as the group-buy begin timestamp
                                # One summary dict per listed doc; the loop
                                # variable is local to the comprehension and
                                # does not rebind the outer `item`.
                                item_list = [{
                                    'goods_id':
                                    item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time':
                                        timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time':
                                        timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count':
                                    str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # pprint(item_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in item_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                # The bare string below is the author's note:
                                # goods missing from the listing are "delisted
                                # internally" but still on sale, so only the
                                # goods info is refreshed, not the listing /
                                # delisting times.
                                '''
                                内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:
                                    # An earlier version deleted the record
                                    # here; that code is disabled and the goods
                                    # data is refreshed instead.
                                    mogujie_pintuan.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data(
                                    )

                                    if goods_data == {}:
                                        pass
                                    else:
                                        # normalise the record before saving
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        goods_data['goods_id'] = item[0]
                                        goods_data[
                                            'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])

                                        # pprint(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle

                                else:  # still listed
                                    for item_2 in item_list:
                                        if item_2.get('goods_id',
                                                      '') == item[0]:
                                            mogujie_pintuan.get_goods_data(
                                                goods_id=item[0])
                                            goods_data = mogujie_pintuan.deal_with_data(
                                            )

                                            if goods_data == {}: pass
                                            else:
                                                # normalise the record and attach
                                                # the group-buy times and sales
                                                # count from the listing entry
                                                goods_data['goods_id'] = item[
                                                    0]
                                                goods_data[
                                                    'price_info_list'] = _get_mogujie_pintuan_price_info_list(
                                                        goods_data[
                                                            'price_info_list'])
                                                goods_data[
                                                    'pintuan_time'] = item_2.get(
                                                        'pintuan_time', {})
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                                goods_data[
                                                    'all_sell_count'] = item_2.get(
                                                        'all_sell_count', '')

                                                # pprint(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)
                                                sleep(
                                                    MOGUJIE_SLEEP_TIME)  # throttle

                                        else:
                                            pass

                else:
                    print('数据库连接失败,此处跳过!')
                    pass

                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
            if get_shanghai_time().hour == 0:  # no updates after midnight: pause for 5.5 hours
                sleep(60 * 60 * 5.5)
            else:
                sleep(5)
            gc.collect()
Exemplo n.º 20
0
class JuMeiYouPinSpike(object):
    '''
    Crawler for the JuMeiYouPin (h5.jumei.com) flash-sale listings.

    get_spike_hour_goods_info() collects the goods ids from the on-sale and
    the pre-sale listing; deal_with_data() then fetches the details of every
    goods id that is not yet in the database and inserts them.
    '''

    # Paged listing endpoint: {0} = page, {1} = listing type, {2} = page_key.
    _LIST_URL_TEMPLATE = (
        'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}'
        '&platform=wap&type={1}&page_key={2}')

    # (listing type, page_key, banner printed before crawling), in crawl order.
    _LISTINGS = (
        ('formal', '1521336720', '开始抓取在售商品...'),
        ('pre', '1521858480', '开始抓取预售商品...'),
    )

    # Pages 1 .. _MAX_PAGE - 1 are requested for each listing.
    _MAX_PAGE = 50

    def __init__(self):
        self._set_headers()

    def _set_headers(self):
        '''
        Build the HTTP headers sent with every listing request.
        :return: None
        '''
        self.headers = {
            'Accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            # 'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'h5.jumei.com',
            'Referer': 'https://h5.jumei.com/',
            'Cache-Control': 'max-age=0',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': get_random_pc_ua(),  # picked by get_random_pc_ua()
        }

    def get_spike_hour_goods_info(self):
        '''
        Crawl the on-sale and pre-sale flash-sale listings and hand the
        collected goods to deal_with_data().
        :return: False when no session cookies could be obtained, else True
        '''
        cookies = self._fetch_session_cookies()
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        all_goods_list = []
        # item_ids already collected; shared by both listings so that a goods
        # item present in both is kept only once (first occurrence wins).
        seen_item_ids = set()
        for goods_type, page_key, banner in self._LISTINGS:
            print(banner)
            self._crawl_listing(goods_type, page_key, all_goods_list,
                                seen_item_ids)

        all_goods_list = self._to_goods_records(all_goods_list)
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', len(all_goods_list))

        self.deal_with_data(all_goods_list)

        return True

    def _fetch_session_cookies(self):
        '''
        Open https://h5.jumei.com/ in a PhantomJS session and return its
        cookies.
        :return: whatever get_url_cookies_from_phantomjs_session returns
                 (the caller treats '' as failure)
        '''
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        try:
            return self.my_phantomjs.get_url_cookies_from_phantomjs_session(
                url='https://h5.jumei.com/')
        finally:
            # The wrapper is only needed for the cookies; drop the reference
            # even when the call above raises.
            try:
                del self.my_phantomjs
            except AttributeError:
                pass

    def _crawl_listing(self, goods_type, page_key, all_goods_list,
                       seen_item_ids):
        '''
        Page through one listing until a page comes back without items.
        :param goods_type: value of the URL's type parameter ('formal'/'pre')
        :param page_key: value of the URL's page_key parameter
        :param all_goods_list: list that receives the new raw items (mutated)
        :param seen_item_ids: set of item_ids collected so far (mutated)
        :return: None
        '''
        for page in range(1, self._MAX_PAGE):
            tmp_url = self._LIST_URL_TEMPLATE.format(page, goods_type,
                                                     page_key)
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)

            this_page_item_list = self._parse_item_list(body)
            if not this_page_item_list:
                # An empty page (or an unparsable body) ends this listing.
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break

            self._append_unseen_items(this_page_item_list, page,
                                      all_goods_list, seen_item_ids)
            sleep(.5)

    @staticmethod
    def _parse_item_list(body):
        '''
        Extract the 'item_list' array from a listing response body.
        :param body: response text
        :return: list of item dicts; [] when the body is not a JSON object
                 or carries no items
        '''
        try:
            json_body = json.loads(body)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string body.
            json_body = None
        if not isinstance(json_body, dict):
            print('json.loads转换body时出错!请检查')
            return []

        # `or []` also covers an explicit null item_list.
        return json_body.get('item_list', []) or []

    @staticmethod
    def _append_unseen_items(page_items, page, all_goods_list, seen_item_ids):
        '''
        Append the items whose item_id has not been collected yet, tagging
        each with the page it was found on.
        :param page_items: raw item dicts of one listing page
        :param page: page number stored under the item's 'page' key
        :param all_goods_list: list that receives the new items (mutated)
        :param seen_item_ids: set of item_ids collected so far (mutated)
        :return: None
        '''
        for item in page_items:
            item_id = item.get('item_id', '')
            if item_id in seen_item_ids:
                continue
            seen_item_ids.add(item_id)
            item['page'] = page
            all_goods_list.append(item)

    @staticmethod
    def _to_goods_records(raw_items):
        '''
        Reduce raw listing items to the fields deal_with_data() needs.
        Items whose item_id is missing or None are dropped.
        :param raw_items: raw item dicts tagged with 'page'
        :return: list of {'goods_id', 'type', 'page'} dicts
        '''
        return [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page'),
        } for item in raw_items if item.get('item_id') is not None]

    def deal_with_data(self, *params):
        '''
        Fetch and store the flash-sale goods that are not yet in the database.
        :param params: params[0] is the list of {'goods_id', 'type', 'page'}
                       dicts to process
        :return: None
        '''
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            # First column of every row returned for jm_select_str_2; a set
            # keeps the membership test below cheap.
            db_goods_ids = {
                row[0]
                for row in my_pipeline._select_table(sql_str=jm_select_str_2)
            }

            for item in item_list:
                if item.get('goods_id', '') in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue
                self._store_goods(item, my_pipeline)

        else:
            print('数据库连接失败,此处跳过!')

        gc.collect()

    def _store_goods(self, item, my_pipeline):
        '''
        Fetch one goods item, and insert it unless the parser returns no data
        or flags it with is_delete == 1.
        :param item: {'goods_id', 'type', 'page'} dict
        :param my_pipeline: connected SqlServerMyPageInfoSaveItemPipeline
        :return: None
        '''
        jumei = JuMeiYouPinParse()
        goods_id = item.get('goods_id', '')
        goods_type = item.get('type', '')
        tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(
            goods_id, goods_type)
        jumei.get_goods_data(goods_id=[goods_id, goods_type])
        goods_data = jumei.deal_with_data()

        if goods_data == {}:
            return

        if goods_data.get('is_delete', 0) == 1:
            print('------>>>| 该商品库存为0,已被抢光!')
            return

        # Complete the record, then insert it.
        goods_data['goods_url'] = tmp_url
        goods_data['goods_id'] = str(goods_id)
        goods_data['miaosha_time'] = {
            'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
            'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
        }
        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
            get_miaosha_begin_time_and_miaosha_end_time(
                miaosha_time=goods_data['miaosha_time'])
        goods_data['page'] = item.get('page')

        jumei.insert_into_jumeiyoupin_xianshimiaosha_table(
            data=goods_data, pipeline=my_pipeline)
        sleep(JUMEIYOUPIN_SLEEP_TIME)  # throttle between inserts

    def __del__(self):
        gc.collect()
Exemplo n.º 21
0
 def __init__(self):
     '''
     Set up the request headers and create the MyPhantomjs instance that is
     kept on self.my_phantomjs.
     '''
     self._set_headers()
     phantomjs_wrapper = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
     self.my_phantomjs = phantomjs_wrapper
Exemplo n.º 22
0
class WMYHQSpider(object):
    '''
    Spider for the coupon articles served by app.quanmama.com.

    It pages through the app's article list, keeps the articles that have not
    timed out, opens each article link with PhantomJS and hands the QR-code
    image found on the page to save_base64_img_2_local, which is asked to
    store it under qrcode_base_path.
    '''

    # CJK / full-width punctuation -> ASCII. '/' becomes '|' because the
    # cleaned titles are later used inside a file name.
    _PUNCTUATION_TABLE = str.maketrans({
        '\uff1a': ':',  # full-width colon
        '\u3001': ',',  # ideographic comma
        '\uff0c': ',',  # full-width comma
        '/': '|',
    })

    def __init__(self):
        self._set_headers()
        self.page_sleep_time = 1.2          # pause between pages / articles
        self.phantomjs_sleep_time = 2       # sleep({0}) value injected into exec_code
        self.my_phantomjs = MyPhantomjs(load_images=True)   # images are only loaded when load_images is True
        self.qrcode_base_path = '/Users/afa/myFiles/tmp/外卖券qrcode/'

    def _set_headers(self):
        '''
        Build the HTTP headers sent with the article-list POST request.
        :return: None
        '''
        self.headers = {
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Encoding': 'br, gzip, deflate',
            'Host': 'app.quanmama.com',
            'User-Agent': get_random_phone_ua(),
            'Content-Length': '885',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        }

    def _get_wm_page_info(self):
        '''
        Fetch article-list pages 1-4, parse the rows and process the result
        with _deal_with_wm_info().
        :return: None
        '''
        all_rows = []
        print('开始采集券妈妈外卖券!')
        url = 'https://app.quanmama.com/apios/v5/appZdmList.ashx'
        for page_index in range(1, 5):
            print('正在抓取第{0}页...'.format(page_index))
            data = self._set_data(page_index=page_index)

            # No cookies are sent with the request.
            body = MyRequests.get_url_body(method='post', url=url, headers=self.headers, cookies=None, data=data)
            if body == '':
                print('获取到的body为空值!此处跳过!')
                continue

            rows = json_2_dict(json_str=body).get('data', {}).get('rows', [])
            if rows == []:
                print('得到的rows为空值!此处跳过!')
                continue

            all_rows += rows
            sleep(self.page_sleep_time)

        print('\n@@@@@@ 抓取完毕!')
        wm_list = self._parse_wm_page(all_rows)

        self._deal_with_wm_info(wm_list)

    def _deal_with_wm_info(self, wm_list):
        '''
        Save the QR code of every article in wm_list.
        :param wm_list: records produced by _parse_wm_page()
        :return: None
        '''
        # Clear the previous run's images first.
        # NOTE(review): this builds a shell command from qrcode_base_path; the
        # path must not be empty or contain shell metacharacters.
        os.system('cd {0} && rm -rf *'.format(self.qrcode_base_path))
        for item in wm_list:
            print('正在处理文章id: {0}'.format(item.get('article_id')))

            # Code passed to use_phantomjs_to_get_url_body as exec_code; as
            # written it sends ENTER to the 'div.go-action a' link and sleeps.
            exec_code = '''
            self.driver.find_element_by_css_selector('div.go-action a').send_keys(Keys.ENTER)
            sleep({0})
            '''.format(self.phantomjs_sleep_time)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                url=item.get('article_link', ''),
                exec_code=exec_code)

            qrcode_str = Selector(text=body).css('div.appcoupon-qrcode img::attr("src")').extract_first()

            img_file_name = '[代码{0}]'.format(item.get('article_id', '')) + \
                            item.get('article_title', '') + '@' + \
                            item.get('article_vicetitle', '') + '.png'
            save_path = self.qrcode_base_path + img_file_name
            # extract_first() returns None when the page has no QR image;
            # count that as a failed save instead of passing None on.
            result = qrcode_str is not None and save_base64_img_2_local(
                save_path=save_path, base64_img_str=qrcode_str)
            if result:
                print('[+] {0}'.format(img_file_name))
            else:
                print('[-] {0}'.format(img_file_name))

            sleep(self.page_sleep_time)

        print('@@@ 抓取二维码操作完成!')

        return None

    def _parse_wm_page(self, rows):
        '''
        Turn raw article rows into validated records.

        Rows that have timed out are skipped silently; rows that fail
        validation (or raise while being read) are reported and skipped.
        :param rows: list of raw article dicts
        :return: list of record dicts
        '''
        records = []
        for item in rows:
            try:
                record = self._parse_wm_row(item)
            except Exception as e:
                # Per-row boundary: report the problem and keep going.
                print('遇到错误:', e)
                continue

            if record is not None:
                records.append(record)

        return records

    def _parse_wm_row(self, item):
        '''
        Validate one raw article row.

        Explicit checks are used instead of assert so that validation still
        runs under "python -O".
        :param item: raw article dict
        :return: record dict, or None when the article has timed out
        :raises ValueError: when a required field is missing or empty
        '''
        article_is_timeout = item.get('article_is_timeout')
        if article_is_timeout is None:
            raise ValueError('article_is_timeout为None!')
        if article_is_timeout == 1:     # 0: not timed out; 1: timed out
            return None

        article_id = item.get('article_id')
        if article_id is None:
            raise ValueError('article_id为空!')

        article_mall = item.get('article_mall', '')
        if article_mall == '':
            raise ValueError('article_mall为空值!')

        article_pic = item.get('article_pic', '')
        if article_pic == '':
            raise ValueError('article_pic为空值!')

        article_vicetitle = self.replace_chinese_str(item.get('article_vicetitle', ''))
        if article_vicetitle == '':
            raise ValueError('article_vicetitle为空值!')

        article_title = self.replace_chinese_str(item.get('article_title', ''))
        if article_title == '':
            raise ValueError('article_title为空值!')

        article_link = item.get('article_link', '')
        if article_link == '':
            raise ValueError('article_link为空值!')

        article_begin_time = item.get('article_begintime', '')
        if article_begin_time == '':
            raise ValueError('article_begin_time为空值!')

        article_end_time = item.get('article_endtime', '')
        if article_end_time == '':
            raise ValueError('article_end_time为空值!')

        return {
            'article_id': article_id,                   # article id
            'article_mall': article_mall,               # original note: time since publication - TODO confirm
            'article_pic': article_pic,                 # article thumbnail
            'article_vicetitle': article_vicetitle,     # article subtitle
            'article_title': article_title,             # article title
            'article_link': article_link,               # article link
            'article_begin_time': article_begin_time,   # activity begin time
            'article_end_time': article_end_time,       # activity end time
        }

    def replace_chinese_str(self, data):
        '''
        Replace CJK / full-width punctuation with ASCII and '/' with '|'.
        :param data: text to clean
        :return: cleaned text
        '''
        return data.translate(self._PUNCTUATION_TABLE)

    def _set_data(self, page_index):
        '''
        Build the form fields of the article-list POST request.
        :param page_index: page number sent as 'pageindex'
        :return: list of (name, value) tuples
        '''
        # 'imei', 'userphonename' and 'usertoken' were commented out in the
        # original field list and are not sent.
        data = [
            ('AgeType', '2'),
            ('ProfessionType', '2'),
            ('SexType', '1'),
            ('appname', '券妈妈'),
            ('category', '5391'),
            ('code', '532'),
            ('devicename', 'iOS'),
            ('f', 'ios'),
            ('identifiernumber', 'F037B84D-A211-44B3-BA56-D5033A1328D4'),
            ('isiosmajia', '0'),
            ('localScheme', 'qmm'),
            ('logintype', '4'),
            ('mac', '02:00:00:00:00:00'),
            ('net', '2'),
            ('pageindex', str(page_index)),
            ('phonemodel', 'iPhone'),
            ('phoneversion', '11.0'),
            ('platform', 'App Store'),
            ('rtime', '0_'),
            ('sort', '1'),
            ('test', '0'),
            ('v', '5.3.2'),
        ]

        return data

    def __del__(self):
        try:
            del self.my_phantomjs
        except AttributeError:
            # __init__ did not get as far as creating the wrapper.
            pass
        gc.collect()
Exemplo n.º 23
0
    def get_spike_hour_goods_info(self):
        '''
        Crawl the on-sale and pre-sale flash-sale listings of h5.jumei.com
        and hand the collected goods to self.deal_with_data().
        :return: False when no session cookies could be obtained, else True
        '''
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        try:
            cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
                url='https://h5.jumei.com/')
        finally:
            # The wrapper is only needed for the cookies; drop the reference
            # even when the call above raises.
            try:
                del self.my_phantomjs
            except AttributeError:
                pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        all_goods_list = []
        # item_ids already collected; shared by both listings so that a goods
        # item present in both is kept only once (first occurrence wins).
        seen_item_ids = set()

        def crawl_listing(goods_type, page_key):
            # Page through one listing (pages 1-49) until a page comes back
            # without items, appending every not-yet-seen item.
            for page in range(1, 50):
                tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type={1}&page_key={2}'.format(
                    page, goods_type, page_key)
                print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
                body = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers)

                try:
                    json_body = json.loads(body)
                except (TypeError, ValueError):
                    # json.JSONDecodeError is a ValueError; TypeError covers
                    # a non-string body.
                    json_body = None
                if not isinstance(json_body, dict):
                    print('json.loads转换body时出错!请检查')
                    json_body = {}

                # `or []` also covers an explicit null item_list.
                this_page_item_list = json_body.get('item_list', []) or []
                if not this_page_item_list:
                    # An empty page (or an unparsable body) ends this listing.
                    print('@@@@@@ 所有接口数据抓取完毕 !')
                    break

                for item in this_page_item_list:
                    item_id = item.get('item_id', '')
                    if item_id in seen_item_ids:
                        continue
                    seen_item_ids.add(item_id)
                    item['page'] = page
                    all_goods_list.append(item)

                sleep(.5)

        print('开始抓取在售商品...')
        crawl_listing('formal', '1521336720')

        print('开始抓取预售商品...')
        crawl_listing('pre', '1521858480')

        # Keep only the fields deal_with_data() needs; items whose item_id is
        # missing or None are dropped.
        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page'),
        } for item in all_goods_list if item.get('item_id') is not None]
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', len(all_goods_list))

        self.deal_with_data(all_goods_list)

        return True
Exemplo n.º 24
0
 def __init__(self):
     '''
     Run the parent initialiser, set up the request headers, reset the
     parsing state and create the MyPhantomjs instance kept on
     self.my_phantomjs.
     '''
     super().__init__()
     self._set_headers()
     # Parsing state: no result yet, goods not flagged as an activity goods.
     self.result_data = dict()
     self.is_activity_goods = False
     phantomjs_wrapper = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
     self.my_phantomjs = phantomjs_wrapper
class BaseFund(object):
    def __init__(self, base_path='/Users/afa/myFiles/tmp/基金/伪好基/'):
        '''
        Configure the crawl range, the pacing and the output directory.

        :param base_path: directory prefix under which fund charts are saved
        '''
        self.base_path = base_path
        # Ranking pages crawled: start is inclusive, end is exclusive
        # (both are fed straight into range()).
        self.page_num_start, self.page_num_end = 1, 3
        # Seconds slept after each fund has been processed.
        self.CRAWL_FUND_TIME = 1.5
        self.plot_pic = None
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_PATH)

    def _get_rank_fund_info(self):
        '''
        得到天天基金全部基金的rank_fund
        :return: a list
        '''
        rank_fund_list = []
        for page_num in range(self.page_num_start, self.page_num_end):
            print('正在抓取第{0}页的基金信息...'.format(page_num))
            cookies = {
                'st_pvi':
                '11586003301354',
                'EMFUND1':
                'null',
                'EMFUND0':
                'null',
                'EMFUND2':
                '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
                'EMFUND3':
                '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
                'EMFUND4':
                '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
                'EMFUND5':
                '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
                'EMFUND6':
                '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
                'EMFUND7':
                '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
                'st_si':
                '38764934559714',
                'ASP.NET_SessionId':
                'hqeo1xk5oqgwb0cqzxicytda',
                'EMFUND8':
                '07-11 11:28:55@#$%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
                'EMFUND9':
                '07-11 11:28:55@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
            }

            headers = {
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'Accept': '*/*',
                # 'Referer': 'http://fund.eastmoney.com/data/fundranking.html',
                'Proxy-Connection': 'keep-alive',
            }

            end_date = str(get_shanghai_time())[:10]
            start_date = str(
                datetime.datetime(year=get_shanghai_time().year - 1,
                                  month=get_shanghai_time().month,
                                  day=get_shanghai_time().day))[:10]
            print('开始时间: {0}, 结束时间: {1}'.format(start_date, end_date))

            params = (
                ('op', 'ph'),
                ('dt', 'kf'),
                ('ft', 'all'),
                ('rs', ''),
                ('gs', '0'),
                ('sc', 'zzf'),
                ('st', 'desc'),
                ('sd', start_date),  # '2017-07-10'
                ('ed', end_date),  # '2018-07-10'
                ('qdii', ''),
                ('tabSubtype', ',,,,,'),
                ('pi', str(page_num)),  # rank_data的页码
                ('pn', '50'),
                ('dx', '1'),
                # ('v', '0.5290053467389759'),
            )

            url = 'http://fund.eastmoney.com/data/rankhandler.aspx'

            # TODO 常规requests被502
            # body = MyRequests.get_url_body(url=url, headers=headers, params=params, cookies=None)
            # print(body)

            # 用phantomjs
            body = self.my_phantomjs.get_url_body(
                url=_get_url_contain_params(url, params))

            try:
                body = re.compile('<body>(.*)</body>').findall(body)[0]
                this_page_rank_data = re.compile(r'rankData = (.*);').findall(
                    body)[0]
                # print(this_page_rank_data)
            except IndexError:
                print('在获取this_page_rank_data时索引异常!请检查!')
                continue

            # 报错: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
            # 解决方案: 用demjson处理下
            this_page_rank_data = demjson.decode(this_page_rank_data).get(
                'datas', {})
            # pprint(this_page_rank_data)
            if this_page_rank_data == {}:
                return []

            for item in this_page_rank_data:
                _i = item.split(',')
                rank_fund_list.append({
                    '基金代码': _i[0],
                    '基金简称': _i[1],
                    '当天日期': _i[3],
                    '单位净值': _i[4],
                    '累计净值': _i[5],
                    '日增长率': _i[6],
                    '近1周': _i[7],
                    '近1月': _i[8],
                    '近3月': _i[9],
                    '近6月': _i[10],
                    '近1年': _i[11],
                    '近2年': _i[12],
                    '近3年': _i[13],
                    '今年来': _i[14],
                    '成立来': _i[15],
                    '手续费': _i[20],
                })

            sleep(2.5)

        print('\n抓取完毕!\n')

        # pprint(rank_fund_list)

        return rank_fund_list

    def _deal_with_rank_fund_info(self):
        '''
        处理rank_fund_info
        :return:
        '''
        rank_fund_list = self._get_rank_fund_info()

        for item in rank_fund_list:
            fund_code = item.get('基金代码', '')
            print('正在处理基金代码: {0}...'.format(fund_code))
            self._get_one_fund_info(fund_code=fund_code)
            sleep(self.CRAWL_FUND_TIME)

        print('\n@@@ 所有操作完成!\n')

        return True

    def _get_one_fund_info(self, fund_code):
        '''
        Download the data script of one fund and hand it to the parser.

        The script is loaded through ``self.my_phantomjs``. The cookies and
        headers that an earlier requests-based version prepared were never
        passed to that call, so they are no longer built here.

        :param fund_code: fund code inserted into the pingzhongdata URL
        :return: True (always)
        '''
        # Time parameter 'v': the current Shanghai time with '-', ' ' and ':'
        # removed, e.g. 2018-07-10 18:30:46 -> 20180710183046
        v = re.compile(r'-| |:').sub('', str(get_shanghai_time()))
        params = (
            ('v', v),
        )

        fund_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(
            fund_code)

        body = self.my_phantomjs.get_url_body(
            url=_get_url_contain_params(fund_url, params))
        self._get_this_fund_info(body=body)

        return True

    def _get_this_fund_info(self, body):
        try:
            # 基金名
            fund_name = re.compile(r'fS_name = "(.*?)";').findall(body)[0]
            # 基金代码
            fund_code = re.compile(r'fS_code = "(.*?)";').findall(body)[0]
            print('基金名: {0}, 基金代码: {1}'.format(fund_name, fund_code))

            # 购买手续费
            fund_source_rate = re.compile(r'fund_sourceRate="(.*?)";').findall(
                body)[0]
            # 现费率
            fund_rate = re.compile('fund_Rate="(.*?)";').findall(body)[0]
            # 最小起购金额
            fund_minsg = re.compile(r'fund_minsg="(.*?)";').findall(body)[0]
            print('购买手续费: {0}%, 现费率: {1}%, 最小起购金额: {2}RMB'.format(
                fund_source_rate, fund_rate, fund_minsg))
            '''收益率'''
            # 近一年收益率
            syl_1n = re.compile(r'syl_1n="(.*?)";').findall(body)[0]
            # 近6月收益率
            syl_6y = re.compile(r'syl_6y="(.*?)";').findall(body)[0]
            # 近三月收益率
            syl_3y = re.compile(r'syl_3y="(.*?)";').findall(body)[0]
            # 近一月收益率
            syl_1y = re.compile(r'syl_1y="(.*?)";').findall(body)[0]
            msg = '@@收益率:\n\t近1年: {0}%, 近6月: {1}%, 近3月: {2}%, 近1月: {3}%'.format(
                syl_1n, syl_6y, syl_3y, syl_1y)
            print(msg)

            # 单位净值走势 equityReturn-净值回报 unitMoney-每份派送金
            data_net_worth_trend = json_2_dict(
                re.compile(r'Data_netWorthTrend = (.*?);').findall(body)[0])
            # pprint(data_net_worth_trend)
            # print('单位净值走势: {0}'.format(data_net_worth_trend))
            self._deal_with_data_net_worth_trend(
                fund_name=fund_name,
                fund_code=fund_code,
                data_net_worth_trend=data_net_worth_trend)

            # 累计净值走势
            data_ac_worth_trend = json_2_dict(
                re.compile(r'Data_ACWorthTrend = (.*?);').findall(body)[0])
            # pprint(data_ac_worth_trend)
            # print('累计净值走势: {0}'.format(data_ac_worth_trend))

            # 累计收益率走势
            data_grand_total = json_2_dict(
                re.compile(r'Data_grandTotal = (.*?);').findall(body)[0])
            # print('累计收益率走势: {0}'.format(data_grand_total))

            # 同类排名走势
            data_rate_in_similar_type = json_2_dict(
                re.compile(r'Data_rateInSimilarType = (.*?);').findall(body)
                [0])
            # print('同类排名走势: {0}'.format(data_rate_in_similar_type))

            # 同类排名百分比
            data_rate_in_similar_persent = json_2_dict(
                re.compile(r'Data_rateInSimilarPersent=(.*?);').findall(body)
                [0])
            # print('同类排名百分比: {0}'.format(data_rate_in_similar_persent))

            # 同类型基金涨幅榜(页面底部通栏)
            swith_same_type = json_2_dict(
                re.compile(r'swithSameType = (.*?);').findall(body)[0])
            # print('同类型基金涨幅榜: {0}'.format(swith_same_type))

        except IndexError as e:
            print(e)

        return None

    def _deal_with_data_net_worth_trend(self, **kwargs):
        '''
        处理data_net_worth_trend(单位净值走势), 并成像
        :param fund_name:
        :param fund_code:
        :param data_net_worth_trend:
        :return:
        '''
        fund_name = kwargs.get('fund_name')
        fund_code = kwargs.get('fund_code')
        data_net_worth_trend = kwargs.get('data_net_worth_trend', [])

        [
            item.update(
                {'x': str(timestamp_to_regulartime(str(item.get('x'))[:10]))})
            for item in data_net_worth_trend
        ]
        print('时间格式转换成功!')
        # pprint(data_net_worth_trend)

        x = [item.get('x') for item in data_net_worth_trend]
        y = [item.get('y') for item in data_net_worth_trend]
        '''绘图'''
        self.plot_pic = self._drawing(fund_name=fund_name,
                                      fund_code=fund_code,
                                      x=x,
                                      y=y)

        try:
            del self.plot_pic
        except:
            pass
        gc.collect()

        return True

    def _drawing(self, **kwargs):
        '''
        Plot one fund's unit-net-worth series and save it as a PNG.

        The image is written to ``self.base_path`` + '<name>(代码<code>).png',
        replacing any existing file. The figure is closed afterwards so that
        figures do not pile up when many funds are processed.

        :param kwargs: fund_name, fund_code, x (date strings), y (values)
        :return: the value returned by ``plt.plot`` for the drawn line
        '''
        import matplotlib.pyplot as plt
        from random import randint

        fund_name = kwargs.get('fund_name')
        fund_code = kwargs.get('fund_code')
        x = kwargs.get('x')
        y = kwargs.get('y')

        # A dedicated Figure per chart keeps charts from being drawn on top
        # of each other.
        figure_num = randint(1, 10000)
        plt.figure(figure_num)
        try:
            # Font file used for the Chinese title, labels and legend.
            font = FontProperties(fname='/Library/Fonts/Songti.ttc', size=10)

            plt.title('{0}(代码{1})的单位净值走势图'.format(fund_name, fund_code),
                      fontproperties=font,
                      fontsize=15)
            plt.xlabel('日期', fontproperties=font)
            plt.ylabel('单位净值', fontproperties=font)

            # The grid is left off: it is too dense to be readable.

            # One tick position per data point; most labels are blank
            # (see _get_x_axis_label).
            x_axis_label = self._get_x_axis_label(x)
            plt.xticks(arange(len(x_axis_label)),
                       x_axis_label,
                       rotation=30,
                       fontsize=5)

            plot_pic = plt.plot(
                x,
                y,
                marker='.',
                markerfacecolor='r',
                markersize=1,  # size of the point markers
                linewidth=.4,
                color='#7EB6EA'  # line colour
            )

            # The legend must be created after the line exists, otherwise it
            # has nothing to label.
            plt.legend(['单位:元'], loc=1, prop=font)

            pic_file_name = '{0}(代码{1}).png'.format(fund_name, fund_code)
            pic_path = self.base_path + pic_file_name
            if os.path.exists(pic_path):
                # Remove the previous image before saving the new one.
                os.remove(pic_path)

            savefig(fname=pic_path, dpi=400)  # dpi controls the resolution
            print('[+] {0} 保存完毕!'.format(pic_file_name))
        finally:
            # Release the figure even when plotting or saving failed.
            plt.close(figure_num)

        return plot_pic

    def _get_x_axis_label(self, x):
        '''
        得到x轴的刻度list
        :param x:
        :return: list
        '''
        now_time = datetime.datetime.now()
        x_axis_label = []
        for _x in x:
            if _x is not None and month_differ(
                    now_time, string_to_datetime(_x)) % 6 == 0:
                if str(_x)[:7] in x_axis_label:  # 如果已存在append('')
                    x_axis_label.append('')
                else:
                    x_axis_label.append(str(_x)[:7])
            else:
                x_axis_label.append('')

        return x_axis_label

    def _get_y_axis_label(self, y):
        '''
        得到y轴的刻度list
        :param y:
        :return:
        '''
        y_step = .1
        y_axis_label = [
            _y for _y in arange(min(y) - y_step,
                                max(y) + y_step, y_step)
        ]

        return y_axis_label

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
Exemplo n.º 26
0
class YanXuanParse(object):
    def __init__(self, logger=None):
        '''
        Prepare headers, logger, result state and the phantomjs wrapper.

        :param logger: logger to use; None lets _set_logger create one
        '''
        super().__init__()
        self.result_data = {}
        self._set_headers()
        # The logger has to exist before it is handed to MyPhantomjs.
        self._set_logger(logger)
        self.my_phantomjs = MyPhantomjs(
            logger=self.my_lg,
            executable_path=PHANTOMJS_DRIVER_PATH,
        )

    def _set_logger(self, logger):
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/_/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR
            )
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''Store the default request headers on ``self.headers``.'''
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            # User-Agent is supplied by get_random_phone_ua() on every call.
            'User-Agent': get_random_phone_ua(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

    def _get_goods_data(self, goods_id):
        '''
        得到需求数据
        :param goods_id:
        :return:
        '''
        if goods_id == '':
            self.my_lg.error('获取到的goods_id为空值!此处跳过!')
            return self._get_data_error_init()

        # 网易严选m站抓取
        url = 'http://m.you.163.com/item/detail'
        params = self._get_params(goods_id=goods_id)

        m_url = url + '?id={0}'.format(goods_id)
        self.my_lg.info('------>>>| 正在抓取严选地址为: {0}'.format(m_url))

        write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url)

        '''requests被无限转发'''
        # body = MyRequests.get_url_body(url=url, headers=self.headers, params=params)
        # self.my_lg.info(str(body))

        '''改用phantomjs'''
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=_get_url_contain_params(url=url, params=params))
        if body == '':
            self.my_lg.error('获取到的body为空值!'+write_info)
            return self._get_data_error_init()

        try:
            body = re.compile('var jsonData=(.*?),policyList=').findall(body)[0]
        except IndexError:
            self.my_lg.error('获取body时索引异常!'+write_info, exc_info=True)
            return self._get_data_error_init()

        body = nonstandard_json_str_handle(json_str=body)
        # self.my_lg.info(str(body))
        _ = json_2_dict(
            json_str=body, logger=self.my_lg)
        # pprint(_)
        if _ == {}:
            self.my_lg.error('获取到的data为空dict!'+write_info)
            return self._get_data_error_init()

        _ = self._wash_data(_)
        data = {}
        try:
            data['title'] = self._wash_sensitive_info(self._get_title(data=_))
            data['sub_title'] = self._wash_sensitive_info(self._get_sub_title(data=_))
            data['shop_name'] = ''
            data['all_img_url'] = self._get_all_img_url(data=_)
            data['p_info'] = self._get_p_info(data=_)
            data['div_desc'] = self._get_div_desc(data=_)
            data['sell_time'] = self._get_sell_time(data=_)
            data['detail_name_list'] = self._get_detail_name_list(data=_.get('skuSpecList', []))
            data['price_info_list'] = self._get_price_info_list(data=_.get('skuList', []))
            data['price'], data['taobao_price'] = self._get_price_and_taobao_price(
                price_info_list=data['price_info_list']
            )
            if data['price'] == 0 or data['taobao_price'] == 0:     # 售罄商品处理
                data['is_delete'] = 1
            else:
                data['is_delete'] = self._get_is_delete(price_info_list=data['price_info_list'], data=data, other=_)

        except Exception:
            self.my_lg.error('遇到错误:', exc_info=True)
            self.my_lg.error(write_info)
            return self._get_data_error_init()

        if data != {}:
            self.result_data = data
            return data
        else:
            self.my_lg.info('data为空值')
            return self._get_data_error_init()

    def _deal_with_data(self):
        '''
        结构化数据
        :return:
        '''
        data = self.result_data
        if data != {}:
            # 店铺名称
            shop_name = data['shop_name']
            # 掌柜
            account = ''
            # 商品名称
            title = data['title']
            # 子标题
            sub_title = data['sub_title']

            # 商品标签属性名称
            detail_name_list = data['detail_name_list']

            # 要存储的每个标签对应规格的价格及其库存
            price_info_list = data['price_info_list']

            # 所有示例图片地址
            all_img_url = data['all_img_url']

            # 详细信息标签名对应属性
            p_info = data['p_info']
            # pprint(p_info)

            # div_desc
            div_desc = data['div_desc']

            is_delete = data['is_delete']

            # 上下架时间
            if data.get('sell_time', {}) != {}:
                schedule = [{
                    'begin_time': data.get('sell_time', {}).get('begin_time', ''),
                    'end_time': data.get('sell_time', {}).get('end_time', ''),
                }]
            else:
                schedule = []

            # 销售总量
            all_sell_count = ''

            # 商品价格和淘宝价
            price, taobao_price = data['price'], data['taobao_price']

            result = {
                'shop_name': shop_name,                 # 店铺名称
                'account': account,                     # 掌柜
                'title': title,                         # 商品名称
                'sub_title': sub_title,                 # 子标题
                'price': price,                         # 商品价格
                'taobao_price': taobao_price,           # 淘宝价
                # 'goods_stock': goods_stock,               # 商品库存
                'detail_name_list': detail_name_list,   # 商品标签属性名称
                # 'detail_value_list': detail_value_list,   # 商品标签属性对应的值
                'price_info_list': price_info_list,     # 要存储的每个标签对应规格的价格及其库存
                'all_img_url': all_img_url,             # 所有示例图片地址
                'p_info': p_info,                       # 详细信息标签名对应属性
                'div_desc': div_desc,                   # div_desc
                'schedule': schedule,                   # 商品特价销售时间段
                'all_sell_count': all_sell_count,       # 销售总量
                'is_delete': is_delete                  # 是否下架
            }
            # pprint(result)
            # print(result)
            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # print(json_data)
            self.result_data = {}
            return result

        else:
            self.my_lg.error('待处理的data为空的dict, 该商品可能已经转移或者下架')

            return self._get_data_error_init()

    def to_right_and_update_data(self, data, pipeline):
        '''
        Normalise ``data`` and write it to the database as an update.

        :param data: record to normalise with ``_get_right_model_data``
        :param pipeline: object whose ``_update_table_2`` runs the update
        :return: None
        '''
        tmp = _get_right_model_data(data, site_id=30, logger=self.my_lg)

        params = self._get_db_update_params(item=tmp)
        base_sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, IsPriceChange=%s, PriceChangeInfo=%s, {0} {1} where GoodsID = %s'

        # Pick the time column(s) to set; this mirrors the values that
        # _get_db_update_params puts in front of the goods id.
        if tmp['delete_time'] == '':
            time_clauses = ('shelf_time=%s', '')
        elif tmp['shelf_time'] == '':
            time_clauses = ('delete_time=%s', '')
        else:
            time_clauses = ('shelf_time=%s,', 'delete_time=%s')

        pipeline._update_table_2(
            sql_str=base_sql_str.format(*time_clauses),
            params=params,
            logger=self.my_lg,
        )

    def _get_db_update_params(self, item):
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['all_sell_count'],
            # item['delete_time'],
            item['is_delete'],
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),

            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])

        return tuple(params)

    def _wash_sensitive_info(self, target_str):
        '''
        Run ``target_str`` through ``wash_sensitive_info``.

        Three site-specific words are passed as additional sensitive strings.

        :param target_str: text to clean
        :return: the value returned by ``wash_sensitive_info``
        '''
        site_words = [
            '网易',
            '严选',
            '云音乐',
        ]

        return wash_sensitive_info(
            data=target_str,
            replace_str_list=[],
            add_sensitive_str_list=site_words,
        )

    def _get_title(self, data):
        title = data.get('name', '')
        assert title != '', '获取到的name为空值!请检查!'

        return title

    def _get_sub_title(self, data):
        sub_title = data.get('simpleDesc', '')  # 可以为空

        return sub_title

    def _get_all_img_url(self, data):
        tmp = data.get('itemDetail', {})
        first_img_url = data.get('listPicUrl', '')
        assert tmp != {}, '获取到的all_img_url为空dict!'

        all_img_url = [{
            'img_url': first_img_url
        }] if first_img_url != '' else []
        for key, value in tmp.items():
            if re.compile('picUrl').findall(key) != []:
                all_img_url.append({
                    'img_url': value,
                })

        return all_img_url

    def _get_p_info(self, data):
        p_info = [{
            'p_name': item.get('attrName', ''),
            'p_value': self._wash_sensitive_info(item.get('attrValue', '')),
        } for item in data.get('attrList', [])]

        return p_info

    def _get_div_desc(self, data):
        div_desc = data.get('itemDetail', {}).get('detailHtml', '')
        assert div_desc != '', '获取到的div_desc为空值!请检查!'
        # self.my_lg.info(str(div_desc))

        div_desc = self._wash_div_desc(div_desc)
        # print(div_desc)

        return div_desc

    def _wash_div_desc(self, div_desc):
        '''
        Rebuild the description html from its picture urls.

        All ``src="..."`` values are collected, de-duplicated in original
        order, and the first three and last two are dropped. Each remaining
        url becomes one ``<p><img .../></p>`` element.

        An earlier approach stripped fixed picture urls with a regex; it was
        abandoned because it did not filter enough.

        :param div_desc: raw description html
        :return: the rebuilt html string
        '''
        img_list = unique_list_and_keep_original_order(re.compile('src=\"(.*?)\"').findall(div_desc))
        kept_urls = img_list[3:-2:]

        return ''.join(
            '<p><img src="{0}" style=""/></p>'.format(src) for src in kept_urls)

    def _get_sell_time(self, data):
        '''
        得到上下架时间
        :param data:
        :return:
        '''
        try:
            left_time = data.get('gradientPrice', {}).get('leftTime', 0)
        except AttributeError:  # gradientPrice的值可能为''
            return {}

        if left_time == 0:
            return {}

        now_time_timestamp = datetime_to_timestamp(get_shanghai_time())
        sell_time = {
            'begin_time': timestamp_to_regulartime(now_time_timestamp),
            'end_time': timestamp_to_regulartime(now_time_timestamp + left_time),
        }

        return sell_time

    def _get_detail_name_list(self, data):
        detail_name_list = []
        for item in data:
            if item.get('name') is None:
                return []
            else:
                detail_name_list.append({
                    'spec_name': item.get('name')
                })

        return detail_name_list

    def _get_price_info_list(self, data):
        '''
        得到price_info_list
        :param data:
        :return:
        '''
        price_info_list = []
        # pprint(data)
        for item in data:
            itemSkuSpecValueList = item.get('itemSkuSpecValueList', [])
            # pprint(itemSkuSpecValueList)
            spec_value_list = [i.get('skuSpecValue', {}).get('value', '') for i in itemSkuSpecValueList]
            spec_value = '|'.join(spec_value_list)
            img_url = item.get('pic', '')    # 默认为空
            if item.get('promotionDesc', '') == '新人专享价':    # 新人专享价处理为原价
                detail_price = str(item.get('calcPrice', ''))
            else:
                detail_price = str(item.get('retailPrice', ''))          # 零售价
            normal_price = str(item.get('counterPrice', ''))         # 市场价
            account_limit_buy_count = 5
            rest_number = item.get('sellVolume', 0)        # 官方接口没有规格库存信息, 此处默认为20
            if rest_number == 0:
                continue

            price_info_list.append({
                'spec_value': spec_value,
                'img_url': img_url,
                'detail_price': detail_price,
                'normal_price': normal_price,
                'account_limit_buy_count': account_limit_buy_count,
                'rest_number': rest_number,
            })

        return price_info_list

    def _get_price_and_taobao_price(self, price_info_list):
        # pprint(price_info_list)
        if price_info_list == []:   # 售罄商品处理
            return 0, 0

        try:
            tmp_price_list = sorted([round(float(item.get('detail_price', '')), 2) for item in price_info_list])
            price = tmp_price_list[-1]  # 商品价格
            taobao_price = tmp_price_list[0]  # 淘宝价
        except IndexError:
            raise IndexError('获取price, taobao_price时索引异常!请检查!')

        return price, taobao_price

    def _get_is_delete(self, price_info_list, data, other):
        is_delete = 0
        all_rest_number = 0
        if price_info_list != []:
            for item in price_info_list:
                all_rest_number += item.get('rest_number', 0)
            if all_rest_number == 0:
                is_delete = 1
        else:
            is_delete = 1

        # 当官方下架时间< 当前时间戳 则商品已下架 is_delete = 1
        if data['sell_time'] != {}:
            end_time = datetime_to_timestamp(string_to_datetime(data.get('sell_time', {}).get('end_time', '')))
            if end_time < datetime_to_timestamp(get_shanghai_time()):
                self.my_lg.info('该商品已经过期下架...! 进行逻辑删除 is_delete=1')
                is_delete = 1
            # print(is_delete)

        if other.get('soldOut'):    # True or False
            is_delete = 1

        return is_delete

    def _get_data_error_init(self):
        '''
        Reset state after a failed fetch or parse.

        Clears ``self.result_data`` and returns a separate empty dict.
        :return: {}
        '''
        self.result_data = {}

        return {}

    def _get_params(self, goods_id):
        params = (
            ('id', goods_id),
        )

        return params

    def _wash_data(self, data):
        '''
        清洗无用数据
        :param data:
        :return:
        '''
        try:
            data['comments'] = []
            data['issueList'] = []
        except:
            pass

        return data

    def get_goods_id_from_url(self, yanxuan_url):
        '''
        得到goods_id
        :param yanxuan_url:
        :return: goods_id
        '''
        # http://you.163.com/item/detail?id=1130056&_stat_area=mod_1_item_1&_stat_id=1005000&_stat_referer=itemList
        is_yanxuan_url = re.compile(r'you.163.com/item/detail.*?').findall(yanxuan_url)
        if is_yanxuan_url != []:
            if re.compile(r'id=(\d+)').findall(yanxuan_url) != []:
                goods_id = re.compile(r'id=(\d+)').findall(yanxuan_url)[0]
                self.my_lg.info('------>>>| 得到的严选商品的goods_id为: {0}'.format(goods_id))
                return goods_id
        else:
            self.my_lg.info('网易严选商品url错误, 非正规的url, 请参照格式(https://you.163.com/item/detail)开头的...')
            return ''

    def __del__(self):
        try:
            del self.my_phantomjs
            del self.my_lg
        except:
            pass
        gc.collect()
Exemplo n.º 27
0
class PinduoduoParse(object):
    def __init__(self):
        """Prepare an empty result cache, the request headers and the phantomjs driver."""
        self.result_data = {}
        self._set_headers()
        # set_cookies_key_api_uid() (adds the api_uid cookie) is currently disabled.
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

    def _set_headers(self):
        """Build ``self.headers`` for mobile.yangkeduo.com with a random User-Agent."""
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'mobile.yangkeduo.com',
        }
        # A random User-Agent is picked on every call.
        headers['User-Agent'] = get_random_pc_ua()
        # No 'Cookie' entry is set here; set_cookies_key_api_uid() adds the api_uid cookie.
        self.headers = headers

    def get_goods_data(self, goods_id):
        """
        Fetch the mobile goods page and extract its ``window.rawData`` payload.

        The page is loaded through phantomjs (plain requests often came back
        with an empty html body). On success the cleaned payload is cached in
        ``self.result_data``; on every failure ``self.result_data`` is left as
        ``{}`` so that a stale result cannot leak into a later store.

        :param goods_id: goods id; '' short-circuits to an empty result
        :return: dict with the page data plus a ``div_desc`` html string,
            or {} on failure
        """
        # Reset up front: every early return below leaves no stale result behind.
        self.result_data = {}
        if goods_id == '':
            return {}

        tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + str(goods_id)
        print('------>>>| 得到的商品手机版地址为: ', tmp_url)

        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        if body == '':
            print('body中re匹配到的data为空!')
            return {}

        raw = re.compile(r'window.rawData= (.*?);</script>').findall(body)
        if raw == []:
            print('data为空!')
            return {}

        data = json_2_dict(json_str=raw[0])
        if data == {}:
            return {}

        goods = data.get('goods')
        if not isinstance(goods, dict):
            goods = {}

        # Each unused field is removed independently, so one missing key no
        # longer prevents the remaining ones from being stripped.
        for unused_key in ('localGroups', 'mallService'):
            goods.pop(unused_key, None)
        data.pop('reviews', None)  # reviews and their statistics

        # Convert detailGallery into html; it has no second use, so it is
        # removed from the payload at the same time.
        detail_gallery = goods.pop('detailGallery', None) or []
        if detail_gallery:
            # The first entry is pinduoduo's own notice and is skipped.
            images = ''.join(
                r'<img src="{}" style="height:auto;width:100%;"/>'.format(picture.get('url'))
                for picture in detail_gallery[1:]
            )
            div_desc = '<div>' + images + '</div>'
        else:
            div_desc = ''
        data['div_desc'] = div_desc

        self.result_data = data
        return data

    def deal_with_data(self):
        '''
        处理result_data, 返回需要的信息
        :return: 字典类型
        '''
        data = self.result_data
        if data != {}:
            # 店铺名称
            if data.get('mall') is not None:
                shop_name = data.get('mall', {}).get('mallName', '')
            else:
                shop_name = ''

            # 掌柜
            account = ''

            # 商品名称
            title = data.get('goods', {}).get('goodsName', '')

            # 子标题
            sub_title = ''

            # 商品库存
            # 商品标签属性对应的值

            # 商品标签属性名称
            if data.get('goods', {}).get('skus', []) == []:
                detail_name_list = []
            else:
                if data.get('goods', {}).get('skus', [])[0].get('specs') == []:
                    detail_name_list = []
                else:
                    detail_name_list = [{
                        'spec_name': item.get('spec_key')
                    }
                                        for item in data.get('goods', {}).get(
                                            'skus', [])[0].get('specs')]
            # print(detail_name_list)

            # 要存储的每个标签对应规格的价格及其库存
            skus = data.get('goods', {}).get('skus', [])
            # pprint(skus)
            price_info_list = []
            if skus != []:  # ** 注意: 拼多多商品只有一个规格时skus也不会为空的 **
                for index in range(0, len(skus)):
                    tmp = {}
                    price = skus[index].get('groupPrice', '')  # 拼团价
                    normal_price = skus[index].get('normalPrice', '')  # 单独购买价格
                    spec_value = [
                        item.get('spec_value') for item in data.get(
                            'goods', {}).get('skus', [])[index].get('specs')
                    ]
                    spec_value = '|'.join(spec_value)
                    img_url = skus[index].get('thumbUrl', '')
                    rest_number = skus[index].get('quantity', 0)  # 剩余库存
                    is_on_sale = skus[index].get(
                        'isOnSale', 0)  # 用于判断是否在特价销售,1:特价 0:原价(normal_price)
                    tmp['spec_value'] = spec_value
                    tmp['detail_price'] = price
                    tmp['normal_price'] = normal_price
                    tmp['img_url'] = img_url
                    if rest_number <= 0:
                        tmp['rest_number'] = 0
                    else:
                        tmp['rest_number'] = rest_number
                    tmp['is_on_sale'] = is_on_sale
                    price_info_list.append(tmp)

            if price_info_list == []:
                print('price_info_list为空值')
                return {}

            # 商品价格和淘宝价
            tmp_price_list = sorted([
                round(float(item.get('detail_price', '')), 2)
                for item in price_info_list
            ])
            price = tmp_price_list[-1]  # 商品价格
            taobao_price = tmp_price_list[0]  # 淘宝价

            if detail_name_list == []:
                print('## detail_name_list为空值 ##')
                price_info_list = []

            # print('最高价为: ', price)
            # print('最低价为: ', taobao_price)
            # print(len(price_info_list))
            # pprint(price_info_list)

            # 所有示例图片地址
            all_img_url = [{
                'img_url': item
            } for item in data.get('goods', {}).get('topGallery', [])]
            # print(all_img_url)

            # 详细信息标签名对应属性
            tmp_p_value = re.compile(r'\n').sub(
                '',
                data.get('goods', {}).get('goodsDesc', ''))
            tmp_p_value = re.compile(r'\t').sub('', tmp_p_value)
            tmp_p_value = re.compile(r'  ').sub('', tmp_p_value)
            p_info = [{'p_name': '商品描述', 'p_value': tmp_p_value}]
            # print(p_info)

            # 总销量
            all_sell_count = data.get('goods', {}).get('sales', 0)

            # div_desc
            div_desc = data.get('div_desc', '')

            # 商品销售时间区间
            schedule = [{
                'begin_time':
                self.timestamp_to_regulartime(
                    data.get('goods', {}).get('groupTypes',
                                              [])[0].get('startTime')),
                'end_time':
                self.timestamp_to_regulartime(
                    data.get('goods', {}).get('groupTypes',
                                              [])[0].get('endTime')),
            }]
            # pprint(schedule)

            # 用于判断商品是否已经下架
            is_delete = 0

            result = {
                'shop_name': shop_name,  # 店铺名称
                'account': account,  # 掌柜
                'title': title,  # 商品名称
                'sub_title': sub_title,  # 子标题
                # 'shop_name_url': shop_name_url,        # 店铺主页地址
                'price': price,  # 商品价格
                'taobao_price': taobao_price,  # 淘宝价
                # 'goods_stock': goods_stock,            # 商品库存
                'detail_name_list': detail_name_list,  # 商品标签属性名称
                # 'detail_value_list': detail_value_list,# 商品标签属性对应的值
                'price_info_list': price_info_list,  # 要存储的每个标签对应规格的价格及其库存
                'all_img_url': all_img_url,  # 所有示例图片地址
                'p_info': p_info,  # 详细信息标签名对应属性
                'div_desc': div_desc,  # div_desc
                'schedule': schedule,  # 商品开卖时间和结束开卖时间
                'all_sell_count': all_sell_count,  # 商品总销售量
                'is_delete': is_delete  # 用于判断商品是否已经下架
            }
            # pprint(result)
            # print(result)
            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # print(json_data)
            return result

        else:
            print('待处理的data为空的dict, 该商品可能已经转移或者下架')
            return {}

    def to_right_and_update_data(self, data, pipeline):
        """
        Update the goods row in dbo.GoodsInfoAutoGet (Price/TaoBaoPrice columns are not written).

        Only the time column(s) holding a value are written: ``shelf_time``
        when ``delete_time`` is '', ``delete_time`` when ``shelf_time`` is '',
        otherwise both.

        :param data: goods data passed to ``_get_right_model_data`` with site_id=13
        :param pipeline: object exposing ``_update_table(sql_str=..., params=...)``
        """
        item = _get_right_model_data(data=data, site_id=13)
        params = self._get_db_update_params(item=item)

        if item['delete_time'] == '':
            time_columns = 'shelf_time=%s'
        elif item['shelf_time'] == '':
            time_columns = 'delete_time=%s'
        else:
            time_columns = 'shelf_time=%s, delete_time=%s'

        sql_head = (
            'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, '
            'GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, '
            'ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, '
            'Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, '
        )
        sql_str = sql_head + time_columns + ' where GoodsID = %s'

        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_pinduoduo_xianshimiaosha_table(self, data, pipeline):
        """
        Insert one goods record into dbo.pinduoduo_xianshimiaosha.

        :param data: goods data passed to ``_get_right_model_data`` with site_id=16
        :param pipeline: object exposing ``_insert_into_table(sql_str=..., params=...)``
        """
        item = _get_right_model_data(data=data, site_id=16)
        print('------>>>| 待存储的数据信息为: ', item.get('goods_id'))

        sql_str = (
            r'insert into dbo.pinduoduo_xianshimiaosha(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, property_info, detail_info, schedule, stock_info, miaosha_time, miaosha_begin_time, miaosha_end_time, site_id, is_delete)'
            r' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        )
        pipeline._insert_into_table(
            sql_str=sql_str,
            params=self._get_db_insert_miaosha_params(item=item),
        )

    def to_update_pinduoduo_xianshimiaosha_table(self, data, pipeline):
        """
        Update one goods record in dbo.pinduoduo_xianshimiaosha, matched by goods_id.

        :param data: goods data passed to ``_get_right_model_data`` with site_id=16
        :param pipeline: object exposing ``_update_table(sql_str=..., params=...)``
        """
        item = _get_right_model_data(data=data, site_id=16)
        print('------>>>| 待存储的数据信息为: |', item.get('goods_id'))

        sql_str = (
            'update dbo.pinduoduo_xianshimiaosha set modfiy_time = %s, shop_name=%s, goods_name=%s, '
            'sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_info=%s, all_image_url=%s, '
            'property_info=%s, detail_info=%s, is_delete=%s, schedule=%s, stock_info=%s, '
            'miaosha_time=%s, miaosha_begin_time=%s, miaosha_end_time=%s where goods_id = %s'
        )
        pipeline._update_table(
            sql_str=sql_str,
            params=self._get_db_update_miaosha_params(item=item),
        )

    def _get_db_update_params(self, item):
        '''
        得到db待存储的数据
        :param item:
        :return:
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['all_sell_count'],
            # item['delete_time'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])

        return tuple(params)

    def _get_db_insert_miaosha_params(self, item):
        params = (
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False
                  ),  # 把list转换为json才能正常插入数据(并设置ensure_ascii=False)
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # 存入到PropertyInfo
            item['div_desc'],  # 存入到DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['site_id'],
            item['is_delete'],
        )

        return params

    def _get_db_update_miaosha_params(self, item):
        params = (
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['goods_id'],
        )

        return params

    def set_cookies_key_api_uid(self):
        """
        Add a ``Cookie`` header carrying the site's ``api_uid`` value.

        The host page is requested through a proxy picked at random from the
        ip pool, and the ``api_uid`` cookie of the response is copied into
        ``self.headers['Cookie']``. A failed request is reported on stdout
        and leaves the headers untouched.

        :return: None
        """
        # pick a proxy ip
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        http_proxies = self.proxies['http']
        # Bound the index by the proxy list itself; len(self.proxies) would be
        # the number of keys in the dict, not the number of proxies.
        self.proxy = http_proxies[randint(0, len(http_proxies) - 1)]

        tmp_proxies = {
            'http': self.proxy,
        }
        # read the api_uid cookie issued by the host page
        host_url = 'http://mobile.yangkeduo.com'
        try:
            response = requests.get(
                host_url,
                headers=self.headers,
                proxies=tmp_proxies,
                timeout=10)
            api_uid = response.cookies.get('api_uid')
            # NOTE(review): when the cookie is absent this stores 'api_uid=None;'
            # - confirm whether a fallback value is wanted.
            self.headers['Cookie'] = 'api_uid=' + str(api_uid) + ';'
        except Exception:
            # Best effort: any request failure is only reported.
            print('requests.get()请求超时....')

    def timestamp_to_regulartime(self, timestamp):
        """
        Format a timestamp as a local-time string.

        :param timestamp: value accepted by ``time.localtime()``
        :return: str in the form '2016-05-05 20:28:54'
        """
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

    def get_goods_id_from_url(self, pinduoduo_url):
        """
        Extract the goods_id from a pinduoduo mobile goods url.

        :param pinduoduo_url: url of the form
            http://mobile.yangkeduo.com/goods.html?...goods_id=<digits>
        :return: goods_id as a str of digits, or '' when the url is not a
            goods page url or carries no numeric ``goods_id``
        """
        # One search covers both the host/path check and the id capture;
        # dots are escaped so only the literal host/path is accepted.
        matched = re.search(
            r'http://mobile\.yangkeduo\.com/goods\.html\?.*?goods_id=(\d+)',
            pinduoduo_url)
        if matched is not None:
            goods_id = matched.group(1)
            print('------>>>| 得到的拼多多商品id为:', goods_id)
            return goods_id

        # Every failure ends here, so callers always get a str back (never None).
        print(
            '拼多多商品url错误, 非正规的url, 请参照格式(http://mobile.yangkeduo.com/goods.html)开头的...'
        )
        return ''

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
Exemplo n.º 28
0
class JuanPiParse(object):
    def __init__(self):
        """Set up an empty result cache, the request headers and the phantomjs driver."""
        super(JuanPiParse, self).__init__()
        self.result_data = {}
        self._set_headers()
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

    def _set_headers(self):
        """Build ``self.headers`` for web.juanpi.com with a random User-Agent."""
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'web.juanpi.com',
        }
        # A random User-Agent is picked on every call.
        headers['User-Agent'] = get_random_pc_ua()
        self.headers = headers

    def get_goods_data(self, goods_id):
        """
        Collect the page state and the sku data of one JuanPi goods.

        The mobile page is loaded with phantomjs (plain requests used to work
        but later returned "not Found") and its ``__PRELOADED_STATE__`` json
        is extracted; the sku data comes from a second request to
        webservice.juanpi.com. On success the washed ``detail`` payload, with
        ``skudata`` and ``goods_id`` added, is cached in ``self.result_data``.
        Every failure leaves ``self.result_data`` as ``{}``.

        :param goods_id: goods id; '' short-circuits to an empty result
        :return: dict, or {} on any failure
        """
        # Reset up front: every early return below leaves no stale result behind.
        self.result_data = {}
        if goods_id == '':
            return {}

        tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        print('------>>>| 得到的商品手机版的地址为: ', tmp_url)

        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=tmp_url,
            css_selector='div.sc-kgoBCf.bTQvTk')  # css of the mobile title block
        if body == '':
            print('获取到的body为空str!请检查!')
            return {}

        data = re.compile(
            r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)
        if data == []:
            # No page state: stop before spending a request on the sku data.
            print('data为空!')
            return {}

        # sku data endpoint (the older getOtherInfo endpoint was abandoned by the site)
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
            goods_id)

        # Work on a copy: updating self.headers in place would leave its Host
        # pointing at the webservice domain.
        self.skudata_headers = dict(self.headers)
        self.skudata_headers.update({'Host': 'webservice.juanpi.com'})
        skudata_body = MyRequests.get_url_body(
            url=skudata_url, headers=self.skudata_headers)
        if skudata_body == '':
            print('获取到的skudata_body为空str!请检查!')
            return {}

        # Only the first line of the response is parsed, exactly what the
        # former `(.*)` match selected.
        skudata = json_2_dict(json_str=skudata_body.split('\n', 1)[0])
        if skudata == {}:
            return {}
        skudata = skudata.get('skudata', {})

        try:
            sku_info = skudata.get('info')
        except AttributeError as e:
            print('遇到错误如下(先跳过!): ', e)
            return {}
        if sku_info is None:
            # without 'info' the sku data is not usable
            print('skudata中info的key为None, 返回空dict')
            return {}

        main_data = json_2_dict(json_str=data[0])
        if main_data == {}:
            return {}

        if main_data.get('detail') is None:
            print('data中detail的key为None, 返回空dict')
            return {}

        main_data = self._wash_main_data(main_data.get('detail', {}))
        main_data['skudata'] = skudata
        main_data['goods_id'] = goods_id
        self.result_data = main_data
        return main_data

    def deal_with_data(self):
        """
        Parse ``self.result_data`` into the flat goods record.

        When ``_get_detail_name_list`` returns the string 'is_delete=1', the
        goods row in dbo.GoodsInfoAutoGet is flagged IsDelete=1 through a new
        pipeline object and processing then continues. When it returns {},
        ``self.result_data`` is reset and {} is returned.

        :return: dict with the goods fields, or {} when there is nothing to parse
        """
        data = self.result_data
        if data == {}:
            print('待处理的data为空的dict')
            return {}

        shop_name = self._get_shop_name(data=data)
        account = ''  # shop keeper (not collected)
        title = data.get('baseInfo', {}).get('title', '')
        sub_title = ''

        # spec names; a str result is the off-shelf marker
        detail_name_list = self._get_detail_name_list(data=data)
        if isinstance(detail_name_list, str) and detail_name_list == 'is_delete=1':
            print('该商品已下架...')
            marked = SqlServerMyPageInfoSaveItemPipeline()._update_table(
                sql_str='update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s',
                params=(self.result_data.get('goods_id', ''), ),
            )
            if marked:
                print('### 该商品已经is_delete=1 ###')
            else:
                print('is_delete=1标记失败!')

        if detail_name_list == {}:
            self.result_data = {}
            return {}

        # per-spec price/stock list, highest price, lowest price
        (price_info_list,
         price,
         taobao_price) = self._get_price_info_list_and_price_and_taobao_price(data=data)

        # all sample images
        all_img_url = [{'img_url': image} for image in data.get('goodImages')]

        p_info = self._get_p_info(data=data)
        div_desc = self._get_div_desc(data=data)

        # selling time window
        schedule = self._get_goods_schedule(data=data)

        is_delete = self._get_is_delete(data=data, schedule=schedule)
        if price == 0 or taobao_price == 0:
            # no price could be read, so the goods is treated as off the shelf
            is_delete = 1

        result = {
            'shop_name': shop_name,
            'account': account,
            'title': title,
            'sub_title': sub_title,
            'price': price,
            'taobao_price': taobao_price,
            'detail_name_list': detail_name_list,
            'price_info_list': price_info_list,
            'all_img_url': all_img_url,
            'p_info': p_info,
            'div_desc': div_desc,
            'is_delete': is_delete,
            'schedule': schedule,
        }
        gc.collect()
        return result

    def to_right_and_update_data(self, data, pipeline):
        """
        Update the goods row in dbo.GoodsInfoAutoGet (Price/TaoBaoPrice columns are not written).

        Only the time column(s) holding a value are written: ``shelf_time``
        when ``delete_time`` is '', ``delete_time`` when ``shelf_time`` is '',
        otherwise both.

        :param data: goods data passed to ``_get_right_model_data`` with site_id=12
        :param pipeline: object exposing ``_update_table(sql_str=..., params=...)``
        """
        item = _get_right_model_data(data=data, site_id=12)
        params = self._get_db_update_params(item=item)

        if item['delete_time'] == '':
            time_columns = 'shelf_time=%s'
        elif item['shelf_time'] == '':
            time_columns = 'delete_time=%s'
        else:
            time_columns = 'shelf_time=%s, delete_time=%s'

        sql_head = (
            'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, '
            'GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, '
            'ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, IsDelete=%s, '
            'Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, '
        )
        sql_str = sql_head + time_columns + ' where GoodsID = %s'

        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_juanpi_xianshimiaosha_table(self, data, pipeline):
        """
        Insert one goods record into dbo.juanpi_xianshimiaosha.

        :param data: goods data passed to ``_get_right_model_data`` with site_id=15
        :param pipeline: object exposing ``_insert_into_table(sql_str=..., params=...)``
        """
        item = _get_right_model_data(data=data, site_id=15)
        print('------>>> | 待存储的数据信息为: |', item.get('goods_id'))

        sql_str = (
            'insert into dbo.juanpi_xianshimiaosha(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, property_info, detail_info, schedule, stock_info, miaosha_time, miaosha_begin_time, miaosha_end_time, tab_id, page, site_id, is_delete)'
            ' values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        )
        pipeline._insert_into_table(
            sql_str=sql_str,
            params=self._get_db_insert_miaosha_params(item=item),
        )

    def to_update_juanpi_xianshimiaosha_table(self, data, pipeline):
        """
        Update one goods record in dbo.juanpi_xianshimiaosha, matched by goods_id.

        :param data: goods data passed to ``_get_right_model_data`` with site_id=15
        :param pipeline: object exposing ``_update_table(sql_str=..., params=...)``
        """
        item = _get_right_model_data(data=data, site_id=15)
        print('------>>>| 待存储的数据信息为: |', item.get('goods_id'))

        sql_str = (
            'update dbo.juanpi_xianshimiaosha set modfiy_time = %s, shop_name=%s, goods_name=%s, '
            'sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_info=%s, all_image_url=%s, '
            'property_info=%s, detail_info=%s, is_delete=%s, schedule=%s, stock_info=%s, '
            'miaosha_time=%s, miaosha_begin_time=%s, miaosha_end_time=%s where goods_id = %s'
        )
        pipeline._update_table(
            sql_str=sql_str,
            params=self._get_db_update_miaosha_params(item=item),
        )

    def insert_into_juuanpi_pintuan_table(self, data, pipeline):
        """
        Insert one group-buy goods record into dbo.juanpi_pintuan.

        :param data: goods data passed to ``_get_right_model_data`` with site_id=18
        :param pipeline: object exposing ``_insert_into_table(sql_str=..., params=...)``
        :return: whatever ``pipeline._insert_into_table`` returns, or None when
            the data could not be converted
        """
        try:
            tmp = _get_right_model_data(data=data, site_id=18)
        except Exception:
            # Best effort: a conversion failure is assumed to be a group-buy
            # coupon and skipped. KeyboardInterrupt/SystemExit still propagate.
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None

        print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))

        params = self._get_db_insert_pintuan_params(item=tmp)
        sql_str = 'insert into dbo.juanpi_pintuan(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, all_sell_count, property_info, detail_info, schedule, miaosha_begin_time, miaosha_end_time, page, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        _r = pipeline._insert_into_table(sql_str=sql_str, params=params)

        return _r

    def to_right_and_update_pintuan_data(self, data, pipeline):
        '''
        Update one row of dbo.juanpi_pintuan from parsed goods data.

        :param data: raw goods data, normalised through _get_right_model_data(site_id=18)
        :param pipeline: object exposing _update_table(sql_str=..., params=...)
        :return: None (also when the data could not be normalised and is skipped)
        '''
        try:
            tmp = _get_right_model_data(data=data, site_id=18)
        except Exception:
            # Best-effort skip: a failure here is assumed to be a coupon rather
            # than a real goods record. `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None

        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))

        # Placeholder order must match _get_db_update_pintuan_params.
        params = self._get_db_update_pintuan_params(item=tmp)
        sql_str = r'update dbo.juanpi_pintuan set modfiy_time=%s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_Info=%s, all_image_url=%s, property_info=%s, detail_info=%s, schedule=%s, is_delete=%s where goods_id = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def _get_shop_name(self, data):
        '''
        获取shop_name
        :param data:
        :return:
        '''
        if data.get('brand_info') is not None:
            shop_name = data.get('brand_info', {}).get('title', '')
        else:
            shop_name = data.get('schedule_info', {}).get('brand_title', '')

        return shop_name

    def _get_detail_name_list(self, data):
        '''
        获取detail_name_list
        :param data:
        :return: {} 表示出错 | [] 非空正常
        '''
        sku = data.get('skudata', {}).get('sku', [])
        # pprint(sku)
        detail_name_list = []
        if sku != []:
            try:
                if sku[0].get('av_fvalue', '') == '':
                    fav_name = ''
                    pass
                else:
                    tmp = {}
                    fav_name = data.get('skudata',
                                        {}).get('info',
                                                {}).get('fav_name', '')
                    tmp['spec_name'] = fav_name
                    detail_name_list.append(tmp)
            except IndexError:
                print('IndexError错误,此处跳过!')
                # print(sku)
                if isinstance(sku, str):  # 单独处理下架的
                    if sku == '':
                        return 'is_delete=1'

                return {}

            if sku[0].get('av_zvalue', '') == '':
                zav_name = ''
            else:
                tmp = {}
                zav_name = data.get('skudata', {}).get('info',
                                                       {}).get('zav_name', '')
                tmp['spec_name'] = zav_name
                detail_name_list.append(tmp)

        return detail_name_list

    def _get_price_info_list_and_price_and_taobao_price(self, data):
        '''
        获取price_info_list, price, taobao_price
        :param data:
        :return: a tuple
        '''
        sku = data.get('skudata', {}).get('sku', [])  # 分析得到sku肯定不为[]
        # pprint(sku)
        price_info_list = []
        if len(sku) == 1 and sku[0].get(
                'av_fvalue',
                '') == '' and sku[0].get('av_zvalue') == '':  # 没有规格的默认只有一个{}
            # price最高价, taobao_price最低价
            price = round(float(sku[0].get('cprice')), 2)
            taobao_price = price

        else:  # 有规格的
            # 通过'stock'='1'来判断是否有库存, ='0'表示无库存
            # '由于卷皮不给返回库存值, 所以 'stock_tips'='库存紧张', 我就设置剩余库存为10, 如果'stock_tips'='', 就默认设置库存量为50
            # print('777')
            for item in sku:
                tmp = {}
                tmp_1 = []
                if item.get('av_fvalue', '') == '':
                    pass
                else:
                    tmp_1.append(item.get('av_fvalue'))

                if item.get('av_zvalue', '') == '':
                    pass
                else:
                    tmp_1.append(item.get('av_zvalue'))
                tmp_1 = '|'.join(tmp_1)

                if item.get('av_origin_zpic', '') != '':
                    tmp['img_url'] = item.get('av_origin_zpic', '')
                else:
                    tmp['img_url'] = ''

                if item.get('cprice', '') != '':
                    tmp['pintuan_price'] = item.get('cprice')
                    tmp['detail_price'] = item.get('sprice', '')
                    tmp['normal_price'] = item.get('price')
                else:
                    tmp['pintuan_price'] = item.get('price')
                    if item.get('sprice', '') != '':
                        tmp['detail_price'] = item.get('sprice', '')
                    else:
                        tmp['detail_price'] = item.get('price')
                    tmp['normal_price'] = item.get('price')

                if item.get('stock') == '0':  # 跳过
                    rest_number = '0'
                else:  # 即'stock'='1'
                    rest_number = '50'

                    if item.get('stock_tips', '') != '' and item.get(
                            'stock_tips', '') == '库存紧张':
                        # 库存紧张的时候设置下
                        rest_number = '10'

                    tmp['spec_value'] = tmp_1
                    tmp['rest_number'] = rest_number
                    price_info_list.append(tmp)

            # 得到有规格时的最高价和最低价
            tmp_price_list = sorted([
                round(float(item.get('pintuan_price', '')), 2)
                for item in price_info_list
            ])
            # print(tmp_price_list)
            if tmp_price_list == []:
                price = 0
                taobao_price = 0
            else:
                price = tmp_price_list[-1]  # 商品价格
                taobao_price = tmp_price_list[0]  # 淘宝价

        return price_info_list, price, taobao_price

    def _get_p_info(self, data):
        '''
        获取p_info
        :param data:
        :return:
        '''
        p_info = []
        attr = data.get('goodsDetail', {}).get('attr', [])
        # print(attr)
        if attr != []:
            # item是str时跳过
            p_info = [{
                'p_name': item.get('st_key'),
                'p_value': item.get('st_value')
            } for item in attr if isinstance(item, dict)]
            for item in p_info:
                if item.get('p_name') == '运费':
                    # 过滤掉颜色的html代码
                    item['p_value'] = '全国包邮(偏远地区除外)'

                # 过滤清洗
                tmp_p_value = item.get('p_value', '')
                tmp_p_value = re.compile(r'\xa0').sub(' ',
                                                      tmp_p_value)  # 替换为一个空格
                item['p_value'] = tmp_p_value

        return p_info

    def _get_div_desc(self, data):
        '''
        获取div_desc
        :param data:
        :return:
        '''
        div_images_list = data.get('goodsDetail', {}).get('images', [])
        tmp_div_desc = ''
        for item in div_images_list:
            tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                item)
            tmp_div_desc += tmp

        return '<div>' + tmp_div_desc + '</div>'

    def _get_goods_schedule(self, data):
        '''
        获取商品销售时间段
        :param data:
        :return:
        '''
        # print(data.get('skudata', {}).get('info', {}))
        # print(data.get('skudata', {}))
        begin_time = data.get('skudata', {}).get('info', {}).get(
            'start_time')  # 取这个时间段才是正确的销售时间, 之前baseInfo是虚假的
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        if begin_time is None or end_time is None:
            schedule = []
        else:
            schedule = [{
                'begin_time': timestamp_to_regulartime(begin_time),
                'end_time': timestamp_to_regulartime(end_time),
            }]

        return schedule

    def _get_is_delete(self, data, schedule):
        '''
        得到商品的上下架状态
        :param data:
        :param schedule:
        :return:
        '''
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        is_delete = 0
        # 是否下架判断
        # 结束时间戳小于当前时间戳则表示已经删除无法购买, 另外每个规格卖光也不显示is_delete=1(在上面已经判断, 这个就跟销售时间段没关系了)
        if schedule != []:
            if data.get('baseInfo', {}).get('end_time') is not None:
                '''
                先判断如果baseInfo中的end_time=='0'表示已经下架
                '''
                # base_info_end_time = data.get('baseInfo', {}).get('end_time')
                # self.my_lg.info(base_info_end_time)
                # if base_info_end_time == '0':
                #     is_delete = 1
                pass

            if float(end_time) < time.time():
                '''
                再判断日期过期的
                '''
                is_delete = 1
        '''
        卷皮-新增下架判断:
        time: 2018-5-12 
        '''
        if data.get('skudata', {}).get('info', {}).get('gstatus', '1') == '2':
            # 'gstatus'在售状态为'1'
            is_delete = 1

        return is_delete

    def _wash_main_data(self, main_data):
        '''
        清洗main_data
        :param main_data:
        :return:
        '''
        # 处理commitments
        try:
            main_data['commitments'] = ''
            main_data.get('discount', {})['coupon'] = ''
            main_data.get('discount', {})['coupon_index'] = ''
            main_data.get('discount', {})['vip_info'] = ''
            main_data['topbanner'] = ''
        except:
            pass
        try:
            main_data.get('brand_info')['sub_goods'] = ''
        except:
            pass

        return main_data

    def _get_db_update_params(self, item):
        '''
        得到待更新的db数据
        :param item:
        :return:
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            # item['delete_time'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['goods_id'],
        ]

        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])

        return tuple(params)

    def _get_db_insert_miaosha_params(self, item):
        params = (
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False
                  ),  # 把list转换为json才能正常插入数据(并设置ensure_ascii=False)
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # 存入到PropertyInfo
            item['div_desc'],  # 存入到DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['tab_id'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )

        return params

    def _get_db_update_miaosha_params(self, item):
        params = (
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['goods_id'],
        )

        return params

    def _get_db_insert_pintuan_params(self, item):
        params = (
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False
                  ),  # 把list转换为json才能正常插入数据(并设置ensure_ascii=False)
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['all_sell_count'],
            dumps(item['p_info'], ensure_ascii=False),  # 存入到PropertyInfo
            item['div_desc'],  # 存入到DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            item['pintuan_begin_time'],
            item['pintuan_end_time'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )

        return params

    def _get_db_update_pintuan_params(self, item):
        params = (
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False
                  ),  # 把list转换为json才能正常插入数据(并设置ensure_ascii=False)
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            # item['all_sell_count'],
            dumps(item['p_info'], ensure_ascii=False),  # 存入到PropertyInfo
            item['div_desc'],  # 存入到DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            item['is_delete'],
            item['goods_id'])

        return params

    def get_goods_id_from_url(self, juanpi_url):
        '''
        Extract the numeric goods id from a juanpi goods url.

        A valid url contains 'http://shop.juanpi.com/deal/' followed by at
        least one digit. Anything else, including a url with the right prefix
        but no numeric id, is reported and yields '' (it never returns None).

        :param juanpi_url: goods url
        :return: goods_id as str, or '' when the url is not a valid goods url
        '''
        matched = re.compile(r'http://shop\.juanpi\.com/deal/(\d+)').search(
            juanpi_url)
        if matched is None:
            print(
                '卷皮商品url错误, 非正规的url, 请参照格式(http://shop.juanpi.com/deal/)开头的...'
            )
            return ''

        goods_id = matched.group(1)
        print('------>>>| 得到的卷皮商品的地址为:', goods_id)
        return goods_id

    def __del__(self):
        try:
            del self.my_phantomjs
            del self.result_data
        except:
            pass
        gc.collect()
Exemplo n.º 29
0
    async def get_goods_data(self, jumei_pintuan_url):
        '''
        Fetch and assemble the raw data of one Jumei group-buy goods page.

        The ajaxDetail json and the mobile detail page are both loaded through
        phantomjs, then title, shop name, images, properties, description,
        spec names, per-spec prices, shelf state and sell count are added to
        the data dict.

        :param jumei_pintuan_url: goods url, resolved via self.get_goods_id_from_url
        :return: the assembled data dict (also stored in self.result_data),
                 or {} on any failure (self.result_data is then reset to {})
        '''
        goods_id = await self.get_goods_id_from_url(jumei_pintuan_url)
        if goods_id == []:
            self.result_data = {}
            return {}

        # Mobile url of the group-buy goods; goods_id is [item_id, type].
        goods_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(
            goods_id[0], goods_id[1])
        self.msg = '------>>>| 对应手机端地址为: ' + goods_url
        self.my_lg.info(self.msg)

        # Url of the ajaxDetail request that carries the goods json.
        tmp_url = 'https://s.h5.jumei.com/yiqituan/ajaxDetail?item_id={0}&type={1}'.format(
            str(goods_id[0]), goods_id[1])

        # History (from the original author): requests got filtered and
        # returned nothing, aiohttp was too slow, hence phantomjs.
        my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        try:
            # The json is extracted from the <pre> element of the rendered page.
            body = re.compile('<pre .*?>(.*)</pre>').findall(body)[0]
        except IndexError:
            body = ''
        tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(url=goods_url)
        # Drop the driver reference as soon as both pages are fetched.
        del my_phantomjs

        if body == '' or tmp_body == '':
            self.msg = '获取到的body为空str!' + ' 出错地址: ' + goods_url
            self.my_lg.error(self.msg)
            self.result_data = {}
            return {}

        data = await self.json_2_dict(json_str=body)
        if data == {}:
            self.msg = '出错地址: ' + goods_url
            self.my_lg.error(self.msg)
            self.result_data = {}
            return {}
        data = await self.wash_data(data=data)
        data = data.get('data', {})

        try:
            data['title'] = data.get('share_info', [])[1].get('text', '')
            data['title'] = re.compile(r'聚美').sub('', data['title'])
            if len(data.get('buy_alone', {})) == 1:
                data['sub_title'] = ''
            else:
                data['sub_title'] = data.get('buy_alone', {}).get('name', '')
                data['sub_title'] = re.compile(r'聚美').sub(
                    '', data['sub_title'])
            if data['title'] == '':
                self.my_lg.error('获取到的title为空值, 请检查!')
                raise Exception

            # shop_name ('shop_info' may be an empty list instead of a dict)
            if data.get('shop_info') == []:
                data['shop_name'] = ''
            else:
                data['shop_name'] = data.get('shop_info',
                                             {}).get('store_title', '')

            # All sample images
            all_img_url = await self.get_all_img_url(data=data)
            data['all_img_url'] = all_img_url

            # p_info, parsed from the detail page body
            p_info = await self.get_p_info(body=tmp_body)
            data['p_info'] = p_info

            # div_desc, parsed from the detail page body and then washed
            div_desc = await self.get_div_desc(body=tmp_body)
            div_desc = await MyAiohttp.wash_html(div_desc)
            data['div_desc'] = div_desc

            # Shelf / off-shelf times are not fetched here (per the original
            # author they are available from the group-buy list api).

            # detail_name_list
            detail_name_list = await self.get_detail_name_list(
                size_attr=data.get('buy_alone', {}).get('size_attr', []))
            data['detail_name_list'] = detail_name_list

            # Price and stock for every spec
            true_sku_info = await self.get_true_sku_info(
                buy_alone_size=data.get('buy_alone', {}).get('size', []),
                size=data.get('size', []),
                group_single_price=data.get('group_single_price', ''))
            data['price_info_list'] = true_sku_info

            # is_delete
            product_status = data.get('product_status', '')
            is_delete = await self.get_is_delete(product_status=product_status,
                                                 true_sku_info=true_sku_info)
            data['is_delete'] = is_delete

            # all_sell_count: the '万' (x10000) marker must be looked up in the
            # original text, because the numeric capture drops it. A text
            # without any number counts as '0'.
            sell_text = data.get('buyer_number_text', '')
            sell_numbers = re.compile(r'(\d+\.?\d*)').findall(sell_text)
            if sell_numbers != []:
                all_sell_count = sell_numbers[0]
                if '万' in sell_text:
                    # round() guards against float artefacts before truncation.
                    all_sell_count = str(
                        int(round(float(all_sell_count) * 10000)))
            else:
                all_sell_count = '0'
            data['all_sell_count'] = all_sell_count

            data['goods_url'] = goods_url

        except Exception as e:
            self.msg = '遇到错误如下: ' + str(e) + ' 出错地址: ' + goods_url
            self.my_lg.error(self.msg)
            self.my_lg.exception(e)
            # Reset so a stale result cannot leak into the next crawl.
            self.result_data = {}
            return {}

        if data != {}:
            self.result_data = data
            return data

        else:
            self.msg = 'data为空!' + ' 出错地址: ' + goods_url
            self.my_lg.error(self.msg)
            # Reset so a stale result cannot leak into the next crawl.
            self.result_data = {}
            return {}
Exemplo n.º 30
0
 def __init__(self):
     super(JuanPiParse, self).__init__()
     self._set_headers()
     self.result_data = {}
     self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)