示例#1
0
    def get_goods_data(self, goods_id):
        """
        Fetch the detail data of one product and assemble it into a dict.

        Three ``th5.m.zhe800.com`` gateway endpoints are requested in turn:
        ``product`` (base / profiles / score / sku sections), ``graph``
        (image-and-text description) and ``status`` (schedule / stock).
        Each section is passed through ``json_2_dict`` and written back into
        the result, together with the derived keys ``shop_name``,
        ``schedule``, ``stock``, ``parent_dir`` and ``goods_id``.

        :param goods_id: product id; converted with ``str()`` when building URLs
        :return: the assembled ``dict`` on success (also stored in
                 ``self.result_data``); on any failure the return value of
                 ``self._data_error()``
        """
        if goods_id == '':
            return self._data_error()

        # --- product endpoint -------------------------------------------
        tmp_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(goods_id)
        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            ip_pool_type=self.ip_pool_type)
        if body == '':
            # Nothing to parse; bail out before touching the JSON helper.
            return self._data_error()

        data = json_2_dict(json_str=body, default_res={})
        if data == {}:
            return self._data_error()

        # base section
        base = json_2_dict(
            json_str=data.get('/app/detail/product/base', ''),
            default_res={})

        # profiles section: an empty result is tolerated and stored as ''
        profiles = json_2_dict(json_str=data.get('/app/detail/product/profiles', ''))
        if profiles == {}:
            print("json.loads转换出错,得到profiles值可能为空,此处跳过")
            profiles = ''

        # score section: drop the 'contents' entry when present.  The
        # isinstance guard keeps this best-effort for non-dict results
        # without resorting to a bare ``except``.
        score = json_2_dict(
            json_str=data.get('/app/detail/product/score', ''),
            default_res={})
        if isinstance(score, dict):
            score.pop('contents', None)

        # sku section
        sku = json_2_dict(
            json_str=data.get('/app/detail/product/sku', ''),
            default_res={})

        data['/app/detail/product/base'] = base
        data['/app/detail/product/profiles'] = profiles
        data['/app/detail/product/score'] = score
        data['/app/detail/product/sku'] = sku

        try:
            # Mobile page address; only used for the log line below.
            phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(base.get('dealId', ''))
        except AttributeError:
            # ``base`` is not dict-like, so the payload is unusable.
            print('获取手机版地址失败,此处跳过')
            return self._data_error()

        print('------>>>| 得到商品手机版地址为: ', phone_url)

        # --- graph endpoint (image-and-text description) -------------------
        tmp_detail_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(goods_id)
        detail_data_body = Requests.get_url_body(
            url=tmp_detail_url,
            headers=self.headers,
            ip_pool_type=self.ip_pool_type)
        if detail_data_body == '':
            print('detail_data为[]!')
            return self._data_error()

        detail_data = json_2_dict(json_str=detail_data_body, default_res={})
        if detail_data == {}:
            print('json.loads(detail_data)时报错, 此处跳过')
            return self._data_error()

        detail = json_2_dict(
            json_str=detail_data.get('/app/detail/graph/detail', ''),
            default_res={})
        # Drop the 'small' entry when present (same best-effort rule as above).
        if isinstance(detail, dict):
            detail.pop('small', None)

        # div_desc: an empty description is treated as a failure
        tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
        if tmp_div_desc == '':
            return self._data_error()
        data['/app/detail/graph/detail'] = tmp_div_desc

        # shop_name: an empty dict signals failure, any other value is stored
        shop_name = self._get_shop_name(data=data)
        if isinstance(shop_name, dict) and shop_name == {}:
            return self._data_error()
        data['shop_name'] = shop_name

        # --- status endpoint (flash-sale start/end time and stock) --------
        schedule_and_stock_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(goods_id)
        schedule_and_stock_info_body = Requests.get_url_body(
            url=schedule_and_stock_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if schedule_and_stock_info_body == '':
            print('schedule_and_stock_info为空!')
            return self._data_error()

        schedule_and_stock_info = json_2_dict(json_str=schedule_and_stock_info_body)
        if schedule_and_stock_info == {}:
            print('得到秒杀开始时间和结束时间时错误, 此处跳过')
            return self._data_error()

        schedule = json_2_dict(
            json_str=schedule_and_stock_info.get('/app/detail/status/schedule', None),
            default_res={})
        stock = json_2_dict(
            json_str=schedule_and_stock_info.get('/app/detail/status/stock', None),
            default_res={})

        data['schedule'] = schedule
        data['stock'] = stock
        data['parent_dir'] = _z8_get_parent_dir(goods_id)
        data['goods_id'] = goods_id

        self.result_data = data

        return data
示例#2
0
    def get_goods_data(self, goods_id):
        """
        Fetch the detail data of one group-buy product and assemble it.

        The mobile detail page on ``pina.m.zhe800.com`` is downloaded and the
        ``window.prod_info`` JSON embedded in it is extracted.  The
        description body, the ``p_info`` list and the live stock info are
        then fetched through the sibling helpers and merged into the result.

        :param goods_id: product id; converted with ``str()`` when building the URL
        :return: the assembled ``dict`` on success (also stored in
                 ``self.result_data``); ``str(goods_id)`` when the page is
                 the short "page no longer exists" placeholder; otherwise
                 the return value of ``self._data_error_init()``
        """
        if goods_id == '':
            return self._data_error_init()

        tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(goods_id)
        print('------>>>| 得到的商品手机版地址为: ', tmp_url)

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, high_conceal=True)
        if body == '':
            print('获取到的tmp_url的body为空值, 此处跳过!')
            return self._data_error_init()

        # Special case: the "page no longer exists" placeholder, recognised
        # by its marker text plus a body length strictly between 640 and 660.
        # NOTE(review): the original author's comment says this check was
        # meant to be retired because it can affect normal products, yet it
        # is still active - confirm whether it should stay.
        if isinstance(body, str) \
                and '很抱歉,您查看的页面木有了~' in body \
                and 640 < len(body) < 660:
            print('很抱歉,您查看的页面木有了~')
            self.result_data = {}
            return str(goods_id)

        try:
            data = re.compile(r'window.prod_info = (.*?);seajs.use\(.*?\);</script>').findall(body)
        except TypeError:
            # Body is not text; handled as "nothing found" below.
            data = []

        if data == []:
            print('data为空!')
            return self._data_error_init()

        data = json_2_dict(json_str=data[0])
        if data == {}:
            return self._data_error_init()

        # Every failure below goes through _data_error_init(), like the
        # other failure paths, so no partially handled product returns a
        # bare {} while self.result_data still holds older data.

        # div_desc
        div_desc_body = self.get_div_desc_body(goods_id=goods_id)
        if div_desc_body == '':
            print('获取到的div_desc_body为空!')
            return self._data_error_init()

        # p_info
        p_info = self.get_p_info_list(goods_id=goods_id)
        if p_info == []:
            return self._data_error_init()

        # live stock information
        stock_info = self.get_stock_info_dict(goods_id=goods_id)
        if stock_info == {}:
            print('获取到的库存信息为{}!')
            return self._data_error_init()

        data['div_desc'] = div_desc_body
        data['p_info'] = p_info
        data['stock_info'] = stock_info

        # pin_status == 3 is logged below as "sold out" and flags the
        # product for deletion; a missing status defaults to 2.
        if stock_info.get('pin_status', 2) == 3:
            print('##### 该拼团商品已经被抢光 ...')
            is_delete = 1
        else:
            is_delete = 0
        data['is_delete'] = is_delete
        data['parent_dir'] = _z8_get_parent_dir(goods_id)

        self.result_data = data
        return data
示例#3
0
    def get_goods_data(self, goods_id):
        """
        Fetch the detail data of one group-buy product and assemble it.

        The mobile detail page on ``pina.m.zhe800.com`` is downloaded and the
        ``window.prod_info`` JSON embedded in it is extracted.  The
        description body, the ``p_info`` list and the live stock info are
        then fetched through the sibling helpers and merged into the result.

        :param goods_id: product id; converted with ``str()`` when building the URL
        :return: the assembled ``dict`` on success (also stored in
                 ``self.result_data``); ``str(goods_id)`` when the page is
                 the short "page no longer exists" placeholder; otherwise
                 the return value of ``self._data_error_init()``
        """
        def _require(condition, message=''):
            # Explicit stand-in for ``assert``: unlike assert statements it
            # is still enforced when Python runs with -O.  The messages (and
            # the empty default) match what the asserts used to print.
            if not condition:
                raise ValueError(message)

        if goods_id == '':
            return self._data_error_init()

        tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(goods_id)
        print('------>>>| 得到的商品手机版地址为: ', tmp_url)

        try:
            body = Requests.get_url_body(
                url=tmp_url,
                headers=self.headers,
                proxy_type=self.proxy_type,
                ip_pool_type=self.ip_pool_type,
                num_retries=self.req_num_retries,)
            _require(body != '')

            # Special case: the "page no longer exists" placeholder,
            # recognised by its marker text plus a body length strictly
            # between 640 and 660.
            # NOTE(review): the original author's comment says this check
            # was meant to be retired because it can affect normal products,
            # yet it is still active - confirm whether it should stay.
            if isinstance(body, str) \
                    and '很抱歉,您查看的页面木有了~' in body \
                    and 640 < len(body) < 660:
                print('很抱歉,您查看的页面木有了~')
                self.result_data = {}
                return str(goods_id)

            data = re.compile(r'window.prod_info = (.*?);seajs.use\(.*?\);</script>').findall(body)
            _require(data != [], 'data为空!')
            data = json_2_dict(
                json_str=data[0],
                default_res={},)
            _require(data != {})

            # div_desc
            div_desc_body = self.get_div_desc_body(goods_id=goods_id)
            _require(div_desc_body != '', '获取到的div_desc_body为空!')

            # p_info
            p_info = self.get_p_info_list(goods_id=goods_id)
            _require(p_info != [])

            # live stock information
            stock_info = self.get_stock_info_dict(goods_id=goods_id)
            _require(stock_info != {}, '获取到的库存信息为{}!')
        except Exception as e:
            # Any validation failure or helper error ends the attempt.
            print(e)
            return self._data_error_init()

        data['div_desc'] = div_desc_body
        data['p_info'] = p_info
        data['stock_info'] = stock_info

        # pin_status == 3 is logged below as "sold out" and flags the
        # product for deletion; a missing status defaults to 2.
        if stock_info.get('pin_status', 2) == 3:
            print('##### 该拼团商品已经被抢光 ...')
            is_delete = 1
        else:
            is_delete = 0
        data['is_delete'] = is_delete
        data['parent_dir'] = _z8_get_parent_dir(goods_id)

        self.result_data = data

        return data
示例#4
0
    def get_goods_data(self, goods_id):
        """
        Fetch the detail data of one product and assemble it into a dict.

        Three ``th5.m.zhe800.com`` gateway endpoints are requested in turn:
        ``product`` (base / profiles / score / sku sections), ``graph``
        (image-and-text description) and ``status`` (schedule / stock).
        Each section is passed through ``json_2_dict`` and written back into
        the result, together with the derived keys ``shop_name``,
        ``schedule``, ``stock`` and ``parent_dir``.

        On every failure ``self.result_data`` is reset to ``{}`` so that
        data from an earlier product cannot leak into a later store step.

        :param goods_id: product id; converted with ``str()`` when building URLs
        :return: the assembled ``dict`` on success (also stored in
                 ``self.result_data``), ``{}`` on any failure
        """
        def _fail(message=None):
            # Shared failure exit: optional log line, reset state, return {}.
            if message is not None:
                print(message)
            self.result_data = {}
            return {}

        def _load_section(raw, label, drop_key=None):
            # Decode one section; optionally drop a key from a dict result;
            # an empty result is logged and replaced by ''.
            parsed = json_2_dict(json_str=raw)
            if drop_key is not None and isinstance(parsed, dict):
                parsed.pop(drop_key, None)
            if parsed == {}:
                print("json.loads转换出错,得到{}值可能为空,此处跳过".format(label))
                parsed = ''
            return parsed

        if goods_id == '':
            return _fail()

        # --- product endpoint -------------------------------------------
        tmp_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(goods_id)
        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if body == '':
            return _fail()

        data = json_2_dict(json_str=body)
        if data == {}:
            return _fail()

        base = _load_section(data.get('/app/detail/product/base', ''), 'base')
        profiles = _load_section(data.get('/app/detail/product/profiles', ''), 'profiles')
        score = _load_section(
            data.get('/app/detail/product/score', ''), 'score', drop_key='contents')
        sku = _load_section(data.get('/app/detail/product/sku', ''), 'sku')

        data['/app/detail/product/base'] = base
        data['/app/detail/product/profiles'] = profiles
        data['/app/detail/product/score'] = score
        data['/app/detail/product/sku'] = sku

        # Mobile page address; only used for the log line below.  An empty
        # base was replaced by '' above, so it also ends up in the
        # AttributeError branch and aborts the whole fetch.
        try:
            phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(base.get('dealId', ''))
        except AttributeError:
            return _fail('获取手机版地址失败,此处跳过')

        print('------>>>| 得到商品手机版地址为: ', phone_url)

        # --- graph endpoint (image-and-text description) -------------------
        tmp_detail_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(goods_id)
        detail_data_body = Requests.get_url_body(
            url=tmp_detail_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if detail_data_body == '':
            return _fail('detail_data为[]!')

        detail_data = json_2_dict(json_str=detail_data_body)
        if detail_data == {}:
            return _fail('json.loads(detail_data)时报错, 此处跳过')

        detail = _load_section(
            detail_data.get('/app/detail/graph/detail', ''), 'detail', drop_key='small')

        # div_desc: an empty description is treated as a failure
        tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
        if tmp_div_desc == '':
            return _fail()
        data['/app/detail/graph/detail'] = tmp_div_desc

        # shop_name: an empty dict signals failure, any other value is stored
        shop_name = self._get_shop_name(data=data)
        if isinstance(shop_name, dict) and shop_name == {}:
            return _fail()
        data['shop_name'] = shop_name

        # --- status endpoint (flash-sale start/end time and stock) --------
        schedule_and_stock_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(goods_id)
        schedule_and_stock_info_body = Requests.get_url_body(
            url=schedule_and_stock_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if schedule_and_stock_info_body == '':
            return _fail('schedule_and_stock_info为空!')

        schedule_and_stock_info = json_2_dict(json_str=schedule_and_stock_info_body)
        if schedule_and_stock_info == {}:
            return _fail('得到秒杀开始时间和结束时间时错误, 此处跳过')

        # A missing section becomes {} without calling the JSON helper.
        schedule = schedule_and_stock_info.get('/app/detail/status/schedule')
        schedule = {} if schedule is None else json_2_dict(json_str=schedule)

        stock = schedule_and_stock_info.get('/app/detail/status/stock')
        stock = {} if stock is None else json_2_dict(json_str=stock)

        data['schedule'] = schedule
        data['stock'] = stock
        data['parent_dir'] = _z8_get_parent_dir(goods_id)

        self.result_data = data
        return data