Example #1
    def from_ip_pool_set_proxy_ip_to_phantomjs(self):
        '''
        Switch the proxy IP used by PhantomJS.
        :return: True on success, False on failure
        '''
        ip_object = MyIpPools()
        ip_list = ip_object.get_proxy_ip_from_ip_pool().get('http')
        try:
            proxy_ip = ip_list[randint(0, len(ip_list) - 1)]  # pick a random proxy IP
        except Exception:
            # print('Failed to fetch a random IP from the pool... crawling with the local IP instead!')
            return False

        # print('------>>>| crawling with proxy IP: {} |<<<------'.format(proxy_ip))
        proxy_ip = re.compile(r'https://|http://').sub('', proxy_ip)
        proxy_ip = proxy_ip.split(':')  # split into ['host', 'port']

        try:
            tmp_js = {
                'script':
                "phantom.setProxy('{}', {});".format(proxy_ip[0], proxy_ip[1]),  # the host must be quoted inside the injected JS
                'args': []
            }
            self.driver.command_executor._commands['executePhantomScript'] = (
                'POST', '/session/$sessionId/phantom/execute')
            self.driver.execute('executePhantomScript', tmp_js)

        except Exception:
            print('Failed to switch the proxy IP dynamically')
            return False

        return True
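A minimal usage sketch, assuming 'spider' is an instance of the class above with a live PhantomJS driver; the URL is illustrative:

    # Hypothetical usage: retry the proxy switch once before falling back to the local IP.
    if not spider.from_ip_pool_set_proxy_ip_to_phantomjs():
        if not spider.from_ip_pool_set_proxy_ip_to_phantomjs():
            print('Proxy switch failed twice, crawling with the local IP')
    spider.driver.get('https://example.com')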
Example #2
    def get_url_body(cls,
                     url,
                     headers: dict,
                     params: dict = None,
                     cookies=None,
                     had_referer=False):
        '''
        Fetch the response body for a url.
        :param url:
        :return: '' on error | body as str
        '''
        # set a proxy IP
        ip_object = MyIpPools()
        proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        proxy = proxies['http'][randint(0, len(proxies['http']) - 1)]  # index into the list, not the dict

        tmp_proxies = {
            'http': proxy,
        }
        # print('------>>>| crawling with proxy IP: {} |<<<------'.format(proxy))

        tmp_headers = headers
        # tmp_headers['Host'] = re.compile(r'://(.*?)/').findall(url)[0]
        if had_referer:
            if re.compile(r'https').findall(url) != []:
                tmp_headers['Referer'] = 'https://' + tmp_headers['Host'] + '/'
            else:
                tmp_headers['Referer'] = 'http://' + tmp_headers['Host'] + '/'

        s = requests.session()
        try:
            if params is not None:
                response = s.get(
                    url,
                    headers=tmp_headers,
                    params=params,
                    cookies=cookies,
                    proxies=tmp_proxies,
                    timeout=12)  # any &xxx= query outside the url must be constructed too
            else:
                response = s.get(
                    url,
                    headers=tmp_headers,
                    proxies=tmp_proxies,
                    cookies=cookies,
                    timeout=12)  # any &xxx= query outside the url must be constructed too
            body = response.content.decode('utf-8')

            body = re.compile('\t').sub('', body)
            body = re.compile('  ').sub('', body)
            body = re.compile('\r\n').sub('', body)
            body = re.compile('\n').sub('', body)
            # print(body)
        except Exception:
            print('requests.get() timed out....')
            print('data is empty!')
            body = ''

        return body
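The randint-based indexing used throughout these examples can be replaced by random.choice, which cannot go out of bounds. A minimal sketch, reusing MyIpPools from the surrounding code:

    from random import choice

    def pick_random_http_proxy(ip_object):
        '''Return one 'http' proxy from the pool, or None when the pool is empty.'''
        ip_list = ip_object.get_proxy_ip_from_ip_pool().get('http') or []
        return choice(ip_list) if ip_list else None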
Example #3
    def set_cookies_key_api_uid(self):
        '''
        Add a cookie whose key is api_uid to the headers.
        :return:
        '''
        # set a proxy IP
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

        tmp_proxies = {
            'http': self.proxy,
        }
        # read the value of the api_uid key from the cookies
        host_url = 'http://mobile.yangkeduo.com'
        try:
            response = requests.get(host_url, headers=self.headers, proxies=tmp_proxies, timeout=10)  # any &xxx= query outside the url must be constructed too
            api_uid = response.cookies.get('api_uid')
            # print(response.cookies.items())
            # if api_uid is None:
            #     api_uid = 'rBQh+FoXerAjQWaAEOcpAg=='
            self.headers['Cookie'] = 'api_uid=' + str(api_uid) + ';'
            # print(api_uid)
        except Exception:
            print('requests.get() timed out....')
Example #4
async def get_taobao_sign_and_body(base_url,
                                   headers: dict,
                                   params: dict,
                                   data: json,
                                   timeout=13,
                                   _m_h5_tk='undefine',
                                   session=None,
                                   logger=None):
    '''
    Fetch data from a signed (sign) Taobao endpoint.
    :param base_url:
    :param headers:
    :param params:
    :param data:
    :param timeout:
    :param _m_h5_tk:
    :param session:
    :return: (_m_h5_tk, session, body)
    '''
    sign, t = await calculate_right_sign(data=data, _m_h5_tk=_m_h5_tk)
    headers['Host'] = re.compile(r'://(.*?)/').findall(base_url)[0]
    params.update(
        {  # add the query-string fields below
            't': t,
            'sign': sign,
            'data': data,
        })

    # set a proxy IP
    ip_object = MyIpPools()
    proxy = ip_object._get_random_proxy_ip()  # returns False on failure

    tmp_proxies = {
        'http': proxy,
    }

    if session is None:
        session = requests.session()
    try:
        response = session.get(url=base_url,
                               headers=headers,
                               params=params,
                               proxies=tmp_proxies,
                               timeout=timeout)
        _m_h5_tk = response.cookies.get('_m_h5_tk', '')
        _m_h5_tk = _m_h5_tk.split('_')[0]
        # print(s.cookies.items())
        # print(_m_h5_tk)

        body = response.content.decode('utf-8')
        # print(body)

    except Exception as e:
        if logger is not None:
            logger.exception(e)
        _m_h5_tk = ''
        body = ''

    return (_m_h5_tk, session, body)
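calculate_right_sign is not shown here. A plausible reconstruction, under the assumption that it follows the widely documented mtop h5 scheme (md5 over "token&t&appKey&data", the token being the part of the _m_h5_tk cookie before the first underscore):

    import time
    from hashlib import md5

    async def calculate_right_sign(data, _m_h5_tk='undefine', app_key='12574478'):
        '''Hypothetical sketch: sign an mtop h5 request body.'''
        t = str(int(time.time() * 1000))
        token = _m_h5_tk.split('_')[0]
        raw = '{}&{}&{}&{}'.format(token, t, app_key, data)
        return md5(raw.encode('utf-8')).hexdigest(), t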
Example #5
    def from_ip_pool_set_proxy_ip_to_phantomjs(self):
        ip_object = MyIpPools()
        proxy_ip = ip_object._get_random_proxy_ip()
        if not proxy_ip:  # returns False on failure
            return False

        # print('------>>>| crawling with proxy IP: {} |<<<------'.format(proxy_ip))
        proxy_ip = re.compile(r'http://').sub('', proxy_ip)  # strip 'http://'
        proxy_ip = proxy_ip.split(':')  # split into ['host', 'port']

        try:
            tmp_js = {
                'script':
                "phantom.setProxy('{}', {});".format(proxy_ip[0], proxy_ip[1]),  # the host must be quoted inside the injected JS
                'args': []
            }
            self.driver.command_executor._commands['executePhantomScript'] = (
                'POST', '/session/$sessionId/phantom/execute')
            self.driver.execute('executePhantomScript', tmp_js)

        except Exception:
            print('Failed to switch the proxy IP dynamically')
            return False

        return True
Example #6
async def get_proxy():
    # set a proxy IP
    ip_object = MyIpPools()
    ip_list = ip_object.get_proxy_ip_from_ip_pool()['http']
    proxy = ip_list[randint(0, len(ip_list) - 1)]

    return proxy
Example #7
    def get_url_body(self, tmp_url):
        '''
        Fetch the body of a url.
        :param tmp_url: the url to crawl
        :return: str
        '''
        # set a proxy IP
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

        tmp_proxies = {
            'http': self.proxy,
        }
        # print('------>>>| crawling with proxy IP: {} |<<<------'.format(self.proxy))

        try:
            response = requests.get(
                tmp_url, headers=self.headers, proxies=tmp_proxies,
                timeout=10)  # any &xxx= query outside the url must be constructed too
            data = response.content.decode('utf-8')
            # print(data)
        except Exception:
            print('requests.get() timed out....')
            print("today's data is empty!")
            data = '{}'
        return data
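Because the method returns the string '{}' on failure, callers can json.loads the result unconditionally. A usage sketch; 'spider' and the URL are hypothetical:

    import json

    data = json.loads(spider.get_url_body('https://example.com/api/today'))
    if not data:
        print('empty data, the proxy request probably failed')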
Example #8
def _init_chrome(is_headless=True, is_pic=True, is_proxy=True):
    '''
    If you use chrome, set page_timeout=30.
    :return:
    '''
    from selenium.webdriver.support import ui
    from selenium import webdriver

    CHROME_DRIVER_PATH = '/Users/afa/myFiles/tools/chromedriver'
    print('--->>> initializing the chrome driver <<<---')
    chrome_options = webdriver.ChromeOptions()
    if is_headless:
        chrome_options.add_argument('--headless')  # note: some pages cannot be reached in headless mode
    # the Chrome docs suggest adding this flag to work around a bug
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument(
        '--no-sandbox'
    )  # required when running as root user. otherwise you would get no sandbox errors.

    # chrome_options.add_argument('window-size=1200x600')   # set the window size

    # disable image loading (note: is_pic=True turns images off)
    if is_pic:
        prefs = {
            'profile.managed_default_content_settings.images': 2,
        }
        chrome_options.add_experimental_option("prefs", prefs)

    # set the proxy
    if is_proxy:
        ip_object = MyIpPools()
        proxy_ip = ip_object._get_random_proxy_ip()  # call the pool once; two calls could return different IPs
        proxy_ip = proxy_ip.replace('http://', '') if isinstance(proxy_ip, str) else ''
        if proxy_ip != '':
            chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))
    '''workaround for https pages failing to open'''
    # ignore ssl errors
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    # override the user-agent
    chrome_options.add_argument('--user-agent={0}'.format(user_agent))

    # ignore certificate errors
    chrome_options.add_experimental_option('excludeSwitches',
                                           ['ignore-certificate-errors'])

    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                              chrome_options=chrome_options,
                              desired_capabilities=capabilities)
    wait = ui.WebDriverWait(driver, 30)  # explicit wait: poll every 0.5s, for up to 30s, until the page has loaded
    print('------->>> initialization complete <<<-------')

    return driver
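A usage sketch, assuming the chromedriver path above exists on the target machine; the URL is illustrative:

    driver = _init_chrome(is_headless=True, is_pic=True, is_proxy=True)
    try:
        driver.get('https://example.com')
        print(driver.title)
    finally:
        driver.quit()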
Example #9
    async def get_proxy(self):
        '''
        Fetch a proxy asynchronously.
        :return: format: 'http://ip:port'
        '''
        # set a proxy IP
        ip_object = MyIpPools()
        proxy = ip_object._get_random_proxy_ip()    # returns False on failure

        return proxy
Example #10
    async def get_proxy(self):
        '''
        Fetch a proxy asynchronously.
        :return: format: 'http://ip:port'
        '''
        # set a proxy IP
        ip_object = MyIpPools()
        ip_list = ip_object.get_proxy_ip_from_ip_pool()['http']
        proxy = ip_list[randint(0, len(ip_list) - 1)]

        return proxy
Example #11
    def deal_with_div(self, goods_id):
        # analysis shows the description div can be fetched straight from the address below:
        # https://hws.m.taobao.com/cache/desc/5.0?callback=backToDesc&type=1&id=
        url = 'https://hws.m.taobao.com/cache/desc/5.0?callback=backToDesc&type=1&id=' + str(
            goods_id)
        # print(url)

        # set a proxy IP
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

        tmp_proxies = {
            'http': self.proxy,
        }
        try:
            self.from_ip_pool_set_proxy_ip_to_phantomjs()
            self.driver.get(url)
        except Exception:
            try:
                self.from_ip_pool_set_proxy_ip_to_phantomjs()
                self.driver.get(url)
            except Exception:
                self.from_ip_pool_set_proxy_ip_to_phantomjs()
                self.driver.get(url)

        body = self.driver.page_source
        # print(body)
        try:
            body = re.compile(r'backToDesc\((.*)\)').findall(body)[0]
        except IndexError:
            print('Error while fetching the detail-image description; skipping!')
            return ''

        try:
            body = json.loads(body)
        except Exception:
            print('json.loads failed while parsing the description body')
            return ''

        body = body.get('pcDescContent', '')
        body = re.compile(r'&lt;').sub(
            '<', body
        )  # when self.driver.page_source is serialized, '<' and '>' (and others) are escaped to &lt;/&gt;
        body = re.compile(r'&gt;').sub('>', body)
        body = re.compile(r'&amp;').sub('&', body)
        body = re.compile(r'&nbsp;').sub(' ', body)
        body = re.compile(r'src=\"https:').sub('src=\"', body)  # first strip the scheme where https is already present
        body = re.compile(r'src="').sub('src=\"https:', body)  # then prefix every src with https
        # print(body)

        return body
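The chain of entity substitutions above can also be handled by the standard library's html.unescape, which decodes &lt;, &gt;, &amp; and the other named entities in one call (note that &nbsp; becomes U+00A0 rather than a plain space). A minimal sketch:

    import re
    from html import unescape

    def clean_desc_html(body):
        '''Decode HTML entities, then normalize image src urls to https.'''
        body = unescape(body)
        body = re.sub(r'src="https:', 'src="', body)  # strip the scheme where present
        body = re.sub(r'src="', 'src="https:', body)  # then prefix every src with https
        return body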
Example #12
    def _init_chrome(self):
        '''
        If you use chrome, set page_timeout=30.
        :return:
        '''
        print('--->>> initializing the chrome driver <<<---')
        chrome_options = webdriver.ChromeOptions()
        # chrome_options.add_argument('--headless')     # note: some pages cannot be reached in headless mode
        # the Chrome docs suggest adding this flag to work around a bug
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument(
            '--no-sandbox'
        )  # required when running as root user. otherwise you would get no sandbox errors.

        # chrome_options.add_argument('window-size=1200x600')   # set the window size

        # disable image loading
        prefs = {
            'profile.managed_default_content_settings.images': 2,
        }
        chrome_options.add_experimental_option("prefs", prefs)

        # set the proxy
        ip_object = MyIpPools()
        proxy_ip = ip_object._get_random_proxy_ip()  # call the pool once; two calls could return different IPs
        proxy_ip = proxy_ip.replace('http://', '') if isinstance(proxy_ip, str) else ''
        if proxy_ip != '':
            chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))
        '''workaround for https pages failing to open'''
        # ignore ssl errors
        capabilities = webdriver.DesiredCapabilities.CHROME.copy()
        capabilities['acceptSslCerts'] = True
        capabilities['acceptInsecureCerts'] = True

        # override the user-agent
        chrome_options.add_argument('--user-agent={0}'.format(HEADERS[randint(
            0,
            len(HEADERS) - 1)]))

        # ignore certificate errors
        chrome_options.add_experimental_option('excludeSwitches',
                                               ['ignore-certificate-errors'])

        self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                                       chrome_options=chrome_options,
                                       desired_capabilities=capabilities)
        wait = ui.WebDriverWait(self.driver, 30)  # explicit wait: poll every 0.5s, for up to 30s, until the page has loaded
        print('------->>> initialization complete <<<-------')
Example #13
    def _get_proxies(cls):
        '''
        Get a single proxy.
        :return: format: {'http': ip+port}
        '''
        ip_object = MyIpPools()
        proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        proxy = proxies['http'][randint(0, len(proxies['http']) - 1)]  # index into the list, not the dict

        tmp_proxies = {
            'http': proxy,
        }

        return tmp_proxies
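A usage sketch; _get_proxies reads like a classmethod whose decorator was lost in extraction, so the class name here is hypothetical and the URL is illustrative:

    import requests

    tmp_proxies = SomeSpider._get_proxies()  # e.g. {'http': 'http://1.2.3.4:8080'}
    response = requests.get('http://example.com', proxies=tmp_proxies, timeout=10)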
Example #14
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--proxy-server=http://183.136.218.253:80')
chrome_options.add_argument('--headless')  # note: some pages cannot be reached in headless mode
chrome_options.add_argument('--disable-gpu')

# disable image loading
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option("prefs", prefs)
'''workaround for https pages failing to open'''
# ignore ssl errors
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
capabilities['acceptSslCerts'] = True
capabilities['acceptInsecureCerts'] = True

# Method 1: set the proxy
ip_object = MyIpPools()
proxy_ip = ip_object._get_random_proxy_ip()  # call the pool once; two calls could return different IPs
proxy_ip = proxy_ip.replace('http://', '') if isinstance(proxy_ip, str) else ''
if proxy_ip != '':
    chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

# Method 2:
# ip_object = MyIpPools()
# proxy_ip = ip_object._get_random_proxy_ip().replace('http://', '') if isinstance(ip_object._get_random_proxy_ip(), str) else ''
# # Change the proxy properties of that copy.
# capabilities['proxy'] = {
#     "httpProxy": proxy_ip,
#     "ftpProxy": proxy_ip,
#     "sslProxy": proxy_ip,
#     "noProxy": None,
#     "proxyType": "MANUAL",
Example #15
    def get_goods_data(self, goods_id):
        '''
        Build the data url by simulating the request.
        :param goods_id:
        :return: data   as dict
        '''
        self.msg = '------>>>| corresponding mobile address: ' + 'https://h5.m.taobao.com/awp/core/detail.htm?id=' + str(goods_id)
        self.my_lg.info(self.msg)

        appKey = '12574478'
        t = str(time.time().__round__()) + str(randint(100, 999))  # time.time().__round__() rounds to whole seconds

        '''
        Build the params below
        '''
        # self.my_lg.info(goods_id)
        params_data_1 = {
            'id': goods_id
        }
        params_data_2 = {
            'exParams': json.dumps(params_data_1),  # every nested dict must be json-encoded first
            'itemNumId': goods_id
        }
        # self.my_lg.info(str(params_data_2))

        ### * note, this is the correct url: right_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?appKey=12574478&t=1508886442888&api=mtop.taobao.detail.getdetail&v=6.0&ttid=2016%40taobao_h5_2.0.0&isSec=0&ecode=0&AntiFlood=true&AntiCreep=true&H5Request=true&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22exParams%22%3A%22%7B%5C%22id%5C%22%3A%5C%22546756179626%5C%22%7D%22%2C%22itemNumId%22%3A%22546756179626%22%7D'
        # right_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?appKey=12574478&t=1508886442888&api=mtop.taobao.detail.getdetail&v=6.0&ttid=2016%40taobao_h5_2.0.0&isSec=0&ecode=0&AntiFlood=true&AntiCreep=true&H5Request=true&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22exParams%22%3A%22%7B%5C%22id%5C%22%3A%5C%22546756179626%5C%22%7D%22%2C%22itemNumId%22%3A%22546756179626%22%7D'
        # right_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?appKey=12574478&t=1508857184835&api=mtop.taobao.detail.getdetail&v=6.0&ttid=2016%40taobao_h5_2.0.0&isSec=0&ecode=0&AntiFlood=true&AntiCreep=true&H5Request=true&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22exParams%22%3A%22%7B%5C%22id%5C%22%3A%5C%2241439519931%5C%22%7D%22%2C%22itemNumId%22%3A%2241439519931%22%7D'
        # self.my_lg.info(right_url)

        params = {
            'appKey': appKey,
            't': t,
            # sign = '24b2e987fce9c84d2fc0cebd44be49ef'     # sign may be empty
            'api': 'mtop.taobao.detail.getdetail',
            'v': '6.0',
            'ttid': '2016@taobao_h5_2.0.0',
            'isSec': '0',
            'ecode': '0',
            'AntiFlood': 'true',
            'AntiCreep': 'true',
            'H5Request': 'true',
            'type': 'jsonp',
            'callback': 'mtopjsonp1',
            'data': json.dumps(params_data_2),  # every nested dict must be json-encoded first
        }
        tmp_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/'

        # set a proxy IP
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()     # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

        tmp_proxies = {
            'http': self.proxy,
        }
        # self.my_lg.info('------>>>| crawling with proxy IP: {} |<<<------'.format(self.proxy))

        s = requests.session()
        try:
            response = s.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)  # any &xxx= query outside the url must be constructed too
            last_url = re.compile(r'\+').sub('', response.url)  # after the substitution this is the correct request url
            # self.my_lg.info(last_url)
            response = s.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)  # any &xxx= query outside the url must be constructed too
            data = response.content.decode('utf-8')
            # self.my_lg.info(data)
            data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)  # greedy match captures everything
            # self.my_lg.info(str(data))
        except Exception:
            self.my_lg.error('requests.get() timed out...' + ' failing goods_id: ' + str(goods_id))
            self.my_lg.error('data is empty!')
            self.result_data = {}  # reset so a failed item cannot leak into the next crawl
            return {}

        if data != []:
            data = data[0]
            try:
                data = json.loads(data)
            except Exception:
                self.my_lg.error('json.loads failed while converting data, please check! failing goods_id: ' + str(goods_id))
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}
            # pprint(data)

            if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' and data.get('data', {}).get('seller', {}).get('evaluates') is None:
                '''
                ## the item has been pulled off the shelves; the original address redirects to a new page
                '''
                self.my_lg.info('@@@@@@ the item has been pulled off the shelves...')
                tmp_data_s = self.init_pull_off_shelves_goods()
                self.result_data = {}
                return tmp_data_s

            # handle items whose page no longer exists because they were moved or delisted
            if data.get('data').get('seller', {}).get('evaluates') is None:
                self.my_lg.info('data is empty and the address was redirected; the item may have been moved or delisted')
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}

            data['data']['rate'] = ''           # item reviews
            data['data']['resource'] = ''       # buyers asking other buyers
            data['data']['vertical'] = ''       # more questions and answers
            data['data']['seller']['evaluates'] = ''  # ratings for item description, seller service, logistics...
            result_data = data['data']

            # process result_data['apiStack'][0]['value']
            # self.my_lg.info(result_data.get('apiStack', [])[0].get('value', ''))
            result_data_apiStack_value = result_data.get('apiStack', [])[0].get('value', {})
            try:
                result_data_apiStack_value = json.loads(result_data_apiStack_value)

                result_data_apiStack_value['vertical'] = ''
                result_data_apiStack_value['consumerProtection'] = ''  # 7-day no-questions-asked returns
                result_data_apiStack_value['feature'] = ''
                result_data_apiStack_value['layout'] = ''
                result_data_apiStack_value['delivery'] = ''     # shipping origin and destination
                result_data_apiStack_value['resource'] = ''     # coupons
                # result_data_apiStack_value['item'] = ''       # must stay commented out or the monthly sales are lost
                # pprint(result_data_apiStack_value)
            except Exception:
                self.my_lg.error("json.loads转换出错,得到result_data['apiStack'][0]['value']值可能为空,此处跳过" + ' 出错goods_id: ' + str(goods_id))
                result_data_apiStack_value = ''
                pass

            # write the processed value back to result_data['apiStack'][0]['value']
            result_data['apiStack'][0]['value'] = result_data_apiStack_value

            # process mockData
            mock_data = result_data['mockData']
            try:
                mock_data = json.loads(mock_data)
            except Exception:
                self.my_lg.error('json.loads failed while converting mock_data, bailing out' + ' failing goods_id: ' + str(goods_id))
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}
            mock_data['feature'] = ''
            # pprint(mock_data)
            result_data['mockData'] = mock_data

            # self.my_lg.info(str(result_data.get('apiStack', [])[0]))   # may be {'name': 'esi', 'value': ''}
            if result_data.get('apiStack', [])[0].get('value', '') == '':
                self.my_lg.info("result_data.get('apiStack', [])[0].get('value', '') is empty....")
                result_data['trade'] = {}
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}
            else:
                result_data['trade'] = result_data.get('apiStack', [])[0].get('value', {}).get('trade', {})     # used to decide whether the item has been delisted
                # pprint(result_data['trade'])

            self.result_data = result_data
            # pprint(self.result_data)
            return result_data
        else:
            self.my_lg.info('data is empty!')
            self.result_data = {}  # reset so a failed item cannot leak into the next crawl
            return {}
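The unwrap-and-parse step for mtopjsonp1(...) responses recurs in these examples; a minimal helper sketch:

    import json
    import re

    def unwrap_jsonp(body, callback='mtopjsonp1'):
        '''Extract the JSON payload from a jsonp response like callback({...}).
        Returns {} when the wrapper or the payload is malformed.'''
        m = re.search(r'%s\((.*)\)' % re.escape(callback), body)
        if m is None:
            return {}
        try:
            return json.loads(m.group(1))
        except ValueError:
            return {}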
Example #16
    def get_goods_data(self, goods_id):
        '''
        Build the data url by simulating the request.
        :param goods_id:
        :return: data   as dict
        '''
        if goods_id == '':
            self.result_data = {}  # reset so a failed item cannot leak into the next crawl
            return {}
        else:
            tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
            print('------>>>| mobile address of the item: ', tmp_url)
            '''
            1. Originally this simulated the page with requests (it ran safely for a month), but later plain requests started returning Not Found; remember not to run it through a VPN
            '''
            # set a proxy IP
            ip_object = MyIpPools()
            self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
            self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

            tmp_proxies = {
                'http': self.proxy,
            }
            # print('------>>>| crawling with proxy IP: {} |<<<------'.format(self.proxy))

            # try:
            #     response = requests.get(tmp_url, headers=self.headers, proxies=tmp_proxies, timeout=12)  # any &xxx= query outside the url must be constructed too
            #     main_body = response.content.decode('utf-8')
            #     # print(main_body)
            #     # main_body = re.compile(r'\n').sub('', main_body)
            #     main_body = re.compile(r'\t').sub('', main_body)
            #     main_body = re.compile(r'  ').sub('', main_body)
            #     print(main_body)
            #     data = re.compile(r'__PRELOADED_STATE__=(.*),window\.__SERVER_TIME__=').findall(main_body)  # greedy match captures everything
            #     print(data)
            # except Exception:
            #     print('requests.get() timed out....')
            #     print('data is empty!')
            #     self.result_data = {}  # reset so a failed item cannot leak into the next crawl
            #     return {}
            '''
            2. Handle it with phantomjs instead; again, do not run it through a VPN
            '''
            self.from_ip_pool_set_proxy_ip_to_phantomjs()
            try:
                self.driver.set_page_load_timeout(15)  # cap the page-load time to avoid corrupt data
            except:
                return {}

            try:
                self.driver.get(tmp_url)
                self.driver.implicitly_wait(20)  # implicit and explicit waits can be combined

                locator = (By.CSS_SELECTOR, 'div.sc-kgoBCf.bTQvTk'
                           )  # this css selector matches the mobile title block
                try:
                    WebDriverWait(self.driver, 20, 0.5).until(
                        EC.presence_of_element_located(locator))
                except Exception as e:
                    print('hit an error: ', e)
                    self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                    return {}
                else:
                    print('div.d-content has finished loading')
                main_body = self.driver.page_source
                main_body = re.compile(r'\n').sub('', main_body)
                main_body = re.compile(r'\t').sub('', main_body)
                main_body = re.compile(r'  ').sub('', main_body)
                # print(main_body)
                data = re.compile(
                    r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(
                        main_body)  # greedy match captures everything
                # print(data)
            except Exception as e:  # on timeout, abandon the load and continue
                print('-->>time out after 15 seconds when loading page')
                print('the error was: ', e)
                # self.driver.execute_script('window.stop()')  # when the page exceeds the load timeout, stop it via Javascript and carry on
                print('data is empty!')
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}

            # fetch the skudata
            # Juanpi's original skudata endpoint 1 (abandoned by the site)
            # skudata_url = 'https://webservice.juanpi.com/api/getOtherInfo?goods_id=' + str(goods_id)
            # the current Juanpi skudata endpoint 2
            skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
                goods_id)

            self.skudata_headers = self.headers
            self.skudata_headers['Host'] = 'webservice.juanpi.com'
            try:
                response = requests.get(
                    skudata_url,
                    headers=self.skudata_headers,
                    proxies=tmp_proxies,
                    timeout=10)  # any &xxx= query outside the url must be constructed too
                skudata = response.content.decode('utf-8')
                # print(skudata)
                skudata = re.compile(r'(.*)').findall(skudata)  # greedy match captures everything
                # print(skudata)
            except Exception:
                print('requests.get() timed out....')
                print('skudata is empty!')
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}

            if skudata != []:
                skudata = skudata[0]
                try:
                    skudata = json.loads(skudata)
                except:
                    self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                    return {}
                skudata = skudata.get('skudata', {})
                # pprint(skudata)

                try:
                    if skudata.get('info') is not None:
                        pass  # the skudata is valid

                    else:  # otherwise bail out
                        print("skudata['info'] is None, returning an empty dict")
                        self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                        return {}

                except AttributeError as e:
                    print('hit the following error (skipping for now): ', e)
                    self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                    return {}

            else:
                print('skudata is empty!')
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}

            if data != []:
                main_data = data[0]
                # print(main_data)
                try:
                    main_data = json.loads(main_data)
                    # pprint(main_data)
                except:
                    self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                    return {}

                if main_data.get('detail') is not None:
                    main_data = main_data.get('detail', {})
                    # process commitments
                    try:
                        main_data['commitments'] = ''
                        main_data.get('discount', {})['coupon'] = ''
                        main_data.get('discount', {})['coupon_index'] = ''
                        main_data.get('discount', {})['vip_info'] = ''
                        main_data['topbanner'] = ''
                    except:
                        pass
                    try:
                        main_data.get('brand_info')['sub_goods'] = ''
                    except:
                        pass

                    main_data['skudata'] = skudata
                    # pprint(main_data)
                    # print(main_data)
                    self.result_data = main_data
                    return main_data

                else:
                    print("data['detail'] is None, returning an empty dict")
                    self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                    return {}
            else:
                print('data is empty!')
                self.result_data = {}  # reset so a failed item cannot leak into the next crawl
                return {}
Example #17
    def get_div_from_pc_div_url(self, url, goods_id):
        '''
        Simulate a request to the pc description url and return the description div.
        :return: str
        '''
        '''
        appKey:12574478
        t:1509513791232
        api:mtop.taobao.detail.getdesc
        v:6.0
        type:jsonp
        dataType:jsonp
        timeout:20000
        callback:mtopjsonp1
        data:{"id":"546818961702","type":"1"}
        '''
        appKey = '12574478'
        t = str(time.time().__round__()) + str(randint(100, 999))  # time.time().__round__() rounds to whole seconds

        '''
        Build the params below
        '''
        # self.my_lg.info(goods_id)
        params_data_1 = {
            'id': goods_id,
            'type': '1',
        }

        # self.my_lg.info(str(params_data_2))
        params = {
            'data': json.dumps(params_data_1)  # every nested dict must be json-encoded first
        }

        tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/?appKey={}&t={}&api=mtop.taobao.detail.getdesc&v=6.0&type=jsonp&dataType=jsonp&timeout=20000&callback=mtopjsonp1'.format(
            appKey, t
        )

        tmp_proxies = {
            'http': self.proxy,
        }
        # self.my_lg.info('------>>>| crawling with proxy IP: {} |<<<------'.format(self.proxy))

        # three nested attempts so an error does not bail out immediately
        try:
            response = requests.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)  # any &xxx= query outside the url must be constructed too
        except Exception:
            try:
                # set a proxy IP
                ip_object = MyIpPools()
                self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
                self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

                tmp_proxies = {
                    'http': self.proxy,
                }
                response = requests.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)  # any &xxx= query outside the url must be constructed too
            except Exception:
                # set a proxy IP
                ip_object = MyIpPools()
                self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
                self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

                tmp_proxies = {
                    'http': self.proxy,
                }
                response = requests.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)  # any &xxx= query outside the url must be constructed too

        last_url = re.compile(r'\+').sub('', response.url)      # after the substitution this is the correct request url
        # self.my_lg.info(last_url)
        try:
            response = requests.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)  # any &xxx= query outside the url must be constructed too
        except Exception:
            ip_object = MyIpPools()
            self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
            self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]  # index into the list, not the dict

            tmp_proxies = {
                'http': self.proxy,
            }
            response = requests.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)  # any &xxx= query outside the url must be constructed too

        data = response.content.decode('utf-8')
        # self.my_lg.info(str(data))
        data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)  # greedy match captures everything
        if data != []:
            data = data[0]
            data = json.loads(data)

            if data:
                div = data.get('data', {}).get('pcDescContent', '')  # default to {} so .get() cannot fail on a missing key
                div = self.deal_with_div(div)
                # self.my_lg.info(div)
            else:
                div = ''
        else:
            div = ''

        return div
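The three nested try/except blocks above re-issue the request with a freshly drawn proxy; the same intent reads more clearly as a loop. A sketch, reusing MyIpPools from the surrounding code:

    import requests
    from random import choice

    def get_with_proxy_retry(url, headers, params=None, retries=3, timeout=13):
        '''Retry a GET, drawing a fresh proxy from the pool on every attempt.'''
        last_exc = None
        for _ in range(retries):
            ip_list = MyIpPools().get_proxy_ip_from_ip_pool().get('http') or []
            if not ip_list:
                break
            tmp_proxies = {'http': choice(ip_list)}
            try:
                return requests.get(url, headers=headers, params=params,
                                    proxies=tmp_proxies, timeout=timeout)
            except requests.RequestException as exc:
                last_exc = exc
        if last_exc is not None:
            raise last_exc
        raise RuntimeError('no proxy available in the pool')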