def from_ip_pool_set_proxy_ip_to_phantomjs(self):
    '''
    Switch the proxy ip used by phantomjs.
    :return: bool, False on failure
    '''
    ip_object = MyIpPools()
    ip_list = ip_object.get_proxy_ip_from_ip_pool().get('http')
    try:
        proxy_ip = ip_list[randint(0, len(ip_list) - 1)]  # pick a random proxy ip
    except Exception:
        # print('Failed to get a random ip from the ip pool... crawling with the local ip!')
        return False

    # print('------>>>| crawling via proxy ip: {} |<<<------'.format(proxy_ip))
    proxy_ip = re.compile(r'https://|http://').sub('', proxy_ip)
    proxy_ip = proxy_ip.split(':')  # split into ['host', 'port']
    try:
        tmp_js = {
            # the host must be quoted inside the generated js
            'script': "phantom.setProxy('{}', {});".format(proxy_ip[0], proxy_ip[1]),
            'args': [],
        }
        self.driver.command_executor._commands['executePhantomScript'] = (
            'POST', '/session/$sessionId/phantom/execute')
        self.driver.execute('executePhantomScript', tmp_js)
    except Exception:
        print('Failed to switch the proxy ip dynamically')
        return False

    return True
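Every snippet in this section depends on MyIpPools, whose implementation is not shown here. The stub below is a hedged reconstruction of the minimal interface implied by the call sites (method names and return shapes are taken from the snippets; the actual pool source is an assumption):

from random import randint

class MyIpPools:
    '''Minimal stub inferred from the call sites in this section.'''

    def get_proxy_ip_from_ip_pool(self) -> dict:
        # Assumed to return {'http': ['http://ip:port', ...]};
        # where the pool is populated from (redis, an api, ...) is not shown here.
        raise NotImplementedError

    def _get_random_proxy_ip(self):
        # Returns a single 'http://ip:port' string, or False when the pool is empty.
        ip_list = self.get_proxy_ip_from_ip_pool().get('http') or []
        if not ip_list:
            return False
        return ip_list[randint(0, len(ip_list) - 1)]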
def get_url_body(cls, url, headers: dict, params: dict = None, cookies=None, had_referer=False):
    '''
    Fetch the body for a url.
    :param url:
    :return: '' on error | body as str
    '''
    # set up a proxy ip
    ip_object = MyIpPools()
    proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
    proxy = proxies['http'][randint(0, len(proxies['http']) - 1)]  # index the list, not the dict

    tmp_proxies = {
        'http': proxy,
    }
    # print('------>>>| crawling via proxy ip: {} |<<<------'.format(proxy))

    tmp_headers = headers  # note: this aliases (and may mutate) the caller's headers dict
    # tmp_headers['Host'] = re.compile(r'://(.*?)/').findall(url)[0]
    if had_referer:
        if re.compile(r'https').findall(url) != []:
            tmp_headers['Referer'] = 'https://' + tmp_headers['Host'] + '/'
        else:
            tmp_headers['Referer'] = 'http://' + tmp_headers['Host'] + '/'

    s = requests.session()
    try:
        if params is not None:
            response = s.get(url, headers=tmp_headers, params=params, cookies=cookies,
                             proxies=tmp_proxies, timeout=12)  # any &xxx= outside the url must be built into params first
        else:
            response = s.get(url, headers=tmp_headers, proxies=tmp_proxies,
                             cookies=cookies, timeout=12)
        body = response.content.decode('utf-8')
        body = re.compile('\t').sub('', body)
        body = re.compile('&nbsp;').sub('', body)  # the entity was garbled to a bare space in the original text
        body = re.compile('\r\n').sub('', body)
        body = re.compile('\n').sub('', body)
        # print(body)
    except Exception:
        print('requests.get() timed out....')
        print('data is empty!')
        body = ''

    return body
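A hedged usage sketch for get_url_body. The class name Crawler is hypothetical (the method is written as a classmethod on an unnamed class); note that had_referer=True requires a 'Host' entry in the headers:

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Host': 'mobile.yangkeduo.com',  # read back when had_referer=True
}
body = Crawler.get_url_body('http://mobile.yangkeduo.com/', headers=headers, had_referer=True)
if body == '':
    print('fetch failed, retry with another proxy')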
def set_cookies_key_api_uid(self):
    '''
    Add a cookie whose key is api_uid to the headers.
    :return:
    '''
    # set up a proxy ip
    ip_object = MyIpPools()
    self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
    self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

    tmp_proxies = {
        'http': self.proxy,
    }

    # read the value of the cookie key named api_uid
    host_url = 'http://mobile.yangkeduo.com'
    try:
        response = requests.get(host_url, headers=self.headers, proxies=tmp_proxies, timeout=10)
        api_uid = response.cookies.get('api_uid')
        # print(response.cookies.items())
        # if api_uid is None:
        #     api_uid = 'rBQh+FoXerAjQWaAEOcpAg=='
        self.headers['Cookie'] = 'api_uid=' + str(api_uid) + ';'
        # print(api_uid)
    except Exception:
        print('requests.get() timed out....')
        pass
async def get_taobao_sign_and_body(base_url, headers: dict, params: dict, data: str,
                                   timeout=13, _m_h5_tk='undefine', session=None, logger=None):
    '''
    Fetch data from a signed taobao api endpoint.
    :param base_url:
    :param headers:
    :param params:
    :param data: json string
    :param timeout:
    :param _m_h5_tk:
    :param session:
    :return: (_m_h5_tk, session, body)
    '''
    sign, t = await calculate_right_sign(data=data, _m_h5_tk=_m_h5_tk)
    headers['Host'] = re.compile(r'://(.*?)/').findall(base_url)[0]
    params.update({  # add the query string entries below
        't': t,
        'sign': sign,
        'data': data,
    })

    # set up a proxy ip
    ip_object = MyIpPools()
    proxy = ip_object._get_random_proxy_ip()  # returns False on failure
    tmp_proxies = {
        'http': proxy,
    }

    if session is None:
        session = requests.session()
    try:
        response = session.get(url=base_url, headers=headers, params=params,
                               proxies=tmp_proxies, timeout=timeout)
        _m_h5_tk = response.cookies.get('_m_h5_tk', '')
        _m_h5_tk = _m_h5_tk.split('_')[0]
        # print(session.cookies.items())
        # print(_m_h5_tk)
        body = response.content.decode('utf-8')
        # print(body)
    except Exception as e:
        if logger is not None:  # logger defaults to None, so guard before logging
            logger.exception(e)
        _m_h5_tk = ''
        body = ''

    return (_m_h5_tk, session, body)
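calculate_right_sign is referenced above but not shown. A minimal sketch under the commonly documented assumption that the mtop h5 sign is md5 over 'token&t&appKey&data', where token is the first segment of the _m_h5_tk cookie; treat the signing rule and the millisecond timestamp as assumptions, not the source's confirmed implementation:

import time
from hashlib import md5

async def calculate_right_sign(data: str, _m_h5_tk: str = 'undefine', app_key: str = '12574478'):
    token = _m_h5_tk.split('_')[0]          # first segment of the _m_h5_tk cookie
    t = str(int(time.time() * 1000))        # millisecond timestamp
    # assumed mtop h5 signing rule: md5('token&t&appKey&data')
    sign = md5('{}&{}&{}&{}'.format(token, t, app_key, data).encode('utf-8')).hexdigest()
    return sign, t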
def from_ip_pool_set_proxy_ip_to_phantomjs(self):
    ip_object = MyIpPools()
    proxy_ip = ip_object._get_random_proxy_ip()
    if not proxy_ip:  # returns False on failure
        return False

    # print('------>>>| crawling via proxy ip: {} |<<<------'.format(proxy_ip))
    proxy_ip = re.compile(r'http://').sub('', proxy_ip)  # strip 'http://'
    proxy_ip = proxy_ip.split(':')  # split into ['host', 'port']
    try:
        tmp_js = {
            # the host must be quoted inside the generated js
            'script': "phantom.setProxy('{}', {});".format(proxy_ip[0], proxy_ip[1]),
            'args': [],
        }
        self.driver.command_executor._commands['executePhantomScript'] = (
            'POST', '/session/$sessionId/phantom/execute')
        self.driver.execute('executePhantomScript', tmp_js)
    except Exception:
        print('Failed to switch the proxy ip dynamically')
        return False

    return True
async def get_proxy():
    # set up a proxy ip
    ip_object = MyIpPools()
    ip_list = ip_object.get_proxy_ip_from_ip_pool()['http']
    proxy = ip_list[randint(0, len(ip_list) - 1)]
    return proxy
def get_url_body(self, tmp_url):
    '''
    Fetch the body of a url.
    :param tmp_url: the url to crawl
    :return: str
    '''
    # set up a proxy ip
    ip_object = MyIpPools()
    self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
    self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

    tmp_proxies = {
        'http': self.proxy,
    }
    # print('------>>>| crawling via proxy ip: {} |<<<------'.format(self.proxy))

    try:
        response = requests.get(tmp_url, headers=self.headers, proxies=tmp_proxies, timeout=10)
        data = response.content.decode('utf-8')
        # print(data)
    except Exception:
        print('requests.get() timed out....')
        print("today's data is empty!")
        data = '{}'

    return data
def _init_chrome(is_headless=True, is_pic=True, is_proxy=True):
    '''
    When using chrome, set page_timeout=30.
    :return:
    '''
    from selenium.webdriver.support import ui
    from selenium import webdriver

    CHROME_DRIVER_PATH = '/Users/afa/myFiles/tools/chromedriver'

    print('--->>> initializing the chrome driver <<<---')
    chrome_options = webdriver.ChromeOptions()
    if is_headless:
        chrome_options.add_argument('--headless')  # note: some pages cannot be reached in headless mode
    # the google docs mention this flag is needed to work around a bug
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')  # required when running as root user, otherwise you get no-sandbox errors
    # chrome_options.add_argument('window-size=1200x600')  # set the window size

    # no-image mode (note: is_pic=True disables image loading)
    if is_pic:
        prefs = {
            'profile.managed_default_content_settings.images': 2,
        }
        chrome_options.add_experimental_option('prefs', prefs)

    # set a proxy
    if is_proxy:
        ip_object = MyIpPools()
        tmp_ip = ip_object._get_random_proxy_ip()  # call once: a second call may return a different ip
        proxy_ip = tmp_ip.replace('http://', '') if isinstance(tmp_ip, str) else ''
        if proxy_ip != '':
            chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

    '''workaround for pages that fail to open over https'''
    # ignore ssl errors
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    # override the user-agent
    chrome_options.add_argument('--user-agent={0}'.format(user_agent))

    # ignore certificate errors
    chrome_options.add_experimental_option('excludeSwitches', ['ignore-certificate-errors'])

    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                              chrome_options=chrome_options,
                              desired_capabilities=capabilities)
    wait = ui.WebDriverWait(driver, 30)  # explicit wait up to n seconds, checking every 0.5s whether the page has loaded
    print('------->>> initialization finished <<<-------')

    return driver
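A hedged usage sketch for _init_chrome (the target url is illustrative only):

driver = _init_chrome(is_headless=True, is_pic=True, is_proxy=False)
try:
    driver.set_page_load_timeout(30)  # the docstring asks for page_timeout=30
    driver.get('https://example.com/')  # illustrative url
    print(driver.title)
finally:
    driver.quit()  # always release the chromedriver process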
async def get_proxy(self):
    '''
    Fetch a proxy asynchronously.
    :return: format: 'http://ip:port'
    '''
    # set up a proxy ip
    ip_object = MyIpPools()
    proxy = ip_object._get_random_proxy_ip()  # returns False on failure
    return proxy
async def get_proxy(self):
    '''
    Fetch a proxy asynchronously.
    :return: format: 'http://ip:port'
    '''
    # set up a proxy ip
    ip_object = MyIpPools()
    ip_list = ip_object.get_proxy_ip_from_ip_pool()['http']
    proxy = ip_list[randint(0, len(ip_list) - 1)]
    return proxy
def deal_with_div(self, goods_id):
    # analysis shows the description div can be fetched from the address below
    # https://hws.m.taobao.com/cache/desc/5.0?callback=backToDesc&type=1&id=
    url = 'https://hws.m.taobao.com/cache/desc/5.0?callback=backToDesc&type=1&id=' + str(goods_id)
    # print(url)

    # set up a proxy ip
    ip_object = MyIpPools()
    self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
    self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

    tmp_proxies = {
        'http': self.proxy,
    }

    try:
        self.from_ip_pool_set_proxy_ip_to_phantomjs()
        self.driver.get(url)
    except Exception:
        try:
            self.from_ip_pool_set_proxy_ip_to_phantomjs()
            self.driver.get(url)
        except Exception:
            self.from_ip_pool_set_proxy_ip_to_phantomjs()
            self.driver.get(url)

    body = self.driver.page_source
    # print(body)
    try:
        body = re.compile(r'backToDesc\((.*)\)').findall(body)[0]
    except IndexError:
        print('Error while fetching the detail-image description, skipping!')
        return ''

    try:
        body = json.loads(body)
    except Exception:
        print('json.loads failed to parse the desc body')
        return ''

    body = body.get('pcDescContent', '')
    # when self.driver.page_source is converted to a string, '<' and '>' are escaped
    # to '&lt;' and '&gt;' (other characters are escaped in the same way)
    body = re.compile(r'&lt;').sub('<', body)
    body = re.compile(r'&gt;').sub('>', body)
    body = re.compile(r'&amp;').sub('&', body)
    body = re.compile(r'&nbsp;').sub(' ', body)
    body = re.compile(r'src=\"https:').sub('src=\"', body)  # first strip 'https:' from the src attributes that already carry it
    body = re.compile(r'src="').sub('src=\"https:', body)   # then prepend 'https:' to all of them
    # print(body)

    return body
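The four entity substitutions above can also be done with the standard library. A minimal sketch (equivalent for the entities this function actually handles; the helper name is my own):

import html
import re

def unescape_desc(body: str) -> str:
    # html.unescape covers &lt; &gt; &amp; &nbsp; (and more) in one pass;
    # note &nbsp; becomes U+00A0, not a plain space, so normalize it afterwards
    body = html.unescape(body).replace('\u00a0', ' ')
    body = re.sub(r'src="https:', 'src="', body)  # same two-step https normalization as above
    body = re.sub(r'src="', 'src="https:', body)
    return body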
def _init_chrome(self):
    '''
    When using chrome, set page_timeout=30.
    :return:
    '''
    print('--->>> initializing the chrome driver <<<---')
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')  # note: some pages cannot be reached in headless mode

    # the google docs mention this flag is needed to work around a bug
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')  # required when running as root user, otherwise you get no-sandbox errors
    # chrome_options.add_argument('window-size=1200x600')  # set the window size

    # no-image mode
    prefs = {
        'profile.managed_default_content_settings.images': 2,
    }
    chrome_options.add_experimental_option('prefs', prefs)

    # set a proxy
    ip_object = MyIpPools()
    tmp_ip = ip_object._get_random_proxy_ip()  # call once: a second call may return a different ip
    proxy_ip = tmp_ip.replace('http://', '') if isinstance(tmp_ip, str) else ''
    if proxy_ip != '':
        chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

    '''workaround for pages that fail to open over https'''
    # ignore ssl errors
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    capabilities['acceptSslCerts'] = True
    capabilities['acceptInsecureCerts'] = True

    # override the user-agent
    chrome_options.add_argument('--user-agent={0}'.format(HEADERS[randint(0, len(HEADERS) - 1)]))

    # ignore certificate errors
    chrome_options.add_experimental_option('excludeSwitches', ['ignore-certificate-errors'])

    self.driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,
                                   chrome_options=chrome_options,
                                   desired_capabilities=capabilities)
    wait = ui.WebDriverWait(self.driver, 30)  # explicit wait up to n seconds, checking every 0.5s whether the page has loaded
    print('------->>> initialization finished <<<-------')
def _get_proxies(cls):
    '''
    Get a single proxy.
    :return: format: {'http': ip+port}
    '''
    ip_object = MyIpPools()
    proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
    proxy = proxies['http'][randint(0, len(proxies['http']) - 1)]  # index the list, not the dict

    tmp_proxies = {
        'http': proxy,
    }

    return tmp_proxies
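A hedged usage sketch for _get_proxies. The class name Spider is hypothetical (the method takes cls, so it is presumably decorated as a classmethod on its class):

import requests

tmp_proxies = Spider._get_proxies()  # e.g. {'http': 'http://1.2.3.4:8080'}
response = requests.get('http://mobile.yangkeduo.com/',
                        proxies=tmp_proxies, timeout=10)
print(response.status_code)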
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--proxy-server=http://183.136.218.253:80')
chrome_options.add_argument('--headless')  # note: some pages cannot be reached in headless mode
chrome_options.add_argument('--disable-gpu')

# no-image mode
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs', prefs)

'''workaround for pages that fail to open over https'''
# ignore ssl errors
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
capabilities['acceptSslCerts'] = True
capabilities['acceptInsecureCerts'] = True

# approach 1: set the proxy via a chrome option
ip_object = MyIpPools()
tmp_ip = ip_object._get_random_proxy_ip()  # call once: a second call may return a different ip
proxy_ip = tmp_ip.replace('http://', '') if isinstance(tmp_ip, str) else ''
if proxy_ip != '':
    chrome_options.add_argument('--proxy-server={0}'.format(proxy_ip))

# approach 2: set the proxy via the capabilities dict
# ip_object = MyIpPools()
# proxy_ip = ip_object._get_random_proxy_ip().replace('http://', '') if isinstance(ip_object._get_random_proxy_ip(), str) else ''
# # change the proxy properties of that copy
# capabilities['proxy'] = {
#     "httpProxy": proxy_ip,
#     "ftpProxy": proxy_ip,
#     "sslProxy": proxy_ip,
#     "noProxy": None,
#     "proxyType": "MANUAL",
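The commented "approach 2" block above is cut off mid-dict. A hedged sketch of the complete capabilities-based variant, following Selenium's documented MANUAL proxy shape (the closing of the dict is reconstructed, so treat it as an assumption rather than the source's exact code):

ip_object = MyIpPools()
tmp_ip = ip_object._get_random_proxy_ip()
proxy_ip = tmp_ip.replace('http://', '') if isinstance(tmp_ip, str) else ''
if proxy_ip != '':
    # change the proxy properties of the copied capabilities dict
    capabilities['proxy'] = {
        'httpProxy': proxy_ip,
        'ftpProxy': proxy_ip,
        'sslProxy': proxy_ip,
        'noProxy': None,
        'proxyType': 'MANUAL',
    }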
def get_goods_data(self, goods_id):
    '''
    Build the url that returns the data by simulating the client.
    :param goods_id:
    :return: data as dict
    '''
    self.msg = '------>>>| the corresponding mobile address is: ' + 'https://h5.m.taobao.com/awp/core/detail.htm?id=' + str(goods_id)
    self.my_lg.info(self.msg)

    appKey = '12574478'
    t = str(time.time().__round__()) + str(randint(100, 999))  # time.time().__round__() rounds to whole seconds

    ''' build the params below '''
    goods_id = goods_id
    # self.my_lg.info(goods_id)
    params_data_1 = {
        'id': goods_id
    }
    params_data_2 = {
        'exParams': json.dumps(params_data_1),  # every nested dict must be converted to json first
        'itemNumId': goods_id
    }
    # self.my_lg.info(str(params_data_2))

    ### * note: this is what a correct url looks like:
    right_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?appKey=12574478&t=1508886442888&api=mtop.taobao.detail.getdetail&v=6.0&ttid=2016%40taobao_h5_2.0.0&isSec=0&ecode=0&AntiFlood=true&AntiCreep=true&H5Request=true&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22exParams%22%3A%22%7B%5C%22id%5C%22%3A%5C%22546756179626%5C%22%7D%22%2C%22itemNumId%22%3A%22546756179626%22%7D'
    # right_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?appKey=12574478&t=1508857184835&api=mtop.taobao.detail.getdetail&v=6.0&ttid=2016%40taobao_h5_2.0.0&isSec=0&ecode=0&AntiFlood=true&AntiCreep=true&H5Request=true&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22exParams%22%3A%22%7B%5C%22id%5C%22%3A%5C%2241439519931%5C%22%7D%22%2C%22itemNumId%22%3A%2241439519931%22%7D'
    # self.my_lg.info(right_url)

    params = {
        'appKey': appKey,
        't': t,
        # sign = '24b2e987fce9c84d2fc0cebd44be49ef'  # sign may be empty
        'api': 'mtop.taobao.detail.getdetail',
        'v': '6.0',
        'ttid': '2016@taobao_h5_2.0.0',
        'isSec': '0',
        'ecode': '0',
        'AntiFlood': 'true',
        'AntiCreep': 'true',
        'H5Request': 'true',
        'type': 'jsonp',
        'callback': 'mtopjsonp1',
        'data': json.dumps(params_data_2),  # every nested dict must be converted to json first
    }
    tmp_url = 'https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/'

    # set up a proxy ip
    ip_object = MyIpPools()
    self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
    self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

    tmp_proxies = {
        'http': self.proxy,
    }
    # self.my_lg.info('------>>>| crawling via proxy ip: {} |<<<------'.format(self.proxy))

    s = requests.session()
    try:
        response = s.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)
        last_url = re.compile(r'\+').sub('', response.url)  # the converted url is the correct request address
        # self.my_lg.info(last_url)
        response = s.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)
        data = response.content.decode('utf-8')
        # self.my_lg.info(data)
        data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)  # greedy match over everything
        # self.my_lg.info(str(data))
    except Exception:
        self.my_lg.error('requests.get() timed out...' + ' offending goods_id: ' + str(goods_id))
        self.my_lg.error('data is empty!')
        self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
        return {}

    if data != []:
        data = data[0]
        try:
            data = json.loads(data)
        except Exception:
            self.my_lg.error('json.loads failed to parse data, please check!' + ' offending goods_id: ' + str(goods_id))
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}
        # pprint(data)

        if data.get('data', {}).get('trade', {}).get('redirectUrl', '') != '' \
                and data.get('data', {}).get('seller', {}).get('evaluates') is None:
            '''
            ## the item has been taken down; the original address redirects to a new page
            '''
            self.my_lg.info('@@@@@@ the item has been taken down...')
            tmp_data_s = self.init_pull_off_shelves_goods()
            self.result_data = {}
            return tmp_data_s

        # handle items whose page no longer exists because the item was moved or taken down
        if data.get('data').get('seller', {}).get('evaluates') is None:
            self.my_lg.info('data is empty, the address was redirected; the item may have been moved or taken down')
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}

        data['data']['rate'] = ''  # item reviews
        data['data']['resource'] = ''  # buyers asking others
        data['data']['vertical'] = ''  # also questions and answers
        data['data']['seller']['evaluates'] = ''  # ratings for item description, seller service, logistics...
        result_data = data['data']

        # handle result_data['apiStack'][0]['value']
        # self.my_lg.info(result_data.get('apiStack', [])[0].get('value', ''))
        result_data_apiStack_value = result_data.get('apiStack', [])[0].get('value', {})
        try:
            result_data_apiStack_value = json.loads(result_data_apiStack_value)
            result_data_apiStack_value['vertical'] = ''
            result_data_apiStack_value['consumerProtection'] = ''  # 7-day no-questions-asked returns
            result_data_apiStack_value['feature'] = ''
            result_data_apiStack_value['layout'] = ''
            result_data_apiStack_value['delivery'] = ''  # shipping origin to destination
            result_data_apiStack_value['resource'] = ''  # coupons
            # result_data_apiStack_value['item'] = ''  # must not be blanked, otherwise the monthly sales are lost
            # pprint(result_data_apiStack_value)
        except Exception:
            self.my_lg.error("json.loads failed; result_data['apiStack'][0]['value'] may be empty, skipping" + ' offending goods_id: ' + str(goods_id))
            result_data_apiStack_value = ''
            pass

        # write the processed value back to result_data['apiStack'][0]['value']
        result_data['apiStack'][0]['value'] = result_data_apiStack_value

        # handle mockData
        mock_data = result_data['mockData']
        try:
            mock_data = json.loads(mock_data)
        except Exception:
            self.my_lg.error('json.loads failed to parse mock_data, bailing out' + ' offending goods_id: ' + str(goods_id))
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}
        mock_data['feature'] = ''
        # pprint(mock_data)
        result_data['mockData'] = mock_data
        # self.my_lg.info(str(result_data.get('apiStack', [])[0]))

        # {'name': 'esi', 'value': ''} may occur here
        if result_data.get('apiStack', [])[0].get('value', '') == '':
            self.my_lg.info("result_data.get('apiStack', [])[0].get('value', '') is empty....")
            result_data['trade'] = {}
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}
        else:
            result_data['trade'] = result_data.get('apiStack', [])[0].get('value', {}).get('trade', {})  # used to decide whether the item has been taken down
            # pprint(result_data['trade'])
            self.result_data = result_data
            # pprint(self.result_data)
            return result_data
    else:
        self.my_lg.info('data is empty!')
        self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
        return {}
def get_goods_data(self, goods_id):
    '''
    Build the url that returns the data by simulating the client.
    :param goods_id:
    :return: data as dict
    '''
    if goods_id == '':
        self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
        return {}
    else:
        tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        print('------>>>| the mobile address of the item is: ', tmp_url)

        '''
        1. requests was used originally (and ran safely for a month at first), but plain
           requests later started returning Not Found; remember not to run it behind a vpn
        '''
        # set up a proxy ip
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]

        tmp_proxies = {
            'http': self.proxy,
        }
        # print('------>>>| crawling via proxy ip: {} |<<<------'.format(self.proxy))

        # try:
        #     response = requests.get(tmp_url, headers=self.headers, proxies=tmp_proxies, timeout=12)
        #     main_body = response.content.decode('utf-8')
        #     # print(main_body)
        #     main_body = re.compile(r'\n').sub('', main_body)
        #     main_body = re.compile(r'\t').sub('', main_body)
        #     main_body = re.compile(r'&nbsp;').sub('', main_body)
        #     print(main_body)
        #     data = re.compile(r'__PRELOADED_STATE__=(.*),window\.__SERVER_TIME__=').findall(main_body)  # greedy match
        #     print(data)
        # except Exception:
        #     print('requests.get() timed out....')
        #     print('data is empty!')
        #     self.result_data = {}
        #     return {}

        '''
        2. use phantomjs instead; remember not to run it behind a vpn
        '''
        self.from_ip_pool_set_proxy_ip_to_phantomjs()
        try:
            self.driver.set_page_load_timeout(15)  # set a limit to avoid corrupted data
        except:
            return {}

        try:
            self.driver.get(tmp_url)
            self.driver.implicitly_wait(20)  # implicit and explicit waits can be combined
            locator = (By.CSS_SELECTOR, 'div.sc-kgoBCf.bTQvTk')  # css of the mobile title block
            try:
                WebDriverWait(self.driver, 20, 0.5).until(EC.presence_of_element_located(locator))
            except Exception as e:
                print('hit an error: ', e)
                self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
                return {}
            else:
                print('the title block has finished loading')
                main_body = self.driver.page_source
                main_body = re.compile(r'\n').sub('', main_body)
                main_body = re.compile(r'\t').sub('', main_body)
                main_body = re.compile(r'&nbsp;').sub('', main_body)  # the entity was garbled to a bare space in the original text
                # print(main_body)
                data = re.compile(r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(main_body)  # greedy match
                # print(data)
        except Exception as e:
            # on timeout, abort the load and continue with the rest
            print('-->>time out after 15 seconds when loading page')
            print('the error is: ', e)
            # self.driver.execute_script('window.stop()')  # when the load exceeds the limit, stop it via javascript and carry on
            print('data is empty!')
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}

        # fetch skudata
        # juanpi's original skudata endpoint 1 (abandoned upstream)
        # skudata_url = 'https://webservice.juanpi.com/api/getOtherInfo?goods_id=' + str(goods_id)
        # juanpi's current skudata endpoint 2
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(goods_id)
        self.skudata_headers = self.headers
        self.skudata_headers['Host'] = 'webservice.juanpi.com'
        try:
            response = requests.get(skudata_url, headers=self.skudata_headers, proxies=tmp_proxies, timeout=10)
            skudata = response.content.decode('utf-8')
            # print(skudata)
            skudata = re.compile(r'(.*)').findall(skudata)  # greedy match over everything
            # print(skudata)
        except Exception:
            print('requests.get() timed out....')
            print('skudata is empty!')
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}

        if skudata != []:
            skudata = skudata[0]
            try:
                skudata = json.loads(skudata)
            except:
                self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
                return {}

            skudata = skudata.get('skudata', {})
            # pprint(skudata)
            try:
                if skudata.get('info') is not None:
                    pass  # the skudata is valid
                else:  # otherwise bail out
                    print("the 'info' key of skudata is None, returning an empty dict")
                    self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
                    return {}
            except AttributeError as e:
                print('hit the following error (skipping for now!): ', e)
                self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
                return {}
        else:
            print('skudata is empty!')
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}

        if data != []:
            main_data = data[0]
            # print(main_data)
            try:
                main_data = json.loads(main_data)
                # pprint(main_data)
            except:
                self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
                return {}

            if main_data.get('detail') is not None:
                main_data = main_data.get('detail', {})
                # handle commitments
                try:
                    main_data['commitments'] = ''
                    main_data.get('discount', {})['coupon'] = ''
                    main_data.get('discount', {})['coupon_index'] = ''
                    main_data.get('discount', {})['vip_info'] = ''
                    main_data['topbanner'] = ''
                except:
                    pass
                try:
                    main_data.get('brand_info')['sub_goods'] = ''
                except:
                    pass

                main_data['skudata'] = skudata
                # pprint(main_data)
                # print(main_data)
                self.result_data = main_data
                return main_data
            else:
                print("the 'detail' key of data is None, returning an empty dict")
                self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
                return {}
        else:
            print('data is empty!')
            self.result_data = {}  # reset to avoid leaking into the next crawl's assignment
            return {}
def get_div_from_pc_div_url(self, url, goods_id):
    '''
    Fetch the description div by simulating the pc description request.
    :return: str
    '''
    '''
    appKey:12574478
    t:1509513791232
    api:mtop.taobao.detail.getdesc
    v:6.0
    type:jsonp
    dataType:jsonp
    timeout:20000
    callback:mtopjsonp1
    data:{"id":"546818961702","type":"1"}
    '''
    appKey = '12574478'
    t = str(time.time().__round__()) + str(randint(100, 999))  # time.time().__round__() rounds to whole seconds

    ''' build the params below '''
    goods_id = goods_id
    # self.my_lg.info(goods_id)
    params_data_1 = {
        'id': goods_id,
        'type': '1',
    }
    params = {
        'data': json.dumps(params_data_1)  # every nested dict must be converted to json first
    }
    tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/?appKey={}&t={}&api=mtop.taobao.detail.getdesc&v=6.0&type=jsonp&dataType=jsonp&timeout=20000&callback=mtopjsonp1'.format(appKey, t)

    tmp_proxies = {
        'http': self.proxy,
    }
    # self.my_lg.info('------>>>| crawling via proxy ip: {} |<<<------'.format(self.proxy))

    # three layers of retries to avoid bailing out on an error
    try:
        response = requests.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)
    except Exception:
        try:
            # set up a fresh proxy ip
            ip_object = MyIpPools()
            self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
            self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]
            tmp_proxies = {
                'http': self.proxy,
            }
            response = requests.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)
        except Exception:
            # set up a fresh proxy ip
            ip_object = MyIpPools()
            self.proxies = ip_object.get_proxy_ip_from_ip_pool()
            self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]
            tmp_proxies = {
                'http': self.proxy,
            }
            response = requests.get(tmp_url, headers=self.headers, params=params, proxies=tmp_proxies, timeout=13)

    last_url = re.compile(r'\+').sub('', response.url)  # the converted url is the correct request address
    # self.my_lg.info(last_url)
    try:
        response = requests.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)
    except Exception:
        ip_object = MyIpPools()
        self.proxies = ip_object.get_proxy_ip_from_ip_pool()
        self.proxy = self.proxies['http'][randint(0, len(self.proxies['http']) - 1)]
        tmp_proxies = {
            'http': self.proxy,
        }
        response = requests.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)

    data = response.content.decode('utf-8')
    # self.my_lg.info(str(data))
    data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)  # greedy match over everything
    if data != []:
        data = data[0]
        data = json.loads(data)
        if data != []:
            div = data.get('data', '').get('pcDescContent', '')
            div = self.deal_with_div(div)
            # self.my_lg.info(div)
        else:
            div = ''
    else:
        div = ''

    return div
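The nested try/except blocks above repeat the same "rotate proxy and retry" step three times. A hedged helper sketch that factors the pattern out (the function and parameter names are my own, not from the source):

import requests
from random import randint

def get_with_proxy_retry(url, headers, params=None, retries=3, timeout=13):
    '''Retry a GET with a freshly rotated proxy on each failure (illustrative helper).'''
    last_exc = None
    for _ in range(retries):
        ip_list = MyIpPools().get_proxy_ip_from_ip_pool()['http']
        tmp_proxies = {'http': ip_list[randint(0, len(ip_list) - 1)]}
        try:
            return requests.get(url, headers=headers, params=params,
                                proxies=tmp_proxies, timeout=timeout)
        except Exception as e:
            last_exc = e  # rotate to a new proxy and try again
    raise last_exc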