def login(self):
    """Log the session in using stored cookies, at most once.

    When ``self.usercookies`` holds anything other than None/False it is
    merged into the session's cookie jar and then cleared so repeated
    calls become no-ops.  Errors are reported, never raised.
    """
    try:
        cookies = self.usercookies
        # Only act when cookies exist and the login has not run yet.
        if cookies not in (None, False):
            print_green('登陆...')
            self.session.cookies.update(cookies)
            # Mark the cookies as consumed.
            self.usercookies = False
    except Exception as err:
        print_green('登陆出错啦', err)
def is_get_right_page(self):
    """Return True when the response appears to be the target page.

    Override in subclasses when needed.  The default assumes the request
    URL *is* the target page, so being 302-redirected counts as failure,
    as does any HTTP error status or unexpected exception.
    """
    try:
        self.response.raise_for_status()
        history = self.response.history
        # A 302 as the first hop means we were bounced off the page.
        if history and history[0].status_code == 302:
            return False
        return True
    except Exception as err:
        print_green('判断是否获得正确页面出错啦', err)
        return False
def save_proxy2mongo(self, data):
    """Persist a proxy record and its related info to MongoDB.

    data = {'proxy': {...}, 'cookies': {...}, 'headers': {...}}

    Returns the inserted document id when a collection is configured,
    otherwise None.  A missing 'proxy' key is reported (not raised) via
    the shared best-effort error handling.
    """
    print_green('储存代理')
    try:
        # The 'proxy' key is mandatory for a record to be usable.
        if data.get('proxy') is None:
            raise ValueError('数据格式错误, 没有 proxy 键')
        if self.collection_name is not None:
            return self.col.insert_one(data).inserted_id
    except Exception as err:
        print_green('储存代理出错啦', err)
def parse_details_html_by_selenium(_retries=2):
    """Parse the selenium-rendered item listing and save every item.

    Waits for the ``.J_MouserOnverReq`` cards, then extracts image, title,
    url, price, deal count, shipping flag, shop name and location from each
    card and stores it via ``save_to_db()``.

    _retries : int, how many times to retry after a wait timeout.  New
               keyword with a default — bounds the retry recursion that
               was previously unlimited, while staying call-compatible.

    Returns the number of cards found, or None when parsing failed.
    """
    try:
        items = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '.J_MouserOnverReq')))
        # enumerate() replaces the original O(n) items.index(item) lookup
        # inside the loop (which made the whole loop O(n^2)).
        for index, item in enumerate(items):
            result = {}
            image_ele = item.find_element_by_class_name('J_ItemPic')
            # Lazily-loaded images keep the real URL in data-src;
            # fall back to src when it is absent.
            image = image_ele.get_attribute('data-src')
            result['image'] = image if image else image_ele.get_attribute(
                'src')
            result['title'] = item.find_element_by_class_name(
                'title').text.strip()
            result['url'] = item.find_element_by_class_name(
                'J_ClickStat').get_attribute('href')
            result['price'] = item.find_element_by_css_selector(
                '.price strong').text
            # Drop the trailing 3-character unit suffix of the deal count.
            result['deal_cnt'] = item.find_element_by_class_name(
                'deal-cnt').text[:-3]
            # Element-presence probe is site-specific but direct.
            result['is_service_free'] = '包邮' if has_element(
                item, '.icon-service-free') else '不包邮'
            result['shop_name'] = item.find_element_by_class_name(
                'shopname').text.strip()
            result['location'] = item.find_element_by_class_name(
                'location').text
            print_green(
                ' ' * 20, index, "*" * 20, '正在用selenium解析', '*' * 20,
                '保存到数据库: %s' % ('成功' if save_to_db(result) else '失败', ))
        return len(items)
    except TimeoutError as e:
        # NOTE(review): selenium wait timeouts raise
        # selenium.common.exceptions.TimeoutException, not the builtin
        # TimeoutError — confirm which exception is actually intended here.
        print_rd('等待超时', e)
        if _retries > 0:
            # Propagate the retry's result (the original discarded it) and
            # bound the recursion so a dead page cannot recurse forever.
            return parse_details_html_by_selenium(_retries - 1)
    except Exception as e:
        print_rd('parse__by_selenium 出错啦', e)
def proxy(self):
    """Return a working proxy dict; callers need not re-validate it.

    Tries the configured mongo collection first (consuming one document),
    then falls back to the proxy-pool HTTP API, retrying until the API
    yields an 'ip:port' string.

    Returns a dict shaped like
    ``{'proxy': {'http': 'http://ip:port', 'https': 'http://ip:port'}}``
    (or a stored mongo document), or None on unexpected error.
    """
    try:
        test_x = ''
        if self.collection_name is not None:
            test_x = '数据库没有代理,'
            print_green('从数据库获取代理')
            # Pop one stored proxy document so it cannot be reused.
            result = self.col.find_one_and_delete({})
            if result is not None:
                return result
        print_green('%s 从代理池获取代理' % test_x)
        resp = requests.get(PROXY_GETED_API)
        # fix: was ``resp.raise_for_status`` without parentheses — a no-op
        # that silently ignored HTTP errors from the proxy API.
        resp.raise_for_status()
        if len(resp.text.split(':')) == 2:
            proxy_url = 'http://' + resp.text
            return {'proxy': {'http': proxy_url, 'https': proxy_url}}
        # fix: the original ``return self.proxy`` handed back the bound
        # method object instead of recursing for another attempt.
        # NOTE(review): if this def is wrapped as a @property elsewhere,
        # drop the call parentheses again.
        return self.proxy()
    except Exception as err:
        print_green('获取代理出错啦', err)
def get_resp_wrapper(self, url, is_login='******', retry=5, is_sleep=True, validate=None, **kwargs):
    '''
    Pre/post-process the result of a session request made via ``func``:
    whether the target page needs a login (and when to log in), error
    retries, and optional captcha/anti-bot validation.

    is_login : '******' -> log in after the request / 'before' -> log in
               before the request / None -> do not log in; takes priority
               over the instance setting
    validate : None -> use the instance's ``self.validate`` / False ->
               takes priority over the instance, disables validation /
               function -> run this callable as the validator
    retry    : int, number of retries remaining after errors
    is_sleep : whether to sleep before a successful return
    kwargs   : passed through to the underlying session request
    '''
    # NOTE(review): ``func`` is not defined in this scope — this reads like
    # the inner wrapper of a decorator where ``func`` is the wrapped
    # session request method; confirm against the enclosing definition.
    if retry < 0:
        # Too many failures: rebuild the client from scratch and restart
        # (retry resets to its default on the fresh instance).
        print_green('错误次数太多, 重新实例化并执行')
        return get_resp_wrapper(MsessionReq(*self.params_tuple, **self.params_dict), url, is_login=is_login, validate=validate, **kwargs)
    try:
        if is_login == 'before':
            self.login()
        self.response = func(self, url, **kwargs)
        if self.is_get_right_page() is False:
            # A redirect means the proxy was blocked / we were bounced.
            print_green('302码, 代理被封:', self.session.proxies['http'])
            referer_url = self.response.history[0].url
            self.session.headers.update({'Referer': referer_url})
            if validate is not None:
                # Explicit argument wins over the instance setting.
                if validate is False:
                    self.__is_validate = False
                else:
                    __validate = validate
            else:
                # Fall back to the instance-level validator.
                if self.validate is None:
                    self.__is_validate = False
                else:
                    __validate = self.validate
            if self.__is_validate is True:
                # Try to solve the captcha / validation page.
                results = __validate(referer_url, self)
                if results['is_continue'] is False:
                    # NOTE(review): writes a class-level attribute, and
                    # private-name mangling is resolved at the lexical
                    # class — verify this actually disables validation
                    # for all instances as intended.
                    MsessionReq.__is_validate = False
                if results['result'] is not None:
                    self.proxy_info.update(results['result'])
                    # Validation yielded usable state: retry the request
                    # with the same client instance.
                    return get_resp_wrapper(self, url, is_login=is_login, validate=validate, **kwargs)
                print_green('验证失败, 重新实例化并执行')
                return get_resp_wrapper(MsessionReq(*self.params_tuple, **self.params_dict), url, is_login=is_login, validate=validate, **kwargs)
        # The page was fetched successfully.
        if is_login == 'behind':
            self.login()
        self.session.headers.update({'Referer': self.response.url})
        if is_sleep:
            # Random wait before each successful return, so failed
            # attempts do not pay the delay.
            time.sleep(uniform(1, 7))
        print_green('succeed 正在使用代理:', self.proxy_info['proxy']['http'])
        return self.response
    except Exception as err:
        print_green('get_resp_wrapper出错啦', err)
        # Do not re-instantiate: reuse the same self and run again with
        # one fewer retry.
        return get_resp_wrapper(self, url, is_login=is_login, retry=retry - 1, validate=validate, **kwargs)
def __del__(self):
    """Close the mongo client on teardown when a collection was configured."""
    if self.collection_name is None:
        return
    print_green('关闭代理数据库连接')
    self.client.close()