class Driver(object): def __init__(self): # 使用requestium的Session, 使用requests和Selenium, 设置为headless模式 self.s = Session( webdriver_path='./chromedriver', browser='chrome', default_timeout=15, #webdriver_options={'arguments': ['headless']} ) self.category_mapping = None path = os.path.join(os.getcwd(), FILENAME) if os.path.exists(path): self.category_mapping = ujson.load(open(path)) #pprint(self.category_mapping) def close(self): if self.s.driver is not None: self.s.driver.quit() if self.s is not None: self.s.close() def login(self): """ 使用driver登录到启信宝 """ login_url = 'http://www.qixin.com/auth/login?return_url=%2F' self.s.driver.get(login_url) # 使用requestium中的ensure_*方法定位元素 username_xpath = '//input[@class="form-control input-lg input-flat input-flat-user"]' user_element = self.s.driver.ensure_element_by_xpath(username_xpath) for c in USERNAME: # 间歇输入Username和Password user_element.send_keys(c) time.sleep(random.randint(0, 2)) password_xpath = '//input[@class="form-control input-lg input-flat input-flat-lock"]' password_element = self.s.driver.ensure_element_by_xpath( password_xpath) for c in PASSWORD: password_element.send_keys(c) time.sleep(random.random()) password_element.send_keys(Keys.ENTER) self.s.driver.implicitly_wait(10) def process_cookies(self): """ 使用requests抓取页面 """ # 将driver的cookies转给requests的session tmp_url = 'http://www.qixin.com/search?area.province=12&page=1&scope[]=1' self.s.driver.get(tmp_url) self.s.transfer_driver_cookies_to_session() self.s.copy_user_agent_from_driver() # 判断category mapping是否存在 if self.category_mapping is None: req = self.s.get('http://www.qixin.com') self.category_mapping = {} for element in req.xpath('//div[@class="grid-item"]'): category_l1 = element.xpath( './div/text()').extract_first().strip() category_l2 = element.xpath('./a/text()').extract() self.category_mapping[category_l1] = category_l2 ujson.dump(self.category_mapping, open(os.path.join(os.getcwd(), FILENAME), 'w')) def fetch_page(self): # 获取cookies之后,使用requests的session开始抓取数据 result = [] self.s.proxies.update({ 'http': 'http://forward.xdaili.cn:80', 'https': 'https://forward.xdaili.cn:80' }) for page in range(1, 11): url = 'http://www.qixin.com/search?area.province=12&page=%s&scope[]=1&sorter=4' % page self.s.headers.update({'Proxy-Authorization': sign()}) req = self.s.get(url) for element in req.xpath( "//div[contains(@class, 'company-item')]"): result.append({ 'title': element.xpath(".//div[@class='company-title']/a/text()" ).extract_first().strip(), 'legal_owner': element.xpath(".//div[@class='legal-person'][1]/text()" ).re_first(r'法定代表人:(\w*)').strip(), 'status': element.xpath( ".//div[@class='company-tags']/span[1]/text()"). extract_first().strip(), 'capital': element.xpath(".//div[contains(@class, 'col-3-1')]/text()" ).extract_first().strip(), 'date': element.xpath(".//div[contains(@class, 'col-3-2')]/text()" ).extract_first().strip(), 'url': element.xpath(".//div[@class='company-title']/a/@href" ).extract_first().strip() }) time.sleep(10) return result def process_search_condition(self): """ 构建搜索条件 * URL: http://www.qixin.com/search? * param 地区: area.province=12, area.district=120101-120119 * param 搜索范围: scope[]=1 * param 排序: sorter=3 | 4 * param 注册资本: capital: 1-5 * param 所属行业: industry.l1 一级行业, industry.l2 二级行业 * param 注册年份: year: 1-5 * param page: 页码,最大不超过500, 只能看5000条搜索结果 http://www.qixin.com/search?area.district=120101&area.province=12&capital=2&industry.l1=%E5%86%9C%E3%80%81%E6%9E%97%E3%80%81%E7%89%A7%E3%80%81%E6%B8%94%E4%B8%9A&industry.l2=%E5%86%9C%E4%B8%9A&page=1&scope[]=1&sorter=4&year=5 """ pass
class Driver(object): def __init__(self): # 使用requestium的Session, 使用requests和Selenium, 设置为headless模式 self.s = Session( webdriver_path='./chromedriver', browser='chrome', default_timeout=15, #webdriver_options={'arguments': ['headless']} ) # self.category_mapping = None # path = os.path.join(os.getcwd(), FILENAME) # if os.path.exists(path): # self.category_mapping = ujson.load(open(path)) # pprint(self.category_mapping) def close(self): if self.s.driver is not None: self.s.driver.quit() if self.s is not None: self.s.close() def login(self): """ 使用driver登录到启信宝 """ login_url = 'http://www.qixin.com/auth/login?return_url=%2F' self.s.driver.get(login_url) # 使用requestium中的ensure_*方法定位元素 user_element = self.s.driver.ensure_element_by_xpath( LOGIN_XPATH['username']) for c in USERNAME: # 间歇输入Username和Password user_element.send_keys(c) time.sleep(random.randint(0, 2)) password_element = self.s.driver.ensure_element_by_xpath( LOGIN_XPATH['password']) for c in PASSWORD: password_element.send_keys(c) time.sleep(random.random()) password_element.send_keys(Keys.ENTER) self.s.driver.implicitly_wait(20) def process_cookies(self): """ 使用requests抓取页面 """ # 将driver的cookies转给requests的session tmp_url = 'http://www.qixin.com/search?area.province=12&page=1&scope[]=1' self.s.driver.get(tmp_url) self.s.transfer_driver_cookies_to_session() self.s.copy_user_agent_from_driver() # 判断category mapping是否存在 if self.category_mapping is None: req = self.s.get('http://www.qixin.com') self.category_mapping = {} for element in req.xpath(CATEGORY_XPATH['info']): category_l1 = element.xpath( CATEGORY_XPATH['l1']).extract_first().strip() category_l2 = element.xpath(CATEGORY_XPATH['l2']).extract() self.category_mapping[category_l1] = category_l2 ujson.dump(self.category_mapping, open(os.path.join(os.getcwd(), FILENAME), 'w')) def fetch_page_with_chrome(self, url): self.s.transfer_session_cookies_to_driver() self.s.driver.get(url) def fetch_page_with_requests(self, url): """ url = 'http://www.qixin.com/search?area.province=12&page=%s&scope[]=1&sorter=4' % page :param url:请求的URL :param return: 返回list """ # 获取cookies之后,使用requests的session开始抓取数据 self.s.proxies.update({ 'http': 'http://forward.xdaili.cn:80', 'https': 'https://forward.xdaili.cn:80' }) self.s.headers.update({'Proxy-Authorization': sign()}) req = self.s.get(url) result = parse_list(req) return result