def get_next_page(self, log):
    """Walk every result-list page of the current search and store the rows.

    The page count is ``self.company_num / 20`` rounded up (20 is the
    divisor the code uses as the page size). Page 1 is parsed without
    navigating (presumably it is already loaded by the caller -- confirm);
    every later page is opened through its ``/search/p<N>`` URL first.

    :param log: logger-like object providing ``info``; also handed on to
        ``parse_and_get_list_company`` and ``insert_db``.
    :return: None
    """
    # Round up so that a partially filled last page is still visited.
    total_pages = math.ceil(self.company_num / 20)
    log.info("总页数:{}".format(total_pages))

    collected = []
    last_page = int(total_pages)
    for page_no in range(1, last_page + 1):
        if page_no > 1:
            # Build the URL of the next result page and load it.
            target = "https://www.tianyancha.com/search/p{}?key={}".format(
                page_no, self.label_name)
            self.driver.get(target)
            time.sleep(2)

        # Parse whatever page the browser is currently showing.
        rows = self.parse_and_get_list_company(log)
        if rows:
            collected.extend(rows)

    if not collected:
        # Nothing was parsed: store nothing and leave the name's flag alone.
        return

    insert_db(collected, log)
    name_results_coll.update_one({'_id': self._id}, {'$set': {"flag": 1}})
def update_name_db(self, log):
    """Write the current name/company counts to this name's result document.

    Sets ``name_num`` and ``company_numm`` (the key is spelled with a double
    "m"; the other writers in this file use the same spelling) on the
    document whose ``_id`` is ``self._id``.

    The update is best-effort: any failure is logged and swallowed so the
    caller keeps running.

    :param log: logger-like object providing ``info`` and ``error``.
    :return: None
    """
    try:
        # Refresh the stored counts for this name.
        name_results_coll.update_one(
            {'_id': self._id},
            {'$set': {
                "name_num": self.name_num,
                "company_numm": self.company_num,
            }},
        )
    except Exception as e:
        log.info("数据无法更新...")
        # Also record the cause; previously the exception was discarded,
        # which made a failed update impossible to diagnose.
        log.error(e)
def parse_company_pages(self, log):
    """Read the result counts from the list page and decide whether to crawl.

    Steps:

    1. Read the company count into ``self.company_num``. A count shown with
       a trailing ``+`` (for example ``100+``) is kept as a string; any
       other count becomes an ``int``. On any failure the count is 0.
    2. Read the boss (person) count into ``self.name_num`` (0 on failure).
    3. If the company count is a string, store ``flag: 2`` with
       ``company_numm: 0`` on the name document and stop.
    4. Otherwise persist the counts via ``update_name_db`` and, when the
       count is between 1 and 100 inclusive, crawl the result pages with
       ``get_next_page``.

    Any exception raised by steps 3-4 is logged and swallowed.

    :param log: logger-like object providing ``info`` and ``error``.
    :return: None
    """
    try:
        # --- company count ---
        try:
            banner = self.driver.find_element_by_xpath(
                '//span[contains(text(),"天眼查为你找到")]/..').text
            # Capture an optional trailing '+' as well. With a digits-only
            # group the '+' test below could never succeed, which left the
            # string / flag-2 branch unreachable.
            raw_count = re.search(r'(\d+\+?)', banner).group(1)
            if '+' in raw_count:
                self.company_num = raw_count
            else:
                self.company_num = int(raw_count)
        except Exception as e:
            log.error(e)
            self.company_num = 0

        # --- boss (person) count ---
        try:
            banner = self.driver.find_element_by_xpath(
                '//div[contains(text(),"天眼查为你找到")]/..').text
            self.name_num = int(re.search(r'(\d+)', banner).group(1))
        except Exception as e:
            log.error(e)
            self.name_num = 0

        log.info('老板数量:{}'.format(self.name_num))
        log.info('公司数量:{}'.format(self.company_num))

        if isinstance(self.company_num, str):
            # Open-ended count ("N+"): record it with flag 2 and do not crawl.
            name_results_coll.update_one({'_id': self._id}, {
                '$set': {
                    "name_num": self.name_num,
                    "company_numm": 0,
                    'flag': 2
                }
            })
            return

        self.update_name_db(log)
        # Only crawl when there is something to fetch and it is within the
        # 100-company limit.
        if 0 < self.company_num <= 100:
            self.get_next_page(log)
    except Exception as e:
        log.info(e)
        log.info("parse_company_pages异常")
def handle_question(self, log):
    """Work out which page the browser landed on and react to it.

    The page states are probed in a fixed order; the first one that applies
    is handled and the method returns:

    1. Home page (search box present): login succeeded. The account gets
       ``flag: 1`` and ``self.flag`` is set to False.
    2. Login form still present: the account is marked ``usable: 1`` so it
       is not picked again.
    3. Result list with a parsable company count: delegate to
       ``parse_company_pages`` and reset ``self.check_count``.
    4. The ``container``/``content`` page (per the original comment, the
       account needs captcha solving): stamp the account with ``flag: 1``
       and log in again.
    5. The "100 companies / VIP 5000 companies" notice: set the account to
       ``flag: 0``.
    6. The "no results" message: mark the name with ``flag: 4``; after four
       such hits in a row (``self.check_count >= 4``) stamp the account
       with ``flag: 0`` and log in again.
    7. The "non-human behaviour" ban message: stamp the account with
       ``flag: 0`` and log in again.

    :param log: logger-like object providing ``info`` and ``error``.
    :return: None
    :raises Exception: whatever the last lookup (step 7) raises when none of
        the known page states is present, and anything raised by steps 3
        and 5, which are not guarded.
    """

    def _flag_account_and_relogin(flag_value):
        # Stamp the account with the flag and a millisecond timestamp, then
        # start a fresh login.
        stamp = str(int(time.time() * 1000))
        account_results_coll.update_one(
            {'_id': self.account_id},
            {'$set': {
                "flag": flag_value,
                'updateTime': stamp
            }})
        self.flag = True
        self.login(log)

    # 1. Landing on the home page means the login succeeded.
    try:
        page = self.driver.page_source
        # Each marker must be tested on its own: the previous expression
        # ('proxy' or '503' or '500') collapsed to just 'proxy'.
        # NOTE(review): '503' / '500' are bare substring matches and can also
        # hit ordinary pages containing those digits -- confirm the markers
        # against the real error pages.
        if any(marker in page for marker in ('proxy', '503', '500')):
            self.driver.refresh()
            time.sleep(3)
        home_pages = self.driver.find_element_by_xpath(
            "//input[@id='home-main-search']")
        if home_pages:
            # Mark the account up front.
            account_results_coll.update_one({'_id': self.account_id},
                                            {'$set': {
                                                "flag": 1
                                            }})
            self.flag = False
        return
    except Exception:
        # Not the home page (or the probe failed): try the next state.
        pass

    # 2. Login form still shown: the account is unusable, login failed.
    try:
        no_use = self.driver.find_element_by_xpath(
            "//div[@class='pb30 position-rel']/input")
        if no_use:
            # Mark as unusable so it is not fetched next time.
            account_results_coll.update_one({'_id': self.account_id},
                                            {'$set': {
                                                "usable": 1
                                            }})
        return
    except Exception:
        pass

    # 3. Result list page: read the company count from the banner.
    try:
        tmp = self.driver.find_element_by_xpath(
            '//span[contains(text(),"天眼查为你找到")]/..').text
        match = int(re.search(r'天眼查为你找到(\d+)家公司', tmp).group(1))
    except Exception as e:
        log.error(e)
        match = 0
    if match:
        self.parse_company_pages(log)
        self.check_count = 0
        return

    # 4. Not on the list page: check for the container/content page
    #    (original comment: the account needs captcha solving).
    try:
        self.driver.find_element_by_xpath(
            "//div[@class='container']//div[@class='content']")
        _flag_account_and_relogin(1)
        return
    except Exception:
        # Search scope too large, or the account is temporarily unusable.
        pass

    # 5. Result-limit notice for ordinary users.
    if '普通用户可查看100家公司,VIP会员可查看5000家公司' in self.driver.page_source:
        account_results_coll.update_one({'_id': self.account_id},
                                        {'$set': {
                                            "flag": 0
                                        }})
        return

    # 6. "No results" page for this name.
    try:
        error_str = self.driver.find_element_by_xpath(
            "//div[@class='f24 mb40 mt40 sec-c1 ']").text
        if error_str == "抱歉,没有找到相关结果!":
            log.info("[ERROR]: 抱歉,没有找到相关结果!")
            # Record zero counts so the search moves on to the next name.
            name_results_coll.update_one({'_id': self._id}, {
                '$set': {
                    "name_num": 0,
                    "company_numm": 0,
                    'flag': 4
                }
            })
            self.check_count += 1
            if self.check_count >= 4:
                _flag_account_and_relogin(0)
        return
    except Exception:
        pass

    # 7. Ban page: match the error banner text.
    error_info = self.driver.find_element_by_xpath(
        "/html/body/div/div[1]").text
    if error_info == "系统检测到您非人类行为,己被禁止访问天眼查,若有疑问请联系官方qq群 515982002":
        log.info("[ERROR]: {}".format(error_info))
        _flag_account_and_relogin(0)