class CEESpiderSina:
    """Spider for College Entrance Examination (CEE) score data on Sina.

    Typical use: pick the query filters with the ``select_*`` methods, call
    :meth:`do_search`, collect every result page with
    :meth:`get_result_and_more`, then read the parsed rows from
    :attr:`colleges`.
    """

    # Keys assigned, in order, to the fields of one parsed table row.  The last
    # key is filled from ``self.region`` rather than from the page text.
    _FIELDS = (
        'university', 'type', 'university_region', 'average_score',
        'subject', 'year', 'batch', 'student_region',
    )

    def __init__(self, proxy=None):
        """Open a browser session and load the score-ranking page.

        :param proxy: passed through unchanged to ``AutoBrowser``
        """
        self.region = ''
        # Created up front so the attribute exists even before do_search().
        self.current_url = ''
        self.result = []
        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf(
            'http://kaoshi.edu.sina.com.cn/college/collegeAvgScoreRank?syear=2013&provid=1',
            ready_check=(By.CLASS_NAME, 'pageNumWrap'),
        )

    def select_region(self, region):
        """Choose the province and remember it for the parsed rows.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('#provSel', select_text=region)

    def select_subject(self, subject='文科'):
        """Choose the subject track (liberal arts or science).

        :param str subject: subject track as shown in the ``#typeSel`` control
        :return: None
        """
        self.browser.interact_one_time('#typeSel', select_text=subject)

    def select_year(self, year='2014'):
        """Choose the year.

        :param str year: year as shown in the ``#sYear`` control
        :return: None
        """
        self.browser.interact_one_time('#sYear', select_text=year)

    def select_batch(self, batch='本科一批'):
        """Choose the admission batch.

        :param str batch: batch as shown in the ``#sBatch`` control
        :return: None
        """
        self.browser.interact_one_time('#sBatch', select_text=batch)

    def do_search(self):
        """Click the search button and wait for the pager to appear.

        :return: None
        :raises TimeoutError: if the ``pageNumWrap`` element is not ready
        """
        self.browser.interact_one_time('#searchBtn', click=True)
        if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
            raise TimeoutError
        self.current_url = self.browser.browser.current_url
        # Fixed pause after the search; presumably lets the table finish
        # rendering -- TODO confirm it is still needed.
        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """Append the text of every result page to ``self.result``.

        Reads the current page, then keeps clicking the "next" pager link and
        reading the table until that link can no longer be found.

        :return: None
        :raises TimeoutError: if the pager is not ready after a page change
        """
        next_link = '.pageNumWrap > [node-type="next"]'
        self.result.append(
            self.browser.get_text(location='#scoreTable2', beautiful=False))
        while True:
            try:
                # Existence probe only: the lookup raises once there is no
                # "next" link, which is how the loop ends.
                # NOTE(review): find_element_by_css_selector is the legacy
                # Selenium spelling -- confirm the installed version has it.
                self.browser.browser.find_element_by_css_selector(next_link)
                self.browser.interact_one_time(next_link, click=True)
                time.sleep(2)
                if not self.browser.is_ready(
                        locator=(By.CLASS_NAME, 'pageNumWrap')):
                    raise TimeoutError
                self.result.append(
                    self.browser.get_text(location='#scoreTable2',
                                          beautiful=False))
            except NoSuchElementException:
                break

    @property
    def colleges(self):
        """Parse the collected page texts into row dictionaries.

        Every line of every collected text is split on whitespace; the first
        token is dropped, the next (up to) seven tokens plus the selected
        region are zipped with the column names.  Lines that do not yield all
        eight columns are discarded.  All values are returned as strings.

        :return: list of row dictionaries
        :rtype: list
        """
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n'):
                # Token 0 is skipped -- presumably a rank/index column.
                fields = re.split(r'\s+', line)[1:8]
                fields.append(self.region)
                row = dict(zip(self._FIELDS, fields))
                # Short lines produce fewer keys (and a misplaced region).
                if len(row) == len(self._FIELDS):
                    rows.append(row)
        return rows

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()
class CEESpider:
    """Spider for College Entrance Examination (CEE) score data on gkcx.eol.cn.

    Typical use: pick the filters (:meth:`select_region`,
    :meth:`select_subject`, :meth:`set_college`), call :meth:`do_search`,
    collect every result page with :meth:`get_result_and_more`, then read the
    parsed rows from :attr:`colleges`.
    """

    # Keys assigned, in order, to the first seven fields of a result line.
    _FIELDS = (
        'university', 'student_region', 'subject', 'year', 'batch',
        'average_score', 'province_control_score',
    )
    # Keys whose values are converted from text to ``int`` / ``None``.
    _SCORE_FIELDS = ('average_score', 'province_control_score')

    def __init__(self, proxy=None):
        """Open a browser session and load the province-score query page.

        :param proxy: passed through unchanged to ``AutoBrowser``
        """
        # The region picker groups provinces under three tabs; these lists
        # decide which tab is clicked before the province link itself.
        self.first_region_set = ['安徽', '北京', '重庆', '福建', '广东', '广西', '甘肃',
                                 '贵州', '河北', '河南', '湖南', '湖北', '海南', '黑龙江']
        self.second_region_set = ['吉林', '江苏', '江西', '辽宁', '内蒙古', '宁夏', '青海']
        self.third_region_set = ['上海', '四川', '山西', '山东', '陕西', '天津', '新疆',
                                 '西藏', '云南', '浙江']
        self.college = ''
        self.region = ''
        self.last_url = ''
        self.current_url = ''
        self.result = []
        self.no_result = False
        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf(
            'http://gkcx.eol.cn/soudaxue/queryProvinceScore.html',
            ready_check=(By.LINK_TEXT, '末页'),
        )

    def select_region(self, region):
        """Choose the students' home province.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time(
            '.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',
            click=True)
        tabs = (
            (self.first_region_set, 'div.tabs_10:nth-child(1)'),
            (self.second_region_set, 'div.tabs_10:nth-child(2)'),
            (self.third_region_set, 'div.tabs_10:nth-child(3)'),
        )
        for regions, tab_selector in tabs:
            if region in regions:
                self.browser.interact_one_time(tab_selector, click=True)
        self.browser.interact_one_time(
            location=self.browser.locate(link_text=region), click=True)

    def select_subject(self, subject='文科'):
        """Choose the subject track (liberal arts or science).

        :param str subject: link text of the subject track to click
        :return: None
        """
        self.browser.interact_one_time(
            '.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)',
            click=True)
        self.browser.interact_one_time(
            location=self.browser.locate(link_text=subject), click=True)

    def set_college(self, college='复旦大学'):
        """Type the college name into the search box and remember it.

        :param str college: college name
        :return: None
        """
        self.college = college
        self.browser.interact_one_time('#provinceScoreKEY', send_text=college)

    def do_search(self):
        """Submit the query and record whether it produced results.

        When the ``noResultMessage`` element has non-empty text,
        ``self.no_result`` is set; otherwise the method waits for the
        "next page" link and stores the current URL.

        :return: None
        :raises TimeoutError: if results exist but the pager is not ready
        """
        self.browser.interact_one_time(
            '#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)',
            click=True)
        # NOTE(review): find_element_by_id is the legacy Selenium spelling --
        # confirm the installed version still provides it.
        message = self.browser.browser.find_element_by_id('noResultMessage').text
        if message != '':
            self.no_result = True
        elif self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
            self.current_url = self.browser.browser.current_url
        else:
            raise TimeoutError
        # Fixed pause after the search; presumably lets the table finish
        # rendering -- TODO confirm it is still needed.
        time.sleep(5)

    def clear(self):
        """Discard all collected page texts.

        :return: None
        """
        self.result = []

    def _collect_page_and_advance(self, settle_seconds=0):
        """Store the current page text, click "next page" and refresh the URL.

        ``self.current_url`` is only updated when the pager becomes ready, so
        an unchanged URL afterwards signals that no further page was reached.
        """
        self.result.append(
            self.browser.get_text(location='#queryschoolad', beautiful=False))
        self.last_url = self.current_url
        self.browser.interact_one_time(
            location=self.browser.locate(link_text='下一页'), click=True)
        if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
            if settle_seconds:
                time.sleep(settle_seconds)
            self.current_url = self.browser.browser.current_url

    def get_result_and_more(self):
        """Append the text of every result page to ``self.result``.

        If the preceding search reported no results, the flag is reset and
        nothing is collected.  Otherwise pages are collected until clicking
        "next page" no longer changes the URL.

        :return: None
        """
        if self.no_result:
            self.no_result = False
            return None
        # Only the first page change is followed by the extra 5 s pause.
        self._collect_page_and_advance(settle_seconds=5)
        while self.last_url != self.current_url:
            self._collect_page_and_advance()
        return None

    @staticmethod
    def _parse_score(text):
        """Convert a score cell to ``int``; the placeholder ``--`` gives ``None``."""
        if text == '--':
            return None
        return int(float(text))

    @property
    def colleges(self):
        """Parse the collected page texts into row dictionaries.

        The first line of each collected text is skipped (presumably the table
        header); every other line is split on whitespace and its first seven
        fields are zipped with the column names.  Lines with fewer than seven
        fields -- e.g. the empty string left by a trailing newline -- are
        skipped instead of raising ``KeyError``.  The two score columns are
        converted to ``int``, or ``None`` when the cell is ``--``.

        :return: list of row dictionaries
        :rtype: list
        :raises ValueError: if a score cell is neither ``--`` nor numeric
        """
        width = len(self._FIELDS)
        rows = []
        for page_text in self.result:
            for line in page_text.split('\n')[1:]:
                fields = re.split(r'\s+', line)[:width]
                if len(fields) < width:
                    # Blank or truncated line: no complete record to parse.
                    continue
                row = dict(zip(self._FIELDS, fields))
                for key in self._SCORE_FIELDS:
                    row[key] = self._parse_score(row[key])
                rows.append(row)
        return rows

    def close(self):
        """Quit the browser.

        :return: None
        """
        self.browser.quit()