# coding=UTF-8
import re
import time

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

from libs.network.class_autobrowser import AutoBrowser


class CEESpiderSina:
    """
    CEESpiderSina (College Entrance Examination) scrapes college entrance
    exam score data; the data source is Sina Gaokao.
    """

    def __init__(self, proxy=None):
        self.region = ''
        self.result = []
        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf('http://kaoshi.edu.sina.com.cn/college/collegeAvgScoreRank?syear=2013&provid=1',
                          ready_check=(By.CLASS_NAME, 'pageNumWrap'))

    def select_region(self, region):
        """
        Select the province.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('#provSel', select_text=region)

    def select_subject(self, subject='文科'):
        """
        Select the subject track.

        :param str subject: subject, '文科' (liberal arts) or '理科' (science)
        :return: None
        """
        self.browser.interact_one_time('#typeSel', select_text=subject)

    def select_year(self, year='2014'):
        """
        Select the year.

        :param str year: year
        :return: None
        """
        self.browser.interact_one_time('#sYear', select_text=year)

    def select_batch(self, batch='本科一批'):
        """
        Select the admission batch.

        :param str batch: batch name
        :return: None
        """
        self.browser.interact_one_time('#sBatch', select_text=batch)

    def do_search(self):
        """
        Start the search.

        :return: None
        """
        self.browser.interact_one_time('#searchBtn', click=True)
        if self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
            self.current_url = self.browser.browser.current_url
        else:
            raise TimeoutError
        time.sleep(5)

    def clear(self):
        """
        Clear the results.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """
        Append the results of every page to self.result.

        :return: None
        """
        is_next = True
        self.result.append(self.browser.get_text(location='#scoreTable2', beautiful=False))
        while is_next:
            try:
                self.browser.browser.find_element_by_css_selector('.pageNumWrap > [node-type="next"]')
                self.browser.interact_one_time('.pageNumWrap > [node-type="next"]', click=True)
                time.sleep(2)
                if not self.browser.is_ready(locator=(By.CLASS_NAME, 'pageNumWrap')):
                    raise TimeoutError
                self.result.append(self.browser.get_text(location='#scoreTable2', beautiful=False))
            except NoSuchElementException:
                break

    @property
    def colleges(self):
        """
        Return the scraping results.

        :return: list of results
        :rtype: list
        """
        vars = ['university', 'type', 'university_region', 'average_score',
                'subject', 'year', 'batch', 'student_region']
        colleges = []
        for cstr in self.result:
            for item in re.split('\n', cstr):
                new_item = re.split('\s+', item)[1:8]
                new_item.append(self.region)
                colleges.append(dict(zip(vars, new_item)))
        colleges = [item for item in colleges if len(item) > 7]
        '''
        for item in colleges:
            if re.match('^--$', item['average_score']) is not None:
                item['average_score'] = None
            else:
                item['average_score'] = int(float(item['average_score']))
            if re.match('^--$', item['province_control_score']) is not None:
                item['province_control_score'] = None
            else:
                item['province_control_score'] = int(float(item['province_control_score']))
        '''
        return colleges

    def close(self):
        """
        Close the browser.

        :return: None
        """
        self.browser.quit()
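# A minimal usage sketch for CEESpiderSina (not part of the original module).
# It assumes the spider runs without a proxy and that the option texts
# ('北京', '文科', '2014', '本科一批') exactly match the drop-down entries on the
# Sina page; adjust them as needed.
if __name__ == '__main__':
    spider = CEESpiderSina(proxy=None)
    spider.select_region('北京')        # province whose admission scores are queried
    spider.select_subject('文科')       # '文科' (liberal arts) or '理科' (science)
    spider.select_year('2014')
    spider.select_batch('本科一批')
    spider.do_search()                  # raises TimeoutError if the result table never loads
    spider.get_result_and_more()        # walks through every result page
    for record in spider.colleges:      # dicts keyed by 'university', 'average_score', ...
        print(record)
    spider.close()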
# coding=UTF-8
import random
import time

from libs.network.class_autobrowser import AutoBrowser

proxy_list = ['58.22.86.44:8000']
proxy_checked_list = ['58.20.234.243:8000', '58.20.242.85:8000',
                      '110.52.232.56:8000', '110.52.232.56:80',
                      '58.20.232.239:8000', '58.246.242.154:8080',
                      '58.20.232.239:8000', '110.52.232.75:8000',
                      '60.13.74.184:81', '110.52.232.60:8000',
                      '58.247.30.222:8080', '58.22.86.44:8000']

browser = AutoBrowser(proxy=proxy_list[random.randint(0, len(proxy_list) - 1)])
browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html')
browser.interact_one_time('.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)
browser.interact_one_time('div.tabs_10:nth-child(3)', click=True)
browser.interact_one_time(location=browser.locate(link_text='西藏'), click=True)
browser.interact_one_time('.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)
browser.interact_one_time(location=browser.locate(link_text='文科'), click=True)
browser.interact_one_time('#provinceScoreKEY', send_text='复旦大学')
browser.interact_one_time('#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)', click=True)
time.sleep(5)
print(browser.browser.find_element_by_css_selector('#queryschoolad').text)
u1 = browser.browser.current_url
browser.interact_one_time(location=browser.locate(link_text='下一页'), click=True)
# coding=UTF-8
import json
import pickle
import re
import time
from collections import OrderedDict

from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

from libs.network.class_autobrowser import AutoBrowser


class Cnki:
    """
    The Cnki class connects to the CNKI database.
    """

    def __init__(self, proxy=None):
        self.soups = list()
        self.more = True
        self.browser = AutoBrowser(proxy=proxy)
        self.browser.surf('http://epub.cnki.net/kns/brief/result.aspx?dbprefix=CJFQ',
                          ready_check=(By.CSS_SELECTOR, '#bottom'))
        time.sleep(2)

    def submit(self):
        """
        Submit the query and run the search.

        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id="btnSearch"), click=True)
        time.sleep(5)

    def sort(self, by='被引'):
        """
        Sort the results by the given column.

        :param str by: column to sort by
        :return: None
        """
        self.browser.switch(iframe='iframeResult')
        self.browser.interact_one_time(location=self.browser.locate(link_text=by), click=True)
        time.sleep(6)

    def select_all_literature(self):
        """
        Clear the previous selection and select all records on the current page.
        """
        self.browser.interact_one_time(location=self.browser.locate(link_text='清除'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location=self.browser.locate(id='selectCheckbox'), click=True)
        time.sleep(2)
        self.browser.interact_one_time(location='.SavePoint > a:nth-child(3)', click=True)

    def get_more(self, limit=4):
        """
        Fetch the following result pages.

        :param limit: maximum number of pages to fetch
        :return: None
        """
        i = 1
        while self.more:
            if i >= limit:
                self.more = False
            try:
                self.browser.switch(iframe='iframeResult')
                self.browser.interact_one_time(location=self.browser.locate(id='Page_next'), click=True)
                time.sleep(3)
            except NoSuchElementException:
                self.more = False
            else:
                time.sleep(5)
                self.select_all_literature()
                self.child_operation()
                i += 1
            time.sleep(2)

    def child_operation(self):
        """
        Operate on the child page and append the literature information to self.soups.

        :return: None
        """
        self.browser.interact_one_time(location='.GTContentTitle > td:nth-child(1) > input:nth-child(1)', click=True)
        self.browser.interact_one_time(location='#file_export > input:nth-child(1)', click=True)
        time.sleep(5)
        self.browser.interact_one_time(location=self.browser.locate(link_text='NoteExpress'), click=True)
        time.sleep(5)
        self.soups.append(BeautifulSoup(self.browser.browser.find_element_by_css_selector('.mainTable').text, "lxml"))
        time.sleep(5)
        self.browser.switch_to_parent(close=True)
        self.browser.switch_to_parent(close=True)
        time.sleep(5)

    def set_query(self, query_str=None):
        """
        Set the expert-mode query string.

        :param str query_str: query string
        :return: None
        """
        self.browser.interact_one_time(self.browser.locate(id='1_4'), click=True)
        time.sleep(2)
        self.browser.interact_one_time('#expertvalue', send_text=query_str)
        time.sleep(1)

    def set_period(self, start_period=None, end_period=None):
        """
        Set the start and end years.

        :param str start_period: start year
        :param str end_period: end year
        :return: None
        """
        if start_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_from'), select_text=start_period)
        if end_period is not None:
            self.browser.interact_one_time(location=self.browser.locate(id='year_to'), select_text=end_period)
        time.sleep(1)

    def set_subject(self, subjects=None):
        """
        Select subject areas.

        :param list subjects: list of subject name strings
        :return: None
        """
        self.browser.interact_one_time(location='input.btn:nth-child(1)', click=True)
        for subject in subjects:
            self.browser.interact_one_time(
                location=self.browser.locate(xpath=''.join(["//input[@name='", subject, "']"])), click=True)
        time.sleep(1)

    def export_to_pickle(self, file=r'E:\gitrobot\files\literature\literature_list.pkl'):
        """
        Export the collected literature soups to a pickle file.

        :param str file: file name
        :return: None
        """
        F = open(file, 'wb')
        pickle.dump(self.soups, F)
        F.close()

    def export_to_dict(self):
        """
        Parse the collected NoteExpress exports into an OrderedDict keyed by title.
        """
        literature = OrderedDict()
        for llist in self.soups:
            content = str(llist.find_all('p'))
            content = re.split('</p>\]', re.split('\[<p>', content)[1])[0]
            items = re.split('\n', content)
            one_literature = dict()
            for item in items:
                if '{Title}' in item:
                    title = re.sub('\s+', '', re.split('}: ', item)[1])
                if '{Author}' in item:
                    one_literature['author'] = [re.sub('\s+', '', author)
                                                for author in re.split('\{Author\}\: ', item) if len(author) > 0]
                if '{Author Address}' in item:
                    one_literature['address'] = [re.sub('\s+', '', address)
                                                 for address in re.split(';', re.split('\}\: ', item)[1])
                                                 if len(address) > 0]
                if '{Journal}' in item:
                    one_literature['journal'] = re.sub('\s+', '', re.split('\}\: ', item)[1])
                if '{Year}' in item:
                    one_literature['year'] = re.sub('\s+', '', re.split('\}\: ', item)[1])
                if '{Issue}' in item:
                    one_literature['issure'] = re.sub('\s+', '', re.split('\}\: ', item)[1])
                if '{Pages}' in item:
                    one_literature['pages'] = re.sub('\s+', '', re.split('\}\: ', item)[1])
                if '{Keywords}' in item:
                    one_literature['keyword'] = [re.sub('\s+', '', keyword)
                                                 for keyword in re.split(';', re.split('\}\: ', item)[1])
                                                 if len(keyword) > 0]
                if '{Abstract}' in item:
                    one_literature['abstract'] = re.sub('\s+', '', re.split('\}\: ', item)[1])
                if '{ISBN/ISSN}' in item:
                    one_literature['ISBN/ISSN'] = re.sub('\s+', '', re.split('\}\: ', item)[1])
                if '{Database Provider}' in item:
                    literature[title] = one_literature
                    one_literature = dict()
        return literature

    def export_to_json(self, file=r'E:\gitrobot\files\literature\literature_list.txt'):
        """
        Dump the parsed literature dict to a JSON file.
        """
        json.dump(self.export_to_dict(), fp=open(file, 'w'))

    def close(self):
        """
        Close the browser.

        :return: None
        """
        self.browser.quit()
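# A minimal usage sketch for Cnki (not part of the original module). The query
# string '经济学' and the period 2010-2015 are placeholder values, and the call
# order is an assumption about the intended workflow; set_subject() is omitted
# because the checkbox name attributes on the CNKI page are not documented here.
if __name__ == '__main__':
    cnki = Cnki(proxy=None)
    cnki.set_query('经济学')              # expert-mode query string (placeholder)
    cnki.set_period('2010', '2015')       # restrict publication years
    cnki.submit()                         # run the search
    cnki.sort(by='被引')                  # sort by citation count
    cnki.select_all_literature()          # select the records on the first page
    cnki.child_operation()                # export them via NoteExpress and parse the page
    cnki.get_more(limit=4)                # repeat for up to `limit` further pages
    cnki.export_to_json(file=r'E:\gitrobot\files\literature\literature_list.txt')
    cnki.close()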
'''
mongo = MongoDB()
mongo.connect('publication', 'ChineseJournal')
'''

'''
literatures = json.load(open(r'E:\gitrobot\files\literature\journals_cssci.txt'))
for l in literatures:
    print(l)
    #mongo.collection.insert_one(l)
print(len(literatures))
'''

proxy_list = ['101.26.38.162:82']
proxy_list = ['111.56.13.152:80', '101.26.38.162:80', '101.26.38.162:82', '111.56.13.150:80',
              '60.191.157.155:3128', '60.191.175.54:3128', '60.191.167.93:3128', '61.163.32.6:3128',
              '49.1.244.139:3128', '112.16.76.188:8080', '60.191.163.147:3128', '60.194.100.51:80',
              '101.226.12.223:80', '82.200.81.233:80', '85.143.24.70:80', '59.58.162.141:888',
              '110.18.241.9:3128', '60.15.41.214:3128', '61.7.149.69:8080', '61.184.199.203:3128',
              '86.100.118.44:81', '61.150.89.67:3128', '61.162.223.41:9797', '95.168.217.24:3128',
              '86.100.118.44:80', '31.173.74.73:8080', '58.248.137.228:80', '79.120.72.222:3128',
              '46.218.85.101:3129', '106.56.225.200:3128', '60.15.55.228:3128', '60.13.74.184:81',
              '101.200.234.114:8080', '104.238.83.28:443', '91.183.124.41:80', '60.191.164.22:3128',
              '62.204.241.146:8000', '60.191.174.227:3128', '60.191.153.12:3128', '61.53.65.52:3128',
              '36.250.69.4:80', '61.153.198.178:3128', '60.191.153.75:3128', '60.191.178.43:3128',
              '60.13.74.184:82', '60.13.74.184:80', '60.191.161.244:3128', '60.191.170.122:3128',
              '60.191.167.11:3128', '61.175.220.4:3128', '61.164.92.254:9999', '61.75.2.124:3128',
              '27.122.12.45:3128', '64.62.233.67:80', '113.140.43.51:3128', '60.191.166.130:3128',
              '113.107.57.76:8101', '113.107.57.76:80', '60.191.160.20:3128', '61.134.34.148:3128',
              '93.51.247.104:80', '60.191.164.59:3128', '91.142.84.182:3128', '72.252.11.91:8080',
              '59.44.244.14:9797', '58.18.50.10:3128', '58.96.187.208:3128', '85.194.75.18:8080',
              '113.105.80.61:3128', '58.59.141.187:3128', '61.163.45.240:3128', '91.108.131.250:8080',
              '110.17.172.150:3128']

#browser = AutoBrowser(proxy=proxy_list[random.randint(0,len(proxy_list)-1)])
#browser = AutoBrowser(proxy='101.26.38.162:82')
browser = AutoBrowser()
browser.surf('http://navi.cnki.net/knavi/journal/Detailq/CJFD/JJYJ?Year=&Issue=&Entry=',
             ready_check=(By.CSS_SELECTOR, '#bottom'))
result = []
# NOTE: mongo is used below, but its initialisation above is commented out.
for item in mongo.collection.find({'ISSN': None}):
    print(item['中文名称'])
    browser.interact_one_time(location=browser.locate(id='navi-search-value'), send_text=item['中文名称'])
    browser.interact_one_time(location=browser.locate(id='navi-search-button'), click=True)
    time.sleep(2)
    browser.interact_one_time(location=browser.locate(
        css_selector=''.join(['a[title="', item['中文名称'], '"]'])), click=True)
    time.sleep(2)
    data = BeautifulSoup(browser.browser.find_element_by_css_selector('.list01').text, "lxml")
    ISSN = re.search('\d{4}-\d{3}[0-9a-zA-Z]', str(data)).group()
    print(ISSN)
# coding=UTF-8
import re
import time

from selenium.webdriver.common.by import By

from libs.network.class_autobrowser import AutoBrowser


class CEESpider:
    """
    CEESpider (College Entrance Examination) scrapes college entrance exam
    score data from gkcx.eol.cn.
    """

    def __init__(self, proxy=None):
        self.first_region_set = ['安徽', '北京', '重庆', '福建', '广东', '广西', '甘肃',
                                 '贵州', '河北', '河南', '湖南', '湖北', '海南', '黑龙江']
        self.second_region_set = ['吉林', '江苏', '江西', '辽宁', '内蒙古', '宁夏', '青海']
        self.third_region_set = ['上海', '四川', '山西', '山东', '陕西', '天津', '新疆',
                                 '西藏', '云南', '浙江']
        self.college = ''
        self.region = ''
        self.last_url = ''
        self.current_url = ''
        self.result = []
        self.no_result = False
        self.browser = AutoBrowser(proxy=proxy, timeout=20)
        self.browser.surf('http://gkcx.eol.cn/soudaxue/queryProvinceScore.html',
                          ready_check=(By.LINK_TEXT, '末页'))

    def select_region(self, region):
        """
        Select the province.

        :param str region: province name
        :return: None
        """
        self.region = region
        self.browser.interact_one_time('.gaoxiaoshengyuandi_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)
        if region in self.first_region_set:
            self.browser.interact_one_time('div.tabs_10:nth-child(1)', click=True)
        if region in self.second_region_set:
            self.browser.interact_one_time('div.tabs_10:nth-child(2)', click=True)
        if region in self.third_region_set:
            self.browser.interact_one_time('div.tabs_10:nth-child(3)', click=True)
        self.browser.interact_one_time(location=self.browser.locate(link_text=region), click=True)

    def select_subject(self, subject='文科'):
        """
        Select the subject track.

        :param str subject: subject, '文科' (liberal arts) or '理科' (science)
        :return: None
        """
        self.browser.interact_one_time('.getFstypegaoxiaogesheng_s > span:nth-child(2) > a:nth-child(1) > img:nth-child(1)', click=True)
        self.browser.interact_one_time(location=self.browser.locate(link_text=subject), click=True)

    def set_college(self, college='复旦大学'):
        """
        Set the target college.

        :param str college: college name
        :return: None
        """
        self.college = college
        self.browser.interact_one_time('#provinceScoreKEY', send_text=college)

    def do_search(self):
        """
        Start the search.

        :return: None
        """
        self.browser.interact_one_time('#dxlqx > form:nth-child(1) > div:nth-child(2) > input:nth-child(1)', click=True)
        if self.browser.browser.find_element_by_id('noResultMessage').text == '':
            if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
                self.current_url = self.browser.browser.current_url
            else:
                raise TimeoutError
        else:
            self.no_result = True
        time.sleep(5)

    def clear(self):
        """
        Clear the results.

        :return: None
        """
        self.result = []

    def get_result_and_more(self):
        """
        Append the results of every page to self.result.

        :return: None
        """
        if self.no_result:
            self.no_result = False
            return None
        self.result.append(self.browser.get_text(location='#queryschoolad', beautiful=False))
        self.last_url = self.current_url
        self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'), click=True)
        if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
            time.sleep(5)
            self.current_url = self.browser.browser.current_url
        while self.last_url != self.current_url:
            self.result.append(self.browser.get_text(location='#queryschoolad', beautiful=False))
            self.last_url = self.current_url
            self.browser.interact_one_time(location=self.browser.locate(link_text='下一页'), click=True)
            if self.browser.is_ready(locator=(By.LINK_TEXT, '下一页')):
                self.current_url = self.browser.browser.current_url

    @property
    def colleges(self):
        """
        Return the scraping results.

        :return: list of results
        :rtype: list
        """
        vars = ['university', 'student_region', 'subject', 'year', 'batch',
                'average_score', 'province_control_score']
        colleges = []
        for cstr in self.result:
            for item in re.split('\n', cstr)[1:]:
                colleges.append(dict(zip(vars, re.split('\s+', item)[0:8])))
        for item in colleges:
            if re.match('^--$', item['average_score']) is not None:
                item['average_score'] = None
            else:
                item['average_score'] = int(float(item['average_score']))
            if re.match('^--$', item['province_control_score']) is not None:
                item['province_control_score'] = None
            else:
                item['province_control_score'] = int(float(item['province_control_score']))
        return colleges

    def close(self):
        """
        Close the browser.

        :return: None
        """
        self.browser.quit()
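# A minimal usage sketch for CEESpider (not part of the original module). The
# province, subject and college names are placeholders and must match the link
# texts used on gkcx.eol.cn.
if __name__ == '__main__':
    spider = CEESpider(proxy=None)
    spider.select_region('上海')          # student province, picked from one of the three tab groups
    spider.select_subject('理科')         # '文科' or '理科'
    spider.set_college('复旦大学')        # target college
    spider.do_search()                    # sets self.no_result if the query returns nothing
    spider.get_result_and_more()          # pages through the results via the '下一页' link
    for record in spider.colleges:        # dicts with 'average_score', 'province_control_score', ...
        print(record)
    spider.close()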