def get_hit_rescruit():
    """Crawl HIT (Harbin Institute of Technology) career-fair data month by month.

    Posts one JSON body per month to the getZczphData endpoint, starting at
    2016-09 (when the current HIT job site went live), and feeds each
    response to parse_hit_info for storage in redis.
    """
    base_url = "http://job.hit.edu.cn/index/getZczphData"
    host = "job.hit.edu.cn"
    header = util.get_header(host)
    header['referer'] = "http://job.hit.edu.cn/info?dj=MQ--"
    header['accept'] = "*/*"
    header['X-Requested-With'] = "XMLHttpRequest"
    req = requests.Session()
    # Browser-captured session cookie; the warm-up GET below primes the
    # session on the server side.
    header['cookie'] = "JSESSIONID=A36AAA74D82B3F39C3FD2455853EC081"
    req.get("http://job.hit.edu.cn/info?dj=MQ--")
    store = jedis.jedis()
    store.connect_redis()
    # Walk forward from 2016-09, one request per month (14 months total).
    for offset in range(0, 14):
        extra_years, month0 = divmod(8 + offset, 12)  # 8 == September, zero-based
        date = datetime.date(2016 + extra_years, month0 + 1, 1)
        params = json.dumps({'Month': util.get_month(date)})
        print(params)
        res = req.post(headers=header, url=base_url, data=params)
        parse_hit_info(res.content.decode("utf-8"), store)
def get_scut_recuit():
    """Crawl SCUT (South China University of Technology) recruitment pages.

    Posts the listing form for pages 1-60 and hands each UTF-8 body to
    parse_info; stops early on the first request/parsing failure.
    """
    print("开始获取华南理工大学数据=====================")
    url = "http://jyzx.6ihnep7.cas.scut.edu.cn/jyzx/xs/zpxx/xyxj/"
    session = requests.Session()
    headers = util.get_header(host='jyzx.6ihnep7.cas.scut.edu.cn')
    store = jedis.jedis()
    store.clear_list(table_name)
    for page in range(1, 61):
        # NB: the server expects both 'pageNo' (fixed) and 'pageNO' (the
        # page index) — key spelling reproduced exactly.
        form = {
            'pageNo': '60',
            'daoxv1': '0',
            'entName': '',
            'time': '-1',
            'pageNO': str(page)
        }
        try:
            body = session.post(url=url, headers=headers, data=form).content.decode('utf-8')
            parse_info(store, body)
        except BaseException as err:
            util.format_err(err)
            break
    store.add_to_file(table_name)
    store.add_university(table_name)
    print("获取华南理工大学数据完成=======================")
def get_zzu_recruit():
    """Crawl the ZZU (Zhengzhou University) job-fair calendar.

    Iterates 72 months, newest first, from December 2017 back to
    January 2012, posting one calendar query per month.
    """
    url = "http://job.zzu.edu.cn:9009/service/business/college/jobfair/jobFairInfo/getCalendarInfo.xf"
    session = requests.Session()
    headers = util.get_header('job.zzu.edu.cn:9009')
    headers['referer'] = 'http://job.zzu.edu.cn/p/page/jobCalendar.html?channel_code=XJH&type=0'
    store = jedis.jedis()
    store.clear_list(table_name)
    # 72 months == 6 years; index 71 -> 2017-12, index 0 -> 2012-01.
    for idx in range(71, -1, -1):
        payload = {
            'remark': '0',
            'year': str(2012 + idx // 12),
            'month': str(idx % 12 + 1)
        }
        print(payload)
        response = session.post(url=url, headers=headers, data=payload)
        parse_info(response.content.decode('utf-8'), store)
    store.add_to_file(table_name)
    store.add_university(table_name)
def get_ouc_recruit():
    """Crawl the OUC (Ocean University of China) campus-recruitment listing.

    Fetches the GBK-encoded index page to learn the total entry count
    (20 entries per page), then walks every page — including the index
    page again as page 1 — through parse_info.
    """
    print("开始获取中国海洋大学数据=====================")
    first_page = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
    headers = util.get_header('career.ouc.edu.cn')
    session = requests.Session()
    html = session.get(url=first_page, headers=headers).content.decode('gbk')
    store = jedis.jedis()
    store.clear_list(table_name)
    soup = BeautifulSoup(html, 'html5lib')
    # The page embeds the total record count; the module-level `pattern`
    # captures it, and the first 14 characters of the match are a prefix.
    total = int(re.findall(pattern, str(soup))[0][14:])
    pages = total // 20 + 1
    for page in range(1, pages + 1):
        try:
            if page == 1:
                page_url = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
            else:
                page_url = "http://career.ouc.edu.cn/html/zp_info/campus/" + str(page) + ".html"
            body = session.get(url=page_url, headers=headers).content.decode('gbk')
            parse_info(body, store)
        except BaseException as err:
            util.format_err(err)
    store.add_university(table_name)
    store.add_to_file(table_name)
    print("获取中国海洋大学数据完成=====================")
def get_gzu_recruit():
    """Crawl Guizhou University career-fair listing pages into redis.

    Page 1 is fetched first to discover the total page count, then every
    page (page 1 included, again) is parsed via parse_info.
    """
    base_url = 'http://jobs.gzu.edu.cn/gzujobs/client/recruitment/meet?page='
    host = 'jobs.gzu.edu.cn'
    session = requests.Session()
    first = session.get(url=base_url + str(1), headers=util.get_header(host)).content.decode('utf-8')
    page_num = get_page_num(first)
    store = jedis.jedis()
    store.clear_list(table_name)
    for page in range(1, page_num + 1):
        page_url = base_url + str(page)
        print(page_url)
        html = session.get(
            url=page_url, headers=util.get_header(host)).content.decode('utf-8')
        parse_info(html, store, page)
    store.add_university(table_name)
    store.add_to_file(table_name)
def get_sufe_recruit():
    """Pull SUFE recruitment & preaching listings in a single oversized page.

    eachPageRows=600 is set above the total row count so one request
    returns everything; the decoded payload goes straight to parse_info.
    """
    # NOTE(review): the host string carries a trailing slash — works only if
    # util.get_header tolerates it; confirm.
    host = "career.sufe.edu.cn/"
    headers = util.get_header(host)
    store = jedis.jedis()
    store.clear_list(table_name)
    url = "http://careersys.sufe.edu.cn/pros_jiuye/s/zxh/owebsiteData/recruitmentAndPreaching?callback=&type=list&eachPageRows=600¤tPageno=1&_="
    session = requests.Session()
    payload = session.get(headers=headers, url=url).content.decode("utf-8")
    parse_info(payload, store)
def get_top_500(base_url, page_num, company_type):
    """Scrape Fortune China top-500 listing pages.

    Fetches base_url + i for i in [1, page_num) and hands each page to
    parse_top500 tagged with company_type.
    NOTE(review): the upper bound is exclusive, so page `page_num` itself
    is never fetched — confirm callers pass count + 1.
    """
    header = util.get_header("www.fortunechina.com")
    session = requests.Session()
    store = jedis.jedis()
    store.connect_redis()
    for page in range(1, page_num):
        print(page)
        html = session.get(headers=header, url=base_url + str(page)).content.decode("utf-8")
        parse_top500(html, store, company_type)
def get_nju_rescruit():
    """Crawl NJU on-campus job-fair listing pages (117 pages) into redis."""
    base_url = "http://job.nju.edu.cn/login/nju/home.jsp?type=zph&DZPHBH=&sfss=sfss&zphzt=&jbksrq=&jbjsrq=&sfgq=&pageSearch=2&pageNow="
    session = requests.Session()
    header = util.get_header("job.nju.edu.cn")
    store = jedis.jedis()
    store.connect_redis()
    for page in range(1, 118):
        print(page)
        html = session.get(headers=header, url=base_url + str(page)).content.decode("utf-8")
        parse_nju_info(html, store)
    store.add_university("nju_company_info")
    print("finish")
def get_fdu_rescruit():
    """Fetch all Fudan career-talk rows in a single request.

    count=3000 is set at/above the total row count so one page returns
    everything; a valid JSESSIONID cookie is required.
    """
    headers = util.get_header("www.career.fudan.edu.cn")
    # Session cookie captured from a logged-in browser visit.
    headers['cookie'] = 'JSESSIONID=0000qZlE0QPPNarjW8SKyrjJPEW:19b14rm85'
    url = "http://www.career.fudan.edu.cn/jsp/career_talk_list.jsp?count=3000&list=true"
    session = requests.Session()
    store = jedis.jedis()
    store.connect_redis()
    body = session.get(headers=headers, url=url).content.decode("utf-8")
    parse_info(body, store)
    store.add_university(table_name)
    store.add_to_file(table_name)
def get_lzu_rescruit():
    """Crawl Lanzhou University article-list pages (a fixed 50 pages)."""
    base_url = "http://job.lzu.edu.cn/htmlfile/article/list/119/list_"
    url_tail = ".shtml"
    header = util.get_header("job.lzu.edu.cn")
    session = requests.Session()
    store = jedis.jedis()
    store.connect_redis()
    for page in range(1, 51):
        page_url = base_url + str(page) + url_tail
        html = session.get(headers=header, url=page_url).content.decode("utf-8")
        parse_html(html, store)
        print(page)
    store.add_university("lzu_company_info")
    print("finish")
def get_cufe_rescruit():
    """Crawl CUFE recruitment detail pages /15/<i>/2.html for i in 1..421.

    NOTE(review): the loop bound is exclusive, so page 422 is never fetched
    even though max_page_num is 422 — confirm whether that is intended.
    """
    base_url = "http://scc.cufe.edu.cn/recruitment-datas/15/"
    url_tail = "/2.html"
    session = requests.Session()
    header = util.get_header("scc.cufe.edu.cn")
    store = jedis.jedis()
    store.clear_list(table_name)
    max_page_num = 422
    for page in range(1, max_page_num):
        print(page)
        page_url = base_url + str(page) + url_tail
        html = session.get(headers=header, url=page_url).content.decode("utf-8")
        parse_info(html, store)
    store.add_university(table_name)
    store.add_to_file(table_name)
def get_ncepu_recruit():
    """Crawl NCEPU (North China Electric Power University) recruitment data.

    Fetches career-talk (宣讲会) listing pages 1-33, parses each into
    redis under ncepu_company_info, then pulls the two-way-selection
    (双选会) listing via get_double_choose and persists the table.
    """
    table_name = "ncepu_company_info"
    base_url = "http://job.ncepu.edu.cn/teachin/index?domain=ncepu&page="
    req = requests.Session()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    host = "job.ncepu.edu.cn"
    header = util.get_header(host)
    # Career-talk (宣讲会) pages.
    for i in range(1, 34):
        res = req.get(headers=header, url=base_url + str(i))
        html = res.content.decode("utf-8")
        parse_info(html, redis, table_name)
    # BUG FIX: previously passed the global `re` (the regex module) instead
    # of the jedis connection — get_double_choose needs the redis store,
    # matching how every sibling crawler threads its store through helpers.
    get_double_choose(req, header, redis)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_nju_rescruit():
    """Crawl NJU (port 9081) job-fair pages 1-119 plus the zph listing."""
    print("NJU Begin===================================================")
    base_url = "http://job.nju.edu.cn:9081/login/nju/home.jsp?type=zph&DZPHBH=&sfss=sfss&zphzt=&jbksrq=&jbjsrq=&sfgq=&pageSearch=2&pageNow="
    session = requests.Session()
    header = util.get_header("job.nju.edu.cn")
    store = jedis.jedis()
    store.connect_redis()
    store.clear_list(table_name)
    for page in range(1, 120):
        print(page)
        html = session.get(headers=header, url=base_url + str(page)).content.decode("utf-8")
        parse_nju_info(html, store)
    # Dedicated job-fair (zph) feed, parsed with the same store.
    get_zph_info(session, header, store)
    store.add_university(table_name)
    store.add_to_file(table_name)
    print("NJU finish ===================================================")
def get_scu_recruit(self):
    """Crawl SCU career-talk listings.

    The search endpoint yields the first page; the shared pager endpoint
    serves the remaining pages via self.get_rescruit. Results land under
    scu_company_info and are then registered and dumped to file.
    """
    first_url = "http://jy.scu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx"
    base_url = "http://jy.scu.edu.cn/eweb/wfc/app/pager.so?type=goPager&requestPager=pager&pageMethod=next¤tPage="
    session = requests.Session()
    scu_header = util.get_header('jy.scu.edu.cn')
    first_page = session.get(headers=scu_header, url=first_url).content.decode("utf-8")
    table_name = "scu_company_info"
    page_num = 224              # hard-coded total page count
    index_begin, index_end = 8, 28  # slice bounds expected by the parser
    self.parse_info(first_page, table_name, index_begin, index_end, 2)
    self.get_rescruit(base_url, session, scu_header, table_name, page_num,
                      index_begin, index_end, 2)
    self.re.add_university(table_name)
    self.re.add_to_file(table_name)
def get_xju_recruitment():
    """Crawl Xinjiang University recruitment pages, newest first.

    The landing page (zpxx.htm) reveals the page count and is itself
    parsed; older pages live at zpxx/<n>.htm and are walked downwards
    from page_num - 1 to 1.
    """
    base_url = 'http://zsjy.xju.edu.cn/zpxx/'
    first_url = 'http://zsjy.xju.edu.cn/zpxx.htm'
    session = requests.Session()
    store = jedis.jedis()
    store.clear_list(table_name)
    headers = util.get_header('zsjy.xju.edu.cn')
    landing = session.get(url=first_url, headers=headers).content.decode('utf-8')
    page_num = get_total_page(landing)
    parse_info(landing, store)
    headers['Referer'] = first_url
    for page in range(page_num - 1, 0, -1):
        page_url = base_url + str(page) + '.htm'
        print(page_url)
        html = session.get(url=page_url, headers=headers).content.decode('utf-8')
        parse_info(html, store)
    store.add_to_file(table_name)
    store.add_university(table_name)
def get_szu_recruit():
    """Crawl Shenzhen University meeting listings across all index pages.

    The first page is fetched to discover the total page count, then every
    index page (page 1 included, again) is parsed via parse_info.
    """
    print("深圳大学开始==================================")
    url = 'http://job.szu.edu.cn/EngageListAllMeeting.aspx?index=1'
    headers = util.get_header('job.szu.edu.cn')
    session = requests.session()
    store = jedis.jedis()
    store.clear_list(table_name)
    first_page = session.get(url=url, headers=headers).content.decode("utf-8")
    base_url = url[0:-1]  # drop the trailing "1" so the index can be appended
    total_num = get_total_page(first_page)
    for page in range(1, total_num + 1):
        page_url = base_url + str(page)
        print(page_url)
        html = session.get(url=page_url, headers=headers).content.decode("utf-8")
        parse_info(html, store)
    store.add_university(table_name)
    store.add_to_file(table_name)
    print("深圳大学结束==================================")
def get_sjtu_rescruit():
    """Crawl SJTU career-talk pages via the shared pager helper.

    The first request discovers the live page count; get_rescruit then
    walks the pager endpoint and stores rows under sjtu_company_info.
    NOTE(review): `redis` is not defined inside this function — it
    presumably refers to a module-level jedis instance; verify it exists
    at import time.
    """
    host = "www.job.sjtu.edu.cn"
    first_url = "http://www.job.sjtu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx&xjhType=yjb"
    base_url = "http://www.job.sjtu.edu.cn/eweb/wfc/app/pager.so?type=goPager&requestPager=pager&pageMethod=next¤tPage="
    header = util.get_header(host)
    req = requests.Session()
    res = req.get(headers=header, url=first_url).content.decode("utf-8")
    table_name = "sjtu_company_info"
    # Page count is read from the live page. (A stale hard-coded 39 was
    # previously assigned here and immediately overwritten — removed.)
    page_num = get_page_num(content=res)
    # 解析数据 — walk every page (slice bounds 14..64, mode 1).
    get_rescruit(base_url, req, header, table_name, page_num, 14, 64, 1)
    # 在大学列表里新增表名
    redis.add_university(table_name)
    # 保存到json文件
    redis.add_to_file(table_name)
def get_ustc_recruit():
    """Crawl USTC dedicated recruitment listings (24 pages) plus the
    communication-meeting feed, storing both under ustc_company_info."""
    # 专场招聘会URL (dedicated recruitment-fair endpoint)
    base_url = "http://www.job.ustc.edu.cn/API/Web/Recruit.ashx?rand=0.10286254897924929&pagesize=20&action=list&keyword=&pageindex="
    session = requests.Session()
    table_name = "ustc_company_info"
    header = util.get_header("www.job.ustc.edu.cn")
    store = jedis.jedis()
    store.connect_redis()
    for page in range(1, 25):
        body = session.get(headers=header, url=base_url + str(page)).content.decode("utf-8")
        parse_info(body, store, table_name)
    get_communicate(session, store, header, table_name)
    store.add_university(table_name)
    store.add_to_file(table_name)
    print("finish")
def get_bnu_recuit():
    """Crawl BNU job-fair query pages 1-81 (JSON responses).

    Each page is requested independently (no session reuse, as in the
    original); failures are logged via util.format_err and the loop
    continues to the next page.
    """
    print("开始获取北京师范大学数据=====================")
    url = "http://career.bnu.edu.cn/front/zp_query/zphQuery.jspa?"
    headers = util.get_header(host="career.bnu.edu.cn")
    store = jedis.jedis()
    store.clear_list(table_name)
    for page in range(1, 82):  # 81 pages in total
        try:
            query = {'paramMap.xxlx': '1', 'page.curPage': '%d' % page}
            data = requests.get(url=url, headers=headers, params=query).json()
            parse_info(data, store)
        except BaseException as err:
            util.format_err(err)
        finally:
            print('获取北京师范大学第 %d 页(共81页)数据完成' % page)
    store.add_university(table_name)  # register the school table
    store.add_to_file(table_name)     # dump the table to file
def get_pku_recruit():
    """Crawl PKU career data: monthly career-talk fairs for 2017 plus the
    bulk two-way-selection fair listing.

    Uses browser-captured cookies; verify=False because the site's TLS
    chain does not validate.
    """
    print("PKU Begin ===================================================")
    base_url = "https://scc.pku.edu.cn/information/base-job-fair!findFairInfoByMonth.action"
    host = "scc.pku.edu.cn"
    headers = util.get_header(host)
    headers['referer'] = "https://scc.pku.edu.cn/timeline?fairDate=2017-11-03%2000:00"
    headers['Cookie'] = "Hm_lvt_f77188aadf0698598108fbf1f0e5df52=1509938240,1510453941; JSESSIONID=A07EA9A7A0B89A27E64ABB70E7D2C5FD; Hm_lpvt_f77188aadf0698598108fbf1f0e5df52=1510454286"
    req = requests.Session()
    re = jedis.jedis()
    re.connect_redis()
    re.clear_list(table_name)
    # Career talks (宣讲会): one POST per month of 2017.
    for month in range(1, 13):
        # Day-of-month mirrors the month number, as in the original request;
        # presumably util.get_month keeps only year-month — confirm.
        yearMonth = util.get_month(datetime.date(2017, month, month))
        data = {'yearMonth': yearMonth}
        res = req.post(headers=headers, url=base_url, data=data, verify=False)
        content = res.content.decode("utf-8")
        parse_info(content, re)
    # Two-way selection fair (双选会): single bulk request (600 rows).
    url = "https://scc.pku.edu.cn/home!bigFairJobInfo.action"
    data2 = {'start': 0, 'limit': 600, 'currentPage': 1}
    headers['referer'] = "https://scc.pku.edu.cn/home!speciaPreach.action"
    headers['cookie'] = 'JSESSIONID=AFBCF8D631C5F757F2790373BE5AB090; Hm_lvt_f77188aadf0698598108fbf1f0e5df52=1513048945,1514907282; Hm_lpvt_f77188aadf0698598108fbf1f0e5df52=1514907290'
    # BUG FIX: the header value previously had the header name pasted into
    # it ("X-Requested-WithXMLHttpRequest"); the standard AJAX marker is
    # just "XMLHttpRequest".
    headers['X-Requested-With'] = "XMLHttpRequest"
    headers['Cache-Control'] = "no-cache"
    # Prime the session, then POST the bulk query to the same endpoint.
    req.get(url=url, headers=headers, verify=False)
    headers['referer'] = "https://scc.pku.edu.cn/home!bigFairJobInfo.action"
    info = req.post(headers=headers, url=url, data=data2, verify=False)
    print("get info success")
    parse_info2(info.content.decode("utf-8"), re)
    re.add_university(table_name)
    re.add_to_file(table_name)
    print("PKU Finish ===================================================")
def get_hnu_recuit():
    """Crawl HNU (Hunan University) news-job listing pages 1-309.

    Pages up to 102 hold 2017 data. The loop aborts on the first failure
    (after logging it) rather than skipping the page.
    """
    print("开始获取湖南大学数据=====================")
    url = "http://scc.hnu.edu.cn/newsjob!getMore.action?"
    headers = util.get_header(host="scc.hnu.edu.cn")
    store = jedis.jedis()
    store.clear_list(table_name)
    for page in range(1, 310):
        try:
            query = {'p.currentPage': '%d' % page, 'Lb': '1'}
            html = requests.get(url=url, headers=headers, params=query).text
            parse_info(html, store)
        except BaseException as err:
            util.format_err(err)
            break
        finally:
            print('获取湖南大学第 %d 页(共310页)数据完成' % page)
    store.add_university(table_name)
    store.add_to_file(table_name)
def get_nxu_recruit():
    """Crawl Ningxia University news-list pages 1-47, following each item's
    detail link and parsing the GBK-encoded detail page.

    Each listing page normally carries 10 detail links.
    """
    url = 'http://www.nxujob.com/news/news-list.php?id=27&page='
    session = requests.Session()
    headers = util.get_header('www.nxujob.com')
    store = jedis.jedis()
    store.clear_list(table_name)
    for page in range(1, 48):
        print(url + str(page))
        listing = session.get(url=url + str(page), headers=headers).content.decode('gbk')
        soup = BeautifulSoup(listing, 'html5lib')
        links = soup.find_all(
            href=re.compile('http://www.nxujob.com/news/news-show.php'),
            attrs={'target': '_blank'})
        # ROBUSTNESS FIX: iterate over at most 10 links actually found,
        # instead of a fixed range(10) that raised IndexError whenever a
        # page (e.g. the last one) had fewer than 10 entries.
        for link in links[:10]:
            detail_url = link.attrs['href']
            print(detail_url)
            detail = session.get(url=detail_url, headers=headers).content.decode('gbk')
            parse_info(detail, store)
    store.add_to_file(table_name)
    store.add_university(table_name)
def get_hit_rescruit():
    """Crawl HIT career-fair data month by month, 2016-09 through 2017-12.

    Posts one JSON body per month to the getZczphData endpoint using a
    browser-captured cookie, then hands each response to parse_hit_info.
    """
    print("HIT Begin ===================================================")
    base_url = "http://job.hit.edu.cn/index/getZczphData"
    host = "job.hit.edu.cn"
    header = util.get_header(host)
    req = requests.Session()
    # Browser-captured cookies; required for the endpoint to answer.
    header['Cookie'] = 'UM_distinctid=12d92-04155388a776ed-49566e-1fa400-15fef2bf12e643; CNZZDATA1261107882=1341678225-1511543504-https%253A%252F%252Fwww.baidu.com%252F%7C1513672466; JSESSIONID=E8EAAFC1F662C83D57C2D504594BD6CF'
    store = jedis.jedis()
    store.connect_redis()
    store.clear_list(table_name)
    # The current HIT job site has data from 2016-09 onward: 16 months.
    for offset in range(0, 16):
        extra_years, month0 = divmod(8 + offset, 12)  # 8 == September, zero-based
        date = datetime.date(2016 + extra_years, month0 + 1, 1)
        body = json.dumps({'Month': util.get_month(date)})
        print(body)
        print(base_url)
        res = req.post(url=base_url, headers=header, data=body)
        print(res.status_code)
        parse_hit_info(res.content.decode("utf-8"), store)
    store.add_to_file(table_name)
    store.add_university(table_name)
    print("HIT finish ===================================================")
def get_nku_recruit():
    """Crawl Nankai University career talks (2016 and 2017) and the 2017
    two-way-selection fairs, including each fair item's detail page."""
    # 宣讲会 (career talks)
    url = 'http://career.nankai.edu.cn/Home/Reccalender/doxuanjiang'
    # 双选会 (two-way selection fairs)
    url2 = 'http://career.nankai.edu.cn/Home/Reccalender/doshuangxuan'
    host = 'career.nankai.edu.cn'
    header = util.get_header(host)
    header['referer'] = 'http://career.nankai.edu.cn/reccalender/index.html'
    header['cookie'] = 'yunsuo_session_verify=5374b1e89d110421560f5e8e3182d03c; PHPSESSID=632an0himtafj6me8379r8fkn4; Hm_lvt_6eb8a37eb57545b46494b26e6136af4a=1511532968; Hm_lpvt_6eb8a37eb57545b46494b26e6136af4a=1511533002'
    # BUG FIX: was ['2016, 2017'] — a single malformed string — so only one
    # bogus request was posted instead of one per year.
    years = ['2016', '2017']
    req = requests.Session()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for year in years:
        company_list = req.post(url=url, headers=header, data={
            'year': year
        }).content.decode('unicode-escape')
        parse_info(redis, company_list)
    # 获取双选会 — fetch the 2017 fair list, then each fair's detail page.
    recruit_list = req.post(url=url2, headers=header, data={
        'year': 2017
    }).content.decode('unicode-escape')
    recruit_list = json.loads(recruit_list)
    for item in recruit_list:
        fair_id = item['id']  # renamed from `id` to avoid shadowing the builtin
        date = item['starttime']
        title = item['title']
        print("===============================")
        print(title, fair_id)
        recruit_url = ('http://career.nankai.edu.cn/Home/Recruitment/content/type/1/id/'
                       + str(fair_id) + '.html')
        content = req.get(url=recruit_url, headers=header).content.decode("utf-8")
        parse_recruit_info(redis, content, date, fair_id)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_scu_recruit(self):
    """Crawl SCU career-talk data for the 'yjb' track.

    Warms up the session on the index page, POSTs the search form to
    learn the live page count, then walks the shared pager endpoint via
    self.get_rescruit and persists the table.
    """
    table_name = "scu_company_info"
    referer = "http://jy.scu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx"
    base_url = "http://jy.scu.edu.cn/eweb/wfc/app/pager.so?type=goPager&requestPager=pager&pageMethod=next¤tPage="
    self.re.clear_list(table_name)
    session = requests.Session()
    scu_header = util.get_header('jy.scu.edu.cn')
    scu_header['Referer'] = 'http://jy.scu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx&xjhType=all'
    form = {'xjhType': 'yjb', 'jbrq': '', 'zpzt': ''}
    # Visit the index first so the server issues the session cookie.
    session.get(url='http://jy.scu.edu.cn/eweb/jygl/index.so', headers=scu_header)
    # The form is deliberately sent as str(dict), exactly as before.
    first_page = session.post(headers=scu_header, url=referer,
                              data=str(form)).content.decode("utf-8")
    page_num = self.get_page_num(first_page)
    scu_header['Referer'] = referer
    # Slice bounds 8..28, mode 2 — as expected by the parser.
    self.get_rescruit(base_url, session, scu_header, table_name, page_num, 8, 28, 2)
    self.re.add_university(table_name)
    self.re.add_to_file(table_name)
# coding = utf-8 import re import requests from bs4 import BeautifulSoup from jedis import jedis from util import util # 云南大学 table_name = 'ynu_company_info' headers = util.get_header('jobs.ynu.edu.cn') req = requests.Session() date_pattern = re.compile('[0-9]{4}-[0-9]{2}-[0-9]{2}') def get_ynu_recruitment(): base_url = "http://jobs.ynu.edu.cn/wszplist.jsp?urltype=tree.TreeTempUrl&wbtreeid=1091" url = 'http://jobs.ynu.edu.cn/wszplist.jsp?urltype=tree.TreeTempUrl&wbtreeid=1091' redis = jedis.jedis() redis.clear_list(table_name) content = req.get(url=base_url, headers=headers).content.decode('utf-8') total_page_num = get_total_num(content) params = { 'reqformCURURI': '3187540095DC12E6C9C66ED4973512AD', 'reqformKEYTYPES': '4, 12, 93', 'actiontype': 'Find', 'reqformORDER': 'desc', 'reqformORDERKEY': 'wbrelease', 'reqformCountNo': total_page_num, 'reqformGOPAGE': '',