# NOTE(review): this line is a whitespace-mangled paste — the original newlines were
# collapsed into spaces, so as written it is not valid Python. Kept byte-identical
# below; it must be re-split into its original statements before it can run.
# NOTE(review): the leading `elif`/`else`/`except` fragment is the tail of a function
# whose `def` line is not visible in this chunk — cannot safely reconstruct it here.
# NOTE(review): bug — the `except requests.exceptions.RequestException` handler refers
# to `response`, which is unbound if the request call itself raised; and
# `response.status_code + "超时3次"` adds an int to a str → TypeError. The handler
# should log `e` (e.g. log.error("request failed after 3 retries: %s", e)) instead.
# NOTE(review): the trailing `if __name__ == '__main__':` block is truncated — the
# `col` list literal is cut off mid-element; the rest presumably follows in the
# original file (the list items are Chinese column headers: company ID, experience,
# education, job type, job title, job ID, publish date, city, logo, industry, perks).
elif response.status_code == 403: log.error('request is forbidden by the server...') return 0 else: log.error(response.status_code) return 0 except requests.exceptions.RequestException as e: log.error(response.status_code + "超时3次") return 0 # 爬取职位信息,将内容保存在当前目录的data文件夹下 if __name__ == '__main__': craw_job_list = parse_job_xml('../config/job.xml') for _ in craw_job_list: # 创建joblist对象 joblist = crawl_jobs(_) col = [ u'公司ID', u'工作经验', u'教育程度', u'工作性质', u'岗位名称', u'岗位ID', u'发布时间', u'城市', u'公司LOGO', u'工业领域', u'岗位优势',
# NOTE(review): like the line above, this is a whitespace-mangled paste (newlines
# collapsed to spaces) and appears to be a second, slightly different version of the
# same crawler script. Kept byte-identical; must be re-split before it can run.
# NOTE(review): the leading `if/elif/else` fragment is the tail of a max-page-count
# function whose `def` line is outside this view — not reconstructed here.
# NOTE(review): bug — `int(int(totalCount) / 15 + 1)` overcounts by one page whenever
# totalCount is an exact multiple of the 15-per-page size (e.g. 30 → 3, should be 2).
# Ceiling division `(total + 14) // 15` (or `-(-total // 15)`) is the correct form.
# NOTE(review): the `__main__` block parses job names from ../config/job.xml, crawls
# each one, and writes the rows to ./data/<job>.xlsx via pandas. Style issues to fix
# when reformatting: `dir` shadows the builtin; `_` is used as a meaningful loop
# variable (conventionally it marks a throwaway value) — rename e.g. `job_name`.
if response.status_code == 200: max_page_no = int(int(response.json()['content']['data']['page']['totalCount']) / 15 + 1) return max_page_no elif response.status_code == 403: log.error('request is forbidden by the server...') return 0 else: log.error(response.status_code) return 0 if __name__ == '__main__': craw_job_list = parse_job_xml('../config/job.xml') for _ in craw_job_list: joblist = crawl_jobs(_) col = [ u'职位编码', u'职位名称', u'所在城市', u'发布日期', u'薪资待遇', u'公司编码', u'公司名称', u'公司全称'] df = pd.DataFrame(joblist, columns=col) dir = "./data/" mkdirs_if_not_exists(dir) df.to_excel(os.path.join(dir, _ + ".xlsx"), sheet_name=_, index=False)