예제 #1
0
        elif response.status_code == 403:
            log.error('request is forbidden by the server...')
            return 0
        else:
            log.error(response.status_code)
            return 0
    except requests.exceptions.RequestException as e:
        log.error(response.status_code + "超时3次")
    return 0

    


# Crawl job information and save the contents under the data folder of the
# current directory.
if __name__ == '__main__':
    # Each entry of the crawl list is one job keyword read from the XML config.
    craw_job_list = parse_job_xml('../config/job.xml')
    for _ in craw_job_list:
        # Create the joblist object (one row per crawled job posting).
        joblist = crawl_jobs(_)
        # Chinese column headers for the output spreadsheet:
        # company ID, work experience, education, job type, job title, job ID,
        # publish time, city, company logo, industry, job highlights, ...
        # NOTE(review): this literal continues beyond the visible excerpt.
        col = [
            u'公司ID',
            u'工作经验',
            u'教育程度',
            u'工作性质',
            u'岗位名称',
            u'岗位ID',
            u'发布时间',
            u'城市',
            u'公司LOGO',
            u'工业领域',
            u'岗位优势',
예제 #2
0
    # Success: derive the number of result pages (15 items per page) from the
    # JSON payload of the listing endpoint.
    if response.status_code == 200:
        # NOTE(review): int(total / 15 + 1) over-counts by one page when
        # totalCount is an exact multiple of 15 (e.g. 30 -> 3 instead of 2);
        # a true ceiling is (total + 14) // 15. It also maps 0 -> 1, which may
        # be a deliberate "at least one page" choice — confirm before changing.
        max_page_no = int(int(response.json()['content']['data']['page']['totalCount']) / 15 + 1)

        return max_page_no
    elif response.status_code == 403:
        # Server explicitly refused the request (likely anti-crawler block).
        log.error('request is forbidden by the server...')

        return 0
    else:
        # Any other status code: log it and report zero pages.
        log.error(response.status_code)

        return 0


if __name__ == '__main__':
    # Crawl every job keyword listed in the XML config and export each result
    # set to its own Excel workbook under ./data/.
    craw_job_list = parse_job_xml('../config/job.xml')
    # Renamed loop variable from `_` (conventionally a throwaway name) to
    # `job_name`, since it is actually used below.
    for job_name in craw_job_list:
        # One row per crawled job posting, matching the column order below.
        joblist = crawl_jobs(job_name)
        # Chinese column headers: job code, job title, city, publish date,
        # salary, company code, company name, company full name.
        col = [
            u'职位编码',
            u'职位名称',
            u'所在城市',
            u'发布日期',
            u'薪资待遇',
            u'公司编码',
            u'公司名称',
            u'公司全称']
        df = pd.DataFrame(joblist, columns=col)
        # Renamed from `dir` to avoid shadowing the builtin of the same name.
        data_dir = "./data/"
        mkdirs_if_not_exists(data_dir)
        df.to_excel(os.path.join(data_dir, job_name + ".xlsx"),
                    sheet_name=job_name, index=False)