import time

import requests
from bs4 import BeautifulSoup

# Project-local modules assumed by the original snippet (not shown here):
# `config` supplies TIME_SLEEP, `log` is the project logger, and
# `m_lagou_spider` exposes get_cookies(); the module paths are assumptions.
import config
import log
import m_lagou_spider
from m_lagou_spider import get_cookies
from db_tools import insert_item  # assumption: the project's persistence helper lives here


def crawl_company_stage(company_id):
    """Crawl industry, finance stage and staff size from a company's mobile page."""
    req_url = 'https://m.lagou.com/gongsi/%s.html' % str(company_id)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Host': 'm.lagou.com',
        'Referer': 'https://m.lagou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) '
                      'Version/9.0 Mobile/13B143 Safari/601.1'
    }
    response = requests.get(req_url, headers=headers, cookies=m_lagou_spider.get_cookies(), timeout=20)
    print(response.url)
    # Default to empty strings so the return below cannot raise NameError on a
    # failed request (the original only bound these names in the 200 branch).
    industryField = financeStage = staffNum = ''
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html5lib')
        # The ".desc" element reads like "industry / finance stage / staff size"
        company_desc = soup.find_all(class_='desc')[0].get_text().strip()
        industryField = company_desc.split('/')[0].strip()
        financeStage = company_desc.split('/')[1].strip()
        staffNum = company_desc.split('/')[2].strip()
    elif response.status_code == 403:
        log.error('403 forbidden...')
    else:
        log.error(response.status_code)
    time.sleep(config.TIME_SLEEP)
    return [company_id, industryField, financeStage, staffNum]
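
# Illustration of the ".desc" parsing above on a made-up sample string (the
# slash-separated layout is what the code assumes; the values are placeholders):
_sample_desc = '移动互联网 / A轮 / 150-500人'
assert [part.strip() for part in _sample_desc.split('/')] == ['移动互联网', 'A轮', '150-500人']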

def crawl_company(havemark=0):
    # Collected company records, one list per company
    COMPANY_LIST = list()
    # Request URL
    req_url = 'https://www.lagou.com/gongsi/0-0-0.json?havemark=%d' % havemark
    # Request headers
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': 'https://www.lagou.com/gongsi/0-0-0?havemark=0',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) '
                      'Version/9.0 Mobile/13B143 Safari/601.1'
    }
    # Loop over result pages
    for pn in range(20):
        params = {
            'first': 'false',
            'pn': str(pn),
            'sortField': '0',
            'havemark': str(havemark)
        }
        response = requests.post(req_url, headers=headers, params=params,
                                 cookies=m_lagou_spider.get_cookies(), timeout=10)
        print(response.url)
        if response.status_code == 200:
            company_list_per_page = response.json()['result']
            for company in company_list_per_page:
                COMPANY_LIST.append([
                    company['companyId'], company['companyShortName'], company['city'],
                    company['companyFeatures'], company['companyFullName'], company['financeStage'],
                    company['industryField'], company['interviewRemarkNum'], company['positionNum'],
                    company['processRate']
                ])
            log.info('page %d has been crawled down~' % (pn + 1))
        elif response.status_code == 403:
            log.error('403 forbidden...')
        else:
            log.error(response.status_code)
        # Throttle between pages
        time.sleep(config.TIME_SLEEP)
    return COMPANY_LIST
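
# Hedged usage sketch (not in the original): persist crawl_company()'s rows to
# CSV. The column names mirror the fields appended above; the function name and
# output path are assumptions added for illustration.
def save_company_list_csv(path='company_list.csv'):
    import csv
    header = ['companyId', 'companyShortName', 'city', 'companyFeatures', 'companyFullName',
              'financeStage', 'industryField', 'interviewRemarkNum', 'positionNum', 'processRate']
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(crawl_company(havemark=0))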

def get_max_page_no(company_id):
    """
    Return the max page number of interviewees' comments for a particular company.

    :param company_id: the Lagou company id
    :return: the number of 10-item pages, or 0 on a failed request
    """
    request_url = 'https://www.lagou.com/gongsi/searchInterviewExperiences.json'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) '
                      'Version/9.0 Mobile/13B143 Safari/601.1',
        # The original dict listed 'Referer' twice; only this later,
        # company-specific value survives, so the generic one is dropped.
        'Referer': 'https://www.lagou.com/gongsi/interviewExperiences.html?companyId=%s' % company_id
    }
    params = {
        'companyId': company_id,
        'positionType': '',
        'pageSize': '10',
        'pageNo': '1'
    }
    response = requests.post(request_url, headers=headers, params=params, cookies=get_cookies())
    if response.status_code == 200:
        total_count = int(response.json()['content']['data']['page']['totalCount'])
    else:
        log.error('Error code is ' + str(response.status_code))
        total_count = 0
    # Ceiling division: 10 comments per page, and 0 comments means 0 pages
    # (the original `int(maxpage / 10) + 1` reported one page even on failure).
    return (total_count + 9) // 10
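
# Quick sanity check of the ceiling-division page math used in get_max_page_no
# (pure arithmetic, no network; added for illustration):
# 25 comments span 3 pages, 30 span 3, 10 span 1, and 0 span 0.
assert all((total + 9) // 10 == pages for total, pages in [(25, 3), (30, 3), (10, 1), (0, 0)])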

def crawl_interviewee_comments(company_id):
    """Crawl every page of interview comments for a company and persist each item."""
    request_url = 'https://www.lagou.com/gongsi/searchInterviewExperiences.json'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) '
                      'Version/9.0 Mobile/13B143 Safari/601.1',
        # As above, the duplicate generic 'Referer' entry has been dropped.
        'Referer': 'https://www.lagou.com/gongsi/interviewExperiences.html?companyId=%s' % company_id
    }
    maxpage_no = get_max_page_no(company_id)
    if maxpage_no > 0:
        for pn in range(maxpage_no):
            params = {
                'companyId': company_id,
                'positionType': '',
                'pageSize': '10',
                'pageNo': str(pn + 1)
            }
            response = requests.post(request_url, headers=headers, params=params, cookies=get_cookies())
            if response.status_code == 200:
                # The original logged success before checking the status code;
                # the log line now sits inside the 200 branch.
                log.info('Crawl page %s successfully~' % response.url)
                comment_list = response.json()['content']['data']['page']['result']
                for comment in comment_list:
                    insert_item(comment)
                    log.info('insert one item successfully~')
                # Kept from the original: explicit field-by-field mapping onto
                # an IntervieweeComment model, retained here for reference.
                """
                intervieweeComment = IntervieweeComment()
                intervieweeComment.id = comment['id']
                intervieweeComment.companyId = comment['companyId']
                intervieweeComment.companyName = comment['companyName']
                intervieweeComment.companyScore = comment['companyScore']
                intervieweeComment.comprehensiveScore = comment['comprehensiveScore']
                intervieweeComment.interviewerScore = comment['interviewerScore']
                intervieweeComment.describeScore = comment['describeScore']
                intervieweeComment.myScore = comment['myScore']
                intervieweeComment.content = comment['content']
                intervieweeComment.createTime = comment['createTime']
                intervieweeComment.hrId = comment['hrId']
                intervieweeComment.positionId = comment['positionId']
                intervieweeComment.positionName = comment['positionName']
                intervieweeComment.positionStatus = comment['positionStatus']
                intervieweeComment.positionType = comment['positionType']
                intervieweeComment.tagArray = comment['tagArray']
                intervieweeComment.usefulCount = comment['usefulCount']
                insert_item(intervieweeComment)
                """
            else:
                log.error('Error code is ' + str(response.status_code))
            time.sleep(config.TIME_SLEEP)
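
# Hedged driver sketch (not in the original): wire the pieces together by
# crawling the company directory, then each company's stage info and interview
# comments. Relies only on functions defined above; crawl_company() returns
# rows whose first element is companyId.
if __name__ == '__main__':
    for row in crawl_company(havemark=0):
        company_id = row[0]
        print(crawl_company_stage(company_id))
        crawl_interviewee_comments(company_id)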