def init_search_page(self):
    url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'ehire.51job.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie
    }
    res = self.html_downloader.download(url,
                                        headers=headers,
                                        proxies=self.proxies,
                                        allow_redirects=False)
    # A 302 redirect here means the stored session is no longer valid.
    if res.status_code == 302:
        self.logger.warning('cookie invalid.')
        # self.h.hset(
        #     self.source + '|' + self.auth_kwargs['username'].encode(
        #         'utf-8'), '')
        raise MfCookieValidException('cookie_invalid')
    self.logger.info('search_page initialized successfully.')
    return res.content
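# The exception types raised throughout this module are defined elsewhere
# in the project; minimal sketches (assumptions, shown only to make this
# section self-contained):
class MfCookieValidException(Exception):
    """Raised when a stored session cookie is rejected by the target site."""


class FiveOneResumeException(Exception):
    """Raised for 51job-specific task failures (e.g. page limit reached)."""


class ZhiLianResumeException(Exception):
    """Raised for Zhilian-specific task failures (e.g. daily quota hit)."""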
def get_resume_list(self, page=1, **params):
    self.logger.info('Starting search task, USER: {} PAGE: {} {}'.format(
        self.auth_kwargs['username'].encode('utf-8'), page,
        json.dumps(params, ensure_ascii=False).encode('utf-8')))
    # Restrict the search to resumes posted within the last SEARCH_DAY
    # days; the site expects 'postdate' as yyyymmddHHMMSS_yyyymmddHHMMSS.
    end = datetime.datetime.now() + datetime.timedelta(days=1)
    start = end - datetime.timedelta(days=settings.SEARCH_DAY)
    post_date = datetime2str(start, fmt='%Y%m%d') + '000000_' \
        + datetime2str(end, fmt='%Y%m%d') + '000000'
    url = '{}pn{}/pve_5593_{}/?key={}&age=18_30&postdate={}'.format(
        params.get('city_url'), page, params.get('degree', '4'),
        params.get('keyword').encode('utf-8'), post_date)
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'cookie': self.cookie,
        'pragma': 'no-cache',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.84 Safari/537.36'
    }
    res = self.html_downloader.download(url,
                                        headers=headers,
                                        proxies=self.proxies)
    # Being bounced to the login portal means the session has expired.
    if 'passport.58.com' in res.url:
        raise MfCookieValidException('cookie invalid. {}'.format(
            self.auth_kwargs['username'].encode('utf-8')))
    real_html = self.get_real_html(res.content)
    soups = self.html_parser.parser(real_html).find(
        'div', id='infolist').find_all('dl')
    has_next = bool(self.html_parser.parser(real_html).find(
        'div', class_='pagerout').find('a', class_='next'))
    url_lst = []
    for soup in soups:
        url = soup.find('dt').find('a').get('href')
        resume_id = self.get_resume_id(url)
        if self.do_filter(resume_id):
            self.logger.info('Resume {} was already collected within '
                             'the last {} days'.format(
                                 resume_id, settings.DELAY_DAY))
            continue
        url_lst.append(url)
    time.sleep(random.randint(1, 2))
    # The trailing entry is deliberately dropped (presumably a non-result
    # row such as a recommendation at the end of the listing).
    return has_next, url_lst[:-1]
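# datetime2str/str2datetime are imported from a utils module that is not
# part of this section; plausible implementations, sketched here only for
# reference (assumed, not the originals):
import datetime


def datetime2str(dt, fmt='%Y-%m-%d %H:%M:%S'):
    # Format a datetime object as a string.
    return dt.strftime(fmt)


def str2datetime(s, fmt='%Y-%m-%d %H:%M:%S'):
    # Parse a formatted string back into a datetime object.
    return datetime.datetime.strptime(s, fmt)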
def get_resume_list_lp(self, page, **kwargs):
    """
    Liepin search mode.
    :return:
    """
    url = 'http://www.fenjianli.com/search/liepinSearch.htm'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.fenjianli.com',
        'Origin': 'http://www.fenjianli.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.fenjianli.com/search/liepinHome.htm',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'searchNear': 'on',
        'areas': kwargs['area_code'],
        'hJobs': kwargs['job_code'] + ',',
        'rows': '30',
        'sortBy': '1',
        'sortType': '1',
        'degree': '0-0',
        'offset': 30 * (page - 1),
        '_random': str(random.random()),
    }
    res = self.html_downloader.download(url,
                                        method='POST',
                                        headers=headers,
                                        data=data,
                                        proxies=self.proxies,
                                        allow_redirects=False)
    if res.status_code == 302:
        self.logger.info('cookie_invalid: %s' % res.content)
        # self.invalid_cookie(self.auth_kwargs['username'],
        #                     self.auth_kwargs['password'])
        raise MfCookieValidException('cookie_invalid')
    result = res.json()
    if not result.get('list'):
        self.logger.warning('%s' % json.dumps(result, ensure_ascii=False))
        return
    self.logger.info('Page: %s, got %s matching resumes [%s-%s]'
                     % (page, len(result.get('list')),
                        kwargs['area_name'].encode('utf-8'),
                        kwargs['job_name'].encode('utf-8')))
    return result.get('list')
def get_resume_detail(self, resume_id, mode='zl'):
    url = 'http://fenjianli.com/search/getDetail.htm'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'fenjianli.com',
        'Origin': 'http://fenjianli.com',
        'Pragma': 'no-cache',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {'id': resume_id, '_random': str(random.random())}
    # The Referer must match the detail page the request originates from:
    # 'zl' (Zhilian) mode and Liepin mode use different pages.
    if mode == 'zl':
        headers['Referer'] = 'http://www.fenjianli.com/search/' \
                             'detail.htm?ids=' \
                             + base64.b64encode(resume_id)
    else:
        headers['Referer'] = 'http://www.fenjianli.com/search/' \
                             'liepinDetail.htm?ids=' \
                             + base64.b64encode(resume_id)
    res = self.html_downloader.download(url,
                                        method='POST',
                                        headers=headers,
                                        data=data,
                                        proxies=self.proxies,
                                        allow_redirects=False)
    if res.status_code == 302:
        self.logger.warning('cookie_invalid: %s' % res.content)
        raise MfCookieValidException('cookie_invalid_from_detail')
    # Literal body text the site returns when the login is flagged
    # ("login error, please contact customer service").
    if '登录异常,请联系客服处理' in res.content:
        self.logger.info('cookie_invalid: %s' % res.content)
        raise MfCookieValidException('cookie_invalid_from_detail')
    self.logger.info('fetched resume detail: %s'
                     % resume_id.encode('utf-8'))
    return res.json()
def add_to_folder_catalog(self, resume_id, ids, folder_catalog_id):
    url = 'http://www.fenjianli.com/userResumeDetail/' \
          'addToFolderCatalog.htm'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        # Content-Length is computed by the HTTP client from the body,
        # so it is not hardcoded here.
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.fenjianli.com',
        'Origin': 'http://www.fenjianli.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.fenjianli.com/search/detail.htm?type=1'
                   '&ids=%s' % base64.b64encode(resume_id),
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'ids': ids,
        'folderCatalogId': folder_catalog_id,
        'folderCatalogType': 'Download',
        'type': 'add',
        'isResumeId': 'true',
        '_random': random.random()
    }
    res = self.html_downloader.download(url,
                                        method='POST',
                                        headers=headers,
                                        data=data,
                                        allow_redirects=False,
                                        proxies=self.proxies)
    if res.status_code == 302 or 'error' in res.json():
        self.logger.warning('cookie_invalid: %s' % res.content)
        raise MfCookieValidException('cookie_invalid')
    self.logger.info('%s' % res.content)
    return res.json()
def get_cookie(self):
    """
    Fetch the stored cookie for the current account.
    :return:
    """
    username = self.auth_kwargs.get('username')
    if not username:
        raise Exception('username is empty.')
    self.cookie = self.cookie_pool.hget(username.encode('utf-8'))
    if not self.cookie:
        raise MfCookieValidException('cookie invalid.')
    # The pool stores the repr of a cookie dict; eval() restores it
    # (ast.literal_eval would be the safer choice).
    self.cookie = cookie2str(eval(self.cookie))
    return username, self.cookie
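# cookie2str/cookie2dict come from a helpers module outside this section;
# plausible implementations, sketched for reference (assumed, not the
# originals):
def cookie2str(cookie_dict):
    # Serialize a {name: value} mapping into a Cookie header string.
    return '; '.join('%s=%s' % (k, v) for k, v in cookie_dict.items())


def cookie2dict(cookie_str):
    # Parse a Cookie header string back into a {name: value} mapping.
    return dict(item.strip().split('=', 1)
                for item in cookie_str.split(';') if '=' in item)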
def get_resume_detail(self, url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': self.cookie,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.84 Safari/537.36'
    }
    url = unquote(url)
    if not self.cookie:
        raise Exception('account_limit')
    cookies = cookie2dict(self.cookie)
    res = self.html_downloader.download(url,
                                        headers=headers,
                                        proxies=self.proxies,
                                        cookies=cookies)
    # Being bounced to the login portal means the session has expired.
    if 'passport.58.com' in res.url:
        raise MfCookieValidException('cookie invalid.')
    resume_id = self.get_resume_id(res.url)
    try:
        real_html = self.get_real_html(res.content)
    except Exception:
        real_html = None
        self.logger.info(res.content)
    time.sleep(random.randint(1, 2))
    return resume_id, real_html
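# get_resume_id is defined elsewhere in the spider. A rough sketch of the
# idea, assuming 58.com detail URLs carry a numeric id as their last path
# segment (the URL layout is an assumption, not taken from the source):
def get_resume_id(self, url):
    import re
    match = re.search(r'(\d+)(?:\.shtml|/?$)', url)
    return match.group(1) if match else None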
def get_resume_detail(self, resume_args):
    # NOTE: the access token is hardcoded in this URL, and
    # 'getresumedetial' (sic) is the site's real endpoint path.
    url = 'https://ihr.zhaopin.com/resumesearch/getresumedetial.do?' \
          'access_token=46e7d16be97a4f3ba9ca7beb2c42f8a8&' \
          'resumeNo=%s&searchresume=1&resumeSource=1&keyword=java' \
          '&t=%s&k=%s&v=0&version=1' \
          '&openFrom=1' % (resume_args['resumeNo'],
                           resume_args['t'],
                           resume_args['k'])
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'ihr.zhaopin.com',
        'Pragma': 'no-cache',
        'Referer': url,
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    res = self.html_downloader.download(url,
                                        headers=headers,
                                        proxies=self.proxies)
    # code 6001: the session cookie has expired.
    if res.json().get('code') == 6001:
        self.logger.info(self.logger_prefix + 'cookie expired')
        self.set_cookie_invalid()
        raise MfCookieValidException('cookie_invalid')
    if res.json().get('code') != 1:
        self.logger.info(
            self.logger_prefix + 'failed to fetch resume detail: %s - %s'
            % (self.auth_kwargs['username'].encode('utf-8'),
               resume_args.get('resumeNo').encode('utf-8')[:-4]))
        return
    self.logger.info(
        self.logger_prefix + 'fetched resume detail: %s - %s'
        % (self.auth_kwargs['username'].encode('utf-8'),
           resume_args.get('resumeNo').encode('utf-8')[:-4]))
    return res.json().get('data')
def get_resume_detail(self, resume_args):
    url = 'http://ihr.zhaopin.com/resumesearch/getresumedetial.do' \
          '?access_token=%s&resumeNo=%s_1&searchresume=1&resumeSource=1' \
          '&%s' % (self.access_token, resume_args.get('id'),
                   resume_args.get('valResumeTimeStr'))
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'ihr.zhaopin.com',
        'Pragma': 'no-cache',
        'Referer': url,
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    res = self.html_downloader.download(url,
                                        headers=headers,
                                        proxies=self.proxies)
    # code 6001: the session cookie has expired.
    if res.json().get('code') == 6001:
        self.logger.info('cookie expired')
        self.set_cookie_invalid()
        raise MfCookieValidException('cookie_invalid')
    if res.json().get('code') != 1:
        self.logger.info(
            self.logger_prefix + 'failed to fetch resume detail: %s - %s'
            % (self.auth_kwargs['username'].encode('utf-8'),
               resume_args.get('number').encode('utf-8')))
        return
    self.logger.info(
        self.logger_prefix + 'fetched resume detail: %s - %s'
        % (self.auth_kwargs['username'].encode('utf-8'),
           resume_args.get('number').encode('utf-8')))
    return res.json().get('data')
def download_resume(self, resume_id):
    resume_detail = self.get_resume_detail(resume_id)
    if not resume_detail:
        self.logger.info('failed to fetch resume detail. %s'
                         % resume_id.encode('utf-8'))
        raise MfCookieValidException('get_resume_detail_failed.')
    self.find_customer_channel_account(resume_id)
    tree_folder = self.tree_of_folder_catalog(resume_id)
    # Pick the first catalog node whose parent id is non-zero, i.e. the
    # first non-root folder in the tree blob.
    folder_catalog_id = re.findall(r'(?<=id:)\d+(?=, pId:[1-9])',
                                   tree_folder.get('data'))[0]
    ids = resume_id.encode('utf-8') + '/' + \
        resume_detail.get('name').encode('utf-8')
    self.logger.info('%s: starting download: %s - %s'
                     % (self.auth_kwargs['username'].encode('utf-8'),
                        ids, folder_catalog_id.encode('utf-8')))
    res = self.add_to_folder_catalog(resume_id, ids, folder_catalog_id)
    # The server reports success with the literal message
    # '下载简历成功[1]份' ("downloaded 1 resume successfully").
    if '下载简历成功[1]份' in res:
        self.logger.info('download succeeded')
        return self.get_resume_detail(resume_id)
    else:
        self.logger.info('download failed: %s' % res)
        return
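# A quick illustration of the folder-catalog regex used above, with
# made-up sample data (the real tree blob comes from
# tree_of_folder_catalog and is only assumed to look like this):
#
#     >>> import re
#     >>> blob = "{id:0, pId:-1, name:'root'}, {id:1024, pId:1, name:'dl'}"
#     >>> re.findall(r'(?<=id:)\d+(?=, pId:[1-9])', blob)
#     ['1024']
#
# i.e. it keeps only node ids whose parent id begins with a non-zero
# digit, which filters out root-level entries.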
def get_resume_detail(self, resume_url):
    url = 'https://ehire.51job.com/' + resume_url
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'ehire.51job.com',
        'Referer': 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie
    }
    res = self.html_downloader.download(url,
                                        headers=headers,
                                        proxies=self.proxies,
                                        allow_redirects=False)
    if res.status_code == 302:
        self.logger.warning('cookie invalid.')
        # self.h.hset(
        #     self.source + '|' + self.auth_kwargs['username'].encode(
        #         'utf-8'), '')
        raise MfCookieValidException('cookie_invalid')
    # Pages that really contain a resume include the literal text
    # '简历ID:' ("resume ID:"); anything else is a captcha page.
    if '简历ID:' in res.content:
        self.logger.info('fetched resume body successfully %s'
                         % self.auth_kwargs['username'].encode('utf-8'))
        return res.content
    self.logger.warning('failed to fetch resume body %s'
                        % self.auth_kwargs['username'].encode('utf-8'))
    access_key = self.html_parser.parser(res.content).find(
        'input', id='hidAccessKey').get('value')
    # self.robot_login.send_markdown(
    #     title="Resume search",
    #     content="#### Captcha on the 51job resume detail page.\n"
    #             "- Account: %s\n"
    #             "- Password: %s\n"
    #             "- Member name: %s\n"
    #             "- Proxy: %s\n\n"
    #             % (self.auth_kwargs['username'].encode('utf-8'),
    #                self.auth_kwargs['password'].encode('utf-8'),
    #                self.auth_kwargs['account_name'].encode('utf-8'),
    #                self.auth_kwargs['ip'].encode('utf-8') + ':' +
    #                self.auth_kwargs['port'].encode('utf-8'))
    # )
    self.get_captcha(referer=res.url, access_key=access_key)
    time.sleep(60)
    raise Exception('captcha_encountered')
def get_resume_list(self, previous_page_html,
                    action='pagerTopNew$ctl03', **search_args):
    """
    :param previous_page_html:
    :param action:
    :param search_args: city['北京|010000'], keywords['销售代表|3001']
    :return:
    """
    url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'ehire.51job.com',
        'Origin': 'https://ehire.51job.com',
        'Pragma': 'no-cache',
        'Referer': 'https://ehire.51job.com/Candidate/'
                   'SearchResumeNew.aspx',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie
    }
    _soups = self.html_parser.parser(previous_page_html)
    # This POST replays an ASP.NET WebForms postback: the hidden
    # __VIEWSTATE/hid* fields scraped from the previous page must be
    # echoed back verbatim. 'ctrlSerach' (sic) and the Chinese field
    # values ('不限' = "no limit", '近1周' = "within 1 week", etc.) are
    # the site's own literals and must not be altered.
    data = {
        '__EVENTTARGET': action,
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': _soups.find('input', id='__VIEWSTATE').get('value'),
        'ctrlSerach$search_keyword_txt':
            search_args['keywords'].split('|')[0],
        'ctrlSerach$search_company_txt': '',
        'ctrlSerach$search_area_input': '',
        'ctrlSerach$search_area_hid': '',
        'ctrlSerach$search_funtype_hid': '',
        'ctrlSerach$search_expectsalaryf_input': '不限',
        'ctrlSerach$search_expectsalaryt_input': '不限',
        'ctrlSerach$search_industry_hid': '',
        'ctrlSerach$search_wyf_input': '不限',
        'ctrlSerach$search_wyt_input': '不限',
        'ctrlSerach$search_df_input': '不限',
        'ctrlSerach$search_dt_input': '不限',
        'ctrlSerach$search_cursalaryf_input': '不限',
        'ctrlSerach$search_cursalaryt_input': '不限',
        'ctrlSerach$search_age_input': '年龄:18-30',
        'ctrlSerach$search_agef_input': '18',
        'ctrlSerach$search_aget_input': '30',
        'ctrlSerach$search_expjobarea_input':
            search_args['city'].split('|')[0],
        'ctrlSerach$search_expjobarea_hid': search_args['city'],
        'ctrlSerach$search_forlang_input': '语言',
        'ctrlSerach$search_fl_input': '不限',
        'ctrlSerach$search_fllsabilityll_input': '不限',
        'ctrlSerach$search_englishlevel_input': '英语等级',
        'ctrlSerach$search_sex_input': '性别',
        'ctrlSerach$search_major_input': '专业',
        'ctrlSerach$search_major_hid': '',
        'ctrlSerach$search_hukou_input': '户口',
        'ctrlSerach$search_hukou_hid': '',
        'ctrlSerach$search_rsmupdate_input': '近1周',
        'ctrlSerach$search_jobstatus_input': '求职状态',
        'send_cycle': '1',
        'send_time': '7',
        'send_sum': '10',
        'ctrlSerach$hidSearchValue':
            u'%s##0#######20#35############近1周|1##1#0##%s#0#0#0'
            % (search_args['keywords'].split('|')[0], search_args['city']),
        'ctrlSerach$hidKeyWordMind': '',
        'ctrlSerach$hidRecommend': '',
        'ctrlSerach$hidWorkYearArea': '',
        'ctrlSerach$hidDegreeArea': '',
        'ctrlSerach$hidSalaryArea': '',
        'ctrlSerach$hidCurSalaryArea': '',
        'ctrlSerach$hidIsRecDisplay': '1',
        'showselected': '',
        'pagerTopNew$ctl06': '50',
        'cbxColumns$0': 'AGE',
        'cbxColumns$1': 'WORKYEAR',
        'cbxColumns$2': 'SEX',
        'cbxColumns$3': 'AREA',
        'cbxColumns$4': 'WORKFUNC',
        'cbxColumns$5': 'TOPDEGREE',
        'cbxColumns$6': 'LASTUPDATE',
        'hidAccessKey': _soups.find('input', id='hidAccessKey').get('value'),
        'hidShowCode': '0',
        'hidDisplayType': '1',
        'hidEhireDemo': '',
        'hidUserID': '',
        'hidCheckUserIds':
            _soups.find('input', id='hidCheckUserIds').get('value'),
        'hidCheckKey': _soups.find('input', id='hidCheckKey').get('value'),
        'hidEvents': '',
        'hidNoSearchIDs': '',
        'hidBtnType': '',
        'hideMarkid': '',
        'hidStrAuthority':
            _soups.find('input', id='hidStrAuthority').get('value'),
        'hidDownloadNum':
            _soups.find('input', id='hidDownloadNum').get('value'),
        'hidKeywordCookie': '',
        'showGuide': '',
    }
    if not search_args['use_keywords']:
        self.logger.info('Searching by job function.')
        data['ctrlSerach$search_keyword_txt'] = ''
        data['ctrlSerach$search_funtype_hid'] = search_args['keywords']
        data['ctrlSerach$hidSearchValue'] = \
            u'##0#%s######20#35############近1周|1##1#0##%s#0#0#0' \
            % (search_args['keywords'], search_args['city'])
    else:
        self.logger.info('Searching by keyword.')
    res = self.html_downloader.download(url,
                                        method='POST',
                                        headers=headers,
                                        data=data,
                                        proxies=self.proxies,
                                        allow_redirects=False)
    if res.status_code == 302:
        self.logger.warning('cookie invalid.')
        # self.h.hset(
        #     self.source + '|' + self.auth_kwargs['username'].encode(
        #         'utf-8'), '')
        raise MfCookieValidException('cookie_invalid')
    access_key = self.html_parser.parser(res.content).find(
        'input', id='hidAccessKey').get('value')
    soups = self.html_parser.parser(res.content).find_all(
        'td', class_='Common_list_table-id-text')
    resume_list = []
    if not soups:
        # Empty results are counted per account; once an account has hit
        # more than 10 empty pages, assume a captcha is blocking the
        # search and run the captcha check.
        empty_times = int(self.h_search_empty_times.hget(
            self.auth_kwargs['username']) or 0)
        if empty_times > 10:
            self.logger.warning(
                'Captcha encountered on the search list. %s'
                % self.auth_kwargs['username'].encode('utf-8'))
            self.get_captcha(referer=res.url, access_key=access_key,
                             do_type='CheckSearchResume')
            self.h_search_empty_times.hset(self.auth_kwargs['username'], 0)
            raise Exception('captcha_encountered')
        else:
            self.logger.warning(
                'No search results, skipping task [%s, %s, %s]'
                % (self.auth_kwargs['username'].encode('utf-8'),
                   search_args['keywords'].encode('utf-8'),
                   search_args['city'].encode('utf-8')))
            empty_times += 1
            self.h_search_empty_times.hset(self.auth_kwargs['username'],
                                           empty_times)
            return resume_list, ''
    # DAY_LIMITED is module-level state referenced by the log message;
    # the cutoff accepts only resumes updated since yesterday.
    global DAY_LIMITED
    DAY_LIMITED = settings.DAY_LIMITED
    limited_day = datetime.datetime.now() - datetime.timedelta(days=1)
    for soup in soups:
        ref_time = soup.find_parent().find_all('td')[-2].text.encode(
            'utf-8')
        if str2datetime(ref_time, '%Y-%m-%d').date() < limited_day.date():
            self.logger.warning('Hit a resume from %s days ago, skipping '
                                'the rest.' % DAY_LIMITED)
            break
        resume_list.append(soup.find('a').get('href'))
    try:
        page = self.html_parser.parser(res.content).find(
            'div', class_='Search_page-numble').find(
            'a', class_='active').get('title').encode('utf-8')
    except Exception:
        self.logger.warning('Pager component not found, skipping task '
                            '[%s, %s]'
                            % (search_args['keywords'].encode('utf-8'),
                               search_args['city'].encode('utf-8')))
        return resume_list, ''
    self.logger.info('page: %s, got %s resumes, search terms [%s, %s]'
                     % (page, len(resume_list),
                        search_args['keywords'].encode('utf-8'),
                        search_args['city'].encode('utf-8')))
    if int(page) > settings.TASK_PAGE_LIMIT:
        raise FiveOneResumeException('task_page_limit')
    return resume_list, res.content
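# The hidden-field scraping above could be factored into one helper; a
# sketch (extract_hidden_fields is not a name from this module, and the
# field list mirrors the postback built in get_resume_list):
def extract_hidden_fields(soup, names=('__VIEWSTATE', 'hidAccessKey',
                                       'hidCheckUserIds', 'hidCheckKey',
                                       'hidStrAuthority', 'hidDownloadNum')):
    # Return {field-id: value} for each hidden <input> the postback echoes.
    fields = {}
    for name in names:
        node = soup.find('input', id=name)
        fields[name] = node.get('value') if node else ''
    return fields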
def get_resume_list(self, page=1, is_download=False, **search_args):
    """
    Fetch a page of the resume search list.
    Search criteria: keyword / location / age / degree /
    updated recently (upDate).
    :param page:
    :param search_args:
    :return:
    """
    url = 'https://ihr.zhaopin.com/resumesearch/search.do?' \
          'access_token=%s' % self.access_token
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'ihr.zhaopin.com',
        'Origin': 'https://ihr.zhaopin.com',
        'Pragma': 'no-cache',
        'Referer': 'https://ihr.zhaopin.com/resumesearch/search/',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie,
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'keywords': search_args['keywords'].split('|')[0].encode('utf-8'),
        'startNum': (page - 1) * 30,
        'rowsCount': '30',
        'resumeGrade': '',
        'sortColumnName': 'sortUpDate',
        'sortColumn': 'sortUpDate desc',
        'onlyHasImg': 'false',
        'anyKeyWord': 'false',
        'hopeWorkCity': search_args['city'].split('|')[1].encode('utf-8'),
        'ageStart': search_args.get('age_start', '18'),
        'ageEnd': search_args.get('age_end', '30'),
        'workYears': search_args.get('work_years', ''),
        'liveCity': search_args.get('live_city', ''),
        'sex': search_args.get('sex', ''),
        'edu': search_args.get('degree', '5'),
        'upDate': search_args.get('up_date', ''),  # defaults to the last three days
        'companyName': search_args.get('company_name', ''),
        'jobType': '',
        'desiredJobType': search_args.get('desired_job_type', ''),
        'industry': search_args.get('industry', ''),
        'desiredIndustry': '',
        'careerStatus': '',
        'desiredSalary': '',
        'langSkill': '',
        'hukouCity': '',
        'major': '',
        'onlyLastWork': 'false',
    }
    if search_args['use_keywords'] is False:
        data['desiredJobType'] = search_args['keywords'].split('|')[1]
        self.logger.info('Searching by job function.')
    else:
        self.logger.info('Searching by keyword.')
    res = self.html_downloader.download(url,
                                        method='POST',
                                        data=data,
                                        headers=headers,
                                        proxies=self.proxies)
    # code 6001: the session cookie has expired.
    if res.json().get('code') == 6001:
        self.logger.info(self.logger_prefix + 'cookie expired')
        self.set_cookie_invalid()
        raise MfCookieValidException('cookie_invalid')
    # code 808: the account has hit its daily search quota.
    if res.json().get('code') == 808:
        self.logger.warning(self.logger_prefix +
                            res.json().get('message').encode('utf-8'))
        today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
        # Mark this account as having reached today's search limit.
        self.h_over_search_limit.hset(
            today + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
        # Notify only once per account per process run.
        global LIMIT_MESSAGE_BOX
        if not LIMIT_MESSAGE_BOX.get(
                self.auth_kwargs['username'].encode('utf-8'), ''):
            LIMIT_MESSAGE_BOX[
                self.auth_kwargs['username'].encode('utf-8')] = 1
            self.robot_login.send_markdown(
                title="Zhilian resume search",
                content="#### Daily keyword search quota reached.\n"
                        "- Account: %s\n"
                        "- Password: %s\n"
                        "- Proxy: %s\n"
                        "- Accounts at the limit: %s\n"
                        % (self.auth_kwargs['username'].encode('utf-8'),
                           self.auth_kwargs['password'].encode('utf-8'),
                           self.auth_kwargs['ip'].encode('utf-8') + ':' +
                           self.auth_kwargs['port'].encode('utf-8'),
                           len(LIMIT_MESSAGE_BOX)))
        raise ZhiLianResumeException('user_record_limited')
    try:
        resume_list = res.json().get('results')
        if not resume_list:
            raise Exception
    except Exception as e:
        self.logger.exception('failed to fetch the result list: %s | %s'
                              % (str(e), res.content))
        return []
    resume_accept_list = []
    # DAY_LIMITED is module-level state referenced by the log message;
    # the cutoff accepts only resumes modified since yesterday.
    global DAY_LIMITED
    DAY_LIMITED = 2
    limited_day = datetime.datetime.now() - datetime.timedelta(days=1)
    for resume in resume_list:
        if is_download is False:
            if str2datetime(resume.get('modifyDate'),
                            '%Y-%m-%d').date() < limited_day.date():
                self.logger.warning('Hit a resume from %s days ago, '
                                    'skipping the rest.' % DAY_LIMITED)
                break
        resume_accept_list.append(resume)
    self.logger.info('page: %s, got %s resumes, search terms [%s, %s]'
                     % (page, len(resume_accept_list),
                        search_args['keywords'].encode('utf-8'),
                        search_args['city'].encode('utf-8')))
    return resume_accept_list
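# DAY_LIMITED and LIMIT_MESSAGE_BOX are module-level state declared near
# the top of the original file; assumed declarations (sketch):
DAY_LIMITED = 2          # look-back window, in days, for accepting resumes
LIMIT_MESSAGE_BOX = {}   # accounts already reported as quota-limited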
def get_resume_list(self, page=1, **search_args):
    """
    Fetch a page of the resume search list.
    Search criteria: keyword / location / age 18-30 / degree /
    updated recently (SF_1_1_7).
    :param page:
    :param search_args:
    :return:
    """
    # SF_1_1_1 carries the keyword, SF_1_1_2 the job-function code; the
    # remaining SF_* parameters encode city, degree, age and update time.
    if search_args['use_keywords'] is False:
        self.logger.info('Searching by job function.')
        url = 'https://ihrsearch.zhaopin.com/Home/ResultForCustom?' \
              'SF_1_1_2=%s&' \
              'SF_1_1_18=%s&' \
              'orderBy=DATE_MODIFIED,1&' \
              'pageSize=30&' \
              'SF_1_1_27=0&' \
              'SF_1_1_5=%s,16&' \
              'SF_1_1_8=18,30&' \
              'SF_1_1_7=1,9&' \
              'exclude=1&pageIndex=%s' \
              % (search_args['keywords'].split('|')[1].encode('utf-8'),
                 search_args['city'].split('|')[1].encode('utf-8'),
                 search_args['degree'], page)
    else:
        self.logger.info('Searching by keyword.')
        url = 'https://ihrsearch.zhaopin.com/Home/ResultForCustom?' \
              'SF_1_1_1=%s&' \
              'SF_1_1_18=%s&' \
              'orderBy=DATE_MODIFIED,1&' \
              'pageSize=30&' \
              'SF_1_1_27=0&' \
              'SF_1_1_5=%s,16&' \
              'SF_1_1_8=18,30&' \
              'SF_1_1_7=1,9&' \
              'exclude=1&pageIndex=%s' \
              % (search_args['keywords'].split('|')[0].encode('utf-8'),
                 search_args['city'].split('|')[1].encode('utf-8'),
                 search_args['degree'], page)
    # The Referer mirrors the request URL in both branches.
    referer = url
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Host': 'ihrsearch.zhaopin.com',
        'Pragma': 'no-cache',
        'Referer': referer,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': self.user_agent,
        'Cookie': self.cookie
    }
    res = self.html_downloader.download(url,
                                        headers=headers,
                                        proxies=self.proxies,
                                        allow_redirects=False)
    if res.status_code == 302:
        self.logger.warning('cookie expired')
        self.set_cookie_invalid()
        raise MfCookieValidException('cookie_invalid')
    try:
        # 'info' rows hold the result links; the 'middle' rows alongside
        # them hold the modification dates.
        soups = self.html_parser.parser(res.content).find(
            'form', attrs={'name': 'frmResult'}).find(
            'tbody').find_all('tr', class_='info')
        soups1 = self.html_parser.parser(res.content).find(
            'form', attrs={'name': 'frmResult'}).find(
            'tbody').find_all('tr', valign='middle')
    except Exception as e:
        self.logger.exception('failed to fetch resume_list: %s' % str(e))
        return []
    resume_list = []
    # DAY_LIMITED is module-level state referenced by the log message;
    # the cutoff accepts only resumes updated since yesterday.
    global DAY_LIMITED
    DAY_LIMITED = 2
    limited_day = datetime.datetime.now() - datetime.timedelta(days=1)
    for index, soup in enumerate(soups):
        # The last <td> of the companion row holds the update date with
        # a two-digit year ('%y-%m-%d').
        if str2datetime(
                soups1[index].find_all('td')[-1].text.encode('utf-8'),
                '%y-%m-%d').date() < limited_day.date():
            self.logger.warning('Hit a resume from %s days ago, skipping '
                                'the rest.' % DAY_LIMITED)
            break
        resume_item = dict()
        resume_item['resumeNo'] = soup.find('a').get('resumeurlpart')
        resume_item['t'] = soup.find('a').get('t')
        resume_item['k'] = soup.find('a').get('k')
        resume_list.append(resume_item)
    self.logger.info('page: %s, got %s resumes, search terms [%s, %s]'
                     % (page, len(resume_list),
                        search_args['keywords'].encode('utf-8'),
                        search_args['city'].encode('utf-8')))
    return resume_list
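# A plausible driver loop tying the pieces together. SpiderClass and the
# literal search arguments are placeholders, not names from this module;
# the call signatures follow the ihrsearch variant above:
if __name__ == '__main__':
    spider = SpiderClass(auth_kwargs={'username': 'hr_account',
                                      'password': 'secret'})
    spider.get_cookie()  # load the stored session cookie first
    page = 1
    while True:
        resumes = spider.get_resume_list(page=page,
                                         use_keywords=True,
                                         keywords=u'销售代表|3001',
                                         city=u'北京|010000',
                                         degree='5')
        if not resumes:
            break
        for resume_args in resumes:
            spider.get_resume_detail(resume_args)
        page += 1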