示例#1
0
    def init_search_page(self):
        """Fetch the 51job resume-search landing page.

        Sends a GET request for ``SearchResumeNew.aspx`` with redirects
        disabled, using this instance's user agent, cookie and proxies.

        :return: ``content`` of the response.
        :raises MfCookieValidException: when the response status is 302.
        """
        search_url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
        accept = ('text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8')
        request_headers = {
            'Accept': accept,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'ehire.51job.com',
            'Upgrade-Insecure-Requests': '1',
        }
        request_headers['User-Agent'] = self.user_agent
        request_headers['Cookie'] = self.cookie

        response = self.html_downloader.download(
            search_url,
            headers=request_headers,
            proxies=self.proxies,
            allow_redirects=False)

        if response.status_code != 302:
            self.logger.info('初始化search_page成功.')
            return response.content

        # A 302 is treated as "cookie no longer valid". Code that blanked the
        # stored cookie via self.h.hset(...) used to live here; it is disabled.
        self.logger.warning('cookie invalid.')
        raise MfCookieValidException('cookie_invalid')
示例#2
0
    def get_resume_list(self, page=1, **params):
        """Fetch one search-result page and collect resume links not yet seen.

        The URL is built from ``params['city_url']``, ``page``,
        ``params['degree']`` (default ``'4'``), ``params['keyword']`` and a
        post-date window that ends tomorrow and spans ``settings.SEARCH_DAY``
        days; the age filter is fixed to ``18_30``.

        :param page: page number placed in the ``pn`` URL segment.
        :param params: search options; ``city_url``, ``degree`` and
            ``keyword`` are read here.
        :return: ``(has_next, urls)`` where ``has_next`` tells whether the
            pager shows a "next" link and ``urls`` holds the links that passed
            ``do_filter``, minus the last collected one.
        :raises MfCookieValidException: when the final response URL contains
            ``passport.58.com``.
        """
        username = self.auth_kwargs['username'].encode('utf-8')
        params_dump = json.dumps(params, ensure_ascii=False).encode('utf-8')
        self.logger.info('开始执行搜索任务, USER: {} PAGE: {} {}'.format(
            username, page, params_dump))

        window_end = datetime.datetime.now() + datetime.timedelta(days=1)
        window_start = window_end - datetime.timedelta(
            days=settings.SEARCH_DAY)
        post_date = (datetime2str(window_start, fmt='%Y%m%d') + '000000_' +
                     datetime2str(window_end, fmt='%Y%m%d') + '000000')

        search_url = '{}pn{}/pve_5593_{}/?key={}&age=18_30&postdate={}'.format(
            params.get('city_url'), page, params.get('degree', '4'),
            params.get('keyword').encode('utf-8'), post_date)

        request_headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'cookie': self.cookie,
            'pragma': 'no-cache',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/68.0.3440.84 Safari/537.36',
        }
        res = self.html_downloader.download(search_url,
                                            headers=request_headers,
                                            proxies=self.proxies)
        # Landing on the passport host means the session was bounced to login.
        if 'passport.58.com' in res.url:
            raise MfCookieValidException('cookie invalid. {}'.format(
                self.auth_kwargs['username'].encode('utf-8')))

        real_html = self.get_real_html(res.content)
        info_list = self.html_parser.parser(real_html).find('div',
                                                            id='infolist')
        entries = info_list.find_all('dl')

        pager = self.html_parser.parser(real_html).find('div',
                                                        class_='pagerout')
        has_next = bool(pager.find('a', class_='next'))

        fresh_links = []
        for entry in entries:
            link = entry.find('dt').find('a').get('href')
            resume_id = self.get_resume_id(link)
            if not self.do_filter(resume_id):
                fresh_links.append(link)
                continue
            self.logger.info('简历: {}, {}天内已被采集过'.format(
                resume_id, settings.DELAY_DAY))

        # Random 1-2 second pause before returning to the caller.
        time.sleep(random.randint(1, 2))
        # NOTE(review): the last collected link is always dropped - confirm
        # this is intentional.
        return has_next, fresh_links[:-1]
示例#3
0
    def get_resume_list_lp(self, page, **kwargs):
        """Search resumes in Liepin mode and return one page of results.

        Posts the search form to ``liepinSearch.htm`` with 30 rows per page;
        the offset is derived from ``page``.

        :param page: 1-based page number.
        :param kwargs: must contain ``area_code`` and ``job_code`` (sent in
            the form) and ``area_name`` / ``job_name`` (used for logging).
        :return: the ``list`` entry of the JSON response, or ``None`` when
            that entry is missing or empty.
        :raises MfCookieValidException: when the response status is 302.
        """
        url = 'http://www.fenjianli.com/search/liepinSearch.htm'
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.fenjianli.com',
            'Origin': 'http://www.fenjianli.com',
            'Pragma': 'no-cache',
            'Referer': 'http://www.fenjianli.com/search/liepinHome.htm',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
        }
        data = {
            'searchNear': 'on',
            'areas': kwargs['area_code'],
            'hJobs': kwargs['job_code'] + ',',
            'rows': '30',
            'sortBy': '1',
            'sortType': '1',
            'degree': '0-0',
            'offset': 30 * (page - 1),
            '_random': str(random.random()),
        }
        res = self.html_downloader.download(url,
                                            method='POST',
                                            headers=headers,
                                            data=data,
                                            proxies=self.proxies,
                                            allow_redirects=False)

        # A redirect is treated as an expired or invalid cookie.
        if res.status_code == 302:
            self.logger.info('cookie_invalid: %s' % res.content)
            raise MfCookieValidException('cookie_invalid')

        # Decode the body once; it is needed for the check, the log lines and
        # the return value.
        payload = res.json()
        resumes = payload.get('list')

        if not resumes:
            self.logger.warning('%s' %
                                json.dumps(payload, ensure_ascii=False))
            return

        self.logger.info('Page: %s 获取到%s份匹配的简历 [%s-%s]' %
                         (page, len(resumes),
                          kwargs['area_name'].encode('utf-8'),
                          kwargs['job_name'].encode('utf-8')))

        return resumes
示例#4
0
    def get_resume_detail(self, resume_id, mode='zl'):
        """Request the JSON detail of one resume from ``getDetail.htm``.

        :param resume_id: resume identifier; posted as ``id`` and
            base64-encoded into the Referer header.
        :param mode: ``'zl'`` selects the ``detail.htm`` Referer, any other
            value selects the ``liepinDetail.htm`` Referer.
        :return: the decoded JSON body of the response.
        :raises MfCookieValidException: when the response status is 302 or
            the body contains the "login abnormal" marker text.
        """
        detail_url = 'http://fenjianli.com/search/getDetail.htm'
        form = {'id': resume_id, '_random': str(random.random())}

        # Only the page name in the Referer depends on the mode.
        if mode == 'zl':
            referer_page = 'detail.htm?ids='
        else:
            referer_page = 'liepinDetail.htm?ids='
        referer = ('http://www.fenjianli.com/search/' + referer_page +
                   base64.b64encode(resume_id))

        request_headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'fenjianli.com',
            'Origin': 'http://fenjianli.com',
            'Pragma': 'no-cache',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': referer,
        }

        response = self.html_downloader.download(
            detail_url,
            method='POST',
            headers=request_headers,
            data=form,
            proxies=self.proxies,
            allow_redirects=False)

        body = response.content
        if response.status_code == 302:
            self.logger.warning('cookie_invalid: %s' % body)
            raise MfCookieValidException('cookie_invalid_from_detail')

        # The site can also answer 200 with an error text instead of data.
        if '登录异常,请联系客服处理' in body:
            self.logger.info('cookie_invalid: %s' % body)
            raise MfCookieValidException('cookie_invalid_from_detail')

        self.logger.info('获取简历详情成功: %s' % resume_id.encode('utf-8'))
        return response.json()
示例#5
0
    def add_to_folder_catalog(self, resume_id, ids, folder_catalog_id):
        """Add a resume to a folder catalog of type ``Download``.

        Posts the form to ``addToFolderCatalog.htm`` with redirects disabled.

        :param resume_id: resume identifier, base64-encoded into the Referer.
        :param ids: value posted as ``ids``.
        :param folder_catalog_id: value posted as ``folderCatalogId``.
        :return: the decoded JSON body of the response.
        :raises MfCookieValidException: when the response status is 302 or
            the JSON body contains an ``error`` key.
        """
        url = 'http://www.fenjianli.com/userResumeDetail/' \
              'addToFolderCatalog.htm'
        # No Content-Length is set here: the form body varies in size, so a
        # fixed value would be wrong; the HTTP client is left to compute it.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.fenjianli.com',
            'Origin': 'http://www.fenjianli.com',
            'Pragma': 'no-cache',
            'Referer': 'http://www.fenjianli.com/search/detail.htm?type=1'
                       '&ids=%s' % base64.b64encode(resume_id),
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
        }
        data = {
            'ids': ids,
            'folderCatalogId': folder_catalog_id,
            'folderCatalogType': 'Download',
            'type': 'add',
            'isResumeId': 'true',
            '_random': str(random.random()),
        }
        res = self.html_downloader.download(url,
                                            method='POST',
                                            headers=headers,
                                            data=data,
                                            allow_redirects=False,
                                            proxies=self.proxies)

        # Only decode the body when the request was not redirected, and decode
        # it exactly once.
        payload = None
        failed = res.status_code == 302
        if not failed:
            payload = res.json()
            failed = 'error' in payload

        if failed:
            self.logger.warning('cookie_invalid: %s' % res.content)
            raise MfCookieValidException('cookie_invalid')

        self.logger.info('%s' % res.content)
        return payload
示例#6
0
    def get_cookie(self):
        """Load the cookie stored for this account and keep it on the instance.

        The raw pool value is first stored in ``self.cookie``; when present it
        is evaluated and converted with ``cookie2str``.

        :return: ``(username, cookie)`` with the converted cookie.
        :raises Exception: when ``auth_kwargs`` has no username.
        :raises MfCookieValidException: when the pool has no cookie for the
            account.
        """
        username = self.auth_kwargs.get('username')
        if not username:
            raise Exception('username is empty.')

        pool_key = '{username}'.format(username=username.encode('utf-8'))
        self.cookie = self.cookie_pool.hget(pool_key)
        if not self.cookie:
            raise MfCookieValidException('cookie invalid.')

        # NOTE(review): eval() executes whatever text the cookie pool returns.
        # If the pool can hold data that is not fully trusted, this should be
        # replaced by a safe parser (for example ast.literal_eval or json).
        cookie_items = eval(self.cookie)
        self.cookie = cookie2str(cookie_items)

        return username, self.cookie
示例#7
0
    def get_resume_detail(self, url):
        """Download one resume page and return its id and decoded HTML.

        :param url: resume URL; it is URL-unquoted before the request.
        :return: ``(resume_id, real_html)``; ``real_html`` is ``None`` when
            ``get_real_html`` raised for the response body.
        :raises Exception: ``'account_limit'`` when no cookie is set.
        :raises MfCookieValidException: when the final response URL contains
            ``passport.58.com``.
        """
        request_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': self.cookie,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/68.0.3440.84 Safari/537.36',
        }
        url = unquote(url)

        if not self.cookie:
            raise Exception('account_limit')

        # The cookie is sent both as a raw header and as a parsed dict.
        res = self.html_downloader.download(
            url,
            headers=request_headers,
            proxies=self.proxies,
            cookies=cookie2dict(self.cookie))

        if 'passport.58.com' in res.url:
            raise MfCookieValidException('cookie invalid.')

        resume_id = self.get_resume_id(res.url)

        # Best effort: a page that cannot be decoded is logged and reported
        # as None instead of aborting the caller.
        real_html = None
        try:
            real_html = self.get_real_html(res.content)
        except Exception:
            self.logger.info(res.content)

        time.sleep(random.randint(1, 2))
        return resume_id, real_html
示例#8
0
    def get_resume_detail(self, resume_args):
        """Fetch the detail JSON of one resume from ihr.zhaopin.com.

        :param resume_args: mapping providing ``resumeNo``, ``t`` and ``k``
            for the query string.
        :return: the ``data`` entry of the response when ``code`` is 1,
            otherwise ``None``.
        :raises MfCookieValidException: when the response ``code`` is 6001;
            ``set_cookie_invalid`` is called first.
        """
        # NOTE(review): access_token and keyword=java are hard-coded in this
        # URL - confirm whether they should come from the instance/arguments.
        url = 'https://ihr.zhaopin.com/resumesearch/getresumedetial.do?' \
              'access_token=46e7d16be97a4f3ba9ca7beb2c42f8a8&' \
              'resumeNo=%s&searchresume=1&resumeSource=1&keyword=java' \
              '&t=%s&k=%s&v=0&version=1' \
              '&openFrom=1' % (resume_args['resumeNo'], resume_args['t'],
                               resume_args['k'])
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'ihr.zhaopin.com',
            'Pragma': 'no-cache',
            'Referer': url,
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
        }

        res = self.html_downloader.download(url,
                                            headers=headers,
                                            proxies=self.proxies)

        # Decode the body once and reuse it for every check below.
        payload = res.json()
        code = payload.get('code')

        if code == 6001:
            self.logger.info(self.logger_prefix + 'cookie失效了')
            self.set_cookie_invalid()
            raise MfCookieValidException('cookie_invalid')

        succeeded = code == 1
        if succeeded:
            template = '获取简历详情成功: %s - %s'
        else:
            template = '获取简历详情失败: %s - %s'
        # The last four characters of resumeNo are left out of the log line.
        self.logger.info(
            self.logger_prefix + template %
            (self.auth_kwargs['username'].encode('utf-8'),
             resume_args.get('resumeNo').encode('utf-8')[:-4]))

        if not succeeded:
            return None
        return payload.get('data')
示例#9
0
    def get_resume_detail(self, resume_args):
        """Fetch the detail JSON of one resume from ihr.zhaopin.com.

        The query string is built from ``self.access_token`` plus the ``id``
        and ``valResumeTimeStr`` entries of ``resume_args``.

        :param resume_args: mapping providing ``id``, ``valResumeTimeStr``
            and ``number`` (the latter is only used for logging).
        :return: the ``data`` entry of the response when ``code`` is 1,
            otherwise ``None``.
        :raises MfCookieValidException: when the response ``code`` is 6001;
            ``set_cookie_invalid`` is called first.
        """
        url = 'http://ihr.zhaopin.com/resumesearch/getresumedetial.do' \
              '?access_token=%s&resumeNo=%s_1&searchresume=1&resumeSource=1' \
              '&%s' % (self.access_token,
                       resume_args.get('id'),
                       resume_args.get('valResumeTimeStr'))

        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'ihr.zhaopin.com',
            'Pragma': 'no-cache',
            'Referer': url,
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
        }

        res = self.html_downloader.download(url,
                                            headers=headers,
                                            proxies=self.proxies)

        # Decode the body once and reuse it for every check below.
        payload = res.json()
        code = payload.get('code')

        if code == 6001:
            self.logger.info('cookie失效了')
            self.set_cookie_invalid()
            raise MfCookieValidException('cookie_invalid')

        succeeded = code == 1
        if succeeded:
            template = '获取简历详情成功: %s - %s'
        else:
            template = '获取简历详情失败: %s - %s'
        self.logger.info(self.logger_prefix + template %
                         (self.auth_kwargs['username'].encode('utf-8'),
                          resume_args.get('number').encode('utf-8')))

        if not succeeded:
            return None
        return payload.get('data')
示例#10
0
    def download_resume(self, resume_id):
        """Download a resume by adding it to a folder catalog.

        Steps: fetch the resume detail, call
        ``find_customer_channel_account``, read the folder tree, extract a
        folder catalog id from it, then call ``add_to_folder_catalog``.

        :param resume_id: identifier of the resume to download.
        :return: the result of a second ``get_resume_detail`` call when the
            add-to-folder result contains the success marker, else ``None``.
        :raises MfCookieValidException: when the first detail request returns
            nothing.
        """
        resume_detail = self.get_resume_detail(resume_id)
        if not resume_detail:
            self.logger.info('获取简历详情失败. %s' % resume_id.encode('utf-8'))
            raise MfCookieValidException('get_resume_detail_failed.')

        self.find_customer_channel_account(resume_id)
        tree_folder = self.tree_of_folder_catalog(resume_id)
        # Raw string so that \d reaches the regex engine unchanged. The first
        # id that is followed by ", pId:<1-9>" is used; an IndexError is
        # raised if the tree text contains no such id.
        folder_catalog_id = re.findall(r'(?<=id:)\d+(?=, pId:[1-9])',
                                       tree_folder.get('data'))[0]
        ids = resume_id.encode('utf-8') + '/' + resume_detail.get(
            'name').encode('utf-8')
        self.logger.info('%s: 开始执行下载操作: %s - %s' %
                         (self.auth_kwargs['username'].encode('utf-8'), ids,
                          folder_catalog_id.encode('utf-8')))

        res = self.add_to_folder_catalog(resume_id, ids, folder_catalog_id)
        # NOTE(review): if add_to_folder_catalog returns a dict, this `in`
        # test checks its keys rather than the message text - confirm.
        if '下载简历成功[1]份' in res:
            self.logger.info('下载成功')
            # Fetch the detail again now that the resume has been downloaded.
            return self.get_resume_detail(resume_id)
        else:
            self.logger.info('下载失败: %s' % res)
            return
示例#11
0
    def get_resume_detail(self, resume_url):
        """Download the body of one 51job resume page.

        :param resume_url: path appended to ``https://ehire.51job.com/``.
        :return: the response content when it contains the resume-id marker.
        :raises MfCookieValidException: when the response status is 302.
        :raises Exception: when the marker is absent; before raising, the
            captcha flow is started via ``get_captcha`` and the call sleeps
            for 60 seconds.
        """
        target = 'https://ehire.51job.com/' + resume_url
        accept = ('text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8')
        request_headers = {
            'Accept': accept,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'ehire.51job.com',
            'Referer':
                'https://ehire.51job.com/Candidate/SearchResumeNew.aspx',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
        }

        res = self.html_downloader.download(target,
                                            headers=request_headers,
                                            proxies=self.proxies,
                                            allow_redirects=False)

        if res.status_code == 302:
            # Code that blanked the stored cookie via self.h.hset(...) used to
            # run here; it is disabled.
            self.logger.warning('cookie invalid.')
            raise MfCookieValidException('cookie_invalid')

        if '简历ID:' not in res.content:
            self.logger.warning('获取简历正文失败 %s' %
                                self.auth_kwargs['username'].encode('utf-8'))

            hidden_key = self.html_parser.parser(res.content).find(
                'input', id='hidAccessKey')
            access_key = hidden_key.get('value')

            # A robot_login.send_markdown(...) notification about the captcha
            # on the detail page was previously sent here; it is disabled.
            self.get_captcha(referer=res.url, access_key=access_key)

            time.sleep(60)
            raise Exception

        self.logger.info('获取简历正文成功 %s' %
                         self.auth_kwargs['username'].encode('utf-8'))
        return res.content
示例#12
0
    def get_resume_list(self,
                        previous_page_html,
                        action='pagerTopNew$ctl03',
                        **search_args):
        """Post the 51job search form and collect recent resume links.

        Hidden form fields (``__VIEWSTATE``, ``hidAccessKey`` and others) are
        copied from ``previous_page_html`` into the POST body.

        :param previous_page_html: HTML of the previously fetched search page.
        :param action: value sent as ``__EVENTTARGET``.
        :param search_args: must contain ``city`` (e.g. ``'北京|010000'``),
            ``keywords`` (e.g. ``'销售代表|3001'``) and ``use_keywords``;
            a falsy ``use_keywords`` switches to a job-function search.
        :return: ``(resume_list, html)`` where ``resume_list`` holds the
            collected links and ``html`` is the result page content, or
            ``''`` when no rows or no pager were found.
        :raises MfCookieValidException: when the response status is 302.
        :raises FiveOneResumeException: when the active page number exceeds
            ``settings.TASK_PAGE_LIMIT``.
        :raises Exception: after the captcha flow has been started because
            more than 10 empty result pages were recorded for this account.
        """
        # Declared once, before any assignment in this function: a second
        # declaration after an assignment is a SyntaxError on Python 3.
        global DAY_LIMITED

        url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
            'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'ehire.51job.com',
            'Origin': 'https://ehire.51job.com',
            'Pragma': 'no-cache',
            'Referer': 'https://ehire.51job.com/Candidate/'
            'SearchResumeNew.aspx',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }

        _soups = self.html_parser.parser(previous_page_html)

        def _hidden(field_id):
            # Value of the hidden <input> with this id on the previous page.
            return _soups.find('input', id=field_id).get('value')

        data = {
            '__EVENTTARGET': action,
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': _hidden('__VIEWSTATE'),
            'ctrlSerach$search_keyword_txt':
            search_args['keywords'].split('|')[0],
            'ctrlSerach$search_company_txt': '',
            'ctrlSerach$search_area_input': '',
            'ctrlSerach$search_area_hid': '',
            'ctrlSerach$search_funtype_hid': '',
            'ctrlSerach$search_expectsalaryf_input': '不限',
            'ctrlSerach$search_expectsalaryt_input': '不限',
            'ctrlSerach$search_industry_hid': '',
            'ctrlSerach$search_wyf_input': '不限',
            'ctrlSerach$search_wyt_input': '不限',
            'ctrlSerach$search_df_input': '不限',
            'ctrlSerach$search_dt_input': '不限',
            'ctrlSerach$search_cursalaryf_input': '不限',
            'ctrlSerach$search_cursalaryt_input': '不限',
            'ctrlSerach$search_age_input': '年龄:18-30',
            'ctrlSerach$search_agef_input': '18',
            'ctrlSerach$search_aget_input': '30',
            'ctrlSerach$search_expjobarea_input':
            search_args['city'].split('|')[0],
            'ctrlSerach$search_expjobarea_hid': search_args['city'],
            'ctrlSerach$search_forlang_input': '语言',
            'ctrlSerach$search_fl_input': '不限',
            'ctrlSerach$search_fllsabilityll_input': '不限',
            'ctrlSerach$search_englishlevel_input': '英语等级',
            'ctrlSerach$search_sex_input': '性别',
            'ctrlSerach$search_major_input': '专业',
            'ctrlSerach$search_major_hid': '',
            'ctrlSerach$search_hukou_input': '户口',
            'ctrlSerach$search_hukou_hid': '',
            'ctrlSerach$search_rsmupdate_input': '近1周',
            'ctrlSerach$search_jobstatus_input': '求职状态',
            'send_cycle': '1',
            'send_time': '7',
            'send_sum': '10',
            'ctrlSerach$hidSearchValue':
            u'%s##0#######20#35############近1周|1##1#0##%s#0#0#0' %
            (search_args['keywords'].split('|')[0], search_args['city']),
            'ctrlSerach$hidKeyWordMind': '',
            'ctrlSerach$hidRecommend': '',
            'ctrlSerach$hidWorkYearArea': '',
            'ctrlSerach$hidDegreeArea': '',
            'ctrlSerach$hidSalaryArea': '',
            'ctrlSerach$hidCurSalaryArea': '',
            'ctrlSerach$hidIsRecDisplay': '1',
            'showselected': '',
            'pagerTopNew$ctl06': '50',
            'cbxColumns$0': 'AGE',
            'cbxColumns$1': 'WORKYEAR',
            'cbxColumns$2': 'SEX',
            'cbxColumns$3': 'AREA',
            'cbxColumns$4': 'WORKFUNC',
            'cbxColumns$5': 'TOPDEGREE',
            'cbxColumns$6': 'LASTUPDATE',
            'hidAccessKey': _hidden('hidAccessKey'),
            'hidShowCode': '0',
            'hidDisplayType': '1',
            'hidEhireDemo': '',
            'hidUserID': '',
            'hidCheckUserIds': _hidden('hidCheckUserIds'),
            'hidCheckKey': _hidden('hidCheckKey'),
            'hidEvents': '',
            'hidNoSearchIDs': '',
            'hidBtnType': '',
            'hideMarkid': '',
            'hidStrAuthority': _hidden('hidStrAuthority'),
            'hidDownloadNum': _hidden('hidDownloadNum'),
            'hidKeywordCookie': '',
            'showGuide': '',
        }

        if not search_args['use_keywords']:
            self.logger.info('采用职能进行搜索.')
            data['ctrlSerach$search_keyword_txt'] = ''
            data['ctrlSerach$search_funtype_hid'] = search_args['keywords']
            # NOTE(review): this writes 'hidSearchValue', while the field
            # filled in above is 'ctrlSerach$hidSearchValue' - confirm which
            # key the server actually reads.
            data['hidSearchValue'] = \
                u'##0#%s######20#35############近1周|1##1#0##%s#0#0#0' \
                % (search_args['keywords'], search_args['city'])
        else:
            self.logger.info('采用关键词进行搜索.')

        res = self.html_downloader.download(url,
                                            method='POST',
                                            headers=headers,
                                            data=data,
                                            proxies=self.proxies,
                                            allow_redirects=False)
        if res.status_code == 302:
            # Code that blanked the stored cookie via self.h.hset(...) used to
            # run here; it is disabled.
            self.logger.warning('cookie invalid.')
            raise MfCookieValidException('cookie_invalid')

        # Parse the result page once and reuse the tree below.
        result_soup = self.html_parser.parser(res.content)
        access_key = result_soup.find('input', id='hidAccessKey').get('value')
        soups = result_soup.find_all('td', class_='Common_list_table-id-text')

        resume_list = []
        username = self.auth_kwargs['username']

        if not soups:
            # Empty result pages are counted per account; once the stored
            # counter exceeds 10 the captcha flow is started and the counter
            # is reset.
            stored_times = self.h_search_empty_times.hget(username)
            empty_times = int(stored_times) if stored_times else 0
            if empty_times > 10:
                self.logger.warning(
                    '搜索列表遇到验证码. %s' % username.encode('utf-8'))
                self.get_captcha(referer=res.url,
                                 access_key=access_key,
                                 do_type='CheckSearchResume')
                self.h_search_empty_times.hset(username, 0)
                raise Exception

            self.logger.warning(
                '未匹配到搜索结果,跳过该任务[%s, %s, %s]' %
                (username.encode('utf-8'),
                 search_args['keywords'].encode('utf-8'),
                 search_args['city'].encode('utf-8')))
            self.h_search_empty_times.hset(username, empty_times + 1)
            return resume_list, ''

        # The original Monday and non-Monday branches were identical (a wider
        # Monday look-back had been commented out), so one computation is
        # enough.
        # NOTE(review): the cutoff is always one day back, while the log line
        # below reports settings.DAY_LIMITED days - confirm which is intended.
        DAY_LIMITED = settings.DAY_LIMITED
        cutoff_date = (datetime.datetime.now() -
                       datetime.timedelta(days=1)).date()

        for soup in soups:
            # The second-to-last cell of the row is parsed as a date.
            ref_time = soup.find_parent().find_all('td')[-2].text.encode(
                'utf-8')
            if str2datetime(ref_time, '%Y-%m-%d').date() < cutoff_date:
                # Stop at the first row older than the cutoff.
                self.logger.warning('匹配到%s天前的简历,执行跳过操作.' % DAY_LIMITED)
                break
            resume_list.append(soup.find('a').get('href'))

        try:
            page = result_soup.find(
                'div', class_='Search_page-numble').find(
                    'a', class_='active').get('title').encode('utf-8')
        except Exception:
            self.logger.warning('未找到分页组件,跳过该任务[%s, %s]' %
                                (search_args['keywords'].encode('utf-8'),
                                 search_args['city'].encode('utf-8')))
            return resume_list, ''

        self.logger.info(
            'page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]' %
            (page, len(resume_list), search_args['keywords'].encode('utf-8'),
             search_args['city'].encode('utf-8')))

        if int(page) > settings.TASK_PAGE_LIMIT:
            raise FiveOneResumeException('task_page_limit')

        return resume_list, res.content
示例#13
0
    def get_resume_list(self, page=1, is_download=False, **search_args):
        """
        获取简历列表页
        搜索条件: 关键词/所在地/年龄20-35/学历/最近三天upDate
        :param page:
        :param search_args:
        :return:
        """
        url = 'https://ihr.zhaopin.com/resumesearch/search.do?' \
              'access_token=%s' % self.access_token
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'ihr.zhaopin.com',
            'Origin': 'https://ihr.zhaopin.com',
            'Pragma': 'no-cache',
            'Referer': 'https://ihr.zhaopin.com/resumesearch/search/',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
        }

        data = {
            'keywords': search_args['keywords'].split('|')[0].encode('utf-8'),
            'startNum': (page - 1) * 30,
            'rowsCount': '30',
            'resumeGrade': '',
            'sortColumnName': 'sortUpDate',
            'sortColumn': 'sortUpDate desc',
            'onlyHasImg': 'false',
            'anyKeyWord': 'false',
            'hopeWorkCity': search_args['city'].split('|')[1].encode('utf-8'),
            'ageStart': search_args.get('age_start', '18'),
            'ageEnd': search_args.get('age_end', '30'),
            'workYears': search_args.get('work_years', ''),
            'liveCity': search_args.get('live_city', ''),
            'sex': search_args.get('sex', ''),
            'edu': search_args.get('degree', '5'),
            'upDate': search_args.get('up_date', ''),  # 默认搜索最近三天
            'companyName': search_args.get('company_name', ''),
            'jobType': '',
            'desiredJobType': search_args.get('desired_job_type', ''),
            'industry': search_args.get('industry', ''),
            'desiredIndustry': '',
            'careerStatus': '',
            'desiredSalary': '',
            'langSkill': '',
            'hukouCity': '',
            'major': '',
            'onlyLastWork': 'false',
        }
        # print(json.dumps(data, ensure_ascii=False, indent=4))
        if search_args['use_keywords'] is False:
            data['desiredJobType'] = search_args['keywords'].split('|')[1]
            self.logger.info('采用职能进行搜索.')
        else:
            self.logger.info('采用关键词进行搜索')

        res = self.html_downloader.download(url,
                                            method='POST',
                                            data=data,
                                            headers=headers,
                                            proxies=self.proxies)
        # self.logger.info('搜索返回 %s' % res.json())
        if res.json().get('code') == 6001:
            self.logger.info(self.logger_prefix + 'cookie失效了')
            self.set_cookie_invalid()
            raise MfCookieValidException('cookie_invalid')

        if res.json().get('code') == 808:
            self.logger.warning(self.logger_prefix +
                                res.json().get('message').encode('utf-8'))
            today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            self.h_over_search_limit.hset(
                today + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
            # 当日搜索大库简历已达上限

            global LIMIT_MESSAGE_BOX
            if not LIMIT_MESSAGE_BOX.get(
                    self.auth_kwargs['username'].encode('utf-8'), ''):
                LIMIT_MESSAGE_BOX[self.auth_kwargs['username'].encode(
                    'utf-8')] = 1
                self.robot_login.send_markdown(
                    title="智联简历搜索",
                    content="#### 智联简历当日关键词搜索量已达上限.\n"
                    "- 帐号: %s\n"
                    "- 密码: %s\n"
                    "- 代理: %s\n"
                    "- 达到上限账号总数: %s\n" %
                    (self.auth_kwargs['username'].encode('utf-8'),
                     self.auth_kwargs['password'].encode('utf-8'),
                     self.auth_kwargs['ip'].encode('utf-8') + ':' +
                     self.auth_kwargs['port'].encode('utf-8'),
                     len(LIMIT_MESSAGE_BOX)))

            raise ZhiLianResumeException('user_record_limited')

        try:
            resume_list = res.json().get('results')
            if not resume_list:
                raise Exception
        except Exception as e:
            self.logger.exception('获取list失败: %s | %s' % (str(e), res.content))
            return []

        resume_accept_list = []
        for resume in resume_list:
            # global DAY_LIMITED
            # DAY_LIMITED = 5
            # limited_day = datetime.datetime.now() - datetime.timedelta(
            #     days=4
            # )
            if is_download is False:
                if datetime.datetime.now().isoweekday() == 1:
                    # 周一
                    # global DAY_LIMITED
                    # DAY_LIMITED = 5
                    # limited_day = datetime.datetime.now() - datetime.timedelta(
                    #     days=4)
                    global DAY_LIMITED
                    DAY_LIMITED = 2
                    limited_day = datetime.datetime.now() - datetime.timedelta(
                        days=1)
                else:
                    global DAY_LIMITED
                    DAY_LIMITED = 2
                    limited_day = datetime.datetime.now() - datetime.timedelta(
                        days=1)

                if str2datetime(resume.get('modifyDate'),
                                '%Y-%m-%d').date() < limited_day.date():
                    self.logger.warning('匹配到%s天前的简历,执行跳过操作.' % DAY_LIMITED)
                    break
            resume_accept_list.append(resume)

        self.logger.info('page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]' %
                         (page, len(resume_accept_list),
                          search_args['keywords'].encode('utf-8'),
                          search_args['city'].encode('utf-8')))
        return resume_accept_list
示例#14
0
    def get_resume_list(self, page=1, **search_args):
        """
        Fetch one page of resume search results and return its entries.

        A GET request is built from ``search_args`` and sent with
        ``self.cookie``; the rows of the ``frmResult`` form are then read in
        order until a row dated before yesterday is met, at which point
        reading stops.

        NOTE(review): the ``.encode('utf-8')`` calls look like Python 2
        string handling (unicode -> str before ``%`` formatting) - confirm
        before running this under Python 3.

        :param page: page index, sent as ``pageIndex``.
        :param search_args: search conditions. Keys read here:
            ``use_keywords`` - when it is ``False`` the second ``|``-separated
            part of ``keywords`` is sent as ``SF_1_1_2``, otherwise the first
            part is sent as ``SF_1_1_1``;
            ``keywords`` - ``|``-separated string;
            ``city`` - ``|``-separated string, its second part is sent as
            ``SF_1_1_18``;
            ``degree`` - sent as the first value of ``SF_1_1_5``.
        :return: list of dicts with the keys ``resumeNo``, ``t`` and ``k``;
            an empty list when the result table cannot be located.
        :raises MfCookieValidException: when the response status is 302,
            which is handled as an invalid cookie.
        """
        # Declared exactly once and before any assignment: a second
        # declaration after an assignment is a SyntaxError on Python 3.
        global DAY_LIMITED

        # The two search modes differ only in the query parameter name and
        # in which part of ``keywords`` is sent.
        if search_args['use_keywords'] is False:
            self.logger.info('采用职能进行搜索.')
            keyword_param = 'SF_1_1_2'
            keyword_index = 1
        else:
            self.logger.info('采用关键词进行搜索.')
            keyword_param = 'SF_1_1_1'
            keyword_index = 0

        # NOTE(review): SF_1_1_8=18,30 is presumably the age range and
        # SF_1_1_7=1,9 the "recently updated" filter - confirm against the
        # site's search form.
        url = ('https://ihrsearch.zhaopin.com/Home/ResultForCustom?'
               '%s=%s&'
               'SF_1_1_18=%s&'
               'orderBy=DATE_MODIFIED,1&'
               'pageSize=30&'
               'SF_1_1_27=0&'
               'SF_1_1_5=%s,16&'
               'SF_1_1_8=18,30&'
               'SF_1_1_7=1,9&'
               'exclude=1&pageIndex=%s') % (
                   keyword_param,
                   search_args['keywords'].split('|')[keyword_index].encode(
                       'utf-8'),
                   search_args['city'].split('|')[1].encode('utf-8'),
                   search_args['degree'],
                   page)

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
            'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'ihrsearch.zhaopin.com',
            'Pragma': 'no-cache',
            # The referer is the very same address as the request itself.
            'Referer': url,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }

        res = self.html_downloader.download(url,
                                            headers=headers,
                                            proxies=self.proxies,
                                            allow_redirects=False)

        # Redirects are not followed, so a 302 is visible here and is
        # handled as an invalid cookie.
        if res.status_code == 302:
            self.logger.warning('cookie失效了')
            self.set_cookie_invalid()
            raise MfCookieValidException('cookie_invalid')

        try:
            # Parse the page once and reuse the table body for both row sets.
            tbody = self.html_parser.parser(res.content).find(
                'form', attrs={
                    'name': 'frmResult'
                }).find('tbody')
            info_rows = tbody.find_all('tr', class_='info')
            date_rows = tbody.find_all('tr', valign='middle')
        except Exception as e:
            self.logger.exception('获取resume_list失败: %s' % str(e))
            return []

        # The two row sets are paired by position; report a mismatch instead
        # of failing with an IndexError half way through the page.
        if len(info_rows) != len(date_rows):
            self.logger.warning(
                'resume rows and date rows differ in count: %s vs %s' %
                (len(info_rows), len(date_rows)))

        # Same cut-off on every weekday. A wider Monday window existed in
        # the commented-out code of earlier revisions and is not active.
        limited_day = (datetime.datetime.now() -
                       datetime.timedelta(days=1)).date()

        resume_list = []
        for info_row, date_row in zip(info_rows, date_rows):
            # Module-level value, only touched when a row is processed.
            DAY_LIMITED = 2

            modified_day = str2datetime(
                date_row.find_all('td')[-1].text.encode('utf-8'),
                '%y-%m-%d').date()
            if modified_day < limited_day:
                # The query asks for orderBy=DATE_MODIFIED, so the remaining
                # rows are presumably older as well - stop reading here.
                self.logger.warning('匹配到%s天前的简历,执行跳过操作.' % DAY_LIMITED)
                break

            link = info_row.find('a')
            resume_list.append({
                'resumeNo': link.get('resumeurlpart'),
                't': link.get('t'),
                'k': link.get('k'),
            })

        self.logger.info(
            'page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]' %
            (page, len(resume_list), search_args['keywords'].encode('utf-8'),
             search_args['city'].encode('utf-8')))
        return resume_list