Example #1
def main(check_day):
    es = Elasticsearch(hosts='172.16.25.9')
    index = 'morgan-v3-%s' \
            % (datetime2str(str2datetime(check_day), fmt='%Y.%m.%d'))
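    # Query window: the hour before check_day, as epoch milliseconds for the
    # range filter below (the lower bound is widened by one extra millisecond).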
    start_time = int(
        time.mktime((str2datetime(check_day) -
                     datetime.timedelta(hours=1)).timetuple())) * 1000 - 1
    end_time = int(time.mktime((str2datetime(check_day)).timetuple())) * 1000

    body = {
        "version": True,
        "size": 10000,  # caps the number of hits returned
        "query": {
            "bool": {
                "must": [{
                    "query_string": {
                        "query": "log_message:\"判断是否需要下载 id=\"",
                        "analyze_wildcard": True
                    }
                }, {
                    "range": {
                        "@timestamp": {
                            "gte": start_time,
                            "lte": end_time,
                            "format": "epoch_millis"
                        }
                    }
                }],
                "must_not": []
            }
        },
        "_source": {
            "excludes": []
        },
        "aggs": {
            "2": {
                "date_histogram": {
                    "field": "@timestamp",
                    "interval": "30m",
                    "time_zone": "Asia/Shanghai",
                    "min_doc_count": 1
                }
            }
        }
    }
    res = es.search(index=index, body=body)
    hits = res.get('hits').get('hits')
    today = datetime2str(str2datetime(check_day), fmt='%Y-%m-%d')
    pools = RedisSet('ZHI_LIAN_AWAKE_FAILED-%s' % today)

    logger.info("匹配到%s个唤醒失败的简历 『%s - %s』。" % (len(hits), start_time, end_time))
    for hit in hits:
        log_message = hit.get('_source').get('log_message')
        # Extract the resume id with a regex
        normal_id = re.findall(r'(?<=id=)\d+(?=\s)', log_message)[0]
        pools.sadd(normal_id)
    logger.info('当前集合长度为: %s' % pools.scard())
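RedisSet above is a small project helper; only its sadd and scard calls are used here. A minimal sketch of an equivalent wrapper with plain redis-py (class name, host and port are assumptions, not the project's actual implementation):

import redis


class RedisSetSketch(object):
    """Stand-in for the project's RedisSet: one Redis SET per collection name."""

    def __init__(self, name, host='127.0.0.1', port=6379):
        self._client = redis.StrictRedis(host=host, port=port)
        self._name = name

    def sadd(self, value):
        # SADD is idempotent, so re-adding an id that was already stored is harmless.
        return self._client.sadd(self._name, value)

    def scard(self):
        # Number of distinct members, i.e. the deduplicated count logged above.
        return self._client.scard(self._name)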
Example #2
    def resume_search(self, **search_args):
        self.get_cookie()
        proxy = eval(self.auth_kwargs['proxy'])  # proxy is stored as a dict literal string
        ip = proxy['ip']
        port = proxy['port']

        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')

        # print(search_args)

        flag = False
        while True:

            awake_flow_no = self.h_awake_flow_no.hget('FIVE_ONE')
            if self.h_status.hget(awake_flow_no) == '400':
                self.logger.info("程序当前处于暂停状态.sleep 60s")
                time.sleep(60)
                continue

            if flag is False:
                # Scan the first result page
                page_html = self.init_search_page()
                resume_list, page_html = self.get_resume_list(
                    page_html, action='pagerTopNew$ctl00', **search_args)
                flag = True
            else:
                resume_list, page_html = self.get_resume_list(
                    page_html, **search_args)

            if not resume_list:
                raise FiveOneResumeException('resume_list_empty')

            time.sleep(random.randint(1, 5))
            for resume_args in resume_list:

                # is_limited returns True once the account hits its daily quota;
                # otherwise its return value seeds the usage counter below.
                count = self.is_limited(self.auth_kwargs['username'])
                if count is True:
                    raise FiveOneResumeException('user_record_limited')

                # Deduplicate resumes by the hidUserID embedded in the detail URL
                resume_id = re.findall(r'''(?<=hidUserID=)\d+?(?=&)''',
                                       resume_args)[0].encode('utf-8')

                last_search_day = self.h_black_list.hget(resume_id)
                if last_search_day:
                    distance = (
                        str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                        str2datetime(last_search_day, '%Y-%m-%d')).days
                else:
                    distance = DAY_LIMITED + 1
                if distance < DAY_LIMITED:
                    self.logger.warning('该简历%s天内已经被采集过: %s' %
                                        (DAY_LIMITED, resume_id))
                    continue
                self.h_black_list.hset(resume_id, today.replace('|', ''))
                resume_detail = self.get_resume_detail(resume_url=resume_args)
                if not resume_detail:
                    continue
                resume_uuid = str(uuid.uuid1())
                # content_origin = {'name': '', 'email': '', 'phone': '',
                #                   'html': resume_detail.decode('utf-8')}
                # content = json.dumps(content_origin, ensure_ascii=False)

                content = resume_detail.decode('utf-8')

                sql = '''INSERT INTO spider_search.resume_raw
                         (source, content, createBy, trackId, createtime,
                          email, emailJobType, emailCity, subject)
                         VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
                sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                             self.auth_kwargs['username'],
                             search_args['keywords'].split('|')[0],
                             search_args['city'].split('|')[0], str(resume_id))

                resume_update_time = ''
                msg_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": self.common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": self.auth_kwargs['username'],
                        'emailJobType': search_args['keywords'].split('|')[0],
                        'emailCity': search_args['city'].split('|')[0],
                        'subject': str(resume_id)
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH_AWAKE",
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': self.common_settings.CALL_SYSTEM_ID,
                }
                # self.mysql_handler.save(sql=sql, data=sql_value)
                res = self.save_data(sql=sql,
                                     data=sql_value,
                                     msg_data=msg_data)

                if res:
                    count += 1
                    self.h_use_record.hset(
                        self.auth_kwargs['username'] + today, count)
                    mysql_ = self.init_mysql(
                        user='******',
                        passwd='bi_admin#@1mofanghr',
                        host='172.16.25.1',
                        # user='******',
                        # passwd='bi_admin#@1mofanghr',
                        # host='10.0.3.52',
                        cursorclass=DictCursor,
                        cls_singleton=False)
                    sql = '''INSERT INTO spider.resume_awake_record
                             (source, position, city, raw_id, create_time, username)
                             VALUES ('FIVE_ONE', %s, %s, %s, now(), %s)'''
                    value = (search_args['keywords'].split('|')[0],
                             search_args['city'].split('|')[0], res,
                             self.auth_kwargs['username'])
                    mysql_.save(sql, value)

                time.sleep(random.randint(5, 7))
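The h_black_list hash above implements a per-resume deduplication window: the last collection date is stored per resume_id, and anything seen within DAY_LIMITED days is skipped. A minimal standalone sketch of that check with plain redis-py (the hash name and window size are assumptions, not the project's configuration):

import datetime

import redis

DAY_LIMITED = 5  # assumed window; the project reads it from settings


def seen_recently(client, resume_id, window_days=DAY_LIMITED):
    """True if resume_id was collected within window_days days; otherwise
    record today's date for it and return False."""
    today = datetime.date.today()
    last = client.hget('h_black_list', resume_id)  # hypothetical hash name
    if last:
        last_day = datetime.datetime.strptime(
            last.decode('utf-8'), '%Y-%m-%d').date()
        if (today - last_day).days < window_days:
            return True
    client.hset('h_black_list', resume_id, today.strftime('%Y-%m-%d'))
    return False


# Usage, mirroring the loop above:
#     client = redis.StrictRedis(host='127.0.0.1', port=6379)
#     if seen_recently(client, resume_id):
#         continue  # skip this resume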
Example #3
    def get_resume_list(self,
                        previous_page_html,
                        action='pagerTopNew$ctl03',
                        **search_args):
        """Post the WebForms search/pager form and parse one page of the resume list.

        :param previous_page_html: HTML of the previously fetched page, used to
            read back the hidden form state (__VIEWSTATE etc.)
        :param action: __EVENTTARGET value of the pager control to trigger
        :param search_args: e.g. city='北京|010000', keywords='销售代表|3001'
        :return: (list of resume detail URLs, raw HTML of the fetched page or '')
        """
        url = 'https://ehire.51job.com/Candidate/SearchResumeNew.aspx'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
            'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'ehire.51job.com',
            'Origin': 'https://ehire.51job.com',
            'Pragma': 'no-cache',
            'Referer': 'https://ehire.51job.com/Candidate/'
            'SearchResumeNew.aspx',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }

        _soups = self.html_parser.parser(previous_page_html)

        data = {
            '__EVENTTARGET':
            action,
            '__EVENTARGUMENT':
            '',
            '__LASTFOCUS':
            '',
            '__VIEWSTATE':
            _soups.find('input', id='__VIEWSTATE').get('value'),
            'ctrlSerach$search_keyword_txt':
            search_args['keywords'].split('|')[0],
            'ctrlSerach$search_company_txt':
            '',
            'ctrlSerach$search_area_input':
            '',
            'ctrlSerach$search_area_hid':
            '',
            'ctrlSerach$search_funtype_hid':
            '',
            'ctrlSerach$search_expectsalaryf_input':
            '不限',
            'ctrlSerach$search_expectsalaryt_input':
            '不限',
            'ctrlSerach$search_industry_hid':
            '',
            'ctrlSerach$search_wyf_input':
            '不限',
            'ctrlSerach$search_wyt_input':
            '不限',
            'ctrlSerach$search_df_input':
            '不限',
            'ctrlSerach$search_dt_input':
            '不限',
            'ctrlSerach$search_cursalaryf_input':
            '不限',
            'ctrlSerach$search_cursalaryt_input':
            '不限',
            'ctrlSerach$search_age_input':
            '年龄:18-30',
            'ctrlSerach$search_agef_input':
            '18',
            'ctrlSerach$search_aget_input':
            '30',
            'ctrlSerach$search_expjobarea_input':
            search_args['city'].split('|')[0],
            'ctrlSerach$search_expjobarea_hid':
            search_args['city'],
            'ctrlSerach$search_forlang_input':
            '语言',
            'ctrlSerach$search_fl_input':
            '不限',
            'ctrlSerach$search_fllsabilityll_input':
            '不限',
            'ctrlSerach$search_englishlevel_input':
            '英语等级',
            'ctrlSerach$search_sex_input':
            '性别',
            'ctrlSerach$search_major_input':
            '专业',
            'ctrlSerach$search_major_hid':
            '',
            'ctrlSerach$search_hukou_input':
            '户口',
            'ctrlSerach$search_hukou_hid':
            '',
            'ctrlSerach$search_rsmupdate_input':
            '近1周',
            'ctrlSerach$search_jobstatus_input':
            '求职状态',
            'send_cycle':
            '1',
            'send_time':
            '7',
            'send_sum':
            '10',
            'ctrlSerach$hidSearchValue':
            u'%s##0#######20#35############近1周|1##1#0##%s#0#0#0' %
            (search_args['keywords'].split('|')[0], search_args['city']),
            'ctrlSerach$hidKeyWordMind':
            '',
            'ctrlSerach$hidRecommend':
            '',
            'ctrlSerach$hidWorkYearArea':
            '',
            'ctrlSerach$hidDegreeArea':
            '',
            'ctrlSerach$hidSalaryArea':
            '',
            'ctrlSerach$hidCurSalaryArea':
            '',
            'ctrlSerach$hidIsRecDisplay':
            '1',
            'showselected':
            '',
            'pagerTopNew$ctl06':
            '50',
            'cbxColumns$0':
            'AGE',
            'cbxColumns$1':
            'WORKYEAR',
            'cbxColumns$2':
            'SEX',
            'cbxColumns$3':
            'AREA',
            'cbxColumns$4':
            'WORKFUNC',
            'cbxColumns$5':
            'TOPDEGREE',
            'cbxColumns$6':
            'LASTUPDATE',
            'hidAccessKey':
            _soups.find('input', id='hidAccessKey').get('value'),
            'hidShowCode':
            '0',
            'hidDisplayType':
            '1',
            'hidEhireDemo':
            '',
            'hidUserID':
            '',
            'hidCheckUserIds':
            _soups.find('input', id='hidCheckUserIds').get('value'),
            'hidCheckKey':
            _soups.find('input', id='hidCheckKey').get('value'),
            'hidEvents':
            '',
            'hidNoSearchIDs':
            '',
            'hidBtnType':
            '',
            'hideMarkid':
            '',
            'hidStrAuthority':
            _soups.find('input', id='hidStrAuthority').get('value'),
            'hidDownloadNum':
            _soups.find('input', id='hidDownloadNum').get('value'),
            'hidKeywordCookie':
            '',
            'showGuide':
            '',
        }

        if not search_args['use_keywords']:
            self.logger.info('采用职能进行搜索.')
            data['ctrlSerach$search_keyword_txt'] = ''
            data['ctrlSerach$search_funtype_hid'] = search_args['keywords']
            # The form field is ctrlSerach$hidSearchValue; overwrite it rather
            # than adding a bare hidSearchValue key alongside it.
            data['ctrlSerach$hidSearchValue'] = \
                u'##0#%s######20#35############近1周|1##1#0##%s#0#0#0' \
                % (search_args['keywords'], search_args['city'])
        else:
            self.logger.info('采用关键词进行搜索.')

        res = self.html_downloader.download(url,
                                            method='POST',
                                            headers=headers,
                                            data=data,
                                            proxies=self.proxies,
                                            allow_redirects=False)
        if res.status_code == 302:
            self.logger.warning('cookie invalid.')
            # self.h.hset(
            #     self.source + '|' + self.auth_kwargs['username'].encode(
            #         'utf-8'), '')
            raise MfCookieValidException('cookie_invalid')

        access_key = self.html_parser.parser(res.content).find(
            'input', id='hidAccessKey').get('value')
        # auth_ = self.html_parser.parser(res.content).find(
        #     'div', id='divVerifyCode_ch').get('style')

        soups = self.html_parser.parser(res.content).find_all(
            'td', class_='Common_list_table-id-text')

        resume_list = []

        if not soups:
            # empty_times counter: once an account has seen 10 empty result
            # pages in a row, run the captcha check instead of skipping again.
            empty_times = int(
                self.h_search_empty_times.hget(self.auth_kwargs['username'])
                or 0)
            if empty_times > 10:
                self.logger.warning(
                    '搜索列表遇到验证码. %s' %
                    self.auth_kwargs['username'].encode('utf-8'))
                self.get_captcha(referer=res.url,
                                 access_key=access_key,
                                 do_type='CheckSearchResume')
                self.h_search_empty_times.hset(self.auth_kwargs['username'], 0)
                raise Exception
            else:
                self.logger.warning(
                    '未匹配到搜索结果,跳过该任务[%s, %s, %s]' %
                    (self.auth_kwargs['username'].encode('utf-8'),
                     search_args['keywords'].encode('utf-8'),
                     search_args['city'].encode('utf-8')))
                empty_times += 1
                self.h_search_empty_times.hset(self.auth_kwargs['username'],
                                               empty_times)
                return resume_list, ''

        for soup in soups:
            ref_time = soup.find_parent().find_all('td')[-2].text.encode(
                'utf-8')
            # The Monday special case (a wider window) is disabled; every day
            # uses the same one-day cutoff from settings.
            global DAY_LIMITED
            DAY_LIMITED = settings.DAY_LIMITED
            limited_day = datetime.datetime.now() - datetime.timedelta(days=1)

            if str2datetime(ref_time, '%Y-%m-%d').date() < \
                    limited_day.date():
                self.logger.warning('匹配到%s天前的简历,执行跳过操作.' % DAY_LIMITED)
                break
            resume_list.append(soup.find('a').get('href'))

        try:
            page = self.html_parser.parser(res.content).find(
                'div', class_='Search_page-numble').find(
                    'a', class_='active').get('title').encode('utf-8')
        except Exception as e:
            self.logger.warning('未找到分页组件,跳过该任务[%s, %s]' %
                                (search_args['keywords'].encode('utf-8'),
                                 search_args['city'].encode('utf-8')))
            return resume_list, ''

        self.logger.info(
            'page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]' %
            (page, len(resume_list), search_args['keywords'].encode('utf-8'),
             search_args['city'].encode('utf-8')))

        if int(page) > settings.TASK_PAGE_LIMIT:
            raise FiveOneResumeException('task_page_limit')

        return resume_list, res.content
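The POST above follows the standard ASP.NET WebForms postback pattern: the hidden state fields are read back out of the previous page and re-posted with __EVENTTARGET set to the pager control. A minimal sketch of that pattern with requests and BeautifulSoup (the hidden-field list is the generic WebForms one and error handling is simplified; 51job adds its own fields such as hidAccessKey, as shown above):

import requests
from bs4 import BeautifulSoup


def post_pager_event(session, url, previous_html, event_target,
                     hidden_ids=('__VIEWSTATE', '__EVENTVALIDATION')):
    """Re-post a WebForms page with __EVENTTARGET set to a pager control."""
    soup = BeautifulSoup(previous_html, 'html.parser')
    data = {'__EVENTTARGET': event_target, '__EVENTARGUMENT': ''}
    for field_id in hidden_ids:
        field = soup.find('input', id=field_id)
        if field is not None:
            data[field_id] = field.get('value')
    resp = session.post(url, data=data, allow_redirects=False)
    if resp.status_code == 302:
        # Like the cookie_invalid handling above: a redirect means the
        # session has expired and the caller should re-login.
        raise RuntimeError('session expired')
    return resp.text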
Example #4
    def resume_search(self, page, **search_args):
        self.get_cookie()
        proxy = eval(self.auth_kwargs['proxy'])  # proxy is stored as a dict literal string
        ip = proxy['ip']
        port = proxy['port']

        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')

        # print(search_args)
        resume_list = self.get_resume_list(page=page, **search_args)

        if not resume_list:
            raise ZhiLianResumeException('resume_list_empty')

        for resume_args in resume_list:
            # Cap how many detail pages this account opens per day
            if not self.h_use_record.hget(
                    self.auth_kwargs['username'] + today):
                self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
                count = 0
            else:
                count = int(
                    self.h_use_record.hget(
                        self.auth_kwargs['username'] + today))

            if self.check_limit(count=count):
                today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                self.h_over_search_limit.hset(today + '|' + self.auth_kwargs[
                    'username'].encode('utf-8'), 1)
                raise ZhiLianResumeException('user_record_limited')

            # Resume deduplication key: first 10 characters of resumeNo,
            # falling back to the number field.
            try:
                resume_id = str(
                    resume_args.get('resumeNo').encode('utf-8')[:10])
            except Exception:
                resume_id = str(resume_args.get('number')[:10])
            last_search_day = self.h_black_list.hget(resume_id)
            if last_search_day:
                distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d')
                            - str2datetime(last_search_day, '%Y-%m-%d')).days
            else:
                distance = DAY_LIMITED + 1
            if distance < DAY_LIMITED:
                self.logger.warning('该简历%s天内已经被采集过: %s'
                                    % (DAY_LIMITED, resume_id))
                continue
            self.h_black_list.hset(resume_id, today.replace('|', ''))
            resume_detail = self.get_resume_detail(
                resume_args=resume_args)

            if not resume_detail:
                continue

            if resume_detail.get('resumeSource').encode('utf-8').lower() == \
                    'download':
                resource_type = 'RESUME_INBOX'
            else:
                resource_type = 'RESUME_SEARCH'

            content = json.dumps({'name': '', 'email': '', 'phone': '',
                                  'html': resume_detail},
                                 ensure_ascii=False)
            data = {
                'ChannelType': 'APP',
                'Source': self.source,
                'ResourceType': resource_type,
                'content': content,
                'accountContent': json.dumps(self.auth_kwargs,
                                             ensure_ascii=False,
                                             cls=JsonCustomEncoder),
                'resumeId': resume_detail.get('resumeNo'),
                'searchData': json.dumps(
                    search_args.get('origin_search_args'), ensure_ascii=False),
                'code': 200
            }
            self.push_resume(**data)
            time.sleep(random.randint(1, 5))
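The h_use_record hash above counts how many detail pages an account has opened today (field = username + '|' + date). A minimal redis-py sketch of the same counter; the hash name and quota are assumptions, and HINCRBY keeps the increment atomic, unlike the separate hget/hset pair used above:

import datetime

import redis

DAILY_QUOTA = 100  # assumed; the real limit lives behind check_limit


def take_detail_slot(client, username):
    """Count one detail-page view for username today; False once the quota is hit."""
    field = '%s|%s' % (username, datetime.date.today().strftime('%Y-%m-%d'))
    used = client.hincrby('h_use_record', field, 1)  # hypothetical hash name
    return used <= DAILY_QUOTA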
Example #5
def execute_awake():
    runner = ResumeFen()
    while True:
        task_id, params = runner.get_task()
        if not task_id:
            runner.push_task()
            continue

        try:
            runner.get_cookie()
            page = 1
            has_next_page = True
            while has_next_page:
                if params.get('model_name') == 'ZHI_LIAN':
                    mode = 'zl'
                    resume_list = runner.get_resume_list_zl(page, **params)
                else:
                    mode = 'lp'
                    resume_list = runner.get_resume_list_lp(page, **params)

                if not resume_list:
                    runner.logger.warning('简历列表为空,开始切换任务.')
                    runner.update_task(task_id=task_id)
                    break

                flag = 0
                for resume in resume_list:
                    resume_id = resume.get('id').encode('utf-8')
                    today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                    last_search_day = runner.h_search_back_list.hget(resume_id)

                    if flag > 25:
                        runner.logger.info('当前页存在超过25个已采集简历,跳过任务.')
                        has_next_page = False
                        break

                    # Deduplication window: skip resumes collected within the
                    # last TIME_LIMIT days. Compare full dates instead of the
                    # day of month so the check works across month boundaries.
                    if last_search_day:
                        days_since = (
                            str2datetime(today, fmt='%Y-%m-%d') -
                            str2datetime(last_search_day, fmt='%Y-%m-%d')).days

                        if days_since <= TIME_LIMIT:
                            runner.logger.warning('该简历[%s] %s天内已采集过.' %
                                                  (resume_id, TIME_LIMIT))
                            flag += 1
                            continue

                    content = runner.get_resume_detail(resume_id=resume_id,
                                                       mode=mode)

                    if not content:
                        continue

                    content = json.dumps(content, ensure_ascii=False)

                    resume_uuid = str(uuid.uuid1())

                    sql = '''INSERT INTO spider_search.resume_raw
                             (source, content, createBy, trackId, createtime,
                              email, emailJobType, emailCity, subject)
                             VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
                    sql_value = (runner.common_settings.SOURCE, content,
                                 resume_uuid, runner.auth_kwargs['username'],
                                 params['job_name'], params['area_name'],
                                 str(resume_id))

                    resume_update_time = ''
                    msg_data = {
                        "channelType": "WEB",
                        "content": {
                            "content": content,
                            "id": '',
                            "createBy": "python",
                            "createTime": int(time.time() * 1000),
                            "ip": '',
                            "resumeSubmitTime": '',
                            "resumeUpdateTime": resume_update_time,
                            "source": runner.common_settings.SOURCE,
                            "trackId": str(resume_uuid),
                            "avatarUrl": '',
                            "email": runner.auth_kwargs['username'],
                            'emailJobType': params['job_name'],
                            'emailCity': params['area_name'],
                            'subject': resume_id
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType": "RESUME_SEARCH",
                        "source": runner.common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        'traceID': str(resume_uuid),
                        'callSystemID': runner.common_settings.CALL_SYSTEM_ID,
                    }
                    # self.mysql_handler.save(sql=sql, data=sql_value)
                    res = runner.save_data(sql=sql,
                                           data=sql_value,
                                           msg_data=msg_data)
                    if res:
                        # Reset the cookie retry counter for this account
                        runner.h_account_limit.hset(
                            runner.auth_kwargs['username'], 0)
                        runner.h_search_back_list.hset(resume_id, today)
                    time.sleep(random.randint(1, 5))
                if len(resume_list) < 30:
                    runner.logger.info('当前页简历小于30,任务结束。')
                    has_next_page = False

                page += 1
                runner.update_task(task_id=task_id)

        except MfCookieValidException:
            runner.update_task(task_id=task_id)
            runner.add_task(param=json.dumps(params, ensure_ascii=False))
            runner.logger.warning('因Cookie失败导致任务退出,重新添加任务!')

        except Exception as e:
            runner.update_task(task_id=task_id,
                               execute_status='FAILURE',
                               execute_result=str(e))
            runner.logger.exception(str(e))
Example #6
    def get_resume_list(self, page=1, is_download=False, **search_args):
        """Fetch one page of the resume search results via search.do.

        Search filters: keyword / location / age (default 18-30) / degree /
        update date (defaults to the last three days).

        :param page: 1-based page number; the API pages by startNum offset
        :param is_download: when True, skip the freshness cutoff below
        :param search_args: keywords, city and optional filter overrides
        :return: list of resume dicts accepted by the freshness cutoff
        """
        url = 'https://ihr.zhaopin.com/resumesearch/search.do?' \
              'access_token=%s' % self.access_token
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'ihr.zhaopin.com',
            'Origin': 'https://ihr.zhaopin.com',
            'Pragma': 'no-cache',
            'Referer': 'https://ihr.zhaopin.com/resumesearch/search/',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie,
            'X-Requested-With': 'XMLHttpRequest',
        }

        data = {
            'keywords': search_args['keywords'].split('|')[0].encode('utf-8'),
            'startNum': (page - 1) * 30,  # offset paging: 30 rows per page
            'rowsCount': '30',
            'resumeGrade': '',
            'sortColumnName': 'sortUpDate',
            'sortColumn': 'sortUpDate desc',
            'onlyHasImg': 'false',
            'anyKeyWord': 'false',
            'hopeWorkCity': search_args['city'].split('|')[1].encode('utf-8'),
            'ageStart': search_args.get('age_start', '18'),
            'ageEnd': search_args.get('age_end', '30'),
            'workYears': search_args.get('work_years', ''),
            'liveCity': search_args.get('live_city', ''),
            'sex': search_args.get('sex', ''),
            'edu': search_args.get('degree', '5'),
            'upDate': search_args.get('up_date', ''),  # defaults to the last three days
            'companyName': search_args.get('company_name', ''),
            'jobType': '',
            'desiredJobType': search_args.get('desired_job_type', ''),
            'industry': search_args.get('industry', ''),
            'desiredIndustry': '',
            'careerStatus': '',
            'desiredSalary': '',
            'langSkill': '',
            'hukouCity': '',
            'major': '',
            'onlyLastWork': 'false',
        }
        # print(json.dumps(data, ensure_ascii=False, indent=4))
        if search_args['use_keywords'] is False:
            data['desiredJobType'] = search_args['keywords'].split('|')[1]
            self.logger.info('采用职能进行搜索.')
        else:
            self.logger.info('采用关键词进行搜索')

        res = self.html_downloader.download(url,
                                            method='POST',
                                            data=data,
                                            headers=headers,
                                            proxies=self.proxies)
        # self.logger.info('搜索返回 %s' % res.json())
        if res.json().get('code') == 6001:
            self.logger.info(self.logger_prefix + 'cookie失效了')
            self.set_cookie_invalid()
            raise MfCookieValidException('cookie_invalid')

        if res.json().get('code') == 808:
            self.logger.warning(self.logger_prefix +
                                res.json().get('message').encode('utf-8'))
            today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
            self.h_over_search_limit.hset(
                today + '|' + self.auth_kwargs['username'].encode('utf-8'), 1)
            # Code 808: today's quota for searching the resume pool is used up

            global LIMIT_MESSAGE_BOX
            if not LIMIT_MESSAGE_BOX.get(
                    self.auth_kwargs['username'].encode('utf-8'), ''):
                LIMIT_MESSAGE_BOX[self.auth_kwargs['username'].encode(
                    'utf-8')] = 1
                self.robot_login.send_markdown(
                    title="智联简历搜索",
                    content="#### 智联简历当日关键词搜索量已达上限.\n"
                    "- 帐号: %s\n"
                    "- 密码: %s\n"
                    "- 代理: %s\n"
                    "- 达到上限账号总数: %s\n" %
                    (self.auth_kwargs['username'].encode('utf-8'),
                     self.auth_kwargs['password'].encode('utf-8'),
                     self.auth_kwargs['ip'].encode('utf-8') + ':' +
                     self.auth_kwargs['port'].encode('utf-8'),
                     len(LIMIT_MESSAGE_BOX)))

            raise ZhiLianResumeException('user_record_limited')

        try:
            resume_list = res.json().get('results')
            if not resume_list:
                raise Exception
        except Exception as e:
            self.logger.exception('获取list失败: %s | %s' % (str(e), res.content))
            return []

        resume_accept_list = []
        for resume in resume_list:
            if is_download is False:
                # The Monday special case (a wider window) is disabled; every
                # day uses the same one-day freshness cutoff.
                global DAY_LIMITED
                DAY_LIMITED = 2
                limited_day = datetime.datetime.now() - datetime.timedelta(
                    days=1)

                if str2datetime(resume.get('modifyDate'),
                                '%Y-%m-%d').date() < limited_day.date():
                    self.logger.warning('匹配到%s天前的简历,执行跳过操作.' % DAY_LIMITED)
                    break
            resume_accept_list.append(resume)

        self.logger.info('page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]' %
                         (page, len(resume_accept_list),
                          search_args['keywords'].encode('utf-8'),
                          search_args['city'].encode('utf-8')))
        return resume_accept_list
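The freshness cutoff above (skip resumes whose modifyDate is older than limited_day) recurs in several of these crawlers. A small pure helper, with names of my own choosing rather than the project's, makes the rule testable in isolation:

import datetime


def is_fresh(modify_date_str, window_days=1, fmt='%Y-%m-%d', now=None):
    """True when the resume was modified on or after now minus window_days days."""
    now = now or datetime.datetime.now()
    cutoff = (now - datetime.timedelta(days=window_days)).date()
    return datetime.datetime.strptime(modify_date_str, fmt).date() >= cutoff


# In the loop above this corresponds to is_fresh(resume.get('modifyDate'))
# with window_days=1, since limited_day is now minus one day.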
Example #7
    def get_resume_list(self, page=1, **search_args):
        """Fetch one page of the resume search results via ResultForCustom.

        Search filters: keyword / location / age 18-30 / degree /
        recently updated (SF_1_1_7).

        :param page: 1-based page number
        :param search_args: keywords, city, degree and the use_keywords flag
        :return: list of dicts with resumeNo, t and k for the detail request
        """

        # Both branches build the same URL shape; only the keyword field and
        # its value differ, and the Referer must match the request URL.
        base = ('https://ihrsearch.zhaopin.com/Home/ResultForCustom?'
                '%s=%s&'
                'SF_1_1_18=%s&'
                'orderBy=DATE_MODIFIED,1&'
                'pageSize=30&'
                'SF_1_1_27=0&'
                'SF_1_1_5=%s,16&'
                'SF_1_1_8=18,30&'
                'SF_1_1_7=1,9&'
                'exclude=1&pageIndex=%s')

        if search_args['use_keywords'] is False:
            self.logger.info('采用职能进行搜索.')
            url = base % ('SF_1_1_2',
                          search_args['keywords'].split('|')[1].encode('utf-8'),
                          search_args['city'].split('|')[1].encode('utf-8'),
                          search_args['degree'],
                          page)
        else:
            self.logger.info('采用关键词进行搜索.')
            url = base % ('SF_1_1_1',
                          search_args['keywords'].split('|')[0].encode('utf-8'),
                          search_args['city'].split('|')[1].encode('utf-8'),
                          search_args['degree'],
                          page)
        referer = url

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
            'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'ihrsearch.zhaopin.com',
            'Pragma': 'no-cache',
            'Referer': referer,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': self.user_agent,
            'Cookie': self.cookie
        }

        res = self.html_downloader.download(url,
                                            headers=headers,
                                            proxies=self.proxies,
                                            allow_redirects=False)

        if res.status_code == 302:
            self.logger.warning('cookie失效了')
            self.set_cookie_invalid()
            raise MfCookieValidException('cookie_invalid')

        try:
            soups = self.html_parser.parser(res.content).find(
                'form', attrs={
                    'name': 'frmResult'
                }).find('tbody').find_all('tr', class_='info')
            soups1 = self.html_parser.parser(res.content).find(
                'form', attrs={
                    'name': 'frmResult'
                }).find('tbody').find_all('tr', valign='middle')
        except Exception as e:
            self.logger.exception('获取resume_list失败: %s' % str(e))
            return []
        resume_list = []
        for index, soup in enumerate(soups):
            # The Monday special case (a wider window) is disabled; every day
            # uses the same one-day freshness cutoff.
            global DAY_LIMITED
            DAY_LIMITED = 2
            limited_day = datetime.datetime.now() - datetime.timedelta(days=1)

            if str2datetime(
                    soups1[index].find_all('td')[-1].text.encode('utf-8'),
                    '%y-%m-%d').date() < limited_day.date():
                self.logger.warning('匹配到%s天前的简历,执行跳过操作.' % DAY_LIMITED)
                break
            resume_item = dict()
            resume_item['resumeNo'] = soup.find('a').get('resumeurlpart')
            resume_item['t'] = soup.find('a').get('t')
            resume_item['k'] = soup.find('a').get('k')
            resume_list.append(resume_item)
        self.logger.info(
            'page: %s, 总计获取到简历%s份, 搜索条件[%s, %s]' %
            (page, len(resume_list), search_args['keywords'].encode('utf-8'),
             search_args['city'].encode('utf-8')))
        return resume_list
Example #8
    def resume_search(self, page, **search_args):
        self.get_cookie()
        proxy = eval(self.auth_kwargs['proxy'])  # proxy is stored as a dict literal string
        ip = proxy['ip']
        port = proxy['port']

        self.proxies = {
            'http': 'http://%s:%s' % (ip, port),
            'https': 'https://%s:%s' % (ip, port),
        }
        today = '|' + datetime2str(datetime.datetime.now(), '%Y-%m-%d')

        # print(search_args)
        resume_list = self.get_resume_list(page=page, **search_args)

        if not resume_list:
            raise ZhiLianResumeException('resume_list_empty')

        for resume_args in resume_list:
            # Cap how many detail pages this account opens per day
            if not self.h_use_record.hget(self.auth_kwargs['username'] +
                                          today):
                self.h_use_record.hset(self.auth_kwargs['username'] + today, 0)
                count = 0
            else:
                count = int(
                    self.h_use_record.hget(self.auth_kwargs['username'] +
                                           today))

            if self.check_limit(count=count):
                today = datetime2str(datetime.datetime.now(), '%Y-%m-%d')
                self.h_over_search_limit.hset(
                    today + '|' + self.auth_kwargs['username'].encode('utf-8'),
                    1)
                raise ZhiLianResumeException('user_record_limited')

            # Resume deduplication key: first 10 characters of resumeNo,
            # falling back to the number field.
            try:
                resume_id = str(
                    resume_args.get('resumeNo').encode('utf-8')[:10])
            except Exception:
                resume_id = str(resume_args.get('number')[:10])

            mysql_1 = self.init_mysql(
                user='******',
                passwd='bi_admin#@1mofanghr',
                host='172.16.25.1',
                # user='******',
                # passwd='bi_admin#@1mofanghr',
                # host='10.0.3.52',
                cursorclass=DictCursor,
                cls_singleton=False)
            sql = '''INSERT INTO spider.resume_awake_record_no_repeat
                     (source, position, city, raw_id, create_time, username)
                     VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)'''
            value = (search_args['keywords'].split('|')[0],
                     search_args['city'].split('|')[0], resume_id,
                     self.auth_kwargs['username'])
            mysql_1.save(sql, value)
            del mysql_1

            last_search_day = self.h_black_list.hget(resume_id)
            if last_search_day:
                distance = (str2datetime(today.replace('|', ''), '%Y-%m-%d') -
                            str2datetime(last_search_day, '%Y-%m-%d')).days
            else:
                distance = DAY_LIMITED + 1
            if distance < DAY_LIMITED:
                self.logger.warning('该简历%s天内已经被采集过: %s' %
                                    (DAY_LIMITED, resume_id))
                continue
            self.h_black_list.hset(resume_id, today.replace('|', ''))
            resume_detail = self.get_resume_detail(resume_args=resume_args)
            if not resume_detail:
                continue
            resume_uuid = str(uuid.uuid1())
            content = json.dumps(
                {
                    'name': '',
                    'email': '',
                    'phone': '',
                    'html': resume_detail
                },
                ensure_ascii=False)
            sql = '''INSERT INTO spider_search.resume_raw
                     (source, content, createBy, trackId, createtime,
                      email, emailJobType, emailCity, subject)
                     VALUES (%s, %s, "python", %s, now(), %s, %s, %s, %s)'''
            sql_value = (self.common_settings.SOURCE, content, resume_uuid,
                         self.auth_kwargs['username'],
                         search_args['keywords'], search_args['city'],
                         str(resume_detail.get('resumeNo')))

            resume_update_time = ''
            msg_data = {
                "channelType": "APP",
                "content": {
                    "content": content,
                    "id": '',
                    "createBy": "python",
                    "createTime": int(time.time() * 1000),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": resume_update_time,
                    "source": self.common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    "avatarUrl": '',
                    "email": self.auth_kwargs['username'],
                    'emailJobType': search_args['keywords'],
                    'emailCity': search_args['city'],
                    'subject': str(resume_detail.get('resumeNo'))
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_SEARCH_AWAKE",
                "source": self.common_settings.SOURCE,
                "trackId": str(resume_uuid),
                'traceID': str(resume_uuid),
                'callSystemID': self.common_settings.CALL_SYSTEM_ID,
            }
            # self.mysql_handler.save(sql=sql, data=sql_value)
            res = self.save_data(sql=sql, data=sql_value, msg_data=msg_data)

            if res:
                count += 1
                self.h_use_record.hset(self.auth_kwargs['username'] + today,
                                       count)
                mysql_ = self.init_mysql(
                    user='******',
                    passwd='bi_admin#@1mofanghr',
                    host='172.16.25.1',
                    # user='******',
                    # passwd='bi_admin#@1mofanghr',
                    # host='10.0.3.52',
                    cursorclass=DictCursor,
                    cls_singleton=False)
                sql = '''INSERT INTO spider.resume_awake_record
                         (source, position, city, raw_id, create_time, username)
                         VALUES ('ZHI_LIAN', %s, %s, %s, now(), %s)'''
                value = (search_args['keywords'].split('|')[0],
                         search_args['city'].split('|')[0], res,
                         self.auth_kwargs['username'])
                mysql_.save(sql, value)
                del mysql_

            time.sleep(random.randint(3, 5))
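init_mysql and save above are project wrappers around a MySQL client. A minimal PyMySQL sketch of the same parameterized insert (connection settings are placeholders); keeping the %s placeholders in the SQL and passing the values separately lets the driver escape them instead of splicing strings into the statement:

import pymysql
from pymysql.cursors import DictCursor


def save(sql, values, **connect_kwargs):
    """Execute one parameterized INSERT and return the new row id."""
    conn = pymysql.connect(host='127.0.0.1', user='spider', password='secret',
                           database='spider', charset='utf8mb4',
                           cursorclass=DictCursor, **connect_kwargs)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, values)
            last_id = cursor.lastrowid
        conn.commit()
        return last_id
    finally:
        conn.close()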