示例#1
0
 def __login(self, session, check_code=None):
     """Log in to 818cv.com with this object's credentials.

     Posts ``self.username`` / ``self.password`` (plus a solved captcha
     when ``check_code`` is given) to the site root and retries when the
     request raises or does not answer with HTTP 200.

     Side effect: sets the ``Content-Type`` and ``Origin`` entries of
     ``self.headers``.

     :param session: session object used for the POST; it is also handed
         to ``parse_check_code`` when a captcha has to be solved.
     :param check_code: optional captcha path appended to
         ``http://www.818cv.com``; when falsy an empty ``verify`` value
         is submitted.
     :return: ``(False, "PROXY_FAIL!")`` once the sixth attempt has
         failed, ``(False, "ACCOUNT_ERROR!")`` when the page reports
         wrong credentials, otherwise ``(True, session)``.
     """
     url = "http://www.818cv.com/"
     max_retries = 5
     self.headers["Content-Type"] = "application/x-www-form-urlencoded"
     self.headers["Origin"] = "http://www.818cv.com"

     if check_code:
         err_url = "http://www.818cv.com" + check_code
         verify = parse_check_code(session,
                                   err_url,
                                   'lie8',
                                   proxies=self.proxies)
     else:
         verify = ""

     try_times = 0
     while True:
         try_times += 1
         try:
             response = session.post(url,
                                     data={
                                         "username": self.username,
                                         "password": self.password,
                                         "verify": verify
                                     },
                                     headers=self.headers,
                                     timeout=30,
                                     proxies=self.proxies)
             # Explicit check instead of ``assert`` so that the retry
             # logic still works when Python runs with -O.
             if not response or response.status_code != 200:
                 raise RuntimeError('unexpected response: %r' % (response,))
         except Exception:
             # ``Exception`` (not a bare except) so that KeyboardInterrupt
             # and SystemExit are not swallowed by the retry loop.
             logger.warning(
                 'fetching url %s headers %s with %s fail:\n%s' %
                 (url, self.headers, self.proxies, traceback.format_exc()))
             if try_times > max_retries:
                 return False, "PROXY_FAIL!"
             time.sleep(random.uniform(3, 5))
         else:
             break

     if "您输入的帐号或者密码不正确,请重新输入。" in response.text:
         # The password is deliberately kept out of the log.
         logger.warning("LOGIN WITH username=%s WRONG" % self.username)
         return False, "ACCOUNT_ERROR!"
     return True, session
示例#2
0
def __fetch_contact(session,
                    resume_id,
                    user_name,
                    user_password,
                    proxies=None):
    """Buy the contact details of a Ganji resume and upload the resume page.

    Steps, in order:

    1. Fetch ``http://www.ganji.com/vip``; when the page shows the login
       form, log in with ``user_name`` / ``user_password`` (up to ten
       attempts, solving the captcha for each attempt).
    2. Ask for the resume's contact download; unless the page says the
       resume was already downloaded, confirm the purchase.
    3. Fetch the resume page, delete ``session.temp_folder`` and hand the
       page to ``upload``.

    :param session: session used for every request; it must expose a
        ``temp_folder`` attribute where response dumps are written.
    :param resume_id: Ganji resume id used in the resume and download URLs.
    :param user_name: account name used when a login is required.
    :param user_password: account password used when a login is required.
    :param proxies: accepted for interface compatibility; see the review
        note below - it is currently not used.
    :return: the result of ``upload(resume, 'ganji', get_contact=True)``.
    :raises Exception: after ten failed login attempts, when the account
        cannot afford the resume, or when the resume has been deleted.
    :raises AssertionError: when a response is not HTTP 200 or a page does
        not contain the expected markers.
    """
    user_agent = nautil.user_agent()
    # NOTE(review): the caller-supplied ``proxies`` is discarded here, so
    # every request of this function is sent without a proxy.  Kept as in
    # the original code; confirm that this is intentional.
    proxies = None
    request_timeout = 30

    def _request(method, url, headers=None, data=None):
        """Send one GET/POST through ``session`` and return the body text."""
        # Keep the account password out of the log.
        logged_data = data
        if data and 'password' in data:
            logged_data = dict(data, password='***')
        logger.info('------\nRequesting %s On %s With Data:\n%s\n------' %
                    (method, url, logged_data))
        time.sleep(random.uniform(1, 2))

        if method not in ('get', 'post'):
            raise AssertionError('unsupported method %r' % (method,))
        request_headers = {"User-Agent": user_agent}
        request_headers.update(headers or {})

        if method == 'get':
            response = session.get(url,
                                   headers=request_headers,
                                   proxies=proxies,
                                   params=data,
                                   timeout=request_timeout)
        else:
            response = session.post(url,
                                    headers=request_headers,
                                    proxies=proxies,
                                    data=data,
                                    timeout=request_timeout)

        # Raised explicitly (same exception type as the former ``assert``)
        # so that the check survives ``python -O``.
        if not response or response.status_code != 200:
            raise AssertionError('unexpected response %r from %s' %
                                 (response, url))
        response.encoding = 'utf-8'
        return response.text

    def _dump(file_name, content):
        """Save a response body as ``file_name`` in ``session.temp_folder``."""
        with open(session.temp_folder + os.path.sep + file_name,
                  'w') as handle:
            handle.write(content)

    main_page = _request('get', 'http://www.ganji.com/vip')
    if '赶集用户登录' in main_page:
        logger.info('cookie fail, try login')
        logger.info('re-login')
        hash_match = re.search(
            '''window.PAGE_CONFIG.__hash__ = '([^']*)';''', main_page)
        if not hash_match:
            raise AssertionError('login hash not found in main page')
        hash_value = hash_match.group(1)
        logger.info('login hash_value:%s' % hash_value)

        check_code_match = re.search(
            '''<img[^>]*id="login_img_checkcode"[^>]*src=['"]*([^'"]*)['"]*[^>]*>''',
            main_page)
        if not check_code_match:
            raise AssertionError('login captcha image not found in main page')
        check_code_url = check_code_match.group(1)
        logger.info('login check_code_url:%s' % check_code_url)
        time_stamp = str(int(time.time() * 1000))

        counter = 0
        while True:
            counter += 1
            if counter > 10:
                raise Exception('try too many times to login')
            # A fresh captcha answer is requested for every attempt.
            check_code = parse_check_code(session, check_code_url, 'ganji',
                                          proxies)
            login_result = _request(
                'get',
                'https://passport.ganji.com/login.php',
                headers={
                    "Host": "passport.ganji.com",
                    "Referer": "https://passport.ganji.com/login.php?next=/",
                    "X-Requested-With": "XMLHttpRequest",
                    "Connection": "keep-alive"
                },
                data={
                    "callback": "jQuery1820229177205394230_%s" % time_stamp,
                    "username": user_name,
                    "password": user_password,
                    "checkCode": check_code,
                    "setcookies": "14",
                    "second": "",
                    "parentfunc": "",
                    "redirect_in_iframe": "",
                    "next": '/',
                    "__hash__": hash_value,
                    "_": time_stamp
                })
            _dump('login_result.html', login_result)
            if 'error_msg' not in login_result:
                break
            logger.warning('login fail with response:\n%s' % login_result)

    logger.info('trying to buy contact')
    message = _request('get',
                       'http://www.ganji.com/findjob/download_resume.php',
                       headers={
                           "Host": "www.ganji.com",
                           "Referer":
                           "http://www.ganji.com/jianli/%sx.htm" % resume_id,
                           # Header values must be strings; recent requests
                           # versions reject an int here.
                           "Upgrade-Insecure-Requests": "1",
                       },
                       data={
                           "source": "detail",
                           "resume_type": "0",
                           "findjob_puid": resume_id,
                           "job_postion": "",
                           "callback": "show_contact",
                           "is_batch_view_resume": 0
                       })
    _dump('message.html', message)
    if '您已下载过该简历' not in message:
        if '简历下载数不足' in message:
            raise Exception('The Ganji Account Can Not Afford this Resumes')
        if '此帖子已删除' in message:
            raise Exception('The Ganji Resume Deleted')
        if '确认查看' not in message:
            raise AssertionError('purchase confirmation not found in page')
        buy_url = 'http://www.ganji.com/findjob/download_resume.php?source=detail&resume_type=0&findjob_puid=%s&job_postion=&callback=show_contact&is_batch_view_resume=0' % resume_id
        tel_message = _request('post',
                               buy_url,
                               headers={
                                   "Host": "www.ganji.com",
                                   "Origin": "http://www.ganji.com",
                                   "Referer": buy_url,
                                   "Upgrade-Insecure-Requests": "1",
                               },
                               data={"one_key_download_setting": 1})
        if 'tel-code' not in tel_message:
            raise AssertionError('contact not present after purchase')

    logger.info('buy contact done, try upload resume')
    resume = _request('get',
                      'http://www.ganji.com/jianli/%sx.htm' % resume_id)
    # The dump folder is only removed on this success path; when an error
    # is raised earlier the dumped pages stay on disk.
    shutil.rmtree(session.temp_folder)
    return upload(resume, 'ganji', get_contact=True)
示例#3
0
def __download_resume(session, url, proxies=None):
    """Download a single Ganji resume page.

    The function first sleeps for a random 30-100 seconds, then fetches
    ``url``.  When the site answers with its captcha page, the captcha is
    solved via ``parse_check_code``, posted back and the resume is fetched
    again.

    :param session: session used to fetch the page and to post the captcha
    :param url: absolute resume URL; its host part is reused for the
        ``Host`` header and for building the captcha URL
    :param proxies: proxies used for every request, including the captcha
        post
    :return: the resume page text, or ``None`` when the captcha response
        says the page was not found / deleted or the resume stopped
        looking for a job
    :raises Exception: ``"PROXY_FAIL!"`` once more than five attempts
        (failed requests or captcha rounds) have been used up
    """
    timeout = 30
    max_tries = 5
    host = url.split('/')[2]
    headers = {
        "User-Agent": nautil.user_agent(),
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": host,
        "Referer": "http://www.ganji.com/findjob/resume_list.php"
    }
    captcha_notice = u"您的访问速度太快了,如果您不是机器的话,输入下面的验证码来继续访问吧"
    error_message = u"亲爱的用户,您访问的速度太快"
    gone_notices = (u"对不起!您要查看的页面没有找到或已删除",
                    u"对不起,该简历已停止找工作了~")

    time.sleep(random.uniform(30, 100))
    try_times = 0
    while True:
        try_times += 1
        try:
            resume_data = session.get(url,
                                      headers=headers,
                                      timeout=timeout,
                                      proxies=proxies)
            # Explicit check instead of ``assert`` so it survives -O.
            if resume_data.status_code != 200:
                raise RuntimeError('unexpected status %s' %
                                   resume_data.status_code)
            resume_data.encoding = 'utf-8'
            resume = resume_data.text
            if error_message in resume:
                raise Exception("ERROR_MESSAGE!")
            if captcha_notice in resume:
                verify_headers = {
                    "User-Agent": headers["User-Agent"],
                    "Accept-Encoding": "gzip, deflate, sdch",
                    "Accept-Language": "zh-CN,zh;q=0.8",
                    "Host": host,
                    "Referer": url
                }
                img = pq(resume).find('.error').find('span').find('img').attr(
                    'src')
                error_url = "http://" + host + img
                verify_code = parse_check_code(session, error_url, 'ganji',
                                               proxies)
                # Send the answer through the same proxies as the page
                # request (the original post omitted them).
                response = session.post(error_url,
                                        data=verify_code,
                                        headers=verify_headers,
                                        timeout=timeout,
                                        proxies=proxies)
                if any(notice in response.text for notice in gone_notices):
                    return None
                if response.status_code != 200:
                    raise RuntimeError('unexpected captcha status %s' %
                                       response.status_code)
                # Bound the captcha rounds: without this check the loop
                # could re-fetch and re-solve forever.  The raise is turned
                # into "PROXY_FAIL!" by the handler below.
                if try_times > max_tries:
                    raise RuntimeError('too many captcha rounds')
                continue
        except Exception:
            logger.warning('fetch %s with %s fail:\n%s' %
                           (url, proxies, traceback.format_exc()))
            if try_times > max_tries:
                raise Exception("PROXY_FAIL!")
            time.sleep(30)
        else:
            return resume
示例#4
0
def __get_resume_urls(session, urls, dedup, proxies=None):
    """Walk Ganji resume-list pages and yield the downloaded resumes.

    For every URL in ``urls`` the list pages ``&page=0, 32, ... 608`` are
    fetched (with a random 30-100 second pause before each page), the
    resume links are extracted, filtered through ``dedup`` and downloaded
    with ``__download_resume``.  Paging for a URL stops when a page has no
    "next page" link.  The whole generator stops once its resume counter
    reaches 300, i.e. at most 299 downloads are attempted per call.

    :param session: session used for the list pages, the captcha posts and
        the resume downloads
    :param urls: iterable of list-page URLs; ``&page=<offset>`` is appended
    :param dedup: callable that receives the resume ids found on a page and
        returns the ids that should be downloaded
    :param proxies: proxies used for every request
    :return: generator yielding each truthy result of ``__download_resume``
    :raises Exception: ``"PROXY_FAIL!"`` when a page cannot be fetched after
        more than five attempts, when too many captchas had to be solved,
        or when the same resume ids come back on more than five pages
    """
    max_resumes = 300
    max_tries = 5
    timeout = 30
    captcha_notice = u"您的访问速度太快了,如果您不是机器的话,输入下面的验证码来继续访问吧"
    error_message = u"亲爱的用户,您访问的速度太快"
    headers = {
        "User-Agent": nautil.user_agent(),
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": "www.ganji.com",
        "Referer": "http://www.ganji.com/findjob/resume_index.php"
    }
    resume_counter = 0
    for base_url in urls:
        repeated_pages = 0
        # Sentinel that never equals a real id, so the first page is not
        # counted as a repetition.
        last_resume_ids = {"None"}
        for page in range(0, 640, 32):
            time.sleep(random.uniform(30, 100))
            url = base_url + '&page=%s' % page
            try_times = 0
            try_parse_times = 0
            while True:
                try_times += 1
                captcha_hit = False
                try:
                    logger.warning('fetching %s with %s' % (url, proxies))
                    response = session.get(url,
                                           headers=headers,
                                           timeout=timeout,
                                           proxies=proxies)
                    # Explicit checks instead of ``assert`` (-O safe).
                    if response.status_code != 200:
                        raise RuntimeError('unexpected status %s' %
                                           response.status_code)
                    if error_message in response.text:
                        raise Exception("ERROR_MESSAGE!")
                    if captcha_notice in response.text:
                        captcha_hit = True
                        verify_headers = {
                            "User-Agent": headers["User-Agent"],
                            "Accept-Encoding": "gzip, deflate, sdch",
                            "Accept-Language": "zh-CN,zh;q=0.8",
                            "Host": "www.ganji.com",
                            "Referer": url
                        }
                        img = pq(response.text).find('.error').find(
                            'span').find('img').attr('src')
                        error_url = 'http://www.ganji.com' + img
                        try_parse_times += 1
                        verify_code = parse_check_code(session, error_url,
                                                       'ganji', proxies)
                        # Same proxies as the page request (the original
                        # post omitted them).
                        captcha_response = session.post(error_url,
                                                        data=verify_code,
                                                        headers=verify_headers,
                                                        timeout=timeout,
                                                        proxies=proxies)
                        if captcha_response.status_code != 200:
                            raise RuntimeError(
                                'unexpected captcha status %s' %
                                captcha_response.status_code)
                except Exception:
                    logger.warning('fetch %s with %s fail:\n%s' %
                                   (url, proxies, traceback.format_exc()))
                    if try_times > max_tries:
                        raise Exception("PROXY_FAIL!")
                    time.sleep(30)
                else:
                    if try_parse_times > random.randint(2, 5):
                        raise Exception("PROXY_FAIL!")
                    # After a captcha round ``response`` still holds the
                    # captcha page, so the list page is fetched again
                    # instead of being parsed as if it were the listing.
                    if not captcha_hit:
                        break

            response.encoding = 'utf-8'
            entries = pq(
                response.content).find('.resume-list').find('div').find('dl')
            resume_ids_urls = {}
            for entry in entries:
                href = pq(entry).find('a').attr('href')
                found_ids = re.findall(r'\d+', href) if href else []
                if not found_ids:
                    # Entry without a link or without a numeric id: skip it
                    # rather than crash the whole crawl.
                    continue
                resume_ids_urls[found_ids[0]] = href

            # An empty difference means every id of the previous page is on
            # this page again.
            if not last_resume_ids.difference(set(resume_ids_urls.keys())):
                repeated_pages += 1
                if repeated_pages > 5:
                    raise Exception("PROXY_FAIL!")
            if resume_ids_urls:
                last_resume_ids = set(resume_ids_urls.keys())

            for resume_id in dedup(resume_ids_urls.keys()):
                resume_counter += 1
                if resume_counter >= max_resumes:
                    # Limit reached: end the generator (nothing else was
                    # done after this point in the flag-based original).
                    return
                resume = __download_resume(session,
                                           resume_ids_urls[resume_id],
                                           proxies=proxies)
                if resume:
                    yield resume

            if u'class="next">下一页</a>' not in response.text:
                break
示例#5
0
    def register(self):
        """Register ``self.username`` on jianlika.com with a random password.

        A password is generated (one lowercase letter, 3-6 letters or
        digits, then 2-5 digits), the signup form is submitted with a solved
        captcha - retried while the site rejects the captcha - and finally
        the activation link obtained from ``self.fetch_email()`` is opened.

        :return: the generated password, or ``'REGISTERED!'`` when the site
            reports that the email address is already in use
        :raises Exception: ``'REGISTER_OVERLOAD'`` when the daily signup
            limit is hit, ``'CHECKCODE_FAIL'`` when the captcha is still
            rejected after the retries, ``'REGISTER_TOO_LATE'`` when the
            activation page says the mailbox does not exist
        :raises AssertionError: when the signup response is not one of the
            known pages
        """
        digits = '0123456789'
        letters = 'abcdefghijklmnopqrstuvwxyz'
        signup_url = 'http://www.jianlika.com/Signup/email.html'
        request_timeout = 30

        # Passwords are drawn from the OS entropy source rather than the
        # default (predictable) Mersenne Twister generator.
        secure_random = random.SystemRandom()
        register_password = secure_random.choice(letters)
        register_password += ''.join(
            secure_random.choice(digits + letters)
            for _ in range(secure_random.randint(3, 6)))
        register_password += ''.join(
            secure_random.choice(digits)
            for _ in range(secure_random.randint(2, 5)))
        # NOTE(review): the generated password is logged in clear text;
        # presumably so it can be recovered when a later step fails -
        # confirm that this is acceptable.
        logger.info('random passwd %s' % register_password)

        try_times = 0
        while True:
            # A fresh session (new cookies) is used for every attempt.
            session = requests.Session()
            session.headers.update({
                "User-Agent": nautil.user_agent(),
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                # Header values must be strings; recent requests versions
                # reject an int here.
                'Upgrade-Insecure-Requests': '1'
            })

            session.get(signup_url,
                        headers={
                            "Host": "www.jianlika.com",
                        },
                        proxies=self.proxies,
                        timeout=request_timeout)
            verify_code = parse_check_code(
                session,
                "http://www.jianlika.com/Verify/index.html?%s" %
                random.randint(1000000000000000, 9000000000000000),
                "jianlika",
                self.proxies,
                headers={
                    "Accept": "image/webp,image/*,*/*;q=0.8",
                    "Host": "www.jianlika.com",
                    "Referer": "http://www.jianlika.com/Signup/email.html",
                },
                typeid=3050)
            # No hard-coded Content-Length: the body length depends on the
            # email, password and captcha, and requests computes the header
            # from the encoded body itself.
            response = session.post(
                signup_url,
                headers={
                    "Host": "www.jianlika.com",
                    "Origin": "http://www.jianlika.com",
                    "Referer": "http://www.jianlika.com/Signup/email.html",
                },
                data={
                    "email": self.username,
                    "pwd": register_password,
                    "repwd": register_password,
                    "verifycode": verify_code,
                    "invitecode": "",
                    "agree": "on"
                },
                proxies=self.proxies,
                timeout=request_timeout)
            response.encoding = 'utf-8'
            if '您今天注册次数已超限' in response.text:
                raise Exception('REGISTER_OVERLOAD')
            if '验证码不正确' in response.text:
                if try_times > 5:
                    raise Exception('CHECKCODE_FAIL')
                try_times += 1
                continue
            if '此邮箱已被使用' in response.text:
                return 'REGISTERED!'

            # Raised explicitly (same type as the former ``assert``) so the
            # check is not stripped under ``python -O``.
            if '邮件已发送至' not in response.text:
                raise AssertionError(
                    '--unknown registered page---\n%s\n--unknown registered page---'
                    % response.text)
            break

        email_link = self.fetch_email()

        response = session.get(email_link,
                               headers={
                                   'Host': 'www.jianlika.com',
                               },
                               proxies=self.proxies,
                               timeout=request_timeout)
        response.encoding = 'utf-8'
        if '此邮箱不存在' in response.text:
            raise Exception('REGISTER_TOO_LATE')
        if '此邮箱不需要激活' in response.text:
            logger.info('此邮箱不需要激活')
        if '邮箱验证成功' in response.text:
            logger.info('邮箱验证成功')
        return register_password