def __login(self, session, check_code=None):
    """Log in to www.818cv.com by POSTing the account credentials.

    :param session: session object used for the POST; handed back on success.
    :param check_code: optional path (appended to the site root) of a captcha
        image. When given, it is solved with ``parse_check_code`` and sent as
        the ``verify`` form field; otherwise ``verify`` is empty.
    :return: ``(True, session)`` on success,
        ``(False, "PROXY_FAIL!")`` when the request still fails on the sixth
        attempt, or ``(False, "ACCOUNT_ERROR!")`` when the site reports a wrong
        account or password.
    """
    url = "http://www.818cv.com/"
    self.headers["Content-Type"] = "application/x-www-form-urlencoded"
    self.headers["Origin"] = "http://www.818cv.com"

    verify = ""
    if check_code:
        err_url = "http://www.818cv.com" + check_code
        verify = parse_check_code(session, err_url, 'lie8',
                                  proxies=self.proxies)

    form = {
        "username": self.username,
        "password": self.password,
        "verify": verify
    }

    try_times = 0
    while True:
        try_times += 1
        try:
            response = session.post(url, data=form, headers=self.headers,
                                    timeout=30, proxies=self.proxies)
            # Explicit check instead of ``assert`` so the retry logic still
            # works when Python runs with -O.
            if not response or response.status_code != 200:
                raise ValueError("unexpected login response: %r" % response)
        except Exception:
            # ``except Exception`` (not a bare ``except``) so Ctrl-C and
            # SystemExit are not swallowed by the retry loop.
            logger.warning(
                'fetching url %s headers %s with %s fail:\n%s' %
                (url, self.headers, self.proxies, traceback.format_exc()))
            if try_times > 5:
                return False, "PROXY_FAIL!"
            time.sleep(random.uniform(3, 5))
        else:
            break

    if "您输入的帐号或者密码不正确,请重新输入。" in response.text:
        # The password is deliberately kept out of the log.
        logger.warning("LOGIN WITH username=%s WRONG (password not logged)",
                       self.username)
        return False, "ACCOUNT_ERROR!"
    return True, session
def __fetch_contact(session, resume_id, user_name, user_password, proxies=None):
    """Buy the contact details of a ganji resume (if needed) and upload it.

    Steps: open the VIP page, log in again when the login form is shown,
    request the resume download, confirm the purchase when the resume has not
    been downloaded before, fetch the resume page, remove
    ``session.temp_folder`` and pass the page to ``upload``.

    :param session: session used for every request; must expose a
        ``temp_folder`` attribute where debug copies of responses are written.
    :param resume_id: ganji resume id used to build the resume/download URLs.
    :param user_name: account name used when a re-login is required.
    :param user_password: account password used when a re-login is required.
    :param proxies: accepted for interface compatibility (see note below).
    :return: the value returned by ``upload(resume, 'ganji', get_contact=True)``.
    :raises Exception: after more than ten failed login attempts, when the
        account cannot afford the resume, or when the resume was deleted.
    :raises AssertionError: when a response is not HTTP 200 or a page lacks an
        expected marker.
    """
    user_agent = nautil.user_agent()
    # NOTE(review): the caller-supplied ``proxies`` value is discarded here, so
    # all requests below are sent without explicit proxies. Kept exactly as in
    # the original; confirm that this is intentional.
    proxies = None

    def _request(method, url, headers=None, data=None):
        """Send one GET/POST through ``session`` and return the decoded body."""
        logger.info('------\nRequesting %s On %s With Data:\n%s\n------' %
                    (method, url, data))
        time.sleep(random.uniform(1, 2))
        assert method in ('get', 'post')
        request_headers = {"User-Agent": user_agent}
        request_headers.update(headers or {})
        if method == 'get':
            response = session.get(url, headers=request_headers,
                                   proxies=proxies, params=data)
        else:
            response = session.post(url, headers=request_headers,
                                    proxies=proxies, data=data)
        assert response
        assert response.status_code == 200
        response.encoding = 'utf-8'
        return response.text

    def _dump(file_name, content):
        """Write a debug copy of ``content`` into the session temp folder."""
        # ``with`` closes the handle; the original left it to the GC.
        with open(os.path.join(session.temp_folder, file_name), 'w') as handle:
            handle.write(content)

    main_page = _request('get', 'http://www.ganji.com/vip')
    if '赶集用户登录' in main_page:
        logger.info('cookie fail, try login')
        logger.info('re-login')

        hash_match = re.search(
            '''window.PAGE_CONFIG.__hash__ = '([^']*)';''', main_page)
        assert hash_match
        hash_value = hash_match.group(1)
        logger.info('login hash_value:%s' % hash_value)

        check_code_match = re.search(
            '''<img[^>]*id="login_img_checkcode"[^>]*src=['"]*([^'"]*)['"]*[^>]*>''',
            main_page)
        assert check_code_match
        check_code_url = check_code_match.group(1)
        logger.info('login check_code_url:%s' % check_code_url)

        time_stamp = str(int(time.time() * 1000))
        counter = 0
        while True:
            counter += 1
            if counter > 10:
                raise Exception('try too many times to login')
            login_result = _request(
                'get',
                'https://passport.ganji.com/login.php',
                headers={
                    "Host": "passport.ganji.com",
                    "Referer": "https://passport.ganji.com/login.php?next=/",
                    "X-Requested-With": "XMLHttpRequest",
                    "Connection": "keep-alive"
                },
                data={
                    "callback": "jQuery1820229177205394230_%s" % time_stamp,
                    "username": user_name,
                    "password": user_password,
                    # The captcha is solved again on every attempt.
                    "checkCode": parse_check_code(session, check_code_url,
                                                  'ganji', proxies),
                    "setcookies": "14",
                    "second": "",
                    "parentfunc": "",
                    "redirect_in_iframe": "",
                    "next": '/',
                    "__hash__": hash_value,
                    "_": time_stamp
                })
            _dump('login_result.html', login_result)
            if 'error_msg' not in login_result:
                break
            logger.warning('login fail with response:\n%s' % login_result)

    logger.info('trying to buy contact')
    message = _request(
        'get',
        'http://www.ganji.com/findjob/download_resume.php',
        headers={
            "Host": "www.ganji.com",
            "Referer": "http://www.ganji.com/jianli/%sx.htm" % resume_id,
            # Header values must be strings for current requests releases.
            "Upgrade-Insecure-Requests": "1",
        },
        data={
            "source": "detail",
            "resume_type": "0",
            "findjob_puid": resume_id,
            "job_postion": "",
            "callback": "show_contact",
            "is_batch_view_resume": 0
        })
    _dump('message.html', message)

    if '您已下载过该简历' not in message:
        if '简历下载数不足' in message:
            raise Exception('The Ganji Account Can Not Afford this Resumes')
        if '此帖子已删除' in message:
            raise Exception('The Ganji Resume Deleted')
        assert '确认查看' in message
        buy_url = 'http://www.ganji.com/findjob/download_resume.php?source=detail&resume_type=0&findjob_puid=%s&job_postion=&callback=show_contact&is_batch_view_resume=0' % resume_id
        tel_message = _request(
            'post',
            buy_url,
            headers={
                "Host": "www.ganji.com",
                "Origin": "http://www.ganji.com",
                "Referer": buy_url,
                "Upgrade-Insecure-Requests": "1",
            },
            data={"one_key_download_setting": 1})
        assert 'tel-code' in tel_message

    logger.info('buy contact done, try upload resume')
    resume = _request('get', 'http://www.ganji.com/jianli/%sx.htm' % resume_id)
    # The debug dumps are only removed on success; after a failure they stay
    # on disk in session.temp_folder.
    shutil.rmtree(session.temp_folder)
    return upload(resume, 'ganji', get_contact=True)
def __download_resume(session, url, proxies=None):
    """Download a single ganji resume page, answering the captcha if shown.

    :param session: the session used to download the resume
    :param url: absolute URL of the resume page (its host part is reused for
        the ``Host`` header and for the captcha URL)
    :param proxies: the proxies used for every request, captcha included
    :return: the resume page text, or ``None`` when the reply to the captcha
        says the page was not found/deleted or the resume is no longer
        looking for a job
    :raises Exception: ``"PROXY_FAIL!"`` once the sixth attempt has failed
    """
    timeout = 30
    host = url.split('/')[2]
    headers = {
        "User-Agent": nautil.user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": host,
        "Referer": "http://www.ganji.com/findjob/resume_list.php"
    }
    # Page markers, hoisted out of the retry loop.
    captcha_marker = u"您的访问速度太快了,如果您不是机器的话,输入下面的验证码来继续访问吧"
    blocked_marker = u"亲爱的用户,您访问的速度太快"
    gone_markers = (u"对不起!您要查看的页面没有找到或已删除",
                    u"对不起,该简历已停止找工作了~")

    # Long random pause before touching the site.
    time.sleep(random.uniform(30, 100))

    try_times = 0
    while True:
        try_times += 1
        try:
            resume_data = session.get(url, headers=headers, timeout=timeout,
                                      proxies=proxies)
            # Explicit raise instead of ``assert`` so the check survives -O;
            # it is handled by the retry branch below either way.
            if resume_data.status_code != 200:
                raise Exception("unexpected status %s" %
                                resume_data.status_code)
            resume_data.encoding = 'utf-8'
            resume = resume_data.text

            if blocked_marker in resume:
                raise Exception("ERROR_MESSAGE!")
            if captcha_marker not in resume:
                return resume

            # Captcha page: solve the image and post the answer back.
            verify_headers = {
                "User-Agent": headers["User-Agent"],
                "Accept-Encoding": "gzip, deflate, sdch",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Host": host,
                "Referer": url
            }
            img = pq(resume).find('.error').find('span').find('img').attr(
                'src')
            error_url = "http://" + host + img
            verify_code = parse_check_code(session, error_url, 'ganji',
                                           proxies)
            # ``proxies`` was missing here in the original, so the answer was
            # sent outside the proxy used for every other request.
            response = session.post(error_url, data=verify_code,
                                    headers=verify_headers, timeout=timeout,
                                    proxies=proxies)
            for marker in gone_markers:
                if marker in response.text:
                    return None
            if response.status_code != 200:
                raise Exception("unexpected status %s" % response.status_code)
            # Captcha accepted: loop round and fetch the resume again.
            # NOTE(review): this path is not limited by ``try_times``; if the
            # captcha keeps coming back the loop keeps going.
        except Exception:
            logger.warning('fetch %s with %s fail:\n%s' %
                           (url, proxies, traceback.format_exc()))
            if try_times > 5:
                raise Exception("PROXY_FAIL!")
            time.sleep(30)
def __get_resume_urls(session, urls, dedup, proxies=None):
    """Walk ganji resume-list pages and yield the text of each new resume.

    For every list URL the pages ``&page=0, 32, ... 608`` are fetched until a
    page has no "next page" link. Resume ids found on a page are filtered
    through ``dedup`` and each remaining resume is downloaded with
    ``__download_resume``.

    :param session: the session used for every request
    :param urls: iterable of resume-list URLs; ``&page=<offset>`` is appended
    :param dedup: callable that receives the resume ids found on a page and
        returns the ids that still have to be downloaded
    :param proxies: the proxies used for every request, captcha included
    :return: generator yielding resume page text; stops for good once the
        300th candidate id is reached (at most 299 downloads)
    :raises Exception: ``"PROXY_FAIL!"`` when a page keeps failing, when the
        captcha had to be attempted too often, or when more than five pages
        of one list URL bring no id that was missing from the previous page
    """
    timeout = 30
    resume_limit = 300
    resume_counter = 0
    headers = {
        "User-Agent": nautil.user_agent(),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": "www.ganji.com",
        "Referer": "http://www.ganji.com/findjob/resume_index.php"
    }
    captcha_marker = u"您的访问速度太快了,如果您不是机器的话,输入下面的验证码来继续访问吧"
    blocked_marker = u"亲爱的用户,您访问的速度太快"

    for list_url in urls:
        proxy_error_counter = 0
        # Same start value as the original: the set of the characters
        # 'N', 'o', 'n', 'e', which never equals a set of real ids.
        last_resume_ids = set("None")

        for page in xrange(0, 640, 32):
            time.sleep(random.uniform(30, 100))
            url = list_url + '&page=%s' % page
            try_times = 0
            try_parse_times = 0

            while True:
                try_times += 1
                try:
                    logger.warning('fetching %s with %s' % (url, proxies))
                    response = session.get(url, headers=headers,
                                           timeout=timeout, proxies=proxies)
                    # Explicit raises instead of ``assert`` (survive -O);
                    # both are handled by the retry branch below.
                    if response.status_code != 200:
                        raise Exception("unexpected status %s" %
                                        response.status_code)
                    if blocked_marker in response.text:
                        raise Exception("ERROR_MESSAGE!")
                    if captcha_marker in response.text:
                        verify_headers = {
                            "User-Agent": headers["User-Agent"],
                            "Accept-Encoding": "gzip, deflate, sdch",
                            "Accept-Language": "zh-CN,zh;q=0.8",
                            "Host": "www.ganji.com",
                            "Referer": url
                        }
                        img = pq(response.text).find('.error').find(
                            'span').find('img').attr('src')
                        error_url = 'http://www.ganji.com' + img
                        try_parse_times += 1
                        verify_code = parse_check_code(session, error_url,
                                                       'ganji', proxies)
                        # ``proxies`` was missing here in the original.
                        data = session.post(error_url, data=verify_code,
                                            headers=verify_headers,
                                            timeout=timeout, proxies=proxies)
                        if data.status_code != 200:
                            raise Exception("unexpected status %s" %
                                            data.status_code)
                        # NOTE(review): the list page is not fetched again
                        # after the captcha is answered, so ``response`` is
                        # still the captcha page below. Confirm the intent.
                except Exception:
                    logger.warning('fetch %s with %s fail:\n%s' %
                                   (url, proxies, traceback.format_exc()))
                    if try_times > 5:
                        raise Exception("PROXY_FAIL!")
                    time.sleep(30)
                else:
                    if try_parse_times > random.randint(2, 5):
                        raise Exception("PROXY_FAIL!")
                    break

            response.encoding = 'utf-8'
            resume_ids_urls = {}
            entries = pq(
                response.content).find('.resume-list').find('div').find('dl')
            for entry in entries:
                href = pq(entry).find('a').attr('href')
                # Skip entries without a link or without a numeric id instead
                # of crashing on them.
                id_match = re.search(r'\d+', href or '')
                if id_match:
                    resume_ids_urls[id_match.group(0)] = href

            page_ids = set(resume_ids_urls.keys())
            if not last_resume_ids.difference(page_ids):
                # Every id of the previous page is on this page again.
                proxy_error_counter += 1
                if proxy_error_counter > 5:
                    raise Exception("PROXY_FAIL!")
            if resume_ids_urls:
                last_resume_ids = page_ids

            # Drop resumes that were already collected.
            for resume_id in dedup(resume_ids_urls.keys()):
                resume_counter += 1
                if resume_counter >= resume_limit:
                    # The original set a flag that made every remaining page
                    # and URL loop exit at once; returning is equivalent.
                    return
                resume = __download_resume(session,
                                           resume_ids_urls[resume_id],
                                           proxies=proxies)
                if resume:
                    yield resume

            if u'class="next">下一页</a>' not in response.text:
                break
def register(self):
    """Register ``self.username`` on www.jianlika.com and activate it by mail.

    A random password is generated (one lowercase letter, 3-6 letters or
    digits, then 2-5 digits). The sign-up form is submitted with a solved
    captcha, and the activation link obtained from ``self.fetch_email()`` is
    then opened.

    :return: the generated password, or ``'REGISTERED!'`` when the site
        reports that the e-mail address is already in use
    :raises Exception: ``'REGISTER_OVERLOAD'`` (daily limit reached),
        ``'CHECKCODE_FAIL'`` (captcha rejected too often) or
        ``'REGISTER_TOO_LATE'`` (activation page says the mailbox is unknown)
    :raises AssertionError: when the sign-up reply is none of the known pages
    """
    digits = '0123456789'
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # SystemRandom draws from the OS entropy source; the default Mersenne
    # Twister generator is not suitable for generating passwords.
    rng = random.SystemRandom()
    register_password = rng.choice(letters)
    register_password += ''.join(
        rng.choice(digits + letters) for _ in xrange(rng.randint(3, 6)))
    register_password += ''.join(
        rng.choice(digits) for _ in xrange(rng.randint(2, 5)))
    # NOTE(review): this writes the new password to the log in clear text;
    # kept as in the original, confirm it is acceptable.
    logger.info('random passwd %s' % register_password)

    try_times = 0
    while True:
        session = requests.Session()
        session.headers.update({
            "User-Agent": nautil.user_agent(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # Header values must be strings for current requests releases.
            'Upgrade-Insecure-Requests': '1'
        })
        session.get('http://www.jianlika.com/Signup/email.html',
                    headers={
                        "Host": "www.jianlika.com",
                    },
                    proxies=self.proxies)
        verify_code = parse_check_code(
            session,
            "http://www.jianlika.com/Verify/index.html?%s" %
            random.randint(1000000000000000, 9000000000000000),
            "jianlika",
            self.proxies,
            headers={
                "Accept": "image/webp,image/*,*/*;q=0.8",
                "Host": "www.jianlika.com",
                "Referer": "http://www.jianlika.com/Signup/email.html",
            },
            typeid=3050)
        # No hard-coded Content-Length: the fixed value "85" only matched one
        # particular body size, and requests derives the header from the
        # encoded form data.
        response = session.post(
            'http://www.jianlika.com/Signup/email.html',
            headers={
                "Host": "www.jianlika.com",
                "Origin": "http://www.jianlika.com",
                "Referer": "http://www.jianlika.com/Signup/email.html",
            },
            data={
                "email": self.username,
                "pwd": register_password,
                "repwd": register_password,
                "verifycode": verify_code,
                "invitecode": "",
                "agree": "on"
            },
            proxies=self.proxies)
        response.encoding = 'utf-8'
        page = response.text

        if '您今天注册次数已超限' in page:
            raise Exception('REGISTER_OVERLOAD')
        if '验证码不正确' in page:
            if try_times > 5:
                raise Exception('CHECKCODE_FAIL')
            try_times += 1
            # A fresh session is built for the retry; release this one.
            session.close()
            continue
        if '此邮箱已被使用' in page:
            return 'REGISTERED!'
        assert '邮件已发送至' in page, '--unknown registered page---\n%s\n--unknown registered page---' % page
        break

    email_link = self.fetch_email()
    response = session.get(email_link,
                           headers={
                               'Host': 'www.jianlika.com',
                           },
                           proxies=self.proxies)
    response.encoding = 'utf-8'
    if '此邮箱不存在' in response.text:
        raise Exception('REGISTER_TOO_LATE')
    if '此邮箱不需要激活' in response.text:
        logger.info('此邮箱不需要激活')
    if '邮箱验证成功' in response.text:
        logger.info('邮箱验证成功')
    return register_password