def test():
    task = {
        "area_name": u"天津",
        "job_code": "918",
        "area_code": "120000",
        "model_name": "zhilian",
        "job_name": u"其他"
    }
    cookie = {
        'code': 0,
        'cookie': {
            'SERVERID': '70356234b78238645df699ef52f30d81|1522806241|1522806241',
            'JSESSIONID': 'A3CF43A5891596C98EDE1C2D5E5BE76A'
        }
    }
    # Join the two session fields into a Cookie header value.
    cookie_str = 'SERVERID=' + cookie.get('cookie').get('SERVERID') + \
        ';JSESSIONID=' + cookie.get('cookie').get('JSESSIONID')
    # awake_one_task(task)
    # NOTE: page_numner is spelled this way to match the callee's keyword argument.
    print get_list(cookie=cookie_str, page_numner=1, task=task, proxy=settings.get_proxy())
    # print get_resume('1015973541', cookie_str, 'zhilian', {'http': 'http://47.93.115.141:3128', 'https': 'http://47.93.115.141:3128'})
    account = {'username': '******', 'passwd': 'jinqian4611', 'id': 805}
    set_unavaliable_account(account)
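# The SERVERID/JSESSIONID concatenation in test() recurs wherever a cookie dict from
# the account service is turned into a Cookie header. A minimal sketch of the same
# join as a reusable helper; the function name and key list are assumptions for
# illustration, not part of the original module:
def build_cookie_str(cookie_result, keys=('SERVERID', 'JSESSIONID')):
    """Join selected fields of an account-service cookie dict into a Cookie header value."""
    cookie_dict = cookie_result.get('cookie') or {}
    return ';'.join('%s=%s' % (k, cookie_dict.get(k, '')) for k in keys)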
def login(username, password):
    login_url = 'http://www.jianlika.com/Index/login.html'
    post_data = {'username': username, 'password': password, 'remember': 'no'}
    login_header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie': 'think_language=zh-CN; user_auth_sign=8rc593c82ogfc55938595vsc14; rememberUsername=18629947965; gift_hide_timeout=1',
        'Host': 'www.jianlika.com',
        'Origin': 'http://www.jianlika.com',
        'Referer': 'http://www.jianlika.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.post(url=login_url,
                             data=post_data,
                             headers=login_header,
                             proxies=settings.get_proxy())
    # After a successful login the response body is: '{"info":"","status":1,"url":"\\/Search"}'
    # print response.proxies
    print response.content
    print 'cookie: %s' % response.headers.get('Set-Cookie')
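# A hedged usage sketch for login() above: per the comment in login(), the endpoint
# answers with JSON such as '{"info":"","status":1,"url":"\\/Search"}', so a caller
# would check status and keep the Set-Cookie header for later requests. The helper
# name and the bare POST (no full header set) are illustrative assumptions only.
import json
import requests

def login_and_get_cookie(username, password, proxies=None):
    login_url = 'http://www.jianlika.com/Index/login.html'
    post_data = {'username': username, 'password': password, 'remember': 'no'}
    response = requests.post(url=login_url, data=post_data, proxies=proxies)
    result = json.loads(response.content)
    if result.get('status') != 1:
        return None  # login rejected; result.get('info') may hold a reason
    return response.headers.get('Set-Cookie')  # reuse on subsequent requests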
def get_list():
    # task = {"area_name": u"天津", "job_code": "918", "area_code": "120000", "model_name": "zhilian", "job_name": u"其他"}
    task = {
        "area_name": u"北京",
        "job_code": "228",
        "area_code": "110000",
        "model_name": "zhilian",
        "job_name": u"销售主管"
    }
    # NOTE: relies on a module-level cookie_str (see test() above).
    print resume_fenjianli.get_list(cookie=cookie_str,
                                    page_numner=1,
                                    params=task,
                                    proxy=settings.get_proxy())
def main():
    logger = utils.get_logger()
    redis_client = redis.Redis(host=common_settings.REDIS_IP,
                               port=common_settings.REDIS_PORT,
                               db=1)
    delete_before_account_inredis(redis_client)
    cookie_result = get_chinahr_cookie_all()
    proxy = settings.get_proxy()
    start_count = 0
    if cookie_result['code']:
        logger.info('no cookie retrieved.')
        return
    for cookie in cookie_result['cookie']:
        # Skip the first start_count cookies (resume point).
        if start_count:
            start_count -= 1
            continue
        username = cookie.get('userName', '')
        password = cookie.get('password', '')
        if not cookie.get('cookie', ''):
            logger.info('not get uid in cookie:' + str(cookie))
            url = 'http://172.16.25.41:8002/acc/invalidCookie.json?userName=%s&password=%s' % (
                username, password)
            requests.get(url)
            continue
        # continue
        logger.info('start to deal with :' + username)
        try:
            cookie_dict = json.loads(cookie.get('cookie', '{}'))
        except Exception, e:
            logger.info('error when json cookie:' + username + ' ' + password)
            continue
        uid_list = uid_re.findall(cookie_dict.get('bps', ''))
        device_id = json.loads(cookie.get('extraContent', '{}')).get('device_id', '')
        if not uid_list:
            logger.info('not get uid in cookie:' + str(cookie))
            url = 'http://172.16.25.41:8002/acc/invalidCookie.json?userName=%s&password=%s' % (
                username, password)
            requests.get(url)
            # break
            continue
        uid = uid_list[0]
        vote_result = vote(uid, proxy, device_id, cookie_dict)
        if not vote_result:
            logger.info('vote failed:' + str(cookie))
            redis_client.set(
                time.strftime("%Y-%m-%d") + '_' + username + '_chinahr_0', 1000)
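# The invalidCookie.json call in main() is built twice by string interpolation. A
# small sketch of the same call as one helper (hypothetical name, same endpoint)
# that also URL-encodes the credentials, which the inline version skips:
import urllib
import requests

def report_invalid_cookie(username, password):
    url = 'http://172.16.25.41:8002/acc/invalidCookie.json?userName=%s&password=%s' % (
        urllib.quote_plus(username), urllib.quote_plus(password))
    return requests.get(url)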
def buy_thread():
    logger = utils.get_logger()
    logger.info('====================================================\nstart buy thread!!!')
    global emoji_pattern
    kafka_producer = None
    mns_client = None
    redis_client = None
    mysql_pool = PersistentDB(MySQLdb,
                              host=common_settings.MYSQL_HOST,
                              user=common_settings.MYSQL_USER,
                              passwd=common_settings.MYSQL_PASSWD,
                              db=common_settings.MYSQL_DOWNLOAD_DB,
                              port=common_settings.MYSQL_PORT,
                              charset='utf8')
    mysql_conn = mysql_pool.connection()
    mysql_cursor = mysql_conn.cursor()
    mysql_cursor.execute('update download_record set valid=0 where valid=1 and source=6')
    mysql_conn.commit()
    # proxy = {'http': 'http://*****:*****@proxy.abuyun.com:9020', 'https': 'http://*****:*****@proxy.abuyun.com:9020'}
    proxy = settings.get_proxy()
    task_list = []
    while True:
        try:
            # Refill the local task buffer from mysql; sleep when none are pending.
            while not task_list:
                task_number = mysql_cursor.execute(
                    'select * from download_record where valid=0 and source=6 order by updateTime desc limit 30')
                if not task_number:
                    logger.info('there has no avaliable task in mysql ,sleep!!!')
                    time.sleep(600)
                    mysql_conn = mysql_pool.connection()
                    mysql_cursor = mysql_conn.cursor()
                    continue
                task_list = list(mysql_cursor.fetchall())
                break
            task = task_list.pop()
            # Parse the optional extend_content column (task[8]) for job type / city.
            if task[8]:
                try:
                    extend_content = json.loads(task[8])
                    extend_content['emailJobType'] = extend_content.get('emailJobType', '')
                    extend_content['emailCity'] = extend_content.get('emailCity', '')
                except Exception, e:
                    logger.info('not find extend_content in task:' + str(task))
                    extend_content = {"emailJobType": "", "emailCity": ""}
            else:
                extend_content = {"emailJobType": "", "emailCity": ""}
            get_null_text = 3
            for i in xrange(get_null_text):
                # Pick an account that still has download beans left.
                get_account_tag = True
                while get_account_tag:
                    account = get_account_from_redis()
                    if account['code']:
                        logger.info('get error account:' + str(account))
                        continue
                    cookie = account['cookie']
                    uid = account['uid']
                    device_id = account['device_id']
                    get_download_bean_result = util.get_download_beans(cookie, uid, proxy, device_id)
                    if not get_download_bean_result['code'] and not get_download_bean_result['coin_number']:
                        logger.info('get a account whos bean number is 0:' + account['username'] +
                                    ' result:' + str(get_download_bean_result))
                        release_account_from_redis(account['account_key'], 0)
                    else:
                        get_account_tag = False
                resume_result = util.buy_resume(cookie, uid, str(task[1]),
                                                settings.get_proxy(), device_id)
                if resume_result['code'] in [0, '0']:
                    resume_uuid = uuid.uuid1()
                    try:
                        content = json.dumps(resume_result, ensure_ascii=False)
                        content = emoji_pattern.sub(r'', content)
                        sql = ('insert into resume_raw (source, content, createBy, trackId, createtime, '
                               'email, emailJobType, emailCity) values ("CH_HR", %s, "python", %s, now(), %s, %s, %s)')
                        sql_value = (content, resume_uuid, account['username'],
                                     extend_content['emailJobType'], extend_content['emailCity'])
                        mysql_cursor.execute(sql, sql_value)
                        mysql_conn.commit()
                        mysql_cursor.execute('select last_insert_id()')
                        save_mysql_ids = mysql_cursor.fetchall()
                        if not save_mysql_ids or not save_mysql_ids[0]:
                            logger.info('insert into mysql error!!!:' + sql + ' ' + str(sql_value))
                            raise Exception
                        save_mysql_id = save_mysql_ids[0][0]
                    except Exception, e:
                        logger.info('mysql error:' + str(traceback.format_exc()))
                        time.sleep(60)
                        continue
                    kafka_data = {
                        "channelType": "APP",
                        "content": {
                            "content": content,
                            "id": save_mysql_id,
                            "createBy": "python",
                            "createTime": int(time.time() * 1000),
                            "ip": '',
                            "resumeSubmitTime": '',
                            "resumeUpdateTime": '',
                            "source": "CH_HR",
                            "trackId": str(resume_uuid),
                            "avatarUrl": '',
                            "email": account['username'],
                            'emailJobType': extend_content['emailJobType'],
                            'emailCity': extend_content['emailCity'],
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType": "RESUME_INBOX",
                        "externalInfo": "BUY",
                        "source": "CH_HR",
                        "trackId": str(resume_uuid),
                    }
                    if common_settings.SAVE_TYPE:
                        try:
                            if common_settings.SAVE_TYPE == 'kafka':
                                kafka_producer.produce(json.dumps(kafka_data))
                            elif common_settings.SAVE_TYPE == 'mns':
                                # gzip + base64 the payload before queueing it on MNS.
                                buf = StringIO()
                                f = gzip.GzipFile(mode='wb', fileobj=buf)
                                f.write(json.dumps(kafka_data))
                                f.close()
                                msg_body = base64.b64encode(buf.getvalue())
                                msg = Message(msg_body)
                                for send_message_count in range(common_settings.MNS_SAVE_RETRY_TIME):
                                    try:
                                        mns_client = get_mns_client()
                                        mns_client.send_message(msg)
                                        break
                                    except Exception, e:
                                        logger.info('error when mns send message, time:' +
                                                    str(send_message_count) + ':' + str(e))
                                else:
                                    raise Exception
                            else:
                                logger.info('did not support save type:' + common_settings.SAVE_TYPE)
                        except Exception, e:
                            logger.info('get error when produce data to ' + common_settings.SAVE_TYPE +
                                        ', exit!!!' + str(traceback.format_exc()))
def awake_one_task(task):
    logger = utils.get_logger()
    global citys_dict, emoji_pattern
    has_find_count = 0
    not_find_count = 0
    redis_client = get_redis_client()
    kafka_client = None
    kafka_producer = None
    mns_client = None
    result = {'code': 0}
    user_now = {}
    logger.info('start to get data ' + str(task))
    mysql_error_time = 10
    list_page = int(task.get('page_now', 0))
    # Start working through the task.
    while list_page > 0:
        try:
            # Fetch an account; keep retrying until one is available.
            while not user_now:
                user_now = get_one_account_from_api()
                if not user_now['code']:
                    break
                else:
                    logger.info('get account failed:' + str(user_now))
                    user_now = {}
        except Exception, e:
            logger.info(str(traceback.format_exc()))
            continue
        time.sleep(1)
        # Start downloading list pages.
        download_day_str = str(time.localtime().tm_year) + '-' + str(
            time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
        download_day = datetime.datetime.today()
        while list_page >= 0:
            list_result = util.get_list(user_now, task, settings.get_proxy(), list_page)
            if list_result['code'] == 1:
                logger.info('get 800 code, to change accounts!!!')
                user_now = {}
                break
            elif list_result['code'] == 3:
                logger.info('get 100 code, to change accounts!!!')
                set_account_invalid(user_now['username'], user_now['passwd'])
                user_now = {}
                break
            elif list_result['code']:
                logger.info('get error list ,continue!!!' + str(list_result))
                user_now = {}
                break
            logger.info('has get the list of ' + str(list_page))
            resume_list = list_result.get('data', {}).get('cvList', [])
            if len(resume_list) == 0 and list_page == 1:
                logger.info('not get resume in city:' + task['zone'] +
                            ' keyword:' + task['keyword'])
            # Loop over the detail pages referenced by this list page.
            for resume in resume_list:
                resume_key = 'chinahr_resume_' + resume.get('cvid', '')
                # Check redis for a download within the last RESUME_DELAY_DAYS days.
                try:
                    resume_download_time = redis_client.get(resume_key)
                    if resume_download_time:
                        datetime_last_download = datetime.datetime.strptime(
                            resume_download_time, '%Y-%m-%d')
                        if (download_day - datetime_last_download).days <= common_settings.RESUME_DELAY_DAYS:
                            logger.info('has find %s in redis' % (resume_key,) +
                                        ' and the city is:' + task['zone'])
                            has_find_count += 1
                            continue
                        else:
                            redis_client.set(resume_key, download_day_str)
                            logger.info('has find %s in redis, update ' % (resume_key,) +
                                        ' and the city is:' + task['zone'])
                    else:
                        redis_client.set(resume_key, download_day_str)
                        logger.info('not find %s in redis' % (resume_key,) +
                                    ' and the city is:' + task['zone'])
                except Exception as e:
                    logger.exception('get error when use redis.' + str(e))
                    # redis_client.set(resume_key, download_day_str)
                    raise e
                # Download the awake detail page.
                time.sleep(1)
                resume_result = util.get_resume(user_now, str(resume.get('cvid', '')),
                                                settings.get_proxy())
                # Save the detail page to mns and mysql.
                if resume_result['code'] in [0, '0']:
                    resume_uuid = uuid.uuid1()
                    try:
                        content = json.dumps(resume_result, ensure_ascii=False)
                        content = emoji_pattern.sub(r'', content)
                        sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity) values ("CH_HR", %s, "python", %s, now(), %s, %s, %s)'
                        sql_value = (content, resume_uuid, user_now['username'],
                                     task['keyword'], task['zone'])
                        kafka_data = {
                            "channelType": "APP",
                            "content": {
                                "content": content,
                                "createBy": "python",
                                "createTime": int(time.time() * 1000),
                                "ip": '',
                                "resumeSubmitTime": '',
                                "resumeUpdateTime": resume.get('refDate', ''),
                                "source": "CH_HR",
                                "trackId": str(resume_uuid),
                                "avatarUrl": '',
                                "email": user_now['username'],
                                'emailJobType': task['keyword'],
                                'emailCity': task['zone'],
                            },
                            "interfaceType": "PARSE",
                            "resourceDataType": "RAW",
                            "resourceType": "RESUME_SEARCH",
                            "source": "CH_HR",
                            "trackId": str(resume_uuid),
                            "callSystemID": common_settings.PROJECT_NAME,
                            "traceID": str(resume_uuid),
                        }
                        utils.save_data(sql, sql_value, kafka_data)
                        logger.info('the cvid is:' + resume.get('cvid', '') +
                                    ' the length of data is:' +
                                    str(len(kafka_data['content']['content'])))
                    except Exception, e:
                        logger.info('mysql error ' + str(mysql_error_time) +
                                    ' time:' + str(traceback.format_exc()))
                        mysql_error_time -= 1
                        if not mysql_error_time:
                            # return
                            logger.info('there has no mysql_error_time')
                        continue
                elif resume_result['code'] == 6:
                    user_now = {}
                    # list_page = -1
                    break
                elif resume_result['code'] == 8:
                    set_account_invalid(user_now['username'], user_now['passwd'])
                    user_now = {}
                    # list_page = -1
                    break
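# The redis de-duplication above (a key per resume holding the last download date,
# re-downloaded only after RESUME_DELAY_DAYS) is the same pattern the other crawlers
# below repeat inline. A minimal standalone sketch of that check; the helper name and
# injected client/window are assumptions for illustration:
import datetime

def should_download(redis_client, resume_key, delay_days):
    """Return True and stamp today's date if the resume was not fetched within delay_days."""
    today = datetime.datetime.today()
    last = redis_client.get(resume_key)
    if last:
        last_day = datetime.datetime.strptime(last, '%Y-%m-%d')
        if (today - last_day).days <= delay_days:
            return False  # downloaded recently; skip
    redis_client.set(resume_key, today.strftime('%Y-%m-%d'))
    return True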
def download_thread():
    logger = utils.get_logger()
    logger.info('=' * 50 + '\nstart main!!!')
    global numbers_left
    global sleep_tag
    global get_task_queue
    redis_client = get_redis_client()
    mysql_pool = PersistentDB(MySQLdb,
                              host=common_settings.MYSQL_HOST,
                              user=common_settings.MYSQL_USER,
                              passwd=common_settings.MYSQL_PASSWD,
                              db=common_settings.MYSQL_DOWNLOAD_DB,
                              port=common_settings.MYSQL_PORT,
                              charset='utf8')
    mysql_conn = mysql_pool.connection()
    mysql_cursor = mysql_conn.cursor()
    # proxy = {'http': 'http://*****:*****@proxy.abuyun.com:9020', 'https': 'http://*****:*****@proxy.abuyun.com:9020'}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    task_list = []
    mysql_cursor.execute(
        'select count(*) from download_record where updateTime>date(now()) and valid=2 and source=24')
    number_today = mysql_cursor.fetchall()[0][0]
    numbers_left = 0 if numbers_left < number_today else numbers_left - number_today
    # Main task loop.
    while True:
        task = None
        # If resumes must not be downloaded right now, sleep. (This gate is only used
        # for zhaopingou so far, unused here; sleep_tag is flipped by the heartbeat thread.)
        while not sleep_tag:
            logger.info('not the correct time to buy resume, wait.')
            time.sleep(3600)
        # If today's download quota is exhausted, sleep.
        if not numbers_left:
            logger.info('the number of today not left, sleep')
            time.sleep(1800)
            continue
        logger.info('the number left today is:' + str(numbers_left))
        # Pull a task from the in-memory queue; sleep a minute when none arrives.
        try:
            task = get_task_queue.get(timeout=10)
            if not task:
                logger.info('get None task, sleep!')
                time.sleep(600)
                continue
        except Exception, e:
            logger.info('not get task from queue, sleep.')
            time.sleep(60)
            continue
        # Parse the city / job-type info carried by the task.
        if task[8]:
            try:
                extend_content = json.loads(task[8])
                extend_content['emailJobType'] = extend_content.get('emailJobType', '')
                extend_content['emailCity'] = extend_content.get('emailCity', '')
            except Exception, e:
                logger.info('not find extend_content in task:' + str(task))
                extend_content = {"emailJobType": "", "emailCity": ""}
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start awake one task')
    global random_ids
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    logger.info('deal with:' + str(task))
    page_now = 1
    download_day = str(time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
    datetime_now = datetime.datetime.now()
    # Work through the task page by page.
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # Use one account per list page.
        account, cookie = get_one_account()
        # Download the list page.
        list_result = get_list(account['username'], page_now, task, proxy)
        if list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        # An empty list page means we have paged past the last page.
        if not list_result['data']:
            list_result['has_next_page'] = False
        logger.info('page number of now is ' + str(page_now))
        # Process the detail pages one by one.
        for resume_one in list_result['data']:
            resume, resume_update_time = resume_one
            datetime_update = datetime.datetime.strptime(resume_update_time, '%Y.%m.%d')
            if (datetime_now - datetime_update).days > 7:
                logger.info('resume updated more than 7 days ago, skip. ' + resume_update_time)
                list_result['has_next_page'] = False
                continue
            # Check redis to see whether it was already downloaded today.
            has_find_in_redis = False
            resume_key = 'youzi_resume_' + str(resume)
            try:
                resume_redis_value = redis_client.get(resume_key)
                if resume_redis_value == download_day:
                    has_find_in_redis = True
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)
            # Download the detail page; try three times, then skip this resume.
            for x in xrange(3):
                account, cookie = get_one_account()
                resume_result = get_resume(resume, account['username'], proxy=proxy)
                if resume_result['code']:
                    logger.info('get error resume:' + str(resume_result))
                    continue
                redis_client.set(resume_key, download_day)
                break
            else:
                continue
            # Save the resume detail page.
            resume_uuid = uuid.uuid1()
            try:
                content = json.dumps(
                    {
                        'name': '',
                        'email': '',
                        'phone': '',
                        'html': resume_result['data']
                    },
                    ensure_ascii=False)
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
                sql_value = (common_settings.SOURCE, content, resume_uuid,
                             str(account['username']), task['jobTitle'],
                             task['locationname'], str(resume))
                resume_update_time = ''
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": str(account['username']),
                        'emailJobType': task['jobTitle'],
                        'emailCity': task['locationname'],
                        'subject': str(resume)
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!' + str(traceback.format_exc()))
            time.sleep(1)
def get_detail():
    # list_result = {'total': 169, 'code': 0, 'data': [],
    #                'ids': [['1022347197', '2018-04-03'], ['1032771612', '2018-04-03'], ['1006918947', '2018-04-03'],
    #                        ['1045864491', '2018-04-03'], ['1047088655', '2018-04-03'], ['1025484341', '2018-04-03'],
    #                        ['1069702121', '2018-04-03'], ['1033655918', '2018-04-03'], ['1030018035', '2018-04-03'],
    #                        ['1003596638', '2018-04-03'], ['1033708572', '2018-04-02'], ['1026380481', '2018-04-02'],
    #                        ['1022050585', '2018-04-02'], ['1002563812', '2018-04-02'], ['1025279315', '2018-04-02'],
    #                        ['1035975978', '2018-04-02'], ['1031484197', '2018-04-02'], ['1070907357', '2018-04-02'],
    #                        ['1066977155', '2018-04-02'], ['1022591047', '2018-04-02'], ['1030550073', '2018-04-02'],
    #                        ['1033854943', '2018-04-02'], ['1031640132', '2018-04-02'], ['1070885063', '2018-04-02'],
    #                        ['1070729932', '2018-04-02'], ['1045212527', '2018-04-02'], ['1028664287', '2018-04-02'],
    #                        ['1070885595', '2018-04-02'], ['1025872345', '2018-04-02'], ['1067914985', '2018-04-02']]}
    list_result = {
        'total': 321,
        'code': 0,
        'data': [],
        'ids': [[u'1026367068', u'2018-04-03'], [u'1033060991', u'2018-04-03'],
                [u'1038441431', u'2018-04-03'], [u'1001363022', u'2018-04-03'],
                [u'1019659221', u'2018-04-03'], [u'1002660342', u'2018-04-03'],
                [u'1035984395', u'2018-04-03'], [u'1014826155', u'2018-04-03'],
                [u'1028142888', u'2018-04-03'], [u'1022642061', u'2018-04-03'],
                [u'1043983298', u'2018-04-03'], [u'1029731078', u'2018-04-03'],
                [u'1002851632', u'2018-04-03'], [u'1002354915', u'2018-04-03'],
                [u'1027887893', u'2018-04-03'], [u'1032848553', u'2018-04-03'],
                [u'1070807725', u'2018-04-03'], [u'1031349867', u'2018-04-03'],
                [u'1027071847', u'2018-04-03'], [u'1021747472', u'2018-04-03'],
                [u'1000155591', u'2018-04-03'], [u'1029013102', u'2018-04-03'],
                [u'1070737095', u'2018-04-03'], [u'1020231833', u'2018-04-03'],
                [u'1037316081', u'2018-04-03'], [u'1019132225', u'2018-04-03'],
                [u'1026149838', u'2018-04-03'], [u'1070834895', u'2018-04-03'],
                [u'1029253681', u'2018-04-03'], [u'1058461095', u'2018-04-02']]
    }
    for id_pair in list_result.get('ids'):
        # NOTE: relies on a module-level cookie dict (see test() above).
        detail_data = resume_fenjianli.get_resume(
            resume_id=id_pair[0],
            cookie='JSESSIONID=%s; huodong=fenjianli; hdflag=active; SERVERID=%s' % (
                cookie.get('cookie').get('JSESSIONID'),
                cookie.get('cookie').get('SERVERID')),
            model_name='zhilian',
            proxy=settings.get_proxy())
        # logger.info(json.dumps(detail_data, ensure_ascii=False))
        # resume_result = get_resume(resume[0], cookie, task['model_name'], proxy=proxy)
        resume_uuid = uuid.uuid1()
        try:
            sql = 'insert into spider_search.resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
            sql_value = ('RESUME_FEN',
                         json.dumps(detail_data['json'], ensure_ascii=False),
                         resume_uuid, '18629947965', '其他', '天津', str(id_pair[0]))
            resume_update_time = detail_data['json']['updateDate']
            kafka_data = {
                "channelType": "WEB",
                "content": {
                    "content": json.dumps(detail_data['json'], ensure_ascii=False),
                    "id": '',
                    "createBy": "python",
                    "createTime": int(time.time() * 1000),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": resume_update_time,
                    "source": 'RESUME_FEN',
                    "trackId": str(resume_uuid),
                    "avatarUrl": '',
                    "email": '18629947965',
                    'emailJobType': '其他',
                    'emailCity': '天津',
                    'subject': str(id_pair[0])
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_SEARCH",
                "source": 'RESUME_FEN',
                "trackId": str(resume_uuid),
                'traceID': str(resume_uuid),
                'callSystemID': 'python',
            }
            utils.save_data(sql, sql_value, kafka_data)
        except Exception, e:
            logger.info('get error when write mns, exit!!!' + str(traceback.format_exc()))
            # return
        time.sleep(1)
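# The resume_raw INSERT plus the kafka_data envelope above is rebuilt almost verbatim
# by every crawler in this file. A hedged sketch of that shared shape as one helper;
# the function name and defaults are assumptions, the fields are copied from the
# envelopes above:
import time
import uuid

def build_kafka_envelope(source, content, email, job_type, city, subject='',
                         resume_update_time='', resource_type='RESUME_SEARCH',
                         call_system_id='python'):
    """Return (track_id, envelope) in the shape passed to utils.save_data above."""
    track_id = str(uuid.uuid1())
    return track_id, {
        "channelType": "WEB",
        "content": {
            "content": content,
            "id": '',
            "createBy": "python",
            "createTime": int(time.time() * 1000),
            "ip": '',
            "resumeSubmitTime": '',
            "resumeUpdateTime": resume_update_time,
            "source": source,
            "trackId": track_id,
            "avatarUrl": '',
            "email": email,
            'emailJobType': job_type,
            'emailCity': city,
            'subject': subject,
        },
        "interfaceType": "PARSE",
        "resourceDataType": "RAW",
        "resourceType": resource_type,
        "source": source,
        "trackId": track_id,
        'traceID': track_id,
        'callSystemID': call_system_id,
    }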
def download_thread():
    logger = utils.get_logger()
    logger.info('=' * 50 + '\nstart main!!!')
    global numbers_left
    global sleep_tag
    redis_client = get_redis_client()
    mysql_pool = PersistentDB(MySQLdb,
                              host=common_settings.MYSQL_HOST,
                              user=common_settings.MYSQL_USER,
                              passwd=common_settings.MYSQL_PASSWD,
                              db=common_settings.MYSQL_DOWNLOAD_DB,
                              port=common_settings.MYSQL_PORT,
                              charset='utf8')
    mysql_conn = mysql_pool.connection()
    mysql_cursor = mysql_conn.cursor()
    mysql_cursor.execute('update download_record set valid=0 where valid=1 and source=24')
    mysql_conn.commit()
    # proxy = {'http': 'http://*****:*****@proxy.abuyun.com:9020', 'https': 'http://*****:*****@proxy.abuyun.com:9020'}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    # sqlite_conn = sqlite3.connect('zhaopingou.db')
    # sqlite_cursor = sqlite_conn.cursor()
    task_list = []
    mysql_cursor.execute(
        'select count(*) from download_record where updateTime>date(now()) and valid=2 and source=24')
    number_today = mysql_cursor.fetchall()[0][0]
    numbers_left -= number_today
    numbers_left = 0 if numbers_left < 0 else numbers_left
    while True:
        while not sleep_tag:
            logger.info('not the correct time to buy resume, wait.')
            time.sleep(3600)
        if not numbers_left:
            logger.info('the number of today not left, sleep')
            time.sleep(1800)
            continue
        logger.info('the number left today is:' + str(numbers_left))
        while not task_list:
            while not sleep_tag:
                logger.info('not the correct time to buy resume, wait.')
                time.sleep(3600)
            task_number = mysql_cursor.execute(
                'select * from download_record where valid=0 and source=24 order by updateTime desc limit 1')
            if not task_number:
                logger.info('there has no avaliable task in mysql ,sleep!!!')
                time.sleep(300)
                continue
            task_list = list(mysql_cursor.fetchall())
            break
        task = task_list.pop()
        if task[8]:
            try:
                extend_content = json.loads(task[8])
                extend_content['emailJobType'] = extend_content.get('emailJobType', '')
                extend_content['emailCity'] = extend_content.get('emailCity', '')
            except Exception, e:
                logger.info('not find extend_content in task:' + str(task))
                extend_content = {"emailJobType": "", "emailCity": ""}
        else:
            extend_content = {"emailJobType": "", "emailCity": ""}
        logger.info('start to deal with task:' + task[1])
        # download
        get_null_text_count = 0
        for charge_count in xrange(1):
            if task[6]:
                account, cookie = get_one_account_with_download_by(task[6])
            else:
                account, cookie = get_one_account(download=True)
            charge_result = charge_resume(task[1], cookie, proxy)
            # if charge_result['code'] == 9:
            #     set_forbidden_account(account)
            #     continue
            # if charge_result['code'] == 10:
            #     get_null_text_count += 1
            #     continue
            if charge_result['code']:
                set_unavaliable_account(account)
                continue
            break
        else:
            logger.info('get error after 1 try to charge resume')
            # if get_null_text_count == 1:
            try:
                mysql_cursor.execute('update download_record set valid=3 where id=%s' % str(task[0]))
                mysql_conn.commit()
                logger.info('get 3 null text of resume:' + task[1])
            except Exception, e:
                logger.info(str(traceback.format_exc()))
            continue
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start awake one task')
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    account, cookie = get_one_account()
    logger.info(str(cookie))
    logger.info('deal with:' + str(task))
    page_now = 1
    download_day = str(time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
    download_day_datetime = datetime.datetime.today()
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # if not account:
        account, cookie = get_one_account()
        list_result = get_list(cookie, page_now, task, proxy)
        # time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            account = None
            continue
        elif list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        logger.info('page number of now is ' + str(page_now) +
                    ' all number is:' + str(list_result['total']))
        for resume in list_result['ids']:
            # logger.info('sleep 5')
            # time.sleep(5)
            list_resume_update_time = datetime.datetime.strptime(resume[1], '%Y-%m-%d')
            if (download_day_datetime - list_resume_update_time).days > 7:
                logger.info("found a resume updated more than 7 days ago.")
                page_now = -1
                continue
            if not account:
                account, cookie = get_one_account()
            has_find_in_redis = False
            resume_key = 'fenjianli_resume_' + str(resume[0])
            try:
                resume_download_time = redis_client.get(resume_key)
                if resume_download_time == download_day:
                    has_find_in_redis = True
                else:
                    redis_client.set(resume_key, download_day)
            except Exception, e:
                redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)
            resume_result = get_resume(resume[0], cookie, task['model_name'], proxy=proxy)
            if resume_result['code'] == 1:
                set_unavaliable_account(account)
                account = None
                continue
            if resume_result['code'] == 3:
                logger.info('need set valid=0 of account:' + str(account))
            if resume_result['code']:
                logger.info('get error resume:' + str(resume_result))
                continue
            resume_uuid = uuid.uuid1()
            try:
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
                sql_value = (common_settings.SOURCE,
                             json.dumps(resume_result['json'], ensure_ascii=False),
                             resume_uuid, str(account['username']),
                             task['job_name'], task['area_name'], str(resume[0]))
                resume_update_time = resume_result['json']['updateDate']
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": json.dumps(resume_result['json'], ensure_ascii=False),
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": str(account['username']),
                        'emailJobType': task['job_name'],
                        'emailCity': task['area_name'],
                        'subject': str(resume[0])
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!' + str(traceback.format_exc()))
                # return
            time.sleep(1)
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start awake one task')
    global random_ids
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    logger.info('deal with:' + str(task))
    page_now = 1
    download_day = str(time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
    datetime_now = datetime.datetime.now()
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # if not account:
        account, cookie = get_one_account(download=True)
        # list_result = get_list_with_keyword(account['username'], page_now,
        #                                     task, proxy)
        # Use the cookie instead of random_id.
        list_result = get_list_with_keyword(cookie, page_now, task, proxy)
        # time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            continue
        elif list_result['code'] == 6:
            set_forbidden_account(account)
            continue
        if list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        if not list_result['data']:
            list_result['has_next_page'] = False
        logger.info('page number of now is ' + str(page_now))
        for resume_one in list_result['data']:
            resume, resume_update_time = resume_one
            datetime_update = datetime.datetime.strptime(resume_update_time, '%Y.%m.%d')
            if (datetime_now - datetime_update).days > 7:
                logger.info('resume is older than the 7-day limit, skip. ' + resume_update_time)
                list_result['has_next_page'] = False
                continue
            has_find_in_redis = False
            resume_key = 'youzi_all_resume_' + str(resume)
            try:
                resume_redis_value = redis_client.get(resume_key)
                if resume_redis_value:
                    # if resume_redis_value == download_day:
                    has_find_in_redis = True
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)
            for x in xrange(3):
                account, cookie = get_one_account(download=True)
                # resume_result = get_resume('20319214', account['username'], proxy=proxy)
                # resume_result = get_resume(resume, account['username'],
                #                            proxy=proxy)
                resume_result = get_resume(resume, cookie, proxy=proxy)
                # update_refresh_score(account)
                # if resume_result['code'] == 1:
                #     set_unavaliable_account(account)
                #     account = None
                #     # redis_client.delete(resume_key)
                #     continue
                # if resume_result['code'] == 7:
                #     set_forbidden_account(account)
                #     account = None
                #     # redis_client.delete(resume_key)
                #     continue
                if resume_result['code']:
                    logger.info('get error resume:' + str(resume_result))
                    # redis_client.delete(resume_key)
                    continue
                redis_client.set(resume_key, download_day)
                break
            else:
                continue
            resume_uuid = uuid.uuid1()
            try:
                content = json.dumps(
                    {
                        'name': '',
                        'email': '',
                        'phone': '',
                        'html': resume_result['data']
                    },
                    ensure_ascii=False)
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
                sql_value = (common_settings.SOURCE, content, resume_uuid,
                             str(account['username']), task['jobTitle'],
                             task['locationname'], str(resume))
                resume_update_time = ''
                # resume_update_time = resume_result['json']['updateDate']
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": str(account['username']),
                        'emailJobType': task['jobTitle'],
                        'emailCity': task['locationname'],
                        'subject': str(resume)
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!' + str(traceback.format_exc()))
                # return
            time.sleep(1)
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start awake one task')
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    logger.info('deal with:' + str(task))
    page_now = 1
    download_day = str(time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # Rotate to a fresh account for every list page.
        account, cookie = get_one_account()
        # Fetch the list page.
        list_result = get_list(cookie, page_now, task, proxy)
        # time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            continue
        elif list_result['code'] == 6:
            set_forbidden_account(account)
            continue
        elif list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        logger.info('page number of now is ' + str(page_now))
        # Process the resumes one by one.
        for resume_one in list_result['data']:
            resume, thirdid, srcid, list_content = resume_one
            has_find_in_redis = False
            resume_key = 'rencaia_resume_' + str(resume)
            real_rid = ''
            real_thirdid = ''
            # Check redis whether it was already downloaded today; skip if so.
            try:
                resume_redis_value = redis_client.get(resume_key)
                if resume_redis_value:
                    resume_redis_value_list = resume_redis_value.split('_')
                    if resume_redis_value_list[0] == download_day:
                        has_find_in_redis = True
                        real_rid = resume_redis_value_list[1]
                        real_thirdid = resume_redis_value_list[2]
                # else:
                #     pass
                # if resume_redis_value_list and resume_redis_value_list[0] == download_day:
                #     has_find_in_redis = True
                # else:
                #     real_rid = resume_redis_value_list[1]
                #     read_thirdid = resume_redis_value_list[2]
                # redis_client.set(resume_key, download_day)
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)
            # Fetch the resume; try three times, skip after three failures.
            for x in xrange(3):
                account, cookie = get_one_account()
                resume_result = get_resume(resume, thirdid, srcid, cookie, proxy=proxy)
                update_refresh_score(account)
                if resume_result['code'] == 1:
                    set_unavaliable_account(account)
                    account = None
                    continue
                if resume_result['code'] == 7:
                    set_forbidden_account(account)
                    account = None
                    continue
                if resume_result['code']:
                    logger.info('get error resume:' + str(resume_result))
                    continue
                # if u'存在被盗用的风险' in resume_result['data']:
                #     logger.info(u'find 存在被盗用的风险 in page:' + str(account))
                #     set_forbidden_account(account)
                #     account = None
                #     redis_client.delete(resume_key)
                #     continue
                # if u'该用户暂无求职意向,已在外网设置简历不公开' in resume_result['data']:
                #     # logger.info('un publish resume:'+str(resume)+ 'account:'+account['username'])
                #     # redis_client.delete(resume_key)
                #     logger.info('get not open resumed:' + str(account))
                #     # set_forbidden_account(account)
                #     continue
                redis_client.set(resume_key,
                                 '_'.join([download_day,
                                           str(resume_result['real_rid'] or real_rid),
                                           thirdid, srcid]))
                break
            else:
                continue
            # Save the data to mysql and mns.
            resume_uuid = uuid.uuid1()
            try:
                content = {'html': list_content}
                content['email'] = resume_result['charge_json'].get('mail')
                content['phone'] = resume_result['charge_json'].get('phone')
                content['name'] = resume_result.get('name')
                content = json.dumps(content, ensure_ascii=False)
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject, reason) values (%s, %s, "python", %s, now(), %s, %s, %s, %s, "list")'
                sql_value = (common_settings.SOURCE, content, resume_uuid,
                             str(account['username']), task['function_id_name'],
                             task['residence_name'], str(resume))
                resume_update_time = ''
                # resume_update_time = resume_result['json']['updateDate']
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": str(account['username']),
                        'emailJobType': task['function_id_name'],
                        'emailCity': task['residence_name'],
                        'subject': str(resume),
                        'reason': 'list',
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_INBOX",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!' + str(traceback.format_exc()))
                # return
            time.sleep(1)
def deal_task():
    logger = utils.get_logger()
    logger.info('=' * 50 + '\nstart deal thread')
    task_list = []
    conn, cur = get_mysql_client()
    change_account_number = 10
    account = {}
    while True:
        task = get_task()
        proxy = settings.get_proxy()
        # task = redis_client.get(task_list.pop())
        task_split = task[12].split('_')
        for x in xrange(5):
            if not change_account_number or not account:
                account, cookie = get_one_account()
            start_time = time.time()
            resume_result = get_resume(task_split[0], task_split[1], task_split[2],
                                       cookie, proxy=proxy)
            end_time = time.time()
            logger.info('once time cost:' + str(int(end_time - start_time)))
            # update_refresh_score(account)
            if resume_result['code'] == 1:
                set_unavaliable_account(account)
                account = None
                # redis_client.delete(resume_key)
                continue
            if resume_result['code'] == 7:
                set_forbidden_account(account)
                account = None
                # redis_client.delete(resume_key)
                continue
            if resume_result['code']:
                logger.info('get error resume:' + str(resume_result))
                # redis_client.delete(resume_key)
                account = None
                continue
            # if u'存在被盗用的风险' in resume_result['data']:
            #     logger.info(u'find 存在被盗用的风险 in page:' + str(account))
            #     set_forbidden_account(account)
            #     account = None
            #     redis_client.delete(resume_key)
            #     continue
            if 'data' not in resume_result:
                logger.info('not get data in resume result.')
                continue
            if u'该用户暂无求职意向,已在外网设置简历不公开' in resume_result['data']:
                # logger.info('un publish resume:'+str(resume)+ 'account:'+account['username'])
                # redis_client.delete(resume_key)
                logger.info('get not open resumed:' + str(account))
                # set_forbidden_account(account)
                continue
            resume_uuid = uuid.uuid1()
            try:
                sql = 'update rencaia_all_resume set status=2, info_content=%s, resumeid=%s, url=%s, phone=%s, email=%s, job_now=%s, ip=%s, account=%s where id=%s'
                sql_value = (resume_result['data'], resume_result['real_rid'],
                             resume_result.get('info_url'), resume_result.get('phone'),
                             resume_result.get('email'), resume_result.get('job_now'),
                             proxy['http'], account['username'], task[0])
                cur.execute(sql, sql_value)
                conn.commit()
                logger.info('save resume success: %s' % task[0])
                break
            except Exception, e:
                logger.info('get error when write mysql, exit!!!' + str(traceback.format_exc()))
                # return
                # time.sleep(1)
                # return
        else:
            logger.info('did not succeed after 5 tries to get resume of ' + str(task[0]))
            sql = 'update rencaia_all_resume set status=3 where id=%s'
            sql_value = (task[0],)
            cur.execute(sql, sql_value)
            conn.commit()
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start awake one task')
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    conn, cur = get_mysql_client()
    # account, cookie = get_one_account()
    # logger.info(str(cookie))
    logger.info('deal with:' + str(task))
    page_now = task['page_number']
    download_day = str(time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # Every ten pages, checkpoint the current page into the page_record file.
        if not page_now % 10:
            f = open('page_record', 'r')
            lines = f.readlines()
            f.close()
            start_info = json.loads(lines[0])
            start_info[task['residence_ids']] = page_now
            f = open('page_record', 'w')
            # f.write(str(task['residence_ids'])+'\n')
            # f.write(str(page_now))
            f.write(json.dumps(start_info, ensure_ascii=False))
            f.close()
        # if not account:
        proxy = settings.get_proxy()
        account, cookie = get_one_account()
        start_list_time = time.time()
        list_result = get_list(cookie, page_now, task, proxy)
        end_list_time = time.time()
        logger.info('list cost:' + str(int(end_list_time - start_list_time)) +
                    ' ' + str(len(list_result['data'])))
        # time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            logger.info('fail page : %s %s' % (task['residence_name'], page_now))
            # page_now -= 1
            continue
        elif list_result['code'] == 6:
            set_forbidden_account(account)
            logger.info('fail page : %s %s' % (task['residence_name'], page_now))
            # page_now -= 1
            continue
        elif list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        logger.info('page number of now is ' + str(page_now))
        # continue
        has_find_count = 0
        not_find_count = 0
        for resume_one in list_result['data']:
            resume, thirdid, srcid, list_content = resume_one
            # # logger.info('sleep 5')
            # # time.sleep(5)
            # # if not account:
            has_find_in_redis = False
            resume_key = 'rencaia_all_resume_' + str(resume)
            try:
                resume_redis_value = redis_client.get(resume_key)
                if resume_redis_value:
                    has_find_in_redis = True
                # else:
                #     pass
                # if resume_redis_value_list and resume_redis_value_list[0] == download_day:
                #     has_find_in_redis = True
                # else:
                #     real_rid = resume_redis_value_list[1]
                #     read_thirdid = resume_redis_value_list[2]
                # redis_client.set(resume_key, download_day)
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                has_find_count += 1
                # logger.info('has find %s in redis' % resume_key)
                continue
            else:
                not_find_count += 1
                # logger.info('not find %s in redis' % resume_key)
            # Store the list entry; the detail download happens later in deal_task().
            sql = 'insert into rencaia_all_resume (list_content, search_city, status, list_param, pageNo) values (%s, %s, %s, %s, %s)'
            sql_value = (list_content, task['residence_name'], '0',
                         '_'.join([resume, thirdid, srcid]), str(page_now))
            cur.execute(sql, sql_value)
            conn.commit()
            redis_client.set(resume_key, '_'.join([download_day, resume, thirdid, srcid]))
            # for x in xrange(15):
            #     #account, cookie = get_one_account()
            #     start_time = time.time()
            #     resume_result = get_resume(resume, thirdid, srcid, cookie, proxy=proxy)
            #     end_time = time.time()
            #     logger.info('once time cost:'+ str(int(end_time-start_time)))
            #     # update_refresh_score(account)
            #     if resume_result['code'] == 1:
            #         set_unavaliable_account(account)
            #         account = None
            #         # redis_client.delete(resume_key)
            #         continue
            #     if resume_result['code'] == 7:
            #         set_forbidden_account(account)
            #         account = None
            #         # redis_client.delete(resume_key)
            #         continue
            #     if resume_result['code']:
            #         logger.info('get error resume:'+str(resume_result))
            #         # redis_client.delete(resume_key)
            #         continue
            #     # if u'存在被盗用的风险' in resume_result['data']:
            #     #     logger.info(u'find 存在被盗用的风险 in page:'+str(account))
            #     #     set_forbidden_account(account)
            #     #     account = None
            #     #     redis_client.delete(resume_key)
            #     #     continue
            #     if 'data' not in resume_result:
            #         logger.info('not get data in resume result.')
            #         continue
            #     if u'该用户暂无求职意向,已在外网设置简历不公开' in resume_result['data']:
            #         # logger.info('un publish resume:'+str(resume)+ 'account:'+account['username'])
            #         # redis_client.delete(resume_key)
            #         logger.info('get not open resumed:'+str(account))
            #         # set_forbidden_account(account)
            #         continue
            #     break
            # else:
            #     continue
            # resume_uuid = uuid.uuid1()
            # try:
            #     sql = 'insert into rencaia_all_resume (list_content, info_content, resumeid, url, search_city, search_job, phone, email, job_now, ip, account) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
            #     sql_value = (list_content, resume_result['data'], resume, resume_result.get('info_url'), task['residence_name'], task['function_id_name'], resume_result.get('phone'), resume_result.get('email'), resume_result.get('job_now'), proxy['http'], account['username'])
            #     cur.execute(sql, sql_value)
            #     conn.commit()
            #     cur.execute('select last_insert_id()')
            #     save_mysql_ids = cur.fetchall()
            #     if not save_mysql_ids or not save_mysql_ids[0]:
            #         logger.info('insert into mysql error!!!:' + sql + ' ' + str(sql_value))
            #         continue
            #     save_mysql_id = save_mysql_ids[0][0]
            #     logger.info('save resume success: %s' % save_mysql_id)
            #     redis_client.set(resume_key, '_'.join([download_day, str(resume_result['real_rid'] or real_rid), thirdid, srcid]))
            # except Exception, e:
            #     logger.info('get error when write mysql, exit!!!'+str(traceback.format_exc()))
            #     # return
            # time.sleep(1)
            # # return
        page_now = page_now + 1 if list_result['has_next_page'] else -1
        logger.info('city : %s page : %s redis check : %s / %s ' %
                    (task['residence_name'], page_now, has_find_count, not_find_count))
def redis_seed():
    keys = r.keys()
    for code in keys:
        print('current code {}'.format(code))
        start_page = r.get(code)
        stop_flag = False
        # start_page = 1
        start_page = int(start_page)
        while 1:
            if stop_flag:
                print('finished one code')
                break
            # Fetch the list page, retrying with fresh proxies up to RETRY times.
            start = 1
            while start < RETRY:
                proxy = get_proxy()
                try:
                    response = requests.get(
                        'http://guba.eastmoney.com/list,{},f_{}.html'.format(code, start_page),
                        proxies=proxy, headers=headers, cookies=cookies,
                        verify=False, timeout=10)
                except Exception as e:
                    print(e)
                    start += 1
                else:
                    break
            if start == RETRY:
                continue
            text = response.text
            resp = Selector(text=text)
            detail = resp.xpath('//div[@id="articlelistnew"]/div[@class="articleh normal_post" or @class="articleh normal_post odd"]')
            # print('page {}'.format(start_page))
            c = 0
            for item in detail:
                c = c + 1
                read_count = item.xpath('.//span[1]/text()').extract_first()
                comment_count = item.xpath('.//span[2]/text()').extract_first()
                title = item.xpath('.//span[3]/a/@title').extract_first()
                author = item.xpath('.//span[4]/a/font/text()').extract_first()
                last_update = item.xpath('.//span[5]/text()').extract_first()
                next_url = 'http://guba.eastmoney.com' + item.xpath('.//span[3]/a/@href').extract_first()
                d = {}
                d['code'] = code
                d['title'] = title
                d['page'] = start_page
                d['count'] = c
                d['read_count'] = read_count
                d['author'] = author
                d['comment_count'] = comment_count
                d['last_update'] = last_update
                d['next_url'] = next_url
                d['crawltime'] = datetime.datetime.now()
                try:
                    # Upsert on next_url so re-crawls update instead of duplicating.
                    doc.update_one({'next_url': next_url}, {'$set': d}, True, True)
                except Exception as e:
                    print(e)
                start_ = 0
                start_page += 1
                r.set(code, start_page)
                # Fetch the post detail page with the same bounded retry.
                while start_ < RETRY:
                    try:
                        proxy = get_proxy()
                        response_detail = requests.get(next_url, headers=headers,
                                                       cookies=cookies, verify=False,
                                                       proxies=proxy, timeout=10)
                    except Exception as e:
                        print(e)
                        start_ += 1
                        continue
                    else:
                        break
                if start_ == RETRY:
                    continue
                resp_detail = response_detail.text
                detail_resp = Selector(text=resp_detail)
                zwfb_time = detail_resp.xpath('//div[@class="zwfbtime"]/text()').extract_first()
                if isinstance(zwfb_time, str):
                    zwfb_pattern = re.search('发表于 (.*?) ', zwfb_time)
                else:
                    continue
                if zwfb_pattern:
                    zwfb_time = zwfb_pattern.group(1)
                else:
                    print('post time not found')
                    zwfb_time = None
                # ISO-formatted date strings compare correctly as plain strings.
                if zwfb_time is not None and zwfb_time < END_DATE:
                    stop_flag = True
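# redis_seed() repeats the same bounded-retry fetch twice (list page and detail page).
# A hedged sketch of that pattern as one helper; the name, the proxy_factory callable,
# and the defaults are illustrative assumptions:
import requests

def fetch_with_retry(url, retries, proxy_factory, **kwargs):
    """Try a GET up to `retries` times, rotating proxies; return None if all attempts fail."""
    for attempt in range(retries):
        try:
            return requests.get(url, proxies=proxy_factory(), timeout=10, **kwargs)
        except Exception as e:
            print(e)
    return None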