def visit_article(self):
    """Fetch one page of a Toutiao user's article list and return the parsed JSON.

    Hits the toutiao.com article-list endpoint for the hard-coded user id
    56654489607 through the shared ``self.session``.  The ``as``/``cp``/
    ``_signature`` values are captured anti-scraping tokens and are presumably
    time-limited — TODO confirm they are still accepted before relying on this.

    Returns:
        dict: the JSON payload decoded from the response body.
    """
    # 13-digit (millisecond) timestamp; the API uses it as the paging cursor.
    t = get_13_time()
    url = 'https://www.toutiao.com/c/user/article/'
    headers = {'Referer': 'https://www.toutiao.com/c/user/56654489607/'}
    params = {
        'page_type': '1',
        'user_id': '56654489607',
        'max_behot_time': t,
        'count': '20',
        'as': 'A105DCF003A4247',
        'cp': '5C035412A4A72E1',
        '_signature': 'aSGH2hAYNSY-Fs-BeDJWyWkhh8',
    }
    resp = self.session.get(url, headers=headers, params=params)
    # The original discarded the decoded JSON and printed an empty line;
    # return the payload instead so callers can actually use it.
    return resp.json()
def parse_list(self, response):
    """Schedule the paginated job-list JSON requests for one company.

    Reads ``companyId`` from ``response.meta`` (populated upstream) and yields
    one request per result page (pages 0-94) against liepin.com's
    ``sojob.json`` endpoint, forwarding the meta dict on to ``self.parse_job``.
    """
    next_meta = response.meta
    company_id = next_meta['companyId'].strip()
    # print(companyId, response.text)

    # Debug trace of which company we are paging through.  These values do
    # not change per page, so print them once instead of 95 times.
    print(next_meta['ticker'])
    print(next_meta['company_name'])

    # Loop-invariant request data: build once instead of on every pass.
    headers = {
        'referer': 'https://www.liepin.com/ajaxproxy.html'
    }
    # NOTE(review): captured session cookies — presumably expire; confirm
    # they are still valid before running.
    cookies = {
        '__uuid': '1550017147980.22',
        '_uuid': 'E4361B46FFA8441973EC46E6488BD983',
        'is_lp_user': '******',
        'need_bind_tel': 'false',
        'new_user': '******',
        'c_flag': 'f57e19ed294147b87179e4e6132477f5',
        'imClientId': '45e417dd37f82ac674cdcbb355984626',
        'imId': '45e417dd37f82ac6a36687782a0c1c67',
        'imClientId_0': '45e417dd37f82ac674cdcbb355984626',
        'imId_0': '45e417dd37f82ac6a36687782a0c1c67',
        'gr_user_id': '374534ce-aa54-4880-88ca-7a7bb7adf340',
        'bad1b2d9162fab1f80dde1897f7a2972_gr_last_sent_cs1': '463d81f04fd219c61a667e00ad0d9493',
        'grwng_uid': 'f3fda8f8-0c2e-4f29-8507-f42f7a9671ec',
        'fe_work_exp_add': 'true',
        'ADHOC_MEMBERSHIP_CLIENT_ID1.0': 'fa804ff0-2a02-3f31-8dcb-8e13b527dfcb',
        'bad1b2d9162fab1f80dde1897f7a2972_gr_cs1': '463d81f04fd219c61a667e00ad0d9493',
        '__tlog': '1550383052778.97%7C00000000%7C00000000%7C00000000%7C00000000',
        '_mscid': '00000000',
        'Hm_lvt_a2647413544f5a04f00da7eee0d5e200': '1550233873,1550279247,1550281552,1550383053',
        'abtest': '0',
        '_fecdn_': '0',
        '__session_seq': '2',
        '__uv_seq': '2',
        'Hm_lpvt_a2647413544f5a04f00da7eee0d5e200': '1550383074'
    }

    for page in range(95):
        # `_` is a millisecond-timestamp cache buster.
        t = get_13_time()
        url = (f'https://www.liepin.com/company/sojob.json?pageSize=15'
               f'&curPage={page}&ecompIds={company_id}&dq=&publishTime='
               f'&keywords=&_={t}')
        yield scrapy.Request(url, callback=self.parse_job, meta=next_meta,
                             headers=headers, cookies=cookies)
def _visit_queryOrderWaitTime(self):
    """Poll 12306's order-wait endpoint.

    Returns:
        bool: True once the response carries a non-empty order id
        (ticket issued), otherwise False.
    """
    endpoint = 'https://kyfw.12306.cn/otn/confirmPassenger/queryOrderWaitTime'
    query = {
        'random': get_13_time(),
        'tourFlag': 'dc',
        '_json_att': '',
        'REPEAT_SUBMIT_TOKEN': self.globalRepeatSubmitToken,
    }
    payload = self.session.get(endpoint, params=query).json()
    order_id = payload['data']['orderId']
    if not order_id:
        # No ticket yet; report the server's estimated wait time.
        print('queryOrderWaitTime 还没有出票, 预计等待时间:', payload['data']['waitTime'])
        return False
    print('queryOrderWaitTime 出票成功, id:', order_id)
    return True
def _visit_captcha_check(self):
    """Submit the solved captcha to 12306's captcha-check endpoint.

    On success (``result_code == '4'``) records the answer in
    ``self.vcode_answer`` and returns True; otherwise returns False.
    """
    answer = self._get_vcode()
    query = {
        # 'callback': 'jQuery1910292836177545567_1542355491387',
        'answer': answer,
        'rand': 'sjrand',
        'login_site': 'E',
        '_': get_13_time(),
    }
    # Raises if the body is not valid JSON.
    payload = self.session.get(
        'https://kyfw.12306.cn/passport/captcha/captcha-check',
        params=query,
    ).json()
    if payload['result_code'] != '4':
        print('验证码失败:', self.vcode)
        return False
    print('验证码成功')
    self.vcode_answer = answer
    return True
def _visit_captcha_image64(self):
    """Download the 12306 login captcha image.

    Returns:
        bytes: the raw image, base64-decoded from the JSON response.
    """
    import base64

    millis = get_13_time()
    query = {
        'login_site': 'E',
        'module': 'login',
        'rand': 'sjrand',
        # The millisecond timestamp itself appears as a bare query key.
        millis: '',
        # 'callback': 'jQuery1910292836177545567_1542355491387',
        '_': str(int(millis) - random.randint(1800, 2500)),
    }
    # Must go through self.session so the captcha is tied to our cookies.
    payload = self.session.get(
        'https://kyfw.12306.cn/passport/captcha/captcha-image64',
        params=query,
    ).json()
    return base64.b64decode(payload['image'])
import json
import scrapy
import re
from datetime import datetime
import pandas as pd
import time
from common.util import get_13_time

# NOTE(review): module-level timestamp captured at import time; nothing in the
# code visible here reads it — confirm whether other parts of the file use it.
t = get_13_time()

from liepinSpecialComJob.items import LiepinspecialcomjobItem


class LiepinSpdier(scrapy.Spider):
    # Spider over a fixed set of liepin.com company "VIP" pages.
    name = 'liepin'
    # Company landing pages; each embeds a CONFIG JS blob with the companyId.
    start_urls = ['https://vip.liepin.com/883905/1405577359643.shtml',
                  'https://vip.liepin.com/8161070/joblist.shtml',
                  # 'http://maker.haier.net/custompage/socialchannel/index.html?platformcode=lp',
                  'https://vip.liepin.com/7855333/joblist.shtml',
                  'https://vip.liepin.com/8090130/1409730340536.shtml',
                  'https://vip.liepin.com/8399212/joblist.shtml',
                  'https://vip.liepin.com/1198424/joblist2.shtml',
                  'https://vip.liepin.com/8787971/joblist.shtml',
                  'https://vip.liepin.com/8796178/joblist2.shtml',
                  'https://vip.liepin.com/8091337/1426475303042.shtml',
                  'https://vip.liepin.com/7904788/job.shtml',
                  ]

    def parse(self, response):
        """Extract the company name and numeric companyId from a VIP page.

        NOTE(review): the method as visible here ends after the two regex
        extractions and neither yields nor returns — it may be truncated in
        this view; confirm against the full file.
        """
        text = response.text
        # Page <title> has the form '<company> - 猎聘网招聘官网...'
        company_name = re.search(r'<title>(.*?) - 猎聘网招聘官网', text).group(1)
        # Inline JS on the page: CONFIG={"companyId":"<digits>"}
        companyId = re.search(r'CONFIG={"companyId":"([0-9]+)"}', text).group(1)