class Task(AbsFetchTask):
    """Test task that logs in to www.bjgjj.gov.cn through a web driver.

    The login unit drives a browser via ``DriverRequestsCoordinator``; the
    fetch unit then parses the page captured after login and downloads one
    linked page with the requests session ``self.s``.
    """

    task_info = {
        'task_name': '测试selenium登录[fast]',
        'help': '测试selenium登录[fast]'
    }

    def _get_common_headers(self):
        """Return the HTTP headers used for every request of this task."""
        return {'User-Agent': USER_AGENT}

    def _prepare(self, data=None):
        """Run the base preparation and create the driver/session coordinator."""
        super()._prepare(data)
        # NOTE(review): presumably keeps the driver and ``self.s`` in sync
        # (cookies etc.) -- confirm against DriverRequestsCoordinator.
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create a web driver and open the site's home page once."""
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore='/PicCheckCode1/g')
        driver.get('http://www.bjgjj.gov.cn/')
        return driver

    def _setup_task_units(self):
        """Register the login unit and the fetch unit that depends on it."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _query(self, params: dict):
        """Answer a task query; ``t == 'vc'`` returns a new captcha image.

        Any other query type yields ``None``.
        """
        if params.get('t') == 'vc':
            return self._new_vc()
        return None

    def _new_vc(self):
        """Download a captcha image with the requests session.

        Returns:
            dict with ``cls``, the raw image ``content`` and ``content_type``.
        """
        # A millisecond timestamp is appended to the captcha URL.
        vc_url = VC_IMAGE_URL + str(int(time.time() * 1000))
        resp = self.s.get(vc_url)
        return dict(cls='data:image',
                    content=resp.content,
                    content_type=resp.headers.get('Content-Type'))

    def _params_handler(self, params: dict):
        """Fill missing credentials in ``params`` from the prepared meta.

        Nothing is filled when the task is starting without any params.
        """
        if not (self.is_start and not params):
            meta = self.prepared_meta
            for key in ('身份证号', '查询密码'):
                if key not in params:
                    params[key] = meta.get(key)
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop requirements whose value is already stored in the meta."""
        meta = self.prepared_meta
        # TODO: inspect ``details`` further
        return [
            pr for pr in param_requirements
            if not (pr['key'] in ('身份证号', '查询密码') and pr['key'] in meta)
        ]

    def _check_login_params(self, params):
        """Validate that all login parameters are present.

        Raises:
            AssertionError: when ``params`` is None or a key is missing.
                Raised explicitly (instead of via ``assert``) so that the
                validation still runs under ``python -O``.
        """
        if params is None:
            raise AssertionError('缺少参数')
        if '身份证号' not in params:
            raise AssertionError('缺少身份证号')
        if '查询密码' not in params:
            raise AssertionError('缺少查询密码')
        if 'vc' not in params:
            raise AssertionError('缺少验证码')
        # TODO: validate the ID number
        # TODO: validate the password
        # TODO: validate the captcha

    def _unit_login(self, params=None):
        """Log in with the given params or ask the caller for them.

        Returns normally on a successful login; otherwise raises
        ``AskForParamsError`` describing the required inputs together with
        the last validation/login error message (if any).
        """
        err_msg = None
        if params:
            try:
                self._check_login_params(params)
                username = params['身份证号']
                password = params['查询密码']
                vc = params['vc']
                self._do_login(username, password, vc)

                # Login succeeded: remember the credentials.
                self.result_key = username
                self.result_meta.update({
                    '身份证号': username,
                    '查询密码': password
                })
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # ``params`` defaults to None; guard so the ``.get`` calls below
        # cannot raise AttributeError instead of asking for parameters.
        params = params or {}
        raise AskForParamsError([
            dict(key='身份证号', name='身份证号', cls='input',
                 value=params.get('身份证号', '')),
            dict(key='查询密码', name='个人编号', cls='input',
                 value=params.get('查询密码', '')),
            dict(key='vc', name='验证码', cls='data:image',
                 query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login flow with the web driver.

        On success the page HTML and current URL are stored on ``self.g``
        for the fetch unit.

        Raises:
            InvalidParamsError: when the browser does not end up on the
                expected post-login URL.
        """
        with self.dsc.get_driver_ctx() as driver:
            # Open the login page.
            driver.get(LOGIN_PAGE_URL)

            # Wait until the ``lk`` element holds a number (set by a request
            # the page makes).
            WebDriverWait(driver, 10).until(
                value_is_number((By.XPATH, '//*[@id="lk"]')))
            for l in driver.get_log('browser'):
                print(l)

            # Switch to the "login by ID number" tab.
            driver.find_element_by_xpath(
                '/html/body/table[2]/tbody/tr[3]/td/table/tbody/tr/td/div/form/div[1]/ul/li[3]/a'
            ).click()

            username_input = driver.find_element_by_xpath('//*[@id="bh1"]')
            password_input = driver.find_element_by_xpath('//*[@id="mm1"]')
            vc_input = driver.find_element_by_xpath(
                '//*[@id="login_tab_2"]/div/div[3]/input')
            submit_btn = driver.find_element_by_xpath(
                '//*[@id="login_tab_2"]/div/div[4]/input[1]')

            # User name
            username_input.clear()
            username_input.send_keys(username)

            # Password
            password_input.clear()
            password_input.send_keys(password)

            # Captcha
            vc_input.clear()
            vc_input.send_keys(vc)

            # Submit
            submit_btn.click()

            if driver.current_url != 'http://www.bjgjj.gov.cn/wsyw/wscx/gjjcx-choice.jsp':
                # FIXME: debug output
                for l in driver.get_log('browser'):
                    print(l)
                print(driver.get_cookies())

                # FIXME: best-effort attempt to read the alert text.
                err_msg = '登录失败,请检查输入'
                try:
                    # Accessing the alert may itself raise when no alert is
                    # open, so it has to live inside the try block; any
                    # failure keeps the generic message.
                    err_msg = driver.switch_to.alert.text
                    # alert.accept()
                except Exception:
                    pass
                raise InvalidParamsError(err_msg)

            # Login succeeded: keep the page content for the fetch unit.
            self.g.login_page_html = driver.find_element_by_tag_name(
                'html').get_attribute('innerHTML')
            self.g.current_url = driver.current_url

    def _unit_fetch(self):
        """Parse the stored post-login page and fetch the second link on it.

        Raises:
            PreconditionNotSatisfiedError: wrapping an
                ``InvalidConditionError`` raised while fetching.
        """
        try:
            # TODO:
            soup = bs4.BeautifulSoup(self.g.login_page_html, 'html.parser')
            a = soup.select('a')[1]
            # The target is the first double-quoted token in ``onclick``.
            link = a.attrs['onclick'].split('"')[1]
            link = parse.urljoin(self.g.current_url, link)
            resp = self.s.get(link)

            self.result_data.update({
                'xxx': a.text,
                'link': link,
                'content': html.unescape(resp.text)
            })
            self.result_identity.update(
                {'task_name': self.task_info['task_name']})
        except InvalidConditionError as e:
            raise PreconditionNotSatisfiedError(e) from e
class Task(AbsFetchTask):
    """Guangzhou social-security task (gzlss.hrssgz.gov.cn).

    Logs in through a web driver and then collects the base information and
    per-month payment records of the insurance types with ``self.s``.
    """

    task_info = dict(
        city_name="广州",
        help="""
        <li>个人用户第一次忘记密码,需要到各办事窗口办理;在办事窗口补充完整相关信息(如电子邮箱地址)以后,忘记密码功能才能使用。</li>
        <li>由于目前缴费历史的查询量较多,为减轻广州社保系统压力,限制每人每天只能查询5次,敬请谅解!</li>
        """,
        developers=[{
            'name': '程菲菲',
            'email': '*****@*****.**'
        }])

    def _get_common_headers(self):
        """Return the HTTP headers used for every request of this task."""
        return {
            'User-Agent': USER_AGENT,
        }

    def _prepare(self, data=None):
        """Restore state, initialise the result and create the coordinator."""
        super()._prepare(data)
        self.result_data['baseInfo'] = {}
        # TODO: restore from state
        # TODO: restore from result
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_chrome_driver(self):
        """Create a Chrome-based web driver."""
        driver = new_driver(user_agent=USER_AGENT,
                            driver_type=DriverType.CHROME)
        return driver

    def _create_driver(self):
        """Create the default web driver with timeouts configured."""
        # Raw string: ``\/`` is not a valid Python escape sequence; the
        # runtime value is unchanged.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/cas\/captcha.jpg/g')

        # driver.get(url) has been seen to hang forever without raising;
        # a page-load timeout avoids blocking the program.
        driver.set_page_load_timeout(20)
        # Script timeout, 20 seconds.
        driver.set_script_timeout(20)

        # Visit any address on the same host so cookies can be set later.
        driver.get('http://gzlss.hrssgz.gov.cn/xxxx')
        return driver

    def _query(self, params: dict):
        """Answer a task query; ``t == 'vc'`` returns a new captcha image.

        Any other query type yields ``None``.
        """
        if params.get('t') == 'vc':
            return self._new_vc()
        return None

    def _new_vc(self):
        """Download a captcha image with the requests session."""
        resp = self.s.get(VC_URL)
        return dict(content=resp.content,
                    content_type=resp.headers['Content-Type'])

    def _params_handler(self, params: dict):
        """Fill missing credentials in ``params`` from the prepared meta.

        Nothing is filled when the task is starting without any params.
        """
        if not (self.is_start and not params):
            meta = self.prepared_meta
            for key in ('账号', '密码'):
                if key not in params:
                    params[key] = meta.get(key)
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop requirements whose value is already stored in the meta."""
        meta = self.prepared_meta
        # TODO: inspect ``details`` further
        return [
            pr for pr in param_requirements
            if not (pr['key'] in ('账号', '密码') and pr['key'] in meta)
        ]

    def _setup_task_units(self):
        """Register the login unit and the fetch unit that depends on it."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _check_login_params(self, params):
        """Validate the login parameters.

        Raises:
            AssertionError: when ``params`` is None or a key is missing
                (raised explicitly so it survives ``python -O``).
            InvalidParamsError: when the account or password is too short
                or empty.
        """
        if params is None:
            raise AssertionError('缺少参数')
        if '账号' not in params:
            raise AssertionError('缺少账号')
        if '密码' not in params:
            raise AssertionError('缺少密码')
        # The captcha is read unconditionally by ``_unit_login``; without
        # this check a missing value surfaced as an uncaught KeyError.
        if 'vc' not in params:
            raise AssertionError('缺少验证码')

        account = params['账号']
        password = params['密码']
        # ``_params_handler`` may have injected None from the meta.
        if password is None or len(password) < 4:
            raise InvalidParamsError('账号或密码错误')
        if account is None or len(account) < 15:
            raise InvalidParamsError('账号或密码错误')

    def _loadJs(self):
        """Experimental requests-only login attempt; always raises.

        Evaluates the site's RSA helper script through ``execjs`` and posts
        the login form without credentials, then raises
        ``InvalidParamsError`` carrying the response text.
        NOTE(review): no caller is visible in this class -- confirm whether
        it is still needed.
        """
        import execjs
        resps = self.s.get("http://gzlss.hrssgz.gov.cn/cas/login")
        # The modulus is cut out of the third <script> tag of the login page.
        modlus = BeautifulSoup(resps.content).findAll('script')[2].text.split(
            '=')[3].split(';')[0].replace('"', '')
        jsstrs = self.s.get(
            "http://gzlss.hrssgz.gov.cn/cas/third/jquery-1.5.2.min.js")
        jsstr = self.s.get("http://gzlss.hrssgz.gov.cn/cas/third/security.js")
        ctx = execjs.compile(jsstr.text + jsstrs.text)
        # The key pair is computed but not used by the request below.
        ctx.call("RSAUtils.getKeyPair", '010001', '', modlus)

        resp = self.s.get("http://gzlss.hrssgz.gov.cn/cas/login")
        lt = BeautifulSoup(resp.content, 'html.parser').find(
            'input', {'name': 'lt'})['value']
        datas = {
            'usertype': "2",
            'lt': lt,
            '_eventId': 'submit'
        }
        resps = self.s.post(
            "http://gzlss.hrssgz.gov.cn/cas/login?service=http://gzlss.hrssgz.gov.cn:80/gzlss_web/business/tomain/main.xhtml",
            datas)
        raise InvalidParamsError(resps.text)

    def _unit_login(self, params=None):
        """Log in with the given params or ask the caller for them.

        Returns normally on a successful login; otherwise raises
        ``AskForParamsError`` describing the required inputs together with
        the last validation/login error message (if any).
        """
        err_msg = None
        if params:
            try:
                self._check_login_params(params)
                id_num = params['账号']
                pass_word = params['密码']
                vc = params['vc']
                self._do_login(id_num, pass_word, vc)

                # Login succeeded: store the credentials in the meta.
                self.result_key = id_num
                self.result_meta['账号'] = id_num
                self.result_meta['密码'] = pass_word
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # ``params`` defaults to None; guard so the ``.get`` calls below
        # cannot raise AttributeError instead of asking for parameters.
        params = params or {}
        raise AskForParamsError([
            dict(key='账号', name='账号', cls='input',
                 value=params.get('账号', '')),
            dict(key='密码', name='密码', cls='input:password',
                 value=params.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image',
                 query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login flow with the web driver.

        Raises:
            InvalidParamsError: when the browser is still on the login URL
                after submitting the form.
        """
        with self.dsc.get_driver_ctx() as driver:
            # Open the login page.
            driver.get(LOGIN_URL)

            username_input = driver.find_element_by_xpath(
                '//*[@id="loginName"]')
            password_input = driver.find_element_by_xpath(
                '//*[@id="loginPassword"]')
            vc_input = driver.find_element_by_xpath('//*[@id="validateCode"]')
            user_type = driver.find_element_by_xpath('//*[@id="usertype2"]')

            # User name
            username_input.clear()
            username_input.send_keys(username)

            # Password
            password_input.clear()
            password_input.send_keys(password)

            # Captcha
            vc_input.clear()
            vc_input.send_keys(vc)

            user_type.click()

            # Submit
            driver.find_element_by_xpath('//*[@id="submitbt"]').click()

            if driver.current_url.startswith(
                    'http://gzlss.hrssgz.gov.cn/cas/login'):
                err_msg = '登录失败,请重新登录!'
                try:
                    err_msg = driver.find_element_by_xpath(
                        '//*[@id="*.errors"]').text
                except Exception:
                    # Best effort: keep the generic message when the error
                    # element cannot be read.
                    pass
                raise InvalidParamsError(err_msg)
            # Login succeeded.

    def _to_replace(self, con):
        """Return ``con`` with all CR, LF and TAB characters removed."""
        return con.replace('\r', '').replace('\n', '').replace('\t', '')

    @staticmethod
    def _first_number(text):
        """Return the first decimal number found in ``text`` as a string."""
        return re.findall(r"\d+\.?\d*", text)[0]

    @staticmethod
    def _month_keys(year, month, count):
        """Yield ``count`` consecutive 'YYYYMM' strings from year/month on."""
        for offset in range(-1, count - 1):
            index = int(month) + offset
            yield datetime.date(
                int(year) + index // 12, index % 12 + 1, 1).strftime('%Y%m')

    @staticmethod
    def _store_record(bucket, yyyymm, record):
        """Append ``record`` under ``bucket[year][month]``."""
        bucket.setdefault(yyyymm[0:4], {}).setdefault(
            yyyymm[4:6], []).append(record)

    def _unit_fetch(self):
        """Fetch base info and payment details and fill the task result.

        Raises:
            TaskNotAvailableError: when the info page or the medical query
                reports that data is unavailable.
            TaskNotImplementedError: when the medical table has no data or
                carries an alert message.
            PreconditionNotSatisfiedError: wrapping a ``PermissionError``.
        """
        try:
            # TODO: raise PermissionError when the session is not logged in
            nav = json.loads(self.s.get(User_BaseInfo).text)  # info menu
            res = self.s.get(
                "http://gzlss.hrssgz.gov.cn/gzlss_web" + nav[0]['url'])
            # Parse the page once instead of once per lookup.
            info_tables = BeautifulSoup(res.text, 'html.parser').findAll(
                'table', {'class': 'comitTable'})
            if len(info_tables) <= 0:
                raise TaskNotAvailableError("网络异常,请重新登录")
            redata = info_tables[0]  # table holding name, ID number, ...

            # Value used as suffix of the detail URLs below.
            userNum = BeautifulSoup(
                self.s.get(Search_URL).text,
                'html.parser').find('select', {
                    'id': 'aac001'
                }).text.replace('\n', '')
            sixian = BeautifulSoup(
                self.s.get(Sixian_URL + userNum).text,
                'html.parser').find('table').findAll(
                    "tr", {'class': 'table_white_data'})
            # The last three rows are not iterated as detail rows.
            detail_rows = sixian[:max(len(sixian) - 3, 0)]

            # ---- medical insurance ----
            permedicalTotal = 0.0
            paraURL = "&startStr=199001&endStr=" + time.strftime(
                '%Y%m', time.localtime()) + ""
            yiliao = BeautifulSoup(
                self.s.get(Yiliao_URL + userNum + paraURL).text,
                'html.parser')
            sidata = yiliao.find('table', {'id': 'tableDataList'})
            notice = sidata.find('script').text
            if "请明天再查" in notice:
                raise TaskNotAvailableError("您今天的缴费历史查询已经达到5次,请明天再查。")
            elif "找不到相关数据" in notice:
                raise TaskNotAvailableError("抱歉,找不到相关数据。")
            elif "非法操作" in notice:
                raise TaskNotAvailableError("非法操作,无法查询。")

            self.result_data['medical_care'] = {"data": {}}
            dataBaseH = self.result_data['medical_care']["data"]
            si_status = ""
            if 'alert' not in sidata.text:
                rows = sidata.findAll("tr")
                if len(rows) > 1:
                    # Payment status (first two characters only).
                    si_status = self._to_replace(
                        rows[1].findAll("td")[10].text)[0:2]
                    # Paying organisation.
                    si_com = self._to_replace(rows[2].findAll("td")[3].text)
                    yiliaoData = sidata.findAll("tr", {'temp': '职工社会医疗保险'})
                    for row in yiliaoData:
                        td = row.findAll("td")
                        permedicalTotal += float(
                            self._first_number(td[7].text))
                        period = self._to_replace(td[1].text)
                        rangNum = int(self._to_replace(td[3].text))
                        # One record per month; amounts are split evenly.
                        for nowtime in self._month_keys(
                                period[0:4], period[4:6], rangNum):
                            self._store_record(dataBaseH, nowtime, {
                                '缴费单位': si_com,
                                '缴费类型': si_status,
                                '缴费时间': nowtime,
                                '缴费基数': self._to_replace(td[9].text),
                                '政府资助': self._first_number(td[8].text),
                                '公司缴费': float(
                                    self._first_number(td[6].text)) / rangNum,
                                '个人缴费': float(
                                    self._first_number(td[7].text)) / rangNum
                            })
                else:
                    raise TaskNotImplementedError("未查询到数据!")
            else:
                errormsg2 = sidata.text.split('(')[1].split(')')[0]
                raise TaskNotImplementedError(errormsg2)

            # ---- old-age insurance ----
            self.result_data['old_age'] = {"data": {}}
            dataBaseE = self.result_data['old_age']["data"]
            peroldTotal = 0.0
            for row in detail_rows:
                td2 = row.findAll("td")
                # NOTE(review): the guard is taken to cover the whole row,
                # parallel to the sibling loops below -- confirm.
                if td2[5].text.strip() != '':
                    peroldTotal += float(td2[5].text)
                    rangNumE = int(td2[2].text)
                    for nowtime2 in self._month_keys(
                            td2[0].text[0:4], td2[0].text[4:6], rangNumE):
                        self._store_record(dataBaseE, nowtime2, {
                            '缴费单位': td2[11].text,
                            '缴费类型': td2[12].text,
                            '缴费时间': nowtime2,
                            '缴费基数': td2[3].text,
                            '公司缴费': float(td2[4].text) / rangNumE,
                            '个人缴费': float(td2[5].text) / rangNumE
                        })

            # ---- unemployment insurance ----
            self.result_data['unemployment'] = {"data": {}}
            dataBaseI = self.result_data['unemployment']["data"]
            for row in detail_rows:
                td3 = row.findAll("td")
                if td3[0].text.strip() != "":
                    rangNumI = int(td3[2].text)
                    for nowtime3 in self._month_keys(
                            td3[0].text[0:4], td3[0].text[4:6], rangNumI):
                        self._store_record(dataBaseI, nowtime3, {
                            '缴费单位': td3[11].text,
                            '缴费类型': td3[12].text,
                            '缴费时间': nowtime3,
                            '缴费基数': td3[3].text,
                            '公司缴费': float(td3[6].text) / rangNumI,
                            '个人缴费': float(td3[7].text) / rangNumI
                        })

            # ---- work-injury insurance ----
            self.result_data['injuries'] = {"data": {}}
            dataBaseC = self.result_data['injuries']["data"]
            for row in detail_rows:
                td4 = row.findAll("td")
                if td4[0].text.strip() != "":
                    rangNumC = int(td4[2].text)
                    for nowtime4 in self._month_keys(
                            td4[0].text[0:4], td4[0].text[4:6], rangNumC):
                        self._store_record(dataBaseC, nowtime4, {
                            '缴费单位': td4[11].text,
                            '缴费类型': td4[12].text,
                            '缴费时间': nowtime4,
                            '缴费基数': td4[3].text,
                            '公司缴费': float(td4[8].text) / rangNumC,
                            '个人缴费': ''
                        })

            # ---- maternity insurance ----
            self.result_data['maternity'] = {"data": {}}
            dataBaseB = self.result_data['maternity']["data"]
            for row in detail_rows:
                td5 = row.findAll("td")
                if td5[0].text.strip() != "":
                    rangNumB = int(td5[2].text)
                    for nowtime5 in self._month_keys(
                            td5[0].text[0:4], td5[0].text[4:6], rangNumB):
                        self._store_record(dataBaseB, nowtime5, {
                            '缴费单位': td5[11].text,
                            '缴费类型': td5[12].text,
                            '缴费时间': nowtime5,
                            '缴费基数': td5[3].text,
                            '公司缴费': float(td5[9].text) / rangNumB,
                            '个人缴费': ''
                        })

            # ---- serious-illness insurance ----
            dabingData = sidata.findAll("tr", {'temp': '重大疾病医疗补助'})
            self.result_data['serious_illness'] = {"data": {}}
            dataBaseQ = self.result_data['serious_illness']["data"]
            for row in dabingData:
                td6 = row.findAll("td")
                if td6[0].text.strip() != "":
                    # Read the period and month count from this row; the
                    # previous code reused the last medical-insurance row.
                    periodQ = self._to_replace(td6[1].text)
                    rangNumQ = int(self._to_replace(td6[3].text))
                    for nowtime6 in self._month_keys(
                            periodQ[0:4], periodQ[4:6], rangNumQ):
                        self._store_record(dataBaseQ, nowtime6, {
                            '缴费单位': si_com,
                            '缴费类型': si_status,
                            '缴费时间': nowtime6,
                            '缴费基数': self._to_replace(td6[9].text),
                            '政府资助': self._first_number(td6[8].text),
                            '公司缴费': float(
                                self._first_number(td6[6].text)) / rangNumQ,
                            '个人缴费': float(
                                self._first_number(td6[7].text)) / rangNumQ
                        })

            sixiantype = ""
            if len(sixian) >= 4:
                sixiantype = sixian[len(sixian) - 4].findAll("td")[12].text

            social_status = {
                '医疗': si_status,
                '养老': sixiantype,
                '失业': sixiantype,
                '工伤': sixiantype,
                '生育': sixiantype
            }

            # Payment duration, taken from the third-to-last row.
            summary_cells = sixian[len(sixian) - 3].findAll("td")
            rescount = [summary_cells[i].text for i in (1, 2, 3, 4)]
            # NOTE(review): this is a string comparison, so '9' > '10';
            # confirm whether a numeric maximum is intended.
            moneyCount = max(rescount)

            # Personal base information
            self.result_data['baseInfo'] = {
                '姓名': redata.find('input', {'id': 'aac003ss'})['value'],
                '身份证号': redata.find('input', {'id': 'aac002ss'})['value'],
                '更新时间': time.strftime("%Y-%m-%d", time.localtime()),
                '城市名称': '广州市',
                '城市编号': '440100',
                '缴费时长': moneyCount,
                '最近缴费时间': sixian[len(sixian) - 4].findAll("td")[1].text,
                '开始缴费时间': sixian[0].findAll("td")[0].text,
                '个人养老累计缴费': peroldTotal,
                '个人医疗累计缴费': permedicalTotal,
                '五险状态': social_status,
                '账户状态': social_status['养老'],
                '个人编号': redata.find('input', {'id': 'aac001'})['value'],
            }

            # Identity information
            self.result_identity.update({
                "task_name": "广州",
                "target_name":
                redata.find('input', {'id': 'aac003ss'})['value'],
                "target_id": self.result_meta['账号'],
                "status": social_status['养老']
            })
            return
        except PermissionError as e:
            raise PreconditionNotSatisfiedError(e) from e
class Task(AbsFetchTask):
    """Shanghai social-security task (www.12333sh.gov.cn).

    Logs in through a web driver and then reads the payment details from
    the XML islands of the query page with ``self.s``.
    """

    task_info = dict(
        city_name="上海",
        help="""<li>用户名:为参保人身份证号</li>
        <li>密码:一般为6位数字;</li>
        <li>首次申请密码或遗忘网上登录密码,本人需携带有效身份证件至就近接到社区事务受理中心或就近社保分中心自助机申请办理。</li>
        """,
        developers=[{
            'name': '程菲菲',
            'email': '*****@*****.**'
        }])

    def _get_common_headers(self):
        """Return the HTTP headers used for every request of this task."""
        return {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'www.12333sh.gov.cn',
        }

    def _prepare(self, data=None):
        """Run the base preparation, pick a proxy and create the coordinator."""
        super()._prepare(data)
        self.proxy = get_proxy_ip()
        self.result_data['baseInfo'] = {}
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create a proxied PhantomJS session with timeouts configured."""
        # Raw string: ``\w`` and ``\/`` are not valid Python escapes; the
        # runtime value is unchanged.
        # NOTE(review): ``sbsjb\wzb`` looks like a typo for ``sbsjb\/wzb``
        # (the pattern cannot match a ``/sbsjb/wzb/`` path) -- confirm.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/sbsjb\wzb\/Bmblist12.jpg/g')
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        # NOTE(review): this asks for a fresh proxy instead of reusing
        # ``self.proxy`` used by the requests calls -- confirm intent.
        proxy.http_proxy = get_proxy_ip()
        proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)

        # driver.get(url) has been seen to hang forever without raising;
        # a page-load timeout avoids blocking the program.
        driver.set_page_load_timeout(30)
        # Script timeout, 30 seconds.
        driver.set_script_timeout(30)

        # Visit any address on the same host so cookies can be set later.
        # (A stray leading double quote made the previous URL invalid.)
        driver.get('http://www.12333sh.gov.cn/xxxx')
        return driver

    def _query(self, params: dict):
        """Answer a task query; ``t == 'vc'`` returns a new captcha image.

        Any other query type yields ``None``.
        """
        if params.get('t') == 'vc':
            return self._new_vc()
        return None

    def _new_vc(self):
        """Download a captcha image through the proxy."""
        proxies = {"http": "http://" + self.proxy}
        # The login page is requested first and its response discarded.
        # NOTE(review): presumably this primes the session cookies -- confirm.
        self.s.get("http://www.12333sh.gov.cn/sbsjb/wzb/229.jsp",
                   timeout=10, proxies=proxies)
        resp = self.s.get(VC_URL, timeout=10, proxies=proxies)
        return dict(content=resp.content,
                    content_type=resp.headers['Content-Type'])

    def _setup_task_units(self):
        """Register the login unit and the fetch unit that depends on it."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _check_login_params(self, params):
        """Validate the login parameters.

        Raises:
            AssertionError: when ``params`` is None or a key is missing
                (raised explicitly so it survives ``python -O``).
            InvalidParamsError: when user name or password is empty or too
                short.
        """
        if params is None:
            raise AssertionError('缺少参数')
        if '用户名' not in params:
            raise AssertionError('缺少用户名')
        if '密码' not in params:
            raise AssertionError('缺少密码')
        # The captcha is typed into the form unconditionally; a missing
        # value previously reached the web driver as None.
        if not params.get('vc'):
            raise AssertionError('缺少验证码')

        username = params['用户名']
        password = params['密码']

        # ``_params_handler`` may have injected None from the meta, so
        # None is treated like an empty value.
        if not username:
            raise InvalidParamsError('用户名为空,请输入用户名')
        elif len(username) < 15:
            raise InvalidParamsError('用户名不正确,请重新输入')

        if not password:
            raise InvalidParamsError('密码为空,请输入密码!')
        elif len(password) < 6:
            raise InvalidParamsError('密码不正确,请重新输入!')

    def _params_handler(self, params: dict):
        """Fill missing credentials in ``params`` from the prepared meta.

        Nothing is filled when the task is starting without any params.
        """
        if not (self.is_start and not params):
            meta = self.prepared_meta
            for key in ('用户名', '密码'):
                if key not in params:
                    params[key] = meta.get(key)
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop requirements whose value is already stored in the meta."""
        meta = self.prepared_meta
        # TODO: inspect ``details`` further
        return [
            pr for pr in param_requirements
            if not (pr['key'] in ('用户名', '密码') and pr['key'] in meta)
        ]

    def _unit_login(self, params: dict):
        """Log in with the given params or ask the caller for them.

        Returns normally on a successful login; otherwise raises
        ``AskForParamsError`` describing the required inputs together with
        the last validation/login error message (if any).
        """
        err_msg = None
        if params:
            try:
                self._check_login_params(params)
                id_num = params.get("用户名")
                account_pass = params.get("密码")
                vc = params.get("vc")
                self._do_login(id_num, account_pass, vc)

                # Set the result key and store the credentials in the meta.
                self.result_key = params.get('用户名')
                self.result_meta['用户名'] = params.get('用户名')
                self.result_meta['密码'] = params.get('密码')
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # ``params`` may be None; guard so the ``.get`` calls below cannot
        # raise AttributeError instead of asking for parameters.
        params = params or {}
        raise AskForParamsError([
            dict(key='用户名', name='用户名', cls='input',
                 placeholder='请输入身份证号',
                 value=params.get('用户名', '')),
            dict(key='密码', name='密码', cls='input:password',
                 value=params.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image',
                 query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login flow with the web driver.

        Raises:
            InvalidParamsError: when the browser does not end up on the
                expected post-login URL.
        """
        with self.dsc.get_driver_ctx() as driver:
            # Open the login page.
            driver.get(LOGIN_URL)
            time.sleep(10)
            driver.get("http://www.12333sh.gov.cn/sbsjb/wzb/229.jsp")

            username_input = driver.find_element_by_xpath('//*[@id="userid"]')
            password_input = driver.find_element_by_xpath('//*[@id="userpw"]')
            vc_input = driver.find_element_by_xpath('//*[@id="userjym"]')

            # User name
            username_input.clear()
            username_input.send_keys(username)

            # Password
            password_input.clear()
            password_input.send_keys(password)

            # Captcha
            vc_input.clear()
            vc_input.send_keys(vc)

            # Submit through the page's own script function.
            driver.execute_script('checkForm()')
            time.sleep(10)

            if driver.current_url != "http://www.12333sh.gov.cn/sbsjb/wzb/helpinfo.jsp?id=0":
                raise InvalidParamsError('登录失败,请重新登录!')

    def _collect_details(self, details, dt, personal_tag):
        """Group the detail entries by year and month.

        Args:
            details: the ``jsjs`` elements of the detail XML island.
            dt: the ``jfdwinfo`` elements used to look up the employer.
            personal_tag: tag name holding the personal payment amount.

        Returns:
            ``{year: {month: [record, ...]}}``.
        """
        grouped = {}
        for item in details:
            period = item.find('jsjs1').text
            grouped.setdefault(period[0:4], {}).setdefault(
                period[4:6], []).append({
                    '缴费时间': period,
                    '缴费单位': self._match_commapy(period, dt),
                    '缴费基数': item.find('jsjs3').text,
                    '缴费类型': '',
                    '公司缴费': '',
                    '个人缴费': item.find(personal_tag).text,
                })
        return grouped

    def _unit_fetch(self):
        """Fetch the payment details and fill the task result.

        Raises:
            PreconditionNotSatisfiedError: wrapping an
                ``InvalidConditionError`` raised while fetching.
        """
        try:
            # TODO: raise PermissionError when the session is not logged in
            resp = self.s.get(
                "http://www.12333sh.gov.cn/sbsjb/wzb/sbsjbcx12.jsp",
                proxies={"http": "http://" + self.proxy},
                timeout=30)
            soup = BeautifulSoup(resp.content, 'html.parser')
            details = soup.find('xml', {
                'id': 'dataisxxb_sum2'
            }).findAll("jsjs")
            summary = soup.find('xml', {'id': 'dataisxxb_sum4'})

            # Payment duration: summary value, else number of detail rows.
            duration = summary.find('jsjs2')
            moneyTime = duration.text if duration is not None else len(details)

            dt = soup.findAll("jfdwinfo")

            # Old-age, medical and unemployment share the same rows and
            # differ only in the tag that holds the personal amount.
            self.result_data['old_age'] = {
                "data": self._collect_details(details, dt, 'jsjs4')}
            personmoney = sum(
                (float(item.find('jsjs4').text) for item in details), 0.0)
            self.result_data['medical_care'] = {
                "data": self._collect_details(details, dt, 'jsjs6')}
            self.result_data['unemployment'] = {
                "data": self._collect_details(details, dt, 'jsjs8')}

            # Work injury, maternity and serious illness are left empty.
            self.result_data['injuries'] = {"data": {}}
            self.result_data['maternity'] = {"data": {}}
            self.result_data["serious_illness"] = {"data": {}}

            self.result_identity.update({
                "task_name": "上海",
                "target_name": soup.find('xm').text,
                "target_id": self.result_meta['用户名'],
                "status": ""
            })

            # Accumulated personal old-age payment: summary value, else
            # the sum computed from the detail rows.
            total = summary.find('jsjs3')
            personOldMoney = total.text if total is not None else personmoney

            startTime = ""
            recentTime = ""
            if len(details) != 0:
                startTime = details[0].find('jsjs1').text
                recentTime = details[len(details) - 1].find('jsjs1').text

            self.result['data']['baseInfo'] = {
                '姓名': soup.find('xm').text,
                '身份证号': self.result_meta['用户名'],
                '更新时间': time.strftime("%Y-%m-%d", time.localtime()),
                '城市名称': '上海市',
                '城市编号': '310100',
                '缴费时长': moneyTime,
                '最近缴费时间': recentTime,
                '开始缴费时间': startTime,
                '个人养老累计缴费': personOldMoney,
                '个人医疗累计缴费': '',
                '账户状态': ''
            }
            return
        except InvalidConditionError as e:
            raise PreconditionNotSatisfiedError(e) from e

    def _match_money(self, dtime1, dtime2, fmoney):
        """Return ``fmoney`` when both times are equal, else ``""``."""
        if dtime1 == dtime2:
            return fmoney
        return ""

    def _match_commapy(self, dtime, dt):
        """Return the employer whose payment range contains ``dtime``.

        Each entry's ``jfsj`` text is split on ``-`` into a start and an end
        that are compared to ``dtime`` as strings; when several entries
        match, the last one wins. Returns ``""`` when nothing matches.
        """
        rescom = ""
        if dt is not None:
            for entry in dt:
                trd = entry.find('jfsj').text.split('-')
                if trd[0] <= dtime <= trd[1]:
                    rescom = entry.find('jfdw').text
        return rescom
class Task(AbsFetchTask):
    """Fetch task for the Chengdu social-insurance site (gr.cdhrss.gov.cn:442).

    Login is driven through a web driver (user name, password, captcha);
    the payment details are then fetched as JSON with the shared requests
    session ``self.s`` and stored in ``self.result``.
    """

    task_info = dict(
        city_name="成都",
        help=""" <li>联名卡有两个密码,一个是银行查询密码,一个是公积金查询服务密码</li> <li>如若查询服务密码,可拨打服务热线12329修改</li> """,
        developers=[{
            'name': '程菲菲',
            'email': '*****@*****.**'
        }])

    def _get_common_headers(self):
        """Return the headers sent with every session request."""
        return {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, br',
            'Host': 'gr.cdhrss.gov.cn:442',
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6'
        }

    def _prepare(self, data=None):
        """Run the base preparation, reset ``baseInfo`` and attach the
        driver/session coordinator."""
        super()._prepare(data)
        self.result_data['baseInfo'] = {}
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create the web driver used for the login step."""
        # Raw string: '\/' is an invalid escape sequence in a normal literal.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/cdwsjb\/CaptchaImg.png/g')
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.DIRECT
        proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        # driver.get(url) was seen to hang forever without raising; the
        # timeouts below (13 seconds each) prevent the task from stalling.
        driver.set_page_load_timeout(13)
        driver.set_script_timeout(13)
        # Visit any URL on the target host so cookies can be set afterwards.
        driver.get('https://gr.cdhrss.gov.cn:442/xxxx')
        return driver

    def _query(self, params: dict):
        """Answer a task-state query; ``t == 'vc'`` returns a new captcha."""
        if params.get('t') == 'vc':
            return self._new_vc()

    def _new_vc(self):
        """Download a captcha image with the shared session."""
        resp = self.s.get(VC_URL)
        # .get() instead of [] so a missing header does not raise KeyError.
        return dict(cls='data:image',
                    content=resp.content,
                    content_type=resp.headers.get('Content-Type'))

    def _setup_task_units(self):
        """Register the task units: login first, then fetch."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _check_login_params(self, params):
        """Validate the login form.

        Raises:
            InvalidParamsError: when a field is missing, empty or too short.
                Explicit raises are used instead of ``assert`` so the checks
                survive ``python -O``; ``None`` values (which
                ``_params_handler`` may fill in from meta) count as missing.
        """
        if params is None:
            raise InvalidParamsError('缺少参数')
        required = (('用户名', '缺少用户名'), ('密码', '缺少密码'),
                    ('vc', '缺少验证码'))
        for key, message in required:
            if params.get(key) is None:
                raise InvalidParamsError(message)

        username = params['用户名']
        password = params['密码']
        if len(username) == 0:
            raise InvalidParamsError('用户名为空,请输入用户名')
        elif len(username) < 4:
            raise InvalidParamsError('用户名不正确,请重新输入')
        if len(password) == 0:
            raise InvalidParamsError('密码为空,请输入密码!')
        elif len(password) < 6:
            raise InvalidParamsError('密码不正确,请重新输入!')

    def _params_handler(self, params: dict):
        """Back-fill user name and password from the prepared meta."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            for key in ('用户名', '密码'):
                if key not in params:
                    params[key] = meta.get(key)
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop the requirements whose value is already present in meta."""
        meta = self.prepared_meta
        # TODO: inspect ``details`` as well.
        return [
            pr for pr in param_requirements
            if not (pr['key'] in ('用户名', '密码') and pr['key'] in meta)
        ]

    def _unit_login(self, params=None):
        """Log in with the supplied form, or ask the caller for the form.

        Raises:
            AskForParamsError: when no params were given or login failed;
                carries the form description and the last error message.
        """
        err_msg = None
        # ``params`` may be None on the first call; keep the form echo safe.
        form = params or {}
        if params:
            try:
                self._check_login_params(params)
                id_num = params.get("用户名")
                account_pass = params.get("密码")
                vc = params.get("vc")
                self._do_login(id_num, account_pass, vc)

                # Remember the credentials in meta.
                self.result_key = id_num
                self.result_meta['用户名'] = id_num
                self.result_meta['密码'] = account_pass
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        raise AskForParamsError([
            dict(key='用户名',
                 name='用户名',
                 cls='input',
                 placeholder='请输入登录号|社会保障号|社保卡号',
                 value=form.get('用户名', '')),
            dict(key='密码',
                 name='密码',
                 cls='input:password',
                 value=form.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image', query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login with the web driver.

        Raises:
            InvalidParamsError: when the browser is still on the login page
                five seconds after submitting the form.
        """
        with self.dsc.get_driver_ctx() as driver:
            # Open the login page.
            driver.get("https://gr.cdhrss.gov.cn:442/cdwsjb/login.jsp")

            username_input = driver.find_element_by_xpath(
                '//*[@id="c_username"]')
            # The password box is made visible via script before it is used.
            js = 'document.getElementById("c_password").style.display="inline-block";'
            driver.execute_script(js)
            password_input = driver.find_element_by_xpath(
                '//*[@id="c_password"]')
            vc_input = driver.find_element_by_xpath('//*[@id="checkCode"]')

            for element, value in ((username_input, username),
                                   (password_input, password),
                                   (vc_input, vc)):
                element.clear()
                element.send_keys(value)

            # Submit.
            driver.find_element_by_xpath(
                '//*[@id="loginbox"]/div[5]/div[1]/input').click()
            time.sleep(5)
            if driver.current_url.startswith(
                    'https://gr.cdhrss.gov.cn:442/cdwsjb/login.jsp'):
                raise InvalidParamsError('登录失败,请重新登录!')

    def _convert_type(self, num):
        """Map an insurance state code to text: "1" -> 正常, else 停缴."""
        return "正常" if num == "1" else "停缴"

    def _fetch_payment_detail(self, section, insurance_code, start_month,
                              end_month, personal_field=None,
                              sum_personal=False):
        """Fetch one insurance type's payment records into the result.

        Args:
            section: result key, e.g. ``"old_age"``.
            insurance_code: value sent as ``dto['aae140']``.
            start_month, end_month: ``YYYYMM`` bounds of the query.
            personal_field: record key holding the personal payment, or
                ``None`` to store an empty string.
            sum_personal: whether to accumulate the personal payments.

        Returns:
            ``(records, personal_total)``; ``records`` is empty when the
            response carries no ``lists`` entry.
        """
        self.result['data'][section] = {"data": {}}
        by_month = self.result['data'][section]["data"]

        resp = self.s.get(Detail_URL + "?dto['aae041']=" + start_month +
                          "&dto['aae042']=" + end_month +
                          "&dto['aae140_md5list']=&dto['aae140']=" +
                          insurance_code)
        payload = json.loads(resp.text)  # parsed once, reused below
        if 'lists' not in payload:
            return [], 0.0

        records = payload['lists']['dg_payment']['list']
        personal_total = 0.0
        for record in records:
            period = str(record['aae002'])
            model = {
                '缴费单位': record['aab004'],
                '缴费时间': record['aae002'],
                '缴费类型': '',
                '缴费基数': record['yac004'],
                '公司缴费': record['dwjfje'],
                '个人缴费': record[personal_field] if personal_field else '',
            }
            if sum_personal:
                personal_total += float(record[personal_field])
            by_month.setdefault(period[0:4], {}).setdefault(
                period[4:6], []).append(model)
        return records, personal_total

    def _unit_fetch(self):
        """Fetch personal info, payment details and insurance states.

        Raises:
            TaskNotImplementedError: when the personal-info request does not
                return HTTP 200.
            PreconditionNotSatisfiedError: wraps an InvalidConditionError
                raised while fetching.
        """
        try:
            # TODO: raise PermissionError when the session is not logged in.
            res = self.s.get(
                "https://gr.cdhrss.gov.cn:442/cdwsjb/personal/personalHomeAction!query.do"
            )
            if res.status_code != 200:
                raise TaskNotImplementedError("网络错误,请稍后再试!")
            person = json.loads(res.text)["fieldData"]

            query_start = "199001"
            query_end = time.strftime("%Y%m", time.localtime())

            old_age, old_age_total = self._fetch_payment_detail(
                "old_age", "110", query_start, query_end,
                personal_field='grjfje', sum_personal=True)
            medical, medical_total = self._fetch_payment_detail(
                "medical_care", "310", query_start, query_end,
                personal_field='hrzhje', sum_personal=True)
            unemployment, _ = self._fetch_payment_detail(
                "unemployment", "210", query_start, query_end,
                personal_field='grjfje')
            injuries, _ = self._fetch_payment_detail(
                "injuries", "410", query_start, query_end)
            maternity, _ = self._fetch_payment_detail(
                "maternity", "510", query_start, query_end)
            self._fetch_payment_detail(
                "serious_illness", "330", query_start, query_end)

            # Insurance states, embedded as a script fragment in the page.
            status_page = self.s.get(
                "https://gr.cdhrss.gov.cn:442/cdwsjb/personal/query/queryCZInsuranceInfoAction.do"
            )
            info_div = BeautifulSoup(status_page.text, 'html.parser').find(
                'div', {'id': 'SeInfo'})
            # NOTE(review): takes the 41st 'data'-separated chunk of the div
            # text — tied to the exact page layout; confirm it is still valid.
            insurance_list = json.loads(
                info_div.text.split('data')[40].split(';')[0].replace(
                    '=', ''))['list']

            state_by_code = dict.fromkeys(
                ("110", "310", "210", "410", "510", "330"), "0")
            for item in insurance_list:
                code = item['aae140']
                if code in state_by_code:
                    state_by_code[code] = item['aac031']
            social_type = {
                '养老': self._convert_type(state_by_code["110"]),
                '医疗': self._convert_type(state_by_code["310"]),
                '大病': self._convert_type(state_by_code["330"]),
                '失业': self._convert_type(state_by_code["210"]),
                '工伤': self._convert_type(state_by_code["410"]),
                '生育': self._convert_type(state_by_code["510"])
            }

            status = "正常" if person['aac031'] == "参保缴费" else "异常"

            # Payment duration: the longest record list, minus one as in the
            # original code (reason not visible here — TODO confirm), but
            # never negative when there are no records at all.
            month_count = max(0, max(
                len(records) - 1
                for records in (old_age, medical, unemployment, injuries,
                                maternity)))

            recent_time = ""
            first_time = ""
            if len(old_age) > 0:
                recent_time = old_age[0]['aae002']
                first_time = old_age[-1]['aae002']

            self.result_data['baseInfo'] = {
                '姓名': person['aac003'],
                '身份证号': person['aac002'],
                '更新时间': time.strftime("%Y-%m-%d", time.localtime()),
                '城市名称': '成都',
                '城市编号': '510100',
                '缴费时长': month_count,
                '最近缴费时间': recent_time,
                '开始缴费时间': first_time,
                '个人养老累计缴费': old_age_total,
                '个人医疗累计缴费': medical_total,
                '五险状态': social_type,
                '账户状态': status,
                '个人编号': person['aac001'],
            }
            self.result['identity'] = {
                "task_name": "成都",
                "target_name": person['aac003'],
                "target_id": self.result['meta']["用户名"],
                "status": status
            }
        except InvalidConditionError as e:
            raise PreconditionNotSatisfiedError(e)
class Task(AbsFetchTask):
    """Selenium-login test task against gzlss.hrssgz.gov.cn.

    Logs in through a web driver (account, password, captcha) and then reads
    the user's name from the main page with the shared requests session.
    """

    task_info = {
        'task_name': '测试selenium登录[fast]',
        'help': '测试selenium登录[fast]'
    }

    def _get_common_headers(self):
        """Return the headers sent with every session request."""
        return {'User-Agent': USER_AGENT}

    def _prepare(self, data=None):
        """Run the base preparation and attach the driver/session coordinator."""
        super()._prepare(data)
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_chrome_driver(self):
        """Create a Chrome driver (alternative to ``_create_driver``)."""
        return new_driver(user_agent=USER_AGENT,
                          driver_type=DriverType.CHROME)

    def _create_driver(self):
        """Create the default web driver used for the login step."""
        # Raw string: '\/' is an invalid escape sequence in a normal literal.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/cas\/captcha.jpg/g')
        # Visit any URL on the target host so cookies can be set afterwards.
        driver.get('http://gzlss.hrssgz.gov.cn/xxxx')
        return driver

    def _setup_task_units(self):
        """Register the task units: login first, then fetch."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _query(self, params: dict):
        """Answer a task-state query; ``t == 'vc'`` returns a new captcha."""
        if params.get('t') == 'vc':
            return self._new_vc()

    def _new_vc(self):
        """Download a captcha image with the shared session."""
        resp = self.s.get(VC_IMAGE_URL + str(random.random()))
        return dict(cls='data:image',
                    content=resp.content,
                    content_type=resp.headers.get('Content-Type'))

    def _params_handler(self, params: dict):
        """Back-fill account and password from the prepared meta."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            for key in ('账号', '密码'):
                if key not in params:
                    params[key] = meta.get(key)
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop the requirements whose value is already present in meta."""
        meta = self.prepared_meta
        # TODO: inspect ``details`` as well.
        return [
            pr for pr in param_requirements
            if not (pr['key'] in ('账号', '密码') and pr['key'] in meta)
        ]

    def _check_login_params(self, params):
        """Validate the login form.

        Raises:
            InvalidParamsError: when a field is missing or too short.
                Explicit raises are used instead of ``assert`` so the checks
                survive ``python -O``; ``None`` values (which
                ``_params_handler`` may fill in from meta) count as missing,
                and the captcha is checked here so a missing ``vc`` no longer
                surfaces as an uncaught KeyError.
        """
        if params is None:
            raise InvalidParamsError('缺少参数')
        required = (('账号', '缺少账号'), ('密码', '缺少密码'),
                    ('vc', '缺少验证码'))
        for key, message in required:
            if params.get(key) is None:
                raise InvalidParamsError(message)

        if len(params['密码']) < 4:
            raise InvalidParamsError('账号或密码错误')
        if len(params['账号']) < 4:
            raise InvalidParamsError('账号或密码错误')

    def _unit_login(self, params=None):
        """Log in with the supplied form, or ask the caller for the form.

        Raises:
            AskForParamsError: when no params were given or login failed;
                carries the form description and the last error message.
        """
        err_msg = None
        # ``params`` may be None on the first call; keep the form echo safe.
        form = params or {}
        if params:
            try:
                self._check_login_params(params)
                username = params['账号']
                password = params['密码']
                vc = params['vc']
                self._do_login(username, password, vc)

                # Login succeeded.
                self.result_key = username
                self.result_meta.update({'账号': username, '密码': password})
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        raise AskForParamsError([
            dict(key='账号', name='账号', cls='input',
                 value=form.get('账号', '')),
            dict(key='密码',
                 name='密码',
                 cls='input:password',
                 value=form.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image', query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login with the web driver.

        Raises:
            InvalidParamsError: when the browser is still on the CAS login
                URL after submitting; the message is the page's error text
                when it can be read, otherwise a generic one.
        """
        with self.dsc.get_driver_ctx() as driver:
            # Open the login page.
            driver.get(LOGIN_PAGE_URL)

            # FIXME: debug output
            for entry in driver.get_log('browser'):
                print(entry)

            username_input = driver.find_element_by_xpath(
                '//*[@id="loginName"]')
            password_input = driver.find_element_by_xpath(
                '//*[@id="loginPassword"]')
            vc_input = driver.find_element_by_xpath('//*[@id="validateCode"]')
            user_type_input = driver.find_element_by_xpath(
                '//*[@id="usertype2"]')

            for element, value in ((username_input, username),
                                   (password_input, password),
                                   (vc_input, vc)):
                element.clear()
                element.send_keys(value)

            # Select the user type, then submit.
            user_type_input.click()
            driver.find_element_by_xpath('//*[@id="submitbt"]').click()

            # FIXME: debug output
            for entry in driver.get_log('browser'):
                print(entry)

            if driver.current_url.startswith(
                    'http://gzlss.hrssgz.gov.cn/cas/login'):
                err_msg = '登录失败,请检查输入'
                try:
                    err_msg = driver.find_element_by_xpath(
                        '//*[@id="*.errors"]').text or err_msg
                except Exception:
                    # Best effort only: keep the generic message. (The
                    # original ``finally: raise`` also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    pass
                raise InvalidParamsError(err_msg)

    def _unit_fetch(self):
        """Read the user's name from the main page into the result.

        Raises:
            PreconditionNotSatisfiedError: when the name element is not on
                the page or another InvalidConditionError is raised.
        """
        try:
            resp = self.s.get(
                'http://gzlss.hrssgz.gov.cn/gzlss_web/business/tomain/main.xhtml'
            )
            tree = etree.HTML(resp.text)
            target = tree.xpath('/html/body/div[1]/div[3]/span/font[1]')
            if not target:
                # Previously an IndexError; presumably the session is not
                # logged in when the element is absent — TODO confirm.
                raise InvalidConditionError('未找到用户信息,可能尚未登录')
            self.result_data.update({'姓名': target[0].text})
            self.result_identity.update(
                {'task_name': self.task_info['task_name']})
        except InvalidConditionError as e:
            raise PreconditionNotSatisfiedError(e)
class Task(AbsFetchTask):
    """Fetch task for the Shenzhen social-insurance site (seyb.szsi.gov.cn).

    Login is performed with plain HTTP requests: the password is encrypted
    with the site's own ``encrypt.js`` and posted together with the captcha.
    Every later AJAX call sends a ``Token`` header and reads the next token
    from the ``Token`` response cookie.

    NOTE(review): the code alternates between ``self.s.Token`` and
    ``self.g.Token`` in a way that sometimes sends an older token. That
    hand-over is preserved exactly as found; confirm against the live site
    before simplifying it.
    """

    task_info = dict(
        city_name="深圳",
        help="""<li>若您尚未激活或者没有在网上查询过您的社保卡,请点击激活社保账号</li> <li>如果您曾经激活过社保卡,但忘记密码,请点击忘记密码</li> <li>如办理社保卡时,没有登记手机号码或者更换手机号码,请本人携带身份证原件和新手机到社保分中心柜台办理注册手机变更业务。</li> """,
        developers=[{
            'name': '卜圆圆',
            'email': '*****@*****.**'
        }])

    # Cookie domain under which the site stores 'skey' and 'Token'.
    _COOKIE_DOMAIN = 'seyb.szsi.gov.cn'

    # Headers common to all AJAX posts. The original literals contained
    # stray spaces ('keep - alive', 'application / json, ...'), which are not
    # valid header values; they are written correctly here.
    _AJAX_HEADERS = {
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/plain, */*',
        'Connection': 'keep-alive',
    }

    # Extra headers sent with the detail queries.
    _ORIGIN_HEADERS = {
        'Host': 'seyb.szsi.gov.cn',
        'Origin': 'https://seyb.szsi.gov.cn',
        'Referer': 'https://seyb.szsi.gov.cn/web/ggfw/app/index.html',
    }

    # (action suffix, result section, dataset name) per insurance type.
    # An explicit table replaces a dict iterated in parallel with a list.
    _DETAIL_KINDS = (
        ('Yl', 'old_age', 'ncm_glt_养老缴费明细'),
        ('Shiye', 'unemployment', 'ncm_glt_失业缴费明细'),
        ('Yil', 'medical_care', 'ncm_glt_医疗缴费明细'),
        ('Gs', 'injuries', 'ncm_glt_工伤缴费明细'),
        ('Sy', 'maternity', 'ncm_glt_生育缴费明细'),
    )

    def _get_common_headers(self):
        """Return the headers sent with every session request."""
        return {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'
        }

    def _query(self, params: dict):
        """Answer a task-state query; ``t == 'vc'`` returns a new captcha."""
        if params.get('t') == 'vc':
            return self._new_vc()

    def _prepare(self, data=None):
        """Run the base preparation and attach the driver/session coordinator."""
        super()._prepare(data)
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create the web driver used by ``_do_login``."""
        # Raw string: '\/' is an invalid escape sequence in a normal literal.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/web\/ImageCheck.jpg/g')
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.DIRECT
        proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        # driver.get(url) was seen to hang forever without raising; the
        # timeouts below (20 seconds each) prevent the task from stalling.
        driver.set_page_load_timeout(20)
        driver.set_script_timeout(20)
        driver.get(LOGIN_PAGE_URL)
        return driver

    def _create_chrome_driver(self):
        """Create a Chrome driver (alternative to ``_create_driver``)."""
        return new_driver(user_agent=USER_AGENT,
                          driver_type=DriverType.CHROME)

    def _setup_task_units(self):
        """Register the task units; both fetch units depend on login."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch_userinfo, self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _check_login_params(self, params):
        """Validate the login form.

        Raises:
            InvalidParamsError: when a field is missing, empty or too short.
                Explicit raises are used instead of ``assert`` so the checks
                survive ``python -O``; ``None`` values (which
                ``_params_handler`` may fill in from meta) count as missing,
                and the captcha is checked here so a missing ``vc`` no longer
                surfaces as an uncaught KeyError.
        """
        if params is None:
            raise InvalidParamsError('缺少参数')
        required = (('用户名', '缺少用户名'), ('密码', '缺少密码'),
                    ('vc', '缺少验证码'))
        for key, message in required:
            if params.get(key) is None:
                raise InvalidParamsError(message)

        username = params['用户名']
        password = params['密码']
        if len(username) == 0:
            raise InvalidParamsError('用户名为空,请输入用户名')
        elif len(username) < 5:
            raise InvalidParamsError('用户名不正确,请重新输入')
        if len(password) == 0:
            raise InvalidParamsError('密码为空,请输入密码!')
        elif len(password) < 6:
            raise InvalidParamsError('密码不正确,请重新输入!')

    def _params_handler(self, params: dict):
        """Back-fill user name and password from the prepared meta."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            for key in ('用户名', '密码'):
                if key not in params:
                    params[key] = meta.get(key)
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop requirements already satisfied by meta, and the 'other' one."""
        meta = self.prepared_meta
        res = []
        for pr in param_requirements:
            # TODO: inspect ``details`` as well.
            key = pr['key']
            if key in ('用户名', '密码') and key in meta:
                continue
            if key == 'other':
                continue
            res.append(pr)
        return res

    def get_js(self):
        """Return the text of the local ``ceshi.js`` file.

        The file is opened in a ``with`` block so the handle is closed
        (the original left it open).
        """
        with open("ceshi.js", 'r', encoding='UTF-8') as f:
            return f.read()

    @staticmethod
    def _response_token(resp):
        """Return the ``Token`` cookie value set by ``resp``.

        Reads the private ``_cookies`` mapping of the cookie jar, as the
        original code did; raises KeyError when the cookie is absent.
        """
        return resp.cookies._cookies[Task._COOKIE_DOMAIN]['/']['Token'].value

    def _post_action(self, action_params, token, extra_headers=None):
        """POST one AJAX action to ``USERINFO_URL`` and return the response.

        Args:
            action_params: JSON text sent as the ``params`` form field.
            token: value of the ``Token`` request header.
            extra_headers: optional headers added to the common AJAX ones.
        """
        headers = dict(self._AJAX_HEADERS, Token=token)
        if extra_headers:
            headers.update(extra_headers)
        return self.s.post(USERINFO_URL + '?r=' + str(random.random()),
                           data=dict(_isModel='true', params=action_params),
                           headers=headers,
                           timeout=15)

    def _fetch_skey(self):
        """Return the ``skey`` cookie set by the index page.

        Tries up to four times, one second apart.

        Raises:
            InvalidParamsError: when no cookie for the site domain was set.
        """
        for _ in range(4):
            resp = self.s.get(
                'https://seyb.szsi.gov.cn/web/ggfw/app/index.html',
                timeout=30)
            site_cookies = resp.cookies._cookies.get(self._COOKIE_DOMAIN)
            if site_cookies:
                return site_cookies['/web/ggfw/app']['skey'].value
            time.sleep(1)
        raise InvalidParamsError('网络异常,请重新刷新!')

    def _encrypt_password(self, skey, password):
        """Encrypt ``password`` with the site's ``encrypt.js`` and hex-encode it.

        NOTE(review): this executes JavaScript downloaded from the remote
        site on the local machine via ``execjs``.
        """
        script = self.s.get(
            "https://seyb.szsi.gov.cn/web/js/comm/fw/encrypt.js", timeout=20)
        ctx = execjs.compile(script.content.decode("utf-8"))
        return ctx.call('stringToHex', ctx.call('encrypt', skey, password))

    def _submit_login(self, username, encrypted_password, vc):
        """Post the login form.

        Raises:
            InvalidParamsError: with the site's message when ``flag`` is '-1'.
        """
        resp = self.s.post(LOGIN_URL,
                           data=dict(r=random.random(),
                                     LOGINID=username,
                                     PASSWORD=encrypted_password,
                                     IMAGCHECK=vc,
                                     OPERTYPE2=3,
                                     ISBIND='false',
                                     now=time.strftime(
                                         '%a %b %d %Y %H:%M:%S',
                                         time.localtime()),
                                     callback=''),
                           timeout=15)
        soup = BeautifulSoup(resp.content, 'html.parser')
        # The answer is wrapped in parentheses; they are stripped (all of
        # them, as in the original) before parsing the JSON.
        answer = json.loads(soup.text.replace('(', '').replace(')', ''))
        flag = answer['flag']
        message = answer['message']
        if flag == '-1':
            raise InvalidParamsError(message)

    def _load_base_info(self):
        """Query the basic insurance info and fill ``baseInfo``/identity.

        Raises:
            InvalidParamsError: when the site answers with ``flag`` '-1'.
        """
        # First request: access-log record for the basic-info query.
        resp = self._post_action(
            '{"oper":"UnitHandleCommAction.insertLogRecord","params":{},"datas":{"@tmpGtDatas":{"rightId":"500101","rightName":"参保基本信息查询","recordType":"1"}}}',
            self.s.Token)
        self.g.Token = self._response_token(resp)

        # Second request: the basic info itself (sent with self.s.Token).
        resp = self._post_action(
            '{"oper":"CbjbxxcxAction.queryGrcbjbxx","params":{},"datas":{"ncm_gt_用户信息":{"params":{}},"ncm_gt_参保状态":{"params":{}},"ncm_gt_缴纳情况":{"params":{}}}}',
            self.s.Token,
            dict(self._ORIGIN_HEADERS, **{
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
            }))
        soup = BeautifulSoup(resp.content, 'html.parser')
        self.s.Token = self._response_token(resp)
        profile = json.loads(soup.text)
        if profile['flag'] == '-1':
            raise InvalidParamsError('请您登录社保官网输入社保个人电脑号完成身份认证后,再做查询操作。')

        userinfo = profile['datas']
        base_info = self.result_data["baseInfo"]

        insurance_states = {}
        for key, value in userinfo['ncm_gt_用户信息']['params'].items():
            if key.find('参保状态') > 0:
                # The first two characters of the key are used as the name.
                insurance_states.setdefault(key[:2], value)
                continue
            base_info.setdefault('户口性质' if key == '户籍类别' else key, value)
            if key == '姓名':
                self.result_identity['target_name'] = value
            if key == '身份证号':
                self.result_identity['target_id'] = value

        renamed = {
            '养老保险累计月数': '养老实际缴费月数',
            '失业保险累计月数': '失业实际缴费月数',
        }
        max_months = 0
        for key, value in userinfo['ncm_gt_缴纳情况']['params'].items():
            base_info.setdefault(renamed.get(key, key), value)
            # Payment duration is the largest cumulative month count.
            if '保险累计月数' in key:
                max_months = max(max_months, int(value))
        base_info.setdefault('缴费时长', max_months)
        base_info.setdefault('五险状态', insurance_states)

        if '参加' in insurance_states.values():
            self.result_identity['status'] = '正常'
        else:
            self.result_identity['status'] = '停缴'

    def _unit_login(self, params: dict):
        """Log in with the supplied form and load the basic info.

        Raises:
            AskForParamsError: when no params were given or a step failed
                with an InvalidParamsError; carries the form description and
                the last error message.
        """
        err_msg = None
        # ``params`` may be None on the first call; keep the form echo safe.
        form = params or {}
        if params:
            try:
                self._check_login_params(params)
                username = params['用户名']
                password = params['密码']
                vc = params['vc']

                skey = self._fetch_skey()
                self._submit_login(
                    username, self._encrypt_password(skey, password), vc)

                # Login succeeded: pick the token up from the session jar.
                self.s.Token = self.s.cookies._cookies[self._COOKIE_DOMAIN][
                    '/']['Token'].value

                # "Query" click.
                resp = self._post_action(
                    '{"oper":"FrontPageAction.queryNavDimension","params":{},"datas":{"@tmpGtDatas":{"业务类型":"5"}}}',
                    self.s.Token)
                self.s.Token = self._response_token(resp)

                self.result_data["baseInfo"] = {
                    '城市名称': '深圳',
                    '城市编号': '440300',
                    '更新时间': time.strftime("%Y-%m-%d", time.localtime())
                }
                self._load_base_info()

                # Remember the credentials in meta.
                self.result_key = username
                self.result_meta['用户名'] = username
                self.result_meta['密码'] = password
                self.result_identity['task_name'] = '深圳'
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        raise AskForParamsError([
            dict(key='用户名',
                 name='用户名',
                 cls='input',
                 value=form.get('用户名', '')),
            dict(key='密码',
                 name='密码',
                 cls='input:password',
                 value=form.get('密码', '')),
            dict(key='vc',
                 name='验证码',
                 cls='data:image',
                 query={'t': 'vc'},
                 value=form.get('vc', '')),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login with the web driver.

        Not called by ``_unit_login`` (which logs in over HTTP); kept as an
        alternative.

        Raises:
            InvalidParamsError: with the page's message when the login
                dialog reports a failure.
        """
        with self.dsc.get_driver_ctx() as driver:
            # Open the login page and the login dialog.
            driver.get(LOGIN_PAGE_URL)
            driver.find_element_by_xpath(
                '/html/body/div[2]/div/div/div/div/div/div[2]/div[2]/div/div[1]/a'
            ).click()

            username_input = driver.find_element_by_xpath(
                '//*[@id="div_dialog_login"]/div/div/div/form/div[4]/div/div[1]/div/input'
            )
            password_input = driver.find_element_by_xpath(
                '//*[@id="div_dialog_login"]/div/div/div/form/div[4]/div/div[2]/div/input'
            )
            vc_input = driver.find_element_by_xpath(
                '//*[@id="div_dialog_login"]/div/div/div/form/div[4]/div/div[3]/div/input'
            )
            submit_btn = driver.find_element_by_xpath(
                '//*[@id="div_dialog_login"]/div/div/div/form/div[5]/input[1]')

            for element, value in ((username_input, username),
                                   (password_input, password),
                                   (vc_input, vc)):
                element.clear()
                element.send_keys(value)

            # Submit and give the page time to react.
            submit_btn.click()
            time.sleep(5)

            page_html = driver.find_element_by_tag_name('html').get_attribute(
                'innerHTML')
            bindings = BeautifulSoup(page_html,
                                     'html.parser').select('.ng-binding')
            # TODO: failure is detected by the length (16) of the second
            # '.ng-binding' element's text — a fragile heuristic.
            if len(bindings[1].text) == 16:
                raise InvalidParamsError(bindings[2].text)
            print('success')

    def _unit_fetch_userinfo(self):
        """Send the third basic-info request (arrears total) and keep its token.

        Raises:
            PreconditionNotSatisfiedError: wraps a PermissionError.
        """
        try:
            resp = self._post_action(
                '{"oper":"QfzscxAction.queryQfzs","params":{},"datas":{"ncm_gt_欠费总数":{"params":{}}}}',
                self.g.Token)
            self.s.Token = self._response_token(resp)
            # TODO: raise PermissionError when the session is not logged in.
            return
        except PermissionError as e:
            raise PreconditionNotSatisfiedError(e)

    def _iter_payment_rows(self, action_suffix, dataset_name):
        """Yield every payment row of one insurance type, page by page.

        The first page is requested with a fixed page size of 10; its
        ``rowsCount`` decides how many further pages are requested. Nothing
        is yielded when the answer has ``flag`` '-1' or no ``datas``.
        ``self.g.Token`` is refreshed after every request.
        """
        query_head = ('{"oper": "CbjfmxcxAction.queryCbjfmx' + action_suffix +
                      '", "params": {}, "datas": {"' + dataset_name + '": ')
        resp = self._post_action(
            query_head +
            '{"params": {"pageSize": 10, "curPageNum": 1}, "dataset": [], "heads": [],"heads_change": []}}}',
            self.g.Token, self._ORIGIN_HEADERS)
        self.g.Token = self._response_token(resp)
        payload = json.loads(resp.text)
        if payload['flag'] == '-1' or 'datas' not in payload:
            return

        paging = payload["datas"][dataset_name]['params']
        page_size = paging['pageSize']
        rows_count = paging['rowsCount']
        page_total = -(-rows_count // page_size)  # ceiling division

        for page in range(1, page_total + 1):
            if page != 1:
                resp = self._post_action(
                    query_head +
                    '{"params": {"pageSize": 10, "curPageNum": ' + str(page) +
                    ',"maxPageSize":50,"rowsCount":' + str(rows_count) +
                    ',"Total_showMsg":null,"Total_showMsgCell":null,"Total_Cols":[]},"heads":[],"heads_change":[],"dataset":[]}}}',
                    self.g.Token, self._ORIGIN_HEADERS)
                self.g.Token = self._response_token(resp)
                payload = json.loads(resp.text)
            yield from payload["datas"][dataset_name]['dataset']

    def _unit_fetch(self):
        """Fetch the payment details of the five insurance types.

        Each record is stored under ``result_data[section]['data'][year]
        [month]`` as a list entry. Records are appended, so several records
        in one month are all kept (the original overwrote the month's list
        with the last record only).

        Raises:
            PreconditionNotSatisfiedError: wraps a PermissionError.
        """
        try:
            # Access-log record for the payment-detail query.
            resp = self._post_action(
                '{"oper":"UnitHandleCommAction.insertLogRecord","params":{},"datas":{"@tmpGtDatas":{"rightId":"500201","rightName":"参保缴费明细查询","recordType":"1"}}}',
                self.s.Token)
            # NOTE(review): as in the original, the token of this response is
            # read and then immediately replaced by self.s.Token.
            self.g.Token = self._response_token(resp)
            self.g.Token = self.s.Token

            first_period = ''
            last_period = ''
            for action_suffix, section, dataset_name in self._DETAIL_KINDS:
                self.result_data[section] = {}
                self.result_data[section]['data'] = {}
                by_month = self.result_data[section]['data']
                personal_total = 0.00

                for row in self._iter_payment_rows(action_suffix,
                                                   dataset_name):
                    if section in ('old_age', 'medical_care'):
                        personal_total += float(row['个人缴'])

                    period = row['缴费年月'].replace('年', '').replace('月', '')
                    if len(period) == 5:
                        # Single-digit month: pad to YYYYMM.
                        period = period[:4] + '0' + period[-1:]

                    if first_period == '' or int(first_period) > int(period):
                        first_period = period
                    if last_period == '' or int(last_period) < int(period):
                        last_period = period

                    by_month.setdefault(period[:4], {}).setdefault(
                        period[-2:], []).append({
                            '缴费时间': period,
                            '缴费类型': '',
                            '缴费基数': row['缴费工资'],
                            '公司缴费': row['单位缴'],
                            '个人缴费': row['个人缴'],
                            '缴费单位': row['单位名称'],
                            '单位编号': row['单位编号'],
                            '缴费合计': row['缴费合计'],
                            '备注': row['备注']
                        })

                if section == 'old_age':
                    self.result_data["baseInfo"].setdefault(
                        '个人养老累计缴费', personal_total)
                if section == 'medical_care':
                    self.result_data["baseInfo"].setdefault(
                        '个人医疗累计缴费', personal_total)

            self.result_data["baseInfo"].setdefault('最近缴费时间', last_period)
            self.result_data["baseInfo"].setdefault('开始缴费时间', first_period)
            # TODO: raise PermissionError when the session is not logged in.
            return
        except PermissionError as e:
            raise PreconditionNotSatisfiedError(e)

    def _new_vc(self):
        """Download a fresh captcha image with the shared session."""
        resp = self.s.get(VC_URL, timeout=20)
        return dict(cls='data:image',
                    content=resp.content,
                    content_type=resp.headers.get('Content-Type'))
class Task(AbsFetchTask):
    """Demo fetch task that performs the login step through a Selenium driver.

    The web driver is only used to submit the login form; its cookies are then
    synchronised into the requests session (``self.s``), which does the actual
    fetching.  A driver created during one request is serialised into
    ``self.state`` so a later request can resume with the same browser.
    """

    task_info = {
        'task_name': '测试selenium登录',
        'help': '测试selenium登录'
    }

    def _get_common_headers(self):
        """Return the HTTP headers sent with every session request."""
        return {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
        }

    def _prepare(self, data=None):
        """Restore a previously serialised driver (if any) and build the coordinator."""
        super()._prepare(data)

        _driver = None
        if 'driver_data' in self.state:
            # NOTE(review): dill.loads executes arbitrary code contained in the
            # payload; this is only safe while self.state cannot be tampered
            # with by the client -- confirm where the state is stored.
            _driver = dill.loads(self.state['driver_data'])
            del self.state['driver_data']

        self.dsc = DriverRequestsCoordinator(d=_driver,
                                             create_session=self._create_session,
                                             create_driver=self._create_driver)

    def _create_driver(self):
        """Create a web driver that already sits on the login page."""
        driver = create_driver()
        driver.get(LOGIN_PAGE_URL)
        return driver

    def _create_session(self):
        """Hand the task's own requests session to the coordinator."""
        return self.s

    def _update_session_data(self):
        """Persist the driver alongside the regular session data."""
        super()._update_session_data()
        if self.dsc.d_is_created:
            self.state['driver_data'] = dill.dumps(self.dsc.d)

    def _setup_task_units(self):
        """Register the units: fetching depends on a successful login."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _query(self, params: dict):
        """Answer auxiliary queries; ``t == 'vc'`` returns a fresh captcha."""
        t = params.get('t')
        if t == 'vc':
            return self._new_vc()

    def _new_vc(self):
        """Download a new captcha image through the requests session."""
        # Make sure both sides exist before the captcha request is issued.
        self.dsc.create_driver()
        self.dsc.create_session()
        # Millisecond timestamp appended to the URL (cache buster).
        vc_url = VC_IMAGE_URL + str(int(time.time() * 1000))
        resp = self.s.get(vc_url)
        return dict(cls='data:image',
                    content=resp.content,
                    content_type=resp.headers.get('Content-Type'))

    def _params_handler(self, params: dict):
        """Fill missing credentials from the saved meta, except on a bare start."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            if '身份证号' not in params:
                params['身份证号'] = meta.get('身份证号')
            if '查询密码' not in params:
                params['查询密码'] = meta.get('查询密码')
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop the requirements whose value is already available in the saved meta."""
        meta = self.prepared_meta
        res = []
        for pr in param_requirements:
            # TODO: inspect `details` as well
            if pr['key'] == '身份证号' and '身份证号' in meta:
                continue
            elif pr['key'] == '查询密码' and '查询密码' in meta:
                continue
            res.append(pr)
        return res

    def _check_login_params(self, params):
        """Raise AssertionError (with a user-facing message) for a missing field."""
        assert params is not None, '缺少参数'
        assert '身份证号' in params, '缺少身份证号'
        assert '查询密码' in params, '缺少查询密码'
        assert 'vc' in params, '缺少验证码'

    def _unit_login(self, params=None):
        """Log in with the given parameters, or ask the caller for them.

        Raises:
            AskForParamsError: when no parameters were given or the login
                failed; carries the form description and the error message.
        """
        err_msg = None
        if params:
            try:
                self._check_login_params(params)
                username = params['身份证号']
                password = params['查询密码']
                vc = params['vc']

                self._do_login(username, password, vc)

                # Login succeeded.
                self.result_key = username
                # Stored under '查询密码': that is the key _params_handler and
                # _param_requirements_handler look up, so the saved password is
                # actually reused on the next run.
                self.result_meta.update({
                    '身份证号': username,
                    '查询密码': password
                })
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # `params` may be None on the first call; the prompt below reads it.
        params = params or {}
        raise AskForParamsError([
            dict(key='身份证号', name='身份证号', cls='input', value=params.get('身份证号', '')),
            dict(key='查询密码', name='个人编号', cls='input', value=params.get('查询密码', '')),
            dict(key='vc', name='验证码', cls='data:image', query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Drive the login form with the web driver.

        Raises:
            InvalidParamsError: when the browser does not end up on the
                post-login page.
        """
        with self.dsc.get_driver_ctx(excepted_exceptions=(InvalidParamsError,)) as driver:
            # Switch to the "login by ID card number" tab.
            driver.find_element_by_xpath('/html/body/table[2]/tbody/tr[3]/td/table/tbody/tr/td/div/form/div[1]/ul/li[3]/a').click()

            username_input = driver.find_element_by_xpath('//*[@id="bh1"]')
            password_input = driver.find_element_by_xpath('//*[@id="mm1"]')
            vc_input = driver.find_element_by_xpath('//*[@id="login_tab_2"]/div/div[3]/input')
            submit_btn = driver.find_element_by_xpath('//*[@id="login_tab_2"]/div/div[4]/input[1]')

            # User name
            username_input.clear()
            username_input.send_keys(username)

            # Password
            password_input.clear()
            password_input.send_keys(password)

            # Captcha
            vc_input.clear()
            vc_input.send_keys(vc)

            # Submit
            submit_btn.click()

            if not driver.current_url == 'http://www.bjgjj.gov.cn/wsyw/wscx/gjjcx-choice.jsp':
                raise InvalidParamsError('登录失败,请检查输入')

            # Login succeeded: copy the driver cookies into the session.
            self.dsc.create_session()
            self.dsc.inc_and_sync_d_cookies()

            # Keep the post-login page for the fetch unit.
            self.g.login_page_html = driver.find_element_by_tag_name('html').get_attribute('innerHTML')
            self.g.current_url = driver.current_url

    def _unit_fetch(self):
        """Follow the second link of the post-login page and record it."""
        # Imported locally so this method does not depend on how the module
        # imports urllib.
        from urllib.parse import urljoin

        try:
            # TODO:
            soup = bs4.BeautifulSoup(self.g.login_page_html, 'html.parser')
            a = soup.select('a')[1]
            link = a.attrs['onclick'].split('"')[1]
            # Resolve against the page URL. Filesystem path joining is not URL
            # resolution: it yields a broken address for rooted or absolute
            # links and depends on the platform's path separator.
            link = urljoin(self.g.current_url, link)
            resp = self.s.get(link)
            print(html.unescape(resp.text))

            self.result_data.update({
                'xxx': a.text,
                'link': link
            })

            self.result_identity.update({
                'task_name': self.task_info['task_name']
            })
        except InvalidConditionError as e:
            raise PreconditionNotSatisfiedError(e)
class Task(AbsFetchTask):
    """Fetch task for the Beijing housing-fund site.

    Login is performed with a web driver (ID-card tab ``"3"`` or co-branded
    card tab ``"1"``); the company pages and transaction details are then
    fetched with the requests session and stored in ``self.result_data``.
    """

    task_info = dict(
        city_name="北京市",
        expect_time=10,
        help=
        """<li>首次登陆查询功能,验证方式必须选择联名卡号;初始密码为身份证后四位阿拉伯数字+00。为了保证您的个人信息安全,请您及时修改初始密码,如有问题请拨打“住房公积金热线12329”咨询</li> """,
        developers=[{
            'name': '赵伟',
            'email': '*****@*****.**'
        }])

    def _get_common_headers(self):
        """Return the HTTP headers sent with every session request."""
        return {'User-Agent': USER_AGENT}

    def _prepare(self, data=None):
        """Build the driver/session coordinator on top of the task session."""
        super()._prepare(data)
        self.dsc = DriverRequestsCoordinator(s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create a PhantomJS driver without proxy and with load/script timeouts."""
        driver = new_driver(user_agent=USER_AGENT, js_re_ignore='/PicCheckCode1/g')
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.DIRECT
        proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)

        # driver.get(url) has been seen to hang forever without raising; a
        # page-load timeout (13 s) keeps the task from blocking.
        driver.set_page_load_timeout(13)
        # Script timeout, also 13 s.
        driver.set_script_timeout(13)
        return driver

    def _setup_task_units(self):
        """Register the units: fetching depends on a successful login."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _query(self, params: dict):
        """Answer auxiliary queries; ``t == 'vc'`` returns a fresh captcha."""
        t = params.get('t')
        if t == 'vc':
            return self._new_vc()

    def _new_vc(self):
        """Download a new captcha image through the requests session."""
        vc_url = VC_IMAGE_URL + str(int(time.time()) * 1000)
        resp = self.s.get(vc_url, verify=False)
        return dict(cls='data:image',
                    content=resp.content,
                    content_type=resp.headers.get('Content-Type'))

    def _params_handler(self, params: dict):
        """Fill missing form fields from the saved meta, except on a bare start."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            if 'bh1' not in params:
                params['bh1'] = meta.get('账号')
            if 'mm1' not in params:
                params['mm1'] = meta.get('密码')
            if 'bh5' not in params:
                params['bh5'] = meta.get('账号')
            if 'mm5' not in params:
                params['mm5'] = meta.get('密码')
            if 'other' not in params:
                params['other'] = meta.get('类型Code')
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop requirements that belong to the other tab or are already saved.

        With no (or an unknown) saved login type every requirement is kept.
        """
        meta = self.prepared_meta
        # .get(): the saved meta has no login type before the first login.
        login_type = meta.get('类型Code')
        if login_type == '3':  # ID card
            other_tab, account_key, password_key = ('bh5', 'mm5'), 'bh1', 'mm1'
        elif login_type == '1':  # co-branded card
            other_tab, account_key, password_key = ('bh1', 'mm1'), 'bh5', 'mm5'
        else:
            return list(param_requirements)

        res = []
        for pr in param_requirements:
            key = pr['key']
            if key in other_tab:
                continue
            if key == account_key and '账号' in meta:
                continue
            if key == password_key and '密码' in meta:
                continue
            res.append(pr)
        return res

    def _check_login_parama(self, params):
        """Raise AssertionError (with a user-facing message) for invalid input."""
        assert params is not None, '缺少参数'
        assert 'other' in params, '请选择登录方式'
        # Only the two tabs handled by _do_login are valid.
        assert params["other"] in ("1", "3"), '请选择登录方式'
        if params["other"] == "1":
            assert 'bh5' in params, '缺少联名卡号'
            assert 'mm5' in params, '缺少密码'
        elif params["other"] == "3":
            assert 'bh1' in params, '缺少身份证号码'
            assert 'mm1' in params, '缺少密码'
            r = r'(^\d{15}$)|(^\d{18}$)|(^\d{17}(\d|X|x)$)'
            # `or ''`: _params_handler may have filled the field with None.
            assert re.findall(r, params['bh1'] or ''), '请输入有效的身份证编号'
        assert 'vc' in params, '缺少验证码'
        # TODO: validate ID card checksum
        # TODO: validate password
        # TODO: validate captcha

    def _unit_login(self, params=None):
        """Log in with the given parameters, or ask the caller for them.

        Raises:
            AskForParamsError: when no parameters were given or the login
                failed; carries the form description and the error message.
        """
        err_msg = None
        if params:
            try:
                self._check_login_parama(params)
                login_type = params["other"]
                # Tab "1" uses the bh5/mm5 fields, everything else bh1/mm1.
                code = "5" if login_type == "1" else "1"
                username = params['bh' + code]
                password = params['mm' + code]
                vc = params['vc']
                self._do_login(username, password, vc, login_type)

                # Login succeeded.
                self.result_key = username
                self.result_meta.update({
                    '账号': username,
                    '密码': password,
                    '类型Code': login_type
                })
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # `params` may be None on the first call; the prompt below reads it.
        params = params or {}
        # NOTE(review): the value= lookups below use the meta key names
        # ('类型Code', '账号', '密码'), none of which is a form key, so the
        # fields are always presented empty -- confirm whether echoing the
        # submitted values was intended.
        raise AskForParamsError([
            dict(
                key='other',
                name=
                '[{"tabName":"身份证号","tabCode":"3","isEnable":"1"},{"tabName":"联名卡号","tabCode":"1","isEnable":"1"}]',
                cls='tab',
                value=params.get('类型Code', '')),
            dict(key='bh1', name='身份证号', cls='input', tabCode="3", value=params.get('账号', '')),
            dict(key='mm1', name='密码', cls='input:password', tabCode="3", value=params.get('密码', '')),
            dict(key='bh5', name='联名卡号', cls='input', tabCode="1", value=params.get('账号', '')),
            dict(key='mm5', name='密码', cls='input:password', tabCode="1", value=params.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image', query={'t': 'vc'}, tabCode="[3,1]", value=''),
        ], err_msg)

    def _do_login(self, username, password, vc, type):
        """Drive the login form with the web driver.

        Args:
            type: login tab, ``"1"`` (co-branded card) or ``"3"`` (ID card).

        Raises:
            InvalidParamsError: for an unknown tab, or when the site rejects
                the login.
        """
        # Resolve the tab first: an unknown value used to fail further down
        # with an unbound local after the page had already been driven.
        if type == "1":
            field_suffix, tab_index = "5", "0"
        elif type == "3":
            field_suffix, tab_index = "1", "2"
        else:
            raise InvalidParamsError('请选择登录方式')

        with self.dsc.get_driver_ctx() as driver:
            # Open the login page.
            driver.get(LOGIN_PAGE_URL)
            # Wait until the "lk" element holds a number.
            WebDriverWait(driver, 10).until(
                value_is_number((By.XPATH, '//*[@id="lk"]')))

            # Select the login tab.
            driver.find_element_by_xpath(
                '/html/body/table[2]/tbody/tr[3]/td/table/tbody/tr/td/div/form/div[1]/ul/li['
                + type + ']/a').click()

            username_input = driver.find_element_by_xpath('//*[@id="bh' + field_suffix + '"]')
            password_input = driver.find_element_by_xpath('//*[@id="mm' + field_suffix + '"]')
            vc_input = driver.find_element_by_xpath('//*[@id="login_tab_' + tab_index + '"]/div/div[3]/input')
            submit_btn = driver.find_element_by_xpath('//*[@id="login_tab_' + tab_index + '"]/div/div[4]/input[1]')

            # User name
            username_input.clear()
            username_input.send_keys(username)

            # Password
            password_input.clear()
            password_input.send_keys(password)

            # Captcha
            vc_input.clear()
            vc_input.send_keys(vc)

            # Submit
            submit_btn.click()

            # Keep the post-login page for the fetch unit.
            self.g.login_page_html = driver.find_element_by_tag_name(
                'html').get_attribute('innerHTML')
            self.g.current_url = driver.current_url

            if not driver.current_url == LOGINED_URL:
                # The site sometimes skips the company list and goes straight
                # to the result page; fake a one-row company list that links
                # to it so the fetch unit can proceed unchanged.
                if RESULT_URL in driver.current_url:
                    self.g.login_page_html = '''
                    <table id="new-mytable">
                    <tbody><tr style="background-color: rgb(241, 241, 241);">
                    <th class="style21"><div align="center">开户登记号</div></th>
                    <th class="style21"><div align="center">单位名称</div></th>
                    <th class="style21"><div align="center">缴存状态</div></th>
                    </tr>
                    <script>mnbbc='3';</script>
                    <tr style="background-color: rgb(241, 241, 241);">
                    <td class="style21"><div align="center">000000</div></td>
                    <td class="style21"><div><a href="#" onclick='javascript:window.open("''' + driver.current_url + '''","","top=0,left=0,toolbar=no,location=no,status=no,menubar=no,scrollbars=yes,resizable=yes,width=550,height=500");'></a></div></td>
                    <td class="style21"><div align="center">缴存</div></td>
                    </tr>
                    </tbody>
                    </table>
                    '''
                else:
                    raise InvalidParamsError('登录失败,请检查输入')
            else:
                # Some accounts reach the page but only get a warning (e.g.
                # insecure password) instead of the company list.
                soup = bs4.BeautifulSoup(self.g.login_page_html, 'html.parser')
                companyList = soup.findAll("table", {"id": "new-mytable"})
                error = soup.find("span", {"class": "tittle1"})
                if len(companyList) <= 0 and error:
                    raise InvalidParamsError(error.text.replace("\n", ""))

    @staticmethod
    def _data_rows(table):
        """Return the ``<td>`` lists of *table*'s rows, skipping the first row and rows without cells."""
        cells_per_row = (tr.findAll("td") for tr in table.findAll("tr")[1:])
        return [tds for tds in cells_per_row if len(tds) > 0]

    def _record_detail_row(self, date, detail_tds, company_name, last_income, pay_months):
        """Store one transaction row and update the running income statistics.

        Args:
            date: whitespace-stripped text of the first cell; its first four
                characters are used as the year key and the next two as the
                month key.
            detail_tds: the row's ``<td>`` elements.
            company_name: value stored under "单位名称".
            last_income: dict with "date"/"data", updated in place to the
                latest positive deposit.
            pay_months: list of ``date[0:6]`` values with a positive deposit,
                extended in place.
        """
        kind = re.sub(r'\s', '', detail_tds[2].text)
        income = re.sub(r'\s', '', detail_tds[3].text)
        month_records = (self.result_data["detail"]["data"]
                         .setdefault(date[0:4], {})
                         .setdefault(date[4:6], []))

        amount = income.replace(",", "")
        try:
            # Substring test: str.find() returns -1 (truthy) when the text is
            # absent, so it cannot be used as a boolean.
            if ("汇缴" in kind or "补缴" in kind) and Decimal(amount) > 0:
                if last_income["date"] == "" or int(last_income["date"]) < int(date):
                    last_income.update({"date": date, "data": amount})
                if date[0:6] not in pay_months:
                    pay_months.append(date[0:6])
        except (ArithmeticError, ValueError):
            # Non-numeric amount or date (decimal.InvalidOperation is an
            # ArithmeticError): the row is still recorded below, it just does
            # not count as a deposit.
            pass

        month_records.append({
            "时间": date[0:4] + "-" + date[4:6] + "-" + date[6:],
            "类型": kind,
            "汇缴年月": re.sub(r'\s', '', detail_tds[1].text),
            "收入": income,
            "支出": re.sub(r'\s', '', detail_tds[4].text),
            "余额": re.sub(r'\s', '', detail_tds[5].text),
            "单位名称": company_name
        })

    def _parse_company_page(self, page_html, summary, last_income, pay_months):
        """Parse one company page into ``self.result_data``.

        ``summary`` ("name", "target_id", "status") is updated in place as soon
        as each value is read, so values parsed before a later failure are
        kept.  Cells are addressed by fixed position in the page's second
        table.
        """
        result = bs4.BeautifulSoup(page_html, 'html.parser')
        _tds = result.findAll("table")[1].findAll("td")

        summary["name"] = _tds[27].text
        summary["target_id"] = _tds[33].text
        summary["status"] = _tds[45].text

        self.result_data["baseInfo"] = {
            "姓名": _tds[27].text,
            "证件号": _tds[33].text,
            "证件类型": _tds[31].text,
            "个人登记号": _tds[29].text,
            "更新时间": datetime.datetime.now().strftime('%Y-%m-%d'),
            '城市名称': '北京市',
            '城市编号': '110100',
            '最近汇款日期': '',
            '最近汇款金额': 0.0,
            '累计汇款次数': 0.0
        }

        self.result_data["companyList"].append({
            "最后业务日期": re.sub(r'\s', '', _tds[53].text),
            "单位名称": _tds[37].text,
            "单位登记号": _tds[35].text,
            "所属管理部编号": _tds[39].text,
            "所属管理部名称": _tds[41].text,
            "当前余额": re.sub(r'\s', '', _tds[43].text).replace("元", ""),
            "帐户状态": _tds[45].text,
            "当年缴存金额": re.sub(r'\s', '', _tds[47].text).replace("元", ""),
            "当年提取金额": re.sub(r'\s', '', _tds[49].text).replace("元", ""),
            "上年结转余额": re.sub(r'\s', '', _tds[51].text).replace("元", ""),
            "转出金额": re.sub(r'\s', '', _tds[55].text).replace("元", "")
        })
        company_name = _tds[37].text

        detail_tag = result.findAll("span", {"class": "style2"})
        # Data after July 2017 is not fully reliable on the linked detail page,
        # so the on-page table is merged in afterwards.
        temp_detail = result.findAll("table", {"id": "tab-style"})
        seen_dates = set()

        # The detail link sits in the second matching span, so two are needed.
        if len(detail_tag) > 1:
            detail_a = detail_tag[1].findAll("a")[0]
            detail_link = detail_a.attrs['onclick'].split("'")[1]
            detail_link = parse.urljoin(self.g.current_url, detail_link)
            detail_resp = self.s.get(detail_link, verify=False)
            detail_result = bs4.BeautifulSoup(detail_resp.content, 'html.parser')
            detail_table = detail_result.find("table", {"id": "new-mytable3"})
            if detail_table is not None:
                for detail_tds in self._data_rows(detail_table):
                    date = re.sub(r'\s', '', detail_tds[0].text)
                    self._record_detail_row(date, detail_tds, company_name,
                                            last_income, pay_months)
                    seen_dates.add(date)

        if len(temp_detail) > 0:
            for detail_tds in self._data_rows(temp_detail[0]):
                date = re.sub(r'\s', '', detail_tds[0].text)
                # Only dates missing from the linked detail page are added.
                if date not in seen_dates:
                    self._record_detail_row(date, detail_tds, company_name,
                                            last_income, pay_months)

    def _unit_fetch(self):
        """Fetch every company page listed after login and fill the result fields."""
        try:
            # Initialise the result containers.
            self.result_data["baseInfo"] = {}
            self.result_data["companyList"] = []
            self.result_data["detail"] = {"data": {}}

            # Parse the page saved by the login unit.
            soup = bs4.BeautifulSoup(self.g.login_page_html, 'html.parser')
            companyList = soup.findAll("table", {"id": "new-mytable"})

            summary = {"name": '', "target_id": '', "status": ''}
            # Months with at least one positive deposit.
            payMonth = []
            last_income = {"date": "", "data": ""}

            if len(companyList) > 0:
                trs = companyList[0].findAll("tr")
                # Bottom-up, without the header (first) row.
                for tr in reversed(trs[1:]):
                    tds = tr.findAll("td")
                    a = tds[1].findAll("a")[0]
                    link = a.attrs['onclick'].split('"')[1]
                    link = parse.urljoin(self.g.current_url, link)
                    resp = self.s.get(link, verify=False)
                    try:
                        self._parse_company_page(resp.text, summary, last_income, payMonth)
                    except Exception:
                        # Best effort: a company page that cannot be parsed is
                        # skipped so the remaining companies are still fetched.
                        pass

            # NOTE(review): the comparison uses the traditional character
            # "繳", while the status cell text mocked in _do_login is the
            # simplified "缴存"; if the site also returns the simplified form
            # this always yields "封存" -- confirm.
            if summary["status"] == "繳存":
                paymentStart = "繳存"
            else:
                paymentStart = "封存"

            self.result_identity.update({
                'task_name': self.task_info['city_name'],
                'target_name': summary["name"],
                'target_id': summary["target_id"],
                'status': paymentStart
            })

            try:
                self.result_data["baseInfo"].update({
                    "最近汇款日期": last_income["date"][0:4] + "-" + last_income["date"][4:6],
                    "最近汇款金额": float(last_income["data"]),
                    "累计汇款次数": len(payMonth),
                })
            except ValueError:
                # No deposit was found (amount is still ""): keep the defaults.
                pass

        except InvalidConditionError as e:
            raise PreconditionNotSatisfiedError(e)
class Task(AbsFetchTask):
    """Fetch task for the Yantai social-insurance site.

    Login is performed with a web driver; basic information and the payment
    history of each insurance type are then fetched with the requests session
    and stored in ``self.result_data``.
    """

    # noinspection PyAttributeOutsideInit
    task_info = dict(
        city_name="烟台",
        help="""<li>如您未在社保网站查询过您的社保信息,请到烟台社保网上服务平台完成“注册”然后再登录。</li> <li>如您忘记密码,可使用注册时绑定的手机号或者电子邮箱进行密码找回;当不能通过手机和电子邮箱找回密码,需去社保机构现场重置密码。</li>""",
        developers=[{
            'name': '卜圆圆',
            'email': '*****@*****.**'
        }])

    def _get_common_headers(self):
        """Return the HTTP headers sent with every session request."""
        return {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'
        }

    def _setup_task_units(self):
        """Register the units: fetching depends on a successful login."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch_name, self._unit_login)

    def _query(self, params: dict):
        """Answer auxiliary queries; ``t == 'vc'`` returns a fresh captcha."""
        t = params.get('t')
        if t == 'vc':
            return self._new_vc()

    def _prepare(self, data=None):
        """Build the driver/session coordinator on top of the task session."""
        super()._prepare(data)
        self.dsc = DriverRequestsCoordinator(s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create a web driver that already sits on the main page."""
        # Raw string: "\/" is not a valid Python escape sequence.
        driver = new_driver(user_agent=USER_AGENT, js_re_ignore=r'/web\/ImageCheck.jpg/g')
        driver.get(MAIN_URL)
        return driver

    # noinspection PyMethodMayBeStatic
    def _check_login_params(self, params):
        """Validate the login parameters.

        Raises:
            AssertionError: a required field is missing.
            InvalidParamsError: the ID number or password is empty or too short.
        """
        assert params is not None, '缺少参数'
        assert '身份证号' in params, '缺少身份证号'
        assert '密码' in params, '缺少密码'
        assert 'vc' in params, '缺少验证码'
        # `or ''`: _params_handler fills absent fields from the saved meta and
        # may therefore store None; treat that like an empty value.
        id_num = params['身份证号'] or ''
        password = params['密码'] or ''

        if len(id_num) == 0:
            raise InvalidParamsError('身份证号为空,请输入身份证号')
        elif len(id_num) < 15:
            raise InvalidParamsError('身份证号不正确,请重新输入')

        if len(password) == 0:
            raise InvalidParamsError('密码为空,请输入密码!')
        elif len(password) < 6:
            raise InvalidParamsError('密码不正确,请重新输入!')

    def _params_handler(self, params: dict):
        """Fill missing credentials from the saved meta, except on a bare start."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            if '身份证号' not in params:
                params['身份证号'] = meta.get('身份证号')
            if '密码' not in params:
                params['密码'] = meta.get('密码')
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop the requirements whose value is already available in the saved meta."""
        meta = self.prepared_meta
        res = []
        for pr in param_requirements:
            # TODO: inspect `details` as well
            if pr['key'] == '身份证号' and '身份证号' in meta:
                continue
            elif pr['key'] == '密码' and '密码' in meta:
                continue
            res.append(pr)
        return res

    def _unit_login(self, params=None):
        """Log in with the given parameters, or ask the caller for them.

        Raises:
            AskForParamsError: when the login was not attempted or failed;
                carries the form description and the error message.
        """
        # Local import: keeps the block independent of the module's imports.
        import ast

        err_msg = None
        if not self.is_start or params:
            # Not the first call, or the first call already carried parameters.
            try:
                self._check_login_params(params)
                id_num = params['身份证号']
                password = params['密码']
                vc = params['vc']
                self._do_login(id_num, password, vc)

                errormsg = self.s.soup.text
                if errormsg:
                    if len(errormsg) > 20:
                        # SECURITY: this text comes from the remote site. It
                        # used to be passed to eval(), which would execute
                        # anything the server (or a man in the middle) sent;
                        # literal_eval only accepts Python literals.
                        dicts = ast.literal_eval(errormsg.replace('true', '"true"'))
                        self.g.usersession_uuid = dicts['__usersession_uuid']
                    else:
                        raise InvalidParamsError(errormsg)

                self.result_key = id_num
                # Save for the next run.
                self.result_meta['身份证号'] = id_num
                self.result_meta['密码'] = password

                self.result_identity['task_name'] = '烟台'
                self.result_identity['target_id'] = id_num
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # `params` may be None here; the prompt below reads it.
        params = params or {}
        raise AskForParamsError([
            dict(key='身份证号', name='身份证号', cls='input', value=params.get('身份证号', '')),
            dict(key='密码', name='密码', cls='input:password', value=params.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image', query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Drive the login form with the web driver.

        On success the post-login page is parsed into ``self.s.soup``.

        Raises:
            InvalidParamsError: when the browser is still on the main page
                after submitting; the message is the alert text if one can be
                read.
        """
        with self.dsc.get_driver_ctx() as driver:
            # Open the login page.
            driver.get(MAIN_URL)

            username_input = driver.find_element_by_xpath(
                '//*[@id="yhmInput"]')
            password_input = driver.find_element_by_xpath('//*[@id="mmInput"]')
            vc_input = driver.find_element_by_xpath(
                '//*[@id="authcode_result"]')

            # User name
            username_input.clear()
            username_input.send_keys(username)

            # Password
            password_input.clear()
            password_input.send_keys(password)

            # Captcha
            vc_input.clear()
            vc_input.send_keys(vc)

            # Submit through the page's own login function.
            driver.execute_script(
                'onLogin("1.0.68","105","mainFrame.jsp?","1","")')
            # Fixed wait for the login round trip.
            time.sleep(8)

            if driver.current_url != MAIN_URL:
                # Keep the post-login page for _unit_login.
                login_page_html = driver.find_element_by_tag_name(
                    'html').get_attribute('innerHTML')
                self.s.soup = BeautifulSoup(login_page_html, 'html.parser')
            else:
                # FIXME: alert handling is best effort.
                err_msg = '登录失败,请检查输入'
                try:
                    err_msg = driver.switch_to.alert.text
                except Exception:
                    # No alert, or it cannot be read: keep the generic message.
                    pass
                raise InvalidParamsError(err_msg)

    def _post_form(self, url, method):
        """POST the AJAX form for *method* to *url* and return the parsed response."""
        resp = self.s.post(
            url,
            # String keys spell out the exact form field names.
            data={
                'method': method,
                '__usersession_uuid': self.g.usersession_uuid,
                '_random': random.random(),
            },
            headers={
                'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                'X-Requested-With': 'XMLHttpRequest'
            })
        return BeautifulSoup(resp.content, 'html.parser')

    @staticmethod
    def _standard_entry(cells):
        """Map a payment row to a record (layout shared by five insurance types)."""
        return {
            '险种': cells[0],
            '缴费时间': cells[1],
            '缴费类型': '',
            '缴费基数': cells[2].replace(',', ''),
            '公司缴费': cells[3],
            '个人缴费': cells[4],
            '单位编号': cells[5],
            '缴费单位': cells[6]
        }

    @staticmethod
    def _serious_illness_entry(cells):
        """Map a serious-illness row to a record (no company share, type in the last column)."""
        return {
            '险种': cells[0],
            '缴费时间': cells[1],
            '缴费类型': cells[6],
            '缴费基数': cells[2].replace(',', ''),
            '公司缴费': '',
            '个人缴费': cells[3],
            '单位编号': cells[4],
            '缴费单位': cells[5]
        }

    def _store_history(self, soup, key, build_entry, sum_personal=False):
        """Parse the second table of *soup* into ``self.result_data[key]['data']``.

        Rows are stored as ``data[year][month] = [record]``, where year and
        month are the first four and last two characters of the row's second
        value.

        Args:
            soup: parsed history page.
            key: result_data key of the insurance type.
            build_entry: callable turning a row's values into a record dict.
            sum_personal: when true, add up the fifth value of every row.

        Returns:
            The personal-payment total (0.0 when ``sum_personal`` is false).
        """
        self.result_data[key] = {}
        self.result_data[key]['data'] = {}
        history = self.result_data[key]['data']

        total = 0.00
        years = ''
        for tr in soup.findAll('table')[1].findAll('tr'):
            cells = [td.text for td in tr.find_all('td')]
            if not cells:
                # Row without data cells (e.g. a header made of <th>).
                continue
            if cells[0] == '':
                # Values are rendered as <input> elements instead of text.
                cells = [node.attrs['value'] for node in tr.find_all('input')]

            yearmonth = cells[1]
            if sum_personal:
                total = total + float(cells[4])

            if years == '' or years != yearmonth[:4]:
                years = yearmonth[:4]
                history[years] = {}
            # NOTE(review): a later row of the same month replaces the earlier
            # one (only the last row per month is kept) -- kept as in the
            # previous implementation; confirm whether rows should accumulate.
            history[years][yearmonth[-2:]] = [build_entry(cells)]
        return total

    def _unit_fetch_name(self):
        """Fetch the basic information and the payment history of every insurance type."""
        try:
            data = self.result_data

            # Basic information: values are read from <input> elements by position.
            soup = self._post_form(INFO_URL, 'returnMain')
            inputs = soup.findAll('input')

            def value_of(index):
                return inputs[index].attrs['value']

            def status_of(index):
                return '正常参保' if value_of(index) == '参保缴费' else '停缴'

            arrcbsj = [value_of(index) for index in (6, 10, 14, 18, 22, 26)]
            baseinfoarr = {
                '养老': status_of(7),
                '医疗': status_of(11),
                '失业': status_of(15),
                '工伤': status_of(19),
                '生育': status_of(23),
                # The sixth insurance type takes its name from the page.
                value_of(25): status_of(27)
            }
            data['baseInfo'] = {
                '姓名': value_of(0),
                '身份证号': value_of(1),
                '手机号码': value_of(2),
                '家庭住址': value_of(3),
                '通讯地址': value_of(4),
                '五险状态': baseinfoarr,
                '开始缴费时间': min(arrcbsj),
                "更新时间": datetime.datetime.now().strftime('%Y-%m-%d'),
                '城市名称': '烟台',
                '城市编号': '370600'
            }
            self.result_identity['target_name'] = value_of(0)
            idtstatus = '停缴'
            if '正常参保' in baseinfoarr.values():
                idtstatus = '正常参保'
            self.result_identity['status'] = idtstatus

            # Old-age insurance (also provides the summary sentence).
            soup = self._post_form(YL_URL, 'queryAgedPayHis')
            spantext = soup.findAll('span')[1].text.split(',')
            data['baseInfo']['缴费时长'] = int(spantext[0].replace('共缴费', '').replace(
                '个月', ''))
            data['baseInfo']['最近缴费时间'] = spantext[3].replace('缴费年月为', '').replace(
                '。', '')
            data['baseInfo']['个人养老累计缴费'] = self._store_history(
                soup, 'old_age', self._standard_entry, sum_personal=True)

            # Medical insurance
            soup = self._post_form(YIL_URL, 'queryMediPayHis')
            data['baseInfo']['个人医疗累计缴费'] = self._store_history(
                soup, 'medical_care', self._standard_entry, sum_personal=True)

            # Work-injury insurance
            soup = self._post_form(GS_URL, 'queryHarmPayHis')
            self._store_history(soup, 'injuries', self._standard_entry)

            # Maternity insurance
            soup = self._post_form(SHY_URL, 'queryBirthPayHis')
            self._store_history(soup, 'maternity', self._standard_entry)

            # Unemployment insurance
            soup = self._post_form(SY_URL, 'queryLostPayHis')
            self._store_history(soup, 'unemployment', self._standard_entry)

            # Serious-illness insurance. Parsed into its own structure only;
            # the former copy of this code reset an entry of the unemployment
            # data by mistake.
            soup = self._post_form(YIL_URL, 'queryEmpJfxxZzCxDe')
            self._store_history(soup, 'serious_illness', self._serious_illness_entry)

            return
        except PermissionError as e:
            raise PreconditionNotSatisfiedError(e)

    def _new_vc(self):
        """Build the captcha image from the four base64 pieces the site returns.

        The site sends an arithmetic captcha as separate images (left number,
        operator, right number, equals sign); they are pasted side by side on
        a white canvas and returned as PNG bytes.
        """
        resps = json.loads(self.s.get(VC_URL).text)
        pieces = [
            resps['numLeftBase64'],
            resps['operatorBase64'],
            resps['numRightBase64'],
            resps['equalsBase64'],
        ]

        canvas = Image.new('RGB', (110, 50), (255, 255, 255))
        for index, encoded in enumerate(pieces):
            piece = Image.open(io.BytesIO(base64.b64decode(encoded)))
            if piece.mode == "P":
                # convert() returns a new image; the result must be kept.
                piece = piece.convert("RGB")
            canvas.paste(piece, (index * 22 + 15, 10))

        buffer = io.BytesIO()
        canvas.save(buffer, "PNG")
        return dict(cls='data:image', content=buffer.getvalue(), content_type='image/png')
class Task(AbsFetchTask):
    """Fetch task for the Chongqing (重庆) housing provident fund site.

    Login is done with plain HTTP requests against ``LOGIN_PAGE_URL`` (an
    ASP.NET style form with view-state fields and a captcha).  After login
    the account summary (``INFO_URL``) and the transaction detail
    (``MINGXI_URL``) pages are scraped into ``self.result_data``.
    """

    task_info = dict(
        city_name="重庆",
        help=('<li>初始密码为公积金账号后四位+00;可登录重庆住房公积金管理中心官网后进行修改。</li> '
              '<li>未验证注册用户首次登录时需进行身份验证,具体验证方式如下:用户通过输入公积金联名卡后六位(若用户未办理公积金联名卡的须输入个人公积金账号)验证登录。</li>'),
        developers=[{
            'name': '卜圆圆',
            'email': '*****@*****.**'
        }])

    # (substring, replacement) pairs applied in order to the labels scraped
    # from the summary table so they match the keys used in ``baseInfo``.
    _BASE_INFO_LABEL_REPLACEMENTS = (
        (' ', ''),
        ('身份证号码', '证件号'),
        ('开户时间', '开户日期'),
        ('个人月缴交额(元)', '个人月缴存额'),
        ('单位月缴交额(元)', '单位月缴存额'),
        ('个人公积金帐号', '公积金帐号'),
        ('个人序号', '个人账号'),
        ('当前余额(元)', '当前余额'),
        ('当前状态', '帐户状态'),
        (':', ''),
    )

    def _get_common_headers(self):
        """Return the HTTP headers sent with every request of this task."""
        return {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'
        }

    def _query(self, params: dict):
        """Answer a task status query; ``t == 'vc'`` returns a new captcha."""
        t = params.get('t')
        if t == 'vc':
            return self._new_vc()

    def _prepare(self, data=None):
        """Run the base preparation and set up the driver/requests coordinator."""
        super()._prepare(data)
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create and configure the web driver used by ``_do_login``."""
        # Raw string: same value as before, but '\/' is no longer an invalid
        # escape sequence in a normal string literal.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/web\/ImageCheck.jpg/g')
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.DIRECT
        proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        # driver.get(url) has been seen to hang forever without raising;
        # explicit timeouts keep the task from getting stuck.
        driver.set_page_load_timeout(13)
        driver.set_script_timeout(13)
        driver.get('https://www.cqgjj.cn/xxx')
        return driver

    def _setup_task_units(self):
        """Register the execution units: login first, then fetch."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _check_login_params(self, params):
        """Validate the login parameters.

        Raises:
            AssertionError: a required key is missing.
            InvalidParamsError: account or password fails the basic checks.
        """
        assert params is not None, '缺少参数'
        assert '账号' in params, '缺少账号'
        assert '密码' in params, '缺少密码'
        assert 'vc' in params, '缺少验证码'
        # _params_handler may have filled these from meta with None; treat
        # None like an empty value instead of failing in len().
        account = params['账号'] or ''
        password = params['密码'] or ''

        if len(password) < 4:
            raise InvalidParamsError('账号或密码错误')
        if account.isdigit():
            if len(account) < 5:
                raise InvalidParamsError('账号错误')
            return
        raise InvalidParamsError('账号或密码错误')

    def _params_handler(self, params: dict):
        """Fill account/password from the prepared meta when not supplied."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            if '账号' not in params:
                params['账号'] = meta.get('账号')
            if '密码' not in params:
                params['密码'] = meta.get('密码')
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop the requirements whose value is already present in meta."""
        meta = self.prepared_meta
        res = []
        for pr in param_requirements:
            # TODO: inspect ``details`` further
            if pr['key'] == '账号' and '账号' in meta:
                continue
            elif pr['key'] == '密码' and '密码' in meta:
                continue
            res.append(pr)
        return res

    def _unit_login(self, params: dict):
        """Log in with the supplied parameters.

        On success the account is stored as result key / meta and the method
        returns.  In every other case ``AskForParamsError`` is raised so the
        caller can (re-)ask for account, password and captcha.

        Raises:
            AskForParamsError: parameters are missing or were rejected.
        """
        err_msg = None
        # Tolerate ``None`` so the re-prompt below can still call ``.get``.
        params = params or {}
        if params:
            try:
                self._check_login_params(params)
                id_num = params['账号']
                password = params['密码']
                vc = params['vc']

                # The login form needs the hidden ASP.NET state fields of the
                # freshly loaded login page.
                login_page = self.s.get(LOGIN_PAGE_URL, timeout=10)
                soup = BeautifulSoup(login_page.content, 'html.parser')
                form = dict(
                    __VIEWSTATE=soup.select('#__VIEWSTATE')[0].attrs['value'],
                    __VIEWSTATEGENERATOR=soup.select(
                        '#__VIEWSTATEGENERATOR')[0].attrs['value'],
                    __EVENTVALIDATION=soup.select(
                        '#__EVENTVALIDATION')[0].attrs['value'],
                    HiddenField1=id_num,
                    txt_loginname=id_num,
                    txt_pwd=password,
                    txt_code=vc,
                    loginBtn='')
                resp = self.s.post(LOGIN_PAGE_URL, data=form, timeout=25)
                soup = BeautifulSoup(resp.content, 'html.parser')

                errors = soup.select('.error')
                if len(errors) > 1:
                    err_msg = errors[0].text.replace(' ', '').replace('\n', '')
                    if err_msg:
                        raise InvalidParamsError(err_msg)
                print("登录成功!")

                self.result_key = params.get('账号')
                # remember the credentials in meta
                self.result_meta['账号'] = params.get('账号')
                self.result_meta['密码'] = params.get('密码')
                self.result_identity['task_name'] = '重庆'
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # NOTE(review): the original fetched a captcha here and discarded the
        # result; the request is kept in case the site relies on it — confirm.
        self._new_vc()
        raise AskForParamsError([
            dict(key='账号',
                 name='账号',
                 cls='input',
                 placeholder='账号/手机',
                 value=params.get('账号', '')),
            dict(key='密码',
                 name='密码',
                 cls='input:password',
                 value=params.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image', query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login through the web driver.

        Raises:
            InvalidParamsError: the browser is still on the login page after
                submitting the form.
        """
        with self.dsc.get_driver_ctx() as driver:
            # open the login page
            driver.get(LOGIN_PAGE_URL)

            username_input = driver.find_element_by_xpath(
                '//*[@id="txt_loginname"]')
            password_input = driver.find_element_by_xpath(
                '//*[@id="txt_pwd"]')
            vc_input = driver.find_element_by_xpath('//*[@id="txt_code"]')
            submit_btn = driver.find_element_by_xpath('//*[@id="loginBtn"]')

            username_input.clear()
            username_input.send_keys(username)

            password_input.clear()
            # the password field is read-only until the attribute is removed
            driver.execute_script('$("#txt_pwd").removeAttr("readonly")')
            password_input.send_keys(password)

            vc_input.clear()
            vc_input.send_keys(vc)

            submit_btn.click()
            time.sleep(2)

            if driver.current_url != LOGIN_PAGE_URL:
                print('登录成功')
                return

            # Still on the login page: report the page's error text, falling
            # back to a generic message when no error element is present.
            error_elements = driver.find_elements_by_class_name('error')
            err_msg = (error_elements[0].text
                       if error_elements else '登录失败,请检查输入')
            raise InvalidParamsError(err_msg)

    def _unit_fetch(self):
        """Scrape account summary and transaction details into result_data.

        Raises:
            PreconditionNotSatisfiedError: a ``PermissionError`` was raised
                while fetching.
        """
        try:
            # TODO: raise PermissionError when the session is not logged in
            data = self.result_data
            self._fetch_base_info(data)
            self._fetch_detail(data)
            return
        except PermissionError as e:
            raise PreconditionNotSatisfiedError(e)

    def _fetch_base_info(self, data):
        """Fill ``data['baseInfo']`` and ``result_identity`` from INFO_URL."""
        resp = self.s.get(INFO_URL, timeout=25)
        soup = BeautifulSoup(resp.content, 'html.parser')
        table = soup.find('table')

        base_info = data['baseInfo'] = {
            '城市名称': '重庆',
            '城市编号': '500100',
            '证件类型': '身份证',
            '个人登记号': '',
            '更新时间': time.strftime("%Y-%m-%d", time.localtime())
        }
        for tr in table.findAll('tr'):
            cell = [
                td.text.replace('\n', '').replace('\r', '').replace(
                    ' ', '').replace(': ', '') for td in tr.find_all('td')
            ]
            if len(cell) > 1:
                label = cell[0]
                for old, new in self._BASE_INFO_LABEL_REPLACEMENTS:
                    label = label.replace(old, new)
                base_info.setdefault(
                    label, cell[1].replace('-', '').replace(' ', ''))

        self.result_identity['target_name'] = base_info['姓名']
        self.result_identity['target_id'] = base_info['证件号']
        if '正常' in base_info['帐户状态']:
            self.result_identity['status'] = '缴存'
        else:
            self.result_identity['status'] = '封存'

    def _fetch_detail(self, data):
        """Fill ``data['detail']``, deposit statistics and ``companyList``.

        Requires ``data['baseInfo']`` to be populated already.
        """
        resp = self.s.get(MINGXI_URL, timeout=25)
        soup = BeautifulSoup(resp.content, 'html.parser')
        table = soup.find('table')

        grouped = {}
        data['detail'] = {'data': grouped}

        last_business_date = ''
        latest_deposit_month = ''
        latest_deposit_amount = ''
        latest_deposit_found = False
        deposit_count = 0

        for index, tbody in enumerate(table.findAll('tbody')):
            cell = [
                td.text.replace(' ', '').replace('\r\n', '')
                for td in tbody.find_all('td')
            ]
            # The second column reads "<type>[<deposit month>]" for deposits.
            type_parts = cell[1].split('[')
            deposit_month = ''
            record_type = cell[1]
            if '结息' in record_type:
                record_type = '结息'
            if len(type_parts) > 1:
                deposit_month = type_parts[1].replace(']', '')
                record_type = type_parts[0]
            amount = str(float(cell[2]) + float(cell[3]))

            if index == 0:
                # presumably rows are listed newest first — verify
                last_business_date = cell[0]
                if '汇缴' in record_type:
                    latest_deposit_month = deposit_month
                    latest_deposit_amount = amount
                    latest_deposit_found = True
            if deposit_month:
                deposit_count += 1
                if not latest_deposit_found:
                    latest_deposit_month = deposit_month
                    latest_deposit_amount = amount
                    latest_deposit_found = True

            record = {
                '时间': cell[0],
                '单位名称': '',
                '支出': 0,
                '收入': amount,
                '汇缴年月': deposit_month,
                '余额': cell[4],
                '类型': record_type
            }
            year_month = cell[0][:7].replace('-', '')
            year, month = year_month[:4], year_month[-2:]
            # setdefault keeps earlier records even if a year shows up again
            # later in the listing.
            grouped.setdefault(year, {}).setdefault(month, []).append(record)

        base_info = data['baseInfo']
        base_info['最近汇缴日期'] = latest_deposit_month
        base_info['最近汇缴金额'] = latest_deposit_amount
        base_info['累计汇缴次数'] = deposit_count

        data['companyList'] = [{
            "单位名称": base_info['单位名称'],
            "当前余额": base_info['当前余额'],
            "帐户状态": base_info['帐户状态'],
            "最后业务日期": last_business_date
        }]

    def _new_vc(self):
        """Download a captcha image and return its bytes and content type."""
        resp = self.s.get(VC_URL, timeout=20)
        return dict(content=resp.content,
                    content_type=resp.headers['Content-Type'])
class Task(AbsFetchTask):
    """Fetch task for the Shenzhen (深圳) housing provident fund site.

    Only the login step is implemented; ``_unit_fetch`` is still a stub.
    """

    task_info = dict(
        city_name="深圳",
        help=('<li>如您首次在网上查询您的公积金账户,初始密码为身份证后六位,身份证号码有字母的用数字“0”代替。</li> '
              '<li>如您在公积金官网查询过您的公积金账户,请输入账户信息和密码登录即可。</li>'),
        developers=[{
            'name': '卜圆圆',
            'email': '*****@*****.**'
        }])

    def _get_common_headers(self):
        """Return the HTTP headers sent with every request of this task."""
        return {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }

    def _query(self, params: dict):
        """Answer a task status query; ``t == 'vc'`` returns a new captcha."""
        t = params.get('t')
        if t == 'vc':
            return self._new_vc()

    def _prepare(self, data=None):
        """Run the base preparation and set up the driver/requests coordinator."""
        super()._prepare(data)
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create the web driver used by ``_do_login``."""
        # Raw string: same value as before, but '\/' is no longer an invalid
        # escape sequence in a normal string literal.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/web\/ImageCheck.jpg/g')
        driver.get('https://nbp.szzfgjj.com/xxx')
        return driver

    def _setup_task_units(self):
        """Register the execution units: login first, then fetch."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch, self._unit_login)

    def _check_login_params(self, params):
        """Validate the login parameters.

        Raises:
            AssertionError: a required key is missing.
            InvalidParamsError: account number or password has a bad length.
        """
        assert params is not None, '缺少参数'
        assert '公积金账号' in params, '缺少公积金账号'
        assert '密码' in params, '缺少密码'
        assert 'vc' in params, '缺少验证码'
        # _params_handler may have filled these from meta with None; treat
        # None like an empty value instead of failing in len().
        account = params['公积金账号'] or ''
        password = params['密码'] or ''

        if len(account) == 0:
            raise InvalidParamsError('公积金账号为空,请输入公积金账号')
        elif len(account) != 11:
            raise InvalidParamsError('公积金账号不正确,请重新输入')
        if len(password) == 0:
            raise InvalidParamsError('密码为空,请输入密码!')
        elif len(password) < 6:
            raise InvalidParamsError('密码不正确,请重新输入!')

    def _params_handler(self, params: dict):
        """Fill account/password from the prepared meta when not supplied."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            if '公积金账号' not in params:
                params['公积金账号'] = meta.get('公积金账号')
            if '密码' not in params:
                params['密码'] = meta.get('密码')
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop the requirements whose value is already present in meta."""
        meta = self.prepared_meta
        res = []
        for pr in param_requirements:
            # TODO: inspect ``details`` further
            if pr['key'] == '公积金账号' and '公积金账号' in meta:
                continue
            elif pr['key'] == '密码' and '密码' in meta:
                continue
            res.append(pr)
        return res

    def _unit_login(self, params: dict):
        """Log in with the supplied parameters.

        On success the account is stored as result key / meta and the method
        returns.  In every other case ``AskForParamsError`` is raised so the
        caller can (re-)ask for account, password and captcha.

        Raises:
            AskForParamsError: parameters are missing or were rejected.
        """
        err_msg = None
        # Tolerate ``None`` so the re-prompt below can still call ``.get``.
        params = params or {}
        if params:
            try:
                self._check_login_params(params)
                id_num = params['公积金账号']
                password = params['密码']
                vc = params['vc']

                m = hashlib.md5()
                m.update(password.encode(encoding='utf-8'))
                hashpsw = m.hexdigest()
                # FIXME(review): ``hashpsw`` is computed but never sent; the
                # form posts a hard-coded 'QryPwd' value instead, so the
                # user's password is ignored.  The encoding the site expects
                # is not visible here — confirm before wiring ``hashpsw`` in.
                data = {
                    'task': 'pri',
                    'transcode': 'card',
                    'ssoLogin': '',
                    'issueName': '',
                    'UserCert': '',
                    'bjcaRanStr': '',
                    'ranStr': '',
                    'CardNo': id_num,
                    'QryPwd': '19ee8550b7b1af89',
                    'identifyCode': vc,
                    'sSignTxt': '',
                    'SUBMIT.x': '91',
                    'SUBMIT.y': '15'
                }
                self.s.post(LOGIN_URL,
                            data=data,
                            headers={
                                'Content-Type':
                                'application/x-www-form-urlencoded',
                                'Cache-Control': 'no-cache'
                            })
                # presumably the session exposes the parsed last response as
                # ``soup`` — verify against the session wrapper
                soup = self.s.soup
                errormsg = soup.select('.message')[0].text
                if errormsg and errormsg != id_num:
                    # Raised as InvalidParamsError (was a bare Exception) so
                    # the message reaches the user through the re-prompt
                    # below, like the sibling tasks do.
                    raise InvalidParamsError(errormsg)
                print('chengong')

                self.result_key = params.get('公积金账号')
                # remember the credentials in meta
                self.result_meta['公积金账号'] = params.get('公积金账号')
                self.result_meta['密码'] = params.get('密码')
                self.result_identity['task_name'] = '深圳'
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        # NOTE(review): the original fetched a captcha here and discarded the
        # result; the request is kept in case the site relies on it — confirm.
        self._new_vc()
        raise AskForParamsError([
            dict(key='公积金账号',
                 name='公积金账号',
                 cls='input',
                 placeholder='公积金账号',
                 value=params.get('公积金账号', '')),
            dict(key='密码',
                 name='密码',
                 cls='input:password',
                 value=params.get('密码', '')),
            dict(key='vc', name='验证码', cls='data:image', query={'t': 'vc'}),
        ], err_msg)

    def _do_login(self, username, password, vc):
        """Simulate the login through the web driver.

        On success the resulting page is parsed and stored in ``self.s.soup``.

        Raises:
            InvalidParamsError: the browser is still on the login page after
                submitting the form.
        """
        with self.dsc.get_driver_ctx() as driver:
            # open the login page
            driver.get(LOGIN_PAGE_URL)

            username_input = driver.find_element_by_xpath(
                '//*[@id="pri"]/p[1]/label[2]/input')
            password_input = driver.find_element_by_xpath(
                '//*[@id="pri"]/p[2]/label[2]')
            vc_input = driver.find_element_by_xpath(
                '//*[@name="identifyCode"]')
            submit_btn = driver.find_element_by_xpath(
                '//*[@id="pri"]/p[6]/input[1]')

            username_input.clear()
            username_input.send_keys(username)

            password_input.clear()
            password_input.send_keys(password)

            vc_input.clear()
            vc_input.send_keys(vc)
            # (a debugging call that opened a screenshot in an image viewer
            # was removed here)

            submit_btn.click()
            time.sleep(2)

            if driver.current_url != LOGIN_PAGE_URL:
                print('登录成功')
                # keep the post-login page for the fetch unit
                login_page_html = driver.find_element_by_tag_name(
                    'html').get_attribute('innerHTML')
                self.s.soup = BeautifulSoup(login_page_html, 'html.parser')
                return

            # FIXME: alert handling is best effort — fall back to a generic
            # message when no alert text can be read.
            err_msg = '登录失败,请检查输入'
            try:
                err_msg = driver.switch_to.alert.text
            except Exception:
                pass
            raise InvalidParamsError(err_msg)

    def _unit_fetch(self):
        """Placeholder fetch unit; nothing is fetched yet.

        Raises:
            PreconditionNotSatisfiedError: a ``PermissionError`` was raised.
        """
        try:
            # TODO: implement; raise PermissionError when not logged in
            return
        except PermissionError as e:
            raise PreconditionNotSatisfiedError(e)

    def _new_vc(self):
        """Download a captcha image and return its bytes and content type."""
        resp = self.s.get(VC_URL)
        return dict(content=resp.content,
                    content_type=resp.headers['Content-Type'])
class Task(AbsFetchTask):
    """Fetch task for the Ningbo (宁波) social insurance site.

    Login posts the captcha and the credentials over HTTP and keeps the
    returned access token in ``self.g.access_token``.  ``_unit_fetch_name``
    then queries the ``commapi`` REST endpoint for personal data, pension and
    medical payment records and the status of the five insurances.

    The ``_do_login`` / ``_yanglao`` / ``_yiliao`` / ``_gongshang`` /
    ``_shiye`` / ``_shengyu`` methods are the older web-driver based variants;
    nothing in this class calls them.
    """

    task_info = dict(
        city_name="宁波",
        help=('<li>如您未在社保网站查询过您的社保信息,请到宁波社保网上服务平台完成“注册”然后再登录。</li> '
              '<li>如有问题请拨打12333。</li> '),
        developers=[{
            'name': '卜圆圆',
            'email': '*****@*****.**'
        }])

    _COMMAPI_URL = 'https://app.nbhrss.gov.cn/nbykt/rest/commapi'
    _QUERY_PAGE_URL = 'https://rzxt.nbhrss.gov.cn/nbsbk-rzxt/web/pages/query/'
    _TIMEOUT_URL = 'https://rzxt.nbhrss.gov.cn/nbsbk-rzxt/rzxt/getTimeOut.action'
    # JSON fragments passed verbatim as the ``param`` query argument
    _AREA_PARAM = '{"AAB301":"330200"}'
    _PAGED_PARAM = '{"AAB301":"330200","PAGENO":1,"PAGESIZE":10000}'

    _GENDER_BY_CODE = {'1': '男', '2': '女'}
    _CARD_STATUS_BY_CODE = {
        '1': '正常有卡状态',
        '2': '正式挂失状态',
        '4': '临时挂失状态'
    }

    def _get_common_headers(self):
        """Return the HTTP headers sent with every request of this task."""
        return {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36'
        }

    def _setup_task_units(self):
        """Register the execution units: login first, then fetch."""
        self._add_unit(self._unit_login)
        self._add_unit(self._unit_fetch_name, self._unit_login)

    def _query(self, params: dict):
        """Answer a task status query; ``t == 'vc'`` returns a new captcha."""
        t = params.get('t')
        if t == 'vc':
            return self._new_vc()

    def _prepare(self, data=None):
        """Run the base preparation and set up the driver/requests coordinator."""
        super()._prepare(data)
        self.dsc = DriverRequestsCoordinator(
            s=self.s, create_driver=self._create_driver)

    def _create_driver(self):
        """Create and configure the web driver used by the driver-based methods."""
        # Raw string: same value as before, but '\/' is no longer an invalid
        # escape sequence in a normal string literal.
        driver = new_driver(user_agent=USER_AGENT,
                            js_re_ignore=r'/web\/ImageCheck.jpg/g')
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.DIRECT
        proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
        driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
        # driver.get(url) has been seen to hang forever without raising;
        # explicit timeouts keep the task from getting stuck.
        driver.set_page_load_timeout(13)
        driver.set_script_timeout(13)
        driver.get(MAIN_URL)
        return driver

    def _check_login_params(self, params):
        """Validate the login parameters.

        Raises:
            AssertionError: a required key is missing.
            InvalidParamsError: id number or password has a bad length.
        """
        assert params is not None, '缺少参数'
        assert '身份证号' in params, '缺少身份证号'
        assert '密码' in params, '缺少密码'
        # _unit_login reads params['vc']; without this check a missing
        # captcha surfaced as an uncaught KeyError.
        assert 'vc' in params, '缺少验证码'
        # _params_handler may have filled these from meta with None; treat
        # None like an empty value instead of failing in len().
        id_num = params['身份证号'] or ''
        password = params['密码'] or ''

        if len(id_num) == 0:
            raise InvalidParamsError('身份证号为空,请输入身份证号')
        elif len(id_num) < 15:
            raise InvalidParamsError('身份证号不正确,请重新输入')
        if len(password) == 0:
            raise InvalidParamsError('密码为空,请输入密码!')
        elif len(password) < 6:
            raise InvalidParamsError('密码不正确,请重新输入!')

    def _params_handler(self, params: dict):
        """Fill id number/password from the prepared meta when not supplied."""
        if not (self.is_start and not params):
            meta = self.prepared_meta
            if '身份证号' not in params:
                params['身份证号'] = meta.get('身份证号')
            if '密码' not in params:
                params['密码'] = meta.get('密码')
        return params

    def _param_requirements_handler(self, param_requirements, details):
        """Drop requirements already covered by meta, and the 'other' entry."""
        meta = self.prepared_meta
        res = []
        for pr in param_requirements:
            # TODO: inspect ``details`` further
            if pr['key'] == '身份证号' and '身份证号' in meta:
                continue
            elif pr['key'] == '密码' and '密码' in meta:
                continue
            elif pr['key'] == 'other':
                continue
            res.append(pr)
        return res

    def _unit_login(self, params: dict):
        """Check the captcha, log in and store the access token.

        On success ``baseInfo`` is initialised from the login response and the
        method returns.  In every other case ``AskForParamsError`` is raised so
        the caller can (re-)ask for id number, password and captcha.

        Raises:
            AskForParamsError: parameters are missing or were rejected.
        """
        err_msg = None
        # Tolerate ``None`` so the re-prompt below can still call ``.get``.
        params = params or {}
        if params:
            try:
                self._check_login_params(params)
                id_num = params['身份证号']
                pwd = params['密码']
                yzm = params['vc']

                captcha_check = self._resp_json(
                    self.s.post(CVC_URL,
                                data=dict(client='NBHRSS_WEB', yzm=yzm)))
                if captcha_check['result'] != '0':
                    raise InvalidParamsError('验证码错误!')

                login = self._resp_json(
                    self.s.post(LOGIN_URL,
                                data=dict(id=id_num,
                                          password=pwd,
                                          client='NBHRSS_WEB',
                                          phone='')))
                if login['ret'] != '1':
                    if login['msg'] == 'E1001':
                        raise InvalidParamsError('请去官网进行账号升级!')
                    raise InvalidParamsError(login['msg'])

                print("登录成功!")
                account = json.loads(login['result'])
                self.g.access_token = account['access_token']
                self.result_data["baseInfo"] = {
                    '城市名称': '宁波',
                    '城市编号': '330200',
                    '更新时间': time.strftime("%Y-%m-%d", time.localtime()),
                    '姓名': account['xm'],
                    '身份证号': account['sfz'],
                    '社会保障卡号码': account['sbkh']
                }
                self.result_identity['target_name'] = account['xm']

                self.result_key = id_num
                self.result_meta['身份证号'] = id_num
                self.result_meta['密码'] = pwd
                self.result_identity['task_name'] = '宁波'
                self.result_identity['target_id'] = id_num
                return
            except (AssertionError, InvalidParamsError) as e:
                err_msg = str(e)

        raise AskForParamsError([
            dict(key='身份证号',
                 name='身份证号',
                 cls='input',
                 value=params.get('身份证号', '')),
            dict(key='密码',
                 name='密码',
                 cls='input:password',
                 value=params.get('密码', '')),
            dict(key='vc',
                 name='验证码',
                 cls='data:image',
                 query={'t': 'vc'},
                 value=params.get('vc', '')),
        ], err_msg)

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _resp_json(resp):
        """Decode a response body as JSON after passing it through BeautifulSoup."""
        return json.loads(BeautifulSoup(resp.content, 'html.parser').text)

    def _commapi_json(self, api, param=None, refresh=True):
        """GET the commapi endpoint for *api* and return the decoded JSON.

        The query string is concatenated by hand, in this exact order, so the
        requested URL stays what the site was called with before.
        """
        url = (self._COMMAPI_URL + '?access_token=' + self.g.access_token +
               '&api=' + api + '&bustype=01')
        if refresh:
            url += '&refresh=true'
        if param is not None:
            # The source text carried a pilcrow ("¶m=") here, which is what
            # "&param=" turns into when "&para" is decoded as an HTML entity.
            url += '&param=' + param
        url += '&client=NBHRSS_WEB'
        return self._resp_json(self.s.get(url))

    def _commapi_result(self, api, param=None, refresh=True):
        """Return the decoded ``result`` payload, or None when ``ret != '1'``."""
        payload = self._commapi_json(api, param=param, refresh=refresh)
        if payload['ret'] != '1':
            return None
        return json.loads(payload['result'])

    @staticmethod
    def _payment_entry(year_month, base, personal, arrival):
        """Build one payment record in the common output shape."""
        return {
            '缴费时间': year_month,
            '缴费类型': '',
            '缴费基数': base,
            '公司缴费': '',
            '个人缴费': personal,
            '缴费单位': '',
            '到账情况': arrival
        }

    @staticmethod
    def _group_by_month(entries):
        """Group payment records as ``{year: {month: [record]}}``.

        Mirrors the original bookkeeping: a year is (re)started whenever it
        differs from the previous record's year, and each month keeps only the
        last record seen for it.
        """
        grouped = {}
        current_year = ''
        for entry in entries:
            year_month = entry['缴费时间']
            year, month = year_month[:4], year_month[-2:]
            if not current_year or current_year != year:
                current_year = year
                grouped[year] = {}
            grouped[year][month] = [entry]
        return grouped

    @staticmethod
    def _add_current_year(grouped, months, total):
        """Add this calendar year's months and personal payments to the totals."""
        this_year = time.strftime("%Y", time.localtime())
        for records in grouped.get(this_year, {}).values():
            months = months + 1
            total = total + float(records[0]['个人缴费'])
        return months, total

    def _store_pension_summary(self, grouped, months, total, latest):
        """Write the pension summary fields into ``baseInfo``."""
        base_info = self.result_data["baseInfo"]
        months, total = self._add_current_year(grouped, months, total)
        base_info.setdefault('缴费时长', months)
        base_info.setdefault('个人养老累计缴费', total)
        base_info.setdefault('最近缴费时间', latest)
        if len(grouped) > 0:
            first_year = min(grouped)
            base_info.setdefault('开始缴费时间',
                                 first_year + min(grouped[first_year]))
        else:
            base_info.setdefault('开始缴费时间', '')

    def _store_medical_summary(self, grouped, total):
        """Write the medical total into ``baseInfo``.

        Sums the medical records; the original summed the pension
        (``old_age``) records here, which looks like a copy-paste slip.
        """
        _, total = self._add_current_year(grouped, 0, total)
        self.result_data["baseInfo"].setdefault('个人医疗累计缴费', total)

    # ------------------------------------------------------------------
    # web-driver based variants (not called by this class)
    # ------------------------------------------------------------------

    def _do_login(self, username, password, vc):
        """Simulate the login through the web driver.

        On success the resulting page is parsed and stored in ``self.s.soup``.

        Raises:
            InvalidParamsError: the browser did not end up on ``INFO_URL``.
        """
        with self.dsc.get_driver_ctx() as driver:
            # open the login page
            driver.get(MAIN_URL)

            username_input = driver.find_element_by_xpath('//*[@id="loginid"]')
            password_input = driver.find_element_by_xpath('//*[@id="pwd"]')
            vc_input = driver.find_element_by_xpath('//*[@id="yzm"]')
            submit_btn = driver.find_element_by_xpath('//*[@id="btnLogin"]')

            username_input.clear()
            username_input.send_keys(username)

            password_input.clear()
            password_input.send_keys(password)

            vc_input.clear()
            vc_input.send_keys(vc)

            submit_btn.click()
            time.sleep(8)

            if driver.current_url == INFO_URL:
                print('登录成功')
                # keep the post-login page for the fetch unit
                login_page_html = driver.find_element_by_tag_name(
                    'html').get_attribute('innerHTML')
                self.s.soup = BeautifulSoup(login_page_html, 'html.parser')
                return

            # FIXME: alert handling is best effort — fall back to a generic
            # message when no alert text can be read.
            err_msg = '登录失败,请检查输入'
            try:
                err_msg = driver.switch_to.alert.text
            except Exception:
                pass
            raise InvalidParamsError(err_msg)

    def _load_page_rows(self, url):
        """Open *url* in the web driver and split the ``#content`` table rows.

        Returns:
            ``(header, entries)``: the texts of all cells from rows with fewer
            than three cells, and one payment record per four-cell row.
        """
        with self.dsc.get_driver_ctx() as driver:
            driver.get(url)
            time.sleep(3)
            html_text = driver.find_element_by_tag_name('html').get_attribute(
                'innerHTML')
            soup = BeautifulSoup(html_text, 'html.parser')
            rows = soup.select('#content')[0].find_all('tr')

            header = []
            entries = []
            for row in rows:
                cell = [td.text for td in row.find_all('td')]
                if len(cell) < 3:
                    header.extend(cell)
                elif len(cell) == 4:
                    entries.append(
                        self._payment_entry(cell[0], cell[1], cell[2],
                                            cell[3]))
            return header, entries

    @staticmethod
    def _page_status(header, index):
        """Map the status text at ``header[index]`` to '正常参保' / '停缴'."""
        if len(header) > index:
            raw = header[index].replace('参保状态:', '')
        else:
            raw = '未知'
        return '正常参保' if raw == '参保缴费' else '停缴'

    def _yanglao(self):
        """Scrape the pension page (driver based)."""
        header, entries = self._load_page_rows(YL_URL)
        grouped = self._group_by_month(entries)
        self.result_data['old_age'] = {'data': grouped}
        latest = entries[0]['缴费时间'] if entries else ''

        if len(header) > 2:
            self.result_data["baseInfo"].setdefault(
                '单位名称', header[2].replace('单位名称:', ''))
        # header[9] and header[10] are read, so at least 11 cells are needed
        # (the guard used to be ``> 8``).
        if len(header) > 10:
            months = int(header[10].replace('至本年末实际缴费月数:', ''))
            total = float(header[9].replace('至本年末账户累计储存额:', ''))
        else:
            months = 0
            total = 0.00
        self._store_pension_summary(grouped, months, total, latest)

        status = self._page_status(header, 3)
        self.result_identity['status'] = status
        self.g.Fivestatus = {'养老': status}

    def _yiliao(self):
        """Scrape the medical insurance page (driver based).

        Expects ``self.g.Fivestatus`` to exist already (set by ``_yanglao``).
        """
        header, entries = self._load_page_rows(YIL_URL)
        grouped = self._group_by_month(entries)
        self.result_data['medical_care'] = {'data': grouped}

        # header[11] is read, so at least 12 cells are needed (the guard used
        # to be ``> 10``).
        if len(header) > 11:
            total = float(header[11].replace('个人账户余额:', ''))
        else:
            total = 0.00
        self._store_medical_summary(grouped, total)

        self.g.Fivestatus.setdefault('医疗', self._page_status(header, 4))

    def _record_page_status(self, url, name):
        """Scrape the insurance status from *url* into ``self.g.Fivestatus``."""
        header, _ = self._load_page_rows(url)
        self.g.Fivestatus.setdefault(name, self._page_status(header, 3))

    def _gongshang(self):
        """Scrape the work-injury insurance status (driver based)."""
        self._record_page_status(GS_URL, '工伤')

    def _shiye(self):
        """Scrape the unemployment insurance status (driver based)."""
        self._record_page_status(SY_URL, '失业')

    def _shengyu(self):
        """Scrape the maternity insurance status (driver based)."""
        self._record_page_status(SHY_URL, '生育')

    # ------------------------------------------------------------------
    # fetch unit (HTTP based)
    # ------------------------------------------------------------------

    def _unit_fetch_name(self):
        """Fetch personal data, payment records and insurance statuses.

        The requests are issued in the same order as before; several of them
        only have their response discarded (presumably they prime the server
        side session — verify).

        Raises:
            PreconditionNotSatisfiedError: a ``PermissionError`` was raised
                while fetching.
        """
        try:
            self._fetch_person_info()
            five_status = {}
            self._fetch_pension(five_status)
            self._fetch_medical(five_status)
            # work injury, maternity, unemployment
            for api, name in (('91S018', '工伤'), ('91S019', '生育'),
                              ('91S020', '失业')):
                result = self._commapi_result(api, param=self._AREA_PARAM)
                if result is not None:
                    five_status.setdefault(name, result['AAC008'])

            self.result_data["baseInfo"].setdefault('五险状态', five_status)
            if '参保缴费' in five_status.values():
                self.result_identity['status'] = '正常'
            else:
                self.result_identity['status'] = '停缴'
            return
        except PermissionError as e:
            raise PreconditionNotSatisfiedError(e)

    def _fetch_person_info(self):
        """Fill the personal fields of ``baseInfo`` from api 10S005."""
        self.s.get(self._QUERY_PAGE_URL + 'query-grxx.jsp')
        self._commapi_json('10S006')
        info = json.loads(self._commapi_json('10S005')['result'])

        base_info = self.result_data['baseInfo']
        # Missing or unknown codes fall back to a default instead of leaving
        # the value unbound.
        base_info['性别'] = self._GENDER_BY_CODE.get(info.get('AAC004'),
                                                   '未说明性别')
        if 'AZA103' in info:
            base_info['国籍'] = info['AZA103']
        base_info['社保卡状态'] = self._CARD_STATUS_BY_CODE.get(
            info.get('AAZ502'), '')
        for code, label in (('AAE010', '银行账号'), ('AAZ503', '发卡日期'),
                            ('AAE004', '手机号'), ('AAE005', '固定号码'),
                            ('AAE006', '常住地址'), ('AAZ220', '邮编')):
            if code in info:
                base_info[label] = info[code]
        # the original repeated this request and ignored the response
        self._commapi_json('10S005')

    def _open_query_page(self, page):
        """Issue the three warm-up requests made before each insurance query."""
        self.s.get(self._QUERY_PAGE_URL + page)
        self._commapi_json('91S099')
        self.s.post(self._TIMEOUT_URL)

    def _fetch_cost_entries(self, api):
        """Return the payment records of *api* (empty when ``ret != '1'``)."""
        result = self._commapi_result(api,
                                      param=self._PAGED_PARAM,
                                      refresh=False)
        if result is None:
            return []
        return [
            self._payment_entry(cost['AAE002'], cost['AAE180'],
                                cost['AAE022'], cost['AAE078'])
            for cost in result['COSTLIST']['COST']
        ]

    def _fetch_pension(self, five_status):
        """Fetch pension status, records and totals."""
        self._open_query_page('query-ylbx.jsp')

        status = self._commapi_result('91S001', param=self._AREA_PARAM)
        if status is not None:
            five_status['养老'] = status['AAC008']
            self.result_data["baseInfo"].setdefault('单位名称',
                                                    status['AAB004'])

        entries = self._fetch_cost_entries('91S002')
        grouped = self._group_by_month(entries)
        self.result_data['old_age'] = {'data': grouped}
        latest = entries[0]['缴费时间'] if entries else ''

        months = 0
        total = 0.00
        totals = self._commapi_result('91S003', param=self._AREA_PARAM)
        # NOTE(review): ``> 1`` is kept from the original although only the
        # first element is read — confirm whether ``> 0`` was intended.
        if totals is not None and len(totals['COSTLIST']['COST']) > 1:
            first = totals['COSTLIST']['COST'][0]
            months = int(first['AAE091'])
            total = float(first['AAE382'])
        self._store_pension_summary(grouped, months, total, latest)

    def _fetch_medical(self, five_status):
        """Fetch medical insurance status, records and account balance."""
        self._open_query_page('query-yilbx.jsp')

        status = self._commapi_result('91S011', param=self._AREA_PARAM)
        if status is not None:
            five_status.setdefault('医疗', status['AAC008'])

        grouped = self._group_by_month(self._fetch_cost_entries('91S012'))
        self.result_data['medical_care'] = {'data': grouped}

        total = 0.00
        account = self._commapi_result('91S013', param=self._AREA_PARAM)
        if account is not None and len(account) > 1:
            total = float(account['AKC087'])
        self._store_medical_summary(grouped, total)

    def _new_vc(self):
        """Download a captcha image and return its bytes and content type."""
        vc_url = VC_URL + time.strftime('%a %b %d %Y %H:%M:%S',
                                        time.localtime())
        resp = self.s.get(vc_url)
        return dict(cls='data:image',
                    content=resp.content,
                    content_type=resp.headers.get('Content-Type'))