def chinaUnicomAPI(phone_attr):
    """Log in to China Unicom and return the crawled user and call records.

    :param phone_attr: dict(phone=XX, province=XX, city=XX, company=XX,
        password=XX). Per the original note the password is an all-digit
        string of at least 6 characters.
    :return: the ``checkAttr`` result when validation fails; a
        ``returnResult`` carrying the login error code when login fails;
        otherwise ``returnResult(2000, ...)`` whose data holds the
        ``t_operator_user``, ``t_operator_call`` and ``t_operator_note``
        tables (the note table is always an empty list here).
    """
    validation = checkAttr(phone_attr)
    if validation != True:
        # Parameter error: hand the checker's result straight back.
        return validation

    spider = ChinaUnicom(phone_attr)
    login_result = spider.loginSys()
    if login_result['code'] != 2000:
        # Login failed: propagate the login error code.
        return returnResult(
            login_result['code'], data={}, desc='spider.loginSys()')

    spider.getUserInfo()
    spider.getCallInfo()
    crawled = dict(
        t_operator_user=spider.user_items,
        t_operator_call=spider.call_items,
        t_operator_note=[],
    )
    # Crawl succeeded: return everything that was collected.
    return returnResult(2000, data=crawled, desc='爬取内容成功')
def creditReportAPI(name, password, auth_pwd, debug=True):
    """Fetch a user's credit report and, when ``debug`` is set, parse it.

    The three credentials are stripped of surrounding whitespace before use.

    :param name: user name
    :param password: login password
    :param auth_pwd: identity verification code
    :param debug: when True the saved HTML file is parsed via ``debugTest``
    :return: a ``returnResult`` value. According to the original docstring
        the parsed data is dict(person=list of dicts, card=list of dicts,
        query=list of dicts).
    """
    name = name.strip()
    password = password.strip()
    auth_pwd = auth_pwd.strip()
    person = CreditReport(name, password, auth_pwd)
    # If the flow succeeds, the user's credit report ends up saved as HTML.
    # visitSys() hands back a dict, which is kept in ``result``.
    result = person.visitSys()
    if result['result'] == 2000:
        html_name = result['file_name']
        # Set debug=True to have the saved page parsed.
        if debug == True:
            result = debugTest(name, password, auth_pwd, filename=html_name)
        # NOTE(review): the original indentation was lost. This return is
        # assumed to sit at the level of the outer ``if`` so that a
        # non-debug call still returns a result -- confirm against history.
        return returnResult(code=2000, data=result)
    else:
        return returnResult(code=result['result'], data={})
def getUniqueTag(): # form = {'month': '201602'} # form['month'] = month form = dict(month=month) url = 'http://gd.10086.cn/commodity/servicio/nostandardserv/realtimeListSearch/query.jsps' self.__headers[ 'Referer'] = 'http://gd.10086.cn/my/REALTIME_LIST_SEARCH.shtml?dt=1469030400000' options = { 'method': 'post', 'url': url, 'form': form, 'cookies': self.cookies, 'headers': self.__headers } response = Request.basic(options) if response: try: unique_tag = json.loads( response.text)['attachment'][0]['value'] return getMonthRecords(unique_tag) except (KeyError, IndexError, Exception) as ex: print 'unique_tag not found, error:', ex # return False return returnResult(4100, [], desc=u'getUniqueTag 解析错误') else: # return False return returnResult(4000, [], desc=u'getUniqueTag 网络错误')
def trigger_status():
    """Trigger the dynamic verification code and report the outcome.

    :return: a ``returnResult`` value: 2000 when the server answers
        ``SUCCESS_COMPLETE``, 4800 with the server's failure message
        otherwise, 4100 when the body is not valid JSON, and 4000 when the
        response is falsy or its status code is not 200.
    """
    response = verification_code_trigger()
    if not response:
        return returnResult(4000, [], desc=u'动态验证码请求网络错误')
    if response.status_code != 200:
        return returnResult(
            4000, [],
            desc=u'动态验证码请求: {}'.format(response.status_code))

    try:
        info = response.json()
    except Exception:
        return returnResult(4100, [], desc=u'动态验证码解析错误')

    if info['type'] == 'SUCCESS_COMPLETE':
        return returnResult(2000, [], desc=info['content'])

    # Failure: the message may live under several shapes of ``content``.
    content = info['content']
    if 'failMessage' in content:
        message = content['failMessage']
    elif 'message' in content[0]:
        message = content[0]['message']
    else:
        message = content[0]
    return returnResult(4800, [], desc=message)
def clawPageCall(date_tuple, page_no=1, resend=2):
    """Perform one call-detail page request, retrying while the server is busy.

    ``self`` is not a parameter; it is resolved from the surrounding scope.

    :param date_tuple: sequence whose first two items are the begin and end
        dates sent as ``beginDate`` / ``endDate``
    :param page_no: page number to request
    :param resend: how many more times to retry when the server reports the
        busy code ``4114030193``
    :return: the raw response text on success (and for any server-side error
        that is not retried, so the caller can inspect it); a 4100
        ``returnResult`` when the body is not JSON or the error payload has
        no ``respCode``; a 4000 ``returnResult`` when the response is falsy.
    """
    params = {'_': getTimestamp(), 'menuid': '000100030001'}
    form = {
        'pageNo': page_no,
        'pageSize': '20',
        'beginDate': date_tuple[0],
        'endDate': date_tuple[1],
    }
    url = 'http://iservice.10010.com/e3/static/query/callDetail'
    self.headers['Referer'] = ('http://iservice.10010.com/'
                               'e3/query/call_dan.html?menuId=000100030001')
    options = {
        'method': 'post',
        'url': url,
        'form': form,
        'params': params,
        'cookies': self.cookies,
        'headers': self.headers
    }
    response = Request.basic(options)
    if not response:
        return returnResult(4000, [], desc='clawPageCall')

    try:
        page_json = json.loads(response.text)
    except ValueError:
        return returnResult(4100, [], desc='clawPageCall1')

    if 'errorMessage' in page_json and resend > 0:
        try:
            server_busy = (
                page_json['errorMessage']['respCode'] == '4114030193')
        except KeyError:
            return returnResult(4100, [], desc='clawPageCall2')
        if server_busy:
            # System busy: resend the same request with one fewer retry.
            return clawPageCall(date_tuple, page_no, resend - 1)

    # Fix: an error payload with a non-busy respCode used to fall off the
    # end and return None. It now returns the text like every other
    # non-retried response.
    return response.text
def start(**kwargs):
    """Tag every phone entry with the message number, log in and run the check.

    :param kwargs: expects ``data`` (passed to ``getPhonelist``) and
        ``msg_no`` (stored on every phone entry under ``"msg_no"``)
    :return: when ``login_for_crawler`` yields a plain dict, a
        ``returnResult`` carrying that dict's ``code`` and no data;
        otherwise ``returnResult(2000, ...)`` with the ``checkAPI`` output.
    """
    msg_no = kwargs.get("msg_no")
    phone_list = getPhonelist(kwargs.get('data'))
    for entry in phone_list:
        entry["msg_no"] = msg_no

    # Call the login function; a dict coming back is treated as an error.
    browser = login_for_crawler()
    if type(browser) is dict:
        return returnResult(code=browser["code"], data=None)

    checker = check(browser)
    return returnResult(2000, checker.checkAPI(phone_list))
def judgeLogin(response):
    """Analyse the login response and map its result code to a status.

    ``self`` is not a parameter; it is resolved from the surrounding scope
    and only touched when a known result code is found.

    :param response: response object exposing ``text`` and ``cookies``
    :return: ``dict(code=4000, func='judgeLogin')`` when no result code can
        be extracted; ``dict(code=<mapped>, desc='judgeLogin', data=[])``
        for a known code (the response cookies are merged into
        ``self.cookies`` first); a 4000 ``returnResult`` naming the code
        when it is unknown.
    """
    try:
        code = re.search(r'resultCode:"(.*?)"', response.text).group(1)
    except (AttributeError, IndexError):
        # No match (or no ``text`` attribute): cannot judge the login.
        return dict(code=4000, func='judgeLogin')

    code_hash = {
        '0000': 2000,  # flow succeeded
        '7007': 4600,  # wrong password
        '7999': 5500,  # remote server busy
        '7072': 4500,  # wrong account
        '7009': 4500,  # wrong account
    }
    if code in code_hash:
        self.cookies.update(dict_from_cookiejar(response.cookies))
        return dict(code=code_hash[code], desc='judgeLogin', data=[])

    # Fix: ``unicode('<non-ASCII bytes>')`` decodes with the ASCII codec
    # under Python 2 and raises UnicodeDecodeError. A unicode literal
    # yields the same text without the implicit decode.
    return returnResult(4000, [], desc=u'登陆错误代码: {}'.format(code))
def clawInfo(text):
    """Extract the customer profile from the page text and store it.

    ``self`` is not a parameter; it is resolved from the surrounding scope.
    The parsed record is padded with empty strings for every column in
    ``config.COLUMN_USER`` and appended to ``self.user_items``.

    :param text: decoded page text to search
    :return: a ``returnResult`` value: 2000 on success, 4100 when a pattern
        does not match (AttributeError), 4000 for any other exception.
    """

    def first_group(pattern):
        # A failed search returns None, so ``.group`` raises AttributeError.
        return re.search(pattern, text).group(1)

    try:
        print(u'用户状态', text)
        item = dict(
            user_valid=1,
            company=self.phone_attr['company'],
            province=self.phone_attr['province'],
            city=self.phone_attr['city'],
            level=first_group(u"link'\)\.html\('(.*?)'\)"),
            phone=first_group(u'手机号码</td>\\s+<td>(.*?)</td>'),
            name=first_group(u'用户名</td>\\s+<td>(.*?)</td>'),
            cert_num=first_group(u'身份证</td>\\s+<td>(.*?)</td>'),
            open_date=first_group(u'入网时间</td>\\s+<td>(.*?)</td>'),
            product_name=first_group(u'所属品牌</td>\\s+<td>(.*?)</td>'),
            cert_type=u'身份证',
        )
        # Fill in every remaining column with an empty string.
        for column in config.COLUMN_USER:
            item.setdefault(column, '')
        self.user_items.append(item)  # keep the record
        return returnResult(2000, [], desc=u'获取客户信息成功')
    except AttributeError:
        # TODO: confirm that the session is actually logged in
        return returnResult(4100, [], desc=u'获取客户信息解析错误')
    except Exception:
        return returnResult(4000, [], desc=u'获取客户信息网络错误')
def fetch_session(self):
    """Request a session and merge its cookies into ``self.cookies``.

    :return: a ``returnResult`` value: 2000 on success, 4100 when merging
        the cookies raises, 4000 when the response is falsy or its status
        code is not 200.
    """
    from requests import cookies

    session_response = self.session_request()
    if not session_response:
        return returnResult(4000, [], desc=u'获取 session 网络错误')

    if session_response.status_code != 200:
        # Fix: this result used to be built but never returned, so callers
        # received None instead of an error result.
        return returnResult(
            4000, [],
            desc=u'获取 session 网络错误: {}'.format(
                session_response.status_code))

    try:
        self.cookies = cookies.merge_cookies(self.cookies,
                                             session_response.cookies)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return returnResult(4100, [], desc=u'获取 session 解析错误')
    return returnResult(2000, [], desc=u'获取 session 成功')
def get_encryption_key(self):
    """Fetch the encryption key material and remember the response cookies.

    :return: on success a tuple ``(e, n)`` holding the two strings captured
        from the ``"e"`` and ``"n"`` fields of the response body. Note the
        asymmetry: every failure returns a ``returnResult`` value instead
        (4100 when the fields are not found, 4000 when the response is
        falsy or its status code is not 200).
    """
    key_response = self.encryption_key_request()
    if not key_response:
        return returnResult(4000, [], desc=u'获取加密 key 网络错误')

    if key_response.status_code != 200:
        # Fix: this result used to be built but never returned, so callers
        # received None instead of an error result.
        return returnResult(
            4000, [],
            desc=u'获取加密 key 网络错误: {}'.format(
                key_response.status_code))

    key_match = re.search(u'"e"\:"(.*?)".*?"n"\:"(.*?)"',
                          key_response.content.decode('utf-8'))
    if key_match is None:
        # Body did not contain the expected fields.
        return returnResult(4100, [], desc=u'加密 key 解析错误')

    keys_str = key_match.group(1)
    mods_str = key_match.group(2)
    self.cookies = key_response.cookies
    return keys_str, mods_str
def getMonthRecords(unique_tag):
    """Fetch one month's records for the given unique tag.

    ``self`` is not a parameter; it is resolved from the surrounding scope.

    :param unique_tag: value sent to the server as ``uniqueTag``
    :return: ``returnResult(2000, <response text>, ...)`` when a truthy
        response comes back, otherwise a 4000 ``returnResult``.
    """
    endpoint = 'http://gd.10086.cn/commodity/servicio/nostandardserv/realtimeListSearch/ajaxRealQuery.jsps'
    payload = {'uniqueTag': unique_tag, 'monthListType': '0'}
    # This request is sent with an explicit timeout.
    response = Request.basic({
        'method': 'post',
        'url': endpoint,
        'form': payload,
        'cookies': self.cookies,
        'timeout': 20,
        'headers': self.__headers,
    })
    if not response:
        return returnResult(4000, [], desc=u'getMonthRecords 网络错误')
    return returnResult(2000, response.text, desc=u'getMonthRecords 成功')
def get_login_url(self, login_response):
    """Interpret the login response and extract the redirect URL.

    :param login_response: response object of the login request
    :return: a ``returnResult`` value. 2000 with the URL as both data and
        description when the server answers ``ucs.server.location.url``
        (``self.cookies`` is replaced by the response cookies). Otherwise
        the server's failure message with code 4402 (wrong dynamic
        password), 4401 (wrong password) or 4800 (anything else). 4100
        when the body is not valid JSON. 4000 when the response is falsy
        or its status code is not 200.
    """
    if not login_response:
        return returnResult(4000, [], desc=u'获取登陆网址网络错误')

    if login_response.status_code != 200:
        # Fix: this result used to be built but never returned, so callers
        # received None instead of an error result.
        return returnResult(
            4000, [],
            desc=u'获取登陆网址网络错误: {}'.format(
                login_response.status_code))

    try:
        info = login_response.json()
    except Exception:
        return returnResult(4100, [], desc=u'登陆网址解析错误')

    if info['type'] == 'ucs.server.location.url':
        msg = info['content']
        self.cookies = login_response.cookies
        return returnResult(2000, msg, desc=msg)

    # Failure: the message may live under several shapes of ``content``.
    content = info['content']
    if 'failMessage' in content:
        msg = content['failMessage']
    elif 'message' in content[0]:
        msg = content[0]['message']
    else:
        msg = content

    code = 4800
    if msg == u'动态密码错误!':
        code = 4402
    elif u'密码错误,请重新输入' in msg:
        code = 4401
    return returnResult(code, [], desc=msg)
def queryInfo():
    """Post the basic-info tracking request, then fetch the profile itself.

    ``self`` is not a parameter; it is resolved from the surrounding scope.

    :return: the result of ``getInfo()`` when a truthy response comes back
        (its text is printed first), otherwise a 4000 ``returnResult``.
    """
    endpoint = 'http://gd.10086.cn/commodity/servicio/track/servicioDcstrack/query.jsps'
    payload = {'servCode': 'MY_BASICINFO'}
    self.__headers['Referer'] = (
        'http://gd.10086.cn/my/myService/myBasicInfo.shtml')
    response = Request.basic({
        'method': 'post',
        'url': endpoint,
        'form': payload,
        'cookies': self.cookies,
        'headers': self.__headers,
    })
    if not response:
        return returnResult(4000, [], desc=u'queryInfo 网络错误')
    # TODO: auto found
    print(response.text)
    return getInfo()
def getInfo():
    """Query the basic-info data and hand the decoded page to ``clawInfo``.

    ``self`` is not a parameter; it is resolved from the surrounding scope.

    :return: the result of ``clawInfo`` on the UTF-8 decoded response body
        when a truthy response comes back, otherwise a 4000
        ``returnResult``.
    """
    endpoint = 'http://gd.10086.cn/commodity/servicio/servicioForwarding/queryData.jsps'
    payload = {'servCode': 'MY_BASICINFO', 'operaType': 'QUERY'}
    self.__headers['Referer'] = (
        'http://gd.10086.cn/my/myService/myBasicInfo.shtml')
    response = Request.basic({
        'method': 'post',
        'url': endpoint,
        'form': payload,
        'cookies': self.cookies,
        'timeout': 30,
        'headers': self.__headers,
    })
    if not response:
        return returnResult(4000, [], desc=u'getInfo 网络错误')
    # Decode the raw bytes explicitly rather than using ``response.text``.
    page = response.content.decode('utf-8')
    return clawInfo(page)
def clawAllInfo(self):
    """Crawl the user profile and then the call records.

    :return: a 4000 ``returnResult`` when ``self.cookies`` equals an empty
        dict; the ``clawUserInfo`` result when its code is not 2000;
        otherwise the ``clawCallInfo`` result.
    """
    has_cookies = self.cookies != dict()
    if not has_cookies:
        return returnResult(4000, [], desc=u'网络错误,cookie 为空')

    user_info = self.clawUserInfo()  # crawl the user profile
    if user_info['code'] != 2000:
        return user_info
    return self.clawCallInfo()  # crawl the call records
def loginSys(spider):
    """Log in with the given spider and crawl everything it offers.

    :param spider: an instance of ``ChinaMobile_GD``
    :return: ``returnResult(2000, ...)`` holding the user, call and note
        tables when both login and crawl report code 2000; otherwise the
        failing step's own result (login first, then crawl).
    :raises ValueError: when ``spider`` is not a ``ChinaMobile_GD``
    """
    if not isinstance(spider, ChinaMobile_GD):
        print('obj error')
        raise ValueError(u'参数错误')

    login = spider.fetch_cookie()
    if login['code'] != 2000:
        # Per the original note: 4401 is wrong password, 4402 is wrong
        # dynamic code.
        return login
    print(u'登录成功')

    search = spider.clawAllInfo()  # crawl the content
    if search['code'] != 2000:
        return search
    print(u'爬取内容成功')

    tables = dict(
        t_operator_user=spider.user_items,
        t_operator_call=spider.call_items,
        t_operator_note=spider.note_items,
    )
    return returnResult(2000, tables, search['desc'])
def set_login_cookie(self, login_url):
    """Visit the login URL and merge the cookies it sets into ``self.cookies``.

    :param login_url: URL passed to ``self.login_url_request``
    :return: a ``returnResult`` value: 2000 on success; 4100 when the
        response carries no cookies or merging them raises; 4000 when the
        response is falsy or its status code is not 302.
    """
    from requests import cookies

    login_url_response = self.login_url_request(login_url)
    if not login_url_response:
        return returnResult(4000, [], desc=u'登陆网址网络错误')

    if login_url_response.status_code != 302:
        # Fix: this result used to be built but never returned, so callers
        # received None instead of an error result.
        return returnResult(
            4000, [],
            desc=u'登陆网址网络错误: {}'.format(
                login_url_response.status_code))

    if not login_url_response.cookies:
        return returnResult(4100, [], desc=u'cookie 获取错误')

    try:
        self.cookies = cookies.merge_cookies(self.cookies,
                                             login_url_response.cookies)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return returnResult(4100, [], desc=u'无法获取 cookie')
    return returnResult(2000, [], desc=u'登陆网址成功')
def clawCallInfo(self): """ Save all call records :return: null """ item = { 'cert_num': self.user_items[0]['cert_num'], 'phone': self.user_items[0]['phone'] } text_seq = self.getFiveMonthCall() # if len(text_seq) > 0: desc_list = [u'爬取记录成功,但存在未能爬取信息的月份:'] for text in text_seq: if isinstance(text[1], dict): desc_list.append(u'{}, 错误原因: {};'.format( unicode(text[0]), unicode(text[1]['desc']))) else: try: results = json.loads( text[1])['content']['realtimeListSearchRspBean'][ 'calldetail']['calldetaillist'] sms_results = (json.loads( text[1])['content']['realtimeListSearchRspBean'] ['smsdetail']['smsdetaillist']) except Exception as e: desc_list.append(u'{}, 错误原因: {};'.format( unicode(text[0]), unicode(e.message))) for record in results: temp = copy(item) # 'place', 'time', 'time', 'chargefee','period', 'contnum', 'becall', 'conttype' for k, v in record.items(): if k in config.KEY_CONVERT_CALL.keys(): column_name = config.KEY_CONVERT_CALL[k] temp[column_name] = v try: # 入库修正 self.convertValues(temp) except Exception as ex: print ex for k, v in temp.items(): print k, v self.call_items.append(temp) for record in sms_results: temp = copy(item) # 'time', 'fee', 'smstype', 'smsnum' for k, v in record.items(): if k in config.KEY_CONVERT_NOTE_MOBILE.keys(): column_name = config.KEY_CONVERT_NOTE_MOBILE[k] temp[column_name] = v try: # 入库修正 self.convert_value_note(temp) except Exception as ex: print ex for k, v in temp.items(): print k, v self.note_items.append(temp) if len(desc_list) == 7: return returnResult(4000, [], desc=u'爬取记录网络错误') elif 7 > len(desc_list) > 1: return returnResult(2000, [], desc=''.join(desc_list)) elif len(desc_list) == 1: return returnResult(2000, [], desc=u'爬取记录成功')
def checkAttr(phone_attr):
    """Validate that ``phone_attr`` is a dict with exactly the expected keys.

    :param phone_attr: candidate dict; must contain exactly ``phone``,
        ``province``, ``city``, ``company`` and ``password``
    :return: ``True`` when the argument is valid, otherwise
        ``returnResult(4400, data={})``
    """
    required = ('phone', 'province', 'city', 'company', 'password')
    if isinstance(phone_attr, dict) and \
            set(phone_attr.keys()) == set(required):
        # Parameters are correct.
        return True
    return returnResult(4400, data={})