def zqProductInfo(self):
    # Fetch the detail page for every valid personal/corporate-loan product id
    # and parse it. Crawls https://zhaocaibao.alipay.com purchase pages one by
    # one, sleeping between requests, and sleeps 5 hours when no ids are left.
    #
    # NOTE(review): this block looks truncated — the parsed result `p` is never
    # used and nothing follows the final except; confirm against the full file.
    datas = self.getProductId()
    print "len-->" + str(len(datas))
    if len(datas) < 1:
        # No products left in this range: sleep 5 hours, then resume crawling.
        # (Original log text: "this range is fully crawled, sleeping 5 hours" /
        # "sleep over, start crawling" — runtime strings kept as-is.)
        print "这个范围数据已经爬完,休眠5个小时", PageUtils.getCurrentTime()
        sleep(60 * 60 * 5)
        print "休眠结束,开始爬数据", PageUtils.getCurrentTime()
    index = 1
    for data in datas:
        # Throttle: 6 seconds between product-page requests.
        sleep(6)
        productId = data["productid"]
        id = data["id"]
        index += 1
        url = "https://zhaocaibao.alipay.com/pf/purchase.htm?productId=" + productId
        print "zqProductInfo -->", url
        try:
            # Download the product detail page.
            page = PageUtils.downloadPage(self.opener, url)
        except Exception, e:
            # Best-effort: log the download failure and move to the next id.
            print "downloadPage err -->", e
            continue
        try:
            p = self.parsePage(page)
        except Exception, e:
            # Parse failures are logged but do not abort the loop.
            print "parsePage err -->", e
class CrawlerCQYY: def __init__(self): reload(sys) sys.setdefaultencoding('utf8') # @UndefinedVariable self.pageUtils = PageUtils() ym = "https://zcbprod.alipay.com" def crawler (self): self.pageUtils.url = "https://zcbprod.alipay.com/appointment/lotteryHistoryActivityList.htm" #self.pageUtils.login("13651781949", "lufax123") self.pageUtils.login("*****@*****.**", "lufax123456") print self.pageUtils.browser.title try: pageNum = self.pageUtils.browser.find_element_by_css_selector('span[class="ui-paging-bold"]').text except: pageNum = '1/1' pageNum = pageNum[2:] print pageNum for i in range(int(pageNum)) : url = self.ym+"/appointment/lotteryHistoryActivityList.htm?currentPage="+str(i) print "url-->",url page = self.pageUtils.downloadPage(url) flag = self.parsePageUrl(page) if flag ==False: self.pageUtils.browser.quit() self.crawler() return urls = flag for u in urls: u = self.ym + u print u page = self.pageUtils.downloadPage(u) print self.parsePage(page) def parsePageUrl (self,page): soup = BeautifulSoup(page) print "title==>",soup.title.string if "登录中心 - 支付宝" in soup.title.string: return False hrefs = soup.find_all('td', {'class': "detail-link"})# #hrefs = soup.find_all('a', {'seed': re.compile('detailLink-linkT*')}) print "==>",hrefs result = [] for href in hrefs: result.append(href.find("a")['href']) print "-->",href,"===",href.find("a"),"---",href.find("a")['href'] return result def parsePage(self,page): result = {} soup = BeautifulSoup(page) #利率 lilv = soup.find('p',{'class':'product-param-value-num'}) result["lilv"]=lilv.getText() #投资期限 tzqx = soup.find('p',{'class':'product-param-value-num'}) result["tzqx"]=tzqx.getText() #担保机构 dbjg = soup.find('h2').find("a").getText() result["dbjg"]=dbjg.getText() #类型 type = soup.find('p',{'class':'product-param-value-num product-param-value-txt'}) result["type"] = type.getText() infos = soup.find('div',{'class':'app-apply'}) #中签人数 result["zqrs"]=infos #中签率 #result["zql"]=tzqx infos = 
soup.find('div',{'class':'product-info-detail fn-clear'}).find("p") #总金额 zje = infos[0].getText() result["zje"]=zje #已预约金额 yyje = infos[1].getText() result["yyje"]=yyje #抽签完成时间 cqwcDate = infos[2].getText() result["cqwcDate"]=cqwcDate #开放预约时间 kfyyDate = soup.find('div',{'class':'timer1'}).find("p") result["kfyyDate"]=kfyyDate.getText() #开始抽签时间 kfyyDate = soup.find('div',{'class':'timer2'}).find("p") result["kfyyDate"]=kfyyDate.getText() #下架时间 xjDate = soup.find('div',{'class':'timer4'}).find("p") result["xjDate"]=xjDate.getText() #起息日 qxDate = soup.find('div',{'class':'timer5'}).find("p") result["qxDate"]=qxDate.getText() #到期日 dqDate = soup.find('div',{'class':'timer6'}).find("p") result["dqDate"]=dqDate.getText() return result