def zqProductInfo(self):

    # Get the valid product ids for personal-loan and corporate-loan products
    datas = self.getProductId()
    print "len-->" + str(len(datas))
    if len(datas) < 1:
        print "This range has been crawled completely, sleeping for 5 hours", PageUtils.getCurrentTime()
        sleep(60 * 60 * 5)
        print "Sleep finished, resuming the crawl", PageUtils.getCurrentTime()

    index = 1
    for data in datas:
        sleep(6)
        productId = data["productid"]
        id = data["id"]
        index += 1
        url = "https://zhaocaibao.alipay.com/pf/purchase.htm?productId=" + productId
        print "zqProductInfo -->", url
        try:
            # Download the product page
            page = PageUtils.downloadPage(self.opener, url)
        except Exception, e:
            print "downloadPage err -->", e
            continue
        try:
            p = self.parsePage(page)
        except Exception, e:
            print "parsePage err -->", e
Example #2
class CrawlerCQYY:
    
    def __init__(self):
        reload(sys) 
        sys.setdefaultencoding('utf8')  # @UndefinedVariable
        self.pageUtils = PageUtils()

    ym = "https://zcbprod.alipay.com"

    def crawler(self):
        
        self.pageUtils.url = "https://zcbprod.alipay.com/appointment/lotteryHistoryActivityList.htm"
        #self.pageUtils.login("13651781949", "lufax123")
        self.pageUtils.login("*****@*****.**", "lufax123456")
        print self.pageUtils.browser.title
        
        try:
            pageNum = self.pageUtils.browser.find_element_by_css_selector('span[class="ui-paging-bold"]').text
        except:
            pageNum = '1/1'
        pageNum = pageNum[2:]  # e.g. "1/12" -> "12": keep only the total page count
        print pageNum
        for i in range(int(pageNum)):

            url = self.ym + "/appointment/lotteryHistoryActivityList.htm?currentPage=" + str(i)
            print "url-->", url
            page = self.pageUtils.downloadPage(url)

            flag = self.parsePageUrl(page)
            if flag is False:
                # Landed on the login page: restart the browser session and retry.
                self.pageUtils.browser.quit()
                self.crawler()
                return

            urls = flag
            for u in urls:
                u = self.ym + u
                print u
                page = self.pageUtils.downloadPage(u)
                print self.parsePage(page)
        
        
    def parsePageUrl(self, page):

        soup = BeautifulSoup(page)
        print "title==>", soup.title.string
        if "登录中心 - 支付宝" in soup.title.string:  # Alipay login-centre page title: the session has expired
            return False
        hrefs = soup.find_all('td', {'class': "detail-link"})
        #hrefs = soup.find_all('a', {'seed': re.compile('detailLink-linkT*')})
        print "==>", hrefs
        result = []
        for href in hrefs:
            result.append(href.find("a")['href'])
            print "-->", href, "===", href.find("a"), "---", href.find("a")['href']
        return result
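
    # A minimal sketch of the expected input/output, with a hypothetical HTML fragment
    # (not taken from the real page) only to illustrate the shape of the data:
    #
    #   page = '<html><head><title>x</title></head>' \
    #          '<td class="detail-link"><a href="/appointment/detail.htm?id=1">detail</a></td></html>'
    #   parsePageUrl(page)  ->  ['/appointment/detail.htm?id=1']
    #
    # When the title contains "登录中心 - 支付宝" (the Alipay login page) the method
    # returns False instead, which crawler() treats as "session expired, log in again".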
    
    def parsePage(self, page):
        result = {}
        soup = BeautifulSoup(page)
        # Interest rate and investment term share the same class on the page,
        # so take them in document order (assumed: rate first, then term).
        nums = soup.find_all('p', {'class': 'product-param-value-num'})
        # Interest rate
        result["lilv"] = nums[0].getText()
        # Investment term
        result["tzqx"] = nums[1].getText()
        # Guarantee institution (getText already returns a string)
        dbjg = soup.find('h2').find("a").getText()
        result["dbjg"] = dbjg
        # Product type
        type = soup.find('p', {'class': 'product-param-value-num product-param-value-txt'})
        result["type"] = type.getText()

        infos = soup.find('div', {'class': 'app-apply'})
        # Number of lottery winners
        result["zqrs"] = infos.getText()
        # Winning rate
        #result["zql"] = tzqx

        infos = soup.find('div', {'class': 'product-info-detail fn-clear'}).find_all("p")
        # Total amount
        zje = infos[0].getText()
        result["zje"] = zje
        # Amount already reserved
        yyje = infos[1].getText()
        result["yyje"] = yyje
        # Lottery completion time
        cqwcDate = infos[2].getText()
        result["cqwcDate"] = cqwcDate

        # Reservation opening time
        kfyyDate = soup.find('div', {'class': 'timer1'}).find("p")
        result["kfyyDate"] = kfyyDate.getText()
        # Lottery start time (stored under its own key so it does not overwrite kfyyDate)
        kscqDate = soup.find('div', {'class': 'timer2'}).find("p")
        result["kscqDate"] = kscqDate.getText()
        # Delisting time
        xjDate = soup.find('div', {'class': 'timer4'}).find("p")
        result["xjDate"] = xjDate.getText()
        # Value date (interest starts accruing)
        qxDate = soup.find('div', {'class': 'timer5'}).find("p")
        result["qxDate"] = qxDate.getText()
        # Maturity date
        dqDate = soup.find('div', {'class': 'timer6'}).find("p")
        result["dqDate"] = dqDate.getText()
        return result
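
A hedged usage sketch for running Example #2; it assumes the selenium-backed PageUtils (with the browser, login and downloadPage members used above) is importable, which this listing does not show:

if __name__ == "__main__":
    c = CrawlerCQYY()
    try:
        # Walk every page of the lottery history list and print each parsed product dict.
        c.crawler()
    finally:
        # Always release the selenium browser, even if parsing raises.
        c.pageUtils.browser.quit()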