def login(self):
    """Log the current account (self.g_users[self.g_index]) in through PageUtils.

    On success, stores the Selenium browser in self.browser and builds a
    urllib2-style opener from its cookies in self.opener.  On failure,
    increments self.g_waitFlag, sleeps 10 minutes once more than 5 attempts
    have failed, and retries recursively.
    NOTE(review): reconstructed from whitespace-mangled source; the exact
    placement of the retry/backoff statements should be confirmed.
    """
    user = self.g_users[self.g_index]
    username = user.get("username")
    password = user.get("password")
    url = "https://zhaocaibao.alipay.com/pf/purchase.htm?productId=20150821000230020000680048696668"
    # Renamed from `login` — the original shadowed this method, so the retry
    # call `login()` invoked the dict and raised TypeError.
    login_conf = {"username": username,
                  "password": password,
                  "username_input": "//input[@id='J-input-user']",
                  "password_input": "//*[@id='password_input']",
                  "submit_but": "//*[@id='J-login-btn']",
                  "check_code": {"check_code_input": "//*[@id='J-input-checkcode']",
                                 "img_tag": "//*[@id='J-checkcode-img']",
                                 "img_path": "/home/lufax/img"},
                  "url": url}
    f = PageUtils.login(login_conf)
    if f is False:
        # Login failed: count the attempt, back off 10 minutes after the
        # 5th failure, then retry.
        self.g_waitFlag += 1
        if self.g_waitFlag > 5:
            sleep(10 * 60)
        self.login()  # was `login()` — called the config dict, a bug
    else:
        self.browser = f
        self.opener = PageUtils.build_opener_with_chrome_cookies(self.browser.get_cookies())
class CrawlerHome: def __init__(self): reload(sys) sys.setdefaultencoding("utf8") # @UndefinedVariable self.pageUtils = PageUtils() def updateData(self, data): _main = data.get("m") _slaves = data.get("s") main = {} slave = {} print _main print _slaves db = DataBase() today = self.pageUtils.getCurrentDate() tomorrow = self.pageUtils.delayed(today, -1) # 取昨天的数据 sql = "select * from zcb_report_master where date(createdate)='" + str(tomorrow) + "'" tomorrow_main = db.execute(sql) if len(tomorrow_main) > 0: tomorrow_main = tomorrow_main[0] print "sql==>", sql for _d in _main: print "-->", _main.get(_d), "==", tomorrow_main.get(_d) if _d == "yycjjebl": main[_d] = _main.get(_d) else: if tomorrow_main.get(_d) == None or _main.get(_d) > tomorrow_main.get(_d): main[_d] = _main.get(_d) # 今天是否有数据 sql = "select * from zcb_report_master where date(createdate)='" + str(today) + "'" today_main = db.execute(sql) if len(today_main) < 1: # 主表数据插入 sql = db.parseInsert("zcb_report_master", main) print "insert master sql -->", sql else: today_main = today_main[0] sql = db.parseUpdate("zcb_report_master", main, " id = " + str(today_main.get("id"))) print "update master sql -->", sql db.execute(sql) else: sql = "select * from zcb_report_master where date(createdate)='" + str(today) + "'" today_main = db.execute(sql) if len(today_main) < 1: sql = db.parseInsert("zcb_report_master", _main) print "insert master sql -->", sql db.execute(sql) else: today_main = today_main[0] sql = db.parseUpdate("zcb_report_master", _main, "id=" + str(today_main.get("id"))) print "update master sql -->", sql db.execute(sql) # 处理从表数据 for _slave in _slaves: sql = ( "select * from zcb_report_slave where date(createdate)='" + str(tomorrow) + "' and type='" + _slave.get("type") + "' and tzqx = '" + _slave.get("tzqx") + "'" ) print sql tomorrow_slave = db.execute(sql) if len(tomorrow_slave) > 0: tomorrow_slave = tomorrow_slave[0] slave = {} slave["type"] = _slave.get("type") slave["tzqx"] = _slave.get("tzqx") for 
_d in _slave: print "-->", _slave.get(_d), "==", tomorrow_slave.get(_d) if tomorrow_slave.get(_d) == None or _slave.get(_d) != tomorrow_slave.get(_d): print _d, "==>", _slave.get(_d), "!!!!!", tomorrow_slave.get(_d) slave[_d] = _slave.get(_d) if len(slave) < 1: continue print "slave-->", slave sql = ( "select * from zcb_report_slave where date(createdate)='" + str(today) + "' and type='" + _slave.get("type") + "' and tzqx = '" + _slave.get("tzqx") + "'" ) today_slave = db.execute(sql) if len(today_slave) < 1: # insert sql = db.parseInsert("zcb_report_slave", slave) print "insert slave sql -->", sql else: # update today_slave = today_slave[0] sql = db.parseUpdate("zcb_report_slave", slave, " id = " + str(today_slave.get("id"))) print "update slave sql -->", sql db.execute(sql) else: sql = ( "select * from zcb_report_slave where date(createdate)='" + str(today) + "' and type='" + _slave.get("type") + "' and tzqx = '" + _slave.get("tzqx") + "'" ) today_slave = db.execute(sql) if len(today_slave) < 1: sql = db.parseInsert("zcb_report_slave", _slave) print "insert slave sql -->", sql db.execute(sql) else: today_slave = today_slave[0] sql = db.parseUpdate("zcb_report_slave", _slave, "id = " + str(today_slave.get("id"))) print "update slave sql -->", sql db.execute(sql) def crawlerTest(self): url = "https://zhaocaibao.alipay.com/pf/purchase.htm?productId=20150821000230020000680048696668" self.pageUtils.url = url self.pageUtils.login("13651781949", "lufax123") sleep(10) url = "https://zhaocaibao.alipay.com/pf/purchase.htm?productId=20151009000230020000280058270528" self.pageUtils.browser.get(url) def crawler(self): url = "https://zhaocaibao.alipay.com/pf/productList.htm" browser = self.pageUtils.startBrowser() browser.get(url) print browser.title a = self.parsePage_home(browser) url = "https://cmspromo.alipay.com/finance/fullyear.htm" browser.get(url) print browser.title b = self.parsePage_finance(browser) b.update() a.get("m").update(b) # print a.get("s") # print b 
self.updateData(a) # d = self.pageUtils.downloadPage(url) # browser.find_element("", "").get_attribute(name) # print "==>",browser.find_element_by_class_name("data-box").text() browser.quit() def parsePage_home(self, page): result = {} result_m = {} result["m"] = result_m # print soup.title # #平台成交金额 cjjes = page.find_elements_by_class_name("data-box") c = "" for cjje in cjjes: # cjje += cjje.get_attribute("class") c += cjje.text result_m["cjje"] = c yycjje = page.find_element_by_class_name("week-book-data") result_m["yycjje"] = self.clearNumber(yycjje.text) yycjjebl = page.find_element_by_class_name("book-rate-data") result_m["yycjjebl"] = yycjjebl.text grqyds = page.find_elements_by_css_selector('div[class="several-months fn-clear"]') i = 0 qixis = ["3", "3-6", "6-12", "12-24", "24"] result_s_list = [] result["s"] = result_s_list for grqyd in grqyds: result_s_map = {"type": "个人企业贷"} result_s_list.append(result_s_map) result_s_map["tzqx"] = qixis[i] aa = grqyd.find_element_by_css_selector('div[class="product-book fn-clear"]') zg = aa.find_element_by_class_name("content-third-type") # 总共 result_s_map["zgje"] = self.clearNumber(zg.text) yylilv = aa.find_element_by_class_name("content-second-type") # 预约利率 result_s_map["yylilv"] = yylilv.text print "yylilv-->", yylilv.text try: bb = grqyd.find_element_by_css_selector('div[class="product-buy fn-clear"]') gm = bb.find_element_by_class_name("content-third-type") # 购买 result_s_map["gmje"] = self.clearNumber(gm.text) gmlilv = bb.find_element_by_css_selector('li[class="w145 buy-product-rate"]') # 购买利率 result_s_map["gmlilv"] = gmlilv.text print "gmlilv-->", gmlilv.text except Exception, e: print e i += 1 # .find_element_by_class_name("content-third-type")#购买 # print zg,gm # print result return result
class CrawlerCQYY: def __init__(self): reload(sys) sys.setdefaultencoding('utf8') # @UndefinedVariable self.pageUtils = PageUtils() ym = "https://zcbprod.alipay.com" def crawler (self): self.pageUtils.url = "https://zcbprod.alipay.com/appointment/lotteryHistoryActivityList.htm" #self.pageUtils.login("13651781949", "lufax123") self.pageUtils.login("*****@*****.**", "lufax123456") print self.pageUtils.browser.title try: pageNum = self.pageUtils.browser.find_element_by_css_selector('span[class="ui-paging-bold"]').text except: pageNum = '1/1' pageNum = pageNum[2:] print pageNum for i in range(int(pageNum)) : url = self.ym+"/appointment/lotteryHistoryActivityList.htm?currentPage="+str(i) print "url-->",url page = self.pageUtils.downloadPage(url) flag = self.parsePageUrl(page) if flag ==False: self.pageUtils.browser.quit() self.crawler() return urls = flag for u in urls: u = self.ym + u print u page = self.pageUtils.downloadPage(u) print self.parsePage(page) def parsePageUrl (self,page): soup = BeautifulSoup(page) print "title==>",soup.title.string if "登录中心 - 支付宝" in soup.title.string: return False hrefs = soup.find_all('td', {'class': "detail-link"})# #hrefs = soup.find_all('a', {'seed': re.compile('detailLink-linkT*')}) print "==>",hrefs result = [] for href in hrefs: result.append(href.find("a")['href']) print "-->",href,"===",href.find("a"),"---",href.find("a")['href'] return result def parsePage(self,page): result = {} soup = BeautifulSoup(page) #利率 lilv = soup.find('p',{'class':'product-param-value-num'}) result["lilv"]=lilv.getText() #投资期限 tzqx = soup.find('p',{'class':'product-param-value-num'}) result["tzqx"]=tzqx.getText() #担保机构 dbjg = soup.find('h2').find("a").getText() result["dbjg"]=dbjg.getText() #类型 type = soup.find('p',{'class':'product-param-value-num product-param-value-txt'}) result["type"] = type.getText() infos = soup.find('div',{'class':'app-apply'}) #中签人数 result["zqrs"]=infos #中签率 #result["zql"]=tzqx infos = 
soup.find('div',{'class':'product-info-detail fn-clear'}).find("p") #总金额 zje = infos[0].getText() result["zje"]=zje #已预约金额 yyje = infos[1].getText() result["yyje"]=yyje #抽签完成时间 cqwcDate = infos[2].getText() result["cqwcDate"]=cqwcDate #开放预约时间 kfyyDate = soup.find('div',{'class':'timer1'}).find("p") result["kfyyDate"]=kfyyDate.getText() #开始抽签时间 kfyyDate = soup.find('div',{'class':'timer2'}).find("p") result["kfyyDate"]=kfyyDate.getText() #下架时间 xjDate = soup.find('div',{'class':'timer4'}).find("p") result["xjDate"]=xjDate.getText() #起息日 qxDate = soup.find('div',{'class':'timer5'}).find("p") result["qxDate"]=qxDate.getText() #到期日 dqDate = soup.find('div',{'class':'timer6'}).find("p") result["dqDate"]=dqDate.getText() return result