def GetDataFromWebsite(L, dbType, siteType, sleepTime):
    """Crawl every URL in *L* and insert the parsed records into the database.

    Args:
        L: iterable of site URLs to crawl.
        dbType: database type code; passed both to ``qbdb.init_database_type``
            and to ``InsertIntoDataBase``.
        siteType: site-type code forwarded to the parser's ``mySite``.
        sleepTime: seconds to pause between pages (simple rate limiting).
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    spiderdb.init_database_type(dbType)
    curpage = 0
    for site in L:
        # Point the spider at this URL, telling it the site type to parse.
        spider.mySite(site, siteType)
        # Persist whatever the spider just parsed.
        InsertIntoDataBase(spider, spiderdb, dbType)
        curpage += 1
        # BUG FIX: the original formatted with `%d % (str(curpage))`, which
        # raises TypeError because %d requires a number — pass the int itself.
        print('当前处理页数:%d' % curpage)
        time.sleep(sleepTime)
def InsertData(startindex, endindex):
    """Crawl listing pages *startindex*..*endindex* (inclusive) and store them.

    For each listing page, every entry's detail URL is fetched with a second
    parser instance and the entry's URL field is replaced with the first
    parsed detail record before the page is inserted into the database.

    Args:
        startindex: first page number to process (inclusive).
        endindex: last page number to process (inclusive).
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    # (url template, db type, insert type, unused) — site-specific config.
    website = ('http://www.ixxzy22.com/?m=vod-index-pg-%s.html', 2, 3, 14)
    # Child spider: a second parser is needed because two kinds of URLs
    # (listing page and per-entry detail page) must be parsed.
    spiderson = htParse.URLPARSE()
    spiderdb.init_database_type(website[1])
    # Idiom: counter-style `while` replaced with an inclusive range loop.
    for curpage in range(startindex, endindex + 1):
        print(u'当前处理页数:%s' % (str(curpage)))
        # Reset and fetch the listing page for this page number.
        spider.pagestory = []
        spider.mySite(website[0] % (str(curpage)), website[1])
        # Idiom: enumerate instead of range(len(...)); the index is still
        # needed because the entry is mutated in place.
        for index, story in enumerate(spider.pagestory):
            spiderson.pagestory = []
            # 256 appears to be the "detail page" parse mode — TODO confirm.
            spiderson.mySite(story[1], 256)
            if spiderson.pagestory:
                # Replace the detail URL with the first parsed detail record.
                spider.pagestory[index][1] = spiderson.pagestory[0]
            # Polite delay between per-entry requests.
            time.sleep(1.5)
        # Persist this page's (now enriched) records.
        InsertIntoDataBase(spider, spiderdb, website[1])
    spiderdb.Showwebsite()
def getQiuBai():
    """Crawl 35 pages from each of three qiushibaike sections and store them.

    Iterates the textnew/text/hot section URL templates, fetching pages
    1..35 of each and inserting the parsed records, then prints the top-ten
    summary from the database.
    """
    spiderdb = qbdb()
    spider = htParse.URLPARSE()
    # (url template, db type, insert type, unused) per section.
    qiubaiType = [
        ('http://www.qiushibaike.com/textnew/page/%s', 1, 3, 14),
        ('http://www.qiushibaike.com/text/page/%s', 1, 3, 14),
        ('http://www.qiushibaike.com/hot/page/%s', 1, 3, 14),
    ]
    for task in qiubaiType:
        url_template, db_type, insert_type, _ = task
        spiderdb.init_database_type(db_type)
        for page in range(1, 36):
            # NOTE(review): site type is hard-coded to 1 here (equal to
            # db_type for all current entries) — confirm this is intended.
            spider.mySite(url_template % (str(page)), 1)
            # NOTE(review): the insert uses task[2] while the database was
            # initialised with task[1]; looks inconsistent with
            # GetDataFromWebsite — verify before changing.
            InsertIntoDataBase(spider, spiderdb, insert_type)
            print('''当前处理页数:''')
            print(page + 1)
            time.sleep(0.5)
    spiderdb.ShowTopTen()
    return