def __init__(self):
    """Set up a fresh parser instance.

    Runs the base-class initialiser, records the host name in ``root``,
    starts ``catlist`` and ``faclist`` as empty lists and creates the
    ``webparser`` helper stored in ``parser``.
    """
    super(whutparser, self).__init__()
    self.root = "i.whut.edu.cn"
    # Two distinct empty lists, one per attribute.
    self.catlist, self.faclist = [], []
    self.parser = webparser()
def parse(self, html, parse_type):
    """Parse ``html`` and accumulate the outcome on ``self.dataList``.

    Args:
        html: html to be parsed.
        parse_type: parameter that controls how the html will be parsed;
            it is forwarded unchanged to ``webparser``.

    Returns:
        None.

    Side effect:
        Whatever ``webparser(html, parse_type).parse()`` returns is added
        to ``self.dataList`` with ``+=``.
    """
    self.dataList += webparser(html, parse_type).parse()
def parseshoplistdirect(ws, url, delay=10):
    """Walk a paginated 'taobao' shop listing, parsing one page per pass.

    Each page is fetched with ``ws.getpagebyurlwithheader``, wrapped in
    ``webparser('taobao', page)`` and parsed via ``parsepage()``.  The loop
    then follows the URL returned by ``getnext()`` and stops as soon as
    that URL is ``None`` or the page counter reaches the total reported by
    ``gettotalpagenumber()``.

    Args:
        ws: object exposing ``getpagebyurlwithheader(url)``.
        url: URL of the first page; ``None`` means there is nothing to do.
        delay: seconds to sleep between two page requests (default 10,
            the value that used to be hard-coded).

    Returns:
        None.  Progress is reported with one printed line per page.
    """
    count = 1
    while url is not None:
        page = ws.getpagebyurlwithheader(url)
        wp = webparser('taobao', page)
        wp.parsepage()
        url = wp.getnext()
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("== Parse Page %d finished ==" % count)
        # Only the page total is needed; the second element of the pair
        # is ignored.
        totalpage, _ = wp.gettotalpagenumber()
        if count >= totalpage or url is None:
            # Nothing left to fetch: leave without a pointless final sleep.
            break
        count += 1
        time.sleep(delay)
def parseshoplistdirect(ws, url, delay=10):
    """Walk a paginated 'taobao' shop listing, parsing one page per pass.

    Each page is fetched with ``ws.getpagebyurlwithheader``, wrapped in
    ``webparser('taobao', page)`` and parsed via ``parsepage()``.  The loop
    then follows the URL returned by ``getnext()`` and stops as soon as
    that URL is ``None`` or the page counter reaches the total reported by
    ``gettotalpagenumber()``.

    Args:
        ws: object exposing ``getpagebyurlwithheader(url)``.
        url: URL of the first page; ``None`` means there is nothing to do.
        delay: seconds to sleep between two page requests (default 10,
            the value that used to be hard-coded).

    Returns:
        None.  Progress is reported with one printed line per page.
    """
    count = 1
    while url is not None:
        page = ws.getpagebyurlwithheader(url)
        wp = webparser('taobao', page)
        wp.parsepage()
        url = wp.getnext()
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("== Parse Page %d finished ==" % count)
        # Only the page total is needed; the second element of the pair
        # is ignored.
        totalpage, _ = wp.gettotalpagenumber()
        if count >= totalpage or url is None:
            # Nothing left to fetch: leave without a pointless final sleep.
            break
        count += 1
        time.sleep(delay)
def parseshoplistbybrowser(mb, url):
    """Fetch 'taobao' shop-list pages through ``mb`` and collect rate links.

    For each page the function calls ``mb.getpagebyurl(url)``, wraps the
    result in ``webparser('taobao', page)``, runs ``parsepage()``, takes the
    next URL from ``getnext()`` and appends the parser's ``ratelinklist``
    attribute to the result.

    Args:
        mb: object exposing ``getpagebyurl(url)``.
        url: URL of the first page; if it is ``None`` no page is fetched.

    Returns:
        ``ratelinklist``: a list holding one ``wp.ratelinklist`` value per
        parsed page (each value is appended as a single element).
    """
    count = 1
    totalpage = 0
    hastotalpage = False  # only ever reassigned below, never read
    ratelinklist = []
    while (url != None):
        page = mb.getpagebyurl(url)
        wp = webparser('taobao', page)
        wp.parsepage()
        url = wp.getnext()
        # Appended as one element per page, not merged into a flat list.
        ratelinklist.append(wp.ratelinklist)
        #if (hastotalpage == False):
        (totalpage, hastotalpage) = wp.gettotalpagenumber()
        print "== Parse Page %d finished ==" % count
        if (count < totalpage):
            count = count + 1
            time.sleep(10)
        else :
            break
        # NOTE(review): the original layout of this second ``break`` was lost.
        # It is assumed to sit at loop-body level, which ends the loop after
        # the first page (after the 10 s sleep when more pages exist). If it
        # belongs inside the ``else`` branch it is unreachable. Confirm.
        break
    return ratelinklist
def parseshoplistbybrowser(mb, url):
    """Fetch 'taobao' shop-list pages through ``mb`` and collect rate links.

    For each page the function calls ``mb.getpagebyurl(url)``, wraps the
    result in ``webparser('taobao', page)``, runs ``parsepage()``, takes the
    next URL from ``getnext()`` and appends the parser's ``ratelinklist``
    attribute to the result.

    Args:
        mb: object exposing ``getpagebyurl(url)``.
        url: URL of the first page; if it is ``None`` no page is fetched.

    Returns:
        ``ratelinklist``: a list holding one ``wp.ratelinklist`` value per
        parsed page (each value is appended as a single element).
    """
    count = 1
    totalpage = 0
    hastotalpage = False  # only ever reassigned below, never read
    ratelinklist = []
    while (url != None):
        page = mb.getpagebyurl(url)
        wp = webparser('taobao', page)
        wp.parsepage()
        url = wp.getnext()
        # Appended as one element per page, not merged into a flat list.
        ratelinklist.append(wp.ratelinklist)
        #if (hastotalpage == False):
        (totalpage, hastotalpage) = wp.gettotalpagenumber()
        print "== Parse Page %d finished ==" % count
        if (count < totalpage):
            count = count + 1
            time.sleep(10)
        else:
            break
        # NOTE(review): the original layout of this second ``break`` was lost.
        # It is assumed to sit at loop-body level, which ends the loop after
        # the first page (after the 10 s sleep when more pages exist). If it
        # belongs inside the ``else`` branch it is unreachable. Confirm.
        break
    return ratelinklist
def parseshopratedetailbybrowser(mb, url):
    """Fetch ``url`` through ``mb`` and run the rate-detail parse on it.

    The page returned by ``mb.getpagebyurl(url)`` is wrapped in
    ``webparser('rate information', page)``; the parser's own ``soup``
    attribute is then handed to its ``parseproductratedetail`` method.
    A completion message is printed afterwards and nothing is returned.

    Args:
        mb: object exposing ``getpagebyurl(url)``.
        url: address of the page to fetch.
    """
    rate_parser = webparser('rate information', mb.getpagebyurl(url))
    rate_parser.parseproductratedetail(rate_parser.soup)
    # Parenthesised single-string form: same output as the print statement.
    print("== Parse rate information finished ==")
def getshoplistbybrowser(mb, url):
    """Return the result of ``parseproductcat()`` for the page at ``url``.

    Args:
        mb: object exposing ``getpagebyurl(url)``.
        url: address of the page to fetch.

    Returns:
        Whatever ``webparser('taobao', page).parseproductcat()`` yields for
        the fetched page.
    """
    shop_page = mb.getpagebyurl(url)
    return webparser('taobao', shop_page).parseproductcat()