def importHTML(self, rawhtml, url=''):
    """Populate this object by scraping one or more chunks of thread HTML.

    rawhtml : A single HTML string, or a list/tuple of HTML strings.
              Every chunk is passed through vbutils.cleanEncoding()
              before it is scraped.
    url     : Optional param, useful to specify the URL explicitly in
              situations where the URL is known. Many vB installations
              use only relative links, so it can be hard to discover
              a URL from the markup alone. When omitted (or empty),
              the URL is obtained from vbscrape.scrapeThreadURL().

    Thread-level attributes (id, url, forum, title, numpages) are taken
    from the first chunk only; posts are gathered from every chunk and
    merged into the self.post dict.

    Raises IndexError if rawhtml is an empty sequence, because there is
    no first chunk to read the thread-level attributes from.
    """
    # isinstance() instead of an exact type comparison, so tuples and
    # list subclasses are treated as a sequence of pages rather than
    # being handed to cleanEncoding() as one object.
    if isinstance(rawhtml, (list, tuple)):
        chunks = rawhtml
    else:
        chunks = [rawhtml]

    # Clean up the raw html
    html = [vbutils.cleanEncoding(chunk) for chunk in chunks]

    first = html[0]
    self.id = vbscrape.scrapeThreadID(first)
    if url:
        self.url = url
    else:
        self.url = vbscrape.scrapeThreadURL(self.id, first)
    self.forum = vbutils.makeSlug(vbscrape.scrapeForumName(first))
    self.title = vbutils.makeSlug(vbscrape.scrapeThreadTitle(first))
    # NOTE(review): stored exactly as returned by scrapeNumPages();
    # update() wraps the same call in int() -- confirm which type
    # callers of self.numpages expect.
    self.numpages = vbscrape.scrapeNumPages(first)

    self.post = {}
    for page in html:
        self.post.update(vbscrape.scrapePosts(page))
def update(self, url=''):
    """Download every page of the thread and re-import its data.

    url : Optional. URL of the thread to fetch; when omitted (or empty)
          the object's current self.url is used.

    The first page is fetched to learn the page count, the remaining
    pages are fetched one by one, and the collected HTML is handed to
    importHTML(). Progress is reported on stdout, and self.lastupdate
    is set from vbutils.getDateTime() once the import has finished.
    """
    # Single-argument parenthesised print keeps the output identical
    # under the Python 2 print statement.
    self.url = vbutils.cleanURL(url or self.url)
    self.id = vbutils.findThreadID(self.url)

    print("Scraping %s ..." % self.url)
    pages = [getPage(self.url)]

    self.numpages = int(vbscrape.scrapeNumPages(pages[0]))
    print("Found %s pages." % str(self.numpages))

    # Page 1 is already in hand; fetch pages 2..numpages.
    for page_no in range(2, self.numpages + 1):
        print("Scraping page %s of %s ..." % (page_no, self.numpages))
        pages.append(getPage(self.url, page_no))

    print("Importing data from HTML ...")
    self.importHTML(pages, self.url)

    self.lastupdate = vbutils.getDateTime()
    print("Thread update completed at %s" % self.lastupdate)