예제 #1
0
    def importHTML(self, rawhtml, url = ''):
        """Populate this object by scraping one or more chunks of HTML.

        rawhtml : A single HTML string, or a list of HTML strings.
                The first chunk supplies the thread metadata (id, url,
                forum, title, page count); every chunk is scraped
                for posts.
        url : Optional param, useful to specify the URL explicitly
                in situations where the URL is known.
                Many vB installations use only relative links
                so it can be hard to discover a URL from code.

        Raises IndexError if rawhtml is an empty list, because there
        is then no first chunk to read the metadata from.
        """

        # Normalise the input to a list so both call styles share one path.
        # isinstance() is used instead of comparing type objects so that
        # list subclasses are treated as lists too.
        if isinstance(rawhtml, list):
            chunks = rawhtml
        else:
            chunks = [rawhtml]

        # Clean up the raw html
        html = [vbutils.cleanEncoding(chunk) for chunk in chunks]

        # All thread-level metadata is read from the first chunk only.
        first = html[0]

        self.id = vbscrape.scrapeThreadID(first)
        if url:
            # An explicitly supplied URL wins over anything scraped.
            self.url = url
        else:
            self.url = vbscrape.scrapeThreadURL(self.id, first)
        self.forum = vbutils.makeSlug(vbscrape.scrapeForumName(first))
        self.title = vbutils.makeSlug(vbscrape.scrapeThreadTitle(first))
        self.numpages = vbscrape.scrapeNumPages(first)

        # Merge the posts found in every chunk. Because dict.update() is
        # used, a key seen again in a later chunk replaces the earlier entry.
        # NOTE(review): presumably scrapePosts returns a dict -- confirm.
        self.post = {}
        for chunk in html:
            self.post.update(vbscrape.scrapePosts(chunk))
예제 #2
0
    def update(self, url = ''):
        """Fetch every page of the thread and re-import it into this object.

        url : Optional. When empty, the URL already stored on the
                object (self.url) is used instead.

        The first page is fetched to learn the page count, the remaining
        pages are fetched in order, and the collected HTML is handed to
        importHTML(). Progress is printed along the way and
        self.lastupdate is set once the import has finished.
        """

        # Fall back to the stored URL when the caller supplied none.
        self.url = vbutils.cleanURL(url or self.url)
        self.id = vbutils.findThreadID(self.url)

        print("Scraping %s ..." % self.url)
        pages = [getPage(self.url)]

        # The page count is read from the first page.
        self.numpages = int(vbscrape.scrapeNumPages(pages[0]))
        print("Found %s pages." % self.numpages)

        # Page 1 is already in hand; fetch pages 2..numpages.
        for pageno in range(2, self.numpages + 1):
            print("Scraping page %s of %s ..." % (pageno, self.numpages))
            pages.append(getPage(self.url, pageno))

        print("Importing data from HTML ...")
        # NOTE(review): importHTML assigns self.numpages again from the
        # scraped value without the int() applied above -- confirm intended.
        self.importHTML(pages, self.url)

        self.lastupdate = vbutils.getDateTime()
        print("Thread update completed at %s" % self.lastupdate)