def constructTargetDirs(thread, localdir, utc = 0):
    """Create the directory that will hold one archived snapshot of a thread.

    The target path is built as
    ``<localdir>/<thread.forum>/<thread.title>/<timestamp>``, where the
    timestamp is whatever ``vbutils.getDateTime()`` returns.

    Args:
        thread: object exposing ``forum`` and ``title`` attributes; both
            are used verbatim as path components.
        localdir: root directory of the local archive.
        utc: accepted for backward compatibility but not read by this
            function; the timestamp always comes from
            ``vbutils.getDateTime()`` called without arguments.

    Returns:
        The path of the target directory (whether it already existed or
        was just created), or ``''`` if it could not be created.
    """

    # Machine- and human-readable date-time, e.g. 20070304T203217
    date = vbutils.getDateTime()

    # Construct a target directory
    # TODO template this?
    print("Create target directories ...")
    newdir = os.path.join(localdir, thread.forum, thread.title, date)

    if os.access(newdir, os.F_OK):
        print("Target already exists ...")
    else:
        print("Creating target dirs ...")
        try:
            # makedirs() recursively creates all needed subdirs and
            # raises OSError if the leaf exists or can't be created.
            os.makedirs(newdir)
        except OSError:
            # The leaf may have been created by someone else between the
            # existence check above and makedirs(); only treat this as a
            # failure when the directory is really not there.
            if not os.path.isdir(newdir):
                print("Error: failed to create target dir: %s" % newdir)
                return ''
    print("Target directory is %s" % newdir)
    return newdir
 def exportDict(self):
     """Build and return a plain dict describing the whole archive.

     The result carries a "lastupdate" timestamp taken at call time and
     a "forum" mapping from each forum id to the dict produced by that
     forum object's own exportDict().
     """
     snapshot = {"lastupdate": vbutils.getDateTime()}

     # Each forum object serialises itself; we only collect the results.
     snapshot["forum"] = dict(
         (key, entry.exportDict())
         for key, entry in self.forum.iteritems()
     )

     return snapshot
示例#3
0
    def __init__(self, 
            url = '',
            id = '',
            lastupdate = '',
            title = '',
            forum = '',
            numpages = 1,
            post = None, 
            jsonstr = '',
            rawhtml = ''
            ):
        """Initialise the thread from JSON, raw HTML, or explicit fields.

        Exactly one data source is used, in this order of priority:

        1. ``jsonstr`` -- handed to ``self.importJSON``; all other
           arguments are ignored.
        2. ``rawhtml`` -- handed to ``self.importHTML``; ``lastupdate``
           is then taken from the argument, or from
           ``vbutils.getDateTime()`` when the argument is empty.
        3. Otherwise the keyword values are copied onto the instance.
           Each entry of ``post`` is turned into a ``vbpost.Post`` built
           from its key/value pairs, and if ``url`` is given
           ``self.update(url)`` is called afterwards.

        Args:
            url, id, lastupdate, title, forum, numpages: stored as the
                attributes of the same name (third case only, except
                ``lastupdate`` which is also honoured for raw HTML).
            post: mapping of post id to a mapping of ``vbpost.Post``
                keyword arguments. ``None`` (the default) means no
                posts; every instance then gets its own empty dict.
            jsonstr: JSON text to import.
            rawhtml: HTML to import.
        """
        
        if jsonstr:

            self.importJSON(jsonstr)

        elif rawhtml:

            self.importHTML(rawhtml)
            
            # When creating a thread object with 
            #   raw HTML, one should also pass along 
            #   the lastupdate string 
            if lastupdate:
                self.lastupdate = lastupdate
            else:
                # Else set lastupdate to now
                self.lastupdate = vbutils.getDateTime()

        else:

            self.lastupdate = lastupdate 
            self.forum = forum
            self.id = id
            self.numpages = numpages 
            self.title = title 
            self.url = url
            if post:
                self.post = {}
                # Loop names deliberately differ from the ``id`` parameter
                # so the argument is not rebound while iterating.
                for post_id, fields in post.iteritems():
                    kw = vbutils.convertKeysToStr(fields)
                    self.post[post_id] = vbpost.Post(**kw) 
            elif post is None:
                # A fresh dict per instance: a shared default would leak
                # posts from one thread object into another.
                self.post = {}
            else:
                # Caller supplied an (empty) container explicitly; keep
                # using that very object, as before.
                self.post = post 
            if url:
                self.update(url)
示例#4
0
    def update(self, url = ''):
        """Re-scrape every page of the thread and re-import its data.

        When ``url`` is empty the thread's stored ``self.url`` is used.
        The cleaned URL and the thread id derived from it are stored on
        the instance, the first page is fetched to learn the page count,
        the remaining pages are fetched in order, and the whole list of
        pages is handed to ``self.importHTML``. ``self.lastupdate`` is
        set to the current date-time once the import has finished.
        """

        target = url if url else self.url

        self.url = vbutils.cleanURL(target)
        self.id = vbutils.findThreadID(self.url)

        print("Scraping %s ..." % self.url)
        first_page = getPage(self.url)
        pages = [first_page]

        # The first page tells us how many pages there are in total.
        self.numpages = int(vbscrape.scrapeNumPages(first_page))
        print("Found %s pages." % str(self.numpages))

        # Page 1 is already in hand; fetch pages 2..numpages.
        for pagenum in range(2, self.numpages + 1):
            print("Scraping page %s of %s ..." % (str(pagenum), str(self.numpages)))
            pages.append(getPage(self.url, pagenum))

        print("Importing data from HTML ...")
        self.importHTML(pages, self.url)

        self.lastupdate = vbutils.getDateTime()
        print("Thread update completed at %s" % self.lastupdate)
    def update(self, platform_ = ''):
        """Download the latest version of all threads, resync, and save.

        Three phases:

        1. Every thread of every forum in ``self.forum`` is passed to
           ``downloadThread``; a failure is reported and skipped.
        2. ``self.localdir`` is walked top-down, expecting the layout
           ``<localdir>/<forum>/<thread>/<instance>/``. Forum dirs are
           registered through ``self._addForum``. For each thread dir
           only the newest instance (greatest name after sorting) is
           visited, and a thread whose id is not yet known to the
           current forum is added through ``self.addThread``.
        3. ``self.lastupdate`` is set to now and ``self.writeSummary()``
           stores the JSON summary.

        Args:
            platform_: platform name forwarded to ``downloadThread``.
                When empty, ``self.platform`` is used, or ``'unix'`` if
                that is empty too. Filename transformations depend on
                this setting, so it must be set correctly.
        """
        # Very important that platform_ is set correctly
        # Filename transformations depend on this setting
        platform_ = platform_ or self.platform or 'unix'

        # Loop over each thread in each forum
        #   and download latest data
        for forumid, forumobj in self.forum.iteritems():
            print("Checking %s threads in %s for updates ..." % (len(forumobj.thread), forumid))
            for threadid, threadobj in forumobj.thread.iteritems():
                # TODO Problem: Thread objects created from
                #   arbitrary HTML/JSON may not have URL 
                #   Maybe we can implement a smarter URL guessing
                #   heuristic based on other things 
                #   Even google search? :-?
                if not downloadThread(threadobj, self.localdir, platform = platform_):
                    print("Failed to download thread %s" % threadobj.title)
                    print("Attempting to proceed anyway ...")

        # Sync this Archive object with new data on disk 
        # os.walk gives us an iterator of a dir tree
        # TODO Need to catch errors
        print("Syncing new data in %s" % self.localdir)

        # State carried between walk steps: os.walk only hands us one
        # directory at a time, so we remember where we are in the tree.
        currentforum = ''
        currentthreadlist = []
        currentinstance = ''
        for root, dirs, files in os.walk(self.localdir, topdown=True):
            lastdir = os.path.basename(root.rstrip('/'))
            if root == self.localdir:
                # We are in archive root
                # Create a Forum object for each subdir
                for d in dirs:
                    print("Found forum %s" % d)
                    self._addForum(d)
            elif lastdir in self.forum:
                # We are in a forum dir
                currentforum = lastdir
                # Assume all files are subdirs named by slugs
                # TODO Need to validate these dirs
                currentthreadlist = dirs
                for t in currentthreadlist:
                    print("Found thread %s" % t)
                currentinstance = ''
            elif lastdir in currentthreadlist:
                # We are in a thread dir
                # Assume all subdirs are saved instances named by date
                print("Reviewing thread %s" % lastdir)
                print("Found %s saved instances" % len(dirs))
                if not dirs:
                    # Nothing saved for this thread; there is no
                    # instance dir to look for below it.
                    currentinstance = ''
                    continue
                # Instance names are date-times, so after an ascending
                # sort the newest one is last. Assigning through the
                # slice edits the list os.walk is using, which stops it
                # from descending into the older instances.
                dirs.sort()
                dirs[:] = [dirs[-1]]
                currentinstance = dirs[0]
                print("Latest update was on %s" % currentinstance)
            elif lastdir == currentinstance:
                # We are in the dir of an instance
                # Find an original html page,
                #   e.g. showthread.php@t=01235.orig
                # TODO this is vB convention need to be template
                for f in files:
                    print(f)
                orig_file = next(
                    (f for f in files
                     if f.startswith('show') and f.endswith('orig')),
                    '')
                if not orig_file:
                    # Skip just this instance; the remaining threads
                    # still need to be synced.
                    print("Source HTML file not found")
                    continue
                print("Found source file: %s" % orig_file)

                # TODO Much easier if subdirs = IDs instead of slugs
                # TODO Is this something we should change?
                thread_id = vbutils.findThreadID(orig_file)

                # Is there already a thread object?
                if thread_id not in self.forum[currentforum].thread:
                    # Read the original html
                    print("Attempting to read from %s" % root)
                    with open(os.path.join(root, orig_file), 'r') as fh:
                        orig_html = fh.read()

                    # Find the URL in this HTML 
                    # TODO sometimes the URL is not discoverable
                    url = vbutils.findThreadURL(orig_html, thread_id)

                    # Create this thread object 
                    # TODO Try/except might be cleaner
                    if not url:
                        print("No valid URL found in HTML.")
                        print("Trying to make Thread object anyway from the HTML.")
                        self.addThread(rawhtml_ = orig_html)
                    else:
                        if not self.addThread(url):
                            print("Tried and failed to create Thread object for URL: %s" % url)

        # Last update is now!
        self.lastupdate = vbutils.getDateTime()

        # The data model should now be up to date;
        # store this updated data to a JSON summary
        self.writeSummary()