def constructTargetDirs(thread, localdir, utc = 0):
    """Create the directory tree needed to archive a thread.

    The target path is built as
    localdir/<thread.forum>/<thread.title>/<timestamp>, where the
    timestamp is whatever vbutils.getDateTime() returns
    (machine- and human-readable, e.g. 20070304T203217).

    Args:
        thread: object exposing ``forum`` and ``title`` attributes; both
            are used verbatim as path components.
        localdir: root directory of the local archive.
        utc: accepted for backward compatibility. NOTE(review): this
            function never reads it, so it currently has no effect on
            the timestamp.

    Returns:
        The path of the target directory, or '' if it could not be
        created.
    """
    date = vbutils.getDateTime()

    # Construct a target directory
    # TODO template this?
    print("Create target directories ...")
    newdir = os.path.join(localdir, thread.forum, thread.title, date)

    if os.access(newdir, os.F_OK):
        print("Target already exists ...")
    else:
        print("Creating target dirs ...")
        try:
            # makedirs() recursively creates all needed subdirs and
            # raises if the leaf exists or cannot be created.
            os.makedirs(newdir)
        except (OSError, ValueError) as err:
            # OSError covers filesystem failures; ValueError covers
            # paths that cannot be encoded (UnicodeError). Anything
            # else (e.g. KeyboardInterrupt) is deliberately not
            # swallowed here.
            print("Error: failed to create target dir: %s (%s)" % (newdir, err))
            return ''

    print("Target directory is %s" % newdir)
    return newdir
def exportDict(self):
    """Build a plain-dict snapshot of all Archive data.

    Returns:
        A dict with two keys: "lastupdate", holding the value returned
        by vbutils.getDateTime() at call time, and "forum", mapping
        each forum id in self.forum to that forum's own exportDict()
        result.
    """
    snapshot = {"lastupdate": vbutils.getDateTime(), "forum": {}}
    exported_forums = snapshot["forum"]

    # Delegate serialisation of each forum to the forum object itself
    for forum_key, forum_entry in self.forum.iteritems():
        exported_forums[forum_key] = forum_entry.exportDict()

    return snapshot
def __init__(self,
             url = '',
             id = '',
             lastupdate = '',
             title = '',
             forum = '',
             numpages = 1,
             post = None,
             jsonstr = '',
             rawhtml = ''
             ):
    """Create a thread object from JSON, raw HTML, or keyword values.

    Exactly one of three construction modes is used, in this order of
    precedence:

    1. ``jsonstr`` given: everything is loaded via self.importJSON().
    2. ``rawhtml`` given: everything is loaded via self.importHTML();
       ``lastupdate`` is taken from the argument, or set to the
       current vbutils.getDateTime() value when it is empty.
    3. Otherwise the remaining keyword arguments are copied onto the
       instance, and if ``url`` is non-empty self.update(url) is
       called.

    Args:
        url: thread URL.
        id: thread identifier.
        lastupdate: timestamp string of the last update.
        title: thread title.
        forum: name of the forum the thread belongs to.
        numpages: number of pages in the thread.
        post: optional mapping of post id -> dict of keyword arguments
            for vbpost.Post. Defaults to no posts. The mapping itself
            is never stored, so instances do not share state.
        jsonstr: JSON text to import the thread from.
        rawhtml: raw HTML to import the thread from.
    """
    if jsonstr:
        self.importJSON(jsonstr)
    elif rawhtml:
        self.importHTML(rawhtml)
        # When creating a thread object with raw HTML, one should
        # also pass along the lastupdate string; otherwise stamp the
        # object with the current time.
        self.lastupdate = lastupdate or vbutils.getDateTime()
    else:
        self.lastupdate = lastupdate
        self.forum = forum
        self.id = id
        self.numpages = numpages
        self.title = title
        self.url = url

        # Always start from a fresh dict: storing the default argument
        # (or the caller's own container) would let separate instances
        # mutate each other's posts.
        self.post = {}
        if post:
            for post_id, post_fields in post.iteritems():
                kw = vbutils.convertKeysToStr(post_fields)
                self.post[post_id] = vbpost.Post(**kw)

        # NOTE(review): scraping on ``url`` is kept inside this branch,
        # i.e. it only runs when neither jsonstr nor rawhtml was given
        # -- confirm against the original layout.
        if url:
            self.update(url)
def update(self, url = ''):
    """Re-scrape every page of the thread and refresh this object.

    The first page is fetched with getPage() and used to determine the
    page count via vbscrape.scrapeNumPages(); the remaining pages are
    then fetched one by one and the whole list is handed to
    self.importHTML(). Sets self.url, self.id, self.numpages and
    self.lastupdate along the way.

    Args:
        url: thread URL to scrape; falls back to self.url when empty.
    """
    self.url = vbutils.cleanURL(url or self.url)
    self.id = vbutils.findThreadID(self.url)

    print("Scraping %s ..." % self.url)
    first_page = getPage(self.url)
    pages = [first_page]

    self.numpages = int(vbscrape.scrapeNumPages(first_page))
    print("Found %s pages." % self.numpages)

    # Page 1 is already in hand; fetch pages 2..numpages
    for pagenum in range(2, self.numpages + 1):
        print("Scraping page %s of %s ..." % (pagenum, self.numpages))
        pages.append(getPage(self.url, pagenum))

    print("Importing data from HTML ...")
    self.importHTML(pages, self.url)

    self.lastupdate = vbutils.getDateTime()
    print("Thread update completed at %s" % self.lastupdate)
def update(self, platform_ = ''):
    """Download the latest version of all threads and update the
    JSON summary file.

    Steps:
      1. Call downloadThread() for every thread of every forum in
         self.forum; failures are reported and skipped.
      2. Walk self.localdir and register forums and threads found on
         disk that this Archive does not know yet. For each thread
         only the most recent saved instance is inspected.
      3. Set self.lastupdate and call self.writeSummary().

    Args:
        platform_: platform name handed to downloadThread(); filename
            transformations depend on it. Falls back to self.platform
            and then to 'unix' when empty.
    """
    # Very important that platform_ is set correctly:
    # filename transformations depend on this setting
    if not platform_:
        platform_ = self.platform or 'unix'

    # Loop over each thread in each forum and download latest data
    for forumid, forumobj in self.forum.iteritems():
        print("Checking %s threads in %s for updates ..." % (len(forumobj.thread), forumid))
        for _threadid, threadobj in forumobj.thread.iteritems():
            # TODO Problem: Thread objects created from
            # arbitrary HTML/JSON may not have URL
            # Maybe we can implement a smarter URL guessing
            # heuristic based on other things
            if not downloadThread(threadobj, self.localdir, platform = platform_):
                print("Failed to download thread %s" % threadobj.title)
                print("Attempting to proceed anyway ...")

    # Sync this Archive object with new data on disk
    # TODO Need to catch errors
    print("Syncing new data in %s" % self.localdir)
    currentforum = ''
    currentthreadlist = []
    currentinstance = ''

    # The directories are not visited in a reliable order, so work
    # out where we are in the tree on every iteration.
    for root, dirs, files in os.walk(self.localdir, topdown=True):
        lastdir = os.path.split(root.rstrip('/'))[1]

        if root == self.localdir:
            # We are in archive root
            # Create a Forum object for each subdir
            for d in dirs:
                print("Found forum %s" % d)
                self._addForum(d)

        elif lastdir in self.forum:
            # We are in a forum dir
            currentforum = lastdir
            # Assume all files are subdirs named by slugs
            # TODO Need to validate these dirs
            currentthreadlist = dirs
            for t in currentthreadlist:
                print("Found thread %s" % t)
            currentinstance = ''

        elif lastdir in currentthreadlist:
            # We are in a thread dir
            # Assume all files are subdirs with instances
            print("Reviewing thread %s" % lastdir)
            print("Found %s saved instances" % len(dirs))
            if not dirs:
                # Nothing saved for this thread; avoid indexing an
                # empty list.
                currentinstance = ''
                continue
            # Instance names are sortable timestamps, so after an
            # ascending sort the newest one is last. Prune ``dirs``
            # in place: os.walk only honours in-place changes, and
            # this keeps it from descending into older instances.
            dirs.sort()
            del dirs[:-1]
            currentinstance = dirs[0]
            print("Latest update was on %s" % currentinstance)

        elif lastdir == currentinstance:
            # We are in the dir of an instance
            # Find an original html page,
            # e.g. showthread.php@t=01235.orig
            # TODO this is vB convention need to be template
            for fname in files:
                print(fname)
            orig_file = ''
            for fname in files:
                if fname[:4] == 'show' and fname[-4:] == 'orig':
                    orig_file = fname
                    break
            if not orig_file:
                # Skip just this instance; the rest of the archive
                # can still be synced.
                print("Source HTML file not found")
                continue
            print("Found source file: %s" % orig_file)

            # TODO Much easier if subdirs = IDs instead of slugs
            # TODO Is this something we should change?
            thread_id = vbutils.findThreadID(orig_file)

            # Is there already a thread object?
            if thread_id in self.forum[currentforum].thread:
                continue

            # Read the original html
            print("Attempting to read from %s" % root)
            with open(os.path.join(root, orig_file), 'r') as f:
                orig_html = f.read()

            # Find the URL in this HTML
            # TODO sometimes the URL is not discoverable
            url = vbutils.findThreadURL(orig_html, thread_id)

            # Create this thread object
            # TODO Try/except might be cleaner
            if not url:
                print("No valid URL found in HTML.")
                print("Trying to make Thread object anyway from the HTML.")
                self.addThread(rawhtml_ = orig_html)
            elif not self.addThread(url):
                print("Tried and failed to create Thread object for URL: %s" % url)

    # Last update is now!
    self.lastupdate = vbutils.getDateTime()

    # The data model should now be up to date;
    # store this updated data to a JSON summary
    self.writeSummary()