예제 #1
0
def scrapeThreadURL(id, html):
    """Return the absolute URL of thread ``id`` found in a str of HTML.

    Searches ``html`` for the first absolute ``http://`` or ``https://``
    link that contains ``showthread`` and a ``t=<id>`` query parameter.

    Args:
        id: Thread ID to look for (int or str; converted with ``str``).
        html: String of HTML to search.

    Returns:
        The matched URL passed through ``vbutils.cleanURL``, or ``''``
        when no matching link is present.
    """
    # TODO Some vB installations have only relative links
    # TODO Not sure how to get the URL in that case

    # Pattern notes:
    #   re.escape  - the ID is data, never regex syntax.
    #   [?&;]      - "t" must be a parameter of its own ("?t=", "&t=" or
    #                "&amp;t="), so "post=12" / "last=12" do not count.
    #   (?!\d)     - the ID must end here, so id 12 does not match t=123.
    pattern = (
        r'https?://[^\'"]*showthread[^\'"]*[?&;]t=%s(?!\d)[^\'"]*'
        % re.escape(str(id))
    )
    m = re.search(pattern, html)
    if m:
        return vbutils.cleanURL(m.group(0).strip())
    return ''
예제 #2
0
    def update(self, url = ''):
        """Download every page of the thread and re-import its data.

        If ``url`` is empty, the URL already stored in ``self.url`` is
        used. The URL is cleaned with ``vbutils.cleanURL`` and stored back
        in ``self.url``; ``self.id``, ``self.numpages`` and
        ``self.lastupdate`` are refreshed, and the collected pages are
        handed to ``self.importHTML`` together with the URL.
        Progress messages are printed along the way.
        """
        target = url if url else self.url

        self.url = vbutils.cleanURL(target)
        self.id = vbutils.findThreadID(self.url)

        print("Scraping %s ..." % self.url)
        # The first page is fetched without a page number; the page count
        # is scraped from it.
        pages = [getPage(self.url)]
        self.numpages = int(vbscrape.scrapeNumPages(pages[0]))
        print("Found %s pages." % self.numpages)

        # Page 1 is already in hand; fetch pages 2..numpages.
        for pageno in range(2, self.numpages + 1):
            print("Scraping page %s of %s ..." % (pageno, self.numpages))
            pages.append(getPage(self.url, pageno))

        print("Importing data from HTML ...")
        self.importHTML(pages, self.url)

        self.lastupdate = vbutils.getDateTime()
        print("Thread update completed at %s" % self.lastupdate)
예제 #3
0
def init(argv):
    """Parse command-line options and build the run parameters.

    Args:
        argv: List of command-line arguments handed to ``getopt.getopt``
            (presumably ``sys.argv[1:]`` - confirm against the caller).

    Returns:
        dict with the keys:
            "utc"      - 1 if -u/--utc was given, otherwise 0
            "localdir" - value of -l/--localdir with trailing slashes
                         stripped (default '.')
            "platform" - "windows" or "unix"
            "url"      - the thread URL after ``vbutils.cleanURL``

    Exits:
        Calls ``sys.exit(2)`` on an unrecognized flag, a missing URL or
        an invalid thread URL, and ``sys.exit()`` after printing usage
        for -h/--help.
    """

    params = {
        "utc": 0,
        "localdir": '.',
    }

    # wget uses _platform for escaping illegal chrs in filenames
    # (equality, not a substring test against "win32")
    if sys.platform == "win32":
        params["platform"] = "windows"
    else:
        params["platform"] = "unix"

    # params["useragent"] = "HTTP_USER_AGENT:Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.17) Gecko/2010010604 Ubuntu/9.04 (jaunty) Firefox/3.0.17"

    # Check the commandline for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "dhl:uvw",
            ["debug", "help", "localdir=", "utc", "verbose", "windows"],
        )
    except getopt.GetoptError:
        # Found a flag not in our known list
        # Returning a short usage message ...
        print("Error: unrecognized flag")
        usage()
        # ... and bye-bye!
        sys.exit(2)

    # Evaluate commandline options, arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--verbose"):
            # accepted but currently has no effect
            pass
        elif opt in ("-d", "--debug"):
            # accepted but currently has no effect
            pass
        elif opt in ("-u", "--utc"):
            params["utc"] = 1
        elif opt in ("-l", "--localdir"):
            # TODO need to validate the directory
            # strip trailing slashes
            params["localdir"] = arg.rstrip('/')
        elif opt in ("-w", "--windows"):
            # force windows-style escaping regardless of the host platform
            params["platform"] = "windows"
        else:
            print("opt: %s, arg: %s" % (opt, arg))

    # Should be exactly 1 positional argument
    # URL to the thread should be the only argument
    try:
        raw_URL = args[0]
    except IndexError:
        print("getthread.py: missing URL")
        usage()
        sys.exit(2)

    # Is raw_URL a valid vB thread URL?
    print("Validating URL ...")
    if not vbutils.isValidURL(raw_URL):
        print("Error: %s is not a valid vBulletin thread URL." % raw_URL)
        print("")
        sys.exit(2)

    # Clean up URL from the commandline
    # Keep the domain, sub dirs, showthread, and &t=
    params["url"] = vbutils.cleanURL(raw_URL)
    # Report the cleaned URL from params; there is no local named "url".
    print("Valid thread URL: %s" % params["url"])
    return params