Пример #1
0
def wgetPage(thread, page, targetdir, platform = 'unix', logfile = ''):
    """Download one page of a thread into targetdir by running wget.

    Args:
        thread: object whose ``url`` attribute holds the thread URL.
        page: page number. Page 1 is fetched from the bare thread URL;
            any other page from the URL with ``&page=<page>`` appended.
        targetdir: directory handed to wget's ``-P`` (prefix) option.
        platform: when 'windows', ``--restrict-file-names=windows`` is
            added so wget escapes characters illegal in win32 filenames
            (e.g. ? becomes @, see the wget man page).
        logfile: when non-empty, wget appends its log to this file (``-a``).

    Returns:
        False if the URL is rejected by vbutils.isValidURL() or wget could
        not be started; True once wget has run to completion. wget's exit
        status is not inspected.
    """

    if not vbutils.isValidURL(thread.url):
        return False

    # Count up the subdirs in this URL
    #   Sometimes vBulletin is installed in root
    #   Sometimes it is in a subdir
    # The count is passed to --cut-dirs so wget drops those leading
    # directory components from the local paths. The "- 3" discounts the
    # two slashes of "scheme://" and the one in front of the script name.
    subdirs = (thread.url.count('/') - 3)
    print("Found %s sub-dir in the URL ..." % subdirs)

    # The first page is fetched with -S (print server response headers);
    # every later page with -N (timestamping) and an explicit page number.
    if page == 1:
        mode_flag = '-S'
        page_url = thread.url
    else:
        mode_flag = '-N'
        page_url = "%s&page=%s" % (thread.url, page)

    # Build the argument vector directly instead of formatting one command
    # string and re-splitting it: every value stays a single argument, so a
    # targetdir or logfile containing spaces (or an empty targetdir) can no
    # longer shift or break the option list.
    args = ['wget']

    # Force special character conversion to win32
    if platform == 'windows':
        args.append('--restrict-file-names=windows')

    # Are we specifying a special logfile location?
    if logfile != '':
        args.extend(['-a', logfile])

    args.extend([
        '-v', '-nH', '--cut-dirs=%s' % subdirs,
        '-k', '-K', '-E', '-H', '-p',
        '-P', targetdir,
        mode_flag,
        '-w', '1', '--random-wait', '--no-cache', '--no-cookies',
        page_url,
    ])

    # subprocess.call() starts wget and blocks until it exits.
    # OSError is what subprocess raises when the program cannot be
    # launched at all (e.g. wget is not installed or not on PATH).
    try:
        subprocess.call(args)
    except OSError as err:
        print("Error: could not run wget: %s" % err)
        return False

    # The exit status is intentionally ignored: as before, a page counts
    # as downloaded once wget has been run.
    # TODO inspect wget's exit status and report partial failures
    return True
Пример #2
0
def init(argv):
    """Parse the command line and build the run parameters.

    Args:
        argv: command-line arguments as expected by getopt.getopt()
            (i.e. without the program name, typically sys.argv[1:]).

    Returns:
        dict with the keys:
            "utc"      -- 1 if -u/--utc was given, else 0
            "localdir" -- value of -l/--localdir without trailing
                          slashes, '.' by default
            "platform" -- "windows" or "unix"
            "url"      -- the thread URL after vbutils.cleanURL()

    Exits the process through sys.exit(): with status 2 on an unknown
    flag, a missing URL or an invalid URL, and with the default status
    after printing usage for -h/--help.
    """

    params = { "utc" : 0,
                "localdir" : '.' }

    # wget uses the platform for escaping illegal chrs in filenames.
    # Compare for equality: the previous `in "win32"` was a substring
    # test that only worked by accident.
    if sys.platform == "win32":
        params["platform"] = "windows"
    else:
        params["platform"] = "unix"

    # params["useragent"] = "HTTP_USER_AGENT:Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.17) Gecko/2010010604 Ubuntu/9.04 (jaunty) Firefox/3.0.17"

    # Check the commandline for arguments
    try:
        opts, args = getopt.getopt(argv, "dhl:uvw", ["debug", "help", "localdir=", "utc", "verbose", "windows"])
    except getopt.GetoptError:
        # Found a flag not in our known list
        # Returning a short usage message ...
        print("Error: unrecognized flag")
        usage()
        # ... and bye-bye!
        sys.exit(2)

    # Evaluate commandline options, arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--verbose"):
            # accepted but currently has no effect
            pass
        elif opt in ("-d", "--debug"):
            # accepted but currently has no effect
            pass
        elif opt in ("-u", "--utc"):
            params["utc"] = 1
        elif opt in ("-l", "--localdir"):
            # TODO need to validate the directory
            # strip trailing slashes
            params["localdir"] = arg.rstrip('/')
        elif opt in ("-w", "--windows"):
            # explicit override of the auto-detected platform
            params["platform"] = "windows"
        else:
            print("opt: %s, arg: %s" % (opt, arg))

    # Should be exactly 1 positional argument
    # URL to the thread should be the only argument
    if not args:
        print("getthread.py: missing URL")
        usage()
        sys.exit(2)
    raw_URL = args[0]

    # Is raw_URL a valid vB thread URL?
    print("Validating URL ...")
    if not vbutils.isValidURL(raw_URL):
        print("Error: %s is not a valid vBulletin thread URL." % raw_URL)
        # blank line; print("") rather than a bare print so the statement
        # means the same thing under Python 2 and Python 3
        print("")
        sys.exit(2)

    # Clean up URL from the commandline
    # Keep the domain, sub dirs, showthread, and &t=
    params["url"] = vbutils.cleanURL(raw_URL)
    # Report the cleaned URL stored in params; the bare name `url` used
    # here previously was never defined and raised NameError.
    print("Valid thread URL: %s" % params["url"])
    return params