def run(self):
    # Monitor loop: refresh a single console progress line while the chunk
    # downloads are in flight, then wait for post-processing and mark the
    # whole task as finished.
    t1 = time.time()
    # Poll until the executor reports that every submitted chunk is done.
    while not self.obj.pool.done():
        self.dl_speed = self.calcDownloadSpeed(self.shared_var.value)
        if self.dl_speed > 0:
            # ETA = remaining bytes / current transfer speed.
            self.eta = self.calcETA(
                (self.obj.filesize - self.shared_var.value) / self.dl_speed)
        if self.progress_bar:
            if self.obj.filesize:
                status = r"[%s Threads] %s / %s @ %s/s %s [%3.1f%%, %s left] " % (
                    self.obj.actual_threads_used,
                    utils.sizeof_human(self.shared_var.value),
                    utils.sizeof_human(self.obj.filesize),
                    utils.sizeof_human(self.dl_speed),
                    utils.progress_bar(
                        1.0 * self.shared_var.value / self.obj.filesize),
                    self.shared_var.value * 100.0 / self.obj.filesize,
                    utils.time_human(self.eta, fmt_short=True))
            else:
                # Total size unknown (server sent no Content-Length), so no
                # percentage or ETA can be shown.
                status = r"[%s Threads] %s / ??? MB @ %s/s " % (
                    self.obj.actual_threads_used,
                    utils.sizeof_human(self.shared_var.value),
                    utils.sizeof_human(self.dl_speed))
            # Append backspace chars so the next print overwrites this line
            # in place (in-place console progress bar).
            status = status + chr(8) * (len(status) + 1)
            print status,
        time.sleep(0.1)
    if self.obj._killed:
        self.logger.debug("File download process has been stopped.")
        return
    if self.progress_bar:
        # Print the final 100% status line.
        if self.obj.filesize:
            print r"[%s Threads] %s / %s @ %s/s %s [100%%, 0s left] " % (
                self.obj.actual_threads_used,
                utils.sizeof_human(self.obj.filesize),
                utils.sizeof_human(self.obj.filesize),
                utils.sizeof_human(self.dl_speed),
                utils.progress_bar(1.0))
        else:
            print r"[%s Threads] %s / %s @ %s/s " % (
                self.obj.actual_threads_used,
                utils.sizeof_human(self.shared_var.value),
                self.shared_var.value / 1024.0**2,
                utils.sizeof_human(self.dl_speed))
    t2 = time.time()
    self.dl_time = float(t2 - t1)
    # Wait for the post-download thread (presumably part-file assembly —
    # confirm in post_threadpool_actions) before declaring the task done.
    while self.obj.post_threadpool_thread.is_alive():
        time.sleep(0.1)
    self.obj.pool.shutdown()
    self.obj.status = "finished"
    if not self.obj.errors:
        self.logger.debug("File downloaded within %.2f seconds." % self.dl_time)
def get_speed(self, human=False):
    '''
    Get current transfer speed in bytes per second.

    :param human: If true, returns a human-readable formatted string.
        Else, returns an int type number
    :type human: bool
    :rtype: int/string
    '''
    # FIX: the control thread only exists once start() has run; without
    # this guard the method raised AttributeError on a not-yet-started
    # task. The guarded get_dl_size variant in this file returns 0 in the
    # same situation, so do the same here.
    if not self.control_thread:
        return 0
    if human:
        return "%s/s" % utils.sizeof_human(self.control_thread.get_speed())
    return self.control_thread.get_speed()
def get_dl_size(self, human=False):
    '''
    Get downloaded bytes counter in bytes.

    :param human: If true, returns a human-readable formatted string.
        Else, returns an int type number
    :type human: bool
    :rtype: int/string
    '''
    # FIX: the control thread is only created by start(); calling this
    # accessor earlier raised AttributeError. The guarded variant of this
    # same method elsewhere in this file returns 0 instead — match it.
    if not self.control_thread:
        return 0
    if human:
        return utils.sizeof_human(self.control_thread.get_dl_size())
    return self.control_thread.get_dl_size()
def run(self):
    # Monitor loop for the threadpool-based downloader: refresh a console
    # progress line while worker requests are pending, then wait for the
    # post-processing thread and mark the download finished.
    t1 = time.time()
    while self.obj.pool.workRequests:
        self.dl_speed = self.calcDownloadSpeed(self.shared_var.value)
        if self.dl_speed > 0:
            # ETA = remaining bytes / current transfer speed.
            self.eta = self.calcETA((self.obj.filesize-self.shared_var.value)/self.dl_speed)
        if self.progress_bar:
            if self.obj.filesize:
                status = r"[*] %s / %s @ %s/s %s [%3.2f%%, %s left] " % (utils.sizeof_human(self.shared_var.value), utils.sizeof_human(self.obj.filesize), utils.sizeof_human(self.dl_speed), utils.progress_bar(1.0*self.shared_var.value/self.obj.filesize), self.shared_var.value * 100.0 / self.obj.filesize, utils.time_human(self.eta, fmt_short=True))
            else:
                # Total size unknown (no Content-Length): no percentage/ETA.
                status = r"[*] %s / ??? MB @ %s/s " % (utils.sizeof_human(self.shared_var.value), utils.sizeof_human(self.dl_speed))
            # Backspace padding lets the next print overwrite this line
            # in place (in-place console progress bar).
            status = status + chr(8)*(len(status)+1)
            print status,
        try:
            # poll() processes finished requests; NoResultsPending means
            # everything has been handled, so the loop can stop.
            self.obj.pool.poll()
        except threadpool.NoResultsPending:
            break
        time.sleep(0.1)
    if self.obj._killed:
        self.logger.debug("File download process has been stopped.")
        return
    if self.progress_bar:
        # Print the final 100% status line.
        if self.obj.filesize:
            print r"[*] %s / %s @ %s/s %s [100%%, 0s left] " % (utils.sizeof_human(self.obj.filesize), utils.sizeof_human(self.obj.filesize), utils.sizeof_human(self.dl_speed), utils.progress_bar(1.0))
        else:
            print r"[*] %s / %s @ %s/s " % (utils.sizeof_human(self.shared_var.value), self.shared_var.value / 1024.0**2, utils.sizeof_human(self.dl_speed))
    t2 = time.time()
    self.dl_time = float(t2-t1)
    # Wait for the post-download thread (presumably part-file assembly —
    # confirm in post_threadpool_actions) before declaring the task done.
    while self.obj.post_threadpool_thread.is_alive():
        time.sleep(0.1)
    self.obj.status = "finished"
    if not self.obj.errors:
        self.logger.debug("File downloaded within %.2f seconds." % self.dl_time)
def get_dl_size(self, human=False):
    '''
    Get downloaded bytes counter in bytes.

    :param human: If true, returns a human-readable formatted string.
        Else, returns an int type number
    :type human: bool
    :rtype: int/string
    '''
    monitor = self.control_thread
    if not monitor:
        # start() has not created a control thread yet.
        return 0
    downloaded = monitor.get_dl_size()
    return utils.sizeof_human(downloaded) if human else downloaded
class SmartDL: ''' The main SmartDL class :param urls: Download url. It is possible to pass unsafe and unicode characters. You can also pass a list of urls, and those will be used as mirrors. :type urls: string or list of strings :param dest: Destination path. Default is `%TEMP%/pySmartDL/`. :type dest: string :param progress_bar: If True, prints a progress bar to the `stdout stream <http://docs.python.org/2/library/sys.html#sys.stdout>`_. Default is `True`. :type progress_bar: bool :param fix_urls: If true, attempts to fix urls with unsafe characters. :type fix_urls: bool :param logger: An optional logger. :type logger: `logging.Logger` instance :param connect_default_logger: If true, connects a default logger to the class. :type connect_default_logger: bool :rtype: `SmartDL` instance .. NOTE:: The provided dest may be a folder or a full path name (including filename). The workflow is: * If the path exists, and it's an existing folder, the file will be downloaded to there with the original filename. * If the past does not exist, it will create the folders, if needed, and refer to the last section of the path as the filename. * If you want to download to folder that does not exist at the moment, and want the module to fill in the filename, make sure the path ends with `os.sep`. * If no path is provided, `%TEMP%/pySmartDL/` will be used. 
''' def __init__(self, urls, dest=None, progress_bar=True, fix_urls=True, logger=None, connect_default_logger=False): self.mirrors = [urls] if isinstance(urls, basestring) else urls if fix_urls: self.mirrors = [utils.url_fix(x) for x in self.mirrors] self.url = self.mirrors.pop(0) fn = os.path.basename(urlparse(self.url).path) self.dest = dest or os.path.join(tempfile.gettempdir(), 'pySmartDL', fn) if self.dest[-1] == os.sep: if os.path.exists(self.dest[:-1]) and os.path.isfile( self.dest[:-1]): os.unlink(self.dest[:-1]) self.dest += fn if os.path.isdir(self.dest): self.dest = os.path.join(self.dest, fn) self.progress_bar = progress_bar if logger: self.logger = logger elif connect_default_logger: self.logger = utils.create_debugging_logger() else: self.logger = utils.DummyLogger() self.headers = {'User-Agent': utils.get_random_useragent()} self.threads_count = 5 self.timeout = 4 self.current_attemp = 1 self.attemps_limit = 4 self.minChunkFile = 1024**2 * 2 # 2MB self.filesize = 0 self.shared_var = multiprocessing.Value( c_int, 0) # a ctypes var that counts the bytes already downloaded self.thread_shared_cmds = {} self.status = "ready" self.verify_hash = False self._killed = False self._failed = False self._start_func_blocking = True self.errors = [] self.post_threadpool_thread = None self.control_thread = None if not os.path.exists(os.path.dirname(self.dest)): self.logger.debug('Folder "%s" does not exist. Creating...' % os.path.dirname(self.dest)) os.makedirs(os.path.dirname(self.dest)) if not utils.is_HTTPRange_supported(self.url): self.logger.warning( "Server does not support HTTPRange. threads_count is set to 1." ) self.threads_count = 1 if os.path.exists(self.dest): self.logger.warning( 'Destination "%s" already exists. Existing file will be removed.' % self.dest) if not os.path.exists(os.path.dirname(self.dest)): self.logger.warning( 'Directory "%s" does not exist. Creating it...' 
% os.path.dirname(self.dest)) os.makedirs(os.path.dirname(self.dest)) self.pool = utils.ManagedThreadPoolExecutor(self.threads_count) def __str__(self): return 'SmartDL(r"%s", dest=r"%s")' % (self.url, self.dest) def __repr__(self): return "<SmartDL %s>" % (self.url) def add_basic_authentication(self, username, password): ''' Uses HTTP Basic Access authentication for the connection. :param username: Username. :type username: string :param password: Password. :type password: string ''' auth_string = '%s:%s' % (username, password) base64string = base64.standard_b64encode(auth_string.encode('utf-8')) self.headers['Authorization'] = b"Basic " + base64string def add_hash_verification(self, algorithm, hash): ''' Adds hash verification to the download. If hash is not correct, will try different mirrors. If all mirrors aren't passing hash verification, `HashFailedException` Exception will be raised. .. NOTE:: If downloaded file already exist on the destination, and hash matches, pySmartDL will not download it again. .. WARNING:: The hashing algorithm must be supported on your system, as documented at `hashlib documentation page <http://docs.python.org/2/library/hashlib.html>`_. :param algorithm: Hashing algorithm. :type algorithm: string :param hash: Hash code. :type hash: string ''' self.verify_hash = True self.hash_algorithm = algorithm self.hash_code = hash def fetch_hash_sums(self): ''' Will attempt to fetch UNIX hash sums files (`SHA256SUMS`, `SHA1SUMS` or `MD5SUMS` files in the same url directory). Calls `self.add_hash_verification` if successful. Returns if a matching hash was found. 
:rtype: bool *New in 1.2.1* ''' default_sums_filenames = ['SHA256SUMS', 'SHA1SUMS', 'MD5SUMS'] folder = os.path.dirname(self.url) orig_basename = os.path.basename(self.url) self.logger.debug("Looking for SUMS files...") for filename in default_sums_filenames: try: sums_url = "%s/%s" % (folder, filename) obj = urllib2.urlopen(sums_url) data = obj.read().split('\n') obj.close() for line in data: if orig_basename.lower() in line.lower(): self.logger.debug("Found a matching hash in %s" % sums_url) algo = filename.rstrip('SUMS') hash = line.split(' ')[0] self.add_hash_verification(algo, hash) return except urllib2.HTTPError: continue def start(self, blocking=None): ''' Starts the download task. Will raise `RuntimeError` if it's the object's already downloading. .. warning:: If you're using the non-blocking mode, Exceptions won't be raised. In that case, call `isSuccessful()` after the task is finished, to make sure the download succeeded. Call `get_errors()` to get the the exceptions. :param blocking: If true, calling this function will block the thread until the download finished. Default is *True*. :type blocking: bool ''' if not self.status == "ready": raise RuntimeError("cannot start (current status is %s)" % self.status) if blocking is None: blocking = self._start_func_blocking else: self._start_func_blocking = blocking if self.mirrors: self.logger.debug('One URL and %d mirrors are loaded.' % len(self.mirrors)) else: self.logger.debug('One URL is loaded.') if self.verify_hash and os.path.exists(self.dest): with open(self.dest, 'rb') as f: hash = hashlib.new(self.hash_algorithm, f.read()).hexdigest() if hash == self.hash_code: self.logger.debug( "Destination '%s' already exists, and the hash matches. No need to download." % self.dest) self.status = 'finished' return self.logger.debug("Downloading '%s' to '%s'..." 
% (self.url, self.dest)) req = urllib2.Request(self.url, headers=self.headers) try: urlObj = urllib2.urlopen(req, timeout=self.timeout) except (urllib2.HTTPError, urllib2.URLError), e: self.errors.append(e) if self.mirrors: self.logger.debug("%s. Trying next mirror..." % unicode(e)) self.url = self.mirrors.pop(0) self.start(blocking) return else: self.logger.warning(unicode(e)) self.errors.append(e) self._failed = True self.status = "finished" raise try: self.filesize = int(urlObj.headers["Content-Length"]) self.logger.debug( "Content-Length is %d (%s)." % (self.filesize, utils.sizeof_human(self.filesize))) except IndexError: self.logger.warning( "Server did not send Content-Length. Filesize is unknown.") self.filesize = 0 args = _calc_chunk_size(self.filesize, self.threads_count, self.minChunkFile) bytes_per_thread = args[0][1] - args[0][0] if len(args) > 1: self.logger.debug( "Launching %d threads (downloads %s/Thread)." % (len(args), utils.sizeof_human(bytes_per_thread))) else: self.logger.debug("Launching 1 thread.") self.status = "downloading" for i, arg in enumerate(args): req = self.pool.submit(download, self.url, self.dest + ".%.3d" % i, arg[0], arg[1], copy.deepcopy(self.headers), self.timeout, self.shared_var, self.thread_shared_cmds) self.post_threadpool_thread = threading.Thread( target=post_threadpool_actions, args=(self.pool, [[(self.dest + ".%.3d" % i) for i in range(len(args))], self.dest], self.filesize, self)) self.post_threadpool_thread.daemon = True self.post_threadpool_thread.start() self.control_thread = ControlThread(self) if blocking: self.wait(raise_exceptions=True)
def get_dl_size(self, human=False):
    '''
    Return the downloaded-bytes counter (0 before the task has started).

    :param human: If true, format the value as a human-readable string.
    :type human: bool
    :rtype: int/string
    '''
    monitor = self.control_thread
    if not monitor:
        return 0
    count = monitor.get_dl_size()
    if human:
        return utils.sizeof_human(count)
    return count
def get_speed(self, human=False):
    '''
    Get current transfer speed in bytes per second.

    :param human: If true, returns a human-readable formatted string.
        Else, returns an int type number
    :type human: bool
    :rtype: int/string
    '''
    # FIX: guard against being called before start() created the control
    # thread (mirrors the guard already present in get_dl_size); previously
    # this raised AttributeError on a not-yet-started task.
    if not self.control_thread:
        return 0
    if human:
        return "%s/s" % utils.sizeof_human(self.control_thread.get_speed())
    return self.control_thread.get_speed()
class MultiGet:
    '''
    Multi-threaded downloader: fetches a url (with optional mirror urls) to
    a destination path, splitting the file into chunks downloaded in
    parallel worker threads.

    :param urls: Download url, or a list of urls (extra urls are used as mirrors).
    :param dest: Destination path. Default is `%TEMP%/pyMultiGet/`.
    :param progress_bar: If True, prints a progress bar to stdout.
    :param fix_urls: If true, attempts to fix urls with unsafe characters.
    :param logger: An optional `logging.Logger` instance.
    :param connect_default_logger: If true, connects a default logger to the class.
    :param max_parallelism: Upper bound on the number of download threads.
    :param timeout: Connection timeout, in seconds.
    :param max_retries: Attempt limit.
    :param minimum_chunk_size: Smallest chunk size, in bytes, that gets its own thread.
    '''

    def __init__(self, urls, dest=None, progress_bar=True, fix_urls=True,
                 logger=None, connect_default_logger=False, max_parallelism=16,
                 timeout=4, max_retries=4, minimum_chunk_size=1024**2 * 2):
        self.mirrors = [urls] if isinstance(urls, basestring) else urls
        if fix_urls:
            self.mirrors = [utils.url_fix(x) for x in self.mirrors]
        # First url is the active one; the rest are kept as fallback mirrors.
        self.url = self.mirrors.pop(0)

        fn = urllib2.unquote(os.path.basename(urlparse(self.url).path)).decode('utf-8')
        self.dest = dest or os.path.join(tempfile.gettempdir(), 'pyMultiGet', fn)
        if self.dest[-1] == os.sep:
            # dest ends with a separator: treat it as a directory and take
            # the filename from the url.
            if os.path.exists(self.dest[:-1]) and os.path.isfile(self.dest[:-1]):
                os.unlink(self.dest[:-1])
            self.dest += fn
        if os.path.isdir(self.dest):
            self.dest = os.path.join(self.dest, fn)

        self.progress_bar = progress_bar

        if logger:
            self.logger = logger
        elif connect_default_logger:
            self.logger = utils.create_debugging_logger()
        else:
            self.logger = utils.DummyLogger()

        self.headers = {'User-Agent': utils.get_random_useragent()}
        self.threads_count = max_parallelism
        # Updated by start() once the chunk layout is known.
        self.actual_threads_used = max_parallelism
        self.timeout = timeout
        self.current_attemp = 1
        self.attemps_limit = max_retries
        self.minChunkFile = minimum_chunk_size
        self.filesize = 0
        # Shared ctypes counter of bytes downloaded across worker threads.
        self.shared_var = multiprocessing.Value(c_int, 0)
        self.thread_shared_cmds = {}
        self.status = "ready"
        self.verify_hash = False
        self._killed = False
        self._failed = False
        self._start_func_blocking = True
        self.errors = []

        self.post_threadpool_thread = None
        self.control_thread = None

        if not os.path.exists(os.path.dirname(self.dest)):
            self.logger.debug('Folder "%s" does not exist. Creating...' % os.path.dirname(self.dest))
            os.makedirs(os.path.dirname(self.dest))
        if not utils.is_HTTPRange_supported(self.url):
            # Without byte-range support the file cannot be split into chunks.
            self.logger.warning("Server does not support HTTPRange. threads_count is set to 1.")
            self.threads_count = 1
        if os.path.exists(self.dest):
            self.logger.warning('Destination "%s" already exists. Existing file will be removed.' % self.dest)
        if not os.path.exists(os.path.dirname(self.dest)):
            self.logger.warning('Directory "%s" does not exist. Creating it...' % os.path.dirname(self.dest))
            os.makedirs(os.path.dirname(self.dest))

        self.pool = utils.ManagedThreadPoolExecutor(self.threads_count)

    def __str__(self):
        return 'MultiGet(r"%s", dest=r"%s")' % (self.url, self.dest)

    def __repr__(self):
        return "<MultiGet %s>" % (self.url)

    def add_basic_authentication(self, username, password):
        '''
        Uses HTTP Basic Access authentication for the connection.

        :param username: Username.
        :param password: Password.
        '''
        auth_string = '%s:%s' % (username, password)
        base64string = base64.standard_b64encode(auth_string.encode('utf-8'))
        self.headers['Authorization'] = b"Basic " + base64string

    def add_hash_verification(self, algorithm, hash):
        '''
        Adds hash verification to the download.

        :param algorithm: Hashing algorithm name (must be supported by hashlib).
        :param hash: Expected hash code.
        '''
        self.verify_hash = True
        self.hash_algorithm = algorithm
        self.hash_code = hash

    def fetch_hash_sums(self):
        '''
        Attempts to fetch UNIX hash sums files (`SHA256SUMS`, `SHA1SUMS` or
        `MD5SUMS` in the same url directory) and calls
        `self.add_hash_verification` on the first matching entry.
        '''
        default_sums_filenames = ['SHA256SUMS', 'SHA1SUMS', 'MD5SUMS']
        folder = os.path.dirname(self.url)
        orig_basename = os.path.basename(self.url)

        self.logger.debug("Looking for SUMS files...")
        for filename in default_sums_filenames:
            try:
                sums_url = "%s/%s" % (folder, filename)
                obj = urllib2.urlopen(sums_url)
                data = obj.read().split('\n')
                obj.close()

                for line in data:
                    if orig_basename.lower() in line.lower():
                        self.logger.debug("Found a matching hash in %s" % sums_url)
                        # NOTE(review): rstrip('SUMS') strips a *set* of
                        # trailing characters; it happens to produce the
                        # right algorithm name for these three filenames.
                        algo = filename.rstrip('SUMS')
                        hash = line.split(' ')[0]
                        self.add_hash_verification(algo, hash)
                        return
            except urllib2.HTTPError:
                continue

    def start(self, blocking=None):
        '''
        Starts the download task. Raises `RuntimeError` unless status is
        "ready".

        :param blocking: If true, blocks until the download is finished.
        '''
        if not self.status == "ready":
            raise RuntimeError("cannot start (current status is %s)" % self.status)

        if blocking is None:
            blocking = self._start_func_blocking
        else:
            self._start_func_blocking = blocking

        if self.mirrors:
            self.logger.debug('One URL and %d mirrors are loaded.' % len(self.mirrors))
        else:
            self.logger.debug('One URL is loaded.')

        if self.verify_hash and os.path.exists(self.dest):
            with open(self.dest, 'rb') as f:
                hash = hashlib.new(self.hash_algorithm, f.read()).hexdigest()
                if hash == self.hash_code:
                    # Destination already holds the expected content; skip.
                    self.logger.debug("Destination '%s' already exists, and the hash matches. No need to download." % self.dest)
                    self.status = 'finished'
                    return

        self.logger.debug("Downloading '%s' to '%s'..." % (self.url, self.dest))
        req = urllib2.Request(self.url, headers=self.headers)
        try:
            urlObj = urllib2.urlopen(req, timeout=self.timeout)
        except (urllib2.HTTPError, urllib2.URLError), e:
            self.errors.append(e)
            if self.mirrors:
                # Retry transparently with the next mirror.
                self.logger.debug("%s. Trying next mirror..." % unicode(e))
                self.url = self.mirrors.pop(0)
                self.start(blocking)
                return
            else:
                self.logger.warning(unicode(e))
                # NOTE(review): e was already appended above, so get_errors()
                # will contain it twice here — looks unintended; confirm.
                self.errors.append(e)
                self._failed = True
                self.status = "finished"
                raise

        try:
            self.filesize = int(urlObj.headers["Content-Length"])
            self.logger.debug("Content-Length is %d (%s)." % (self.filesize, utils.sizeof_human(self.filesize)))
        except (IndexError, KeyError):
            self.logger.warning("Server did not send Content-Length. Filesize is unknown.")
            self.filesize = 0

        args = _calc_chunk_size(self.filesize, self.threads_count, self.minChunkFile)
        self.actual_threads_used = len(args)
        bytes_per_thread = args[0][1] - args[0][0]
        if len(args) > 1:
            self.logger.debug("Launching %d threads (downloads %s/Thread)." % (len(args), utils.sizeof_human(bytes_per_thread)))
        else:
            self.logger.debug("Launching 1 thread.")

        self.status = "downloading"
        # Each chunk is fetched into its own ".NNN" part file next to dest.
        for i, arg in enumerate(args):
            req = self.pool.submit(
                download,
                self.url,
                self.dest + ".%.3d" % i,
                arg[0],
                arg[1],
                copy.deepcopy(self.headers),
                self.timeout,
                self.shared_var,
                self.thread_shared_cmds)

        self.post_threadpool_thread = threading.Thread(
            target=post_threadpool_actions,
            args=(self.pool,
                  [[(self.dest + ".%.3d" % i) for i in range(len(args))], self.dest],
                  self.filesize,
                  self))
        self.post_threadpool_thread.daemon = True
        self.post_threadpool_thread.start()

        self.control_thread = ControlThread(self)

        if blocking:
            self.wait(raise_exceptions=True)