def _batch_download(self, uris, local_path=None, throttle=0):
    """Downloads packages from the specified uris.

    This is a W.I.P!!!

    Args:
        uris (list of strings) - Uris of the packages to download.
        local_path (string) - Full path where the package is to be saved.
            Do not include a file name.
        throttle (int) - Number of kilobytes to throttle the bandwidth by.
            If throttle == 0, throttling is disabled.

    Returns:
        True if the packages downloaded successfully. False otherwise.
    """
    success = False

    if throttle != 0:
        throttle *= 1024

    for uri in uris:
        try:
            if local_path:
                name = uri.split('/')[-1]
                if '?' in name:
                    name = name.split('?')[0]
                path = os.path.join(local_path, name)
                urlgrab(uri, filename=path, throttle=throttle)
            else:
                urlgrab(uri, throttle=throttle)
            success = True
        except Exception as e:
            logger.exception(e)

    return success
def run(self, force=False):
    """
    Download bootloader content for all of the latest bootloaders, since the
    user has chosen to not supply their own.  You may ask "why not get this
    from yum", though Fedora has no IA64 repo, for instance, and we also want
    this to be able to work on Debian and further do not want folks to have to
    install a cross compiler.  For those that don't like this approach they
    can still source their cross-arch bootloader content manually.
    """
    content_server = "http://mdehaan.fedorapeople.org/loaders"
    dest = "/var/lib/cobbler/loaders"

    files = (
        ("%s/README" % content_server, "%s/README" % dest),
        ("%s/COPYING.elilo" % content_server, "%s/COPYING.elilo" % dest),
        ("%s/COPYING.yaboot" % content_server, "%s/COPYING.yaboot" % dest),
        ("%s/COPYING.syslinux" % content_server, "%s/COPYING.syslinux" % dest),
        ("%s/elilo-3.8-ia64.efi" % content_server, "%s/elilo-ia64.efi" % dest),
        ("%s/yaboot-1.3.14-12" % content_server, "%s/yaboot" % dest),
        ("%s/pxelinux.0-3.61" % content_server, "%s/pxelinux.0" % dest),
        ("%s/menu.c32-3.61" % content_server, "%s/menu.c32" % dest),
    )

    self.logger.info("downloading content required to netboot all arches")
    for f in files:
        src = f[0]
        dst = f[1]
        if os.path.exists(dst) and not force:
            self.logger.info("path %s already exists, not overwriting existing content, use --force if you wish to update" % dst)
            continue
        self.logger.info("downloading %s to %s" % (src, dst))
        urlgrabber.urlgrab(src, dst)

    return True
def fetch(self):
    """Return value: Fetched file's full path.."""

    # import urlgrabber module
    try:
        import urlgrabber
    except ImportError:
        raise FetchError(_('Urlgrabber needs to be installed to run this command'))

    if not self.url.filename():
        raise FetchError(_('Filename error'))

    if not os.access(self.destdir, os.W_OK):
        raise FetchError(_('Access denied to write to destination directory: "%s"') % (self.destdir))

    if os.path.exists(self.archive_file) and not os.access(self.archive_file, os.W_OK):
        raise FetchError(_('Access denied to destination file: "%s"') % (self.archive_file))

    try:
        urlgrabber.urlgrab(self.url.get_uri(),
                           self.partial_file,
                           progress_obj=UIHandler(self.progress),
                           http_headers=self._get_http_headers(),
                           ftp_headers=self._get_ftp_headers(),
                           proxies=self._get_proxies(),
                           throttle=self._get_bandwith_limit(),
                           reget=self._test_range_support(),
                           user_agent='PiSi Fetcher/' + pisi.__version__)
    except urlgrabber.grabber.URLGrabError, e:
        raise FetchError(_('Could not fetch destination file "%s": %s') % (self.archive_file, e))
def fetchHTMLFiles(clubDict, league, season='2016'):
    # create HTML base directory
    dir = os.path.dirname(baseDirname)
    if not os.path.exists(dir):
        os.makedirs(dir)

    # create league directory inside HTML directory
    dir = os.path.dirname(baseDirname + league + '/')
    if not os.path.exists(dir):
        os.makedirs(dir)

    # create season directory inside league directory
    dir = os.path.dirname(baseDirname + league + '/' + season + '/')
    if not os.path.exists(dir):
        os.makedirs(dir)

    print "[File Conqueror] Getting HTML for league: %s\tseason: %s" % (league, season)

    url = constants.urls[league]['baseUrl'] + constants.urls[league]['urlPrefix'] + season + constants.urls[league]['urlSuffix']
    filename = baseDirname + league + '/' + season + '/' + 'playerStats.html'

    try:
        urlgrabber.urlgrab(url, filename, retries=5)
    except Exception, e:
        time.sleep(60)
        urlgrabber.urlgrab(url, filename, retries=5)
        print "Exception occurred!", e
        print "URL: ", url
        pass
def srpm_from_ticket(self):
    '''Retrieve the latest srpmURL from the bugzilla URL.'''
    try:
        bugzillaURL = self.checklist.properties['ticketURL'].value
    except KeyError:
        # No ticket URL was given, set nothing
        return
    if not bugzillaURL:
        # No ticket URL was given, set nothing
        return

    data = urlgrabber.urlread(bugzillaURL)
    srpmList = re.compile('"((ht|f)tp(s)?://.*?\.src\.rpm)"', re.IGNORECASE).findall(data)
    if srpmList == []:
        # No SRPM was found.  Just decide not to set anything.
        return

    # Set the srpm to the last SRPM listed on the page
    srpmURL = srpmList[-1][0]
    if not srpmURL:
        # No srpm found.  Just decide not to set anything.
        return

    # Download the srpm to the temporary directory.
    urlgrabber.urlgrab(srpmURL, self.tmpDir)

    # Fill the SRPMfile properties with the srpm in the temp directory
    self.checklist.properties['SRPMfile'].value = (
        self.tmpDir + os.path.basename(srpmURL))
def __download_prop_file(self):
    """ download prop file and validate """
    # retry 3 times download prop file
    for _ in range(3):
        try:
            sotimeout = float(pylons.config['download_thread_sotimeout'])
            proxies = json.loads(pylons.config['urlgrabber_proxies'])
            urlgrabber.urlgrab(self.__uriDict['propUri'],
                               self.__uriDict['propPath'],
                               keepalive=0,
                               timeout=sotimeout,
                               proxies=proxies)
            break
        except Exception:
            randsleep = randint(30, 60)
            time.sleep(randsleep)

    if not os.path.exists(self.__uriDict['propPath']):
        raise AgentException(
            Errors.DC_MISSING_PROP_FILE,
            'Prop file (%s) does not exist' % (self.__uriDict['propPath']))

    if not PackageUtil.validateProp(self.__uriDict['propPath']):
        raise AgentException(
            Errors.DC_MISSING_PROP_FILE,
            'Prop file (%s) failed validation' % (self.__uriDict['propPath']))
def download_file(uri, dl_path, throttle):
    if uri.startswith('https://api.github.com/'):
        # TODO: handle 200 and 302 response
        headers = (("Accept", "application/octet-stream"),)
        urlgrab(uri, filename=dl_path, throttle=throttle, http_headers=headers)
    else:
        urlgrab(uri, filename=dl_path, throttle=throttle)
def _download_file(self, filename):
    url = "%s%s" % (self.EPF_FULL_URL % (self.username, self.password), filename)
    urlgrab(url, "%s/%s" % (self.target_dir, filename),
            progress_obj=text_progress_meter(),
            reget="simple",
            retry=0)
def downloadPackage(self):
    # download the package
    urlgrabber.urlgrab(self.testPkgUri, self.localPkgName)
    urlgrabber.urlgrab(self.testPkgUri + '.prop', self.localPkgName + '.prop')
    LOG.debug('localpackagename = %s', self.localPkgName)
    assert os.path.exists(self.localPkgName + '.prop')
    assert os.path.exists(self.localPkgName)
def get_neg_images(url, neg_image_folder):
    images_urls = urllib.urlopen(url).read().decode('utf8')
    pic_num = 1
    for iurl in images_urls.split('\n'):
        try:
            print(iurl)
            urlgrab(iurl, neg_image_folder + "/" + str(pic_num) + ".jpg", timeout=20)
            img = cv2.imread(neg_image_folder + "/" + str(pic_num) + ".jpg", cv2.IMREAD_GRAYSCALE)
            resize_img = cv2.resize(img, (100, 100))
            cv2.imwrite(neg_image_folder + "/" + str(pic_num) + ".jpg", resize_img)
            pic_num += 1
        except Exception as e:
            print str(e)
def page_download(page_url, folder):
    page = urllib2.urlopen(page_url)
    soup = BeautifulSoup(page)
    print len(soup.find_all("a", {"class": "next"}))
    for src in soup.find_all('img'):
        if src.get('src').endswith(sfx):
            tgt_url = str(src.get('src').replace('small', 'big'))
            print "saving : " + tgt_url
            tgt_name = os.path.basename(tgt_url)
            try:
                urlgrabber.urlgrab(tgt_url, "./" + folder + "/" + tgt_name,
                                   progress_obj=urlgrabber.progress.TextMeter())
            except urlgrabber.grabber.URLGrabError as detail:
                print "Error occurred: " + str(detail)
def proses():
    # first link
    link = sys.argv[1]
    buka = urllib2.urlopen(link)
    cari = re.compile('a href="(.*)" class="dbtn"')
    dapat = re.findall(cari, buka.read())

    # download link
    baru = urllib2.urlopen(dapat[0])
    lagi = re.compile('var flvLink = \'(.*)\'')
    final = re.findall(lagi, baru.read())

    prog = urlgrabber.progress.text_progress_meter()
    urlgrabber.urlgrab(final[0], sys.argv[2], progress_obj=prog)
def fetch_image_files(layer, opts):
    if opts.layer:
        path = str(opts.layer)
        if not opts.test and not os.path.isdir(path):
            os.makedirs(path)
    else:
        path = "."
    for image in layer["images"]:
        filetype = image["url"].split(".")[-1]
        target = os.path.join(path, image["hash"] + "." + filetype)
        if opts.test:
            print >>sys.stderr, image["url"], "->", target
        else:
            meter = urlgrabber.progress.text_progress_meter()
            urlgrabber.urlgrab(image["url"], target, progress_obj=meter)
def read_kickstart(path):
    """Parse a kickstart file and return a KickstartParser instance.

    This is a simple utility function which takes a path to a kickstart file,
    parses it and returns a pykickstart KickstartParser instance which can
    be then passed to an ImageCreator constructor.

    If an error occurs, a CreatorError exception is thrown.
    """
    version = ksversion.makeVersion()
    ks = ksparser.KickstartParser(version)
    try:
        ksfile = urlgrabber.urlgrab(path)
        ks.readKickstart(ksfile)
    # Fallback to e.args[0] is a workaround for bugs in urlgrabber and pykickstart.
    except IOError as e:
        raise errors.KickstartError("Failed to read kickstart file "
                                    "'%s' : %s" % (path, e.strerror or e.args[0]))
    except kserrors.KickstartError as e:
        raise errors.KickstartError("Failed to parse kickstart file "
                                    "'%s' : %s" % (path, e))
    return ks
def fetch_jetty(self):
    """Download the requested version of Jetty"""
    if path.exists(self.home):
        return
    url = self.node.config.get('jetty', 'REPO') + self.version + "/jetty-distribution-" + self.version + ".tar.gz"
    if not path.exists(self.cachedir):
        os.makedirs(self.cachedir)
    f = tempfile.mktemp(prefix='jetty-' + self.version + '-', suffix='.tar.gz')
    try:
        print("Downloading Jetty from " + url)
        meter = urlgrabber.progress.TextMeter()
        urlgrabber.urlgrab(url, filename=f, progress_obj=meter)
        subprocess.check_call(["tar", "-x", "-C", self.cachedir, "-f", f])
    finally:
        os.remove(f)
    os.rename(path.join(self.cachedir, 'jetty-distribution-' + self.version), self.home)
def updateLocalDb():
    try:
        if urlgrabber.urlgrab(self.remote_db, self.local_db) == self.local_db:
            updateLocalSum()
            return True
    except urlgrabber.grabber.URLGrabError:
        return False
def __init__(self):
    data = StringIO.StringIO(urlgrabber.urlread("http://itunes.com/version"))
    stream = gzip.GzipFile(fileobj=data)
    data = stream.read()
    updates = plistlib.readPlistFromString(data)
    devs = self.findPods()
    for (dev, name, family, firmware) in devs:
        if not family:
            family, firmware = self.getIPodData(dev)
        print "Found %s with family %s and firmware %s" % (name, family, firmware)
        if updates["iPodSoftwareVersions"].has_key(unicode(family)):
            uri = updates["iPodSoftwareVersions"][unicode(family)]["FirmwareURL"]
            print "Latest firmware: %s" % uri
            print "Fetching firmware..."
            path = urlgrabber.urlgrab(
                uri,
                progress_obj=urlgrabber.progress.text_progress_meter(),
                reget="check_timestamp")
            print "Extracting firmware..."
            zf = zipfile.ZipFile(path)
            for name in zf.namelist():
                if name[:8] == "Firmware":
                    print "Firmware found."
                    outfile = open("Firmware", "wb")
                    outfile.write(zf.read(name))
                    outfile.close()
                    infile = open("Firmware", "rb")
                    outfile = open(dev, "wb")
                    # FIXME: do the following in pure python?
                    print "Making backup..."
                    commands.getoutput("dd if=%s of=Backup" % dev)
                    print "Uploading firmware..."
                    commands.getoutput("dd if=Firmware of=%s" % dev)
                    print "Done."
def fetchRemoteFile(self, archive_file):
    try:
        urlgrab(self.url.get_uri(),
                archive_file,
                progress_obj=UIHandler(self.progress),
                http_headers=self._get_http_headers(),
                ftp_headers=self._get_ftp_headers(),
                proxies=self._get_proxies(),
                throttle=self._get_bandwith_limit(),
                reget=self._test_range_support(archive_file),
                copy_local=1,
                user_agent='PISI Fetcher/' + pisi.__version__)
    except grabber.URLGrabError, e:
        raise FetchError(
            _('Could not fetch destination file "%s": %s') % (self.url.get_uri(), e))
def grab(url, filename, timeout=120, retry=5, proxy=None, ftpmode=False):
    print "Grabbing", url

    def grab_fail_callback(data):
        # Only print debug here when non fatal retries, debug in other cases
        # is already printed
        if (data.exception.errno in retrycodes) and (data.tries != data.retry):
            print "grabbing retry %d/%d, exception %s" % (
                data.tries, data.retry, data.exception)

    try:
        retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
        if 12 not in retrycodes:
            retrycodes.append(12)
        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))
        downloaded_file = urlgrabber.urlgrab(
            url, filename, timeout=timeout, retry=retry,
            retrycodes=retrycodes,
            progress_obj=SimpleProgress(),
            failure_callback=grab_fail_callback,
            copy_local=True,
            proxies=proxy,
            ftp_disable_epsv=ftpmode)
        if not downloaded_file:
            return False
    except urlgrabber.grabber.URLGrabError as e:
        warn('URLGrabError %i: %s' % (e.errno, e.strerror))
        if os.path.exists(filename):
            os.unlink(filename)
        return False
    return True
def getRemoteURL(url):
    logger.info('downloading %s' % url)
    start = time.time()
    try:
        fileName = urlgrabber.urlgrab(url, config.localOSMPath)
        fileSize = os.path.getsize(fileName)
    except Exception, e:
        logger.warning('urlgrabber: %s' % e.strerror)
        return(None)
def archive_downloader(i):
    list_name = i[0]
    year = i[1]
    month = i[2]
    if not list_name or not year or not month:
        return
    basename = "{0}-{1}.txt.gz".format(year, month)
    filename = "http://lists.fedoraproject.org/pipermail/{0}/{1}".format(list_name, basename)
    try:
        urlgrabber.urlgrab(filename)
        pos = str(months.index(month) + 1)
        if len(pos) == 1:
            pos = "0{0}".format(pos)
        newname = "{0}-{1}-{2}-{3}.txt".format(list_name, year, pos, month)
        with open(newname, "w") as f:
            f.write(gzip.open(basename).read())
        print "== {0} downloaded ==".format(filename)
    except urlgrabber.grabber.URLGrabError:
        pass
def fetch_image_files(client, bbox, opts):
    # if opts.layer:
    #     path = str(opts.layer)
    #     if not opts.test and not os.path.isdir(path):
    #         os.makedirs(path)
    # else:
    files = []
    args = {"archive": "true"} if opts.source else {}
    for image in client.images_by_bbox(bbox, **args):
        target = image.path.split("/")[-1]
        if opts.dest:
            meter = urlgrabber.progress.text_progress_meter()
            target = os.path.join(opts.dest, target)
            print >>sys.stderr, image.path, "->", target
            urlgrabber.urlgrab(str(image.path), target, progress_obj=meter)
        else:
            print >>sys.stderr, image.path, "->", target
        files.append(target)
    return files
def make_floppy(autoinst):
    (fd, floppy_path) = tempfile.mkstemp(suffix='.floppy', prefix='tmp', dir="/tmp")
    print("- creating floppy image at %s" % floppy_path)

    # create the floppy image file
    cmd = "dd if=/dev/zero of=%s bs=1440 count=1024" % floppy_path
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("dd failed")

    # vfatify
    cmd = "mkdosfs %s" % floppy_path
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("mkdosfs failed")

    # mount the floppy
    mount_path = tempfile.mkdtemp(suffix=".mnt", prefix='tmp', dir="/tmp")
    cmd = "mount -o loop -t vfat %s %s" % (floppy_path, mount_path)
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("mount failed")

    # download the autoinst file onto the mounted floppy
    print("- downloading %s" % autoinst)
    save_file = os.path.join(mount_path, "unattended.txt")
    urlgrabber.urlgrab(autoinst, filename=save_file)

    # umount
    cmd = "umount %s" % mount_path
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("umount failed")

    # return the path to the completed disk image to pass to virt-install
    return floppy_path
def make_floppy(kickstart):
    (fd, floppy_path) = tempfile.mkstemp(suffix='.floppy', prefix='tmp', dir="/tmp")
    print("- creating floppy image at %s" % floppy_path)

    # create the floppy image file
    cmd = "dd if=/dev/zero of=%s bs=1440 count=1024" % floppy_path
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("dd failed")

    # vfatify
    cmd = "mkdosfs %s" % floppy_path
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("mkdosfs failed")

    # mount the floppy
    mount_path = tempfile.mkdtemp(suffix=".mnt", prefix='tmp', dir="/tmp")
    cmd = "mount -o loop -t vfat %s %s" % (floppy_path, mount_path)
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("mount failed")

    # download the kickstart file onto the mounted floppy
    print("- downloading %s" % kickstart)
    save_file = os.path.join(mount_path, "unattended.txt")
    urlgrabber.urlgrab(kickstart, filename=save_file)

    # umount
    cmd = "umount %s" % mount_path
    print("- %s" % cmd)
    rc = os.system(cmd)
    if not rc == 0:
        raise InfoException("umount failed")

    # return the path to the completed disk image to pass to virt-install
    return floppy_path
def fetch(self):
    """Return value: Fetched file's full path.."""

    # import urlgrabber module
    try:
        import urlgrabber
    except ImportError:
        raise FetchError(
            _('Urlgrabber needs to be installed to run this command'))

    if not self.url.filename():
        raise FetchError(_('Filename error'))

    if not os.access(self.destdir, os.W_OK):
        raise FetchError(
            _('Access denied to write to destination directory: "%s"') % (self.destdir))

    if os.path.exists(self.archive_file) and not os.access(self.archive_file, os.W_OK):
        raise FetchError(
            _('Access denied to destination file: "%s"') % (self.archive_file))

    try:
        urlgrabber.urlgrab(
            self.url.get_uri(),
            self.partial_file,
            progress_obj=UIHandler(self.progress),
            http_headers=self._get_http_headers(),
            ftp_headers=self._get_ftp_headers(),
            proxies=self._get_proxies(),
            throttle=self._get_bandwith_limit(),
            reget=self._test_range_support(),
            copy_local=1,
            retry=3,       # retry 3 times
            timeout=120,   # Reduce from default of 5 minutes to 2 minutes
            user_agent='eopkg Fetcher/' + pisi.__version__)
    except urlgrabber.grabber.URLGrabError, e:
        raise FetchError(
            _('Could not fetch destination file "%s": %s') % (self.url.get_uri(), e))
def download_rpms(pkg, outdir):
    """
    TBD.

    :param pkg: A dict that contains basic RPM information other than url
    :param outdir: Where to save RPM[s]
    """
    url = RS.call("packages.getPackageUrl", [pkg["id"]], ["--no-cache"])[0]
    logging.info("RPM URL: " + url)
    return urlgrabber.urlgrab(url, os.path.join(outdir, os.path.basename(url)))
def _process_patch(self, patch):
    url = urlparse.urlparse(patch.file)
    src_dir = self.build_dir('SOURCES')
    if not url.scheme:
        filename = os.path.basename(url.path)
        path = os.path.join(self.base_path, filename)
        dest = os.path.join(src_dir, filename)
        if not os.path.exists(dest) or not os.path.samefile(path, dest):
            shutil.copyfile(path, dest)
    else:
        filename = url.path.rsplit('/', 1)[-1]
        dest = os.path.join(src_dir, filename)
        urlgrabber.urlgrab(patch.file, dest)
    return Patch(filename, patch.options)
def fetchHTMLFiles(clubDict, league, season='15'):
    # create HTML directory
    dir = os.path.dirname(baseDirname)
    if not os.path.exists(dir):
        os.makedirs(dir)

    # create league directory inside HTML directory
    dir = os.path.dirname(baseDirname + league + '/')
    if not os.path.exists(dir):
        os.makedirs(dir)

    # create season directory inside league directory
    dir = os.path.dirname(baseDirname + league + '/' + season + '/')
    if not os.path.exists(dir):
        os.makedirs(dir)

    for clubName, clubId in clubDict.iteritems():
        print "[File Getter] Getting HTML for club: %s\tleague: %s\tseason: 20%s" % \
            (clubName, league, season)

        url = baseURL + str(clubId)
        filename = baseDirname + league + '/' + season + '/' + clubName + '_' + str(clubId)
        if (season != '15'):
            url = baseURL + str(clubId) + '?saison_id=20' + season
        # because of different season schedule seasons are shifted for one number in MLS...
        if (league == 'MajorLeagueSoccer'):
            url = baseURL + str(clubId) + '?saison_id=' + str(int('20' + season) - 1)

        try:
            urlgrabber.urlgrab(url, filename, retries=5)
        except Exception, e:
            time.sleep(60)
            urlgrabber.urlgrab(url, filename, retries=5)
            print "Exception occurred!", e
            print "URL: ", url
            pass
def get_images(active=True, outdir='player_images', outlist='player_names.csv'):
    import bs4, urlgrabber, httplib

    if active:
        list = 'http://stats.nba.com/frags/stats-site-page-players-directory-active.html'
    else:
        list = 'http://stats.nba.com/players.html'

    # prepare player list
    flist = open(outlist, 'w')
    flist.write('# name\n')

    # fetch and parse the NBA player list
    player_page = urlgrabber.urlread(list)
    soup = bs4.BeautifulSoup(player_page)

    # loop through the player list
    for p in soup('a', 'playerlink'):
        phref = str(p['href'])
        ## exclude "historical" players
        #if (len(phref.split('HISTADD')) == 1):
        # verify that player pages exist
        pname = phref.split('/')[-1]
        conn = httplib.HTTPConnection('i.cdn.turner.com')
        conn.request('HEAD', '/nba/nba/.element/img/2.0/sect/statscube/players/large/' + pname + '.png')
        if (conn.getresponse().status != 404):
            # download and save player images
            img_link = 'http://i.cdn.turner.com/nba/nba/.element/img/2.0/sect/statscube/players/large/' + pname + '.png'
            urlgrabber.urlgrab(img_link, filename=outdir + '/' + pname + '.png')
            # write player names to list
            flist.write(pname + '\n')

    # close name list
    flist.close()
    return
def download(url, progress=False):
    """ Download the document pointed to by url to cwd """
    filename = get_filename(url)
    if os.path.exists(filename):
        info(filename + " already exists in cwd. Not downloading. ")
    else:
        debug("Downloading " + url)
        if progress:
            import urlgrabber
            from urlgrabber.progress import text_progress_meter
            urlgrabber.urlgrab(url=url, filename=filename, progress_obj=text_progress_meter())
        else:
            urllib.urlretrieve(url=url, filename=filename)
        debug("Finished Downloading " + filename)
    return filename
def get_mediaproducts(self):
    """
    Return path to media.1/products file if available

    :returns: str
    """
    media_products_path = os.path.join(self._get_repodata_path(), 'media.1/products')
    try:
        (s, b, p, q, f, o) = urlparse(self.url)
        if p[-1] != '/':
            p = p + '/'
        p = p + 'media.1/products'
    except (ValueError, IndexError, KeyError) as e:
        return None
    url = urlunparse((s, b, p, q, f, o))
    try:
        urlgrabber_opts = {}
        self.set_download_parameters(urlgrabber_opts, url, media_products_path)
        urlgrabber.urlgrab(url, media_products_path, **urlgrabber_opts)
    except Exception as exc:
        # no 'media.1/products' file found
        return None
    return media_products_path
def read_kickstart(path):
    """Parse a kickstart file and return a KickstartParser instance.

    This is a simple utility function which takes a path to a kickstart file,
    parses it and returns a pykickstart KickstartParser instance which can
    be then passed to an ImageCreator constructor.

    If an error occurs, a CreatorError exception is thrown.
    """
    version = ksversion.makeVersion()
    ks = ksparser.KickstartParser(version)
    try:
        ksfile = urlgrabber.urlgrab(path)
        ks.readKickstart(ksfile)
    # Fallback to e.args[0] is a workaround for bugs in urlgrabber and pykickstart.
    except IOError, e:
        raise errors.KickstartError("Failed to read kickstart file "
                                    "'%s' : %s" % (path, e.strerror or e.args[0]))
    return ks
def sanity_check_repodata(myurl):
    """
    Sanity check the repodata for a given repository.
    Initial implementation by Seth Vidal.
    """
    myurl = str(myurl)
    tempdir = tempfile.mkdtemp()
    errorstrings = []
    if myurl[-1] != '/':
        myurl += '/'
    baseurl = myurl
    if not myurl.endswith('repodata/'):
        myurl += 'repodata/'
    else:
        baseurl = baseurl.replace('repodata/', '/')

    rf = myurl + 'repomd.xml'
    try:
        rm = urlgrabber.urlopen(rf)
        repomd = repoMDObject.RepoMD('foo', rm)
        for t in repomd.fileTypes():
            data = repomd.getData(t)
            base, href = data.location
            if base:
                loc = base + '/' + href
            else:
                loc = baseurl + href

            destfn = tempdir + '/' + os.path.basename(href)
            dest = urlgrabber.urlgrab(loc, destfn)
            ctype, known_csum = data.checksum
            csum = checksum(ctype, dest)
            if csum != known_csum:
                errorstrings.append("checksum: %s" % t)

            if href.find('xml') != -1:
                decompressed = decompress(dest)
                retcode = subprocess.call(
                    ['/usr/bin/xmllint', '--noout', decompressed])
                if retcode != 0:
                    errorstrings.append("failed xml read: %s" % t)
    except urlgrabber.grabber.URLGrabError, e:
        errorstrings.append('Error accessing repository %s' % e)
def get_file(self, path, local_base=None):
    try:
        try:
            temp_file = ""
            if local_base is not None:
                target_file = os.path.join(local_base, path)
                target_dir = os.path.dirname(target_file)
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir, int('0755', 8))
                temp_file = target_file + '..download'
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
                downloaded = urlgrabber.urlgrab(path, temp_file)
                os.rename(downloaded, target_file)
                return target_file
            else:
                return urlgrabber.urlread(path)
        except urlgrabber.URLGrabError:
            return
    finally:
        if os.path.exists(temp_file):
            os.unlink(temp_file)
def get_file(self, path, local_base=None):
    try:
        try:
            temp_file = ""
            try:
                if not urlparse(path).scheme:
                    (s, b, p, q, f, o) = urlparse(self.url)
                    if p[-1] != '/':
                        p = p + '/'
                    p = p + path
                    path = urlunparse((s, b, p, q, f, o))
            except (ValueError, IndexError, KeyError) as e:
                return None
            if local_base is not None:
                target_file = os.path.join(local_base, path)
                target_dir = os.path.dirname(target_file)
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir, int('0755', 8))
                temp_file = target_file + '..download'
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
                urlgrabber_opts = {}
                self.set_download_parameters(urlgrabber_opts, path, temp_file)
                downloaded = urlgrabber.urlgrab(path, temp_file, **urlgrabber_opts)
                os.rename(downloaded, target_file)
                return target_file
            else:
                urlgrabber_opts = {}
                self.set_download_parameters(urlgrabber_opts, path)
                return urlgrabber.urlread(path, **urlgrabber_opts)
        except urlgrabber.grabber.URLGrabError:
            return
    finally:
        if os.path.exists(temp_file):
            os.unlink(temp_file)
try:
    os.makedirs(os.path.dirname(destfile))
except OSError:
    pass

if self.callbacks.has_key(name):
    self.callbacks[name]()

opos = self.pos[name]
while True:
    sourceurl = self.__createSourceURI(uri, name)
    print sourceurl, destfile, self.headers[name]
    try:
        if force:
            f = urlgrab(sourceurl, destfile, timeout=30.0,
                        copy_local=copy_local,
                        http_headers=self.headers[name],
                        ssl_ca_cert='/usr/share/rhn/RHNS-CA-CERT')
        else:
            f = urlgrab(sourceurl, destfile, timeout=30.0,
                        reget='check_timestamp',
                        copy_local=copy_local,
                        http_headers=self.headers[name],
                        ssl_ca_cert='/usr/share/rhn/RHNS-CA-CERT')
    except Exception, e:
        # urlgrab fails with invalid range for already completely
        # transfered files, pretty strange to me to be honest... :)
        if e[0] == 9:
            f = destfile
def main(argv):
    """Download utility to simplify the download of USPTO patent data

    USPTO patent applications are currently hosted by Google. In most cases, you
    will be interested in all patents from a specific year or which lie in a
    relevant period of time. Since downloading each compressed file separately is
    cumbersome, this download utility might help you.

    This tool offers three basic operations:
    (1) -d Downloads the relevant files one at a time; might be slow.
    (2) -f Lists all available hyperlinks pointing to zip files, and stores them
           in year-based text files. This is suitable for all that want to use
           their own download utility (e.g. parallelise the downloads).
    (3) -p Prints all links found to zip files to the standard out

    Usage: python uspto-patent-downloader.py [options]

    Options:
      -d .., --download      downloads each zip file found using 'url'
      -f .., --files         writes all relevant links found into files; one file for each year
      -h, --help             shows this help
      -l .., --loglevel ...  determines the log level (INFO, DEBUG, ..)
      -o .., --out ...       specifies the output directory (default: './uspto-files')
      -p, --print            prints all relevant links found to the standard out
                             (this option is selected per default if '-f' is missing)
      -u .., --url ...       url to the USPTO patent applications bibliography hosted by Google
                             (default: http://www.google.com/googlebooks/uspto-patents-applications-biblio.html)
      -y .., --years ...     comma separated list of years (e.g. '2002,2004') to consider for
                             download (default: all years are considered from 2001 to now)

    Examples:
      uspto-patent-downloader.py -list > links-to-download.txt
      uspto-patent-downloader.py -u http://www.google.com/googlebooks/uspto-patents-applications-biblio.html -f
      uspto-patent-downloader.py -years 2001,2002
      uspto-patent-downloader.py -f -y 2003 -out .
    """
    defaults = {
        'uspto_url': 'http://www.google.com/googlebooks/uspto-patents-applications-biblio.html',
        'html_page': None,
        'requested_years': [],
        'write_to_stdout': False,
        'print_to_file': False,
        'download': False,
        'output_directory': '.'
    }
    validate_input(argv, defaults)

    if not defaults['html_page']:
        defaults['html_page'] = load_url(defaults['uspto_url'])
    html_page = defaults['html_page']
    requested_years = defaults['requested_years']
    write_to_stdout = defaults['write_to_stdout']
    print_to_file = defaults['print_to_file']
    download = defaults['download']
    output_directory = defaults['output_directory']

    soup = BeautifulSoup(html_page)
    links_per_year = defaultdict(list)
    links = soup.findAll('a', attrs={'href': re.compile('zip$')})
    logging.info(' found ' + str(len(links)) + ' links')
    for link in links:
        logging.debug(' . ' + link['href'])
        matched_year = re.search('/([0-9]{4})/', link['href'])
        if matched_year:
            links_per_year[matched_year.group(1)].append(link['href'])

    filtered_dict = links_per_year
    if requested_years:
        filtered_dict = {year: links_per_year[year] for year in requested_years}

    if write_to_stdout:
        for links in sorted(filtered_dict.itervalues()):
            for link in links:
                print link

    if print_to_file:
        makedirs(output_directory)
        for k, v in filtered_dict.iteritems():
            basename = k + '.txt'
            filename = output_directory + '/' + basename
            if os.path.isfile(filename):
                os.remove(filename)
            with open(filename, 'a') as text_file:
                for link in sorted(v):
                    text_file.write(link + '\n')
            logging.debug(' file ' + basename + ' written to disk')
        logging.info(' all files written to disk')

    if download:
        for year, links in filtered_dict.iteritems():
            makedirs(os.path.join(output_directory, year))
            for link in links:
                try:
                    filename = os.path.join(output_directory, year, link.split('/')[-1])
                    prog = urlgrabber.progress.text_progress_meter()
                    urlgrabber.urlgrab(str(link), filename, progress_obj=prog)
                except Exception, e:
                    logging.warn(' error while downloading %s: %s' % (link, e))
def fileDowload(url, full_destiny):
    try:
        print(time.strftime("%H:%M:%S") + ": Start downloading file - " + full_destiny)
        dataj = urlgrab(str(url), str(full_destiny), **opts)
    except Exception:
        print("cannot connect to the server ")
def test_urlgrab(self):
    "module-level urlgrab() function"
    _, outfile = tempfile.mkstemp()
    filename = urlgrabber.urlgrab('http://www.python.org', filename=outfile)
    os.unlink(outfile)
#!/usr/bin/env python
"""
Grabs all the sources for a given version number of elasticsearch
"""
import urlgrabber
from optparse import OptionParser

parser = OptionParser()
(options, args) = parser.parse_args()

print "Getting Main Source for version %s " % args[0]
urlgrabber.urlgrab('https://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-%s.tar.gz' % args[0])

print "Getting plugin-analysis-icu"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/analysis-icu/elasticsearch-analysis-icu-%s.zip' % args[0])

print "Getting plugin-cloud-aws"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/cloud-aws/elasticsearch-cloud-aws-%s.zip' % args[0])

print "Getting plugin-hadoop"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/hadoop/elasticsearch-hadoop-%s.zip' % args[0])

print "Getting plugin-lang-groovy"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/lang-groovy/elasticsearch-lang-groovy-%s.zip' % args[0])

print "Getting plugin-lang-javascript"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/lang-javascript/elasticsearch-lang-javascript-%s.zip' % args[0])

print "Getting plugin-lang-python"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/lang-python/elasticsearch-lang-python-%s.zip' % args[0])

print "Getting plugin-mapper-attachments"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/mapper-attachments/elasticsearch-mapper-attachments-%s.zip' % args[0])

print "Getting plugin-river-couchdb"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/river-couchdb/elasticsearch-river-couchdb-%s.zip' % args[0])

print "Getting plugin-river-rabbitmq"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/river-rabbitmq/elasticsearch-river-rabbitmq-%s.zip' % args[0])

print "Getting plugin-river-twitter"
urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/river-twitter/elasticsearch-river-twitter-%s.zip' % args[0])
# parsed_xml = xmlparse.parse(u2.urlopen(feed_url))
try:
    all_downloads = [item.findtext('link')
                     for item in xmlparse.parse(u2.urlopen(feed_url)).iterfind('channel/item')]
except xmlparse.ParseError:
    print "Error: invalid RSS feed. Quitting ..."
    exit(0)
except URLError as e:
    print str(e)
    exit(0)
except ValueError as e:
    print str(e)
    exit(0)
except KeyError as e:
    print str(e)
    exit(0)

print all_downloads

# downloading
for single_download in all_downloads:
    print "Starting: " + single_download
    g = ug.grabber.URLGrabber(reget='simple', retry=2)
    response = ug.urlgrab(single_download)
    print "Completed: " + single_download
base_filename = os.path.basename(filename)
base_upstream_filename = os.path.basename(files[filename])
modname = os.path.splitext(base_upstream_filename)[0]
if not os.path.exists(modname):
    os.mkdir(modname)
    sp = open(os.path.join('..', filename), 'r')
    sp_sum = sha1(sp.read()).hexdigest()
    sp_sum_line = '%s\n' % sp_sum
    record = open('%s/record' % modname, 'w')
    record.writelines(sp_sum_line)
    record.close()
os.chdir(modname)
urlgrabber.urlgrab(files[filename])
sp = open(base_upstream_filename)
sp_sum = sha1(sp.read()).hexdigest()
sp_sum_line = '%s\n' % sp_sum
record = open('record', 'r')
if sp_sum_line in record:
    os.remove(base_upstream_filename)
    os.chdir('..')
    continue
retrieved_at = int(time.time())
new_name = '%s-%s' % (retrieved_at, base_upstream_filename)
os.rename(base_upstream_filename, new_name)
print('New %s found: %s' % (base_upstream_filename, new_name))
from urlgrabber import urlgrab

url = 'http://i1.letvimg.com/vrs/201204/05/c3671b2ca6be47c6bcdb4d32e24f60ab.jpg'
try:
    filename = urlgrab(url, '/tmp/' + 'image')
    print('download %s ok' % filename)
except Exception as (errno, strerr):
    print('download failed - ERRNO: %d ERR INFO: %s ' % (errno, strerr))
def __startDownload(self):
    """ actual download logic """
    try:
        LOG.info("Starting package download for package %s" % self.__uriDict['package'])

        # check to see if there's an in progress file,
        # since PackageMgr guarantees that duplicate threads will not be spawned
        # for same pkg, assume an existing thread was killed.
        # attempt to clean up package n move
        if (os.path.exists(self.__uriDict['inProgressPackagePath'])):
            LOG.debug(
                'In progress file (%s) already exists. Will validate and reattempt download if necessary'
                % self.__uriDict['inProgressPackagePath'])

        if os.path.exists(self.__uriDict['packagePath']):
            if (os.path.exists(self.__uriDict['propPath']) and
                    PackageUtil.validateProp(self.__uriDict['propPath']) and
                    PackageUtil.validatePackage(self.__uriDict['packagePath'],
                                                self.__uriDict['propPath'])):
                msg = 'The package already exists. Will NOT download duplicate package' + self.__uriDict['packagePath']
                LOG.info(msg)
                os.utime(self.__uriDict['packagePath'], None)
                os.utime(self.__uriDict['propPath'], None)
                self._updateStatus(progress=100)
                # NOTE: this is a normal exit not an error!
                return
            LOG.warning(
                'The package already exists. However package prop (%s) failed validation. Downloading package.'
                % self.__uriDict['propPath'])

        # Delete all traces of package before beginning download
        LOG.info('Cleaning up all packages for %s ' % self.__uriDict['packagePath'])
        PackageUtil.cleanUpPackage(self.__uriDict['inProgressPackagePath'],
                                   self.__uriDict['packagePath'],
                                   self.__uriDict['propPath'])

        AgentThread._updateProgress(self, 0)

        if not self.__skipProp:
            # First, download .prop file
            LOG.info('Starting download of prop file %s - %s' %
                     (self.__uriDict['propUri'], self.__uriDict['propPath']))
            self.__download_prop_file()
            try:
                self.__prop = loadPropFile(self.__uriDict['propPath'])
            except FileNotFoundError:
                raise AgentException(
                    Errors.DC_MISSING_PROP_FILE,
                    'Prop file (%s) unable to read or did not parse' % (self.__uriDict['propPath']))

        AgentThread._updateProgress(self, 2)
        self.__setProgressTimeouts()

        if self.__uriDict['scheme'] == 'http':
            # try download 3 times, with random sleep
            for _ in range(3):
                try:
                    sotimeout = float(pylons.config['download_thread_sotimeout'])
                    proxies = json.loads(pylons.config['urlgrabber_proxies'])
                    urlgrabber.urlgrab(
                        self.__uriDict['uri'],
                        self.__uriDict['inProgressPackagePath'],
                        checkfunc=None if self.__skipProp else (PackageUtil.validateDownload, (), {}),
                        progress_obj=DownloadProgress(self),
                        throttle=float(pylons.config['package_throttle']),
                        bandwidth=int(pylons.config['package_bandwidth']),
                        keepalive=0,
                        timeout=sotimeout,
                        proxies=proxies)
                    break
                except Exception as exc:
                    msg = 'Download error %s - %s' % (str(exc), traceback.format_exc(3))
                    LOG.warning(msg)
                    randsleep = randint(30, 60)
                    time.sleep(randsleep)
        else:
            # oops! only http and bittorrent supported now
            raise AgentException(
                Errors.DC_UNSUPPORTED_PROTOCOL,
                'Only http protocols is supported at the moment')

        self._checkStop()

        if not self.__skipProp:
            if (not PackageUtil.validatePackage(self.__uriDict['inProgressPackagePath'],
                                                self.__uriDict['propPath'])):
                raise AgentException(
                    Errors.DC_FAILED_VALIDATE,
                    'Package ' + self.__uriDict['packagePath'] + ' failed validation')
            os.utime(self.__uriDict['propPath'], None)
            utils.rchmod(self.__uriDict['propPath'], "777", 'no')

        LOG.info('Download complete, will now rename and do validation on this file %s'
                 % self.__uriDict['packagePath'])
        os.rename(self.__uriDict['inProgressPackagePath'], self.__uriDict['packagePath'])
        os.utime(self.__uriDict['packagePath'], None)
        utils.rchmod(self.__uriDict['packagePath'], "777", 'no')
        LOG.info("Download complete, Validation completed, updating progress to 100")
        self._updateStatus(progress=100)

    except AgentException, exc:
        self._updateStatus(httpStatus=500, progress=0, error=exc.getCode(), errorMsg=exc.getMsg())
        msg = 'Download error %s - %s' % (str(exc), traceback.format_exc(3))
        LOG.error(msg)
        raise exc
def test_urlgrab(self):
    "module-level urlgrab() function"
    outfile = tempfile.mktemp()
    filename = urlgrabber.urlgrab('http://abat.au.example.com', filename=outfile)
    os.unlink(outfile)
#!/usr/bin/env python
"""
Grabs all the sources for a given version number of elasticsearch
"""
import urlgrabber
from optparse import OptionParser

parser = OptionParser()
(options, args) = parser.parse_args()

print "Getting Main Source for version %s " % args[0]
urlgrabber.urlgrab(
    'https://github.com/downloads/elasticsearch/elasticsearch/elasticsearch-%s.tar.gz' % args[0])

#Things we don't want:
#print "Getting plugin-lang-python"
#urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/lang-python/elasticsearch-lang-python-%s.zip' % args[0])
#print "Getting plugin-lang-groovy"
#urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/lang-groovy/elasticsearch-lang-groovy-%s.zip' % args[0])
#print "Getting plugin-river-couchdb"
#urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/river-couchdb/elasticsearch-river-couchdb-%s.zip' % args[0])
#print "Getting plugin-river-rabbitmq"
#urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/river-rabbitmq/elasticsearch-river-rabbitmq-%s.zip' % args[0])
#print "Getting plugin-river-twitter"
#urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/river-twitter/elasticsearch-river-twitter-%s.zip' % args[0])
#print "Getting plugin-river-wikipedia"
#urlgrabber.urlgrab('http://elasticsearch.googlecode.com/svn/plugins/river-wikipedia/elasticsearch-river-wikipedia-%s.zip' % args[0])

print "Getting plugin-lang-javascript"
urlgrabber.urlgrab(
    'http://elasticsearch.googlecode.com/svn/plugins/lang-javascript/elasticsearch-lang-javascript-%s.zip' % args[0])
def _get_mirror_list(self, repo, url):
    mirrorlist_path = os.path.join(repo.root, 'mirrorlist.txt')
    returnlist = []
    content = []
    try:
        urlgrabber_opts = {}
        self.set_download_parameters(urlgrabber_opts, url, mirrorlist_path)
        urlgrabber.urlgrab(url, mirrorlist_path, **urlgrabber_opts)
    except Exception as exc:
        # no mirror list found continue without
        return returnlist

    def _replace_and_check_url(url_list):
        goodurls = []
        skipped = None
        for url in url_list:
            # obvious bogons get ignored b/c, we could get more interesting checks but <shrug>
            if url in ['', None]:
                continue
            try:
                # This started throwing ValueErrors, BZ 666826
                (s, b, p, q, f, o) = urlparse(url)
                if p[-1] != '/':
                    p = p + '/'
            except (ValueError, IndexError, KeyError) as e:
                s = 'blah'
            if s not in ['http', 'ftp', 'file', 'https']:
                skipped = url
                continue
            else:
                goodurls.append(urlunparse((s, b, p, q, f, o)))
        return goodurls

    try:
        with open(mirrorlist_path, 'r') as mirrorlist_file:
            content = mirrorlist_file.readlines()
    except Exception as exc:
        self.error_msg("Could not read mirrorlist: {}".format(exc))

    try:
        # Try to read a metalink XML
        for files in etree.parse(mirrorlist_path).getroot():
            file_elem = files.find(METALINK_XML + 'file')
            if file_elem.get('name') == 'repomd.xml':
                _urls = file_elem.find(METALINK_XML + 'resources').findall(METALINK_XML + 'url')
                for _url in _urls:
                    # The mirror urls in the metalink file are for repomd.xml so it
                    # gives a list of mirrors for that one file, but we want the list
                    # of mirror baseurls. Joy of reusing other people's stds. :)
                    if not _url.text.endswith("/repodata/repomd.xml"):
                        continue
                    returnlist.append(_url.text[:-len("/repodata/repomd.xml")])
    except Exception as exc:
        # If no metalink XML, we try to read a mirrorlist
        for line in content:
            if re.match('^\s*\#.*', line) or re.match('^\s*$', line):
                continue
            mirror = re.sub('\n$', '', line)  # no more trailing \n's
            (mirror, count) = re.subn('\$ARCH', '$BASEARCH', mirror)
            returnlist.append(mirror)

    returnlist = _replace_and_check_url(returnlist)

    try:
        # Write the final mirrorlist that is going to be pass to Zypper
        with open(mirrorlist_path, 'w') as mirrorlist_file:
            mirrorlist_file.write(os.linesep.join(returnlist))
    except Exception as exc:
        self.error_msg("Could not write the calculated mirrorlist: {}".format(exc))

    return returnlist
def download_file(self, uri, lhash, fsize, local_path=None, throttle=0):
    """Downloads a package from the specified uri.

    Args:
        uri (string): Uri of the file to download.
        lhash (string): Expected hash of the downloaded file, if any.
        fsize (int): Expected file size, used when no hash is given.
        local_path (string): Full path where the package is to be saved.
            Do not include a file name.
        throttle (int): Number of kilobytes to throttle the bandwidth by.
            If throttle == 0, throttling is disabled.

    Returns:
        Tuple of (success, hash_status, fsize_match).
    """
    # urlgrab doesn't like unicode.
    uri = str(uri)
    if not lhash:
        lhash = ''

    success = False
    hash_status = 'not verified'
    fsize_match = False
    path = ''

    if throttle != 0:
        throttle *= 1024

    try:
        if local_path and len(uri) > 0:
            name = uri.split('/')[-1]
            if '?' in name:
                name = name.split('?')[0]
            path = os.path.join(local_path, name)
            urlgrab(uri, filename=path, throttle=throttle)
        elif len(uri) > 0 and not local_path:
            path = urlgrab(uri, throttle=throttle)
    except Exception as e:
        logger.exception(e)

    if os.path.exists(path):
        if len(lhash) > 0:
            hash_match = hash_verifier(orig_hash=lhash, file_path=path)
            if hash_match['pass']:
                hash_status = 'verified'
                fsize_match = True
                success = True
        elif fsize and len(lhash) < 1:
            if os.path.getsize(path) == fsize:
                hash_status = 'no hash'
                fsize_match = True
                success = True

    return (success, hash_status, fsize_match)