def register(self, cycle=None, since=None):
    if since:
        since = datetime.strptime(since, "%Y-%m-%d")
    else:
        since = self._dbinvent.lastupdate

    # create a list of files which need to be (re)registered
    if self.updated:
        files = self.updated
    else:
        slurplogger().info("Listing files to process (this can take a while)...")
        if cycle:
            files = [UriFile(file) for file in findFiles(
                os.path.join(self._dbinvent.datadir, self.sat, self.phase, "c%03d" % (cycle)),
                r'.*\.nc$', since=since)]
        else:
            files = [UriFile(file) for file in findFiles(
                os.path.join(self._dbinvent.datadir, self.sat, self.phase),
                r'.*\.nc$', since=since)]

    if not files:
        slurplogger().info("No updated files found")
        return

    newfiles = self.retainnewUris(files)
    if not newfiles:
        slurplogger().info("Nothing to update")
        return

    for uri in newfiles:
        meta = radsMetaDataExtractor(uri)
        if not meta:
            # don't register empty entries
            continue
        self.addEntry(meta)

    self.updateInvent()

def download(self): """Download file""" muri = Uri(self.mopts) #check if download is needed muri.requestInfo() uristacked = UriFile(self.mopts.fullname()) if uristacked.lastmod: if muri.lastmod <= uristacked.lastmod: slurplogger().info("Already downloaded %s" % (uristacked.url)) #quick return when there is no need to merge/download return uristacked, False #check if download is allowed kb, maxkb = muri.updateSize() if kb > maxkb: #split up request and try again #create 2 bounding boxes split on time Abbox, Bbbox = muri.opts.btdbox.timeSplit() AmotuRec = MotuRecursive(copy.deepcopy(self.mopts)) AmotuRec.mopts.syncbtdbox(Abbox) AmotuRec.mopts.out_name = self.mopts.out_name.replace( '.nc', '_A.nc') AmotuRec.mopts.out_dir = AmotuRec.mopts.cache BmotuRec = MotuRecursive(copy.deepcopy(self.mopts)) BmotuRec.mopts.syncbtdbox(Bbbox) BmotuRec.mopts.out_name = self.mopts.out_name.replace( '.nc', '_B.nc') BmotuRec.mopts.out_dir = BmotuRec.mopts.cache Auri, Aupd = AmotuRec.download() Buri, Bupd = BmotuRec.download() #possible improvement here split a dataset at an unlimited dimensions and append the second one to the first one #patch files together (if updated) if Aupd or Bupd or not os.path.exists(self.mopts.fullname()): uristacked, upd = stackNcFiles(self.mopts.fullname(), Auri.url, Buri.url, 'time') if not self.keepfiles: #remove the partial files os.remove(AmotuRec.mopts.fullname()) os.remove(BmotuRec.mopts.fullname()) else: uristacked = UriFile(self.mopts.fullname()) upd = False return uristacked, True else: return muri.download(self.mopts.out_dir, check=True)
def register(self, pattern=None): """Register static gravity fields donwloaded in the data director :param pattern: only register files whose filename obeys this regular expression """ if not pattern: pattern = '.*\.gz' #create a list of files which need to be (re)registered if self.updated: files = self.updated else: files = [ UriFile(file) for file in findFiles(self.dataDir(), pattern) ] #loop over files for uri in files: urilike = os.path.basename(uri.url) if not self.uriNeedsUpdate(urilike, uri.lastmod): continue meta = icgemMetaExtractor(uri) self.addEntry(meta) self.updateInvent()
def pull(self, pattern='.*'):
    """Pulls the Easy CORA files from the Copernicus FTP server and unpacks them

    :param pattern (string): only download data which obeys this regular expression file pattern
        (e.g. 20[0-9][0-9] to download from 2000 onward)
    """
    ftproot = "ftp://my.cmems-du.eu/Core/INSITU_GLO_TS_REP_OBSERVATIONS_013_001_b/CORIOLIS-GLOBAL-EasyCORA-OBS/global"

    # get cmems authentication details from the database
    cred = self.conf.authCred("cmems")
    ftpcr = ftpCrawler(ftproot, auth=cred, pattern=pattern)

    updated = ftpcr.parallelDownload(self.cacheDir(), check=True, maxconn=10, continueonError=True)

    # unpack the downloaded files in the data directory
    datadir = self.dataDir()
    for tarf in [UriFile(f) for f in findFiles(self.cacheDir(), ".*tgz$")]:
        successfile = os.path.join(datadir, os.path.basename(tarf.url) + ".isextracted")
        try:
            # only unpack when needed: check whether this archive was already extracted
            if os.path.exists(successfile):
                slurplogger().info(f"{tarf.url} is already extracted, skipping")
            else:
                with tarfile.open(tarf.url, "r:gz") as tf:
                    slurplogger().info(f"Extracting trajectory files from {tarf.url}")
                    tf.extractall(datadir)
                # touch the success file to indicate this archive has been successfully extracted
                Path(successfile).touch()
        except tarfile.ReadError as exc:
            raise exc

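# Hedged usage sketch (illustrative only): "EasyCora" stands for whatever dataset class owns the
# pull()/register() methods shown in this section, and "dbcon" for an initialized database connection.
#
#   ec = EasyCora(dbcon)
#   ec.pull(pattern="20(19|2[0-2])")  # restrict the FTP crawl to files from 2019-2022
#   ec.register()
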
def register(self, center=None):
    """Register downloaded combined prof files"""
    # create a list of files which need to be (re)registered
    if self.updated:
        files = self.updated
    else:
        slurplogger().info("Building file list..")
        files = [UriFile(file) for file in findFiles(self.dataDir(), '.*nc', self._dbinvent.lastupdate)]

    if len(files) == 0:
        slurplogger().info("Argo: No new files found since last update")
        return

    filesnew = self.retainnewUris(files)
    if len(filesnew) == 0:
        slurplogger().info("Argo: No database update needed")
        return

    # loop over files
    for uri in filesnew:
        if center and not re.search(center, uri.url):
            continue
        meta = argoMetaExtractor(uri)
        if meta:
            self.addEntry(meta)

    self.updateInvent()

def download(self, direc, check=False, gzip=False, outfile=None, continueonError=False):
    if not self.webdav:
        self.connect()

    if outfile:
        outf = os.path.join(direc, outfile)
    else:
        outf = os.path.join(direc, self.fname)

    uri = UriFile(url=outf)
    if check and self.lastmod and uri.lastmod:
        if self.lastmod <= uri.lastmod:
            # no need to download the file
            slurplog.info("Already Downloaded, skipping %s" % (uri.url))
            return uri, False

    slurplog.info("Downloading %s" % (uri.url))
    self.webdav.download(self.fname, uri.url)
    # change the modification and access time to that provided by the server
    setFtime(uri.url, self.lastmod)
    return uri, True

def download(self, direc, check=False, gzip=False, outfile=None):
    # check whether the file exists and retrieve the last update date
    if outfile:
        self.opts.out_name = outfile

    self.opts.out_dir = direc
    fout = os.path.join(direc, self.opts.out_name)
    uri = UriFile(fout)

    if check and os.path.exists(fout):
        self.updateModTime()
        if self.lastmod <= uri.lastmod:
            slurplogger().info("No need to download file %s" % (fout))
            return uri, False

    slurplogger().info("Downloading %s" % (fout))
    try:
        execute_request(self.opts)
    except Exception as e:
        slurplogger().error("failed to download file %s", e)
        raise

    return uri, True

def retainnewUris(self, urilist):
    """Filters those uris whose table entries are either too old or not present in the database"""
    # create a temporary table with uri and last modification time entries
    cols = [
        Column('id', Integer, primary_key=True),
        Column('uri', String),
        Column('lastmod', TIMESTAMP)
    ]

    # set up a separate session and transaction in order to work with a temporary table
    trans, ses = self.db.transsession()

    tmptable = self.db.createTable('tmpuris', cols, temporary=True, bind=ses.get_bind())

    # fill the table with the file list and last modification timestamps
    count = 0
    for uri in urilist:
        entry = tmptable(uri=uri.url, lastmod=uri.lastmod)
        ses.add(entry)
        count += 1
        if count > self.commitperN:
            ses.commit()
            count = 0
    ses.commit()

    # delete all entries which require updating:
    # first gather the ids of the entries which are expired
    subqry = ses.query(self.table.id).join(
        tmptable,
        and_(tmptable.uri == self.table.uri,
             tmptable.lastmod > self.table.lastupdate)).subquery()

    # then delete those entries from the table
    delqry = self.table.__table__.delete().where(self.table.id.in_(subqry))
    ses.execute(delqry)

    # now make a list of new uris (those without a corresponding entry in the main table)
    qrynew = ses.query(tmptable).outerjoin(
        self.table, self.table.uri == tmptable.uri).filter(self.table.uri == None)

    # submit the transaction
    trans.commit()

    # return the entries which need (re)registration in the original table
    return [UriFile(x.uri, x.lastmod) for x in qrynew]

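# For clarity, the transaction above effectively issues the following two statements
# (a PostgreSQL-flavoured sketch; <table> stands for the table mapped by self.table and
# tmpuris is the temporary table filled from urilist):
#
#   DELETE FROM <table> WHERE id IN (
#       SELECT t.id FROM <table> AS t
#       JOIN tmpuris ON tmpuris.uri = t.uri AND tmpuris.lastmod > t.lastupdate);
#
#   SELECT tmpuris.* FROM tmpuris
#       LEFT OUTER JOIN <table> AS t ON t.uri = tmpuris.uri
#       WHERE t.uri IS NULL;
#
# i.e. stale entries are dropped from the main table and only uris without a remaining
# entry are returned for (re)registration.
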
def register(self):
    slurplogger().info("Building file list..")
    files = [UriFile(file) for file in findFiles(self.dataDir(), '.*gz', self._dbinvent.lastupdate)]

    filesnew = self.retainnewUris(files)
    if len(filesnew) == 0:
        slurplogger().info("GRDC: No database update needed")
        return

    # loop over files
    for uri in filesnew:
        meta = GRDCmetaExtractor(uri)
        self.addEntry(meta)

    self.updateInvent()

def register(self, pattern=r'.*\.nc$'):
    """Register downloaded trajectory files from CORA

    :param pattern (string): file pattern to look for (defaults to all files ending with .nc)
    """
    # create a list of files which need to be (re)registered
    newfiles = self.retainnewUris([UriFile(file) for file in findFiles(self.dataDir(), pattern)])

    for uri in newfiles:
        meta = coraMetaExtractor(uri)
        if not meta:
            # don't register empty entries
            continue
        self.addEntry(meta)

    self._dbinvent.data["Description"] = "EasyCora output data table"
    self._dbinvent.data["CORAversion"] = "5.2"
    self.updateInvent()

def register(self):
    slurplogger().info("Building file list..")
    files = [UriFile(file) for file in findFiles(self.cacheDir(), '.*love', self._dbinvent.lastupdate)]

    if len(files) == 0:
        slurplogger().info("LLove: No new files found since last update")
        return

    self.truncateTable()

    # loop over files
    for uri in files:
        self.addEntry(lloveMetaExtractor(uri))

    self.updateInvent()

def parallelDownload(self, outdir, check=False):
    updated = []
    if check:
        cmd = ['rsync', '-avz', '--del', '--update', self.auth.user + "@" + self.rooturl, outdir]
    else:
        cmd = ['rsync', '-avz', '--del', self.auth.user + "@" + self.rooturl, outdir]

    for file in self.startrsync(cmd):
        updated.append(UriFile(file))

    return updated

def register(self):
    slurplogger().info("Building file list..")
    files = [UriFile(file) for file in findFiles(self.dataDir(), '.*love', self._dbinvent.lastupdate)]

    if len(files) == 0:
        slurplogger().info("LLove: No new files found since last update")
        return

    filesnew = self.retainnewUris(files)
    if len(filesnew) == 0:
        slurplogger().info("LLove: No database update needed")
        return

    # loop over files
    for uri in filesnew:
        self.addEntry(lloveMetaExtractor(uri))

    self.updateInvent()

def register(self):
    # create a list of files which need to be (re)registered
    if self.updated:
        files = self.updated
    else:
        files = [UriFile(file) for file in findFiles(self._dbinvent.datadir, '.*gfc.gz', since=self._dbinvent.lastupdate)]

    newfiles = self.retainnewUris(files)

    # loop over files
    for uri in newfiles:
        slurplogger().info("extracting meta info from %s" % (uri.url))
        meta = icgemMetaExtractor(uri)
        meta = enhanceMeta(meta)
        self.addEntry(meta)

    self.updateInvent()

def register(self):
    if not self.table:
        # create a new table on the fly
        self.createTable(self.columns)

    # create a list of files which need to be (re)registered
    newfiles = self.retainnewUris([
        UriFile(file) for file in findFiles(self.dataDir(), rf".*\{self.app}$")
    ])

    for uri in newfiles:
        meta = self.metaExtractor(uri)
        if not meta:
            # don't register empty entries
            continue
        slurplogger().info(f"Adding metadata from {uri.url}")
        self.addEntry(meta)

    self._dbinvent.data["Description"] = self.description
    self.updateInvent()

def register(self):
    # create a list of files which need to be (re)registered
    if self.updated:
        files = self.updated
    else:
        files = [UriFile(file) for file in findFiles(self.dataDir(), r'G.*\.gz', self._dbinvent.lastupdate)]

    filesnew = self.retainnewUris(files)

    # loop over the newer files
    for uri in filesnew:
        meta = graceMetaExtractor(uri)
        self.addEntry(meta)

    self.updateInvent()

def download(self, direc, check=False, outfile=None, continueonError=False, restdict=None):
    """Download the file into a directory and possibly check the modification time

    :param check: only download when the remote file is newer than the local copy
    :param continueonError (bool): don't raise an exception when a download error occurs
    """
    # set up the output uri
    if outfile:
        outf = os.path.join(direc, self.subdirs, outfile)
    else:
        outf = os.path.join(direc, self.subdirs, os.path.basename(self.url))

    # create the directory if it does not exist
    if not os.path.exists(os.path.dirname(outf)):
        os.makedirs(os.path.dirname(outf), exist_ok=True)

    uri = UriFile(url=outf)
    if check and self.lastmod and uri.lastmod:
        if self.lastmod <= uri.lastmod:
            # no need to download the file
            slurplog.info("Already Downloaded, skipping %s" % (uri.url))
            return uri, False

    slurplog.info("Downloading %s" % (uri.url))
    stat = self.sftpconnection.stat(self.rpath)
    mtime = datetime.fromtimestamp(stat.st_mtime)
    self.sftpconnection.get(self.rpath, outf)
    # set the modification time to match the server
    setFtime(outf, mtime)
    return uri, True

def parallelDownload(self, outdir, check=False, includes=None, dryrun=False):
    updated = []
    cmd = ['rsync', '-avz', '--del']
    if check:
        cmd.append('--update')
    if dryrun:
        cmd.append('--dry-run')
    if includes:
        cmd.extend([f'--include={inc}' for inc in includes])
        # exclude everything else which does not obey the include filters
        cmd.append('--exclude=*')
    cmd.append(self.auth.user + "@" + self.rooturl)
    cmd.append(outdir)

    for file in self.startrsync(cmd):
        updated.append(UriFile(os.path.join(outdir, file)))

    return updated

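# Illustrative example of the command assembled above (values are made up): with check=True and
# includes=["2019*", "2020*"] the call boils down to roughly
#
#   rsync -avz --del --update --include=2019* --include=2020* --exclude=* <user>@<rooturl> <outdir>
#
# The trailing --exclude=* is what turns the include list into a whitelist: rsync evaluates
# filter rules in order, so anything not matched by an --include is excluded.
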
def register(self, rundir=None, pattern=r'.*\.nc$'):
    """Register netcdf output files

    :param rundir: directory where the netcdf files reside
    :param pattern: regular expression which the netcdf files must obey (defaults to all files ending with .nc)
    """
    if not rundir:
        raise RuntimeError("A directory/regex with output data needs to be supplied when registering this dataset")

    newfiles = self.retainnewUris([UriFile(file) for file in findFiles(rundir, pattern)])

    for uri in newfiles:
        meta = orasMetaExtractor(uri)
        if not meta:
            # don't register empty entries
            continue
        self.addEntry(meta)

    self._dbinvent.data["Description"] = "ORAS5 output data table"
    self.setDataDir(os.path.abspath(rundir))
    self._dbinvent.data["grid"] = "025"
    self.updateInvent()

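# Hedged usage sketch ("oras" stands for an instance of the owning dataset class; the path and
# regex are illustrative): register only the temperature output of a particular ORAS5 run directory.
#
#   oras.register(rundir="/data/oras5/opa0", pattern=r'.*votemper.*\.nc$')
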
def register(self): """ Register all downloaded fronts (in text files)""" slurplogger().info("Building file list..") files = [ UriFile(file) for file in findFiles(self.cacheDir(), '.*txt', self._dbinvent.lastupdate) ] if len(files) == 0: slurplogger().info( "Orsifronts: No new files found since last update") return #possibly empty table self.truncateTable() #loop over files for uri in files: slurplogger().info("adding %s" % (uri.url)) self.addEntry(orsiMetaExtractor(uri)) self.updateInvent()
def register(self, pattern=None):
    if not pattern:
        pattern = r'.*\.gz'

    # create a list of files which need to be (re)registered
    if self.updated:
        files = self.updated
    else:
        files = [UriFile(file) for file in findFiles(self.dataDir(), pattern)]

    # loop over files
    for uri in files:
        urilike = os.path.basename(uri.url)
        if not self.uriNeedsUpdate(urilike, uri.lastmod):
            continue
        meta = icgemMetaExtractor(uri)
        self.addEntry(meta)

    self.updateInvent()

def pull(self, name=None, wsne=None, tstart=None, tend=None):
    """Pulls a subset of a gridded dataset as netcdf from a motu enabled server
    This routine calls the internal routines of the motuclient python client
    :param name: Name of the output dataset (file will be named 'name.nc')
    :param wsne: bounding box of the section of interest as [West,South,North,East]
    :param tstart: start date (as yyyy-mm-dd) for the extraction
    :param tend: end date (as yyyy-mm-dd) for the extraction
    """
    if not name:
        raise RuntimeError("A name must be supplied to MotuGridsBase.pull !!")

    if None in wsne:
        raise RuntimeError("Please supply a geographical bounding box")

    try:
        bbox = BtdBox(w=wsne[0], n=wsne[2], s=wsne[1], e=wsne[3], ts=tstart, te=tend)
    except Exception:
        raise RuntimeError("Invalid bounding box provided to Duacs pull")

    cred = self.conf.authCred(self.authalias)
    ncout = os.path.join(self.dataDir(), name + ".nc")

    mOpts = MotuOpts(moturoot=self.moturoot,
                     service=self.motuservice,
                     product=self.motuproduct,
                     btdbox=bbox,
                     fout=ncout,
                     cache=self.cacheDir(),
                     variables=self.variables,
                     auth=cred)

    if bbox.isGMTCentered():
        # we need 2 downloads and a merge of the grids!
        # split the bounding box in two
        bboxleft, bboxright = bbox.lonSplit(0.0)
        bboxleft.to0_360()
        bboxright.to0_360()

        ncoutleft = os.path.join(self.cacheDir(), name + "_left.nc")
        mOptsleft = copy.deepcopy(mOpts)
        mOptsleft.syncbtdbox(bboxleft)
        mOptsleft.syncfilename(ncoutleft)

        MotuRecleft = MotuRecursive(mOptsleft)
        urileft, updleft = MotuRecleft.download()

        ncoutright = os.path.join(self.cacheDir(), name + "_right.nc")
        mOptsright = copy.deepcopy(mOpts)
        mOptsright.syncbtdbox(bboxright)
        mOptsright.syncfilename(ncoutright)

        MotuRecright = MotuRecursive(mOptsright)
        uriright, updright = MotuRecright.download()

        # patch the files together (only when one of the halves was updated or the output is missing)
        if updleft or updright or not os.path.exists(ncout):
            # change the longitude representation to -180..0 (without reshuffling the data)
            ncSwapLongitude(urileft.url)
            uri, upd = stackNcFiles(ncout, urileft.url, uriright.url, 'longitude')
        else:
            upd = False
            uri = UriFile(ncout)
    else:
        # we can handle this with a single recursive motu instance
        MotuRec = MotuRecursive(mOpts)
        uri, upd = MotuRec.download()

    if upd:
        self.updated.append(uri)
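
# Hedged usage sketch of MotuGridsBase.pull (instance name, bounding box and dates are
# illustrative; "dset" stands for a concrete subclass such as a Duacs grid dataset):
#
#   dset.pull(name="adt_natlantic",
#             wsne=[-80.0, 20.0, 65.0, 0.0],
#             tstart="2010-01-01", tend="2010-12-31")
#
# Because the western bound is negative, this example takes the GMT-centered branch above:
# two downloads in 0..360 representation, a longitude swap, and a merge into a single file.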