Example #1
    def download(self,
                 direc,
                 check=False,
                 gzip=False,
                 outfile=None,
                 continueonError=False):

        if not self.webdav:
            self.connect()

        if outfile:
            outf = os.path.join(direc, outfile)
        else:
            outf = os.path.join(direc, self.fname)

        uri = UriFile(url=outf)

        if check and self.lastmod and uri.lastmod:
            if self.lastmod <= uri.lastmod:
                #no need to download the file
                slurplog.info("Already Downloaded, skipping %s" % (uri.url))
                return uri, False

        slurplog.info("Downloading %s" % (uri.url))
        self.webdav.download(self.fname, uri.url)

        #change modification and access time to match those reported by the server
        setFtime(uri.url, self.lastmod)
        return uri, True
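For reference, the setFtime helper used above is assumed to simply stamp the local file with the server-side modification time; a minimal sketch of such a helper (an assumption, not the actual geoslurp implementation) could look like this:

import os

def set_file_time(path, lastmod):
    """Set the access and modification time of path to lastmod (a datetime), if given."""
    #assumed behaviour of the setFtime helper used in the snippet above
    if lastmod is not None:
        stamp = lastmod.timestamp()
        os.utime(path, (stamp, stamp))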
Example #2
    def register(self, df=None):
        """Update/populate a database table from a pandas compatible file) 
    """
        if df is not None:
            #supplying an existing dataframe takes precedence
            indf = df.copy(deep=False)
        elif self.ftype == "csv":
            indf = pd.read_csv(self.pdfile,
                               skipfooter=self.skipfooter,
                               encoding=self.encoding)
        elif self.ftype == "excel":
            indf = pd.read_excel(self.pdfile,
                                 skipfooter=self.skipfooter,
                                 engine="openpyxl")

        else:
            raise RuntimeError("Don't know how to open %s, specify ftype" %
                               (self.pdfile))

        #possibly modify the dataframe in a derived class
        indf = self.modify_df(indf)

        slurplog.info("Filling pandas table %s.%s" % (self.scheme, self.name))

        self.registerInDatabase(indf)

        #also update entry in the inventory table
        self.updateInvent()
Example #3
def FESOMMetaExtractor(uri):
    """Extract meta information from a FESOM output file"""
    slurplog.info("extracting data from %s" % (uri.url))

    try:
        ncFESOM = ncDset(uri.url)
    except OSError:
        slurplog.error("Cannot open netcdf file, skipping")
        return None
    tvar = ncFESOM["time"]

    if tvar.shape[0] == 0:
        #quick return
        return None

    if tvar.calendar == "noleap":
        slurplog.warning(
            "Found 'noleap' calendar string, but assuming 'standard'")
        cal = 'standard'
    else:
        cal = tvar.calendar

    #parse time
    time = num2date(tvar[:], tvar.units, cal, only_use_cftime_datetimes=False)
    # try to estimate the time step from the median
    deltamedian = np.median(np.diff(time))
    #use >= 28 so that a 28-day (February) median step is still classified as monthly
    if deltamedian.days >= 28 and deltamedian.days <= 32:
        freq = 'monthly'
        #set tstart to the beginning of the month
        tstart = datetime(time[0].year, time[0].month, 1)
    elif deltamedian.days >= 1 and deltamedian.days < 28:
        freq = "%ddaily" % (deltamedian.days)
        #remove the median time interval from the first time
        tstart = time[0] - deltamedian
    elif deltamedian.days < 1:
        freq = "%dhourly" % (deltamedian.seconds / 3600)
        #remove the median time interval from the first time
        tstart = time[0] - deltamedian

    data = {"variables": {}}

    for ky, var in ncFESOM.variables.items():
        try:
            data["variables"][ky] = var.description
        except AttributeError:
            data["variables"][ky] = ky

    meta = {
        "tstart": tstart,
        "tend": time[-1] + deltamedian,
        "lastupdate": uri.lastmod,
        "interval": freq,
        "uri": uri.url,
        "data": data
    }
    ncFESOM.close()
    return meta
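As an aside, the median-of-differences heuristic above can be tried in isolation; this self-contained sketch uses made-up monthly timestamps instead of a netCDF time variable:

from datetime import datetime

#hypothetical monthly time stamps
time = [datetime(2000, m, 16) for m in range(1, 7)]
diffs = sorted(b - a for a, b in zip(time[:-1], time[1:]))
deltamedian = diffs[len(diffs) // 2]  #plain median of the time steps
if 28 <= deltamedian.days <= 32:
    freq = "monthly"
elif 1 <= deltamedian.days < 28:
    freq = "%ddaily" % deltamedian.days
else:
    freq = "%dhourly" % (deltamedian.seconds / 3600)
print(freq)  #-> monthly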
Example #4
    def refresh(self, conf):
        """Refresh the dataset catalogue"""
        cachefile = self.getCacheFile(conf)
        slurplog.info("Refreshing cached catalogue %s" % cachefile)
        self.registerAllDataSets(conf)

        #load inventory of existing datasets (needed to resolve templated datasets)
        Inv = Inventory(conf.db)

        self.__catalogue__ = {"datasets": {}, "factories": {}, "functions": {}}

        #loop over the statically registered dataset classes
        for ds in self.__dsets__:
            name = ".".join([ds.scheme, ds.__name__])
            if re.search("TEMPLATE", name):
                #turn the templated name into a regular expression
                srch = name.replace(".", r"\.").replace("TEMPLATE", r"([^\s]+)")
                #possibly also add existing datasets so they can be found via regular expressions
                for entry in Inv:
                    nameexisting = ".".join([entry.scheme, entry.dataset])
                    if re.search(srch, nameexisting):
                        self.__catalogue__["datasets"][nameexisting] = {"template": name}
            self.__catalogue__["datasets"][name] = {"module": ds.__module__}
            self.__dscache__[name] = ds

        #loop over the dataset factories
        for dsfac in self.__dsetfac__:
            self.__catalogue__["factories"][dsfac.__name__] = {"module": dsfac.__module__}
            for ds in dsfac(conf):
                name = ".".join([ds.scheme, ds.__name__])
                if re.search("TEMPLATE", name):
                    #turn the templated name into a regular expression
                    srch = name.replace(".", r"\.").replace("TEMPLATE", r"([^\s]+)")
                    #also add already existing instances of templated datasets so they can be found via regular expressions
                    for entry in Inv:
                        nameexisting = ".".join([entry.scheme, entry.dataset])
                        if re.search(srch, nameexisting):
                            self.__catalogue__["datasets"][nameexisting] = {"template": name}

                self.__catalogue__["datasets"][name] = {"factory": dsfac.__name__}
                self.__dscache__[name] = ds

        #save the catalogue to a yaml cache file
        self.__catalogue__["lastupdate"] = datetime.now()
        slurplog.info("saving available Dataset catalogue to %s" % cachefile)
        with open(cachefile, 'wt') as fid:
            yaml.dump(self.__catalogue__, fid, default_flow_style=False)
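To illustrate the TEMPLATE-name matching used above, here is a small standalone check with hypothetical scheme and dataset names:

import re

name = "oceanobs.TEMPLATE_profiles"  #hypothetical templated dataset name
srch = name.replace(".", r"\.").replace("TEMPLATE", r"([^\s]+)")
#an existing dataset derived from the template matches the resulting pattern
print(bool(re.search(srch, "oceanobs.argo2021_profiles")))  #-> True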
Example #5
    def register(self, ds=None):
        """Update/populate a database table from a xarray compatible file or from a dataset directly)"""

        if self.xarfile != "" and ds is None:
            #note if a ds is explicitly provided it takes precedence over the data from xarfile
            try:
                ds = xr.open_dataset(self.xarfile)
            except Exception as exc:
                raise RuntimeError(f"Cannot open xarray file {self.xarfile}") from exc
        slurplog.info("Filling xarray table %s.%s" % (self.scheme, self.name))

        self.registerInDatabase(ds)

        #also update entry in the inventory table
        self.updateInvent()
Example #6
    def convert2zarr(self, tarname):

        #load latitude and longitude
        ddir = self.cacheDir()
        lonlat = loadmat(os.path.join(ddir, "Longitude.mat"))
        loadmat(os.path.join(ddir, "Latitude.mat"), mdict=lonlat)

        #start creating a basic xarray dataset holding the coordinate axes
        dsbase = xr.Dataset(coords=dict(lon=(["lon"], lonlat['Longitude'][0, :]),
                                        lat=(["lat"], lonlat['Latitude'][:, 0])))

        #NOTE: although the downloaded file carries a .rar extension, it is actually a tar archive

        tf = tarfile.open(tarname)
        appdim = None
        for mem in tf.getmembers():
            slurplog.info(f"Converting {mem.name} to zarr")
            mat = np.ma.masked_equal(loadmat(tf.extractfile(mem))['ETm'], 0.0)
            mat.set_fill_value(np.nan)

            #extract the time centered at the 15th of the month
            time = datetime.strptime(mem.name[5:11] + "15", "%Y%m%d")
            ds = dsbase.assign_coords(time=[time])
            ds["ETm"] = (["time", "lat", "lon"], np.expand_dims(mat, 0))
            #conversion factor from mm/month to kg m-2 s-1 (1 mm of water corresponds to 1 kg/m2)
            mmmon_kgsecm2 = 1 / (86400 * monthrange(time.year, time.month)[1])
            ds["ETm"] = ds.ETm * mmmon_kgsecm2
            #add CF attributes
            cfadd_global(
                ds,
                title="SEBSv2 Evapotranspiration estimates",
                references="https://agupubs.onlinelibrary.wiley.com/doi/full/10.1029/2020JD032873",
                source=f"Geoslurp class {self.__class__.__name__}")
            cfadd_standard_name(ds.ETm, "water_evapotranspiration_flux")
            # cfencode_time(ds.time)
            cfadd_coord(ds.lon, 'X', standard_name='longitude')
            cfadd_coord(ds.lat, 'Y', standard_name='latitude')
            if appdim:
                ds.to_zarr(self.xarfile, append_dim=appdim)  #append subsequent time slices
            else:
                #first slice: create/overwrite the zarr store, then append along time
                ds.to_zarr(self.xarfile, mode='w')
                appdim = "time"
Example #7
    def register(self):
        """Update/populate a database table from a pandas compatible file) 
    """
        if self.ftype == "csv":
            df=pd.read_csv(self.pdfile,skipfooter=self.skipfooter,encoding=self.encoding)
        elif self.ftype == "excel":
            df=pd.read_excel(self.pdfile,skipfooter=self.skipfooter)

        else:
            raise RuntimeError("Don't know how to open %s, specify ftype"%(self.pdfile))
            #possibly modify dataframe in derived class 
            
        slurplog.info("Filling pandas table %s.%s with data from %s" % (self.scheme, self.name, self.pdfile))
        
        df=self.modify_df(df)                
        df.to_sql(self.name,self.db.dbeng,schema=self.scheme, if_exists='replace', dtype=self.dtypes)


        #also update entry in the inventory table
        self.updateInvent()
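The df.to_sql call is standard pandas/SQLAlchemy; as a standalone illustration with an in-memory SQLite engine instead of the geoslurp database:

import pandas as pd
from sqlalchemy import create_engine, types

eng = create_engine("sqlite://")  #throwaway in-memory database
df = pd.DataFrame({"station": ["A", "B"], "height": [12.5, 3.1]})
df.to_sql("stations", eng, schema=None, if_exists="replace",
          dtype={"height": types.Float})
print(pd.read_sql("SELECT * FROM stations", eng))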
Example #8
def orasMetaExtractor(uri):
    """Extract meta information from a output file"""
    slurplog.info("extracting data from %s"%(uri.url))

    try:
        nc_id=ncDset(uri.url)
    except OSError:
        slurplog.error("Cannot open netcdf file, skipping")
        return None
    tvar=nc_id["time_counter"]

    if tvar.shape[0] == 0:
        #quick return 
        return None
    
    if tvar.calendar == "noleap":
        slurplog.warning("Note found 'noleap' calendar string but assuming 'standard'")
        cal='standard'
    else:
        cal=tvar.calendar

    #parse time
    time=num2date(tvar[:], tvar.units,cal,only_use_cftime_datetimes=False)

    data={"variables":{}}

    for ky,var in nc_id.variables.items():
        try:
            data["variables"][ky]=var.description
        except AttributeError:
            data["variables"][ky]=ky


    meta={"tstart":datetime(time[0].year,time[0].month,1),
          "lastupdate":uri.lastmod,
          "uri":uri.url,
          "data":data
          }
    nc_id.close()
    return meta
Example #9
def graceMetaExtractor(uri):
    """Extract meta information from a GRACE file"""
    buf = StringIO()
    with gzip.open(uri.url, 'rt') as fid:
        slurplog.info("Extracting info from %s" % (uri.url))
        for ln in fid:
            if '# End of YAML header' in ln:
                break
            else:
                buf.write(ln)
    hdr = yaml.safe_load(buf.getvalue())["header"]
    nonstand = hdr["non-standard_attributes"]

    meta = {
        "nmax": hdr["dimensions"]["degree"],
        "omax": hdr["dimensions"]["order"],
        "tstart": hdr["global_attributes"]["time_coverage_start"],
        "tend": hdr["global_attributes"]["time_coverage_end"],
        "lastupdate": uri.lastmod,
        "format": nonstand["format_id"]["short_name"],
        "gm": nonstand["earth_gravity_param"]["value"],
        "re": nonstand["mean_equator_radius"]["value"],
        "uri": uri.url,
        "type": nonstand["product_id"][0:3],
        "data": {
            "description": hdr["global_attributes"]["title"]
        }
    }

    #add tide system
    try:
        tmp = nonstand["permanent_tide_flag"]
        if re.search('inclusive', tmp):
            meta["tidesystem"] = "zero-tide"
        elif re.search('exclusive', tmp):
            meta["tidesystem"] = "tide-free"
    except KeyError:
        #no permanent tide flag present
        pass

    return meta
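The header extraction boils down to reading lines up to the end-of-YAML marker and feeding them to yaml.safe_load; a minimal standalone sketch with a hypothetical header string:

import yaml
from io import StringIO

#hypothetical GRACE-style header followed by non-YAML payload
txt = "header:\n  dimensions:\n    degree: 96\n    order: 96\n# End of YAML header\npayload..."
buf = StringIO()
for ln in txt.splitlines(keepends=True):
    if '# End of YAML header' in ln:
        break
    buf.write(ln)
hdr = yaml.safe_load(buf.getvalue())["header"]
print(hdr["dimensions"]["degree"])  #-> 96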
Example #10
    def download(self,
                 direc,
                 check=False,
                 outfile=None,
                 continueonError=False,
                 restdict=None):
        """Download file into directory and possibly check the modification time
        :param check : check whether the file needs updating
        :param gzip: additionally gzips the file (adds .gz to file name)
        :param continueonError (bool): don't raise an exception when a download error occurrs
        """

        #setup the output uri
        if outfile:
            outf = os.path.join(direc, self.subdirs, outfile)
        else:
            outf = os.path.join(direc, self.subdirs,
                                os.path.basename(self.url))

        #create directory if it does not exist
        if not os.path.exists(os.path.dirname(outf)):
            os.makedirs(os.path.dirname(outf), exist_ok=True)

        uri = UriFile(url=outf)
        if check and self.lastmod and uri.lastmod:
            if self.lastmod <= uri.lastmod:
                #no need to download the file
                slurplog.info("Already Downloaded, skipping %s" % (uri.url))
                return uri, False
        slurplog.info("Downloading %s" % (uri.url))

        stat = self.sftpconnection.stat(self.rpath)
        mtime = datetime.fromtimestamp(stat.st_mtime)
        self.sftpconnection.get(self.rpath, outf)
        #set the modification time to match the server
        setFtime(outf, mtime)

        return uri, True
Example #11
def cf_load(cf_convfile=None):
    if cf_convfile is None:
        cf_convfile = os.path.join(os.path.expanduser('~'),
                                   '.cf-conventions.yaml')
    if os.path.exists(cf_convfile):
        slurplog.info(f"Reading CF convention defaults from {cf_convfile}")
        with open(cf_convfile, 'r') as fid:
            cfconv = yaml.safe_load(fid)
    else:
        #create a new version
        resp = requests.get(
            "https://cfconventions.org/Data/cf-standard-names/79/src/cf-standard-name-table.xml"
        )
        user = os.environ["USER"]
        cfconv = {
            "Conventions": "CF-1.9",
            "institution": f"{user}@unknown, Institute, Country",
            "source": "geoslurp"
        }

        cf = minidom.parseString(resp.text)
        cfconv["standard_names"] = {
            un.parentNode.getAttribute("id"): {
                "units": un.firstChild.data
            }
            for un in cf.getElementsByTagName('canonical_units')
            if un.firstChild is not None
        }
        #cache the table to file
        slurplog.info(
            f"Writing CF convention defaults to {cf_convfile}, modify the file to add better defaults when writing files"
        )
        with open(cf_convfile, 'w') as fid:
            yaml.dump(cfconv, fid, default_flow_style=False)

    return cfconv
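Hypothetical usage, assuming the function above is importable: the returned mapping can be queried for the canonical units of a CF standard name.

cfconv = cf_load()
#look up the canonical units of a standard name (e.g. 'K' for air_temperature, if present)
units = cfconv["standard_names"].get("air_temperature", {}).get("units")
print(units)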
Example #12
    def download(self,
                 direc,
                 check=False,
                 gzip=False,
                 gunzip=False,
                 outfile=None,
                 continueonError=False,
                 restdict=None):
        """Download file into directory and possibly check the modification time
        :param check : check whether the file needs updating
        :param gzip: additionally gzips the file (adds .gz to file name)
        :param continueonError (bool): don't raise an exception when a download error occurrs
        """

        #setup the output uri
        if outfile:
            outf = os.path.join(direc, self.subdirs, outfile)
        else:
            if gzip:
                outf = os.path.join(direc, self.subdirs,
                                    os.path.basename(self.url)) + '.gz'
            elif gunzip:
                #strip gz suffix
                outf = os.path.splitext(
                    os.path.join(direc, self.subdirs,
                                 os.path.basename(self.url)))[0]
            else:
                outf = os.path.join(direc, self.subdirs,
                                    os.path.basename(self.url))

        #create directory if it does not exist
        if not os.path.exists(os.path.dirname(outf)):
            os.makedirs(os.path.dirname(outf), exist_ok=True)

        uri = UriFile(url=outf)
        if check and self.lastmod and uri.lastmod:
            if self.lastmod <= uri.lastmod:
                #no need to download the file
                slurplog.info("Already Downloaded, skipping %s" % (uri.url))
                return uri, False
        slurplog.info("Downloading %s" % (uri.url))
        try:
            if self.lastmod:
                curlDownload(self.url,
                             uri.url,
                             self.lastmod,
                             gzip=gzip,
                             gunzip=gunzip,
                             auth=self.auth,
                             restdict=restdict,
                             headers=self.headers,
                             cookiefile=self.cookiefile)
            else:
                self.lastmod = curlDownload(self.url,
                                            uri.url,
                                            gzip=gzip,
                                            gunzip=gunzip,
                                            auth=self.auth,
                                            restdict=restdict,
                                            headers=self.headers,
                                            cookiefile=self.cookiefile)
        except pycurl.error as pyexc:
            slurplog.info("Download failed, skipping %s" % (uri.url))
            if not continueonError:
                raise pyexc
            #signal that nothing was (re)downloaded
            return uri, False
        uri.lastmod = self.lastmod
        return uri, True
Example #13
    def convert2zarr(self):
        slurplog.info("Converting data to zarr %s" % (self.xarfile))
        #open all cached netcdf files together
        ds = xr.open_mfdataset(os.path.join(self.cacheDir(), "*.nc"))
        #save to zarr format
        ds.to_zarr(self.xarfile)
Example #14
def graceMetaExtractor(uri):
    """Extract meta information from a GRACE file"""

    #some dirty search-and-replace hacks to fix the faulty yaml header in the grace/grace-fo data
    hdrpatches = [
        (re.compile("0000-00-00T00:00:00"), "1970-01-01T00:00:00"),
        (re.compile(r"Dahle et al\. \(2019\):"), "Dahle et al. (2019),"),
        (re.compile(r"Dobslaw et al\. \(2019\):"), "Dobslaw et al. (2019),")
    ]
    patchedLines = 0

    buf = StringIO()
    with gzip.open(uri.url, 'rt') as fid:
        slurplog.info("Extracting info from %s" % (uri.url))
        for ln in fid:
            if '# End of YAML header' in ln:
                #parse the yaml header
                hdr = yaml.safe_load(buf.getvalue())["header"]
                break
            else:
                #patch the line if it matches one of the known faulty patterns
                #(e.g. 0000-00-00 dates which yaml cannot parse)
                for reg, repl in hdrpatches:
                    ln, nr = re.subn(reg, repl, ln, count=1)
                    patchedLines += nr
                buf.write(ln)
        if patchedLines > 0:
            #we want to fix the header, so keep the rest of the file for patching as well
            buf.write(ln)  #write the end-of-YAML marker line
            #dump the remainder of the file into the stringio buffer
            buf.write(fid.read())

    if patchedLines > 0:
        slurplog.info("Patching faulty yaml header in file %s" % uri.url)
        with gzip.open(uri.url, 'wt') as fidout:
            fidout.write(buf.getvalue())

    nonstand = hdr["non-standard_attributes"]

    meta = {
        "nmax": hdr["dimensions"]["degree"],
        "omax": hdr["dimensions"]["order"],
        "tstart": hdr["global_attributes"]["time_coverage_start"],
        "tend": hdr["global_attributes"]["time_coverage_end"],
        "lastupdate": uri.lastmod,
        "format": nonstand["format_id"]["short_name"],
        "gm": nonstand["earth_gravity_param"]["value"],
        "re": nonstand["mean_equator_radius"]["value"],
        "uri": uri.url,
        "type": nonstand["product_id"][0:3],
        "data": {
            "description": hdr["global_attributes"]["title"]
        }
    }

    #add tide system
    try:
        tmp = nonstand["permanent_tide_flag"]
        if re.search('inclusive', tmp):
            meta["tidesystem"] = "zero-tide"
        elif re.search('exclusive', tmp):
            meta["tidesystem"] = "tide-free"
    except KeyError:
        #no permanent tide flag present
        pass

    return meta
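The per-line patching relies on re.subn returning both the patched line and the number of substitutions made; a minimal standalone example with a hypothetical faulty header line:

import re

patches = [(re.compile("0000-00-00T00:00:00"), "1970-01-01T00:00:00")]
ln = "time_coverage_start: 0000-00-00T00:00:00"
patched = 0
for reg, repl in patches:
    ln, nr = re.subn(reg, repl, ln, count=1)
    patched += nr
print(patched, ln)  #-> 1 time_coverage_start: 1970-01-01T00:00:00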