def init(self, **kwargs):
    self.__dict__.update(ginit(self, **kwargs))
    if 'database' in self.__dict__ and type(self.database) == Database:
        # already have database stored
        pass
    else:
        self.database = Database(self.db_file,
                                 **(fdict(self.__dict__.copy())))
def __init__(self, **kwargs):
    kwargs['defaults'] = {
        'store_msg'  : [],
        'database'   : None,
        'product'    : 'MCD15A3H',
        'tile'       : 'h08v06',
        'log'        : None,
        'day'        : '01',
        'doy'        : None,
        'month'      : '*',
        'sds'        : None,
        'year'       : "2019",
        'site'       : 'https://e4ftl01.cr.usgs.gov',
        'size_check' : False,
        'noclobber'  : True,
        'local_dir'  : 'work',
        'local_file' : None,
        'db_file'    : None,
        'db_dir'     : 'work',
        'verbose'    : False,
        'stderr'     : sys.stderr
    }
    self.__dict__.update(ginit(self, **kwargs))
    if 'database' in self.__dict__ and type(self.database) == Database:
        # already have database stored
        pass
    else:
        self.database = Database(self.db_file,
                                 **(fdict(self.__dict__.copy(),
                                          ignore=['db_dir', 'db_file'])))

    self.translateoptions = gdal.TranslateOptions(
        gdal.ParseCommandLine("-of Gtiff -co COMPRESS=LZW"))
    # list of tiles
    if type(self.tile) is str:
        self.tile = [self.tile]
    if type(self.sds) is str:
        self.sds = [self.sds]

    if self.sds is not None:
        self.msg(f'initial SDS {self.sds}')
        self.required_sds = self.sds

    # for most transactions, we want all SDS
    # so self.sds should reflect that
    self.sds = None
    response = self.database.get_from_db('SDS', self.product)
    if response:
        self.msg("found SDS names in database")
        self.sds = response
        self.msg(self.sds)
        # require them all
        if 'required_sds' not in self.__dict__:
            self.required_sds = self.sds
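# A minimal usage sketch (not part of the original source): constructing Modis
# with keyword arguments drawn from the 'defaults' dictionary above. Keys not
# supplied presumably fall back to their defaults via ginit(); the tile, product
# and sds values below are illustrative only.
kwargs = {
    'tile'    : ['h17v03', 'h17v04'],
    'product' : 'MCD15A3H',
    'sds'     : ['Lai_500m'],
    'verbose' : True,
}
modis = Modis(**kwargs)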
def stitch_date(self, year, doy, get_files=False, test=False):
    '''stitch data for date'''
    year = int(year)
    doy = int(doy)

    dater = (datetime.datetime(year, 1, 1) +
             datetime.timedelta(doy - 1)).strftime('%Y %m %d').split()
    self.year = f'{year}'
    self.month = f'{str(int(dater[1])) :0>2s}'
    self.day = f'{str(int(dater[2])) :0>2s}'

    d = self.__dict__.copy()
    fd = fdict(d)
    # dont need to read it
    fd['no_read'] = True
    ofilebase = f"{self.product}/data.__SDS__." + \
                f"{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}"

    hdf_urls = self.get_url(**(fd))
    if not (len(hdf_urls) and (type(hdf_urls[0]) == URL)):
        if get_files:
            return None, None
        return [None]

    if 'db_file' in self.__dict__:
        if 'database' not in self.__dict__:
            # load database
            d = self.__dict__.copy()
            self.database = Database(
                self.db_file, **(fdict(d, ignore=['db_dir', 'db_file'])))

    if not test and not get_files:
        # look up in db
        warp_args = None
        dstNodata = None
        step = 1
        #this_set = f"{self.product}.{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}"
        store_flag = 'modis'
        kwargs = {'year': self.year, 'doy': doy, 'day': self.day,
                  'month': self.month, 'step': step,
                  'warp_args': warp_args, 'product': self.product,
                  'dstNodata': dstNodata, 'tile': self.tile}
        mkey = json.dumps(kwargs)
        # this is an hdf file
        response = self.database.get_from_db(store_flag, mkey)
        if response and self.noclobber:
            # test
            if self.test_ok(response[0]):
                # safe to return
                self.msg(f'positive response from database')
                ofiles = response
                return ofiles
            else:
                msg = f'WARNING: invalid entry {response[0]} in database {str(self.db_file)}'
                print(msg)
                self.msg(msg)

    try:
        hdf_files = [str(f.local()) for f in hdf_urls]
    except:
        for f in hdf_urls:
            d = f.read_bytes()
        hdf_files = [str(f.local()) for f in hdf_urls]

    if get_files:
        sds = self.get_sds(hdf_files, do_all=False)
        return hdf_files, sds

    sds = self.get_sds(hdf_files, do_all=True)
    if sds == []:
        for f in hdf_urls:
            d = f.read_bytes()
        hdf_files = [str(f.local()) for f in hdf_urls]
        sds = self.get_sds(hdf_files, do_all=True)

    # early return if we just want sds
    if test == True:
        return sds

    if len(sds) == 0:
        # failed to get SDS: need to download example file
        for f in hdf_urls:
            d = f.read_bytes()
        hdf_files = [str(f.local()) for f in hdf_urls]
        sds = self.get_sds(hdf_files, do_all=True)

    ofiles = []
    if len(sds) > len(self.sds):
        self.msg(f"ERROR in product {self.product} specification of SDS")
        self.msg(f"all SDS claimed to be: {len(self.sds)}")
        self.msg(self.sds)
        self.msg(f"But request for {len(sds)} SDSs made")
        self.msg(sds)
        sys.exit(1)

    for i, sd in enumerate(sds):
        ofile = f'{ofilebase.replace("__SDS__", self.sds[i])}.vrt'.replace(' ', '_')
        spatial_file = Path(f"{self.local_dir[0]}", ofile)
        spatial_file.parent.mkdir(parents=True, exist_ok=True)
        g = gdal.BuildVRT(spatial_file.as_posix(), sds[i])
        if not g:
            d = self.__dict__
            print(f"problem building dataset for {spatial_file} with {fdict(d)}")
            sys.exit(1)
        del g
        ofiles.append(Path(spatial_file).absolute().as_posix())

    # store in db
    cache = {store_flag: {mkey: ofiles}}
    #self.database.set_db(cache,write=True)
    return ofiles
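# Hedged sketch of calling stitch_date() above: for a given year and day-of-year
# it returns a list of per-SDS VRT files; with get_files=True it instead returns
# the underlying HDF files and SDS names. 'modis' is assumed to be a configured
# Modis instance (e.g. as constructed in the sketch above).
vrt_files = modis.stitch_date(2019, 41)
hdf_files, sds = modis.stitch_date(2019, 41, get_files=True)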
class Modis(): ''' get MODIS datasets from the server ''' def __init__(self, **kwargs): kwargs['defaults'] = { 'store_msg' : [],\ 'database' : None,\ 'product' : 'MCD15A3H',\ 'tile' : 'h08v06',\ 'log' : None,\ 'day' : '01',\ 'doy' : None, 'month' : '*',\ 'sds' : None, 'year' : "2019",\ 'site' : 'https://e4ftl01.cr.usgs.gov',\ 'size_check' : False,\ 'noclobber' : True,\ 'local_dir' : 'work',\ 'local_file' : None,\ 'db_file' : None,\ 'db_dir' : 'work',\ 'verbose' : False,\ 'stderr' : sys.stderr } self.__dict__.update(ginit(self, **kwargs)) if 'database' in self.__dict__ and type(self.database) == Database: # already have databse stored pass else: self.database = Database(self.db_file,\ **(fdict(self.__dict__.copy(),ignore=['db_dir','db_file']))) self.translateoptions = gdal.TranslateOptions( gdal.ParseCommandLine("-of Gtiff -co COMPRESS=LZW")) # list of tiles if type(self.tile) is str: self.tile = [self.tile] if type(self.sds) is str: self.sds = [self.sds] if self.sds is not None: self.msg(f'initial SDS {self.sds}') self.required_sds = self.sds # for most transactions, we want all SDS # so self.sds should reflect that self.sds = None response = self.database.get_from_db('SDS', self.product) if response: self.msg("found SDS names in database") self.sds = response self.msg(self.sds) # require them all if 'required_sds' not in self.__dict__: self.required_sds = self.sds #def __del__(self): # cache = {"done" : { "done" : "exit" }} # self.database.set_db(cache,write=True) def msg(self, *args): '''msg to self.stderr''' this = str(*args) try: # DONT REPEAT MESSAGES ... doesnt work as yet if this in self.store_msg: return self.store_msg.append(this) except: self.store_msg = [this] try: if self.verbose or (self.log is not None): print('-->', *args, file=self.stderr) except: pass def get_data(self, year, doy=None, idict=None, month=None, day=None, step=1, fatal=False): ''' Return data dictionary of MODIS dataset for specified time period args: year : year (2000 to present for MOD, or 2002 to present if using MYD) NB this is ignoired if idict is given options: doy : day in year, or day in month if month specified, or None when specified as day in year, or day in month, can be a list 1-365/366 or 1-28-31 as appropriate day : day in month or None. Can be list. month : month index 1-12 or None. Can be list. step : dataset step. Default 1, but set to 4 for 4-day product, i 8 for 8-day, 365/366 for year etc. fatal : default False. If True, exit if dataset not found. 
idict : data file dictionary provided by eg call to self.get_modis(year,doy=None,month=None,step=1,fatal=False) see get_modis for more details returns: data dictionary with keys specified by: - self.sds list - or all SDS if self.sds is None (default) data dictionary key 'bandnames' of DOY Each data item a 2- or 3-dimensional numpy array: For a single date: kwargs = { 'tile' : ['h17v03', 'h17v04', 'h18v03', 'h18v04'], 'product' : 'MCD15A3H', 'sds' : 'Lai_500m', } modis = Modis(**kwargs) # specify day of year (DOY) and year data_MCD15A3H = modis.get_data(2019,doy=1+4*10) print(data_MCD15A3H.keys()) dict_keys(['Lai_500m', 'bandnames', 'files']) print(data_MCD15A3H['Lai_500m'].shape) (4800, 4800) print(len(data_MCD15A3H['bandnames'])) 1 If a list of days, or a month or year is specified, the datasets are 3-D: kwargs = { 'tile' : ['h19v03'], 'product' : 'MOD10A1', 'sds' : ['NDSI_Snow_Cover'] } year = 2019 month = 1 # get the data modis = Modis(**kwargs) # specify month and year data_MOD10A1 = modis.get_data(year,month=1) print(data_MOD10A1.keys()) dict_keys(['NDSI_Snow_Cover', 'bandnames', 'files']) print(data_MOD10A1['NDSI_Snow_Cover'].shape) (31, 2400, 2400) print(len(data_MOD10A1['bandnames'])) 31 ''' idict = idict or self.get_modis( year, day=day, doy=doy, month=month, step=step, fatal=fatal) # for get_data, we only want required_sds try: if 'required_sds' in self.__dict__: sds = self.required_sds bandnames = idict['bandnames'] else: bandnames = idict['bandnames'] del idict['bandnames'] self.required_sds = idict.keys() sds = self.required_sds vfiles = [idict[k] for k in sds] data = [] for i, s in enumerate(sds): g = gdal.Open(vfiles[i]) dataset = g.ReadAsArray() if dataset is None: msg = f"WARNING: no datasets in get_data() for {vfiles[i]}\n" +\ f"check datasets and database file {str(self.db_file)}" print(msg) self.msg(msg) if fatal == True: sys.exit(1) data.append(dataset) # enforce 3D data = np.atleast_3d(np.array(data).T).T odict = dict(zip(sds, data)) odict['bandnames'] = bandnames odict['files'] = idict return odict except: self.msg("Error calling get_data") return {} def monkey(self, kwargs): # could use json.dumps(d) return json.dumps(kwargs) keys = np.array(list(kwargs.keys())) keys.sort() s = '' for k in keys: v = kwargs[k] if type(v) is dict: ss = '{' + f'{self.monkey(v)}' + '}' elif type(v) is list: ss = "_".join(v) else: ss = str(v) s = s + '.' + f"{k}:{ss}" s = self.tidy(s.replace(' ', '_')) return s def sort_vfiles(self, vfiles, sds): # reconcile the order of sds and vfiles list _sds = np.array([s.replace(" ", "_") for s in sds]) _vfiles = np.array([f.split('/')[-1].split('.')[1] for f in vfiles]) index = tuple([np.where(_vfiles == ts)[0][0] for ts in _sds]) vf = vfiles.copy() vfiles = [vf[i] for i in index] return vfiles def get_modis(self,year,doy=None,day=None,month=None,step=1,\ warp_args=None,dstNodata=None,fatal=False): ''' Return data dictionary of MODIS datasets for specified time period args: year : year (2000 to present for MOD, or 2002 to present if using MYD) options: doy : day in year, or day in month if month specified, or None when specified as day in year, or day in month, can be a list 1-365/366 or 1-28-31 as appropriate day : day in month or None. Can be list. month : month index 1-12 or None. Can be list. step : dataset step. Integer. Default 1, but set to 4 for 4-day product, i 8 for 8-day, 365/366 for year etc. dstNodata : fill value warp_args : sub-setting and warping control fatal : default False. If True, exit if dataset not found. 
returns: data dictionary with SDS names as keys and gdal VRT filename data dictionary key 'bandnames' of DOY For a single date: kwargs = { 'tile' : ['h17v03', 'h17v04', 'h18v03', 'h18v04'], 'product' : 'MCD15A3H', } modis = Modis(**kwargs) # specify day of year (DOY) and year data_MCD15A3H = modis.get_modis(2019,1+4*10) print(data_MCD15A3H.keys()) dict_keys(['Lai_500m', ... 'bandnames']) print(len(data_MCD15A3H['bandnames'])) 1 If a list of days, or a month or year is specified, the datasets are 3-D: kwargs = { 'tile' : ['h19v03'], 'product' : 'MOD10A1', } year = 2019 month = 1 # get the data modis = Modis(**kwargs) # specify month and year data_MOD10A1 = modis.get_modis(year,month=1) print(data_MOD10A1.keys()) dict_keys(['NDSI_Snow_Cover', ... 'bandnames']) print(len(data_MOD10A1['bandnames'])) 31 If a month and day are specified, the datasets are 3-D: kwargs = { 'tile' : ['h22v10'], 'product' : 'MCD64A1', } year = 2019 month = 1 day = 1 # get the data modis = Modis(**kwargs) # specify month and year data_MCD64A1 = modis.get_modis(year,month=month,day=day) print(data_MCD64A1.keys()) dict_keys(['NDSI_Snow_Cover', ... 'bandnames']) print(len(data_MCD64A1['bandnames'])) 31 ''' # check in db #store for diagnostics kwargs = {'year': year, 'doy':doy,'day':day,'month':month,'step':step,\ 'warp_args':warp_args,'product': self.product, 'dstNodata':dstNodata, 'tile': self.tile} mkey = json.dumps(kwargs) response = self.database.get_from_db("modis-vrt", mkey) if response is not None: if (type(response) is list) and (len(response)): return response[0] elif (type(response) is dict): # test to see it has all SDS ok = True for s in self.sds: if s not in response.keys(): ok = False if ok: return response else: return response dates = list_of_doys(year, doy=doy, day=day, month=month, step=step) year_list, doy_list = list(dates['year']), list(dates['doy']) bandnames = [f'{year}-{d :0>3d}' for d, y in zip(doy_list, year_list)] vfiles = self.stitch(year=year_list,doy=doy_list,\ dstNodata=dstNodata,warp_args=warp_args) # error if (not vfiles) or (len(vfiles) == 0) or (len(vfiles) and (vfiles[0] == None)): msg = f"WARNING: no datasets in get_data() for product {self.product} tile {self.tile} year {year} month {month} doy {doy}" print(msg) self.msg(msg) self.msg(f"dict : {self.__dict__}") self.msg(f"kwargs : {kwargs}") try: return dict(zip(self.sds, [[]] * len(self.sds))) except: return {None: None} # cache before selection odict = dict(zip(self.sds, vfiles)) odict['bandnames'] = bandnames cache = {"modis-vrt": {mkey: odict}} self.database.set_db(cache, write=True) # now filter to just what was asked for if 'required_sds' in self.__dict__: sds = self.required_sds else: sds = self.sds vfiles = self.sort_vfiles(vfiles, sds) odict = dict(zip(sds, vfiles)) odict['bandnames'] = bandnames return odict def tidy(self, s): ss = str(s).replace("'", "").replace('"', '').replace(',', '_').replace( '[', '_').replace(']', '_') ss = ss.replace(' ', '') return ss def read_data(self, ifile): g = gdal.Open(ifile) if not g: return None, None data = np.array([ g.GetRasterBand(i).ReadAsArray() for i in range(1, len(g.GetFileList())) ]) b = g.GetRasterBand(1) return data, (b.GetScale(), b.GetOffset()) def fix_sds(self, sds, year, doy): '''fix sds''' if sds: return sds #if 'required_sds' in self.__dict__: # self.sds = self.required_sds # else look in dictionary response = self.database.get_from_db("SDS", self.product) if response: self.msg("found SDS names in database") self.sds = response self.msg(self.sds) # else need to derive it 
self.msg("polling for SDS names") self.stitch_date(year, doy, test=True) if self.sds is None: # try again self.msg("error finding SDS names") return [] #if 'required_sds' not in self.__dict__: # self.required_sds = self.sds self.msg(f"SDS: {self.sds}") return self.sds def get_blank(self, dstNodata, s, i): # no dataset if ('blanco' in self.__dict__) and (Path(self.blanco).exists()): output_filename = self.blanco self.msg(f'using file with value {dstNodata} {output_filename}') bthis = f'blank-{dstNodata}-{str(i):0>2s}' this = output_filename else: try: # repeat last for now self.msg( f'no dataset for sds {s} for dataset {i}: using filler') this = ofiles[-1] output_filename = this.replace('.vrt', '{dstNodata}_blank.tif') if not Path(output_filename).exists(): # need to set to invalid number ... self.msg(f'creating dummy file') create_blank_file(this, output_filename, value=dstNodata) self.msg( f'using file with value {dstNodata} {output_filename}') self.blanco = output_filename bthis = f'blank-{dstNodata}-{str(i):0>2s}' this = output_filename except: bthis = f'blank-{dstNodata}-{str(i):0>2s}' this = None return this, bthis def stitch(self, year, month=None, day=None, doy=None, step=1, warp_args=None, dstNodata=None): '''create vrt dataset of all images for doys / a month / year''' # get a dict of year, doy dates = list_of_doys(year, month=month, day=day, doy=doy, step=step) years, doys = list(dates['year']), list(dates['doy']) ndays = len(years) self.msg(f"create vrt dataset for doys {doys} year {years}") sfiles = {} bandlist = [] # sds may not be defined self.fix_sds(self.sds, years[0], doys[0]) # set nodata value if (warp_args is not None) and (dstNodata is None): dstNodata = warp_args['dstNodata'] if dstNodata is None: dstNodata = 0 if (warp_args is not None) and ('dstNodata' not in warp_args): warp_args['dstNodata'] = dstNodata # loop over sds store_files = [None] * len(years) for i, s in enumerate(self.sds): ofiles = [] bandlist = [] for j, (year, doy) in enumerate(zip(years, doys)): year = int(year) doy = int(doy) ifiles = self.stitch_date(year, doy) if (not ifiles) or (len(ifiles) and ifiles[0] == None): this, bthis = self.get_blank(dstNodata, s, i) else: this, bthis = ifiles[i], f'{str(i):0>2s}' store_files[j] = ifiles if this: bandlist.append(bthis) ofiles.append(this) if len(ofiles): ofile = f"{self.product}/data.{self.sds[i]}.{self.tidy(self.tile)}." 
+ \ f"{year}.{str(int(doy)) :0>3s}.{str(int(step)) :0>3s}.vrt" ofile = ofile.replace(' ', '_') spatial_file = Path(f"{self.local_dir[0]}", ofile) spatial_file.parent.mkdir(parents=True, exist_ok=True) g = gdal.BuildVRT(spatial_file.as_posix(), ofiles, separate=True) try: g.FlushCache() except: pass if not g: d = self.__dict__ print( f"problem building dataset for {spatial_file} with {fdict(d)}" ) del g if warp_args is not None: warp_args['format'] = 'VRT' # warp the files using warp_args spatial_ofile = Path(spatial_file.as_posix().replace( '.vrt', '_warp.vrt')) self.msg(f"warping to {spatial_ofile} using {warp_args}") g = gdal.Warp(spatial_ofile.as_posix(), spatial_file.as_posix(), **warp_args) try: g.FlushCache() except: pass if not g: d = self.__dict__ print( f"problem building dataset for {spatial_ofile} with {fdict(d)}" ) del g sfiles[s] = spatial_ofile else: sfiles[s] = spatial_file # build list of files ofiles = [str(i) for i in sfiles.values()] return ofiles def test_ok(self, hdffile, dosubs=True): '''sanity check on file''' if not Path(hdffile).exists(): msg = f'test: file {hdffile} does not exist' self.msg(msg) return False g = gdal.Open(hdffile) if not g: msg = f'test: file {hdffile} failed to open with gdal' self.msg(msg) del g return False # check referenced files if dosubs: for f in g.GetFileList(): # dont do too much recursion if not self.test_ok(f, dosubs=False): return False data = g.ReadAsArray(xsize=1, ysize=1) if data is None: msg = f'test: file {hdffile} failed: None returned in read ' self.msg(msg) del g return False return True def stitch_date(self, year, doy, get_files=False, test=False): '''stitch data for date''' year = int(year) doy = int(doy) dater = (datetime.datetime(year, 1, 1) +\ datetime.timedelta(doy - 1)).strftime('%Y %m %d').split() self.year = f'{year}' self.month = f'{str(int(dater[1])) :0>2s}' self.day = f'{str(int(dater[2])) :0>2s}' d = self.__dict__.copy() fd = fdict(d) # dont need to read it fd['no_read'] = True ofilebase = f"{self.product}/data.__SDS__." 
+ \ f"{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}" hdf_urls = self.get_url(**(fd)) if not (len(hdf_urls) and (type(hdf_urls[0]) == URL)): if get_files: return None, None return [None] if 'db_file' in self.__dict__: if 'database' not in self.__dict__: # load database d = self.__dict__.copy() self.database = Database( self.db_file, **(fdict(d, ignore=['db_dir', 'db_file']))) if not test and not get_files: # look up in db warp_args = None dstNodata = None step = 1 #this_set = f"{self.product}.{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}" store_flag = 'modis' kwargs = {'year': self.year, 'doy':doy,'day':self.day,'month':self.month,'step':step,\ 'warp_args':warp_args,'product': self.product, 'dstNodata':dstNodata, 'tile': self.tile} mkey = json.dumps(kwargs) # this is an hdf file response = self.database.get_from_db(store_flag, mkey) if response and self.noclobber: # test if self.test_ok(response[0]): # safe to return self.msg(f'positive response from database') ofiles = response return ofiles else: msg = f'WARNING: invalid entry {response[0]} in database {str(self.db_file)}' print(msg) self.msg(msg) try: hdf_files = [str(f.local()) for f in hdf_urls] except: for f in hdf_urls: d = f.read_bytes() hdf_files = [str(f.local()) for f in hdf_urls] if get_files: sds = self.get_sds(hdf_files, do_all=False) return hdf_files, sds sds = self.get_sds(hdf_files, do_all=True) if sds == []: for f in hdf_urls: d = f.read_bytes() hdf_files = [str(f.local()) for f in hdf_urls] sds = self.get_sds(hdf_files, do_all=True) # early return if we just want sds if test == True: return sds if len(sds) == 0: # failed to get SDS: need to download example file for f in hdf_urls: d = f.read_bytes() hdf_files = [str(f.local()) for f in hdf_urls] sds = self.get_sds(hdf_files, do_all=True) ofiles = [] if len(sds) > len(self.sds): self.msg(f"ERROR in product {self.product} specification of SDS") self.msg(f"all SDS claimed to be: {len(self.sds)}") self.msg(self.sds) self.msg(f"But request for {len(sds)} SDSs made") self.msg(sds) sys.exit(1) for i, sd in enumerate(sds): ofile = f'{ofilebase.replace("__SDS__",self.sds[i])}.vrt'.replace( ' ', '_') spatial_file = Path(f"{self.local_dir[0]}", ofile) spatial_file.parent.mkdir(parents=True, exist_ok=True) g = gdal.BuildVRT(spatial_file.as_posix(), sds[i]) if not g: d = self.__dict__ print( f"problem building dataset for {spatial_file} with {fdict(d)}" ) sys.exit(1) del g ofiles.append(Path(spatial_file).absolute().as_posix()) # store in db cache = {store_flag: {mkey: ofiles}} #self.database.set_db(cache,write=True) return ofiles def get_files(self, year, doy): ''' get MODIS dataset for specified doy and year return: files : list of filenames sds : list of SDS names ''' return self.stitch_date(year, doy, get_files=True) def has_wildness(self, uc): is_wild = np.logical_or(np.array(['*' in i for i in uc]), np.array(['?' in i for i in uc])) is_wild_2 = np.logical_or(np.array(['[' in i for i in uc]), np.array([']' in i for i in uc])) is_wild = np.logical_or(is_wild, is_wild_2) return is_wild def get_url(self, **kwargs): ''' Get URL object list for NASA MODIS products for the specified product, tile, year, month, day Keyword Arguments: verbose: bool site : str product : str e.g. 'MCD15A3H' tile : str e.g. 
'h08v06' year : str valid 2000-present month : str 01-12 day : str 01-(28,29,30,31) ''' site = ('site' in kwargs and kwargs['site']) or 'https://e4ftl01.cr.usgs.gov' product = ('product' in kwargs and kwargs['product']) or self.product tile = ('tile' in kwargs and kwargs['tile']) or self.tile day = ('day' in kwargs and kwargs['day']) or self.day month = ('month' in kwargs and kwargs['month']) or self.month year = ('year' in kwargs and kwargs['year']) or self.year doy = ('doy' in kwargs and kwargs['doy']) or self.doy if product[:5] == "MOD10" or product[:5] == "MYD10": # NSIDC site = "https://n5eil01u.ecs.nsidc.org" self.msg(f"Snow and ice product {product}") self.msg(f"switching to server {site}") if product[:3] == "MOD": code = "MOST" elif product[:3] == "MYD": code = "MOSA" else: code = "MOTA" self.msg(f"product {product} -> code {code}") # special cases #if self.product[:5] == 'MCD19': # self.site = 'https://ladsweb.modaps.eosdis.nasa.gov' # you should put some tests in site_dir = f'{code}/{product}.006/{year}.{month}.{day}' if site == 'https://ladsweb.modaps.eosdis.nasa.gov': if self.doy is None: try: doy = (datetime.datetime(year+1, 1, 1) - \ datetime.datetime(year=int(year),month=int(month),day=int(day))).days except: self.verbose = True self.msg( f"ERROR: you need to specify doy explicitly for product {self.product}" ) sys.exit(1) site_dir = f'archive/allData/6/{product}/{year}/{doy}' site_file = f'*.{tile}*.hdf' kwargs = {"verbose" : self.verbose,\ "full_url" : True,\ "skipper" : True, "noclobber" : self.noclobber,\ "db_dir" : self.db_dir,\ "db_file" : self.db_file,\ "log" : self.log,\ "size_check" : self.size_check,\ "local_file" : self.local_file,\ "database" : self.database.database, "local_dir" : self.local_dir } hdf_urls = [] url = None for t in self.tile: url = ((url is None) and URL(site,site_dir,**kwargs)) or \ url.update(site,site_dir,**kwargs) hdf_urls += url.glob(f'{self.product}*.{t}*.hdf') if len(hdf_urls) == 0: return [None] self.db_file = hdf_urls[0].db_file return hdf_urls def sdscode(self, s1): '''PITA decoding of SDS from HDF field that comes from s0,s1 in g.GetSubDatasets()''' return (' '.join(s1.split()[1:-3])).split( self.product)[0].split('MOD')[0].strip() def get_sds(self, hdf_files, do_all=False): '''get defined SDS or all''' if type(hdf_files) is not list: hdf_files = [hdf_files] if do_all or ((self.sds is None) or len(self.sds) == 0 or \ ((len(self.sds) == 1) and len(self.sds[0]) == 0)) : response = self.database.get_from_db('SDS', self.product) if response: self.msg("found SDS names in database") self.sds = response self.msg(self.sds) # require them all if 'required_sds' not in self.__dict__: self.required_sds = self.sds if len(hdf_files) < 1: return [] try: lfile = hdf_files[0] if not Path(lfile).exists(): return [] g = gdal.Open(str(lfile)) if not g: return [] except: # need to pull this first return [] #hdf_files = list(np.sort(np.unique(np.array(hdf_files)))) # in case not defined if ((self.sds is None) or len(self.sds) == 0 or \ ((len(self.sds) == 1) and len(self.sds[0]) == 0)) : self.msg("trying to get SDS names") self.sds = [self.sdscode(s1) for s0, s1 in g.GetSubDatasets()] cache = {"SDS": {self.product: self.sds}} self.database.set_db(cache, write=True) if 'required_sds' in self.__dict__: self.msg(f'require: {self.required_sds}') self.msg(self.sds) all_subs = [(s0.replace(str(lfile), '{local_file}'), s1) for s0, s1 in g.GetSubDatasets()] this_subs = [] if (not do_all) and ('required_sds' in self.__dict__): sds = self.required_sds else: sds = 
self.sds for sd in sds: this_subs += [s0 for s0, s1 in all_subs if sd == self.sdscode(s1)] ofiles = [[sub.format(local_file=str(lfile)) for lfile in hdf_files] for sub in this_subs] return ofiles
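# Worked example adapted from the get_data() docstring above (a sketch only: it
# assumes network access to the NSIDC server and valid NASA Earthdata credentials).
kwargs = {
    'tile'    : ['h19v03'],
    'product' : 'MOD10A1',
    'sds'     : ['NDSI_Snow_Cover']
}
modis = Modis(**kwargs)
data_MOD10A1 = modis.get_data(2019, month=1)
print(data_MOD10A1['NDSI_Snow_Cover'].shape)   # (31, 2400, 2400)
print(len(data_MOD10A1['bandnames']))          # 31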
class URL(urlpath.URL,urllib.parse._NetlocResultMixinStr, PurePath): ''' Derived from https://raw.githubusercontent.com/chrono-meter/urlpath/master/urlpath.py to provide more compatibility with pathlib.Path functionality ''' ''' modified new and init ''' def __new__(cls,*args,**kwargs): self = super(URL, cls).__new__(cls,*args) self.init(**kwargs) return self def __init__(self,*args,**kwargs): # remove any trailing '/' from args args = list(args) for i,arg in enumerate(args): arg = str(arg) while arg[-1] == '/': if len(arg) == 1: break arg = arg[:-1] args[i] = arg args = tuple(args) if not kwargs: kwargs = {} self.fourOhOne = False def init(self,**kwargs): self.__dict__.update(ginit(self,**kwargs)) if 'database' in self.__dict__ and type(self.database) == Database: # already have databse stored pass else: self.database = Database(self.db_file,\ **(fdict(self.__dict__.copy()))) def __del__(self): try: del self.database self.msg(f'clone: {url.is_clone}') except: pass def __exit__(self, exc_type, exc_value, traceback): '''cleanup''' try: del self.database except: pass tempfile.clean() def dedate(self): if '_cache_original' in self.__dict__: self.__dict__ = self._cache_original.copy() if '_cache_original' in self.__dict__: del self.__dict__['_cache_original'] def update(self,*args,**kwargs): '''update args in object''' if '_cache_original' not in self.__dict__: self._cache_original = self.__dict__.copy() # whetehr we specify full URL in update or not if ('full_url' in kwargs) and (kwargs['full_url'] == True): args = list(args) else: args = [str(self)] + list(args) url = super(URL, self).__new__(self,*args) url.is_clone = True url.__dict__ = fdict(self._cache_original.copy()) return url def check_path(self,ppp): ''' You can corrupt the database by having files where we expect directories so we need to clean these up ''' parts = list(ppp.parts) for i,part in enumerate(parts): this = Path(*(parts[:i+1])) if this.exists() and (not this.is_dir()): # warning path in expected directory self.msg('found non-directory term in path {str(this)}') try: self.msg('trying to correct') this.unlink() return True except: self.msg('failed to correct') return False return True def indb(self): # might be in database store_url = str(self) store_flag = 'data' ifile = self.get_name(self.database.get_from_db(store_flag,store_url)) if ifile: old = self.local_file self.local_file = Path(ifile) if self.local_file.exists() and self.local_file.suffix == '.store': return True if self.local_file.suffix != '.store': self.local_file = old return False return True return False def call_local(self): ''' sort out and return local_file This comes from the URL and local_dir and ends .store ''' if self.indb(): if callable(self.local): sys.msg(f"**unexpected method for self.local {self.local}") else: return self.local kwargs = fdict(self.__dict__.copy()) if 'local_dir' in kwargs and \ (kwargs['local_dir'] is not None) and \ len(kwargs['local_dir']) > 0: self.local_dir = list_resolve(kwargs['local_dir']) if (self.local_dir is None) or (len(self.local_dir) == 0): self.local_dir = list_resolve(self.db_dir) self.local_file = Path(self.local_dir[0],self.as_posix().split("://")[1]) #self.local_file = Path(self.local_dir[-1],str(self.with_scheme(''))[2:]).absolute() # replace ' ' self.local_file = Path(str(self.local_file).replace(' ','_')) suffix = self.local_file.suffix self.local_file = self.local_file.with_suffix(suffix + '.store') self.check_path(self.local_file.parent) self.local_file.parent.mkdir(parents=True,exist_ok=True) return 
self.local_file def get_read_file(self,filelist): filelist = name_resolve(filelist) readlist,writelist = list_info(filelist) filelist = np.array(filelist,dtype=np.object)[readlist] return (filelist.size and filelist[-1]) or None def get_write_file(self,filelist): filelist = name_resolve(filelist) readlist,writelist = list_info(filelist) filelist = np.array(filelist,dtype=np.object)[writelist] return (filelist.size and filelist[-1]) or None def get_readwrite_file(self,filelist): filelist = name_resolve(filelist) readlist,writelist = list_info(filelist) filelist = np.array(filelist,dtype=np.object)[np.logical_and(np.array(writelist),np.array(readlist))] return (filelist.size and filelist[-1]) or None def _local_file(self,mode="r"): '''get local file name''' if self.indb(): return self.local_file self.call_local() # clobber if not self.noclobber: local_file = self.get_write_file(self.local_file) # file name for writing elif mode == "r": local_file = self.get_read_file(self.local_file) if local_file and not local_file.exists(): self.msg("read file {local_file} doesnt exist") self.local_file = self.local_file[self.local_file != local_file] return self._local_file(mode="r") else: # file name for writing local_file = self.get_write_file(self.local_file) if local_file == None: return local_file # local_file is real if local_file.exists(): if local_file.is_dir(): try: local_file.rmdir() return None except: pass # delete the file if noclobber is False if not self.noclobber: try: self.msg(f"deleting existing file {local_file}") local_file.unlink() except: pass else: self.msg(f"keeping existing file {local_file}") return local_file def open(self,mode='r',buffering=-1, encoding=None, errors=None, newline=None): ''' Open the file pointed by this URL and return a file object, as the built-in open() function does. 
''' kwargs = {'mode':mode,'buffering':buffering,'encoding':encoding,\ 'errors':errors,'newline':newline} if self._isfile(): self.msg(f'{self} is not a URL: interpreting as Path') return Path(self).open(**kwargs) # check in database store_url = str(self) store_flag = 'data' binary = ('b' in mode) and ('t' not in mode) get_download,ifile,ofile = self._test_already_local() # get from ofile if ofile and Path(ofile).exists(): ofile = Path(ofile) if binary: data = io.BytesIO(ofile.read_bytes()) else: data = io.StringIO(ofile.read_text()) cache = {store_flag : { str(store_url) : str(ofile) }} self.database.set_db(cache) return data # get from ifile if ifile and Path(ifile).exists(): ifile = Path(ifile) if binary: data = io.BytesIO(ifile.read_bytes()) else: data = io.StringIO(ifile.read_text()) self.check_path(ifile.parent) ifile.parent.mkdir(parents=True,exist_ok=True) if ofile: ofile = Path(ofile) if binary: ofile.write_bytes(data) else: ofile.write_text(data) cache = {store_flag : { str(store_url) : str(ifile) }} self.database.set_db(cache) return data if 'r' in mode: self.msg(f"reading data from {self}") # read if binary: self.msg("open() binary stream") idata = self.read_bytes() data = io.BytesIO(idata) else: self.msg("open() text stream") idata = self.read_text() data = io.StringIO(idata) if ofile: try: ofile = Path(ofile) if binary: ofile.write_bytes(idata) else: ofile.write_text(idata) cache = {store_flag : { str(store_url) : str(ifile) }} self.database.set_db(cache) except: pass return data if ofile: return Path(ofile).open(**kwargs) def write_text(self,data, encoding=None, errors=None): '''Open the file in text mode, write to it, and close the file.''' kwargs = {'encoding':encoding} if self._isfile(): self.msg(f'{self} is not a URL: interpreting as Path') return Path(self).write_text(data) get_download,ifile,ofile = self._test_already_local() if ofile and Path(ofile).exists(): self.msg("file exists so not writing") return Path(ofile).stat().st_size if ofile: self.msg(f'opening output file {ofile}') return Path(ofile).write_text(data,**kwargs) def write_bytes(self,data): '''Open the file in bytes mode, write to it, and close the file.''' if self._isfile(): self.msg(f'{self} is not a URL: interpreting as Path') return Path(self).write_bytes(data) get_download,ifile,ofile = self._test_already_local() if ofile and Path(ofile).exists(): self.msg("file exists so not writing") return Path(ofile).stat().st_size if ofile: self.msg(f'opening output file {ofile}') return Path(ofile).write_bytes(data) def _get_login(self,head=True): u = self with requests.Session() as session: if u.username and u.password: session.auth = u.username,u.password else: uinfo = Cylog(u.anchor).login() if uinfo == (None,None): return None session.auth = uinfo[0].decode('utf-8'),uinfo[1].decode('utf-8') u.msg(f'logging in to {u.anchor}') try: r1 = session.request('get',u) if r1.status_code == 200: u.msg(f'data read from {u.anchor}') return r1 # try encoded login if head: r2 = session.head(r1.url) else: r2 = session.get(r1.url) if r2.status_code == 200: u.msg(f'data read from {u.anchor}') if type(r2) == requests.models.Response: return r2 except: u.msg(f'failure reading data from {u.anchor}') return None u.msg(f'failure reading data from {u.anchor}') return None def msg(self,*args): '''msg to self.stderr''' this = str(*args) try: # DONT REPEAT MESSAGES ... 
doesnt work as yet if this in self.store_msg: return self.store_msg.extend(this) except: self.store_msg = [this] try: if self.verbose or (self.log is not None): print('-->',*args,file=self.stderr) except: pass def get_name(self,ofile): if ofile == [] or ofile == {}: ofile = None if type(ofile) == list: ofile = ofile[0] if type(ofile) == dict: ofile = list(ofile.values())[0] return ofile def _test_already_local(self): # get local_filename we would use for output # delete it if not noclobber # dont greate dir if it doesnt exist # return False if already downloaded # check in database store_url = str(self) store_flag = 'data' ifile = self.get_name(self.database.get_from_db(store_flag,store_url)) if ifile is not None: ifile = Path(ifile) if not ifile.exists(): # otherwise incorrect db entry self.database.rm_from_db(store_flag,store_url) if not self.noclobber and ifile.exists(): # clobber self.msg(f'deleting local file {ifile}') ifile.unlink() ifile = None ofile = self.get_name(self._local_file("w")) if callable(ofile): print(f"ERROR in type of self.lcoal {ofile}: should be str or list") sys.exit(1) if ifile is None: return True,ifile,ofile if not ifile.exists(): return True,None,ofile # simple if no size check if (not self.size_check) and ifile.exists(): self.msg(f'local file {ifile} exists') #: no size check') # cache this in case we want to re-use it cache = {store_flag : { str(store_url) : str(ifile) }} self.database.set_db(cache) return False,ifile,ofile if self.size_check: lsize = ifile.stat().st_size rsize = self.stat().st_size if rsize < 0: # then its not available self.msg(f'not downloading file') # we might not want to download # cache this in case we want to re-use it cache = {store_flag : { str(store_url) : ifile }} self.database.set_db(cache) return False,ifile,ofile elif lsize == rsize: self.msg(f'local and remote file sizes equal {lsize}') self.msg(f'not downloading file') # we might not want to download # cache this in case we want to re-use it cache = {store_flag : { str(store_url) : ifile }} self.database.set_db(cache) return False,ifile,ofile self.msg(f'local and remote file sizes not equal {lsize}/{rsize} respectively') self.msg(f'so we need to download (or set size_check=False)') if not self.noclobber: if ifile and ifile.exists(): self.msg(f'deleting local ifile {local_file}') ifile.unlink() ifile = None if ofile and ofile.exists(): self.msg(f'deleting local ofile {local_file}') ofile.unlink() ofile = None return True,ifile,ofile def read_text(self, encoding=None, errors=None): '''Open the URL, read in text mode and return text.''' kwargs = {'encoding':encoding} u = self store_url = str(u) store_flag = 'data' if u._isfile(): self.msg(f'{u} is not a URL: interpreting as Path') return Path(u).read_text() get_download,ifile,ofile = self._test_already_local() text = None # get it from ofile if ofile and Path(ofile).exists(): text = Path(ofile).read_text(**kwargs) cache = {store_flag : { str(store_url) : str(ofile) }} self.database.set_db(cache) return text # get it from ifile if ifile and Path(ifile).exists(): self.msg(f'opening already downloaded file {ifile}') text = Path(ifile).read_text(**kwargs) if ofile: ofile = Path(ofile) ofile.write_text(text) cache = {store_flag : { str(store_url) : str(ofile) }} else: cache = {store_flag : { str(store_url) : str(ifile) }} self.database.set_db(cache) return text if text is not None: return text try: u.msg(f'trying {self}') text = u.get_text() if text and ofile: try: ofile = Path(ofile) self.check_path(ofile.parent) 
ofile.parent.mkdir(parents=True,exist_ok=True) ofile.write_text(text) cache = {store_flag : { str(store_url) : str(ofile) }} self.database.set_db(cache) return text except: pass if text: return text except: pass u.msg(f'getting login') r = u._get_login(head=False) if type(r) != requests.models.Response: return None if r.status_code == 200: u.msg(f'code {r.status_code}') text = r.text if ofile: ofile = Path(ofile) self.check_path(ofile.parent) ofile.parent.mkdir(parents=True,exist_ok=True) ofile.write_text(text) cache = {store_flag : { str(store_url) : str(ofile) }} self.database.set_db(cache) return text if type(r) == requests.models.Response: u.msg(f'code {r.status_code}') return r u.msg(f'failed to connect') return None def local(self,get_file=False): ''' local filename''' u = self get_download,ifile,ofile = u._test_already_local() for f in [ifile,ofile]: if f and get_file: if Path(f).exists(): return Path(f) else: # pull file self.read_bytes() return self.local(get_file=get_file) elif f: return Path(f) return None def exists(self): '''Whether this URL exists and can be accessed''' u = self store_url = str(u) store_flag = 'exists' ex = self.database.get_from_db(store_flag,store_url) if ex is not None: return ex ex = False get_download,ifile,ofile = u._test_already_local() if ofile and Path(ofile).exists(): ex = True cache = {store_flag : { str(store_url) : True }} if not ex: ex = self.ping() if ex: cache = {store_flag : { str(store_url) : True }} self.database.set_db(cache) return ex def stat(self, head=False): ''' Some of the functionality of stat for URLs Currently, only stat_result.st_size is used. ''' input = [0,0,0,0,0,0,self._st_size(head=head),0,0,0] stat_result = os.stat_result(input) return stat_result def _isfile(self): if self.scheme == '' or self.scheme == 'file': self.msg('we are a file ...') return True #self.msg('we are not a file ...') return False def _st_size(self, head=False): ''' retrieve the remote file size You should specify any required login/password with with_components(username=str,password=str) Returns: int if data available Or: -1 ''' u = self # check in database store_url = u store_flag = 'st_size' remote_size = self.database.get_from_db(store_flag,store_url) if remote_size is not None: return remote_size remote_size = -1 if u._isfile(): self.msg(f'{u} is not a URL: interpreting as Path') # not a URL u = Path(u) return u.stat().st_size try: u.msg(f'trying {u}') if head: r = u.head() else: r = u.get() if type(r) == requests.models.Response: if r.status_code == 200: u.msg(f'code 200') hdr = r.headers if "Content-Length" in hdr.keys(): remote_size = int(hdr["Content-Length"]) elif 'Transfer-Encoding' in hdr.keys() and hdr["Transfer-Encoding"] == 'chunked': u.msg(f'file is compressed, remote size not directly available') #self.msg(hdr) if remote_size > 0: # cache this in case we want to re-use it cache = {store_flag : { str(store_url) : remote_size }} self.database.set_db(cache) return(remote_size) # if r.status_code == 401: u.msg(f'code 401') self.fourOhOne = True if self.fourOhOne: # unauthorised # more complex session login and auth # e.g. 
needed for NASA Earthdata login u.msg(f'getting login') r = u._get_login(head=head) if r.status_code == 200: u.msg(f'code 200') hdr = r.headers if "Content-Length" in hdr: remote_size = int(hdr["Content-Length"]) if remote_size > 0: # cache this in case we want to re-use it cache = {store_flag : { str(store_url) : remote_size }} self.database.set_db(cache) return(remote_size) elif head == False: u.msg(f'code {r.status_code}') return remote_size # return it even if 0 return remote_size except: pass if head == False: u.msg(f'failed to connect') # give up remote_size = -2 # cache this in case we want to re-use it even if its -1 cache = {store_flag : { str(store_url) : remote_size }} self.database.set_db(cache) return remote_size u.msg(f'trying get') return u.st_size(head=False) def ping(self, head=True): ''' ping the URL data return True if response is 200 You should specify any required login/password with with_components(username=str,password=str) Returns: True if data available Or: False ''' u = self if u._isfile(): self.msg(f'{u} is not a URL: interpreting as Path') # not a URL u = Path(u) return u.exists() try: u.msg(f'trying {u}') if head: r = u.head() else: r = u.get() if type(r) == requests.models.Response: if r.status_code == 200: u.msg(f'code 200') return True if r.status_code == 401: u.msg(f'code 401') u.msg(f'trying another') # unauthorised # more complex session login and auth # e.g. needed for NASA Earthdata login u.msg(f'getting login') r = u._get_login(head=head) if r.status_code == 200: u.msg(f'code 200') return True elif head == False: u.msg(f'code {r.status_code}') return False except: pass if head == False: u.msg(f'failed to connect') return False u.msg(f'trying get') return u.ping(head=False) def read_bytes(self): ''' Open the URL data in bytes mode, read it and return the data This first tried self.get() but if the authorisation is more complex (e.g. when using NASA server) then a fuller 2-pass session is used. 
You should specify any required login/password with with_components(username=str,password=str) Returns: data from url Or: None : on failure requests.models.Response : on connection problem ''' if 'skipper' in self.__dict__: skipper = self.skipper else: skipper = False u = self store_url = str(u) store_flag = 'data' if u._isfile(): self.msg(f'{u} is not a URL: interpreting as Path') return Path(u).read_bytes() get_download,ifile,ofile = self._test_already_local() # get from ofile if ofile and Path(ofile).exists(): data = ofile.read_bytes() ofile = Path(ofile) cache = {store_flag : { str(store_url) : str(ofile) }} self.database.set_db(cache,write=True) return data # get from ifile if ifile and Path(ifile).exists(): ifile = Path(ifile) self.msg(f'opening already downloaded file {ifile}') data = ifile.read_bytes() if ofile: ofile = Path(ofile) self.check_path(ofile.parent) ofile.parent.mkdir(parents=True,exist_ok=True) ofile.write_bytes(data) cache = {store_flag : { str(store_url) : str(ofile) }} else: cache = {store_flag : { str(store_url) : str(ifile) }} self.database.set_db(cache,write=True) return data try: if not skipper: u.msg(f'trying {u}') r = u.get() if skipper or (type(r) == requests.models.Response): if (not skipper) and r.status_code == 200: u.msg(f'code {r.status_code}') data = r.content if ofile: ofile = Path(ofile) self.check_path(ofile.parent) ofile.parent.mkdir(parents=True,exist_ok=True) ofile.write_bytes(data) cache = {store_flag : { str(store_url) : str(ofile) }} self.database.set_db(cache,write=True) return data if skipper or (r.status_code == 401): if not skipper: u.msg(f'code {r.status_code}') u.msg(f'trying another') # unauthorised # more complex session login and auth # e.g. needed for NASA Earthdata login u.msg(f'getting login') r = u._get_login(head=False) if type(r) != requests.models.Response: return None if r.status_code == 200: u.msg(f'code {r.status_code}') data = r.content if ofile: ofile = Path(ofile) self.check_path(ofile.parent) ofile.parent.mkdir(parents=True,exist_ok=True) ofile.write_bytes(data) cache = {store_flag : { str(store_url) : str(ofile) }} self.database.set_db(cache,write=True) return data else: u.msg(f'code {r.status_code}') return r except: pass u.msg(f'failed to connect') return None def _convert_to_abs(self,ilist): # this is slow and may be not needed self.msg(f'parsing URLs from html file {len(ilist)} items') return [self.update(*[str(self),l.rstrip('/#')],**(fdict(self.__dict__.copy()))) for l in ilist ] def _filter(self,links,pattern,pre_filter=True): # pre-filter if pre_filter: links = np.array([str(l).rstrip('/#') for l in links]) matches = np.array([fnmatch.fnmatch(str(l), '*'+pattern) for l in links]) links = list(links[matches]) links = self._convert_to_abs(links) olist = [] try: p = self.done[pattern] except: try: self.done[pattern] = [] except: self.done = {pattern:[]} p = self.done[pattern] olist = [u for u in links if u not in p] self.done[pattern] = self.done[pattern] + olist return olist def has_wildness(self,uc): is_wild = np.logical_or(np.array(['*' in i for i in uc]), np.array(['?' in i for i in uc])) is_wild_2 = np.logical_or(np.array(['[' in i for i in uc]), np.array([']' in i for i in uc])) is_wild = np.logical_or(is_wild,is_wild_2) return is_wild def glob(self,pattern,pre_filter=True): ''' Iterate over this subtree and yield all existing files (of any kind, including directories) matching the given relative pattern. The URL here then needs to return lxml html code. Positional arguments: patterm : to search for e.g. 
*/2021.*.01 only wildcards * and ? considered at present ''' u = self url = str(u) if url[-1] == '/': url = urls[:-1] url = self.update(url,pattern) # check in database store_url = url store_flag = 'query' olist = self.database.get_from_db(store_flag,store_url) if olist is not None: if type(olist) is list: return [self.update(o) for o in olist] return [self.update(olist)] # start at the top uc = np.array(url.parts) for i,w in enumerate(uc[1:]): if i == 0: base_list = [self.update(uc[0])] new_list = [] for b in base_list: # set to new item glob = self.update(b)._glob(w,pre_filter=pre_filter) # glob with the next item new_list = new_list + glob base_list = np.unique(np.array(new_list,dtype=np.object).flatten()) base_list = np.unique(np.array(base_list,dtype=np.object)) olist = list(np.array([self.update(i) for i in base_list]).flatten()) self.dedate() for l in olist: l.init(**(fdict(self.__dict__.copy()))) # cache this in case we want to re-use it cache = {store_flag : { str(store_url) : [str(i) for i in olist] }} self.database.set_db(cache) if type(olist) is list: return [self.update(o) for o in olist] return [self.update(olist)] def rglob(self, pattern,pre_filter=True): ''' Recursively yield all existing files (of any kind, including directories) matching the given relative pattern, anywhere in this subtree. Positional arguments: patterm : to search for e.g. 2021.*.01 only wildcards * and ? considered at present ''' return self.glob(pattern,pre_filter=pre_filter) def flush(self): try: return self.database.set_db(self.database.database,write=True) except: return None def _glob(self, pattern,pre_filter=True): ''' Iterate over this subtree and yield all existing files (of any kind, including directories) matching the given relative pattern. The URL here then needs to return lxml html code. ''' # take off training slash if pattern[-1] == '/': pattern = pattern[:-1] store_url = str(self.update(pattern)) store_flag = 'query' if not self.noclobber: # dont trust cache response = None else: response = self.database.get_from_db(store_flag,store_url) if response: self.msg(f'got response from database for {store_url}') self.msg(f'discovered {len(response)} files with pattern {pattern} in {str(self)}') return [self.update(str(f)) for f in response] try: html = self.read_text() links = np.array([mylink.attrs['href'] for mylink in BeautifulSoup(html,'lxml').find_all('a')]) links = np.array(self._filter(links,pattern,pre_filter=pre_filter)) matches = np.array([fnmatch.fnmatch(str(l), '*'+pattern) for l in links]) files = list(links[matches]) except: files = [] self.msg(f'discovered {len(files)} files with pattern {pattern} in {str(self)}') files = [str(i) for i in files] # cache this in db cache = {store_flag : { str(store_url) : files }} self.database.set_db(cache) return files
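# Hedged sketch of using the URL class above, following the calling pattern in
# Modis.get_url(): build a URL from server and directory, glob for HDF files,
# then pull one file locally. The server path, date directory and tile pattern
# are illustrative only.
url = URL('https://e4ftl01.cr.usgs.gov', 'MOTA/MCD15A3H.006/2019.02.10',
          verbose=True, local_dir='work', db_dir='work')
hdf_urls = url.glob('MCD15A3H*.h17v03*.hdf')   # list of URL objects
if len(hdf_urls):
    data = hdf_urls[0].read_bytes()            # download (and cache) the file
    print(hdf_urls[0].local())                 # cached local file (.store suffix)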
class Modis(): ''' get MODIS datasets from the server ''' def __init__(self, **kwargs): kwargs['defaults'] = { 'store_msg' : [],\ 'database' : None,\ 'product' : 'MCD15A3H',\ 'tile' : 'h08v06',\ 'log' : None,\ 'day' : '01',\ 'doy' : None, 'month' : '*',\ 'sds' : None, 'year' : "2019",\ 'site' : 'https://e4ftl01.cr.usgs.gov',\ 'size_check' : False,\ 'noclobber' : True,\ 'local_dir' : 'work',\ 'local_file' : None,\ 'db_file' : None,\ 'db_dir' : 'work',\ 'verbose' : False,\ 'stderr' : sys.stderr } self.__dict__.update(ginit(self, **kwargs)) if 'database' in self.__dict__ and type(self.database) == Database: # already have databse stored pass else: self.database = Database(self.db_file,\ **(fdict(self.__dict__.copy(),ignore=['db_dir','db_file']))) self.translateoptions = gdal.TranslateOptions( gdal.ParseCommandLine("-of Gtiff -co COMPRESS=LZW")) # list of tiles if type(self.tile) is str: self.tile = [self.tile] if type(self.sds) is str: self.sds = [self.sds] def msg(self, *args): '''msg to self.stderr''' this = str(*args) try: # DONT REPEAT MESSAGES ... doesnt work as yet if this in self.store_msg: return self.store_msg.append(this) except: self.store_msg = [this] try: if self.verbose or (self.log is not None): print('-->', *args, file=self.stderr) except: pass def get_data(self, year, doy): '''return data dict for doy year as sds dictionary''' vfiles = self.stitch_date(year, doy) if (not vfiles) or (len(vfiles) and vfiles[0] == None): msg = f"WARNING: no datasets in get_data() for product {self.product} tile {self.tile} year {year} doy {doy}" print(msg) self.msg(msg) try: return dict(zip(self.sds, [[]] * len(self.sds))) except: return {None: None} if not self.sds: # recover from vfiles self.msg("trying to recover SDS from files") self.sds = [Path(i).name.split('.')[1] for i in vfiles] self.msg(self.sds) sds = [Path(i).name.split('.')[1] for i in vfiles] data = [] for i, s in enumerate(sds): g = gdal.Open(vfiles[i]) dataset = g.ReadAsArray() if dataset is None: msg = f"WARNING: no datasets in get_data() for {vfiles[i]}\n" +\ f"check datasets and database file {str(self.db_file)}" print(msg) self.msg(msg) data.append(dataset) return dict(zip(self.sds, data)) def read_data(self, ifile): g = gdal.Open(ifile) if not g: return None, None data = np.array([ g.GetRasterBand(i).ReadAsArray() for i in range(1, len(g.GetFileList())) ]) b = g.GetRasterBand(1) return data, (b.GetScale(), b.GetOffset()) def get_year(self, year, step=4): '''create vrt dataset of all images for a year''' year = int(year) self.year = f'{year}' ayear = (datetime.datetime(year + 1, 1, 1) - datetime.datetime(year, 1, 1)).days sfiles = {} bandlist = [] for i, s in enumerate(self.sds): ofiles = [] bandlist = [] for doy in range(1, ayear + 1, step): ifiles = self.stitch_date(year, doy) if (not ifiles) or (len(ifiles) and ifiles[0] == None): # no dataset try: # repeat last for now self.msg( 'no dataset for sds {s} for dataset {i}: using filler' ) this = ofiles[-1] bthis = 'filler ' + bandlist[-1] except: this = None else: this = ifiles[i] bthis = f'{str(i):0>2s}' if this: bandlist.append(bthis) ofiles.append(this) if len(ofiles): ofile = f"data.{self.sds[i]}.{'_'.join(self.tile)}.{self.year}.vrt" ofile = ofile.replace(' ', '_') spatial_file = Path(f"{self.local_dir[0]}", ofile) g = gdal.BuildVRT(spatial_file.as_posix(), ofiles, separate=True) try: g.FlushCache() except: pass if not g: d = self.__dict__ print( f"problem building dataset for {spatial_file} with {fdict(d)}" ) del g sfiles[s] = spatial_file sfiles[s + '_name'] = bandlist 
return sfiles, bandlist def test_ok(self, hdffile, dosubs=True): '''sanity check on file''' if not Path(hdffile).exists(): msg = f'test: file {hdffile} does not exist' self.msg(msg) return False g = gdal.Open(hdffile) if not g: msg = f'test: file {hdffile} failed to open with gdal' self.msg(msg) del g return False # check referenced files if dosubs: for f in g.GetFileList(): # dont do too much recursion if not self.test_ok(f, dosubs=False): return False data = g.ReadAsArray(xsize=1, ysize=1) if data is None: msg = f'test: file {hdffile} failed: None returned in read ' self.msg(msg) del g return False return True def stitch_date(self, year, doy): '''stitch data for date''' year = int(year) doy = int(doy) dater = (datetime.datetime(year, 1, 1) +\ datetime.timedelta(doy - 1)).strftime('%Y %m %d').split() self.year = f'{year}' self.month = f'{str(int(dater[1])) :0>2s}' self.day = f'{str(int(dater[2])) :0>2s}' d = self.__dict__.copy() hdf_urls = self.get_url(**(fdict(d))) if not (len(hdf_urls) and (type(hdf_urls[0]) == URL)): return [None] if 'db_file' in self.__dict__: if 'database' not in self.__dict__: # load database d = self.__dict__.copy() self.database = Database( self.db_file, **(fdict(d, ignore=['db_dir', 'db_file']))) # look up in db this_set = f"{self.product}.{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}" store_flag = 'modis' response = self.database.get_from_db(store_flag, this_set) if response and self.noclobber: # test if self.test_ok(response[0]): # safe to return self.msg(f'positive response from database') ofiles = response return ofiles else: msg = f'WARNING: invalid entry {response[0]} in database {str(self.db_file)}' print(msg) self.msg(msg) for f in hdf_urls: d = f.read_bytes() hdf_files = [str(f.local()) for f in hdf_urls] sds = self.get_sds(hdf_files, do_all=True) ofiles = [] if len(sds) > len(self.sds): self.msg(f"ERROR in product {self.product} specification of SDS") self.msg(f"all SDS claimed to be: {len(self.sds)}") self.msg(self.sds) self.msg(f"But request for {len(sds)} SDSs made") self.msg(sds) sys.exit(1) for i, sd in enumerate(sds): ofile = f"data.{self.sds[i]}." + \ f"{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}.vrt" ofile = ofile.replace(' ', '_') spatial_file = Path(f"{self.local_dir[0]}", ofile) g = gdal.BuildVRT(spatial_file.as_posix(), sds[i]) if not g: d = self.__dict__ print( f"problem building dataset for {spatial_file} with {fdict(d)}" ) sys.exit(1) del g ofiles.append(Path(spatial_file).absolute().as_posix()) # store in db cache = {store_flag: {this_set: ofiles}} self.database.set_db(cache, write=True) return ofiles #def get_files(self,**kwargs): # hdf_urls = self.get_url(**kwargs) # hdf_files = [f.local() for f in hdf_urls] # return hdf_files def has_wildness(self, uc): is_wild = np.logical_or(np.array(['*' in i for i in uc]), np.array(['?' in i for i in uc])) is_wild_2 = np.logical_or(np.array(['[' in i for i in uc]), np.array([']' in i for i in uc])) is_wild = np.logical_or(is_wild, is_wild_2) return is_wild def get_url(self, **kwargs): ''' Get URL object list for NASA MODIS products for the specified product, tile, year, month, day Keyword Arguments: verbose: bool site : str product : str e.g. 'MCD15A3H' tile : str e.g. 
'h08v06' year : str valid 2000-present month : str 01-12 day : str 01-(28,29,30,31) ''' site = ('site' in kwargs and kwargs['site']) or 'https://e4ftl01.cr.usgs.gov' product = ('product' in kwargs and kwargs['product']) or self.product tile = ('tile' in kwargs and kwargs['tile']) or self.tile day = ('day' in kwargs and kwargs['day']) or self.day month = ('month' in kwargs and kwargs['month']) or self.month year = ('year' in kwargs and kwargs['year']) or self.year doy = ('doy' in kwargs and kwargs['doy']) or self.doy if product[:5] == "MOD10" or product[:5] == "MYD10": # NSIDC site = "https://n5eil01u.ecs.nsidc.org" self.msg(f"Snow and ice product {product}") self.msg(f"switching to server {site}") if product[:3] == "MOD": code = "MOST" elif product[:3] == "MYD": code = "MOSA" else: code = "MOTA" self.msg(f"product {product} -> code {code}") # special cases #if self.product[:5] == 'MCD19': # self.site = 'https://ladsweb.modaps.eosdis.nasa.gov' # you should put some tests in site_dir = f'{code}/{product}.006/{year}.{month}.{day}' if site == 'https://ladsweb.modaps.eosdis.nasa.gov': if self.doy is None: try: doy = (datetime.datetime(year+1, 1, 1) - \ datetime.datetime(year=int(year),month=int(month),day=int(day))).days except: self.verbose = True self.msg( f"ERROR: you need to specify doy explicitly for product {self.product}" ) sys.exit(1) site_dir = f'archive/allData/6/{product}/{year}/{doy}' site_file = f'*.{tile}*.hdf' kwargs = {"verbose" : self.verbose,\ "full_url" : True,\ "noclobber" : self.noclobber,\ "db_dir" : self.db_dir,\ "db_file" : self.db_file,\ "log" : self.log,\ "size_check" : self.size_check,\ "local_file" : self.local_file,\ "local_dir" : self.local_dir } hdf_urls = [] url = None for t in self.tile: url = ((url is None) and URL(site,site_dir,**kwargs)) or \ url.update(site,site_dir,**kwargs) hdf_urls += url.glob(f'{self.product}*.{t}*.hdf') if len(hdf_urls) == 0: return [None] self.db_file = hdf_urls[0].db_file return hdf_urls def sdscode(self, s1): '''PITA decoding of SDS from HDF field that comes from s0,s1 in g.GetSubDatasets()''' return (' '.join(s1.split()[1:-3])).split( self.product)[0].split('MOD')[0].strip() def get_sds(self, hdf_files, do_all=False): '''get defined SDS or all''' if type(hdf_files) is not list: hdf_files = [hdf_files] if len(hdf_files) < 1: return [] lfile = hdf_files[0] g = gdal.Open(str(lfile)) if not g: return [] # in case not defined if do_all or ((self.sds is None) or len(self.sds) == 0 or \ ((len(self.sds) == 1) and len(self.sds[0]) == 0)) : self.msg("trying to get SDS names") self.sds = [self.sdscode(s1) for s0, s1 in g.GetSubDatasets()] self.msg(self.sds) all_subs = [(s0.replace(str(lfile), '{local_file}'), s1) for s0, s1 in g.GetSubDatasets()] this_subs = [] for sd in self.sds: this_subs += [s0 for s0, s1 in all_subs if sd == self.sdscode(s1)] return [[sub.format(local_file=str(lfile)) for lfile in hdf_files] for sub in this_subs]
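# Hedged sketch for the simpler Modis class above: stitch one date, then build
# per-SDS yearly VRT stacks with get_year(). Product, tile and SDS values are
# illustrative; get_year() returns a dict of per-SDS VRT files and a band-name list.
modis = Modis(product='MCD15A3H', tile='h08v06', sds=['Lai_500m'])
ofiles = modis.stitch_date(2019, 41)           # per-SDS VRTs for one date
sfiles, bandnames = modis.get_year(2019, step=4)
print(sfiles['Lai_500m'])
print(len(bandnames))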