def search_files(self, decadals=None, project="baseline1", firstyear=None,
                 lastyear=None, product="*", time_frequency='6hr',
                 model="mpi-esm-lr", ensembles="*", experiment=None,
                 variable="ta", institute="MPI-M", realm="atmos",
                 driving_model=None, rcm_ensemble=None, domain=None,
                 find_variables=False):
    """
    use solr_search to find the files needed by the plugin

    set find_variables=True to get a list of available variables instead of
    searching files
    """
    # convert some arguments
    # if '*' in ensembles:
    #     Logger.Error('All ensembles not allowed\n'
    #                  'Please select one', -1)
    # `ensembles` is expected to be a comma separated string
    # (e.g. "r1i1p1,r2i1p1") or the wildcard "*"
    if ensembles != "*":
        ensembles = ensembles.split(',')
    else:
        ensembles = [ensembles]
    if decadals is not None:
        # materialise as a list because it is iterated more than once below
        years = [int(year) for year in decadals]
    else:
        years = None
    if firstyear is not None:
        firstyear = int(firstyear)
    if lastyear is not None:
        lastyear = int(lastyear)

    # use solr_search to get the input files ----------------------------------
    # construct a search string for experiments
    experiment_prefix = experiment
    if experiment_prefix is None or experiment_prefix == "*":
        if project.lower() == 'baseline0':
            experiment_prefix = 'decadal'
        elif project.lower() == 'baseline1':
            experiment_prefix = 'decs4e'
        elif project.lower() == 'prototype':
            experiment_prefix = 'dffs4e'
        elif project.lower() == 'cmip5':
            experiment_prefix = 'decadal'
        elif project.lower() == 'historical':
            experiment_prefix = 'historical'
        else:
            experiment_prefix = '*'
    # if not experiment_prefix.endswith("*"):
    #     experiment_prefix += "*"

    # compose solr_search arguments
    ssargs = {}
    ssargs["project"] = project
    ssargs["institute"] = institute
    ssargs["realm"] = realm
    if type(variable) == list:
        variables = variable
    else:
        variables = [variable]
    ssargs["time_frequency"] = time_frequency

    # are there products in this project?
    product_facets = SolrFindFiles.facets(facets=["product"], **ssargs)
    if len(product_facets["product"]) > 0:
        ssargs["product"] = product

    # are there models in this project?
    model_facets = SolrFindFiles.facets(facets=["model"], **ssargs)
    if len(model_facets["model"]) > 0:
        ssargs["model"] = model

    # are there multiple experiments?
    experiment_facets = SolrFindFiles.facets(facets=["experiment"], **ssargs)
    if len(experiment_facets["experiment"]) > 0:
        ssargs["experiment"] = experiment_prefix

    # additional parameters for regional models
    if rcm_ensemble is not None:
        rcm_ensemble_facets = SolrFindFiles.facets(facets=["rcm_ensemble"],
                                                   **ssargs)
        if len(rcm_ensemble_facets["rcm_ensemble"]) > 0:
            ssargs["rcm_ensemble"] = rcm_ensemble
    if driving_model is not None:
        driving_model_facets = SolrFindFiles.facets(facets=["driving_model"],
                                                    **ssargs)
        if len(driving_model_facets["driving_model"]) > 0:
            ssargs["driving_model"] = driving_model
    if domain is not None:
        domain_facets = SolrFindFiles.facets(facets=["domain"], **ssargs)
        if len(domain_facets["domain"]) > 0:
            ssargs["domain"] = domain

    # search variables instead of files?
    if find_variables:
        variable_facets = SolrFindFiles.facets(facets=["variable"], **ssargs)
        return variable_facets["variable"]

    # put all files into a list
    self.inputfiles = []
    if years is not None or firstyear is not None or lastyear is not None:
        self.inputfilesByDecade = {}
        # we have multiple experiments that contain the decade
        if "experiment" in ssargs and project != "observations" \
                and project != "reanalysis" and years is not None:
            for year in years:
                yearfiles = []
                ssargs["experiment"] = "%s%d" % (experiment_prefix, year)
                for ens in ensembles:
                    if ens != "*":
                        ssargs["ensemble"] = ens
                    for onefile in solr_search_multivar(variables, ssargs):
                        self.inputfiles.append(onefile)
                        yearfiles.append(onefile)
                self.inputfilesByDecade[year] = yearfiles
        # we have only one experiment: fetch all files and filter them by year
        elif firstyear is not None or lastyear is not None:
            for ens in ensembles:
                if ens != "*":
                    ssargs["ensemble"] = ens
                for onefile in solr_search_multivar(variables, ssargs):
                    starttime, endtime = get_start_and_end_time_from_DRSFile(
                        onefile, include_str=False)
                    if firstyear is None and \
                            any(e <= lastyear
                                for e in range(starttime.year, endtime.year + 1)):
                        self.inputfiles.append(onefile)
                    elif lastyear is None and \
                            any(e >= firstyear
                                for e in range(starttime.year, endtime.year + 1)):
                        self.inputfiles.append(onefile)
                    elif any(e in range(starttime.year, endtime.year + 1)
                             for e in range(firstyear, lastyear + 1)):
                        self.inputfiles.append(onefile)
        else:
            for ens in ensembles:
                if ens != "*":
                    ssargs["ensemble"] = ens
                for onefile in solr_search_multivar(variables, ssargs):
                    starttime, endtime = get_start_and_end_time_from_DRSFile(
                        onefile, include_str=False)
                    for year in years:
                        if starttime.year > year and endtime.year <= year + 10:
                            self.inputfiles.append(onefile)
                            if year not in self.inputfilesByDecade:
                                self.inputfilesByDecade[year] = [onefile]
                            else:
                                self.inputfilesByDecade[year].append(onefile)
                            break
    # we want all files, not only those for a special decade
    else:
        self.inputfilesByDecade = None
        for ens in ensembles:
            if ens != "*":
                ssargs["ensemble"] = ens
            for onefile in solr_search_multivar(variables, ssargs):
                self.inputfiles.append(onefile)

    # nothing found? cancel!
    if len(self.inputfiles) == 0:
        Logger.Error(
            "No input files found!\n"
            "Data-Browser command:\t"
            "freva --databrowser project='%s' product='%s' institute='%s' "
            "model='%s' experiment='%s' time_frequency='%s' realm='%s' "
            "variable='%s'"
            % (project, product, institute, model, experiment,
               time_frequency, realm, variable), -1)

    # change the time part if only a single lead year is of interest, or
    # remove files that do not belong to the requested lead year
    # check for overlapping time-periods within the same folder
    self.inputfiles = self.remove_overlapping_time_periods_from_file_list(
        self.inputfiles)
    # check if all ensemble members have the same number of files
    self.inputfiles = self.check_ensemble_completeness(self.inputfiles)
    # repair some known special cases
    apply_workarounds_for_path(self.inputfiles)
    # merge multiple variables
    merged_by_var = self.merge_multiple_variables(self.inputfiles)
    if len(merged_by_var) == 0:
        Logger.Error(
            "no files found for different variables and same time steps!")
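
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the decade
# assignment above keeps a file for starting year `year` when its time axis
# starts after `year` and ends no later than `year + 10`. A minimal,
# self-contained version of that test, assuming the start and end years are
# already known as integers; `_belongs_to_decade` is a placeholder name used
# only for this example.
def _belongs_to_decade(start_year, end_year, decadal_start_year):
    """Return True if [start_year, end_year] lies within the decade
    (decadal_start_year, decadal_start_year + 10]."""
    return (start_year > decadal_start_year
            and end_year <= decadal_start_year + 10)

# Example: a file covering 1961-1970 belongs to the 1960 decadal experiment,
# a file covering 1971-1980 does not.
assert _belongs_to_decade(1961, 1970, 1960)
assert not _belongs_to_decade(1971, 1980, 1960)
# ---------------------------------------------------------------------------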
def getFiles(self, year, fileType, model, variable, time_frequency='mon',
             product='*', ensemblemembers='*', institute='*', exp_prefix='d*',
             maxleadtime=10, minLeadtime=1):
    """
    Method to get model files with solr_search.

    :param year: decadal starting year
    :param fileType: baseline1, cmip5, historical or ...
    :param model: model name, i.e. MPI-ESM-LR
    :param variable: CMOR variable
    :param time_frequency: monthly, yearly, daily and so on
    :return: list with all ensemble members found
    """
    # TODO: BUGFIX for minLeadyear
    minLeadtime = 1

    output = list()
    decStr = exp_prefix + str(year)
    project = fileType.lower()
    tmpList = list()
    for fn in SolrFindFiles.search(experiment=decStr, latest_version=True,
                                   product=product, institute=institute,
                                   variable=variable,
                                   time_frequency=time_frequency,
                                   model=model, project=project):
        if str(fn).split('.')[-1] == 'nc':
            tmpList.append(str(fn))

    # nothing found? wait a moment and retry the same search once
    if not tmpList:
        import time
        time.sleep(5)  # delays for 5 seconds
        for fn in SolrFindFiles.search(experiment=decStr, latest_version=True,
                                       product=product, institute=institute,
                                       variable=variable,
                                       time_frequency=time_frequency,
                                       model=model, project=project):
            if str(fn).split('.')[-1] == 'nc':
                tmpList.append(str(fn))

    if not tmpList:
        if exp_prefix.find('*') != -1:
            raise NoFilesFoundError(
                "Couldn't find files for %s in %s %s %s experiment: %s"
                % (variable, fileType, model, product, year))
        # OK, we can't find files, now try one last time using only the
        # exp_prefix, i.e. "historical"
        decStr = exp_prefix
        for fn in SolrFindFiles.search(experiment=exp_prefix,
                                       latest_version=True, product=product,
                                       institute=institute, variable=variable,
                                       time_frequency=time_frequency,
                                       model=model, project=project):
            if str(fn).split('.')[-1] == 'nc':
                tmpList.append(str(fn))
        if not tmpList:
            # OK, there are no files...
            raise NoFilesFoundError(
                "Couldn't find files for %s in %s %s %s experiment: %s"
                % (variable, fileType, model, product, year))

    # Check if we have time-splitted files
    time_values = SolrFindFiles.facets(facets='time', experiment=decStr,
                                       latest_version=True, product=product,
                                       institute=institute, variable=variable,
                                       time_frequency=time_frequency,
                                       model=model, project=project)
    if len(time_values['time']) > 1:
        tmpList = self.mergeSplittedFiles(tmpList)

    # select only wanted ensemble members
    if type(ensemblemembers) == list and ensemblemembers[0] != '*':
        ensList = list()
        for ens in ensemblemembers:
            onlyfiles = [f for f in tmpList if f.lower().find(ens) != -1]
            if len(onlyfiles) > 0:
                ensList = ensList + onlyfiles
            else:
                raise EnsembleMemberError(
                    "Ensemble member %s not found for %s %s %s for starting year %s"
                    % (ens, fileType, model, product, year))
        tmpList = ensList

    for fn in tmpList:
        years = cdo.showyear(input=str(fn))[0]
        yearList = years.split(' ')
        if str(year + minLeadtime) not in yearList \
                or str(year + maxleadtime) not in yearList:
            raise NotEnoughYearsInFile(
                "1Not enough years in %s %s %s for starting year %s"
                % (fileType, model, product, year))
        selStr = ','.join(map(str, range(year + minLeadtime,
                                         year + 1 + maxleadtime)))
        fileName = str(fn).split('/')[-1]
        output.append(cdo.selyear(
            selStr, input=str(fn),
            output=self.tmpDir + fileName + self.getRandomStr() + '_'
                   + str(year + minLeadtime) + '-' + str(year + maxleadtime),
            options='-f nc'))
        if len(cdo.showyear(input=output[-1])[0].split(' ')) \
                < maxleadtime - minLeadtime:
            raise NotEnoughYearsInFile(
                "2Not enough years in %s %s %s for starting year %s"
                % (fileType, model, product, year))

    if not output or not isinstance(output, list):
        raise NoFilesFoundError(
            "Couldn't find files for %s in %s %s %s for starting year %s"
            % (variable, fileType, model, product, year))

    # check for curvilinear grid
    if not hasattr(self, 'curvilinearGrid') or self.curvilinearGrid == True:
        output = self.checkGrid(output, model)

    # user wants to select levels
    if self.level is not None:
        return self.selectLevel(output)
    else:
        return output
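
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): how the lead-year
# selection string passed to `cdo selyear` above is composed for a given
# decadal starting year. Plain Python, no CDO needed; `_lead_year_selection`
# is a placeholder name used only for this example.
def _lead_year_selection(start_year, min_leadtime=1, max_leadtime=10):
    """Comma separated list of lead years start_year+min .. start_year+max."""
    return ','.join(str(y) for y in range(start_year + min_leadtime,
                                          start_year + max_leadtime + 1))

# Example: the 1960 starting year with lead times 1..10 selects 1961-1970.
sel = _lead_year_selection(1960)
assert sel.startswith('1961') and sel.endswith('1970')
assert len(sel.split(',')) == 10
# ---------------------------------------------------------------------------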
def getReanalysis(self, year, fileType, experiment, variable, filePath='',
                  time_frequency='mon', maxLeadtime=10,
                  observation_ensemble='*', minLeadtime=1):
    """
    Wrapper method to find reanalysis file with solr_search.

    :param year: startyear
    :param fileType: reanalysis or observation
    :param experiment: i.e. NCEP, HadCrut or MERRA
    :param variable: CMOR variable
    :param time_frequency: monthly, yearly, daily and so on
    :return: "decadal" file with observations
    """
    # TODO: BUGFIX for minLeadyear
    minLeadtime = 1

    reanFiles = list()
    if experiment == 'HadCrut' and variable == 'tas':
        return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)
    # to use your own reanalysis data
    if os.path.isfile(self.observation):
        return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime,
                                minLeadtime=minLeadtime)

    if not hasattr(self, 'mergedReanFile'):
        # Observation or reanalysis?
        facet = SolrFindFiles.facets(facets='data_type', experiment=experiment,
                                     variable=variable,
                                     time_frequency=time_frequency)
        try:
            if 'reanalysis' in facet['data_type']:
                searchList = SolrFindFiles.search(
                    project=['reanalysis', 'observations'],
                    experiment=experiment, variable=variable,
                    time_frequency=time_frequency,
                    ensemble=observation_ensemble)
            else:
                searchList = SolrFindFiles.search(
                    project=['reanalysis', 'observations'],
                    experiment=experiment, variable=variable,
                    time_frequency=time_frequency, product='grid',
                    ensemble=observation_ensemble)
        except IndexError:
            raise NoFilesFoundError("Couldn't find files for %s in %s"
                                    % (variable, experiment))

        for fn in searchList:
            yearTmp = cdo.showyear(input=str(fn))[0]
            fname = str(fn).split('/')[-1]
            reanFiles.append(str(fn))
            # if more than one year is in the file we break the loop and
            # expect it to be an observations file
            if len(yearTmp.split(' ')) > 1:
                break
        if len(reanFiles) == 0:
            raise NoFilesFoundError("Couldn't find files for %s in %s"
                                    % (variable, experiment))

        mergedFile = cdo.mergetime(input=' '.join(reanFiles),
                                   output=self.tmpDir + 'mergedREAN_YEARMEAN')
        tmpMean = cdo.timmean(input=mergedFile)
        self.mergedReanFile = cdo.sub(input=' '.join([mergedFile, tmpMean]),
                                      output=self.tmpDir + 'reananomalies.nc')
        if self.level is not None:
            self.mergedReanFile = self._selectLevel(self.mergedReanFile)

    if not hasattr(self, 'mergedReanFile'):
        raise NoFilesFoundError("Couldn't find files for %s in %s"
                                % (variable, experiment))

    years = cdo.showyear(input=self.mergedReanFile)[0]
    if years.find(str(year + minLeadtime)) != -1 \
            and years.find(str(year + maxLeadtime)) != -1:
        # create tmp decadal file
        fileStr = ','.join(map(str, range(year + minLeadtime,
                                          year + maxLeadtime + 1)))
        tmp = cdo.selyear(fileStr, input=self.mergedReanFile,
                          output=self.tmpDir + 'reanalysis_' + experiment
                                 + str(year + 1) + '-'
                                 + str(year + maxLeadtime) + '.nc',
                          options='-f nc')
        return tmp
    else:
        raise NotEnoughYearsInFile(
            "%s-%s are not part of %s reanalysis"
            % (year + minLeadtime, year + maxLeadtime, experiment))
def getReanalysis(self, year, fileType, experiment, variable, filePath='',
                  time_frequency='mon', maxLeadtime=10):
    '''
    Wrapper method to find reanalysis file with solr_search.

    :param year: startyear
    :param fileType: reanalysis or observation
    :param experiment: i.e. NCEP, HadCrut or MERRA
    :param variable: CMOR variable
    :param time_frequency: monthly, yearly, daily and so on
    :return: "decadal" file with observations
    '''
    reanFiles = list()
    if experiment == 'HadCrut' and variable == 'tas':
        return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)
    # to use your own reanalysis data
    if os.path.isfile(self.observation):
        return self.getObsFiles(variable, year, maxLeadtime=maxLeadtime)

    if not hasattr(self, 'mergedReanFile'):
        # Observation or reanalysis?
        facet = SolrFindFiles.facets(facets='data_type', experiment=experiment,
                                     variable=variable,
                                     time_frequency=time_frequency)
        try:
            if facet['data_type'][0] == 'reanalysis':
                searchList = SolrFindFiles.search(
                    data_type=['reanalysis', 'observations'],
                    experiment=experiment, variable=variable,
                    time_frequency=time_frequency)
            else:
                searchList = SolrFindFiles.search(
                    data_type=['reanalysis', 'observations'],
                    experiment=experiment, variable=variable,
                    time_frequency=time_frequency, data_structure='grid')
        except IndexError:
            raise NoFilesFoundError("Couldn't find files for %s in %s"
                                    % (variable, experiment))

        for fn in searchList:
            yearTmp = cdo.showyear(input=str(fn))[0]
            fname = str(fn).split('/')[-1]
            # reanFiles.append(cdo.yearmean(input=str(fn),
            #                               output=self.tmpDir+fname+'_YEARMEAN'))
            reanFiles.append(str(fn))
            # if more than one year is in the file we break the loop and
            # expect it to be an observations file
            if len(yearTmp.split(' ')) > 1:
                break
        if len(reanFiles) == 0:
            raise NoFilesFoundError("Couldn't find files for %s in %s"
                                    % (variable, experiment))

        mergedFile = cdo.mergetime(input=' '.join(reanFiles),
                                   output=self.tmpDir + 'mergedREAN_YEARMEAN')
        tmpMean = cdo.timmean(input=mergedFile)
        self.mergedReanFile = cdo.sub(input=' '.join([mergedFile, tmpMean]),
                                      output=self.tmpDir + 'reananomalies.nc')
        # print self.mergedReanFile
        if self.level is not None:
            self.mergedReanFile = self._selectLevel(self.mergedReanFile)
        # print self.mergedReanFile

    if not hasattr(self, 'mergedReanFile'):
        raise NoFilesFoundError("Couldn't find files for %s in %s"
                                % (variable, experiment))

    years = cdo.showyear(input=self.mergedReanFile)[0]
    if years.find(str(year + 1)) != -1 \
            and years.find(str(year + maxLeadtime)) != -1:
        # create tmp decadal file
        fileStr = ','.join(map(str, range(year + 1, year + maxLeadtime + 1)))
        tmp = cdo.selyear(fileStr, input=self.mergedReanFile,
                          output=self.tmpDir + 'reanalysis_' + experiment
                                 + str(year + 1) + '-'
                                 + str(year + maxLeadtime) + '.nc')
        return tmp
    else:
        raise NotEnoughYearsInFile(
            "%s-%s are not part of %s reanalysis"
            % (year + 1, year + maxLeadtime, experiment))
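
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): both getReanalysis
# variants turn the merged reanalysis into anomalies by subtracting its time
# mean (cdo timmean followed by cdo sub). A minimal numpy version of the same
# operation, assuming a (time, lat, lon) array; `_to_anomalies` is a
# placeholder name used only for this example.
import numpy as np

def _to_anomalies(field):
    """Subtract the temporal mean from a (time, lat, lon) array."""
    return field - field.mean(axis=0, keepdims=True)

# Example with random data: the time mean of the anomalies is ~0 everywhere.
data = np.random.rand(12, 4, 5)
anom = _to_anomalies(data)
assert np.allclose(anom.mean(axis=0), 0.0)
# ---------------------------------------------------------------------------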
def _run(self):
    args = self.args
    last_args = self.last_args

    # Are we searching for facets or files?
    facets = []
    if args.all_facets:
        facets = None
    if args.facet:
        facets.append(args.facet)

    latest = not args.multiversion
    batch_size = args.batch_size if args.batch_size else 10

    search_dict = {}
    # construct search_dict by looping over last_args
    for arg in last_args:
        if '=' not in arg:
            raise CommandError("Invalid format for query: %s" % arg)
        items = arg.split('=')
        # re-join with '=' so values that themselves contain '=' stay intact
        key, value = items[0], '='.join(items[1:])
        if key not in search_dict:
            search_dict[key] = value
        else:
            if not isinstance(search_dict[key], list):
                search_dict[key] = [search_dict[key]]
            search_dict[key].append(value)

    if 'version' in search_dict and latest:
        # it makes no sense to look for a specific version just among the
        # latest; the speedup is marginal and it might not be what the user
        # expects
        sys.stderr.write(
            'Turning latest off when searching for a specific version.\n')
        latest = False

    logging.debug("Searching dictionary: %s\n", search_dict)
    # exit()
    # flush stderr in case we have something pending
    sys.stderr.flush()

    if facets != [] and not args.attributes:
        if 'facet.limit' in search_dict:
            facet_limit = int(search_dict['facet.limit'])
        else:
            # default
            facet_limit = 1000
            search_dict['facet.limit'] = -1
        for att, values in SolrFindFiles.facets(facets=facets,
                                                latest_version=latest,
                                                **search_dict).items():
            # values come in pairs: (value, count)
            value_count = len(values) // 2
            if args.relevant_only and value_count < 2:
                continue
            if args.count_facet_values:
                sys.stdout.write('%s: %s' % (att, ','.join(
                    ['%s (%s)' % (v, c)
                     for v, c in zip(*[iter(values)] * 2)])))
            else:
                sys.stdout.write('%s: %s' % (att, ','.join(values[::2])))
            if value_count == facet_limit:
                sys.stdout.write('...')
            sys.stdout.write('\n')
            sys.stdout.flush()
    elif args.attributes:
        # select all if none is defined but this flag was set
        if not facets:
            facets = None
        results = SolrFindFiles.facets(facets=facets, latest_version=latest,
                                       **search_dict)
        if args.relevant_only:
            atts = ', '.join([k for k in results if len(results[k]) > 2])
        else:
            atts = ', '.join(results)
        sys.stdout.write(atts)
        sys.stdout.write('\n')
        sys.stdout.flush()
    else:
        # find the files and display them
        for f in SolrFindFiles.search(batch_size=batch_size,
                                      latest_version=latest, **search_dict):
            sys.stdout.write(str(f))
            sys.stdout.write('\n')
            sys.stdout.flush()
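
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the query parsing in
# _run turns repeated "key=value" command-line arguments into a dictionary
# whose values become lists once a key appears more than once. A minimal,
# self-contained version of that logic; `parse_query` is a placeholder name
# and ValueError stands in for the CommandError used above.
def parse_query(args):
    """Parse key=value strings into a search dict with multi-value support."""
    search_dict = {}
    for arg in args:
        if '=' not in arg:
            raise ValueError("Invalid format for query: %s" % arg)
        key, _, value = arg.partition('=')
        if key not in search_dict:
            search_dict[key] = value
        elif isinstance(search_dict[key], list):
            search_dict[key].append(value)
        else:
            search_dict[key] = [search_dict[key], value]
    return search_dict

# Example: a repeated key collapses into a list of values.
assert parse_query(['model=MPI-ESM-LR', 'variable=tas', 'variable=pr']) == \
    {'model': 'MPI-ESM-LR', 'variable': ['tas', 'pr']}
# ---------------------------------------------------------------------------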