def update_datasets_status():
    """Update status flag for all datasets.

    Notes
        - This func is used to fix inconsistencies.
        - This func doesn't handle the 'latest' flag.
    """
    sdlog.info("SYDDFLAG-186","Update status for all datasets")

    datasets=sddatasetdao.get_datasets()
    update_datasets_status_HELPER(datasets)
def update_datasets_status():
    """Update status flag for all datasets.

    Notes
        - This func is used to fix inconsistencies.
        - This func doesn't handle the 'latest' flag.
    """
    sdlog.info("SYDDFLAG-186", "Update status for all datasets")

    datasets = sddatasetdao.get_datasets()
    update_datasets_status_HELPER(datasets)
def update_complete_datasets_status():
    """Update status flag for datasets with complete status.

    Notes
        - This func is used to fix inconsistencies, when dataset have complete
          status, but some of its files are not 'done' yet.
        - This func doesn't handle the 'latest' flag.
    """
    sdlog.info("SYDDFLAG-184","Update complete datasets status")

    complete_datasets=sddatasetdao.get_datasets(status=sdconst.DATASET_STATUS_COMPLETE)
    update_datasets_status_HELPER(complete_datasets)
def PROC0001(): """Print obsolete versions of datasets. Notes - use shell expansion pattern in the path, as dataset can be split over the two product output1 and output2, and can also be in output !! - basic algo using get_datasets() method - also see PROC0005 """ for d in sddatasetdao.get_datasets(): datasetVersions=sddatasetquery.get_dataset_versions(d,True) # retrieves all the versions of the dataset if not datasetVersions.ismostrecentversionnumber(d.version): # basic test (for smarter version selection, use PROC0005 which use getoldversionsdatasets()) print d.get_full_local_path('output{,1,2}')
def set_model_when_empty():
    """Fix B0025 bug.

    Extracts the model name from the dataset name and stores it back in the
    dataset, raising SDException on a dataset name that doesn't match the
    expected path layout.
    """
    datasets=sddatasetdao.get_datasets()
    for d in datasets:

        # sample => MOHC/HadGEM2-ES/rcp26/day/atmos/day/r1i1p1/v20110524
        # (model is the second path component)
        m=re.search('^[^/]*/([^/]*)/.*$',d.name)

        if m!=None:
            model=m.group(1)

            # NOTE(review): 'd.model(model)' calls 'model' as a method; if
            # 'model' is a plain attribute this should be 'd.model=model' --
            # TODO confirm against the dataset class.
            d.model(model)

            sdrebuildquery.update_dataset(d)
        else:
            raise SDException("SDREBUIL-120","incorrect dataset format (%s)"%d.getName())

        # display progress
        SDProgressDot.print_char(".")
def update_complete_datasets_status():
    """Update status flag for datasets with complete status.

    Notes
        - This func is used to fix inconsistencies, when dataset have complete
          status, but some of its files are not 'done' yet.
        - This func doesn't handle the 'latest' flag.
    """
    sdlog.info("SYDDFLAG-184", "Update complete datasets status")

    complete_datasets = sddatasetdao.get_datasets(
        status=sdconst.DATASET_STATUS_COMPLETE)
    update_datasets_status_HELPER(complete_datasets)
def update_incomplete_datasets_status():
    """Set status flag for datasets with incomplete status.

    When removing error and waiting transfers (e.g. with 'synda reset' func),
    the dataset status become incorrect (i.e. it remains on 'empty' or
    'in-progress', while all transfers are now 'done'). This func fix this
    problem.

    Notes
        - This func doesn't handle the 'latest' flag.
        - This func is quite the same as 'update_complete_datasets_status'
          func, but is faster as it doesn't processes complete dataset (which
          are the largest part of all the datasets).

    TODO
        Also handle the 'latest' flag in this func
    """
    sdlog.info("SYDDFLAG-182", "Update incomplete datasets status")

    incomplete_datasets = sddatasetdao.get_datasets(
        status=sdconst.DATASET_STATUS_EMPTY) + sddatasetdao.get_datasets(
        status=sdconst.DATASET_STATUS_IN_PROGRESS)
    update_datasets_status_HELPER(incomplete_datasets)
def update_datasets__status_and_latest(): """ Set status and latest flag for all datasets. Return value Returns how many datasets have been modified Note This procedure must be run until no modifications remain (a run makes changes, which impact the next one, and so one. after a few runs, the graph traversal must be complete) """ datasets_modified_count = 0 i = 0 for d in sddatasetdao.get_datasets(): # store dataset current state l__latest = d.latest l__status = d.status # compute new 'status' flag d.status = compute_dataset_status(d) sddatasetdao.update_dataset(d) # compute new 'latest' flag if not d.latest: # we check here the current value for 'latest' flag update_latest_flag( d ) # warning: this method modifies the dataset in memory (and in database too) else: # nothing to do concerning the 'latest' flag as the current dataset is already the latest # (the latest flag can only be switched off (i.e. to False) by *other* datasets versions, not by himself !!!) pass # check if the dataset has changed if l__latest != d.latest or l__status != d.status: datasets_modified_count += 1 # display progress if i % 2 == 0: SDProgressDot.print_char(".") i += 1 print "" sdlog.info("SYDDFLAG-630", "modified datasets: %i" % datasets_modified_count) return datasets_modified_count
def PROC0001(): """Print obsolete versions of datasets. Notes - use shell expansion pattern in the path, as dataset can be split over the two product output1 and output2, and can also be in output !! - basic algo using get_datasets() method - also see PROC0005 """ for d in sddatasetdao.get_datasets(): datasetVersions = sddatasetquery.get_dataset_versions( d, True) # retrieves all the versions of the dataset if not datasetVersions.ismostrecentversionnumber( d.version ): # basic test (for smarter version selection, use PROC0005 which use getoldversionsdatasets()) print d.get_full_local_path('output{,1,2}')
def fix_timestamp(): # HACK 1 # # Once all insertions are done, we update 'dataset.timestamp' column (this # cannot be done in one step, because dataset 'timestamp' attribute doesn't # exist in file's attributes). # # 'timestamp' is mainly (only ?) needed by sddatasetversion.compare() func # # Indeed, this code is a hack that makes the workflow less readable # (i.e. 'search' then 'enqueue' then 'search' again). Maybe try to improve # this in the future. Still, it not as bad as if 'search' triggers 'search' # recursively, because in our case, when the second search starts, the # first search is completed (AFAIR sdsearch is protected not to permit # recursion anyway). # But if needed, there is a way to trigger search recursively: use # sdquicksearch (also in this case, sdsearch can still be used for the top # level search (so resulting with a mix of sdsearch and sdquicksearch)). # datasets_without_timestamp = sddatasetdao.get_datasets( timestamp=None) # retrieve datasets with timestamp not set # HACK 2 recent_datasets_without_timestamp = keep_recent_datasets( datasets_without_timestamp) if len(recent_datasets_without_timestamp) > 0: sdlog.info( "SDENQUEU-004", "Retrieving timestamp for %i dataset(s)." % len(recent_datasets_without_timestamp)) for dataset_without_timestamp in recent_datasets_without_timestamp: try: sdtimestamp.fill_missing_dataset_timestamp( dataset_without_timestamp) except SDException, e: if e.code in ['SDTIMEST-011', 'SDTIMEST-008', 'SDTIMEST-800']: sdlog.info( "SDENQUEU-909", "Timestamp not set for '%s' dataset (%s)" % (dataset_without_timestamp.dataset_functional_id, str(e))) else: # fatal error come here raise
def set_timestamp_when_empty__BATCH_MODE_2(project='CMIP5'): """ Retrieve datasets from local database, then retrieve datasets from ESGF, then update local timestamp. """ datasets_without_timestamp=sddatasetdao.get_datasets(project=project,timestamp=None) # retrieve datasets with timestamp not set sdlog.info("SDREBUIL-004","Updating %i dataset(s) timestamp."%len(datasets_without_timestamp)) for dataset_without_timestamp in datasets_without_timestamp: try: sdtimestamp.fill_missing_dataset_timestamp(dataset_without_timestamp) except SDException, e: if e.code in ['SDTIMEST-011','SDTIMEST-008','SDTIMEST-800']: sdlog.info("SDREBUIL-694","Timestamp not set for dataset (reason=%s,dataset=%s)"%(e.code,dataset_without_timestamp.dataset_functional_id)) else: # fatal error come here raise
def set_model_when_empty():
    """Fix B0025 bug.

    Extracts the model name from the dataset name and stores it back in the
    dataset, raising SDException on a dataset name that doesn't match the
    expected path layout.
    """
    datasets = sddatasetdao.get_datasets()
    for d in datasets:

        # sample => MOHC/HadGEM2-ES/rcp26/day/atmos/day/r1i1p1/v20110524
        # (model is the second path component)
        m = re.search('^[^/]*/([^/]*)/.*$', d.name)

        if m != None:
            model = m.group(1)

            # NOTE(review): 'd.model(model)' calls 'model' as a method; if
            # 'model' is a plain attribute this should be 'd.model = model' --
            # TODO confirm against the dataset class.
            d.model(model)

            sdrebuildquery.update_dataset(d)
        else:
            raise SDException("SDREBUIL-120", "incorrect dataset format (%s)" % d.getName())

        # display progress
        SDProgressDot.print_char(".")
def get_old_versions_datasets():
    """Return old versions datasets list.

    A dataset is considered an old version when its 'latest' flag is off,
    a sibling version with 'latest' set exists, and its version number is
    not higher than the latest one.
    """
    lst=[]
    for d in sddatasetdao.get_datasets():

        # retrieves all the versions of the dataset
        datasetVersions=sddatasetquery.get_dataset_versions(d,True)

        if d.latest==False: # this version is not the latest
            if datasetVersions.exists_version_with_latest_flag_set_to_true(): # latest exists
                if not datasetVersions.is_version_higher_than_latest(d): # version is not higher than latest

                    # assert
                    # should never occurs because of the previous tests
                    if datasetVersions.is_most_recent_version_number(d):
                        raise SDException("SDSTAT-ERR042","fatal error (version=%s,path_without_version=%s)"%(d.version,d.get_name_without_version()))

                    lst.append(d)
    return lst
def update_datasets__status_and_latest(): """ Set status and latest flag for all datasets. Return value Returns how many datasets have been modified Note This procedure must be run until no modifications remain (a run makes changes, which impact the next one, and so one. after a few runs, the graph traversal must be complete) """ datasets_modified_count=0 i=0 for d in sddatasetdao.get_datasets(): # store dataset current state l__latest=d.latest l__status=d.status # compute new 'status' flag d.status=compute_dataset_status(d) sddatasetdao.update_dataset(d) # compute new 'latest' flag if not d.latest: # we check here the current value for 'latest' flag update_latest_flag(d) # warning: this method modifies the dataset in memory (and in database too) else: # nothing to do concerning the 'latest' flag as the current dataset is already the latest # (the latest flag can only be switched off (i.e. to False) by *other* datasets versions, not by himself !!!) pass # check if the dataset has changed if l__latest!=d.latest or l__status!=d.status: datasets_modified_count+=1 # display progress if i%2==0: SDProgressDot.print_char(".") i+=1 print "" sdlog.info("SYDDFLAG-630","modified datasets: %i"%datasets_modified_count) return datasets_modified_count
def add_files(files): for f in files: add_file(File(**f)) # HACK 1 # # Once all insertions are done, we update 'dataset.timestamp' column (this # cannot be done in one step, because dataset 'timestamp' attribute doesn't # exist in file's attributes). # # 'timestamp' is mainly (only ?) needed by sddatasetversion.compare() func # # Indeed, this code is a hack that makes the workflow less readable # (i.e. 'search' then 'enqueue' then 'search' again). Maybe try to improve # this in the future. Still, it not as bad as if 'search' triggers 'search' # recursively, because in our case, when the second search starts, the # first search is completed (AFAIR sdsearch is protected not to permit # recursion anyway). # But if needed, there is a way to trigger search recursively: use # sdquicksearch (also in this case, sdsearch can still be used for the top # level search (so resulting with a mix of sdsearch and sdquicksearch)). # datasets_without_timestamp=sddatasetdao.get_datasets(timestamp=None) # retrieve datasets with timestamp not set # HACK 2 recent_datasets_without_timestamp=keep_recent_datasets(datasets_without_timestamp) if len(recent_datasets_without_timestamp)>0: sdlog.info("SDENQUEU-004","Retrieving timestamp for %i dataset(s)."%len(recent_datasets_without_timestamp)) for dataset_without_timestamp in recent_datasets_without_timestamp: try: sdtimestamp.fill_missing_dataset_timestamp(dataset_without_timestamp) except SDException, e: if e.code in ['SDTIMEST-011','SDTIMEST-008','SDTIMEST-800']: sdlog.info("SDENQUEU-909","Timestamp not set for dataset (reason=%s,dataset=%s)"%(e.code,dataset_without_timestamp.dataset_functional_id)) else: # fatal error come here raise
def update_incomplete_datasets_status():
    """Set status flag for datasets with incomplete status.

    When removing error and waiting transfers (e.g. with 'synda reset' func),
    the dataset status become incorrect (i.e. it remains on 'empty' or
    'in-progress', while all transfers are now 'done'). This func fix this
    problem.

    Notes
        - This func doesn't handle the 'latest' flag.
        - This func is quite the same as 'update_complete_datasets_status'
          func, but is faster as it doesn't processes complete dataset (which
          are the largest part of all the datasets).

    TODO
        Also handle the 'latest' flag in this func
    """
    sdlog.info("SYDDFLAG-182","Update incomplete datasets status")

    incomplete_datasets=sddatasetdao.get_datasets(status=sdconst.DATASET_STATUS_EMPTY)+sddatasetdao.get_datasets(status=sdconst.DATASET_STATUS_IN_PROGRESS)
    update_datasets_status_HELPER(incomplete_datasets)
def set_timestamp_when_empty__BATCH_MODE_2(project='CMIP5'): """ Retrieve datasets from local database, then retrieve datasets from ESGF, then update local timestamp. """ datasets_without_timestamp = sddatasetdao.get_datasets( project=project, timestamp=None) # retrieve datasets with timestamp not set sdlog.info( "SDREBUIL-004", "Updating %i dataset(s) timestamp." % len(datasets_without_timestamp)) for dataset_without_timestamp in datasets_without_timestamp: try: sdtimestamp.fill_missing_dataset_timestamp( dataset_without_timestamp) except SDException, e: if e.code in ['SDTIMEST-011', 'SDTIMEST-008', 'SDTIMEST-800']: sdlog.info( "SDREBUIL-694", "Timestamp not set for dataset (reason=%s,dataset=%s)" % (e.code, dataset_without_timestamp.dataset_functional_id)) else: # fatal error come here raise
def get_old_versions_datasets():
    """Return old versions datasets list.

    A dataset is considered an old version when its 'latest' flag is off,
    a sibling version with 'latest' set exists, and its version number is
    not higher than the latest one.
    """
    lst = []
    for d in sddatasetdao.get_datasets():

        # retrieves all the versions of the dataset
        datasetVersions = sddatasetquery.get_dataset_versions(d, True)

        if d.latest == False:  # this version is not the latest
            if datasetVersions.exists_version_with_latest_flag_set_to_true():  # latest exists
                if not datasetVersions.is_version_higher_than_latest(d):  # version is not higher than latest

                    # assert
                    # should never occurs because of the previous tests
                    if datasetVersions.is_most_recent_version_number(d):
                        raise SDException(
                            "SDSTAT-042",
                            "fatal error (version=%s,path_without_version=%s)" %
                            (d.version, d.get_name_without_version()))

                    lst.append(d)
    return lst