def variable_complete_event(project,model,dataset,variable):
    sdlog.info("SYDEVENT-002","'variable_complete_event' triggered (%s,%s)"%(dataset.dataset_functional_id,variable))

    # cascade 1
    if dataset.status==sdconst.DATASET_STATUS_COMPLETE:
        dataset_complete_event(project,model,dataset) # trigger 'dataset complete' event

    # cascade 2
    if project=='CMIP5':
        assert '/output/' not in dataset.path

        (ds_path_output1,ds_path_output2)=sdproduct.get_output12_dataset_paths(dataset.path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1=sddatasetdao.get_dataset(path=ds_path_output1)
            d2=sddatasetdao.get_dataset(path=ds_path_output2)

            if sdvariable.is_variable_complete(d1.dataset_id,variable) and sdvariable.is_variable_complete(d2.dataset_id,variable):
                dataset_pattern=sdproduct.build_output12_dataset_pattern(dataset.local_path)
                variable_complete_output12_event(project,model,dataset_pattern,variable) # trigger event (cross dataset event)
        else:
            # we also trigger the 'variable_complete_output12_event' event when the
            # variable exists in only one product (if only one product is present,
            # the output12 condition is trivially satisfied)
            dataset_pattern=sdproduct.build_output12_dataset_pattern(dataset.local_path)
            variable_complete_output12_event(project,model,dataset_pattern,variable) # trigger event (cross dataset event)
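# The CMIP5 cascade above leans on a few sdproduct helpers. Below is a minimal
# sketch (an assumption for illustration, not the real sdproduct code) of the
# path manipulation they presumably perform: deriving the output1/output2
# sibling paths of a dataset, and building a pattern that matches both.
def _get_output12_dataset_paths_sketch(path):
    # from either product path, derive both product sibling paths
    p1=path.replace('/output2/','/output1/')
    p2=path.replace('/output1/','/output2/')
    return (p1,p2)

def _build_output12_dataset_pattern_sketch(path):
    # replace the product segment with a wildcard so the pattern matches
    # both products (hypothetical wildcard syntax)
    return path.replace('/output1/','/output*/').replace('/output2/','/output*/')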
def dataset_latest_event(project,model,dataset_path,commit=True):
    # this event means one dataset has been granted latest (i.e. was not latest before and now is)

    sdlog.log("SYDEVENT-008","'dataset_latest_event' triggered (%s)"%dataset_path,event_triggered_log_level)

    # not used for now
    """
    event=Event(name=sdconst.EVENT_DATASET_LATEST)
    event.project=project
    event.model=model
    event.dataset_pattern=dataset_pattern
    event.variable=''
    event.filename_pattern=''
    event.crea_date=sdtime.now()
    event.priority=sdconst.DEFAULT_PRIORITY
    sdeventdao.add_event(event,commit=commit)
    """

    # cascade
    if project=='CMIP5':
        assert '/output/' not in dataset_path

        (ds_path_output1,ds_path_output2)=sdproduct.get_output12_dataset_paths(dataset_path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1=sddatasetdao.get_dataset(path=ds_path_output1)
            d2=sddatasetdao.get_dataset(path=ds_path_output2)

            if d1.latest and d2.latest:
                dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset_path)
                output12_dataset_latest_event(project,model,dataset_pattern,commit=commit) # trigger event
        else:
            dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset_path)
            output12_dataset_latest_event(project,model,dataset_pattern,commit=commit) # trigger event
def dataset_complete_event(project,model,dataset,commit=True):
    sdlog.log("SYDEVENT-004","'dataset_complete_event' triggered (%s)"%dataset.dataset_functional_id,event_triggered_log_level)

    if project=='CMIP5':
        (ds_path_output1,ds_path_output2)=sdproduct.get_output12_dataset_paths(dataset.path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1=sddatasetdao.get_dataset(path=ds_path_output1)
            d2=sddatasetdao.get_dataset(path=ds_path_output2)

            if d1.status==sdconst.DATASET_STATUS_COMPLETE and d2.status==sdconst.DATASET_STATUS_COMPLETE:
                dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
                dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)

                if d1.latest and d2.latest:
                    latest_dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)
                elif not d1.latest and not d2.latest:
                    non_latest_dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)
                else:
                    sdlog.warning("SYDEVENT-032","Event not triggered as one product is latest while the other product is not") # TODO: is this the right way to handle this case ?
        else:
            dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
            dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)

            if dataset.latest:
                latest_dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)
            else:
                non_latest_dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)

    # <<<--- 'latest' flag management related code begin

    # store current 'latest' flag state
    old_latest=dataset.latest

    # TODO: check if we switch the latest flag independently for each product (meaning output1 latest can be 1 while output2 latest is 0) # tag4342342

    # compute new 'latest' flag
    if not old_latest:
        # old state is not latest
        sddatasetflag.update_latest_flag(dataset) # warning: this method modifies the dataset in memory (and in database too)
    else:
        # nothing to do concerning the 'latest' flag as the current dataset is already the latest
        # (the latest flag can only be switched off (i.e. to False) by *other* dataset versions, never by the dataset itself)
        pass

    # store new 'latest' flag state
    new_latest=dataset.latest

    # --->>> 'latest' flag management related code end

    # cascade 2
    if (not old_latest) and new_latest:
        dataset_latest_event(project,model,dataset.path,commit=commit) # trigger 'dataset_latest' event
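# The 'latest' flag handling above reduces to a small state transition: only a
# False -> True switch fires the 'dataset_latest' event, and a dataset never
# revokes its own 'latest' flag. A condensed sketch of that rule (illustration
# only, reusing the names from the function above):
def _latest_flag_transition_sketch(dataset):
    old_latest=dataset.latest
    if not old_latest:
        sddatasetflag.update_latest_flag(dataset) # may switch the flag to True
    # returns True when the 'dataset_latest' event must be fired
    return (not old_latest) and dataset.latest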
def variable_complete_event(project, model, dataset, variable, commit=True):
    sdlog.log("SYDEVENT-002",
              "'variable_complete_event' triggered (%s,%s)" % (dataset.dataset_functional_id, variable),
              event_triggered_log_level)

    if sdconfig.is_event_enabled(sdconst.EVENT_VARIABLE_COMPLETE, project):
        event = Event(name=sdconst.EVENT_VARIABLE_COMPLETE)
        event.project = project
        event.model = model
        event.dataset_pattern = dataset.local_path
        event.variable = variable
        event.filename_pattern = ''
        event.crea_date = sdtime.now()
        event.priority = sdconst.DEFAULT_PRIORITY
        sdeventdao.add_event(event, commit=commit)

    # cascade 1 (trigger dataset event)
    if dataset.status == sdconst.DATASET_STATUS_COMPLETE:
        dataset_complete_event(project, model, dataset)  # trigger 'dataset complete' event

    # cascade 2 (trigger variable output12 event)
    if project == 'CMIP5':
        if '/output/' in dataset.path:
            return

        (ds_path_output1, ds_path_output2) = sdproduct.get_output12_dataset_paths(dataset.path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1 = sddatasetdao.get_dataset(path=ds_path_output1)
            d2 = sddatasetdao.get_dataset(path=ds_path_output2)

            if sdvariable.is_variable_complete(d1.dataset_id, variable) and sdvariable.is_variable_complete(d2.dataset_id, variable):
                dataset_pattern = sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
                variable_complete_output12_event(project, model, dataset_pattern, variable)  # trigger event (cross dataset event)
        else:
            # we also trigger the 'variable_complete_output12_event' event when the
            # variable exists in only one product (if only one product is present,
            # the output12 condition is trivially satisfied)
            dataset_pattern = sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
            variable_complete_output12_event(project, model, dataset_pattern, variable)  # trigger event (cross dataset event)
def dataset_latest_event(project, model, dataset_path, commit=True):
    # this event means one dataset has been granted latest (i.e. was not latest before and now is)

    sdlog.log("SYDEVENT-008",
              "'dataset_latest_event' triggered (%s)" % dataset_path,
              event_triggered_log_level)

    # not used for now
    """
    event=Event(name=sdconst.EVENT_DATASET_LATEST)
    event.project=project
    event.model=model
    event.dataset_pattern=dataset_pattern
    event.variable=''
    event.filename_pattern=''
    event.crea_date=sdtime.now()
    event.priority=sdconst.DEFAULT_PRIORITY
    sdeventdao.add_event(event,commit=commit)
    """

    # cascade
    if project == 'CMIP5':
        if '/output/' in dataset_path:
            return

        (ds_path_output1, ds_path_output2) = sdproduct.get_output12_dataset_paths(dataset_path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1 = sddatasetdao.get_dataset(path=ds_path_output1)
            d2 = sddatasetdao.get_dataset(path=ds_path_output2)

            if d1.latest and d2.latest:
                dataset_pattern = sdproduct.replace_output12_product_with_wildcard(dataset_path)
                output12_dataset_latest_event(project, model, dataset_pattern, commit=commit)  # trigger event
        else:
            dataset_pattern = sdproduct.replace_output12_product_with_wildcard(dataset_path)
            output12_dataset_latest_event(project, model, dataset_pattern, commit=commit)  # trigger event
def set_timestamp_when_empty__BATCH_MODE_1():
    """Retrieve *all* datasets from ESGF, then update local timestamp.

    Not used.
    """
    datasets=sddump.dump_ESGF(['type=Dataset','searchapi_host=esgf-data.dkrz.de'],'timestamp')

    sdlog.info("SDREBUIL-008","%i dataset(s) retrieved from ESGF."%len(datasets))
    sdlog.info("SDREBUIL-012","Start updating timestamp in local database.")

    for i,d in enumerate(datasets):
        if 'instance_id' in d: # this is because some datasets have no instance_id in ESGF !
            dataset=sddatasetdao.get_dataset(dataset_functional_id=d['instance_id'])
            if dataset is not None:
                if 'timestamp' in d: # this is because some datasets have no timestamp in ESGF !
                    dataset.timestamp=d['timestamp']
                    sddatasetdao.update_dataset(dataset,commit=False,keys=['timestamp'])

        SDProgressBar.print_progress_bar(len(datasets),i,title="Updating dataset's timestamp.. ")

    SDProgressBar.progress_complete()

    sddb.conn.commit()
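# For reference, the shape of one record consumed by the loop above, as implied
# by the 'instance_id'/'timestamp' key checks (the values below are hypothetical
# placeholders; either key may be absent in ESGF):
example_esgf_dataset_record={
    'instance_id':'cmip5.output1.<institute>.<model>.<...>.v20140318',
    'timestamp':'2014-03-18T12:00:00Z',
}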
def complete(files):
    for f in files:
        # the if/else block below is because this module can be used to process
        # different metadata types (File and Dataset).
        if f["type"]==sdconst.SA_TYPE_FILE:
            transfer=sdfiledao.get_file(f['file_functional_id'])
            if transfer is not None:
                f['status']=transfer.status

                if sdpostpipelineutils.exists_attached_parameter(f,'priority'):
                    # this allows setting the priority using a selection parameter (i.e. the
                    # default priority can be overridden using a selection parameter). It is
                    # useful here, for example, when the user wants to change the priority
                    # (YES, a search-API request is needed in this case!).
                    f['priority']=sdpostpipelineutils.get_attached_parameter(f,'priority')
                else:
                    f['priority']=transfer.priority
            else:
                f['status']=sdconst.TRANSFER_STATUS_NEW

                if sdpostpipelineutils.exists_attached_parameter(f,'priority'):
                    # this allows setting the priority using a selection parameter (i.e. the
                    # default priority can be overridden using a selection parameter). This is
                    # useful here to set a special priority for new files.
                    f['priority']=sdpostpipelineutils.get_attached_parameter(f,'priority')
                else:
                    f['priority']=sdconst.DEFAULT_PRIORITY
        elif f["type"]==sdconst.SA_TYPE_DATASET:
            dataset=sddatasetdao.get_dataset(dataset_functional_id=f['dataset_functional_id'])
            if dataset is not None:
                f['status']=dataset.status
            else:
                f['status']=sdconst.DATASET_STATUS_NEW
        else:
            raise SDException('SDCOMPLE-001','Incorrect type (%s)'%f["type"])

    return files
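# Illustration of what complete() expects as input: lightweight metadata dicts
# carrying at least 'type' plus the matching functional id (the ids below are
# hypothetical placeholders).
def _complete_usage_sketch():
    files=[
        {'type':sdconst.SA_TYPE_FILE,'file_functional_id':'<file functional id>'},
        {'type':sdconst.SA_TYPE_DATASET,'dataset_functional_id':'<dataset functional id>'},
    ]
    return complete(files) # fills in 'status' (and 'priority' for files)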
def set_timestamp_when_empty__BATCH_MODE_1():
    """Retrieve *all* datasets from ESGF, then update local timestamp.

    Not used.
    """
    datasets = sddump.dump_ESGF(parameter=['searchapi_host=esgf-data.dkrz.de'],
                                fields=sdfields.get_timestamp_fields())

    sdlog.info("SDREBUIL-008", "%i dataset(s) retrieved from ESGF." % len(datasets))
    sdlog.info("SDREBUIL-012", "Start updating timestamp in local database.")

    for i, d in enumerate(datasets):
        if 'instance_id' in d:  # this is because some datasets have no instance_id in ESGF !
            dataset = sddatasetdao.get_dataset(dataset_functional_id=d['instance_id'])
            if dataset is not None:
                if 'timestamp' in d:  # this is because some datasets have no timestamp in ESGF !
                    dataset.timestamp = d['timestamp']
                    sddatasetdao.update_dataset(dataset, commit=False, keys=['timestamp'])

        SDProgressBar.print_progress_bar(len(datasets), i, title="Updating dataset's timestamp.. ")

    SDProgressBar.progress_complete()

    sddb.conn.commit()
def populate_selection_transfer_junction():
    """Populate the "selection__transfer" association table.

    WARNING: this method is only CMIP5 DRS compatible.
    """
    sdlargequery.get_files_pagination__reset()

    transfer_without_selection=0
    transfer_without_dataset=0

    i=0
    transfers=sdlargequery.get_files_pagination() # loop over blocks (trick not to load 300000 CTransfer objects in memory..). Block size is given by pagination_block_size
    while len(transfers)>0:
        for t in transfers:
            d=sddatasetdao.get_dataset(dataset_id=t.dataset_id)
            if d is not None:
                t.setDataset(d)
            else:
                insert_transfer_without_dataset(t)
                transfer_without_dataset+=1

                # we can't go on without the dataset (the contains() method needs it)
                continue

            # selection<=>transfer mapping and insertion in assoc table
            orphan=1 # this is to detect orphan transfers (i.e. transfers that don't belong to any selection)
            for us in get_Selections():
                # debug
                #print "%s<=>%s"%(t.getTransferID(),us.getSelectionID())

                if us.contains(t):
                    sddao.insert_selection_transfer_junction(t,us,_conn) # no commit inside
                    orphan=0
            if orphan==1:
                insert_transfer_without_selection(t)
                transfer_without_selection+=1

        _conn.commit() # commit block of insertSelectionTransferJunction

        # display progress
        #if i%100==0:
        SDProgressDot.print_char(".")

        i+=1

        transfers=sdlargequery.get_files_pagination()

    if transfer_without_selection>0:
        sdlog.warning("SDOPERAQ-032","%d transfer(s) not matching any selection found"%transfer_without_selection)
    if transfer_without_dataset>0:
        sdlog.warning("SDOPERAQ-033","%d missing dataset(s) found (file exists but corresponding dataset is missing)"%transfer_without_dataset)
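# The block-wise loop above follows a generic pagination skeleton, shown here in
# isolation (illustration only): fetch a block, process it, commit, fetch the
# next block, and stop on the first empty block. This keeps memory bounded no
# matter how many rows the files table holds.
def _paginate_sketch(fetch_block,handle,commit):
    block=fetch_block()
    while len(block)>0:
        for item in block:
            handle(item)
        commit() # one commit per block
        block=fetch_block()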
def _get_dataset_details(dataset_functional_id):
    """Helper func."""
    d=sddatasetdao.get_dataset(dataset_functional_id=dataset_functional_id)

    d.dataset_versions=sdstatquery.get_dataset_versions(d,True) # retrieves all the versions of the dataset
    d.stats=sdstatquery.get_dataset_stats(d)
    d.variables=sdvariable.get_variables_progress(d)
    d.files=sdfiledao.get_dataset_files(d)

    return d
def _get_dataset_details(dataset_functional_id):
    """Helper func."""
    d=sddatasetdao.get_dataset(dataset_functional_id=dataset_functional_id)

    d.dataset_versions=sddatasetquery.get_dataset_versions(d,True) # retrieves all the versions of the dataset
    d.stats=sddatasetquery.get_dataset_stats(d)
    d.variables=sdvariable.get_variables_progress(d)
    d.files=sdfiledao.get_dataset_files(d)

    return d
def set_latest_flag(path):
    """This method is used to manually set the 'latest' flag."""
    d=sddatasetdao.get_dataset(path=path,raise_exception_if_not_found=False) # retrieve dataset from database
    if d is not None:
        if d.latest==True:
            print "'latest' flag is already set for this dataset"
        else:
            sddatasetflag.update_latest_flag(d,force_latest=True) # warning: this method modifies the dataset in memory (and in database too)
    else:
        print "Dataset not found"
def dataset_latest_event(project,model,dataset_path,commit=True):
    # this event means one dataset has been granted latest (i.e. was not latest before and now is)

    sdlog.info("SYDEVENT-008","'dataset_latest_event' triggered (%s)"%dataset_path)

    # cascade
    if project=='CMIP5':
        assert '/output/' not in dataset_path

        (ds_path_output1,ds_path_output2)=sdproduct.get_output12_dataset_paths(dataset_path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1=sddatasetdao.get_dataset(path=ds_path_output1)
            d2=sddatasetdao.get_dataset(path=ds_path_output2)

            if d1.latest and d2.latest:
                dataset_pattern=sdproduct.build_output12_dataset_pattern(dataset_path)
                dataset_latest_output12_event(project,model,dataset_pattern,commit=commit) # trigger event
        else:
            dataset_pattern=sdproduct.build_output12_dataset_pattern(dataset_path)
            dataset_latest_output12_event(project,model,dataset_pattern,commit=commit) # trigger event
def get_file(file_functional_id=None):
    li = sdfiledao.get_files(file_functional_id=file_functional_id)

    if len(li) == 0:
        raise FileNotFoundException()
    else:
        f = li[0]

        # retrieve the dataset
        d = sddatasetdao.get_dataset(dataset_id=f.dataset_id)
        f.dataset = d

        return f
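# Usage sketch for get_file() (illustration only; the id is a hypothetical
# placeholder): the caller receives the file row with its parent dataset
# already attached.
def _file_summary_sketch(file_functional_id):
    f = get_file(file_functional_id=file_functional_id)
    return (f.status, f.dataset.dataset_functional_id)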
def get_one_waiting_transfer():
    li=get_files(limit=1,status=sdconst.TRANSFER_STATUS_WAITING)

    if len(li)==0:
        raise NoTransferWaitingException()
    else:
        t=li[0]

        # retrieve the dataset
        d=sddatasetdao.get_dataset(dataset_id=t.dataset_id)
        t.dataset=d

        return t
def get_one_waiting_transfer():
    li=sdfiledao.get_files(limit=1,status=sdconst.TRANSFER_STATUS_WAITING)

    if len(li)==0:
        raise NoTransferWaitingException()
    else:
        t=li[0]

        # retrieve the dataset
        d=sddatasetdao.get_dataset(dataset_id=t.dataset_id)
        t.dataset=d

        return t
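# A plausible consumer of get_one_waiting_transfer() (illustration only;
# 'start_transfer' is a hypothetical callback): pop waiting transfers one by
# one until the queue is empty.
def _drain_waiting_queue_sketch(start_transfer):
    while True:
        try:
            t=get_one_waiting_transfer()
        except NoTransferWaitingException:
            return
        start_transfer(t)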
def variable_complete_event(project,model,dataset,variable,commit=True):
    sdlog.log("SYDEVENT-002","'variable_complete_event' triggered (%s,%s)"%(dataset.dataset_functional_id,variable),event_triggered_log_level)

    if sdconfig.is_event_enabled(sdconst.EVENT_VARIABLE_COMPLETE,project):
        event=Event(name=sdconst.EVENT_VARIABLE_COMPLETE)
        event.project=project
        event.model=model
        event.dataset_pattern=dataset.local_path
        event.variable=variable
        event.filename_pattern=''
        event.crea_date=sdtime.now()
        event.priority=sdconst.DEFAULT_PRIORITY
        sdeventdao.add_event(event,commit=commit)

    # cascade 1 (trigger dataset event)
    if dataset.status==sdconst.DATASET_STATUS_COMPLETE:
        dataset_complete_event(project,model,dataset) # trigger 'dataset complete' event

    # cascade 2 (trigger variable output12 event)
    if project=='CMIP5':
        assert '/output/' not in dataset.path

        (ds_path_output1,ds_path_output2)=sdproduct.get_output12_dataset_paths(dataset.path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1=sddatasetdao.get_dataset(path=ds_path_output1)
            d2=sddatasetdao.get_dataset(path=ds_path_output2)

            if sdvariable.is_variable_complete(d1.dataset_id,variable) and sdvariable.is_variable_complete(d2.dataset_id,variable):
                dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
                variable_complete_output12_event(project,model,dataset_pattern,variable) # trigger event (cross dataset event)
        else:
            # we also trigger the 'variable_complete_output12_event' event when the
            # variable exists in only one product (if only one product is present,
            # the output12 condition is trivially satisfied)
            dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
            variable_complete_output12_event(project,model,dataset_pattern,variable) # trigger event (cross dataset event)
def fill_missing_dataset_timestamp(dataset_without_timestamp):
    """This func sets the dataset timestamp.

    Notes
        - This func DOES NOT commit.
        - In ESGF, the timestamp differs from replica to replica, and as there is
          no dataset replica concept in 'sdt', this is really a hack: we set the
          timestamp more or less randomly (i.e. a dataset's timestamp in the Synda
          installation of user A may differ from the same dataset's timestamp in
          the Synda installation of user B, because the timestamp may have been
          retrieved from replica X for user A and from replica Y for user B, and
          the two replicas' timestamps may differ). In the end, we hope that this
          randomness is on a much smaller scale than the version-to-version time
          interval, so that we can still detect which version is the latest!
          And yes: all this mess is because versions exist in different formats
          ('v1', 'v20140318'..).
    """
    # Retrieve timestamps from ESGF
    #
    # Note
    #     We do not filter replicas in the query below in case the master host is not up
    #
    result = sdquicksearch.run(parameter=['limit=1',
                                          'fields=%s' % timestamp_fields,
                                          'type=Dataset',
                                          'instance_id=%s' % dataset_without_timestamp.dataset_functional_id],
                               post_pipeline_mode=None)
    li = result.get_files()

    # check if the dataset has been found in ESGF
    if len(li) > 0:
        d = li[0]
    else:
        raise SDException("SDTIMEST-800",
                          "%s dataset does not exist in ESGF (or the index used does not list it)"
                          % dataset_without_timestamp.dataset_functional_id)

    # use the file's timestamp if the dataset's timestamp is not set in ESGF
    # (this is needed because some datasets in ESGF have NO timestamp...)
    use_file_timestamp_if_dataset_timestamp_is_missing(d)

    # update timestamp in DB
    dataset = sddatasetdao.get_dataset(dataset_functional_id=d['instance_id'])
    dataset.timestamp = d['timestamp']
    sddatasetdao.update_dataset(dataset, commit=False, keys=['timestamp'])
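# Since fill_missing_dataset_timestamp() deliberately does not commit, a caller
# is expected to batch it and commit once at the end (hypothetical driver
# sketch, assuming the module has access to the sddb connection):
def _fill_all_missing_timestamps_sketch(datasets_without_timestamp):
    for dwt in datasets_without_timestamp:
        fill_missing_dataset_timestamp(dwt)
    sddb.conn.commit()  # one commit for the whole batch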
def file_():
    """This func performs a fake 'end of transfer' event."""
    sdlog.info("SDEVENTB-002", "Reset 'end of transfer' events")

    # check that only files with 'done' status exist
    li = sdfilequery.get_download_status()
    if len(li) > 1:
        raise SDException('SDEVENTB-001',
                          "Incorrect files status (status must be 'done' for all files before running this func)")

    # reset files status from done to waiting
    sdmodifyquery.change_status(sdconst.TRANSFER_STATUS_DONE, sdconst.TRANSFER_STATUS_WAITING)

    # reset dataset status to empty, and dataset 'latest' flag to false
    sdmodifyquery.wipeout_datasets_flags(status=sdconst.DATASET_STATUS_EMPTY)

    # mimic end of transfer
    dbpagination = sddbpagination.DBPagination()
    files = dbpagination.get_files()
    while len(files) > 0:
        for f in files:
            sdlog.info("SDEVENTB-003", "trigger eot event on %s" % f.file_functional_id)

            # PAYLOAD

            # set status to done
            f.status = sdconst.TRANSFER_STATUS_DONE
            sdfiledao.update_file(f)

            # retrieve the dataset
            d = sddatasetdao.get_dataset(dataset_id=f.dataset_id)
            f.dataset = d

            # trigger end of transfer file event for all files
            sdevent.file_complete_event(f)

        sddb.conn.commit()  # commit block

        files = dbpagination.get_files()  # next block

        sdprogress.SDProgressDot.print_char(".")
def set_latest_flag(path):
    """This method is used to manually set the 'latest' flag.

    Note
        Not used.
    """
    d = sddatasetdao.get_dataset(path=path, raise_exception_if_not_found=False)  # retrieve dataset from database
    if d is not None:
        if d.latest == True:
            print "'latest' flag is already set for this dataset"
        else:
            sddatasetflag.update_latest_flag(d, force_latest=True)  # warning: this method modifies the dataset in memory (and in database too)
    else:
        sdtools.print_stderr('Dataset not found')
def add_dataset(f):
    """
    Returns:
        dataset_id
    """
    d = sddatasetdao.get_dataset(dataset_functional_id=f.dataset_functional_id)
    if d is not None:
        # check dataset local path format
        #
        # (once a dataset has been created using one local_path format, it
        # cannot be changed anymore without removing the whole dataset /
        # restarting the dataset from scratch).
        #
        if d.local_path != f.dataset_local_path:
            raise SDException("SDENQUEU-008",
                              "Incorrect local path format (existing_format=%s,new_format=%s)"
                              % (d.local_path, f.dataset_local_path))

        # compute new dataset status
        if d.status == sdconst.DATASET_STATUS_IN_PROGRESS:
            d.status = sdconst.DATASET_STATUS_IN_PROGRESS
        elif d.status == sdconst.DATASET_STATUS_EMPTY:
            d.status = sdconst.DATASET_STATUS_EMPTY
        elif d.status == sdconst.DATASET_STATUS_COMPLETE:
            d.status = sdconst.DATASET_STATUS_IN_PROGRESS  # this means that a dataset may be "in-progress" and also "latest"

        # Note related to the "latest" dataset column
        #
        # Adding new files to a dataset may change its status, but doesn't
        # change the dataset's "latest" flag. This is because a dataset can only
        # be downgraded here ("complete" => "in-progress"), or stay the same, and
        # when a dataset is downgraded, the "latest" flag stays as is, whether it
        # is true or false.

        # "last_mod_date" is only modified here (i.e. it is not modified when
        # the dataset's files status changes). In other words, it changes only
        # when adding new files to the dataset using this script.
        #
        d.last_mod_date = sdtime.now()

        sddatasetdao.update_dataset(d, commit=False)

        return d.dataset_id
    else:
        sdlog.info("SDENQUEU-002", "create dataset (dataset_path=%s)" % (f.dataset_path))

        d = Dataset()
        d.local_path = f.dataset_local_path
        d.path = f.dataset_path
        d.path_without_version = f.dataset_path_without_version
        d.dataset_functional_id = f.dataset_functional_id
        d.template = f.dataset_template
        d.version = f.dataset_version
        d.project = f.project
        d.status = sdconst.DATASET_STATUS_EMPTY
        d.latest = False
        d.crea_date = sdtime.now()
        d.last_mod_date = sdtime.now()

        # non-mandatory attributes
        d.timestamp = f.dataset_timestamp if hasattr(f, 'dataset_timestamp') else None
        d.model = f.model if hasattr(f, 'model') else None

        return sddatasetdao.add_dataset(d, commit=False)
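# The status computation in add_dataset() above, rewritten as a lookup table
# for clarity (equivalent illustration): 'empty' and 'in-progress' map to
# themselves, and only 'complete' is downgraded when new files arrive.
NEW_STATUS_ON_ADD_SKETCH = {
    sdconst.DATASET_STATUS_EMPTY:       sdconst.DATASET_STATUS_EMPTY,
    sdconst.DATASET_STATUS_IN_PROGRESS: sdconst.DATASET_STATUS_IN_PROGRESS,
    sdconst.DATASET_STATUS_COMPLETE:    sdconst.DATASET_STATUS_IN_PROGRESS,
}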
def dataset_complete_event(project,model,dataset,commit=True):
    sdlog.log("SYDEVENT-004","'dataset_complete_event' triggered (%s)"%dataset.dataset_functional_id,event_triggered_log_level)

    # not used for now
    """
    event=Event(name=sdconst.EVENT_DATASET_COMPLETE)
    event.project=project
    event.model=model
    event.dataset_pattern=dataset_pattern
    event.variable=''
    event.filename_pattern=''
    event.crea_date=sdtime.now()
    event.priority=sdconst.DEFAULT_PRIORITY
    sdeventdao.add_event(event,commit=commit)
    """

    # <<<--- 'latest' flag management related code begin

    # store current 'latest' flag state
    old_latest=dataset.latest

    # TODO: check if we switch the latest flag independently for each product (meaning output1 latest can be 1 while output2 latest is 0) # tag4342342

    # compute new 'latest' flag
    if not old_latest:
        # old state is not latest
        sddatasetflag.update_latest_flag(dataset) # warning: this method modifies the dataset object in memory (and in database too)
    else:
        # nothing to do concerning the 'latest' flag as the current dataset is already the latest
        # (the latest flag can only be switched off (i.e. to False) by *other* dataset versions, never by the dataset itself)
        pass

    # store new 'latest' flag state
    new_latest=dataset.latest

    # --->>> 'latest' flag management related code end

    # cascade 1 (trigger dataset latest switch event)
    if (not old_latest) and new_latest:
        # latest flag has been switched from false to true
        dataset_latest_event(project,model,dataset.path,commit=commit) # trigger 'dataset_latest' event

    # cascade 2 (trigger latest dataset complete event)
    if dataset.latest:
        latest_dataset_complete_event(project,model,dataset.local_path,commit=commit)
    else:
        non_latest_dataset_complete_event(project,model,dataset.local_path,commit=commit)

    # cascade 3 (trigger output12 dataset complete event)
    if project=='CMIP5':
        (ds_path_output1,ds_path_output2)=sdproduct.get_output12_dataset_paths(dataset.path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1=sddatasetdao.get_dataset(path=ds_path_output1)
            d2=sddatasetdao.get_dataset(path=ds_path_output2)

            if d1.status==sdconst.DATASET_STATUS_COMPLETE and d2.status==sdconst.DATASET_STATUS_COMPLETE:
                dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
                dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)
        else:
            # only one product exists for this dataset
            #
            # not sure if this code is required.
            # basically, it says that if only one product is present (output1 or output2)
            # then 'output12' is considered ready to be triggered
            # (i.e. output12 does not require output1 and output2 to both be present;
            # it only requires that if both are, they must both be complete)
            #
            dataset_pattern=sdproduct.replace_output12_product_with_wildcard(dataset.local_path)
            dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)

    # cascade 4 (trigger latest output12 dataset complete event)
    # note: 'dataset_pattern' below was computed in cascade 3 above
    if project=='CMIP5':
        (ds_path_output1,ds_path_output2)=sdproduct.get_output12_dataset_paths(dataset.path)
        if sddatasetdao.exists_dataset(path=ds_path_output1) and sddatasetdao.exists_dataset(path=ds_path_output2):
            d1=sddatasetdao.get_dataset(path=ds_path_output1)
            d2=sddatasetdao.get_dataset(path=ds_path_output2)

            if d1.status==sdconst.DATASET_STATUS_COMPLETE and d2.status==sdconst.DATASET_STATUS_COMPLETE:
                if d1.latest and d2.latest:
                    latest_output12_dataset_complete_event(project,model,dataset_pattern,commit=commit)
                elif not d1.latest and not d2.latest:
                    non_latest_dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)
                else:
                    sdlog.warning("SYDEVENT-032","Event not triggered as one product is latest while the other product is not") # TODO: is this the right way to handle this case ?
        else:
            # only one product exists for this dataset
            #
            # not sure if this code is required.
            # basically, it says that if only one product is present (output1 or output2)
            # then 'output12' is considered ready to be triggered
            # (i.e. output12 does not require output1 and output2 to both be present;
            # it only requires that if both are, they must both be complete)
            #
            if dataset.latest:
                latest_output12_dataset_complete_event(project,model,dataset_pattern,commit=commit)
            else:
                non_latest_dataset_complete_output12_event(project,model,dataset_pattern,commit=commit)
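# Taken together, the cascades above order the 'dataset complete' fan-out as
# follows (condensed pseudo-flow, illustration only):
#
#   dataset_complete_event
#     -> (latest flag False->True?)         dataset_latest_event
#     -> (latest? / not latest?)            latest/non_latest_dataset_complete_event
#     -> (CMIP5, both products complete?)   dataset_complete_output12_event
#     -> (CMIP5, both products latest?)     latest/non_latest ... output12 events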
def populate_selection_transfer_junction():
    """Populate the "selection__transfer" association table.

    WARNING: this method is only CMIP5 DRS compatible.

    TODO: not tested: check this method before use
    """
    dbpagination = sddbpagination.DBPagination()

    transfer_without_selection = 0
    transfer_without_dataset = 0

    i = 0
    transfers = dbpagination.get_files()  # loop over blocks (trick not to load 300000 CTransfer objects in memory..). Block size is given by pagination_block_size
    while len(transfers) > 0:
        for t in transfers:
            d = sddatasetdao.get_dataset(dataset_id=t.dataset_id)
            if d is not None:
                t.setDataset(d)
            else:
                insert_transfer_without_dataset(t)
                transfer_without_dataset += 1

                # we can't go on without the dataset (the contains() method needs it)
                continue

            # selection<=>transfer mapping and insertion in assoc table
            orphan = 1  # this is to detect orphan transfers (i.e. transfers that don't belong to any selection)
            for us in get_Selections():
                # debug
                #print "%s<=>%s"%(t.get_transfer_id(),us.get_selection_id())

                if us.contains(t):
                    sddao.insert_selection_transfer_junction(t, us, _conn)  # no commit inside
                    orphan = 0
            if orphan == 1:
                insert_transfer_without_selection(t)
                transfer_without_selection += 1

        _conn.commit()  # commit block

        # display progress
        #if i%100==0:
        SDProgressDot.print_char(".")

        i += 1

        transfers = dbpagination.get_files()

    if transfer_without_selection > 0:
        sdlog.warning("SDOPERAQ-032",
                      "%d transfer(s) not matching any selection found" % transfer_without_selection)
    if transfer_without_dataset > 0:
        sdlog.warning("SDOPERAQ-033",
                      "%d missing dataset(s) found (file exists but corresponding dataset is missing)" % transfer_without_dataset)