def run(metadata, timestamp_right_boundary=None):
    """
    Returns:
        Number of enqueued items.
    """
    if metadata.count() < 1:
        return 0

    f = metadata.get_one_file()
    selection_filename = sdpostpipelineutils.get_attached_parameter__global([f], 'selection_filename')  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank
    selection_file = sdpostpipelineutils.get_attached_parameter__global([f], 'selection_file')  # note that if no files are found at all for this selection (no matter the status), then 'selection_file' will be blank

    metadata = sdsimplefilter.run(metadata, 'status', sdconst.TRANSFER_STATUS_NEW, 'keep')
    count = metadata.count()  # how many files are to be inserted
    total_size = metadata.size

    if count > 0:
        sdlog.info("SDENQUEU-102", "Add insertion_group_id..")
        insertion_group_id = sdsqlutils.nextval('insertion_group_id', 'history')  # unique identifier shared by all files inserted during this run
        po = sdpipelineprocessing.ProcessingObject(add_insertion_group_id, insertion_group_id)
        metadata = sdpipelineprocessing.run_pipeline(metadata, po)

        if sdconfig.progress:
            sdprogress.ProgressThread.start(sleep=0.1, running_message='', end_message='')  # spinner start

        sdlog.info("SDENQUEU-103", "Insert files and datasets..")
        po = sdpipelineprocessing.ProcessingObject(add_files)
        metadata = sdpipelineprocessing.run_pipeline(metadata, po)

        sdlog.info("SDENQUEU-104", "Fill timestamp..")
        fix_timestamp()

        sddb.conn.commit()  # final commit (all insertions/updates are done in one transaction)

        if sdconfig.progress:
            sdprogress.ProgressThread.stop()  # spinner stop

        histo_crea_date = sdtime.search_api_datetime_format_to_sqlite_datetime_format(timestamp_right_boundary) if timestamp_right_boundary is not None else None

        sdhistory.add_history_line(action=sdconst.ACTION_ADD, selection_file=selection_file, insertion_group_id=insertion_group_id, crea_date=histo_crea_date)

        sdlog.info("SDENQUEU-001", "%i new file(s) added (total size=%i, selection=%s)" % (count, total_size, selection_filename))

    return count
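# add_insertion_group_id() and add_files() are referenced above but defined elsewhere.
# A minimal sketch of add_insertion_group_id() is given below, assuming it behaves like the
# list-based call "files = add_insertion_group_id(files, insertion_group_id)" shown further
# down in this section: it tags every file of the processed chunk with the group identifier
# so that all files inserted during one run can be traced back to the same insertion_group_id.
# This is a hypothetical reconstruction, not the actual implementation.
def add_insertion_group_id(files, insertion_group_id):
    for f in files:
        f['insertion_group_id'] = insertion_group_id
    return files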
def uniq(metadata):
    if metadata.count() < 1:
        return metadata

    # retrieve global flags
    f = metadata.get_one_file()
    keep_replica = sdpostpipelineutils.get_attached_parameter__global([f], 'keep_replica')
    functional_id_keyname = sdpostpipelineutils.get_functional_identifier_name(f)

    if keep_replica == 'true':
        # Keep replicas.
        # In this case, we remove type-A duplicates but keep type-B duplicates (i.e. replicas).
        # Uniq key => id (i.e. including datanode).
        sdlog.info("SSHRINKU-001", "Remove duplicates..")
        metadata = sdrmdup.run(metadata, functional_id_keyname)
    else:
        # Do not keep replicas.
        # In this case, we remove both type-A and type-B duplicates by randomly keeping one candidate.
        # Uniq key => instance_id (i.e. excluding datanode).
        sdlog.info("SSHRINKU-002", "Remove duplicates and replicas..")
        metadata = sdrmduprep.run(metadata, functional_id_keyname)

    return metadata
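# Illustration of the two duplicate kinds mentioned in the comments above, using hypothetical
# sample records and assuming the usual ESGF convention where 'id' is the instance_id suffixed
# with the hosting datanode (the field values below are made up for the example):
file_1 = {'instance_id': 'cmip5.output1.X.Y.tas.v1.nc', 'id': 'cmip5.output1.X.Y.tas.v1.nc|node-a.example.org'}
file_2 = {'instance_id': 'cmip5.output1.X.Y.tas.v1.nc', 'id': 'cmip5.output1.X.Y.tas.v1.nc|node-a.example.org'}  # type-A duplicate of file_1 (same id, same datanode)
file_3 = {'instance_id': 'cmip5.output1.X.Y.tas.v1.nc', 'id': 'cmip5.output1.X.Y.tas.v1.nc|node-b.example.org'}  # type-B duplicate, i.e. a replica (same instance_id, different datanode)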
def run(metadata): """ Set files status to "delete" Returns: Number of deleted items. Note - the func only change the status (i.e. data and metadata will be removed later by the daemon) """ if metadata.count() < 1: return 0 f=metadata.get_one_file() selection_filename=sdpostpipelineutils.get_attached_parameter__global([f],'selection_filename') # note that if no files are found at all for this selection (no matter the status), then the filename will be blank # TODO: merge both to improve perf metadata=sdsimplefilter.run(metadata,'status',sdconst.TRANSFER_STATUS_NEW,'remove') metadata=sdsimplefilter.run(metadata,'status',sdconst.TRANSFER_STATUS_DELETE,'remove') count=metadata.count() if count>0: po=sdpipelineprocessing.ProcessingObject(delete) metadata=sdpipelineprocessing.run_pipeline(metadata,po) sddb.conn.commit() # final commit (we do all update in one transaction). sdhistorydao.add_history_line(sdconst.ACTION_DELETE,selection_filename) sdlog.info("SDDELETE-929","%i files marked for deletion (selection=%s)"%(count,selection_filename)) return count
def transform_url(files):
    url_replace = sdpostpipelineutils.get_attached_parameter__global(files, 'url_replace')

    if url_replace is not None:
        (from_string, to_string) = parse_rule('url_replace', url_replace)

        for f in files:
            f['url'] = f['url'].replace(from_string, to_string)

    return files
def run(metadata): """ Set files status to "delete" Returns: Number of deleted items. Note - the func only change the status (i.e. data and metadata will be removed later by the daemon) """ if metadata.count() < 1: return 0 f = metadata.get_one_file() selection_filename = sdpostpipelineutils.get_attached_parameter__global( [f], 'selection_filename' ) # note that if no files are found at all for this selection (no matter the status), then the filename will be blank # TODO: merge both to improve perf metadata = sdsimplefilter.run(metadata, 'status', sdconst.TRANSFER_STATUS_NEW, 'remove') metadata = sdsimplefilter.run(metadata, 'status', sdconst.TRANSFER_STATUS_DELETE, 'remove') count = metadata.count() if count > 0: po = sdpipelineprocessing.ProcessingObject(delete) metadata = sdpipelineprocessing.run_pipeline(metadata, po) sddb.conn.commit( ) # final commit (we do all update in one transaction). sdhistorydao.add_history_line(sdconst.ACTION_DELETE, selection_filename) sdlog.info( "SDDELETE-929", "%i files marked for deletion (selection=%s)" % (count, selection_filename)) return count
def run(files): """ Set files status to "delete" Note - the func only change the status (i.e. data and metadata will be removed later by the daemon) """ selection_filename=sdpostpipelineutils.get_attached_parameter__global(files,'selection_filename') files=sdsimplefilter.run(files,'status',sdconst.TRANSFER_STATUS_NEW,'remove') files=sdsimplefilter.run(files,'status',sdconst.TRANSFER_STATUS_DELETE,'remove') count=len(files) if count>0: for file in files: sddeletefile.deferred_delete(file['file_functional_id']) sddao.add_history_line(sdconst.ACTION_DELETE,selection_filename) sdlog.info("SDDELETE-929","%i files marked for deletion (selection=%s)"%(count,selection_filename)) return count
def run(files):
    if is_nearestpost_enabled(files):
        # In this case, we remove duplicates by keeping the nearest candidate.
        files = sdnearestpost.run(files)
    else:
        keep_replica = sdpostpipelineutils.get_attached_parameter__global(files, 'keep_replica')

        if keep_replica == 'true':
            # Keep replicas.
            # In this case, we remove type-A duplicates but keep type-B duplicates (i.e. replicas).
            # Uniq key => id (i.e. including datanode).
            files = sduniq.run(files, keep_replica=True)
        else:
            # Do not keep replicas.
            # In this case, we remove type-A and type-B duplicates by randomly keeping one candidate.
            # Uniq key => instance_id (i.e. excluding datanode).
            files = sduniq.run(files)

    return files
def run(files):
    selection_filename = sdpostpipelineutils.get_attached_parameter__global(files, 'selection_filename')  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

    files = sdsimplefilter.run(files, 'status', sdconst.TRANSFER_STATUS_NEW, 'keep')

    count = len(files)  # how many files are to be inserted
    total_size = sum(int(f['size']) for f in files)

    if count > 0:
        insertion_group_id = sdsqlutils.nextval('insertion_group_id', 'history')  # unique identifier shared by all files inserted during this run
        files = add_insertion_group_id(files, insertion_group_id)

        # TODO: maybe add a way to disable the progress spinner (may be useful when using the 'upgrade' action)
        ProgressThread.start(sleep=0.1, running_message='', end_message='')  # spinner start

        add_files(files)

        ProgressThread.stop()  # spinner stop

        sddao.add_history_line(sdconst.ACTION_ADD, selection_filename, insertion_group_id)

        sdlog.info("SDENQUEU-001", "%i new files added (total size=%i, selection=%s)" % (count, total_size, selection_filename))

    return count
def pexec(args):
    import sdsearch, sdpporder, sddb, syndautils, sdconst, sdpostpipelineutils, sdhistorydao, sddeferredbefore, sddomainutils

    if args.order_name == 'cdf':
        selection_filename = None

        # use search-api operator to build datasets list
        stream = syndautils.get_stream(subcommand=args.subcommand, selection_file=args.selection_file, no_default=args.no_default)
        sddeferredbefore.add_forced_parameter(stream, 'type', 'Dataset')

        dataset_found_count = 0
        order_variable_count = 0
        order_dataset_count = 0
        for facets_group in stream:  # we need to process each facets_group one by one because of TAG45345JK3J53K

            metadata = sdsearch.run(stream=[facets_group], post_pipeline_mode='dataset')  # TAGJ43KJ234JK

            dataset_found_count += metadata.count()

            if metadata.count() > 0:

                # WART
                # (gets overwritten at each iteration, but not a big deal as it is always the same value)
                if selection_filename is None:  # this is to keep the first value found (i.e. if the last facets_group is empty but not the previous ones, do not keep the last one, which would be None)
                    dataset = metadata.get_one_file()
                    selection_filename = sdpostpipelineutils.get_attached_parameter__global([dataset], 'selection_filename')  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

                for d in metadata.get_files():  # warning: loads the list in memory
                    if d['status'] == sdconst.DATASET_STATUS_COMPLETE:  # TAG45J4K45JK

                        # first, send cdf variable orders
                        # (note: the total number of variable events is given by: "total += #variable for each ds")
                        for v in d['variable']:
                            if v in facets_group['variable']:  # TAG45345JK3J53K (we check here that the variable has been asked for in the first place)
                                order_variable_count += 1

                                # hack
                                if sddomainutils.is_one_var_per_ds(d['project']):  # maybe move this test to the TAG45J4K45JK line and replace 'EVENT_CDF_VARIABLE_O' by a dataset-level event (note however that the choice between passing 'EVENT_CDF_VARIABLE_O' as a variable or dataset event is arbitrary: both work, but passing it as a variable event is a bit strange as the variable appears in both the dataset_pattern and variable columns)
                                    e_names = [sdconst.EVENT_CDF_INT_VARIABLE_O, sdconst.EVENT_CDF_COR_VARIABLE_O]  # this case is a bit awkward as we have 'variable' in both the dataset_pattern and variable columns..
                                else:
                                    e_names = [sdconst.EVENT_CDF_INT_VARIABLE_N, sdconst.EVENT_CDF_COR_VARIABLE_N]

                                for e_name in e_names:
                                    sdpporder.submit(e_name, d['project'], d['model'], d['local_path'], variable=v, commit=False)

                        # second, send cdf dataset orders
                        if d['project'] in sdconst.PROJECT_WITH_ONE_VARIABLE_PER_DATASET:
                            # we do not trigger 'dataset' level events in this case
                            pass
                        else:
                            order_dataset_count += 1

                            e_names = [sdconst.EVENT_CDF_INT_DATASET, sdconst.EVENT_CDF_COR_DATASET]
                            for e_name in e_names:
                                sdpporder.submit(e_name, d['project'], d['model'], d['local_path'], commit=False)

        sddb.conn.commit()

        if dataset_found_count > 0:
            if order_dataset_count == 0 and order_variable_count == 0:
                print_stderr("Data not ready (data must be already downloaded before performing pexec task): operation cancelled")
            else:
                sdhistorydao.add_history_line(sdconst.ACTION_PEXEC, selection_filename)
                print_stderr("Post-processing task successfully submitted (order_dataset_count=%d, order_variable_count=%d)" % (order_dataset_count, order_variable_count))
        else:
            print_stderr('Data not found')

    elif args.order_name == 'cds':
        selection_filename = None

        # use search-api operator to build datasets list
        stream = syndautils.get_stream(subcommand=args.subcommand, selection_file=args.selection_file, no_default=args.no_default)
        sddeferredbefore.add_forced_parameter(stream, 'type', 'Dataset')

        dataset_found_count = 0
        order_variable_count = 0
        for facets_group in stream:  # we need to process each facets_group one by one because of TAG45345JK3J53K

            metadata = sdsearch.run(stream=[facets_group], post_pipeline_mode='dataset')  # TAGJ43KJ234JK

            dataset_found_count += metadata.count()

            if metadata.count() > 0:

                # WART
                # (gets overwritten at each iteration, but not a big deal as it is always the same value)
                if selection_filename is None:  # this is to keep the first value found (i.e. if the last facets_group is empty but not the previous ones, do not keep the last one, which would be None)
                    dataset = metadata.get_one_file()
                    selection_filename = sdpostpipelineutils.get_attached_parameter__global([dataset], 'selection_filename')  # note that if no files are found at all for this selection (no matter the status), then the filename will be blank

                for d in metadata.get_files():  # warning: loads the list in memory
                    if d['status'] == sdconst.DATASET_STATUS_COMPLETE:  # TAG45J4K45JK

                        # send cds variable orders
                        # (note: the total number of variable events is given by: "total += #variable for each ds")
                        for v in d['variable']:
                            if v in facets_group['variable']:  # TAG45345JK3J53K (we check here that the variable has been asked for in the first place)
                                order_variable_count += 1
                                sdpporder.submit(sdconst.EVENT_CDS_VARIABLE, d['project'], d['model'], d['local_path'], variable=v, commit=False)

        sddb.conn.commit()

        if dataset_found_count > 0:
            if order_variable_count == 0:
                print_stderr("Data not ready (data must be already downloaded before performing pexec task): operation cancelled")
            else:
                sdhistorydao.add_history_line(sdconst.ACTION_PEXEC, selection_filename)
                print_stderr("Post-processing task successfully submitted (order_variable_count=%d)" % (order_variable_count))
        else:
            print_stderr('Data not found')

    else:
        print_stderr("Invalid order name ('%s')" % args.order_name)
        return 1

    return 0
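# A minimal invocation sketch for pexec(). The attribute names match the ones read above
# (order_name, subcommand, selection_file, no_default); the values are illustrative only and
# do not come from the actual CLI definition.
import argparse

args = argparse.Namespace(order_name='cdf',
                          subcommand='pexec',
                          selection_file='my_selection.txt',
                          no_default=True)
exit_status = pexec(args)  # returns 0 on success, 1 on an invalid order name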