def compare_data_lists(blocks, datasets, pnn):
    """
    Compare the blocks at a pnn with the rucio datasets at the
    corresponding rse.
    :blocks:   dict of phedex blocks at the pnn.
    :datasets: dict of rucio datasets at the rse.
    :pnn:      phedex node name.

    returns the list of datasets to add, remove and update,
    as in DEFAULT_DATADIFF_DICT.
    """

    ret = copy.deepcopy(DEFAULT_DATADIFF_DICT)

    # set union works on both python 2 and 3 (dict.keys() objects
    # cannot be concatenated with '+' in python 3)
    dataitems = list(set(blocks) | set(datasets))

    for dataset in dataitems:
        if dataset not in datasets:
            ret['missing'].append(dataset)
            ret['summary']['missing'] += 1

        elif dataset not in blocks:
            ret['to_remove'].append(dataset)
            ret['summary']['to_remove'] += 1

        elif blocks[dataset] != datasets[dataset]:
            logging.warning("Dataset %s at pnn %s to update", dataset, pnn)
            ret['to_update'].append(dataset)
            ret['summary']['to_update'] += 1

        ret['summary']['tot'] += 1

    return ret
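# Usage sketch for compare_data_lists (illustration only, not part of the
# original module). The dataset names and fingerprint values below are
# hypothetical; DEFAULT_DATADIFF_DICT is assumed to provide the
# 'missing'/'to_remove'/'to_update' lists and the 'summary' counters
# that the function fills in.
def _example_compare_data_lists():
    """Show how a dataset ends up in each bucket of the diff."""
    blocks = {'/A/B#1': 'adler-x', '/A/B#2': 'adler-y'}    # at the pnn
    datasets = {'/A/B#2': 'adler-z', '/A/B#3': 'adler-w'}  # at the rse
    diff = compare_data_lists(blocks, datasets, 'T2_XX_Example')
    # diff['missing']   == ['/A/B#1']  -> only at the pnn, to be created
    # diff['to_remove'] == ['/A/B#3']  -> only at the rse, to be deleted
    # diff['to_update'] == ['/A/B#2']  -> present in both, values differ
    return diff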
def _launch_workers(pnns, datasets, pool, options, pcli):
    """
    Launch one dataset_replica_update worker per (pnn, dataset) pair
    on the pool and return the list of async results.
    """
    procs = []
    rcli = Client()

    for pnn in pnns:
        account = options.account or SYNC_ACCOUNT_FMT % pnn.lower()
        # try:
        #     rcli = Client(account=account)
        # except CannotAuthenticate:
        #     logging.warning("cannot authenticate with account %s, skipping pnn %s",
        #                     account, pnn)
        #     continue

        rse = list(rcli.list_rses('pnn=%s&cms_type=real' % pnn))

        if not rse:
            logging.warning("cannot find real rse for pnn %s, skipping", pnn)
            continue

        rse = rse[0]['rse']

        for dataset in datasets:
            procs.append(pool.apply_async(
                dataset_replica_update,
                (dataset, pnn, rse, pcli, account, options.dry)
            ))

    return procs
def block_sync(pnn, rds, pcli, rcli):
    """
    Synchronize one rucio dataset at one rse
    :pnn:  phedex node name.
    :rds:  rucio dataset.
    :pcli: phedex client.
    :rcli: rucio client.
    """
    conf = _get_config(pnn)

    if 'block_verbosity' in conf:
        logging.my_lvl(conf['block_verbosity'])

    if not conf['run']:
        return 'aborted'

    if not _ping(rcli):
        logging.warning('Cannot ping rucio, aborting.')
        return 'aborted'

    ret = _replica_update(
        dataset=rds,
        pnn=pnn,
        rse=conf['rse'],
        pcli=pcli,
        rcli=rcli,
        dry=conf['dry']
    )

    return ret
def dataset_replica_update(dataset, pnn, rse, pcli, account, dry):
    """
    Wrap the _replica_update method with per-account authentication
    and exception logging.
    """
    try:
        rcli = Client(account=account)
    except CannotAuthenticate:
        logging.warning("cannot authenticate with account %s, skipping pnn %s",
                        account, pnn)
        return None

    logging.my_fmt(label='update:rse=%s:rds=%s' % (pnn, dataset))
    logging.notice('Starting.')

    try:
        ret = _replica_update(dataset, pnn, rse, pcli, rcli, dry)
    # pylint: disable=broad-except
    except Exception as exc:
        logging.error('Exception %s raised: %s', type(exc).__name__,
                      traceback.format_exc().replace('\n', '~~'))
        return None

    logging.notice('Finished %s.', ret)
    # the error paths above return None explicitly, so return the
    # update result on success for consistency
    return ret
def register_container(self, dry=False):
    """
    Register the container of the dataset
    (only if there is a dataset replica on the pnn)
    :dry: Dry run. Default false.
    """

    try:
        self.rcli.get_did(scope=self.scope, name=self.container)
        return 'exists'
    except DataIdentifierNotFound:
        pass

    if self.is_at_pnn and dry:
        logging.dry('Create container %s in scope %s.',
                    self.container, self.scope)
        return 'created'

    elif self.is_at_pnn:
        logging.verbose('Create container %s in scope %s.',
                        self.container, self.scope)
        try:
            self.rcli.add_container(scope=self.scope, name=self.container,
                                    lifetime=self.lifetime)
        except DataIdentifierAlreadyExists:
            logging.warning('Container was created in the meantime.')
            return 'exists'

        return 'created'

    return 'skipped'
def _pnn_abort(pnn, summary, rcli):
    """
    Check the 'run' flag for the pnn and abort if it is False
    or if rucio cannot be pinged.
    """
    conf = _get_config(pnn)

    if not _ping(rcli):
        logging.warning('Cannot ping rucio. Aborting.')
        conf['run'] = False

    if not conf['run']:
        summary['status'] = 'aborted'
        return True

    return False
def _load_config(conffile, modif=None, starttime=None):
    """
    Load the conf file and dump the merged result to the working copy.
    :conffile: file to be loaded.
    :modif:    dictionary with modifications.

    returns the content dictionary.
    """
    starttime = starttime or datetime.now()

    try:
        conf = _open_yaml(conffile, modif)
    except yaml.parser.ParserError:
        logging.warning('Problem parsing config. Using the last loaded one.')
        conf = _open_yaml(LOADED_CONF)

    default = dict(DEFAULT_PNN_CONF, **conf.pop('default'))
    main = dict(DEFAULT_MAIN_CONF, **conf.pop('main'))

    loaded = dict({'main': main}, **{
        pnn: dict(default, **dict({'rse': pnn}, **sec))
        for pnn, sec in conf.items()
    })

    loaded = {
        name: _run_status(sec, starttime)
        for name, sec in loaded.items()
    }

    logging.my_lvl(loaded['main']['verbosity'])
    logging.debug('Loaded conf %s from %s with modif %s',
                  loaded, conffile, modif)

    with open(LOADED_CONF, 'w') as outfile:
        yaml.dump(loaded, outfile, default_flow_style=False)

    return loaded
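# Configuration sketch for _load_config (illustration only, not part of
# the original module): the yaml file is assumed to carry a 'main'
# section, a 'default' section merged into every pnn section, and one
# section per pnn. The keys shown are examples of the ones read elsewhere
# in this module (run, pool, sleep, dry, verbosity).
_EXAMPLE_CONF_YAML = """
main:
  run: true
  pool: 4
  sleep: 60
  verbosity: notice
default:
  run: true
  dry: false
T2_XX_Example:
  verbosity: debug
"""

def _example_load_config_merge():
    """
    Show the merge order _load_config applies to a pnn section:
    'default', then {'rse': pnn}, then the pnn's own section
    (DEFAULT_PNN_CONF / DEFAULT_MAIN_CONF are omitted here).
    """
    conf = yaml.safe_load(_EXAMPLE_CONF_YAML)
    default = conf.pop('default')
    pnn, sec = 'T2_XX_Example', conf[pnn] if False else conf['T2_XX_Example']
    merged = dict(default, **dict({'rse': pnn}, **sec))
    # merged == {'run': True, 'dry': False,
    #            'rse': 'T2_XX_Example', 'verbosity': 'debug'}
    return merged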
def sync(config, logs):
    """
    Main sync process.
    """
    logging.my_logfile(logs=logs)
    logging.my_fmt(label='main_sync')

    starttime = datetime.now()
    modify = {}
    workers = {}  # dict of the running pnn workers
    pnns = None   # list of the pnns to be launched
    pool = None

    pcli = PhEDEx()

    install_mp_handler()

    conf = _load_config(config, modify, starttime)

    pnns = []

    size = conf['main']['pool']

    logging.summary('Starting')

    while conf['main']['run']:

        if pool is None:
            logging.notice('Started pool of size %d', size)
            pool = multiprocessing.NDPool(size)

        add = [
            pnn for pnn, sec in conf.items()
            if pnn != 'main'
            if sec['run']
            if pnn not in workers
            if pnn not in pnns
        ]

        pnns += add

        random.shuffle(pnns)

        if not _ping():
            logging.warning('Cannot ping rucio, not launching workers')
        else:
            _launch_workers(pool, workers, pnns, pcli)
            pnns = []

        _poll_workers(workers, pnns)

        conf = _load_config(config, modify, starttime)

        if not conf['main']['run'] or conf['main']['pool'] != size:
            # trigger draining of all workers, close the pool and wait
            # for the tasks to be over
            conf = _load_config(config, {'default': {'run': False}}, starttime)
            _drain_up(workers, pnns)
            workers = {}
            pool.close()
            pool = None
            size = conf['main']['pool']
        else:
            time.sleep(conf['main']['sleep'])

    logging.summary('Exiting.')

    return config
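# Sketch of the worker bookkeeping the sync loop relies on (illustration
# only: _poll_workers and _drain_up are not shown in this excerpt and the
# real implementations may differ). 'workers' is assumed to map
# pnn -> multiprocessing AsyncResult; finished pnns go back into 'pnns'
# so they are relaunched on the next pass of the loop.
def _example_poll_workers(workers, pnns):
    """Move finished workers back into the list of pnns to launch."""
    done = [pnn for pnn, proc in workers.items() if proc.ready()]
    for pnn in done:
        workers.pop(pnn)
        pnns.append(pnn)

def _example_drain_up(workers, pnns):
    """Block until every outstanding worker has finished."""
    for proc in workers.values():
        proc.wait()
    del pnns[:]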
def pnn_sync(pnn, pcli):
    """
    Synchronize the rucio datasets at one rse
    :pnn:  phedex node name.
    :pcli: phedex client.
    """
    summary = copy.deepcopy(DEFAULT_PNN_SUMMARY)

    conf = _get_config(pnn)
    summary['conf'] = conf

    if 'verbosity' in conf:
        logging.my_lvl(conf['verbosity'])

    rcli = Client(account=SYNC_ACCOUNT_FMT % pnn.lower())

    if _pnn_abort(pnn, summary, rcli):
        return summary

    diff = get_node_diff(pnn, pcli, rcli, conf)
    summary['timing'].update(diff['timing'])
    diff = diff['return']
    summary['diff'] = diff['summary']

    if (diff['summary']['tot'] == diff['summary']['to_remove']) and \
            not conf['allow_clean']:
        logging.warning('All datasets to be removed. Aborting.')
        summary['status'] = 'aborted'
        return summary

    logging.notice("Got diff=%s, timing=%s", summary['diff'],
                   summary['timing'])

    if _pnn_abort(pnn, summary, rcli):
        return summary

    workers = get_timing(
        _launch_pnn_workers(conf, diff, pnn, pcli, rcli),
        summary['timing']
    )
    summary['workers'] = len(workers)

    logging.notice("Launched %d workers, pool size %d, timing %s",
                   summary['workers'], int(conf['pool']),
                   summary['timing']['_launch_pnn_workers'])

    left = int(conf['chunck']) - summary['workers'] + \
        int(conf['min_deletions'])

    if left > 0:
        workers_st = get_timing(
            _launch_pnn_workers_st(left, diff, pnn, pcli, rcli),
            summary['timing']
        )
        summary['workers_st'] = len(workers_st)
        logging.notice("Launched %d single thread workers, timing %s",
                       summary['workers_st'],
                       summary['timing']['_launch_pnn_workers_st'])
        workers = dict(workers, **workers_st)

    _get_pnn_workers(workers, summary)

    summary['status'] = 'finished'

    return summary
def update_replicas(self, dry=False):
    """
    Add or remove replicas for the dataset at the rse.
    :dry: Dry run. Default false.
    """
    logging.notice('Updating replicas for %s:%s at %s',
                   self.scope, self.dataset, self.rse)

    replicas = self.rcli.list_replicas(
        [{'scope': self.scope, 'name': self.dataset}],
        rse_expression='rse=%s' % self.rse
    )

    rrepl = [repl['name'] for repl in replicas]
    prepl = list(self.replicas)

    missing = list(set(prepl) - set(rrepl))
    to_remove = list(set(rrepl) - set(prepl))

    if missing and dry:
        logging.dry('Adding replicas %s to rse %s.',
                    str(missing), self.rse)

    elif missing:
        logging.verbose('Adding replicas %s to rse %s.',
                        str(missing), self.rse)
        self.rcli.add_replicas(rse=self.rse, files=[{
            'scope': self.scope,
            'name': self.replicas[lfn]['name'],
            'adler32': self.replicas[lfn]['checksum'],
            'bytes': self.replicas[lfn]['size'],
        } for lfn in missing])

        # missing files that are not in the list of dataset files
        # are to be attached.
        lfns = [
            item['name']
            for item in self.rcli.list_files(scope=self.scope,
                                             name=self.dataset)
        ]

        missing_lfns = list(set(missing) - set(lfns))

        if missing_lfns:
            logging.verbose('Attaching lfns %s to dataset %s.',
                            str(missing_lfns), self.dataset)

            try:
                self.rcli.attach_dids(
                    scope=self.scope,
                    name=self.dataset,
                    dids=[{'scope': self.scope, 'name': lfn}
                          for lfn in missing_lfns]
                )
            except FileAlreadyExists:
                logging.warning('Trying to attach already existing files.')

    if to_remove and dry:
        logging.dry('Removing replicas %s from rse %s.',
                    str(to_remove), self.rse)

    elif to_remove:
        logging.verbose('Removing replicas %s from rse %s.',
                        str(to_remove), self.rse)

        for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
            attempt = 0
            while True:
                attempt += 1
                try:
                    self.rcli.delete_replicas(rse=self.rse, files=[{
                        'scope': self.scope,
                        'name': lfn,
                    } for lfn in to_remove_chunk])
                    break
                except DatabaseException:
                    logging.warning('DatabaseException raised, retrying...')
                    if attempt > 3:
                        raise
                    time.sleep(randint(1, 5))

    return {'added': missing, 'removed': to_remove}
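# The delete loop above retries DatabaseException up to three times with a
# short random pause before re-raising. A generic version of that pattern
# (illustration only, not part of the original module) could look like:
def _example_retry(func, retryable=DatabaseException, retries=3):
    """Call func(), retrying on 'retryable' with a short random sleep."""
    attempt = 0
    while True:
        attempt += 1
        try:
            return func()
        except retryable:
            logging.warning('%s raised, retrying...', retryable.__name__)
            if attempt > retries:
                raise
            time.sleep(randint(1, 5))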
def pnn_sync(pnn, pcli):
    """
    Synchronize the rucio datasets at one rse
    :pnn:  phedex node name.
    :pcli: phedex client.
    """
    monitor.record_counter('cms_sync.site_started')
    summary = copy.deepcopy(DEFAULT_PNN_SUMMARY)

    conf = _get_config(pnn)
    summary['conf'] = conf

    if 'verbosity' in conf:
        logging.my_lvl(conf['verbosity'])

    rcli = Client(account=SYNC_ACCOUNT_FMT % pnn.lower())

    if _pnn_abort(pnn, summary, rcli):
        return summary

    # with multi_das_calls the node diff is built one prefix at a time
    if conf['multi_das_calls']:
        prefixes = list(string.ascii_letters + string.digits)
        random.shuffle(prefixes)
    else:
        prefixes = [None]

    for prefix in prefixes:
        diff = get_node_diff(pnn, pcli, rcli, conf, prefix=prefix)
        summary['timing'].update(diff['timing'])
        diff = diff['return']
        summary['diff'] = diff['summary']

        if (diff['summary']['tot'] == diff['summary']['to_remove']) and \
                not conf['allow_clean']:
            logging.warning('All datasets to be removed. Aborting.')
            summary['status'] = 'aborted'
            continue

        logging.notice("Got diff=%s, timing=%s", summary['diff'],
                       summary['timing'])

        if _pnn_abort(pnn, summary, rcli):
            return summary

        workers = get_timing(
            _launch_pnn_workers(conf, diff, pnn, pcli, rcli),
            summary['timing']
        )
        summary['workers'] = len(workers)

        logging.notice("Launched %d workers, pool size %d, timing %s",
                       summary['workers'], int(conf['pool']),
                       summary['timing']['_launch_pnn_workers'])

        left = int(conf['chunck']) - summary['workers'] + \
            int(conf['min_deletions'])

        if left > 0:
            workers_st = get_timing(
                _launch_pnn_workers_st(left, diff, pnn, pcli, rcli),
                summary['timing']
            )
            summary['workers_st'] = len(workers_st)
            logging.notice("Launched %d single thread workers, timing %s",
                           summary['workers_st'],
                           summary['timing']['_launch_pnn_workers_st'])
            workers = dict(workers, **workers_st)

        _get_pnn_workers(workers, summary)

    monitor.record_counter('cms_sync.site_completed')
    summary['status'] = 'finished'

    return summary
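# Sketch of the multi_das_calls prefix sharding used above (illustration
# only, not part of the original module): dataset queries are split by
# leading character so that each DAS call stays small, and the prefixes
# are shuffled to spread the load differently on each run.
def _example_das_prefixes(multi_das_calls):
    """Return the list of dataset-name prefixes to loop over."""
    if multi_das_calls:
        prefixes = list(string.ascii_letters + string.digits)
        random.shuffle(prefixes)
        return prefixes
    # a single pass with no prefix filter
    return [None]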