def test_add_list_bad_replicas(self):
    """ REPLICA (CORE): Add bad replicas and list them

    Declares every listed replica bad on a deterministic and a
    non-deterministic RSE, verifies list_bad_replicas() reports each of
    them, and checks that unknown PFNs are echoed back per RSE.
    """
    tmp_scope = 'mock'
    nbfiles = 5
    # Adding replicas to deterministic RSE
    # (unused loop variable renamed 'i' -> '_' for consistency with the other tests)
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    rse_info = rsemgr.get_rse_info('MOCK')
    rse_id1 = rse_info['id']
    add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True)
    # Listing replicas on deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses']['MOCK'])
        list_rep.append(replica)
    # An empty dict means every PFN was recognised and declared bad.
    r = declare_bad_file_replicas(replicas, 'This is a good reason', 'root')
    assert_equal(r, {})
    bad_replicas = list_bad_replicas()
    # Count how many of the declared files show up as bad on this RSE.
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse_id1:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert_equal(len(replicas), nbbadrep)
    # Adding replicas to non-deterministic RSE (an explicit PFN is supplied)
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb',
              'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(nbfiles)]
    rse_info = rsemgr.get_rse_info('MOCK2')
    rse_id2 = rse_info['id']
    add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
    # Listing replicas on non-deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses']['MOCK2'])
        list_rep.append(replica)
    r = declare_bad_file_replicas(replicas, 'This is a good reason', 'root')
    assert_equal(r, {})
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse_id2:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert_equal(len(replicas), nbbadrep)
    # Now adding non-existing bad replicas: unknown PFNs are reported back keyed by RSE.
    files = ['srm://mock2.com/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), ]
    r = declare_bad_file_replicas(files, 'This is a good reason', 'root')
    output = ['%s Unknown replica' % rep for rep in files]
    assert_equal(r, {'MOCK2': output})
def test_add_list_bad_replicas(rse_factory, mock_scope, root_account):
    """ REPLICA (CORE): Add bad replicas and list them

    Fixture-based variant: builds its own SRM RSEs via rse_factory,
    declares every listed replica bad, and verifies list_bad_replicas()
    reports each of them; unknown PFNs come back keyed by RSE id.
    """
    nbfiles = 5
    # Adding replicas to deterministic RSE
    _, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True)
    # Listing replicas on deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
        list_rep.append(replica)
    # An empty dict means every PFN was recognised and declared bad.
    r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
    assert r == {}
    bad_replicas = list_bad_replicas()
    # Count how many of the declared files show up as bad on rse1.
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse1_id:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep
    # Adding replicas to non-deterministic RSE (an explicit PFN is supplied)
    _, rse2_id = rse_factory.make_srm_rse(deterministic=False)
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb',
              'pfn': 'srm://%s.cern.ch/srm/managerv2?SFN=/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(nbfiles)]
    add_replicas(rse_id=rse2_id, files=files, account=root_account, ignore_availability=True)
    # Listing replicas on non-deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses'][rse2_id])
        list_rep.append(replica)
    r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
    assert r == {}
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse2_id:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep
    # Now adding non-existing bad replicas: unknown PFNs are reported back keyed by RSE id.
    files = ['srm://%s.cern.ch/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()), ]
    r = declare_bad_file_replicas(files, 'This is a good reason', root_account)
    output = ['%s Unknown replica' % rep for rep in files]
    assert r == {rse2_id: output}
def test_get_bad_replicas_backlog(rse_factory, mock_scope, root_account, file_config_mock):
    """ REPLICA (CORE): Check the behaviour of the necromancer in case of backlog on an RSE

    Creates 100 bad replicas on rse1 and 20 on rse2, then runs the
    necromancer with bulk=20 and checks that rse2's files are all
    processed while 80 of rse1's remain.
    """
    # Run necromancer once to drain any pre-existing bad replicas.
    necromancer_run(threads=1, bulk=10000, once=True)
    nbfiles1 = 100
    nbfiles2 = 20
    # Adding replicas to deterministic RSE
    rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    _, rse2_id = rse_factory.make_srm_rse(deterministic=True)
    # Create bad replicas on rse1
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles1)]
    add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True)
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
        list_rep.append({'scope': replica['scope'], 'name': replica['name'], 'rse': rse1, 'rse_id': rse1_id})
    res = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
    assert res == {}
    # The backlog should now report exactly nbfiles1 bad replicas on rse1.
    result = get_bad_replicas_backlog(force_refresh=True)
    assert rse1_id in result
    assert result[rse1_id] == nbfiles1
    # Create more bad replicas on rse2
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles2)]
    add_replicas(rse_id=rse2_id, files=files, account=root_account, ignore_availability=True)
    repl = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        repl.extend(replica['rses'][rse2_id])
    res = declare_bad_file_replicas(repl, 'This is a good reason', root_account)
    assert res == {}
    # List bad replicas on rse1
    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}])
    assert len(bad_replicas) == nbfiles1
    for rep in bad_replicas:
        assert rep in list_rep
    # Run necromancer once, all the files on RSE2 should be gone, 80 files should stay on RSE1
    get_bad_replicas_backlog(force_refresh=True)
    necromancer_run(threads=1, bulk=20, once=True)
    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}, {'id': rse2_id}])
    assert len(bad_replicas) == 80
    for rep in bad_replicas:
        assert rep['rse_id'] == rse1_id
def necromancer(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    param worker_number: The number of the worker (thread).
    param total_number: The total number of workers (threads).
    chunk_size: The chunk of the size to process.
    once: To run only once

    NOTE(review): legacy Python 2 version — uses 'except X, e' syntax, which is a
    SyntaxError on Python 3. The snippet also appears truncated: the outer 'try'
    below has no matching 'except' in this view.
    """
    sleep_time = 60
    while not graceful_stop.is_set():
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=chunk_size, worker_number=worker_number, total_workers=total_workers)
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info('Thread [%i/%i] : Working on %s:%s on %s' % (worker_number, total_workers, scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                # NOTE(review): on Python 3, dict.keys() never compares equal to a
                # list, so the second clause is only effective under Python 2.
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    # No surviving copies anywhere else: the file is lost.
                    logging.info('Thread [%i/%i] : File %s:%s has no other replicas, it will be marked as lost' % (worker_number, total_workers, scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException, e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
                else:
                    # Other replicas exist: trigger rule re-evaluation to recover.
                    logging.info('Thread [%i/%i] : File %s:%s can be recovered. Available sources : %s' % (worker_number, total_workers, scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException, e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
            logging.info('Thread [%i/%i] : It took %s seconds to process %s replicas' % (worker_number, total_workers, str(time.time() - stime), str(len(replicas))))
            # NOTE(review): snippet truncated here — the outer 'try' is unclosed in this view.
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.

    NOTE(review): legacy Python 2 version — uses 'except X, error' syntax, which is
    a SyntaxError on Python 3. The snippet also appears truncated: the outer 'try'
    below has no matching 'except' in this view.
    """
    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    while not graceful_stop.is_set():
        hb = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=bulk, thread=hb['assign_thread'], total_threads=hb['nr_threads'])
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                # NOTE(review): on Python 3, dict.keys() never compares equal to a
                # list, so the second clause is only effective under Python 2.
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    # No surviving copies anywhere else: the file is lost.
                    logging.info(prepend_str + 'File %s:%s has no other replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException, error:
                        logging.info(prepend_str + '%s' % (str(error)))
                else:
                    # Other replicas exist: trigger rule re-evaluation to recover.
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s' % (scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException, error:
                        logging.info(prepend_str + '%s' % (str(error)))
            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
            # NOTE(review): snippet truncated here — the outer 'try' is unclosed in this view.
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """
    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    # Fix: initialise before the loop so the final log/heartbeat lines below
    # cannot hit an unbound name when graceful_stop is set before the first
    # iteration (previously this raised NameError in that case).
    prepend_str = 'Thread [%i/%i] : ' % (0, 0)

    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        stime = time.time()
        replicas = []
        try:
            replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))
                # Fix: renamed local (was 'list_replicas') to stop shadowing the
                # imported list_replicas() function.
                replica_states = get_replicas_state(scope=scope, name=name)
                if ReplicaState.AVAILABLE not in replica_states and ReplicaState.TEMPORARY_UNAVAILABLE not in replica_states:
                    # No usable copy anywhere else: the file is lost.
                    logging.info(prepend_str + 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
                else:
                    # Other copies exist: trigger rule re-evaluation to recover.
                    rep = replica_states.get(ReplicaState.AVAILABLE, [])
                    unavailable_rep = replica_states.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())
        if once:
            break
        else:
            now = time.time()
            # Periodically move processed bad replicas into the history table.
            if (now - update_history_time) > update_history_threshold:
                logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logging.debug(prepend_str + 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logging.debug(prepend_str + ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()
            tottime = time.time() - stime
            if len(replicas) == bulk:
                # A full bulk suggests a backlog: loop again without sleeping.
                logging.info(prepend_str + 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            elif tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)
                continue
    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def test_add_list_bad_replicas(self):
    """ REPLICA (CLIENT): Add bad replicas

    Declares replicas bad through the client API on a deterministic and a
    non-deterministic RSE, verifies list_bad_replicas() reports them,
    checks that a lost file cannot be attached to a dataset, and that
    unknown PFNs are echoed back per RSE.
    """
    tmp_scope = 'mock'
    nbfiles = 5
    # Adding replicas to deterministic RSE
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1,
              'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    rse_info = rsemgr.get_rse_info('MOCK')
    rse_id1 = rse_info['id']
    self.replica_client.add_replicas(rse='MOCK', files=files)
    # Listing replicas on deterministic RSE
    replicas, list_rep = [], []
    for replica in self.replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files],
                                                     schemes=['srm'], unavailable=True):
        replicas.extend(replica['rses']['MOCK'])
        list_rep.append(replica)
    # An empty dict means every PFN was recognised and declared bad.
    r = self.replica_client.declare_bad_file_replicas(replicas, 'This is a good reason')
    assert_equal(r, {})
    bad_replicas = list_bad_replicas()
    # Count how many of the declared files show up as bad on this RSE.
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse_id1:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert_equal(len(replicas), nbbadrep)
    # Run necromancer once
    necromancer_run(threads=1, bulk=10000, once=True)
    # Try to attach a lost file
    tmp_dsn = 'dataset_%s' % generate_uuid()
    self.did_client.add_dataset(scope=tmp_scope, name=tmp_dsn)
    with assert_raises(UnsupportedOperation):
        self.did_client.add_files_to_dataset(tmp_scope, name=tmp_dsn, files=files, rse='MOCK')
    # Adding replicas to non-deterministic RSE (an explicit PFN is supplied)
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb',
              'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(nbfiles)]
    rse_info = rsemgr.get_rse_info('MOCK2')
    rse_id2 = rse_info['id']
    self.replica_client.add_replicas(rse='MOCK2', files=files)
    # Listing replicas on non-deterministic RSE
    replicas, list_rep = [], []
    for replica in self.replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files],
                                                     schemes=['srm'], unavailable=True):
        replicas.extend(replica['rses']['MOCK2'])
        list_rep.append(replica)
    # Fix: removed leftover debug print() calls that polluted test output.
    r = self.replica_client.declare_bad_file_replicas(replicas, 'This is a good reason')
    assert_equal(r, {})
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse_id2:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert_equal(len(replicas), nbbadrep)
    # Now adding non-existing bad replicas: unknown PFNs are reported back keyed by RSE.
    files = ['srm://mock2.com/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), ]
    r = self.replica_client.declare_bad_file_replicas(files, 'This is a good reason')
    output = ['%s Unknown replica' % rep for rep in files]
    assert_equal(r, {'MOCK2': output})
# Listing replicas on deterministic RSE replicas = [] list_rep = [] for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): replicas.extend(replica['rses']['MOCK']) list_rep.append(replica) r = declare_bad_file_replicas(replicas, 'This is a good reason', 'root') assert_equal(r, {}) bad_replicas = list_bad_replicas() nbbadrep = 0 for rep in list_rep: for badrep in bad_replicas: if badrep['rse_id'] == rse_id1: if badrep['scope'] == rep['scope'] and badrep[ 'name'] == rep['name']: nbbadrep += 1 assert_equal(len(replicas), nbbadrep) # Adding replicas to non-deterministic RSE files = [{ 'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(),
def test_client_add_list_bad_replicas(rse_factory, replica_client, did_client):
    """ REPLICA (CLIENT): Add bad replicas

    Declares replicas bad via PFNs on a deterministic and a
    non-deterministic RSE, then via {scope, name, rse} dicts, and checks
    that mixing the two forms raises InvalidType.
    """
    tmp_scope = 'mock'
    nbfiles = 5
    # Adding replicas to deterministic RSE
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1,
              'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    replica_client.add_replicas(rse=rse1, files=files)
    # Listing replicas on deterministic RSE
    replicas, list_rep = [], []
    for replica in replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files],
                                                schemes=['srm'], all_states=True):
        replicas.extend(replica['rses'][rse1])
        list_rep.append(replica)
    # An empty dict means every PFN was recognised and declared bad.
    r = replica_client.declare_bad_file_replicas(replicas, 'This is a good reason')
    assert r == {}
    bad_replicas = list_bad_replicas()
    # Core scopes are InternalScope objects; compare via .external to the
    # client-side string scope.
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse1_id:
                if badrep['scope'].external == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep
    # Run necromancer once
    necromancer_run(threads=1, bulk=10000, once=True)
    # Try to attach a lost file
    tmp_dsn = 'dataset_%s' % generate_uuid()
    did_client.add_dataset(scope=tmp_scope, name=tmp_dsn)
    with pytest.raises(UnsupportedOperation):
        did_client.add_files_to_dataset(tmp_scope, name=tmp_dsn, files=files, rse=rse1)
    # Adding replicas to non-deterministic RSE (an explicit PFN is supplied)
    rse2, rse2_id = rse_factory.make_srm_rse(deterministic=False)
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb',
              'pfn': 'srm://%s.cern.ch/srm/managerv2?SFN=/test_%s/%s/%s' % (rse2_id, rse2_id, tmp_scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(nbfiles)]
    replica_client.add_replicas(rse=rse2, files=files)
    # Listing replicas on non-deterministic RSE
    replicas, list_rep = [], []
    for replica in replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files],
                                                schemes=['srm'], all_states=True):
        replicas.extend(replica['rses'][rse2])
        list_rep.append(replica)
    r = replica_client.declare_bad_file_replicas(replicas, 'This is a good reason')
    assert r == {}
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse2_id:
                if badrep['scope'].external == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep
    # Now adding non-existing bad replicas: unknown PFNs are reported back keyed by RSE.
    files = ['srm://%s.cern.ch/test_%s/%s/%s' % (rse2_id, rse2_id, tmp_scope, generate_uuid()), ]
    r = replica_client.declare_bad_file_replicas(files, 'This is a good reason')
    output = ['%s Unknown replica' % rep for rep in files]
    assert r == {rse2: output}
    # Now test adding bad_replicas with a list of replicas instead of PFNs
    # Adding replicas to deterministic RSE
    rse3, rse3_id = rse_factory.make_srm_rse(deterministic=True)
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1,
              'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    replica_client.add_replicas(rse=rse3, files=files)
    list_rep = [{'scope': file_['scope'], 'name': file_['name'], 'rse': rse3} for file_ in files]
    # Listing replicas on deterministic RSE
    replicas = []
    for replica in replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files],
                                                schemes=['srm'], all_states=True):
        replicas.extend(replica['rses'][rse3])
    r = replica_client.declare_bad_file_replicas(list_rep, 'This is a good reason')
    assert r == {}
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse3_id:
                if badrep['scope'].external == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep
    # InvalidType is raised if list_rep contains a mixture of replicas and PFNs
    list_rep.extend(['srm://%s.cern.ch/test_%s/%s/%s' % (rse2_id, rse2_id, tmp_scope, generate_uuid()), ])
    with pytest.raises(InvalidType):
        r = replica_client.declare_bad_file_replicas(list_rep, 'This is a good reason')
# NOTE(review): fragment — the enclosing 'def' is not visible in this view.
# Python 2 only: uses the long-integer literal '1L' and 'xrange', both removed
# in Python 3. Also note declare_bad_file_replicas is called here with the
# older 2-argument signature (pfns, rse).
tmp_scope = 'mock'
nbfiles = 5
# Adding replicas to deterministic RSE
files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in xrange(nbfiles)]
rse_info = rsemgr.get_rse_info('MOCK')
rse_id1 = rse_info['id']
add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True)
# Listing replicas on deterministic RSE
replicas = []
list_rep = []
for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
    replicas.extend(replica['rses']['MOCK'])
    list_rep.append(replica)
declare_bad_file_replicas(replicas, 'MOCK')
bad_replicas = list_bad_replicas()
# Count how many of the declared files show up as bad on this RSE.
nbbadrep = 0
for rep in list_rep:
    for badrep in bad_replicas:
        if badrep['rse_id'] == rse_id1:
            if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                nbbadrep += 1
assert_equal(len(replicas), nbbadrep)
# Adding replicas to non-deterministic RSE (an explicit PFN is supplied)
files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb',
          'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()),
          'meta': {'events': 10}} for i in xrange(nbfiles)]
rse_info = rsemgr.get_rse_info('MOCK2')
rse_id2 = rse_info['id']
add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        stime = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        logger(logging.DEBUG, 'Starting new cycle')

        # Check if there is a Judge Evaluator backlog.
        # Both limits are optional config values; missing/invalid -> disabled (None).
        try:
            max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            evaluator_backlog_count, evaluator_backlog_duration = get_evaluation_backlog(expiration_time=sleep_time)
            # Back off (wait 30s, then retry the cycle) when either threshold is hit.
            if max_evaluator_backlog_count and \
                    evaluator_backlog_count and \
                    max_evaluator_backlog_duration and \
                    evaluator_backlog_duration and \
                    evaluator_backlog_count > max_evaluator_backlog_count and \
                    evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and evaluator_backlog_count and evaluator_backlog_count > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and evaluator_backlog_duration and evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        # The per-RSE backlog is cached in REGION; recompute on cache miss.
        bad_replicas_backlog = REGION.get('bad_replicas_backlog')
        if bad_replicas_backlog is NO_VALUE:
            bad_replicas_backlog = get_bad_replicas_backlog()
            REGION.set('bad_replicas_backlog', bad_replicas_backlog)
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])
        list_of_rses = list()
        # If too many replica, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bads replica too big. Apply some sharing between different RSEs')
            rses = list()
            cnt = 0
            # Group the RSEs (largest backlog first) into batches of ~bulk files each.
            for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=True):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            # None means "no RSE filter" for list_bad_replicas below.
            list_of_rses.append(None)
        tot_processed = 0
        if tot_bad_files == 0:
            logger(logging.INFO, 'No bad replicas to process.')
        else:
            ttime = time.time()
            replicas = []
            try:
                for rses in list_of_rses:
                    replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                    for replica in replicas:
                        scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                        logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))
                        # NOTE(review): this local shadows the imported list_replicas()
                        # function for the rest of the loop body.
                        list_replicas = get_replicas_state(scope=scope, name=name)
                        if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                            # No usable copy anywhere else: the file is lost.
                            logger(logging.INFO, 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                            try:
                                update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.lostfile')
                            except (DatabaseException, DatabaseError) as error:
                                # Row-lock/deadlock errors (Oracle ORA-00054/ORA-00060,
                                # MySQL 1205) are transient: warn and move on.
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))
                        else:
                            # Other copies exist: trigger rule re-evaluation to recover.
                            rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                            unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                            logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                            try:
                                update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.recovering')
                            except (DatabaseException, DatabaseError) as error:
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))
                    tot_processed += len(replicas)
                    logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - ttime), str(len(replicas))))
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())
        if once:
            break
        elif tot_processed == 0 or tot_bad_files == 0:
            # Nothing done this cycle: sleep the remaining time.
            daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger)

    # NOTE(review): 'logger' is assigned inside the loop — if GRACEFUL_STOP is
    # already set before the first iteration these lines raise NameError.
    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
def necromancer(thread: int = 0, bulk: int = 5, once: bool = False, sleep_time: int = 60) -> None:
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given
    hash, identify lost DIDs and for non-lost ones, set the locks and rules for
    reevaluation.

    :param thread: Thread number at startup.
                   NOTE(review): not referenced in the body; work partitioning
                   comes from the heartbeat's assign_thread/nr_threads instead.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    # Refresh the bad-replica history table at most once per hour (seconds).
    update_history_threshold = 3600
    update_history_time = time.time()

    # Identity used to register this worker in the heartbeat table.
    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        # Renew the heartbeat each cycle; assign_thread/nr_threads partition
        # the work among all live necromancer threads.
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')

        # Check if there is a Judge Evaluator backlog.
        # Both limits are optional config values; an unset or non-integer value
        # disables the corresponding check.
        try:
            max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            # backlog is indexed as (count, timestamp) below — presumably the
            # number of pending evaluations and the age of the oldest one;
            # TODO confirm against get_evaluation_backlog().
            backlog = get_evaluation_backlog(expiration_time=sleep_time)
            # When both limits are configured, only pause if BOTH are exceeded;
            # the elif branches below handle each limit on its own.
            if max_evaluator_backlog_count and \
                    backlog[0] and \
                    max_evaluator_backlog_duration and \
                    backlog[1] and \
                    backlog[0] > max_evaluator_backlog_count and \
                    backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[0] and backlog[0] > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[1] and backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        # bad_replicas_backlog maps an RSE id to its number of bad replicas
        # (inferred from how keys/values are consumed below — confirm).
        bad_replicas_backlog = get_bad_replicas_backlog()
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])
        list_of_rses = list()
        # If too many replica, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bads replica too big. Apply some sharing between different RSEs')
            rses = list()
            cnt = 0
            # Group RSEs (ascending backlog size) into batches of roughly
            # `bulk` bad replicas each, so one huge RSE cannot starve the rest.
            # NOTE(review): a trailing partial group (cnt < bulk when the loop
            # ends) is never appended to list_of_rses — confirm this is intended.
            for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=False):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            # A single None entry makes list_bad_replicas run without an RSE
            # restriction.
            list_of_rses.append(None)

        stime = time.time()
        replicas = []
        try:
            for rses in list_of_rses:
                replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                for replica in replicas:
                    scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                    logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))
                    # States of all replicas of this DID, keyed by ReplicaState
                    # (inferred from the membership/.get() usage below).
                    # NOTE(review): this local shadows any module-level
                    # `list_replicas` import for the rest of this scope.
                    list_replicas = get_replicas_state(scope=scope, name=name)
                    if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                        # No usable copy left anywhere: declare the DID lost.
                        logger(logging.INFO, 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                        try:
                            update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                            monitor.record_counter(name='necromancer.badfiles.lostfile')
                        except DatabaseException as error:
                            # Best effort: log and continue with the next replica.
                            logger(logging.WARNING, str(error))
                    else:
                        # At least one other copy exists: trigger recovery via
                        # rule re-evaluation.
                        rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                        unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                        logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                        try:
                            update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                            monitor.record_counter(name='necromancer.badfiles.recovering')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))
            # NOTE(review): len(replicas) reflects only the LAST batch of the
            # loop above, not the total processed this cycle — confirm intended.
            logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            # The daemon must survive anything: log the full traceback and
            # carry on with the next cycle.
            exc_type, exc_value, exc_traceback = exc_info()
            logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            # Periodically archive processed bad replicas into the history
            # table, in chunks to bound transaction size.
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logger(logging.INFO, 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logger(logging.DEBUG, 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logger(logging.DEBUG, ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logger(logging.INFO, 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()
            if len(replicas) == bulk:
                # A full batch suggests more work is pending: skip the sleep
                # and start the next cycle immediately.
                logger(logging.INFO, 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            else:
                daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

    # NOTE(review): `logger` is first bound inside the while loop; if
    # GRACEFUL_STOP is already set before the first iteration this raises
    # NameError — confirm whether that startup race matters.
    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')