def test_to_repair_a_rule_with_only_1_rse_whose_transfers_failed(self):
    """ JUDGE REPAIRER: Test to repair a rule with only 1 rse whose transfers failed (lock)

    Four files are attached to a new dataset and a DATASET-grouped rule with
    one copy is created on ``self.rse1``. Two transfers are reported as
    successful, two as failed, and the requests of the failed ones are
    cancelled. The rule is expected to be STUCK and to stay STUCK after a
    single repairer pass, with both failed files locked on the same RSE.
    """
    rule_repairer(once=True)  # Clean out the repairer
    scope = InternalScope('mock', **self.vo)
    files = create_files(4, scope, self.rse4_id, bytes=100)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.DATASET, self.jdoe)
    attach_dids(scope, dataset, files, self.jdoe)

    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}],
                       account=self.jdoe,
                       copies=1,
                       rse_expression=self.rse1,
                       grouping='DATASET',
                       weight=None,
                       lifetime=None,
                       locked=False,
                       subscription_id=None)[0]

    def _lock_rse_id(file_):
        # Always look up the lock of the file being acted on; the previous
        # code mixed in files[2]'s name for other files (copy-paste slip).
        return get_replica_locks(scope=file_['scope'], name=file_['name'])[0].rse_id

    for file_ in files[:2]:
        successful_transfer(scope=scope, name=file_['name'], rse_id=_lock_rse_id(file_), nowait=False)
    for file_ in files[2:]:
        failed_transfer(scope=scope, name=file_['name'], rse_id=_lock_rse_id(file_))
    for file_ in files[2:]:
        cancel_request_did(scope=scope, name=file_['name'], dest_rse_id=_lock_rse_id(file_))

    assert rule_id == get_rule(rule_id)['id'].replace('-', '').lower()
    assert RuleState.STUCK == get_rule(rule_id)['state']
    rule_repairer(once=True)
    # Still assert STUCK because of delays:
    assert RuleState.STUCK == get_rule(rule_id)['state']
    assert _lock_rse_id(files[2]) == _lock_rse_id(files[3])
def test_to_repair_a_rule_with_DATASET_grouping_whose_transfer_failed(self):
    """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer (lock)

    Four files are attached to a new dataset and a DATASET-grouped rule with
    one copy is created on ``self.T1``. Two transfers are reported as
    successful and two as failed. The rule is expected to be STUCK, to become
    REPLICATING after one repairer pass, and the locks of files 1, 2 and 3
    are expected to sit on the same RSE afterwards.
    """
    rule_repairer(once=True)  # Clean out the repairer
    scope = InternalScope('mock')
    files = create_files(4, scope, self.rse4_id, bytes=100)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), self.jdoe)
    attach_dids(scope, dataset, files, self.jdoe)

    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}],
                       account=self.jdoe,
                       copies=1,
                       rse_expression=self.T1,
                       grouping='DATASET',
                       weight=None,
                       lifetime=None,
                       locked=False,
                       subscription_id=None,
                       activity='DebugJudge')[0]

    def _lock_rse_id(file_):
        # Always look up the lock of the file being acted on; the previous
        # code used files[2]'s name when handling files[0] and files[1].
        return get_replica_locks(scope=file_['scope'], name=file_['name'])[0].rse_id

    for file_ in files[:2]:
        successful_transfer(scope=scope, name=file_['name'], rse_id=_lock_rse_id(file_), nowait=False)
    for file_ in files[2:]:
        failed_transfer(scope=scope, name=file_['name'], rse_id=_lock_rse_id(file_))

    assert rule_id == get_rule(rule_id)['id'].replace('-', '').lower()
    assert RuleState.STUCK == get_rule(rule_id)['state']
    rule_repairer(once=True)
    assert RuleState.REPLICATING == get_rule(rule_id)['state']
    assert _lock_rse_id(files[2]) == _lock_rse_id(files[3])
    assert _lock_rse_id(files[1]) == _lock_rse_id(files[3])
def test_to_repair_a_rule_with_NONE_grouping_whose_transfer_failed(self):
    """ JUDGE REPAIRER: Test to repair a rule with 1 failed transfer (lock)

    Three files are attached to a new dataset and a rule with grouping
    ``NONE`` and one copy is created on ``self.T1``. The replica of the third
    file is expected to be COPYING with one lock on the RSE chosen for it.
    After two successful and one failed transfer the rule is expected to be
    STUCK; after one repairer pass it is expected to be REPLICATING, and the
    replica on the originally chosen RSE to be UNAVAILABLE with no locks.
    """
    rule_repairer(once=True)  # Clean out the repairer
    scope = InternalScope('mock', **self.vo)
    files = create_files(3, scope, self.rse4_id, bytes=100)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.DATASET, self.jdoe)
    attach_dids(scope, dataset, files, self.jdoe)

    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}],
                       account=self.jdoe,
                       copies=1,
                       rse_expression=self.T1,
                       grouping='NONE',
                       weight=None,
                       lifetime=None,
                       locked=False,
                       subscription_id=None)[0]

    def _lock_rse_id(file_):
        # With grouping='NONE' files need not share an RSE, so each file's
        # own lock must be used; the previous code looked up files[2]'s lock
        # when reporting the transfers of files[0] and files[1].
        return get_replica_locks(scope=file_['scope'], name=file_['name'])[0].rse_id

    # Remember where the soon-to-fail file was first placed; the repairer may
    # move its lock, and the final assertions inspect this original replica.
    failed_rse_id = _lock_rse_id(files[2])
    assert get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['state'] == ReplicaState.COPYING
    assert get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['lock_cnt'] == 1

    for file_ in files[:2]:
        successful_transfer(scope=scope, name=file_['name'], rse_id=_lock_rse_id(file_), nowait=False)
    failed_transfer(scope=scope, name=files[2]['name'], rse_id=_lock_rse_id(files[2]))

    assert rule_id == get_rule(rule_id)['id'].replace('-', '').lower()
    assert RuleState.STUCK == get_rule(rule_id)['state']
    rule_repairer(once=True)
    assert RuleState.REPLICATING == get_rule(rule_id)['state']
    assert get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['state'] == ReplicaState.UNAVAILABLE
    assert get_replica(scope=files[2]['scope'], name=files[2]['name'], rse_id=failed_rse_id)['lock_cnt'] == 0
def test_to_repair_a_rule_with_only_1_rse_whose_transfers_failed(self):
    """ JUDGE REPAIRER: Test to repair a rule with only 1 rse whose transfers failed (lock)

    Four files are attached to a new dataset and a DATASET-grouped rule with
    one copy is created on ``self.rse1``. Two transfers are reported as
    successful, two as failed, and the requests of the failed ones are
    cancelled. The rule is expected to be STUCK and to stay STUCK after a
    single repairer pass, with both failed files locked on the same RSE.
    """
    rule_repairer(once=True)  # Clean out the repairer
    scope = 'mock'
    files = create_files(4, scope, self.rse4, bytes=100)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, files, 'jdoe')

    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}],
                       account='jdoe',
                       copies=1,
                       rse_expression=self.rse1,
                       grouping='DATASET',
                       weight=None,
                       lifetime=None,
                       locked=False,
                       subscription_id=None)[0]

    def _lock_rse_id(file_):
        # Always look up the lock of the file being acted on; the previous
        # code mixed in files[2]'s name for other files (copy-paste slip).
        return get_replica_locks(scope=file_['scope'], name=file_['name'])[0].rse_id

    for file_ in files[:2]:
        successful_transfer(scope=scope, name=file_['name'], rse_id=_lock_rse_id(file_), nowait=False)
    for file_ in files[2:]:
        failed_transfer(scope=scope, name=file_['name'], rse_id=_lock_rse_id(file_))
    for file_ in files[2:]:
        cancel_request_did(scope=scope, name=file_['name'], dest_rse_id=_lock_rse_id(file_))

    assert rule_id == get_rule(rule_id)['id'].replace('-', '').lower()
    assert RuleState.STUCK == get_rule(rule_id)['state']
    rule_repairer(once=True)
    # Still assert STUCK because of delays:
    assert RuleState.STUCK == get_rule(rule_id)['state']
    assert _lock_rse_id(files[2]) == _lock_rse_id(files[3])
def test_to_repair_a_rule_with_only_1_rse_whose_site_is_blocklisted(self):
    """ JUDGE REPAIRER: Test to repair a rule with only 1 rse whose site is blocklisted

    A fresh RSE is added and, for every grouping (NONE, DATASET, ALL) and both
    values of ``ignore_availability``, a one-file dataset and a rule targeting
    that RSE are created.

    * ``ignore_availability=True``: write availability is switched off before
      the rule is added; the rule is expected to be STUCK and to become
      REPLICATING after one repairer pass.
    * ``ignore_availability=False``: the transfer is failed and write
      availability switched off; the rule is expected to stay STUCK across a
      repairer pass and to become REPLICATING only after availability is
      switched back on and the repairer runs again.
    """
    rse = rse_name_generator()
    rse_id = add_rse(rse, **self.vo)
    set_local_account_limit(self.jdoe, rse_id, -1)
    rule_repairer(once=True)  # Clean out the repairer

    cache_arguments = {
        'url': config_get('cache', 'url', False, '127.0.0.1:11211'),
        'distributed_lock': True,
    }
    region = make_region().configure('dogpile.cache.memcached',
                                     expiration_time=900,
                                     arguments=cache_arguments)

    def change_availability(new_value):
        update_rse(rse_id, {'availability_write': new_value})
        # clear cache
        cache_key = sha256(rse.encode()).hexdigest()
        region.delete(cache_key)

    def create_rule(did_scope, did_name, rule_grouping, ignore):
        # Both branches below create the rule with the same arguments.
        created = add_rule(dids=[{'scope': did_scope, 'name': did_name}],
                           account=self.jdoe,
                           copies=1,
                           rse_expression=rse,
                           grouping=rule_grouping,
                           weight=None,
                           lifetime=None,
                           locked=False,
                           subscription_id=None,
                           ignore_availability=ignore,
                           activity='DebugJudge')
        return created[0]

    # Same iteration order as itertools.product(groupings, [True, False]).
    for grouping in ["NONE", "DATASET", "ALL"]:
        for ignore_availability in [True, False]:
            scope = InternalScope('mock', **self.vo)
            files = create_files(1, scope, self.rse4_id, bytes_=100)
            dataset = 'dataset_' + str(uuid())
            add_did(scope, dataset, DIDType.DATASET, self.jdoe)
            attach_dids(scope, dataset, files, self.jdoe)

            if ignore_availability:
                change_availability(False)
                rule_id = create_rule(scope, dataset, grouping, ignore_availability)
                assert RuleState.STUCK == get_rule(rule_id)['state']

                rule_repairer(once=True)
                assert RuleState.REPLICATING == get_rule(rule_id)['state']

                change_availability(True)
                continue

            rule_id = create_rule(scope, dataset, grouping, ignore_availability)
            only_file = files[0]
            lock_rse_id = get_replica_locks(scope=only_file['scope'], name=only_file['name'])[0].rse_id
            failed_transfer(scope=scope, name=only_file['name'], rse_id=lock_rse_id)
            change_availability(False)
            assert RuleState.STUCK == get_rule(rule_id)['state']

            rule_repairer(once=True)
            assert RuleState.STUCK == get_rule(rule_id)['state']

            change_availability(True)
            rule_repairer(once=True)
            assert RuleState.REPLICATING == get_rule(rule_id)['state']
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    """
    Fail a bad request and propagate the failure to messages, replica and lock.

    Nothing happens unless ``new_state`` is ``RequestState.FAILED``. In that
    case the request state is set, a 'transfer-failed' message is queued, the
    request is archived, the destination replica is set to UNAVAILABLE and the
    replica lock is notified of the failed transfer.

    :param req:        Request mapping; the keys read here are 'request_id',
                       'attributes', 'name', 'scope' and 'dest_rse_id'.
    :param dest_rse:   Destination RSE, used in the message, the replica
                       update and the log lines.
    :param new_state:  New request state; only RequestState.FAILED is acted on.
    :param detail:     Failure reason, reported in the message and the log.
    :param session:    Database session, passed through to every core call.
    :raises:           Re-raises whatever the replica or lock update raises,
                       after logging it.
    """
    if new_state != RequestState.FAILED:
        return

    request.set_request_state(req['request_id'], new_state, session=session)

    activity = 'default'
    if req['attributes']:
        if isinstance(req['attributes'], dict):
            req_attributes = json.loads(json.dumps(req['attributes']))
        else:
            req_attributes = json.loads(str(req['attributes']))
        # Attributes without an 'activity' entry fall back to the default
        # instead of raising KeyError.
        activity = req_attributes.get('activity') or 'default'

    add_message('transfer-failed', {'activity': activity,
                                    'request-id': req['request_id'],
                                    'checksum-adler': None,
                                    'checksum-md5': None,
                                    'dst-rse': dest_rse,
                                    'dst-url': None,
                                    'name': req['name'],
                                    'guid': None,
                                    'file-size': None,
                                    'previous-request-id': req['request_id'],
                                    'protocol': None,
                                    'reason': detail,
                                    'transfer-link': None,
                                    'scope': req['scope'],
                                    'src-rse': None,
                                    'src-url': None,
                                    'tool-id': 'rucio-conveyor',
                                    'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                    'transfer-id': None},
                session=session)

    request.archive_request(req['request_id'], session=session)
    logging.error('BAD DID %s:%s REQUEST %s details: %s',
                  req['scope'], req['name'], req['request_id'], detail)

    try:
        replica.update_replicas_states([{'rse': dest_rse,
                                         'scope': req['scope'],
                                         'name': req['name'],
                                         'state': ReplicaState.UNAVAILABLE}],
                                       session=session)
    except Exception:
        logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)",
                         req['scope'], req['name'], dest_rse, traceback.format_exc())
        raise

    # Only the lock update is timed.
    tss = time.time()
    try:
        lock.failed_transfer(req['scope'], req['name'], req['dest_rse_id'], session=session)
    except Exception:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning('Could not update lock for failed transfer %s:%s at %s (%s)',
                        req['scope'], req['name'], dest_rse, traceback.format_exc())
        raise

    record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time() - tss) * 1000)
replica.update_replicas_states([{'rse': rse_update_name, 'scope': response['scope'], 'name': response['name'], 'state': ReplicaState.UNAVAILABLE}], session=session) except: logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (response['scope'], response['name'], rse_update_name, traceback.format_exc())) raise tss = time.time() try: lock.failed_transfer(response['scope'], response['name'], rse_core.get_rse_id(rse=rse_update_name, session=session), session=session) except: logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (response['scope'], response['name'], rse_update_name, traceback.format_exc())) raise record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000) else: logging.warn('REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' % (response['scope'], response['name'], response['request_id'], new_req['request_id'], new_req['retry_count']))