def _send_trace(self, trace):
    """
    Send a trace, but only if tracing is enabled for this client.

    Nothing happens when ``self.tracing`` is falsy.

    :param trace: the trace to send
    """
    if not self.tracing:
        return
    send_trace(trace, self.client.host, self.client.user_agent)
def download_file_from_archive(self, items, trace_custom_fields=None):
    """
    Download items with a given PFN. This function can only download files, no datasets.

    :param items: List of dictionaries. Each dictionary describing a file to download. Keys:
        did                 - DID string of the archive file (e.g. 'scope:file.name'). Wildcards are not allowed
        archive             - DID string of the archive from which the file should be extracted
        rse                 - Optional: rse name (e.g. 'CERN-PROD_DATADISK'). RSE Expressions are allowed
        base_dir            - Optional: Base directory where the downloaded files will be stored. (Default: '.')
        no_subdir           - Optional: If true, files are written directly into base_dir and existing files are overwritten. (Default: False)
    :param trace_custom_fields: Custom key value pairs to send with the traces

    :returns: the result of self._check_output(items); every processed item carries a clientState
              clientState can be one of the following: ALREADY_DONE, DONE, FILE_NOT_FOUND, FAIL_VALIDATE, FAILED

    :raises InputValidationError: if one of the input items is in the wrong format or the archive has no PFNs
    :raises NoFilesDownloaded: if no files could be downloaded (presumably raised by _check_output)
    :raises NotAllFilesDownloaded: if not all files could be downloaded (presumably raised by _check_output)
    :raises ServiceUnavailable: if running xrdcp failed; SourceNotFound (exit status 54) and
                                RucioException (other non-zero status) raised inside the attempt
                                are wrapped into this exception
    :raises RucioException: if the archive does not resolve to exactly one replica entry or
                            no PFN led to a successful extraction
    """
    logger = self.logger
    trace = copy.deepcopy(self.trace_tpl)
    trace['uuid'] = generate_uuid()
    # the documented custom fields were previously accepted but never applied
    if trace_custom_fields:
        trace.update(trace_custom_fields)
    log_prefix = 'Extracting files: '

    logger.info('Processing %d item(s) for input' % len(items))
    for item in items:
        archive = item.get('archive')
        file_extract = item.get('did')
        rse_name = item.get('rse')
        if not archive or not file_extract:
            raise InputValidationError('File DID and archive DID are mandatory')
        if '*' in archive:
            logger.debug(archive)
            raise InputValidationError('Cannot use PFN download with wildcard in DID')

        file_extract_scope, file_extract_name = self._split_did_str(file_extract)
        archive_scope, archive_name = self._split_did_str(archive)

        # listing all available replicas of given archive file
        rse_expression = 'istape=False' if not rse_name else '(%s)&istape=False' % rse_name
        archive_replicas = self.client.list_replicas([{'scope': archive_scope, 'name': archive_name}],
                                                     schemes=['root'],
                                                     rse_expression=rse_expression,
                                                     unavailable=False,
                                                     client_location=self.client_location)

        # preparing trace
        trace['scope'] = archive_scope
        trace['dataset'] = archive_name
        trace['filename'] = file_extract

        # preparing output directories
        dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'),
                                               os.path.join(archive_scope, archive_name + '.extracted'),
                                               file_extract,
                                               item.get('no_subdir'))
        logger.debug('%sPreparing output destination %s' % (log_prefix, dest_dir_path))

        # validation and customisation of list of replicas
        archive_replicas = list(archive_replicas)
        if len(archive_replicas) != 1:
            raise RucioException('No replicas for DID found or dataset was given.')
        # materialise the keys: a dict view has no pop() on Python 3
        archive_pfns = list(archive_replicas[0]['pfns'].keys())
        if len(archive_pfns) == 0:
            raise InputValidationError('No PFNs for replicas of archive %s' % archive)

        # checking whether file already exists
        success = False
        # NOTE(review): the check joins the full DID string while xrdcp is asked to
        # extract file_extract_name - confirm which file name ends up on disk
        dest_file_path = os.path.join(dest_dir_path, file_extract)
        if os.path.isfile(dest_file_path):
            logger.info('%s%s File exists already locally: %s' % (log_prefix, file_extract_name, dest_dir_path))
            # record the state on the item too, like the DONE branch does
            item['clientState'] = 'ALREADY_DONE'
            trace['clientState'] = 'ALREADY_DONE'
            trace['transferStart'] = time.time()
            trace['transferEnd'] = time.time()
            self._send_trace(trace)
            success = True

        # DOWNLOAD, iteration over different rses until success
        retry_counter = 0
        while not success and len(archive_pfns):
            retry_counter += 1
            pfn = archive_pfns.pop()
            trace['rse'] = archive_replicas[0]['pfns'][pfn]['rse']
            try:
                start_time = time.time()
                cmd = 'xrdcp -vf %s -z %s file://%s' % (pfn, file_extract_name, dest_dir_path)
                logger.debug('%sExecuting: %s' % (log_prefix, cmd))
                status, out, err = execute(cmd)
                end_time = time.time()
                trace['transferStart'] = start_time
                trace['transferEnd'] = end_time
                if status == 54:
                    trace['clientState'] = 'FAILED'
                    raise SourceNotFound(err)
                elif status != 0:
                    trace['clientState'] = 'FAILED'
                    raise RucioException(err)
                else:
                    success = True
                    item['clientState'] = 'DONE'
                    trace['clientState'] = 'DONE'
            except Exception as e:
                # every failure of an attempt (including the two raised above) aborts
                # the whole call as ServiceUnavailable; remaining PFNs are not tried
                trace['clientState'] = 'FAILED'
                raise ServiceUnavailable(e)
            self._send_trace(trace)

        if not success:
            raise RucioException('Failed to download file %s after %d retries' % (file_extract_name, retry_counter))
    return self._check_output(items)
def _download_items_aria2c(self, items, aria_rpc, rpc_auth, trace_custom_fields=None):
    """
    Uses aria2c to download the given items. Aria2c needs to be started
    as RPC background process first and a RPC proxy is needed.
    (This function is meant to be used as class internal only)

    Items are queued in batches of up to 100; after each batch the aria2c
    state is polled and every stopped download is validated (adler32 unless
    'ignore_checksum' is set), renamed from its temporary path to its final
    path and reported with a trace.

    :param items: list of dictionaries containing one dict for each file to download
    :param aria_rpc: RPCProxy to the aria2c process
    :param rpc_auth: the rpc authentication token
    :param trace_custom_fields: Custom key value pairs to send with the traces

    :returns: the input list; every item dictionary got a clientState entry
              (ALREADY_DONE, FILE_NOT_FOUND, DONE, FAIL_VALIDATE or FAILED)
    """
    logger = self.logger
    custom_fields = trace_custom_fields or {}

    gid_to_item = {}  # maps an aria2c download id (gid) to the download item
    pfn_to_rse = {}
    items_to_queue = list(items)

    # items get removed from gid_to_item when they are complete or failed
    while len(gid_to_item) or len(items_to_queue):
        num_queued = 0

        # queue up to 100 files and then check arias status
        while (num_queued < 100) and len(items_to_queue):
            item = items_to_queue.pop()

            file_scope = item['scope']
            file_name = item['name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            trace = {'scope': file_scope,
                     'filename': file_name,
                     'datasetScope': item.get('dataset_scope', ''),
                     'dataset': item.get('dataset_name', ''),
                     'protocol': 'https',
                     'remoteSite': '',
                     'filesize': item.get('bytes', None),
                     'transferStart': time.time(),
                     'transferEnd': time.time()}
            trace.update(self.trace_tpl)
            trace.update(custom_fields)

            # get pfns from all replicas
            pfns = []
            for src in item['sources']:
                pfn = src['pfn']
                # aria2c is handed https URIs instead of davs ones
                if pfn[0:4].lower() == 'davs':
                    pfn = pfn.replace('davs', 'https', 1)
                pfns.append(pfn)
                pfn_to_rse[pfn] = src['rse']

            # does file exist and are sources available?
            if os.path.isfile(item['dest_file_path']):
                logger.info('File exists already locally: %s' % file_did_str)
                item['clientState'] = 'ALREADY_DONE'
                trace['clientState'] = 'ALREADY_DONE'
                self._send_trace(trace)
            elif len(pfns) == 0:
                logger.warning('No available source found for file: %s' % file_did_str)
                item['clientState'] = 'FILE_NOT_FOUND'
                trace['clientState'] = 'FILE_NOT_FOUND'
                self._send_trace(trace)
            else:
                item['trace'] = trace
                options = {'dir': item['dest_dir_path'],
                           'out': os.path.basename(item['temp_file_path'])}
                gid = aria_rpc.aria2.addUri(rpc_auth, pfns, options)
                gid_to_item[gid] = item
                num_queued += 1
                logger.debug('Queued file: %s' % file_did_str)

        # get some statistics
        aria_stat = aria_rpc.aria2.getGlobalStat(rpc_auth)
        num_active = int(aria_stat['numActive'])
        num_waiting = int(aria_stat['numWaiting'])
        num_stopped = int(aria_stat['numStoppedTotal'])

        # save start time if one of the active downloads has started
        active = aria_rpc.aria2.tellActive(rpc_auth, ['gid', 'completedLength'])
        for dlinfo in active:
            gid = dlinfo['gid']
            if int(dlinfo['completedLength']) > 0:
                gid_to_item[gid].setdefault('transferStart', time.time())

        stopped = aria_rpc.aria2.tellStopped(rpc_auth, -1, num_stopped, ['gid', 'status', 'files'])
        for dlinfo in stopped:
            gid = dlinfo['gid']
            item = gid_to_item[gid]

            file_scope = item['scope']
            file_name = item['name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            temp_file_path = item['temp_file_path']
            dest_file_path = item['dest_file_path']

            # ensure we didnt miss the active state (e.g. a very fast download)
            start_time = item.setdefault('transferStart', time.time())
            end_time = item.setdefault('transferEnd', time.time())

            # get used pfn for traces
            trace = item['trace']
            for uri in dlinfo['files'][0]['uris']:
                if uri['status'].lower() == 'used':
                    trace['remoteSite'] = pfn_to_rse.get(uri['uri'], '')

            trace['transferStart'] = start_time
            trace['transferEnd'] = end_time

            # ensure file exists
            status = dlinfo.get('status', '').lower()
            if status == 'complete' and os.path.isfile(temp_file_path):
                # checksum check
                skip_check = item.get('ignore_checksum', False)
                rucio_checksum = 0 if skip_check else item.get('adler32')
                local_checksum = 0 if skip_check else adler32(temp_file_path)
                if rucio_checksum == local_checksum:
                    item['clientState'] = 'DONE'
                    trace['clientState'] = 'DONE'
                    # remove .part ending
                    os.rename(temp_file_path, dest_file_path)

                    # calculate duration
                    duration = round(end_time - start_time, 2)
                    duration = max(duration, 0.01)  # protect against 0 division
                    # 'bytes' may be present but None; do not fail a finished download on it
                    size = item.get('bytes') or 0
                    rate = round((size / duration) * 1e-6, 2)
                    size_str = sizefmt(size, self.is_human_readable)
                    logger.info('File %s successfully downloaded. %s in %s seconds = %s MBps' % (file_did_str,
                                                                                                  size_str,
                                                                                                  duration,
                                                                                                  rate))
                else:
                    os.unlink(temp_file_path)
                    logger.warning('Checksum validation failed for file: %s' % file_did_str)
                    logger.debug('Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum))
                    item['clientState'] = 'FAIL_VALIDATE'
                    trace['clientState'] = 'FAIL_VALIDATE'
            else:
                logger.error('Failed to download file: %s' % file_did_str)
                logger.debug('Aria2c status: %s' % status)
                item['clientState'] = 'FAILED'
                trace['clientState'] = 'DOWNLOAD_ATTEMPT'

            self._send_trace(trace)
            del item['trace']

            aria_rpc.aria2.removeDownloadResult(rpc_auth, gid)
            del gid_to_item[gid]

        if len(stopped) > 0:
            logger.info('Active: %d, Waiting: %d, Stopped: %d' % (num_active, num_waiting, num_stopped))

    return items
def _download_item(self, item, trace, log_prefix=''):
    """
    Downloads the given item and sends traces for success/failure.
    (This function is meant to be used as class internal only)

    Every source PFN of the item is tried in order, with up to two attempts
    per PFN, until one download succeeds and (unless 'ignore_checksum' is
    set) its adler32 - or md5 if no adler32 is known - matches. The file is
    written to item['temp_file_path'] and renamed to item['dest_file_path']
    on success.

    :param item: dictionary that describes the item to download
    :param trace: dictionary representing a pattern of trace that will be send
    :param log_prefix: string that will be put at the beginning of every log message

    :returns: dictionary with all attributes from the input item and a clientState attribute
              (ALREADY_DONE, FILE_NOT_FOUND, FAILED or DONE)
    """
    logger = self.logger

    did_scope = item['scope']
    did_name = item['name']
    did_str = '%s:%s' % (did_scope, did_name)

    logger.info('%sPreparing download of %s' % (log_prefix, did_str))

    trace['scope'] = did_scope
    trace['filename'] = did_name
    trace.setdefault('datasetScope', item.get('dataset_scope', ''))
    trace.setdefault('dataset', item.get('dataset_name', ''))
    trace.setdefault('filesize', item.get('bytes'))

    # if file already exists, set state, send trace, and return
    dest_file_path = item['dest_file_path']
    if os.path.isfile(dest_file_path):
        logger.info('%sFile exists already locally: %s' % (log_prefix, did_str))
        item['clientState'] = 'ALREADY_DONE'

        trace['transferStart'] = time.time()
        trace['transferEnd'] = time.time()
        trace['clientState'] = 'ALREADY_DONE'
        self._send_trace(trace)
        return item

    # check if file has replicas
    sources = item.get('sources')
    if not sources:
        logger.warning('%sNo available source found for file: %s' % (log_prefix, did_str))
        item['clientState'] = 'FILE_NOT_FOUND'

        trace['clientState'] = 'FILE_NOT_FOUND'
        self._send_trace(trace)
        return item

    success = False
    # try different PFNs until one succeeded
    i = 0
    while not success and i < len(sources):
        pfn = sources[i]['pfn']
        rse_name = sources[i]['rse']
        i += 1
        scheme = pfn.split(':')[0]

        try:
            rse = rsemgr.get_rse_info(rse_name)
        except RSENotFound:
            logger.warning('%sCould not get info of RSE %s' % (log_prefix, rse_name))
            continue

        trace['remoteSite'] = rse_name
        trace['clientState'] = 'DOWNLOAD_ATTEMPT'
        trace['protocol'] = scheme

        logger.info('%sTrying to download with %s from %s: %s ' % (log_prefix, scheme, rse_name, did_str))

        try:
            protocol = rsemgr.create_protocol(rse, operation='read', scheme=scheme)
            protocol.connect()
        except Exception as error:
            logger.warning('%sFailed to create protocol for PFN: %s' % (log_prefix, pfn))
            logger.debug('scheme: %s, exception: %s' % (scheme, error))
            continue

        attempt = 0
        retries = 2
        # do some retries with the same PFN if the download fails
        try:
            while not success and attempt < retries:
                attempt += 1
                item['attemptnr'] = attempt

                temp_file_path = item['temp_file_path']
                if os.path.isfile(temp_file_path):
                    logger.debug('%sDeleting existing temporary file: %s' % (log_prefix, temp_file_path))
                    os.unlink(temp_file_path)

                start_time = time.time()

                try:
                    protocol.get(pfn, temp_file_path, transfer_timeout=item.get('transfer_timeout'))
                    success = True
                except Exception as error:
                    logger.debug(error)
                    trace['clientState'] = str(type(error).__name__)

                end_time = time.time()

                if success and not item.get('ignore_checksum', False):
                    # prefer adler32, fall back to md5 when no adler32 is known
                    rucio_checksum = item.get('adler32')
                    local_checksum = None
                    if not rucio_checksum:
                        rucio_checksum = item.get('md5')
                        local_checksum = md5(temp_file_path)
                    else:
                        local_checksum = adler32(temp_file_path)

                    if rucio_checksum != local_checksum:
                        success = False
                        os.unlink(temp_file_path)
                        logger.warning('%sChecksum validation failed for file: %s' % (log_prefix, did_str))
                        logger.debug('Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum))
                        try:
                            self.client.declare_suspicious_file_replicas([pfn], reason='Corrupted')
                        except Exception as error:
                            # best effort: reporting the replica must not abort the download,
                            # but leave a hint in the debug log instead of hiding the failure
                            logger.debug('%sCould not declare replica as suspicious: %s' % (log_prefix, error))
                        trace['clientState'] = 'FAIL_VALIDATE'

                if not success:
                    logger.warning('%sDownload attempt failed. Try %s/%s' % (log_prefix, attempt, retries))
                    self._send_trace(trace)
        finally:
            # release the connection even if an attempt raised unexpectedly
            protocol.close()

    if not success:
        logger.error('%sFailed to download file %s' % (log_prefix, did_str))
        item['clientState'] = 'FAILED'
        return item

    os.rename(temp_file_path, dest_file_path)

    trace['transferStart'] = start_time
    trace['transferEnd'] = end_time
    trace['clientState'] = 'DONE'
    item['clientState'] = 'DONE'
    self._send_trace(trace)

    duration = round(end_time - start_time, 2)
    size = item.get('bytes')
    size_str = sizefmt(size, self.is_human_readable)
    # only report a rate when both size and a non-zero duration are known
    if size and duration:
        rate = round((size / duration) * 1e-6, 2)
        logger.info('%sFile %s successfully downloaded. %s in %s seconds = %s MBps' % (log_prefix, did_str, size_str, duration, rate))
    else:
        logger.info('%sFile %s successfully downloaded in %s seconds' % (log_prefix, did_str, duration))
    return item
def upload(self, items, summary_file_path=None):
    """
    Upload the given files to their RSEs and, unless disabled, register them.

    :param items: List of dictionaries. Each dictionary describing a file to upload. Keys:
        path             - path of the file that will be uploaded
        rse              - rse name (e.g. 'CERN-PROD_DATADISK') where to upload the file
        did_scope        - Optional: custom did scope (Default: user.<account>)
        did_name         - Optional: custom did name (Default: name of the file)
        dataset_scope    - Optional: custom dataset scope
        dataset_name     - Optional: custom dataset name
        force_scheme     - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None)
        pfn              - Optional: use a given PFN (this sets no_register to True)
        no_register      - Optional: if True, the file will not be registered in the rucio catalogue
        lifetime         - Optional: the lifetime of the file after it was uploaded
        transfer_timeout - Optional: time after the upload will be aborted
        guid             - Optional: guid of the file
    :param summary_file_path: Optional: a path where a summary in form of a json file will be stored

    :returns: 0 on success

    :raises InputValidationError: if any input arguments are in a wrong format
    :raises RSEBlacklisted: if a given RSE is not available for writing
    :raises NoFilesUploaded: if no files were successfully uploaded
    :raises NotAllFilesUploaded: if not all files were successfully uploaded
    """
    logger = self.logger

    self.trace['uuid'] = generate_uuid()

    # check given sources, resolve dirs into files, and collect meta infos
    files = self._collect_and_validate_file_info(items)

    # check if RSE of every file is available for writing
    # and cache rse settings
    registered_dataset_dids = set()
    registered_file_dids = set()
    for file in files:
        rse = file['rse']
        if not self.rses.get(rse):
            rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse))
            if rse_settings['availability_write'] != 1:
                raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse)

        dataset_scope = file.get('dataset_scope')
        dataset_name = file.get('dataset_name')
        if dataset_scope and dataset_name:
            dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
            file['dataset_did_str'] = dataset_did_str
            registered_dataset_dids.add(dataset_did_str)

        registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))

    wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
    if len(wrong_dids):
        raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))

    # clear this set again to ensure that we only try to register datasets once
    registered_dataset_dids = set()
    num_succeeded = 0
    # collected across all files; must not be reset per file
    summary = []
    for file in files:
        basename = file['basename']
        logger.info('Preparing upload for file %s' % basename)

        no_register = file.get('no_register')
        pfn = file.get('pfn')
        force_scheme = file.get('force_scheme')
        # resolve the RSE of *this* file before it is written into the trace
        rse = file['rse']

        self.trace['scope'] = file['did_scope']
        self.trace['datasetScope'] = file.get('dataset_scope', '')
        self.trace['dataset'] = file.get('dataset_name', '')
        self.trace['remoteSite'] = rse
        self.trace['filesize'] = file['bytes']

        file_did = {'scope': file['did_scope'], 'name': file['did_name']}
        dataset_did_str = file.get('dataset_did_str')

        if not no_register:
            self._register_file(file, registered_dataset_dids)

        rse_settings = self.rses[rse]
        # if file already exists on RSE we're done
        if rsemgr.exists(rse_settings, file_did):
            logger.info('File already exists on RSE. Skipping upload')
            continue

        protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme)
        # reversed so that pop() below yields the protocols in their original order
        protocols.reverse()
        success = False
        while not success and len(protocols):
            protocol = protocols.pop()
            cur_scheme = protocol['scheme']
            logger.info('Trying upload with %s to %s' % (cur_scheme, rse))
            lfn = {}
            lfn['filename'] = basename
            lfn['scope'] = file['did_scope']
            lfn['name'] = file['did_name']
            lfn['adler32'] = file['adler32']
            lfn['filesize'] = file['bytes']

            self.trace['protocol'] = cur_scheme
            self.trace['transferStart'] = time.time()
            try:
                state = rsemgr.upload(rse_settings=rse_settings,
                                      lfns=lfn,
                                      source_dir=file['dirname'],
                                      force_scheme=cur_scheme,
                                      force_pfn=pfn,
                                      transfer_timeout=file.get('transfer_timeout'))
                success = True
                file['upload_result'] = state
            except (ServiceUnavailable, ResourceTemporaryUnavailable) as error:
                logger.warning('Upload attempt failed')
                logger.debug('Exception: %s' % str(error))

        if success:
            num_succeeded += 1
            self.trace['transferEnd'] = time.time()
            self.trace['clientState'] = 'DONE'
            file['state'] = 'A'
            logger.info('Successfully uploaded file %s' % basename)
            send_trace(self.trace, self.client.host, self.client.user_agent)

            if summary_file_path:
                summary.append(copy.deepcopy(file))

            # add file to dataset if needed
            if dataset_did_str and not no_register:
                try:
                    self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                except Exception as error:
                    logger.warning('Failed to attach file to the dataset')
                    logger.debug(error)
            if not no_register:
                replica_for_api = self._convert_file_for_api(file)
                if not self.client.update_replicas_states(rse, files=[replica_for_api]):
                    logger.warning('Failed to update replica state')
        else:
            logger.error('Failed to upload file %s' % basename)

    if summary_file_path:
        final_summary = {}
        for file in summary:
            file_scope = file['did_scope']
            file_name = file['did_name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            # NOTE(review): 'scope'/'name' are read here while the rest of this method
            # uses 'did_scope'/'did_name' - confirm the collected file info has both
            final_summary[file_did_str] = {'scope': file['scope'],
                                           'name': file['name'],
                                           'bytes': file['bytes'],
                                           'rse': file['rse'],
                                           'pfn': file['upload_result']['pfn'],
                                           'guid': file['meta']['guid'],
                                           'adler32': file['adler32'],
                                           'md5': file['md5']}

        # json.dump writes text, so the file must be opened in text mode
        with open(summary_file_path, 'w') as summary_file:
            json.dump(final_summary, summary_file, sort_keys=True, indent=1)

    if num_succeeded == 0:
        raise NoFilesUploaded()
    elif num_succeeded != len(files):
        raise NotAllFilesUploaded()
    return 0
def _download_item(self, item, trace, log_prefix=''):
    """
    Downloads the given item and sends traces for success/failure.
    (This function is meant to be used as class internal only)

    The RSEs listed in item['rses'] are tried in order; for each readable
    RSE every supported protocol is tried with up to two attempts until one
    download succeeds. A trace is sent after every attempt.

    :param item: dictionary that describes the item to download
    :param trace: dictionary representing a pattern of trace that will be send
    :param log_prefix: string that will be put at the beginning of every log message

    :returns: dictionary with all attributes from the input item and a clientState attribute
              (ALREADY_DONE, FILE_NOT_FOUND, FAILED or DONE)
    """
    logger = self.logger

    did_scope = item['scope']
    did_name = item['name']
    did_str = '%s:%s' % (did_scope, did_name)

    logger.info('%sPreparing download of %s' % (log_prefix, did_str))

    trace['scope'] = did_scope
    trace['filename'] = did_name
    # use the same trace key ('datasetScope') as every other trace in this file
    trace.setdefault('datasetScope', item.get('dataset_scope', ''))
    trace.setdefault('dataset', item.get('dataset_name', ''))
    trace.setdefault('filesize', item.get('bytes'))

    # if file already exists, set state, send trace, and return
    dest_dir_path = item['dest_dir_path']
    dest_file_path = os.path.join(dest_dir_path, did_name)
    if os.path.isfile(dest_file_path):
        logger.info('%sFile exists already locally: %s' % (log_prefix, did_str))
        item['clientState'] = 'ALREADY_DONE'

        trace['transferStart'] = time.time()
        trace['transferEnd'] = time.time()
        trace['clientState'] = 'ALREADY_DONE'
        send_trace(trace, self.client.host, self.user_agent)
        return item

    # check if file has replicas
    rse_names = list(item['rses'].keys())
    if not len(rse_names):
        logger.warning('%sFile %s has no available replicas. Cannot be downloaded' % (log_prefix, did_str))
        item['clientState'] = 'FILE_NOT_FOUND'

        trace['clientState'] = 'FILE_NOT_FOUND'
        send_trace(trace, self.client.host, self.user_agent)
        return item

    # list_replicas order is: best rse at [0]
    # reversed so that pop() below starts with the best RSE
    rse_names.reverse()

    logger.debug('%sPotential sources: %s' % (log_prefix, str(rse_names)))

    success = False
    # retry with different rses if one is not available or fails
    while not success and len(rse_names):
        rse_name = rse_names.pop()
        try:
            rse = rsemgr.get_rse_info(rse_name)
        except RSENotFound:
            logger.warning('%sCould not get info of RSE %s' % (log_prefix, rse_name))
            continue

        if not rse['availability_read']:
            logger.info('%s%s is blacklisted for reading' % (log_prefix, rse_name))
            continue

        force_scheme = item.get('force_scheme')
        try:
            protocols = rsemgr.get_protocols_ordered(rse, operation='read', scheme=force_scheme)
            protocols.reverse()
        except RSEProtocolNotSupported as error:
            logger.info('%sThe protocol specfied (%s) is not supported by %s' % (log_prefix, force_scheme, rse_name))
            logger.debug(error)
            continue

        logger.debug('%sPotential protocol(s) read: %s' % (log_prefix, protocols))

        trace['remoteSite'] = rse_name
        trace['clientState'] = 'DOWNLOAD_ATTEMPT'

        # retry with different protocols on the given rse
        while not success and len(protocols):
            protocol = protocols.pop()
            cur_scheme = protocol['scheme']
            trace['protocol'] = cur_scheme

            logger.info('%sTrying to download with %s from %s: %s ' % (log_prefix, cur_scheme, rse_name, did_str))

            attempt = 0
            retries = 2
            # do some retries with the same rse and protocol if the download fails
            while not success and attempt < retries:
                attempt += 1
                item['attemptnr'] = attempt

                try:
                    start_time = time.time()
                    rsemgr.download(rse,
                                    files=item,
                                    dest_dir=dest_dir_path,
                                    force_scheme=cur_scheme,
                                    ignore_checksum=item.get('ignore_checksum', False),
                                    transfer_timeout=item.get('transfer_timeout'))
                    end_time = time.time()

                    trace['transferStart'] = start_time
                    trace['transferEnd'] = end_time
                    trace['clientState'] = 'DONE'
                    item['clientState'] = 'DONE'
                    success = True
                except FileConsistencyMismatch as error:
                    logger.warning(str(error))
                    try:
                        pfn = item.get('pfn')
                        if not pfn:
                            pfns_dict = rsemgr.lfns2pfns(rse,
                                                         lfns={'name': did_name, 'scope': did_scope},
                                                         operation='read',
                                                         scheme=cur_scheme)
                            pfn = pfns_dict[did_str]

                        # NOTE(review): corrupted_item is built but not recorded anywhere;
                        # the append below is disabled in the original code
                        corrupted_item = copy.deepcopy(item)
                        corrupted_item['clientState'] = 'FAIL_VALIDATE'
                        corrupted_item['pfn'] = pfn
                        # self.corrupted_files.append(corrupted_item)
                    except Exception as error:
                        logger.debug('%s%s' % (log_prefix, str(error)))
                    trace['clientState'] = 'FAIL_VALIDATE'
                except Exception as error:
                    logger.warning(str(error))
                    trace['clientState'] = str(type(error).__name__)

                if not success:
                    logger.debug('%sFailed attempt %s/%s' % (log_prefix, attempt, retries))

                # one trace per attempt, successful or not
                send_trace(trace, self.client.host, self.user_agent)

    if not success:
        logger.error('%sFailed to download file %s' % (log_prefix, did_str))
        item['clientState'] = 'FAILED'
        return item

    duration = round(end_time - start_time, 2)
    size = item.get('bytes')
    size_str = sizefmt(size, self.is_human_readable)
    # only report a rate when both size and a non-zero duration are known
    if size and duration:
        rate = round((size / duration) * 1e-6, 2)
        logger.info('%sFile %s successfully downloaded. %s in %s seconds = %s MBps' % (log_prefix, did_str, size_str, duration, rate))
    else:
        logger.info('%sFile %s successfully downloaded in %s seconds' % (log_prefix, did_str, duration))
    return item
def upload(self, sources_with_settings, summary_file_path=None):
    """
    Register and upload the given files, optionally writing a JSON summary.

    :param sources_with_settings: List of dictionaries of file descriptions. None means optional
        [{'path': 'file1',
          'rse': 'rse_name1',
          'did_scope': None,
          'did_name': None,
          'dataset_name': None,
          'dataset_scope': None,
          'scheme': None,
          'pfn': None,
          'no_register': None,
          'lifetime': None },

         {'path': 'file2',
          'rse': 'rse_name2',
          'did_scope': None,
          'did_name': None,
          'dataset_name': None,
          'dataset_scope': None,
          'scheme': None,
          'pfn': None,
          'no_register': None,
          'lifetime': None }]
    :param summary_file_path: Optional: path of a json file that receives a summary
                              of the files uploaded by this call

    :raises InputValidationError: if a DID is used for both a file and a dataset
    :raises RSEBlacklisted: if a given RSE is not available for writing
    :raises DataIdentifierAlreadyExists: if a file DID is already registered with a different adler32
    """
    logger = self.logger

    self.trace['uuid'] = generate_uuid()

    # check given sources, resolve dirs into files, and collect meta infos
    files = self.collect_and_validate_file_info(sources_with_settings)

    # check if RSE of every file is available for writing
    # and cache rse settings
    registered_dataset_dids = set()
    registered_file_dids = set()
    for file in files:
        rse = file['rse']
        if not self.rses.get(rse):
            rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse))
            if rse_settings['availability_write'] != 1:
                raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse)

        dataset_scope = file.get('dataset_scope')
        dataset_name = file.get('dataset_name')
        if dataset_scope and dataset_name:
            dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
            file['dataset_did_str'] = dataset_did_str
            registered_dataset_dids.add(dataset_did_str)

        registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))

    wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
    if len(wrong_dids):
        raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))

    # clear this set again to ensure that we only try to register datasets once
    registered_dataset_dids = set()
    # collected across all files; must not be reset (or left unbound) per file
    summary = []
    for file in files:
        basename = file['basename']
        logger.info('Preparing upload for file %s' % basename)

        no_register = file.get('no_register')
        pfn = file.get('pfn')
        scheme = file.get('scheme')
        # resolve the RSE of *this* file before it is written into the trace
        rse = file['rse']
        rse_settings = self.rses[rse]

        self.trace['scope'] = file['did_scope']
        self.trace['datasetScope'] = file.get('dataset_scope', '')
        self.trace['dataset'] = file.get('dataset_name', '')
        self.trace['remoteSite'] = rse
        self.trace['filesize'] = file['bytes']

        file_scope = file['did_scope']
        file_name = file['did_name']
        file_did = {'scope': file_scope, 'name': file_name}
        file_did_str = '%s:%s' % (file_scope, file_name)
        dataset_did_str = file.get('dataset_did_str')

        # register a dataset if we need to
        if dataset_did_str and dataset_did_str not in registered_dataset_dids and not no_register:
            registered_dataset_dids.add(dataset_did_str)
            try:
                self.client.add_dataset(scope=file['dataset_scope'],
                                        name=file['dataset_name'],
                                        rules=[{'account': self.account,
                                                'copies': 1,
                                                'rse_expression': rse,
                                                'grouping': 'DATASET',
                                                'lifetime': file['lifetime']}])
                logger.info('Dataset %s successfully created' % dataset_did_str)
            except DataIdentifierAlreadyExists:
                # TODO: Need to check the rules thing!!
                logger.info("Dataset %s already exists" % dataset_did_str)

        replica_for_api = self.convert_file_for_api(file)
        try:
            # if the remote checksum is different this did must not be used
            meta = self.client.get_metadata(file_scope, file_name)
            logger.info('Comparing checksums of %s and %s' % (basename, file_did_str))
            if meta['adler32'] != file['adler32']:
                logger.error('Local checksum %s does not match remote checksum %s' % (file['adler32'], meta['adler32']))
                raise DataIdentifierAlreadyExists

            # add file to rse if it is not registered yet
            replicastate = list(self.client.list_replicas([file_did], all_states=True))
            if rse not in replicastate[0]['rses'] and not no_register:
                logger.info('Adding replica at %s in Rucio catalog' % rse)
                self.client.add_replicas(rse=file['rse'], files=[replica_for_api])
        except DataIdentifierNotFound:
            if not no_register:
                logger.info('Adding replica at %s in Rucio catalog' % rse)
                self.client.add_replicas(rse=file['rse'], files=[replica_for_api])
                if not dataset_did_str:
                    # only need to add rules for files if no dataset is given
                    logger.info('Adding replication rule at %s' % rse)
                    self.client.add_replication_rule([file_did], copies=1, rse_expression=rse, lifetime=file['lifetime'])

        # if file already exists on RSE we're done
        if not rsemgr.exists(rse_settings, file_did):
            protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=scheme)
            # reversed so that pop() below yields the protocols in their original order
            protocols.reverse()
            success = False
            while not success and len(protocols):
                protocol = protocols.pop()
                logger.info('Trying upload to %s with protocol %s' % (rse, protocol['scheme']))
                lfn = {}
                lfn['filename'] = file['basename']
                lfn['scope'] = file['did_scope']
                lfn['name'] = file['did_name']
                lfn['adler32'] = file['adler32']
                lfn['filesize'] = file['bytes']

                self.trace['protocol'] = protocol['scheme']
                self.trace['transferStart'] = time.time()
                try:
                    state = rsemgr.upload(rse_settings=rse_settings,
                                          lfns=lfn,
                                          source_dir=file['dirname'],
                                          force_scheme=protocol['scheme'],
                                          force_pfn=pfn)
                    success = True
                    file['upload_result'] = state
                except (ServiceUnavailable, ResourceTemporaryUnavailable) as error:
                    logger.warning('Upload attempt failed')
                    logger.debug('Exception: %s' % str(error))

            if success:
                self.trace['transferEnd'] = time.time()
                self.trace['clientState'] = 'DONE'
                file['state'] = 'A'
                logger.info('File %s successfully uploaded' % basename)
                send_trace(self.trace, self.client.host, self.user_agent, logger=logger)
                if summary_file_path:
                    summary.append(copy.deepcopy(file))
            else:
                logger.error('Failed to upload file %s' % basename)
                # TODO trace?
                continue  # skip attach_did and update_states for this file
        else:
            logger.info('File already exists on RSE. Skipped upload')

        if not no_register:
            # add file to dataset if needed
            if dataset_did_str:
                try:
                    logger.info('Attaching file to dataset %s' % dataset_did_str)
                    self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                except Exception as error:
                    logger.warning('Failed to attach file to the dataset')
                    logger.warning(error)

            logger.info('Setting replica state to available')
            replica_for_api = self.convert_file_for_api(file)
            self.client.update_replicas_states(rse, files=[replica_for_api])

    if summary_file_path:
        final_summary = {}
        for file in summary:
            file_scope = file['did_scope']
            file_name = file['did_name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            # NOTE(review): 'scope'/'name' are read here while the rest of this method
            # uses 'did_scope'/'did_name' - confirm the collected file info has both
            final_summary[file_did_str] = {'scope': file['scope'],
                                           'name': file['name'],
                                           'bytes': file['bytes'],
                                           'rse': file['rse'],
                                           'pfn': file['upload_result']['pfn'],
                                           'guid': file['meta']['guid'],
                                           'adler32': file['adler32'],
                                           'md5': file['md5']}

        # json.dump writes text, so the file must be opened in text mode
        with open(summary_file_path, 'w') as summary_file:
            json.dump(final_summary, summary_file, sort_keys=True, indent=1)