def _downloader(self, pfn, protocol, human, input_queue, output_queue, user_agent, threadnb, total_threads, trace_endpoint, trace_pattern, transfer_timeout=None):
    """
    Worker-thread loop: pops file descriptions from input_queue and downloads
    each one, iterating over replica sites (RSEs), then protocols, then a fixed
    number of retry attempts, until one combination succeeds. One trace is sent
    per attempted RSE/protocol pair. The thread returns when the queue is empty.

    :param pfn: if given, download exactly this PFN (checksum validation is skipped)
    :param protocol: preferred scheme used to order an RSE's protocols (ignored when pfn is set)
    :param human: if True, report the size human-readable in the success log line
    :param input_queue: queue of file-description dicts to download
    :param output_queue: queue receiving a result dict per finished/corrupted file
    :param user_agent: user agent string forwarded with every trace
    :param threadnb: number of this thread, used in the log prefix
    :param total_threads: total number of downloader threads
    :param trace_endpoint: endpoint the traces are sent to
    :param trace_pattern: dict used as template for every trace of this thread
    :param transfer_timeout: optional timeout in seconds passed to rsemgr.download
    """
    # cache RSE info per thread so every RSE is resolved at most once
    rse_dict = {}
    thread_prefix = 'Thread %s/%s' % (threadnb, total_threads)
    while True:
        # an empty queue means all work has been handed out -> thread terminates
        try:
            file = input_queue.get_nowait()
        except Empty:
            return
        dest_dir = file['dest_dir']
        file_scope = file['scope']
        file_name = file['name']
        file_didstr = '%s:%s' % (file_scope, file_name)

        # arguments for rsemgr.download already known
        dlfile = {}
        dlfile['name'] = file_name
        dlfile['scope'] = file_scope
        dlfile['adler32'] = file['adler32']
        # a user-supplied PFN may not match the catalogue entry, so skip checksum validation
        ignore_checksum = True if pfn else False
        if pfn:
            dlfile['pfn'] = pfn

        logger.info('%s : Starting the download of %s' % (thread_prefix, file_didstr))
        trace = deepcopy(trace_pattern)
        trace.update({'scope': file_scope,
                      'filename': file_name,
                      'datasetScope': file['dataset_scope'],
                      'dataset': file['dataset_name'],
                      'filesize': file['bytes']})

        rses = list(file['rses'].keys())
        if rses == []:
            logger.warning('%s : File %s has no available replicas. Cannot be downloaded.' % (thread_prefix, file_didstr))
            trace['clientState'] = 'FILE_NOT_FOUND'
            self.send_trace(trace, trace_endpoint, user_agent)
            input_queue.task_done()
            continue

        # spread the load over the replicas by randomising the source order
        random.shuffle(rses)
        logger.debug('%s : Potential sources : %s' % (thread_prefix, str(rses)))
        success = False
        while not success and len(rses):
            rse_name = rses.pop()
            if rse_name not in rse_dict:
                try:
                    rse_dict[rse_name] = rsemgr.get_rse_info(rse_name)
                except RSENotFound:
                    logger.warning('%s : Could not get info of RSE %s' % (thread_prefix, rse_name))
                    continue
            rse = rse_dict[rse_name]

            if not rse['availability_read']:
                logger.info('%s : %s is blacklisted for reading' % (thread_prefix, rse_name))
                continue

            try:
                if pfn:
                    # the PFN fixes the scheme; select the single matching protocol
                    protocols = [rsemgr.select_protocol(rse, operation='read', scheme=pfn.split(':')[0])]
                else:
                    # reversed so that .pop() below yields the best-ranked protocol first
                    protocols = rsemgr.get_protocols_ordered(rse, operation='read', scheme=protocol)
                    protocols.reverse()
            except RSEProtocolNotSupported as error:
                logger.info('%s : The protocol specfied (%s) is not supported by %s' % (thread_prefix, protocol, rse_name))
                logger.debug(error)
                continue

            logger.debug('%s : %d possible protocol(s) for read' % (thread_prefix, len(protocols)))
            trace['remoteSite'] = rse_name
            trace['clientState'] = 'DOWNLOAD_ATTEMPT'

            while not success and len(protocols):
                protocol_retry = protocols.pop()
                logger.debug('%s : Trying protocol %s at %s' % (thread_prefix, protocol_retry['scheme'], rse_name))
                trace['protocol'] = protocol_retry['scheme']
                out = {}
                out['dataset_scope'] = file['dataset_scope']
                out['dataset_name'] = file['dataset_name']
                out['scope'] = file_scope
                out['name'] = file_name

                attempt = 0
                retries = 2
                while not success and attempt < retries:
                    attempt += 1
                    out['attemptnr'] = attempt

                    logger.info('%s : File %s trying from %s' % (thread_prefix, file_didstr, rse_name))
                    try:
                        trace['transferStart'] = time.time()
                        rsemgr.download(rse,
                                        files=[dlfile],
                                        dest_dir=dest_dir,
                                        force_scheme=protocol_retry['scheme'],
                                        ignore_checksum=ignore_checksum,
                                        transfer_timeout=transfer_timeout)
                        trace['transferEnd'] = time.time()
                        trace['clientState'] = 'DONE'
                        out['clientState'] = 'DONE'
                        success = True
                        output_queue.put(out)
                        logger.info('%s : File %s successfully downloaded from %s' % (thread_prefix, file_didstr, rse_name))
                    except KeyboardInterrupt:
                        logger.warning('You pressed Ctrl+C! Exiting gracefully')
                        # BUGFIX: os.getpgid() requires a pid argument; the original
                        # os.kill(os.getpgid(), signal.SIGINT) raised a TypeError and the
                        # signal was never delivered. Signal our own process instead.
                        os.kill(os.getpid(), signal.SIGINT)
                        return
                    except FileConsistencyMismatch as error:
                        logger.warning(str(error))
                        # report the corrupted replica (with its PFN) to the consumer
                        try:
                            pfns_dict = rsemgr.lfns2pfns(rse,
                                                         lfns=[{'name': file_name, 'scope': file_scope}],
                                                         operation='read',
                                                         scheme=protocol)
                            # BUGFIX: use a local name; the original assigned to `pfn`,
                            # clobbering the method parameter for all later files of this thread
                            corrupted_pfn = pfns_dict[file_didstr]
                            out['clientState'] = 'CORRUPTED'
                            out['pfn'] = corrupted_pfn
                            output_queue.put(out)
                        except Exception as error:
                            logger.debug('%s : %s' % (thread_prefix, str(error)))
                        trace['clientState'] = 'FAIL_VALIDATE'
                        logger.debug('%s : Failed attempt %s/%s' % (thread_prefix, attempt, retries))
                    except Exception as error:
                        logger.warning(str(error))
                        trace['clientState'] = str(type(error).__name__)
                        logger.debug('%s : Failed attempt %s/%s' % (thread_prefix, attempt, retries))

                # one trace per attempted RSE/protocol combination
                self.send_trace(trace, trace_endpoint, user_agent, threadnb=threadnb, total_threads=total_threads)

        if success:
            duration = round(trace['transferEnd'] - trace['transferStart'], 2)
            if pfn:
                logger.info('%s : File %s successfully downloaded in %s seconds' % (thread_prefix, file_didstr, duration))
            else:
                logger.info('%s : File %s successfully downloaded. %s in %s seconds = %s MBps' % (thread_prefix,
                                                                                                 file_didstr,
                                                                                                 sizefmt(file['bytes'], human),
                                                                                                 duration,
                                                                                                 round((file['bytes'] / duration) * 1e-6, 2)))
        else:
            logger.error('%s : Cannot download file %s' % (thread_prefix, file_didstr))
        input_queue.task_done()
def upload(self, items, summary_file_path=None, traces_copy_out=None):
    """
    :param items: List of dictionaries. Each dictionary describing a file to upload. Keys:
        path                  - path of the file that will be uploaded
        rse                   - rse expression/name (e.g. 'CERN-PROD_DATADISK') where to upload the file
        did_scope             - Optional: custom did scope (Default: user.<account>)
        did_name              - Optional: custom did name (Default: name of the file)
        dataset_scope         - Optional: custom dataset scope
        dataset_name          - Optional: custom dataset name
        force_scheme          - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None)
        pfn                   - Optional: use a given PFN (this sets no_register to True, and no_register becomes mandatory)
        no_register           - Optional: if True, the file will not be registered in the rucio catalogue
        register_after_upload - Optional: if True, the file will be registered after successful upload
        lifetime              - Optional: the lifetime of the file after it was uploaded
        transfer_timeout      - Optional: time after the upload will be aborted
        guid                  - Optional: guid of the file
        recursive             - Optional: if set, parses the folder structure recursively into collections
    :param summary_file_path: Optional: a path where a summary in form of a json file will be stored
    :param traces_copy_out: reference to an external list, where the traces should be uploaded

    :returns: 0 on success

    :raises InputValidationError: if any input arguments are in a wrong format
    :raises RSEWriteBlocked: if a given RSE is not available for writing
    :raises NoFilesUploaded: if no files were successfully uploaded
    :raises NotAllFilesUploaded: if not all files were successfully uploaded
    """
    # helper to get rse from rse_expression:
    def _pick_random_rse(rse_expression):
        rses = [r['rse'] for r in self.client.list_rses(rse_expression)]  # can raise InvalidRSEExpression
        random.shuffle(rses)
        return rses[0]

    logger = self.logger
    self.trace['uuid'] = generate_uuid()

    # check given sources, resolve dirs into files, and collect meta infos
    files = self._collect_and_validate_file_info(items)
    logger(logging.DEBUG, 'Num. of files that upload client is processing: {}'.format(len(files)))

    # check if RSE of every file is available for writing
    # and cache rse settings
    registered_dataset_dids = set()
    registered_file_dids = set()
    rse_expression = None
    for file in files:
        rse_expression = file['rse']
        # resolve the expression once and cache the chosen RSE for repeated expressions
        rse = self.rse_expressions.setdefault(rse_expression, _pick_random_rse(rse_expression))
        if not self.rses.get(rse):
            rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse, vo=self.client.vo))
            if rse_settings['availability_write'] != 1:
                raise RSEWriteBlocked('%s is not available for writing. No actions have been taken' % rse)

        dataset_scope = file.get('dataset_scope')
        dataset_name = file.get('dataset_name')
        file['rse'] = rse
        if dataset_scope and dataset_name:
            dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
            file['dataset_did_str'] = dataset_did_str
            registered_dataset_dids.add(dataset_did_str)

        registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))

    wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
    if len(wrong_dids):
        raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))
    logger(logging.DEBUG, 'Input validation done.')

    # clear this set again to ensure that we only try to register datasets once
    registered_dataset_dids = set()
    num_succeeded = 0
    summary = []
    for file in files:
        basename = file['basename']
        logger(logging.INFO, 'Preparing upload for file %s' % basename)

        no_register = file.get('no_register')
        register_after_upload = file.get('register_after_upload') and not no_register
        pfn = file.get('pfn')
        force_scheme = file.get('force_scheme')
        delete_existing = False

        trace = copy.deepcopy(self.trace)
        # appending trace to list reference, if the reference exists
        if traces_copy_out is not None:
            traces_copy_out.append(trace)

        rse = file['rse']
        trace['scope'] = file['did_scope']
        trace['datasetScope'] = file.get('dataset_scope', '')
        trace['dataset'] = file.get('dataset_name', '')
        trace['remoteSite'] = rse
        trace['filesize'] = file['bytes']

        file_did = {'scope': file['did_scope'], 'name': file['did_name']}
        dataset_did_str = file.get('dataset_did_str')
        rse_settings = self.rses[rse]
        rse_sign_service = rse_settings.get('sign_url', None)
        is_deterministic = rse_settings.get('deterministic', True)
        if not is_deterministic and not pfn:
            logger(logging.ERROR, 'PFN has to be defined for NON-DETERMINISTIC RSE.')
            continue
        if pfn and is_deterministic:
            logger(logging.WARNING, 'Upload with given pfn implies that no_register is True, except non-deterministic RSEs')
            no_register = True

        # resolving local area networks
        domain = 'wan'
        rse_attributes = {}
        try:
            rse_attributes = self.client.list_rse_attributes(rse)
        # BUGFIX: narrowed from a bare `except:` which would also swallow
        # KeyboardInterrupt/SystemExit; attribute lookup stays best-effort
        except Exception:
            logger(logging.WARNING, 'Attributes of the RSE: %s not available.' % rse)
        if (self.client_location and 'lan' in rse_settings['domain'] and 'site' in rse_attributes):
            if self.client_location['site'] == rse_attributes['site']:
                domain = 'lan'
        logger(logging.DEBUG, '{} domain is used for the upload'.format(domain))

        if not no_register and not register_after_upload:
            self._register_file(file, registered_dataset_dids)

        # if register_after_upload, file should be overwritten if it is not registered
        # otherwise if file already exists on RSE we're done
        if register_after_upload:
            if rsemgr.exists(rse_settings, pfn if pfn else file_did, domain=domain, auth_token=self.auth_token, logger=logger):
                try:
                    self.client.get_did(file['did_scope'], file['did_name'])
                    logger(logging.INFO, 'File already registered. Skipping upload.')
                    trace['stateReason'] = 'File already exists'
                    continue
                except DataIdentifierNotFound:
                    logger(logging.INFO, 'File already exists on RSE. Previous left overs will be overwritten.')
                    delete_existing = True
        elif not is_deterministic and not no_register:
            if rsemgr.exists(rse_settings, pfn, domain=domain, auth_token=self.auth_token, logger=logger):
                logger(logging.INFO, 'File already exists on RSE with given pfn. Skipping upload. Existing replica has to be removed first.')
                trace['stateReason'] = 'File already exists'
                continue
            elif rsemgr.exists(rse_settings, file_did, domain=domain, auth_token=self.auth_token, logger=logger):
                logger(logging.INFO, 'File already exists on RSE with different pfn. Skipping upload.')
                trace['stateReason'] = 'File already exists'
                continue
        else:
            if rsemgr.exists(rse_settings, pfn if pfn else file_did, domain=domain, auth_token=self.auth_token, logger=logger):
                logger(logging.INFO, 'File already exists on RSE. Skipping upload')
                trace['stateReason'] = 'File already exists'
                continue

        # protocol handling and upload
        protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme, domain=domain)
        # reversed so that .pop() yields the best-ranked protocol first
        protocols.reverse()
        success = False
        state_reason = ''
        logger(logging.DEBUG, str(protocols))
        while not success and len(protocols):
            protocol = protocols.pop()
            cur_scheme = protocol['scheme']
            logger(logging.INFO, 'Trying upload with %s to %s' % (cur_scheme, rse))
            lfn = {}
            lfn['filename'] = basename
            lfn['scope'] = file['did_scope']
            lfn['name'] = file['did_name']

            for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                if checksum_name in file:
                    lfn[checksum_name] = file[checksum_name]

            lfn['filesize'] = file['bytes']

            sign_service = None
            if cur_scheme == 'https':
                sign_service = rse_sign_service

            trace['protocol'] = cur_scheme
            trace['transferStart'] = time.time()
            logger(logging.DEBUG, 'Processing upload with the domain: {}'.format(domain))
            try:
                pfn = self._upload_item(rse_settings=rse_settings,
                                        rse_attributes=rse_attributes,
                                        lfn=lfn,
                                        source_dir=file['dirname'],
                                        domain=domain,
                                        force_scheme=cur_scheme,
                                        force_pfn=pfn,
                                        transfer_timeout=file.get('transfer_timeout'),
                                        delete_existing=delete_existing,
                                        sign_service=sign_service)
                logger(logging.DEBUG, 'Upload done.')
                success = True
                file['upload_result'] = {0: True, 1: None, 'success': True, 'pfn': pfn}  # needs to be removed
            except (ServiceUnavailable, ResourceTemporaryUnavailable, RSEOperationNotSupported, RucioException) as error:
                logger(logging.WARNING, 'Upload attempt failed')
                logger(logging.INFO, 'Exception: %s' % str(error), exc_info=True)
                state_reason = str(error)

        if success:
            num_succeeded += 1
            trace['transferEnd'] = time.time()
            trace['clientState'] = 'DONE'
            file['state'] = 'A'
            logger(logging.INFO, 'Successfully uploaded file %s' % basename)
            self._send_trace(trace)

            if summary_file_path:
                summary.append(copy.deepcopy(file))

            if not no_register:
                if register_after_upload:
                    self._register_file(file, registered_dataset_dids)
                replica_for_api = self._convert_file_for_api(file)
                if not self.client.update_replicas_states(rse, files=[replica_for_api]):
                    logger(logging.WARNING, 'Failed to update replica state')

            # add file to dataset if needed
            if dataset_did_str and not no_register:
                try:
                    self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                except Exception as error:
                    logger(logging.WARNING, 'Failed to attach file to the dataset')
                    logger(logging.DEBUG, 'Attaching to dataset {}'.format(str(error)))
        else:
            trace['clientState'] = 'FAILED'
            trace['stateReason'] = state_reason
            self._send_trace(trace)
            logger(logging.ERROR, 'Failed to upload file %s' % basename)

    if summary_file_path:
        logger(logging.DEBUG, 'Summary will be available at {}'.format(summary_file_path))
        final_summary = {}
        for file in summary:
            file_scope = file['did_scope']
            file_name = file['did_name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            final_summary[file_did_str] = {'scope': file_scope,
                                           'name': file_name,
                                           'bytes': file['bytes'],
                                           'rse': file['rse'],
                                           'pfn': file['upload_result'].get('pfn', ''),
                                           'guid': file['meta']['guid']}
            for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                if checksum_name in file:
                    final_summary[file_did_str][checksum_name] = file[checksum_name]
        with open(summary_file_path, 'w') as summary_file:
            json.dump(final_summary, summary_file, sort_keys=True, indent=1)

    if num_succeeded == 0:
        raise NoFilesUploaded()
    elif num_succeeded != len(files):
        raise NotAllFilesUploaded()
    return 0
def upload(self, items, summary_file_path=None):
    """
    :param items: List of dictionaries. Each dictionary describing a file to upload. Keys:
        path             - path of the file that will be uploaded
        rse              - rse name (e.g. 'CERN-PROD_DATADISK') where to upload the file
        did_scope        - Optional: custom did scope (Default: user.<account>)
        did_name         - Optional: custom did name (Default: name of the file)
        dataset_scope    - Optional: custom dataset scope
        dataset_name     - Optional: custom dataset name
        force_scheme     - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None)
        pfn              - Optional: use a given PFN (this sets no_register to True)
        no_register      - Optional: if True, the file will not be registered in the rucio catalogue
        lifetime         - Optional: the lifetime of the file after it was uploaded
        transfer_timeout - Optional: time after the upload will be aborted
        guid             - Optional: guid of the file
    :param summary_file_path: Optional: a path where a summary in form of a json file will be stored

    :returns: 0 on success

    :raises InputValidationError: if any input arguments are in a wrong format
    :raises RSEBlacklisted: if a given RSE is not available for writing
    :raises NoFilesUploaded: if no files were successfully uploaded
    :raises NotAllFilesUploaded: if not all files were successfully uploaded
    """
    logger = self.logger

    self.trace['uuid'] = generate_uuid()

    # check given sources, resolve dirs into files, and collect meta infos
    files = self._collect_and_validate_file_info(items)

    # check if RSE of every file is available for writing
    # and cache rse settings
    registered_dataset_dids = set()
    registered_file_dids = set()
    for file in files:
        rse = file['rse']
        if not self.rses.get(rse):
            rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse))
            if rse_settings['availability_write'] != 1:
                raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse)

        dataset_scope = file.get('dataset_scope')
        dataset_name = file.get('dataset_name')
        if dataset_scope and dataset_name:
            dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
            file['dataset_did_str'] = dataset_did_str
            registered_dataset_dids.add(dataset_did_str)

        registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))
    wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
    if len(wrong_dids):
        raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))

    # clear this set again to ensure that we only try to register datasets once
    registered_dataset_dids = set()
    num_succeeded = 0
    # BUGFIX: initialise the summary once, outside the per-file loop. The original
    # reset it inside the loop, so the summary only ever contained the last file
    # (and was an undefined name if every file was skipped).
    summary = []
    for file in files:
        basename = file['basename']
        logger.info('Preparing upload for file %s' % basename)

        no_register = file.get('no_register')
        pfn = file.get('pfn')
        force_scheme = file.get('force_scheme')

        # BUGFIX: resolve the RSE of *this* file before writing the trace; the
        # original read `rse` before assigning it in this loop, so the trace
        # carried a stale value left over from the validation loop above.
        rse = file['rse']
        rse_settings = self.rses[rse]

        self.trace['scope'] = file['did_scope']
        self.trace['datasetScope'] = file.get('dataset_scope', '')
        self.trace['dataset'] = file.get('dataset_name', '')
        self.trace['remoteSite'] = rse
        self.trace['filesize'] = file['bytes']

        file_did = {'scope': file['did_scope'], 'name': file['did_name']}
        dataset_did_str = file.get('dataset_did_str')

        if not no_register:
            self._register_file(file, registered_dataset_dids)

        # if file already exists on RSE we're done
        if rsemgr.exists(rse_settings, file_did):
            logger.info('File already exists on RSE. Skipping upload')
            continue

        protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme)
        # reversed so that .pop() yields the best-ranked protocol first
        protocols.reverse()
        success = False
        while not success and len(protocols):
            protocol = protocols.pop()
            cur_scheme = protocol['scheme']
            logger.info('Trying upload with %s to %s' % (cur_scheme, rse))
            lfn = {}
            lfn['filename'] = basename
            lfn['scope'] = file['did_scope']
            lfn['name'] = file['did_name']
            lfn['adler32'] = file['adler32']
            lfn['filesize'] = file['bytes']

            self.trace['protocol'] = cur_scheme
            self.trace['transferStart'] = time.time()
            try:
                state = rsemgr.upload(rse_settings=rse_settings,
                                      lfns=lfn,
                                      source_dir=file['dirname'],
                                      force_scheme=cur_scheme,
                                      force_pfn=pfn,
                                      transfer_timeout=file.get('transfer_timeout'))
                success = True
                file['upload_result'] = state
            except (ServiceUnavailable, ResourceTemporaryUnavailable) as error:
                logger.warning('Upload attempt failed')
                logger.debug('Exception: %s' % str(error))

        if success:
            num_succeeded += 1
            self.trace['transferEnd'] = time.time()
            self.trace['clientState'] = 'DONE'
            file['state'] = 'A'
            logger.info('Successfully uploaded file %s' % basename)
            send_trace(self.trace, self.client.host, self.client.user_agent)
            if summary_file_path:
                summary.append(copy.deepcopy(file))

            # add file to dataset if needed
            if dataset_did_str and not no_register:
                try:
                    self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                except Exception as error:
                    logger.warning('Failed to attach file to the dataset')
                    logger.debug(error)
            if not no_register:
                replica_for_api = self._convert_file_for_api(file)
                if not self.client.update_replicas_states(rse, files=[replica_for_api]):
                    logger.warning('Failed to update replica state')
        else:
            logger.error('Failed to upload file %s' % basename)

    if summary_file_path:
        final_summary = {}
        for file in summary:
            file_scope = file['did_scope']
            file_name = file['did_name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            # BUGFIX: the file dicts are keyed 'did_scope'/'did_name' (as read two
            # lines above); the original indexed non-existent 'scope'/'name' keys.
            final_summary[file_did_str] = {'scope': file_scope,
                                           'name': file_name,
                                           'bytes': file['bytes'],
                                           'rse': file['rse'],
                                           'pfn': file['upload_result']['pfn'],
                                           'guid': file['meta']['guid'],
                                           'adler32': file['adler32'],
                                           'md5': file['md5']}
        # BUGFIX: json.dump writes str, so the file must be opened in text mode;
        # 'wb' raises a TypeError on the first write in Python 3.
        with open(summary_file_path, 'w') as summary_file:
            json.dump(final_summary, summary_file, sort_keys=True, indent=1)

    if num_succeeded == 0:
        raise NoFilesUploaded()
    elif num_succeeded != len(files):
        raise NotAllFilesUploaded()
    return 0
def _download_item(self, item: dict, trace: dict, log_prefix: str = '') -> dict:
    """
    Downloads the given item and sends traces for success/failure.
    (This function is meant to be used as class internal only)

    Tries every replica site (RSE), every supported protocol of that RSE, and
    up to two attempts per protocol, until one download succeeds.

    :param item: dictionary that describes the item to download
    :param trace: dictionary representing a pattern of trace that will be send
    :param log_prefix: string that will be put at the beginning of every log message

    :returns: dictionary with all attributes from the input item and a clientState attribute
    """
    logger = self.logger

    did_scope = item['scope']
    did_name = item['name']
    did_str = '%s:%s' % (did_scope, did_name)
    logger.info('%sPreparing download of %s' % (log_prefix, did_str))

    trace['scope'] = did_scope
    trace['filename'] = did_name
    # setdefault: keep values a caller may already have put into the trace pattern
    trace.setdefault('dataset_scope', item.get('dataset_scope', ''))
    trace.setdefault('dataset', item.get('dataset_name', ''))
    trace.setdefault('filesize', item.get('bytes'))

    # if file already exists, set state, send trace, and return
    dest_dir_path = item['dest_dir_path']
    dest_file_path = os.path.join(dest_dir_path, did_name)
    if os.path.isfile(dest_file_path):
        logger.info('%sFile exists already locally: %s' % (log_prefix, did_str))
        item['clientState'] = 'ALREADY_DONE'
        trace['transferStart'] = time.time()
        trace['transferEnd'] = time.time()
        trace['clientState'] = 'ALREADY_DONE'
        send_trace(trace, self.client.host, self.user_agent)
        return item

    # check if file has replicas
    rse_names = list(item['rses'].keys())
    if not len(rse_names):
        logger.warning('%sFile %s has no available replicas. Cannot be downloaded' % (log_prefix, did_str))
        item['clientState'] = 'FILE_NOT_FOUND'
        trace['clientState'] = 'FILE_NOT_FOUND'
        send_trace(trace, self.client.host, self.user_agent)
        return item

    # list_replicas order is: best rse at [0]
    # reversed so that .pop() below takes the best-ranked RSE first
    rse_names.reverse()
    logger.debug('%sPotential sources: %s' % (log_prefix, str(rse_names)))

    success = False
    # retry with different rses if one is not available or fails
    while not success and len(rse_names):
        rse_name = rse_names.pop()
        try:
            rse = rsemgr.get_rse_info(rse_name)
        except RSENotFound:
            logger.warning('%sCould not get info of RSE %s' % (log_prefix, rse_name))
            continue

        if not rse['availability_read']:
            logger.info('%s%s is blacklisted for reading' % (log_prefix, rse_name))
            continue

        force_scheme = item.get('force_scheme')
        try:
            protocols = rsemgr.get_protocols_ordered(rse, operation='read', scheme=force_scheme)
            # reversed so that .pop() below takes the best-ranked protocol first
            protocols.reverse()
        except RSEProtocolNotSupported as error:
            logger.info('%sThe protocol specfied (%s) is not supported by %s' % (log_prefix, force_scheme, rse_name))
            logger.debug(error)
            continue
        logger.debug('%sPotential protocol(s) read: %s' % (log_prefix, protocols))

        trace['remoteSite'] = rse_name
        trace['clientState'] = 'DOWNLOAD_ATTEMPT'

        # retry with different protocols on the given rse
        while not success and len(protocols):
            protocol = protocols.pop()
            cur_scheme = protocol['scheme']
            trace['protocol'] = cur_scheme
            logger.info('%sTrying to download with %s from %s: %s ' % (log_prefix, cur_scheme, rse_name, did_str))

            attempt = 0
            retries = 2
            # do some retries with the same rse and protocol if the download fails
            while not success and attempt < retries:
                attempt += 1
                item['attemptnr'] = attempt
                try:
                    start_time = time.time()
                    rsemgr.download(rse,
                                    files=item,
                                    dest_dir=dest_dir_path,
                                    force_scheme=cur_scheme,
                                    ignore_checksum=item.get('ignore_checksum', False),
                                    transfer_timeout=item.get('transfer_timeout'))
                    end_time = time.time()
                    # start_time/end_time intentionally survive the loop: they are
                    # reused for the rate calculation after a successful download
                    trace['transferStart'] = start_time
                    trace['transferEnd'] = end_time
                    trace['clientState'] = 'DONE'
                    item['clientState'] = 'DONE'
                    success = True
                except FileConsistencyMismatch as error:
                    # checksum validation failed: record the corrupted replica's PFN
                    logger.warning(str(error))
                    try:
                        pfn = item.get('pfn')
                        if not pfn:
                            pfns_dict = rsemgr.lfns2pfns(rse,
                                                         lfns={'name': did_name, 'scope': did_scope},
                                                         operation='read',
                                                         scheme=cur_scheme)
                            pfn = pfns_dict[did_str]
                        corrupted_item = copy.deepcopy(item)
                        corrupted_item['clientState'] = 'FAIL_VALIDATE'
                        corrupted_item['pfn'] = pfn
                        # self.corrupted_files.append(corrupted_item)
                    except Exception as error:
                        logger.debug('%s%s' % (log_prefix, str(error)))
                    trace['clientState'] = 'FAIL_VALIDATE'
                except Exception as error:
                    logger.warning(str(error))
                    trace['clientState'] = str(type(error).__name__)
                if not success:
                    logger.debug('%sFailed attempt %s/%s' % (log_prefix, attempt, retries))

            # one trace per attempted RSE/protocol combination
            send_trace(trace, self.client.host, self.user_agent)

    if not success:
        logger.error('%sFailed to download file %s' % (log_prefix, did_str))
        item['clientState'] = 'FAILED'
        return item

    duration = round(end_time - start_time, 2)
    size = item.get('bytes')
    size_str = sizefmt(size, self.is_human_readable)
    if size and duration:
        rate = round((size / duration) * 1e-6, 2)
        logger.info('%sFile %s successfully downloaded. %s in %s seconds = %s MBps' % (log_prefix, did_str, size_str, duration, rate))
    else:
        logger.info('%sFile %s successfully downloaded in %s seconds' % (log_prefix, did_str, duration))
    return item
def upload(self, sources_with_settings, summary_file_path=None):
    """
    Uploads a list of files to their target RSEs and registers them in the
    Rucio catalogue (unless no_register is set per file).

    :param sources_with_settings: list of dictionaries of file descriptions. None means optional
        [{'path': 'file1',
          'rse': 'rse_name1',
          'did_scope': None,
          'did_name': None,
          'dataset_name': None,
          'dataset_scope': None,
          'scheme': None,
          'pfn': None,
          'no_register': None,
          'lifetime': None },
         {'path': 'file2',
          'rse': 'rse_name2',
          'did_scope': None,
          'did_name': None,
          'dataset_name': None,
          'dataset_scope': None,
          'scheme': None,
          'pfn': None,
          'no_register': None,
          'lifetime': None }]
    :param summary_file_path: Optional: a path where a summary in form of a json file will be stored

    :raises InputValidationError: if any input arguments are in a wrong format
    :raises RSEBlacklisted: if a given RSE is not available for writing
    """
    logger = self.logger

    self.trace['uuid'] = generate_uuid()

    # check given sources, resolve dirs into files, and collect meta infos
    files = self.collect_and_validate_file_info(sources_with_settings)

    # check if RSE of every file is available for writing
    # and cache rse settings
    registered_dataset_dids = set()
    registered_file_dids = set()
    for file in files:
        rse = file['rse']
        if not self.rses.get(rse):
            rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse))
            if rse_settings['availability_write'] != 1:
                raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse)

        dataset_scope = file.get('dataset_scope')
        dataset_name = file.get('dataset_name')
        if dataset_scope and dataset_name:
            dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
            file['dataset_did_str'] = dataset_did_str
            registered_dataset_dids.add(dataset_did_str)

        registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))
    wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
    if len(wrong_dids):
        raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))

    # clear this set again to ensure that we only try to register datasets once
    registered_dataset_dids = set()
    # BUGFIX: initialise the summary once, outside the per-file loop. The original
    # reset it inside the upload branch, losing all but the last uploaded file,
    # and left it undefined (NameError) if the first file already existed on the RSE.
    summary = []
    for file in files:
        basename = file['basename']
        logger.info('Preparing upload for file %s' % basename)

        no_register = file.get('no_register')
        pfn = file.get('pfn')
        scheme = file.get('scheme')

        # BUGFIX: resolve the RSE of *this* file before writing the trace; the
        # original read `rse` before assigning it in this loop, so the trace
        # carried a stale value left over from the validation loop above.
        rse = file['rse']
        rse_settings = self.rses[rse]

        self.trace['scope'] = file['did_scope']
        self.trace['datasetScope'] = file.get('dataset_scope', '')
        self.trace['dataset'] = file.get('dataset_name', '')
        self.trace['remoteSite'] = rse
        self.trace['filesize'] = file['bytes']

        file_scope = file['did_scope']
        file_name = file['did_name']

        file_did = {'scope': file_scope, 'name': file_name}
        file_did_str = '%s:%s' % (file_scope, file_name)
        dataset_did_str = file.get('dataset_did_str')

        # register a dataset if we need to
        if dataset_did_str and dataset_did_str not in registered_dataset_dids and not no_register:
            registered_dataset_dids.add(dataset_did_str)
            try:
                self.client.add_dataset(scope=file['dataset_scope'],
                                        name=file['dataset_name'],
                                        rules=[{'account': self.account,
                                                'copies': 1,
                                                'rse_expression': rse,
                                                'grouping': 'DATASET',
                                                'lifetime': file['lifetime']}])
                logger.info('Dataset %s successfully created' % dataset_did_str)
            except DataIdentifierAlreadyExists:
                # TODO: Need to check the rules thing!!
                logger.info("Dataset %s already exists" % dataset_did_str)

        replica_for_api = self.convert_file_for_api(file)
        try:
            # if the remote checksum is different this did must not be used
            meta = self.client.get_metadata(file_scope, file_name)
            logger.info('Comparing checksums of %s and %s' % (basename, file_did_str))
            if meta['adler32'] != file['adler32']:
                logger.error('Local checksum %s does not match remote checksum %s' % (file['adler32'], meta['adler32']))
                raise DataIdentifierAlreadyExists

            # add file to rse if it is not registered yet
            replicastate = list(self.client.list_replicas([file_did], all_states=True))
            if rse not in replicastate[0]['rses'] and not no_register:
                logger.info('Adding replica at %s in Rucio catalog' % rse)
                self.client.add_replicas(rse=file['rse'], files=[replica_for_api])
        except DataIdentifierNotFound:
            if not no_register:
                logger.info('Adding replica at %s in Rucio catalog' % rse)
                self.client.add_replicas(rse=file['rse'], files=[replica_for_api])
                if not dataset_did_str:
                    # only need to add rules for files if no dataset is given
                    logger.info('Adding replication rule at %s' % rse)
                    self.client.add_replication_rule([file_did], copies=1, rse_expression=rse, lifetime=file['lifetime'])

        # if file already exists on RSE we're done
        if not rsemgr.exists(rse_settings, file_did):
            protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=scheme)
            # reversed so that .pop() yields the best-ranked protocol first
            protocols.reverse()
            success = False
            while not success and len(protocols):
                protocol = protocols.pop()
                logger.info('Trying upload to %s with protocol %s' % (rse, protocol['scheme']))
                lfn = {}
                lfn['filename'] = file['basename']
                lfn['scope'] = file['did_scope']
                lfn['name'] = file['did_name']
                lfn['adler32'] = file['adler32']
                lfn['filesize'] = file['bytes']

                self.trace['protocol'] = protocol['scheme']
                self.trace['transferStart'] = time.time()
                try:
                    state = rsemgr.upload(rse_settings=rse_settings,
                                          lfns=lfn,
                                          source_dir=file['dirname'],
                                          force_scheme=protocol['scheme'],
                                          force_pfn=pfn)
                    success = True
                    file['upload_result'] = state
                except (ServiceUnavailable, ResourceTemporaryUnavailable) as error:
                    logger.warning('Upload attempt failed')
                    logger.debug('Exception: %s' % str(error))

            if success:
                self.trace['transferEnd'] = time.time()
                self.trace['clientState'] = 'DONE'
                file['state'] = 'A'
                logger.info('File %s successfully uploaded' % basename)
                send_trace(self.trace, self.client.host, self.user_agent, logger=logger)
                if summary_file_path:
                    summary.append(copy.deepcopy(file))
            else:
                logger.error('Failed to upload file %s' % basename)
                # TODO trace?
                continue  # skip attach_did and update_states for this file
        else:
            logger.info('File already exists on RSE. Skipped upload')

        if not no_register:
            # add file to dataset if needed
            if dataset_did_str:
                try:
                    logger.info('Attaching file to dataset %s' % dataset_did_str)
                    self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                except Exception as error:
                    logger.warning('Failed to attach file to the dataset')
                    logger.warning(error)
            logger.info('Setting replica state to available')
            replica_for_api = self.convert_file_for_api(file)
            self.client.update_replicas_states(rse, files=[replica_for_api])

    if summary_file_path:
        final_summary = {}
        for file in summary:
            file_scope = file['did_scope']
            file_name = file['did_name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            # BUGFIX: the file dicts are keyed 'did_scope'/'did_name' (as read two
            # lines above); the original indexed non-existent 'scope'/'name' keys.
            final_summary[file_did_str] = {'scope': file_scope,
                                           'name': file_name,
                                           'bytes': file['bytes'],
                                           'rse': file['rse'],
                                           'pfn': file['upload_result']['pfn'],
                                           'guid': file['meta']['guid'],
                                           'adler32': file['adler32'],
                                           'md5': file['md5']}
        # BUGFIX: json.dump writes str, so the file must be opened in text mode;
        # 'wb' raises a TypeError on the first write in Python 3.
        with open(summary_file_path, 'w') as summary_file:
            json.dump(final_summary, summary_file, sort_keys=True, indent=1)
def upload(self, items, summary_file_path=None, traces_copy_out=None):
    """
    Upload a list of local files to their target RSEs and (optionally)
    register them in the Rucio catalogue.

    :param items: List of dictionaries. Each dictionary describing a file to upload. Keys:
        path                  - path of the file that will be uploaded
        rse                   - rse name (e.g. 'CERN-PROD_DATADISK') where to upload the file
        did_scope             - Optional: custom did scope (Default: user.<account>)
        did_name              - Optional: custom did name (Default: name of the file)
        dataset_scope         - Optional: custom dataset scope
        dataset_name          - Optional: custom dataset name
        force_scheme          - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None)
        pfn                   - Optional: use a given PFN (this sets no_register to True, and no_register becomes mandatory)
        no_register           - Optional: if True, the file will not be registered in the rucio catalogue
        register_after_upload - Optional: if True, the file will be registered after successful upload
        lifetime              - Optional: the lifetime of the file after it was uploaded
        transfer_timeout      - Optional: time after the upload will be aborted
        guid                  - Optional: guid of the file
    :param summary_file_path: Optional: a path where a summary in form of a json file will be stored
    :param traces_copy_out: reference to an external list, where the traces should be uploaded

    :returns: 0 on success

    :raises InputValidationError: if any input arguments are in a wrong format
    :raises RSEBlacklisted: if a given RSE is not available for writing
    :raises NoFilesUploaded: if no files were successfully uploaded
    :raises NotAllFilesUploaded: if not all files were successfully uploaded
    """
    logger = self.logger

    # Local import to avoid a hard module-level dependency on admix;
    # used only for the injected debug instrumentation below.
    from admix.helper import helper
    helper.global_dictionary['logger'].Info(' r: start')

    # One uuid per upload() call so all per-file traces can be correlated.
    self.trace['uuid'] = generate_uuid()

    helper.global_dictionary['logger'].Info(' r: collect and validate')
    # check given sources, resolve dirs into files, and collect meta infos
    files = self._collect_and_validate_file_info(items)

    # Check that the RSE of every file is available for writing and cache
    # the RSE settings so they are fetched from the server only once.
    registered_dataset_dids = set()
    registered_file_dids = set()
    helper.global_dictionary['logger'].Info(' r: check if files are available')
    for file in files:
        rse = file['rse']
        if not self.rses.get(rse):
            rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse))
            if rse_settings['availability_write'] != 1:
                raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse)

        dataset_scope = file.get('dataset_scope')
        dataset_name = file.get('dataset_name')
        if dataset_scope and dataset_name:
            dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
            file['dataset_did_str'] = dataset_did_str
            registered_dataset_dids.add(dataset_did_str)

        registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))

    # A DID must not be used for both a file and a dataset in the same call.
    wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
    if len(wrong_dids):
        raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))

    # clear this set again to ensure that we only try to register datasets once
    registered_dataset_dids = set()
    num_succeeded = 0
    summary = []
    helper.global_dictionary['logger'].Info(' r: Starting loop')
    for file in files:
        helper.global_dictionary['logger'].Info('Start')
        basename = file['basename']
        logger.info('Preparing upload for file %s' % basename)

        no_register = file.get('no_register')
        # register_after_upload is meaningless when registration is disabled
        register_after_upload = file.get('register_after_upload') and not no_register
        pfn = file.get('pfn')
        force_scheme = file.get('force_scheme')
        delete_existing = False

        # BUGFIX: resolve the current file's RSE *before* the trace is filled
        # in. Previously 'rse' still held the value from the validation loop
        # (or the previous iteration), so trace['remoteSite'] was stale.
        rse = file['rse']
        rse_settings = self.rses[rse]
        rse_sign_service = rse_settings.get('sign_url', None)
        is_deterministic = rse_settings.get('deterministic', True)

        helper.global_dictionary['logger'].Info('1')
        trace = copy.deepcopy(self.trace)
        helper.global_dictionary['logger'].Info('2')
        # appending trace to list reference, if the reference exists
        if traces_copy_out is not None:
            helper.global_dictionary['logger'].Info('3')
            traces_copy_out.append(trace)
        helper.global_dictionary['logger'].Info('4')
        trace['scope'] = file['did_scope']
        trace['datasetScope'] = file.get('dataset_scope', '')
        trace['dataset'] = file.get('dataset_name', '')
        trace['remoteSite'] = rse
        trace['filesize'] = file['bytes']

        file_did = {'scope': file['did_scope'], 'name': file['did_name']}
        dataset_did_str = file.get('dataset_did_str')

        if not is_deterministic and not pfn:
            logger.error('PFN has to be defined for NON-DETERMINISTIC RSE.')
            continue
        if pfn and is_deterministic:
            logger.warning('Upload with given pfn implies that no_register is True, except non-deterministic RSEs')
            no_register = True

        helper.global_dictionary['logger'].Info('5')
        # Register up front unless registration is disabled or deferred.
        if not no_register and not register_after_upload:
            helper.global_dictionary['logger'].Info('6')
            self._register_file(file, registered_dataset_dids, helper)
        helper.global_dictionary['logger'].Info('7')

        # NOTE(review): the upstream pre-upload existence checks
        # (rsemgr.exists + get_did, with delete_existing/skip logic for the
        # register_after_upload and non-deterministic cases) were disabled
        # here; files are always (re-)uploaded.
        helper.global_dictionary['logger'].Info('11')

        # NOTE(review): LAN-domain resolution via RSE attributes was disabled
        # here; uploads always use the WAN domain.
        domain = 'wan'

        # protocol handling and upload
        protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme, domain=domain)
        helper.global_dictionary['logger'].Info('14')
        # get_protocols_ordered returns best-first; reverse so pop() yields
        # the preferred protocol first.
        protocols.reverse()
        helper.global_dictionary['logger'].Info('15')
        success = False
        state_reason = ''
        while not success and len(protocols):
            helper.global_dictionary['logger'].Info('16')
            protocol = protocols.pop()
            cur_scheme = protocol['scheme']
            logger.info('Trying upload with %s to %s' % (cur_scheme, rse))
            lfn = {}
            lfn['filename'] = basename
            lfn['scope'] = file['did_scope']
            lfn['name'] = file['did_name']
            for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                if checksum_name in file:
                    lfn[checksum_name] = file[checksum_name]
            lfn['filesize'] = file['bytes']

            # URL signing is only relevant for https endpoints
            sign_service = None
            if cur_scheme == 'https':
                sign_service = rse_sign_service

            trace['protocol'] = cur_scheme
            trace['transferStart'] = time.time()
            try:
                helper.global_dictionary['logger'].Info('17')
                state = rsemgr.upload(rse_settings=rse_settings,
                                      lfns=lfn,
                                      source_dir=file['dirname'],
                                      force_scheme=cur_scheme,
                                      force_pfn=pfn,
                                      transfer_timeout=file.get('transfer_timeout'),
                                      delete_existing=delete_existing,
                                      sign_service=sign_service)
                helper.global_dictionary['logger'].Info('18')
                success = state['success']
                file['upload_result'] = state
            except (ServiceUnavailable, ResourceTemporaryUnavailable) as error:
                # Transient failure: remember the reason and try the next protocol
                logger.warning('Upload attempt failed')
                logger.debug('Exception: %s' % str(error))
                state_reason = str(error)

        helper.global_dictionary['logger'].Info('19')
        if success:
            num_succeeded += 1
            trace['transferEnd'] = time.time()
            trace['clientState'] = 'DONE'
            file['state'] = 'A'
            logger.info('Successfully uploaded file %s' % basename)
            print('Successfully uploaded file %s' % basename)
            self._send_trace(trace)
            helper.global_dictionary['logger'].Info('Before if deepcopy')
            if summary_file_path:
                helper.global_dictionary['logger'].Info('Before deepcopy')
                # deepcopy: 'file' keeps being mutated in later iterations
                summary.append(copy.deepcopy(file))
            helper.global_dictionary['logger'].Info('Before if register')
            if not no_register:
                helper.global_dictionary['logger'].Info('Before if2 register')
                if register_after_upload:
                    helper.global_dictionary['logger'].Info('Before register')
                    self._register_file(file, registered_dataset_dids, helper)
                replica_for_api = self._convert_file_for_api(file)
                helper.global_dictionary['logger'].Info('Before if register2')
                if not self.client.update_replicas_states(rse, files=[replica_for_api]):
                    helper.global_dictionary['logger'].Info('Before if register3')
                    logger.warning('Failed to update replica state')
            # add file to dataset if needed
            helper.global_dictionary['logger'].Info('Before if attach')
            if dataset_did_str and not no_register:
                try:
                    helper.global_dictionary['logger'].Info('Before attach')
                    self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                    helper.global_dictionary['logger'].Info('After attach')
                except Exception as error:
                    # Attach failure is non-fatal: the file itself is uploaded
                    helper.global_dictionary['logger'].Info('Failed to attach file to the dataset')
                    logger.warning('Failed to attach file to the dataset')
                    logger.debug(error)
            helper.global_dictionary['logger'].Info('Really finished')
        else:
            trace['clientState'] = 'FAILED'
            trace['stateReason'] = state_reason
            self._send_trace(trace)
            logger.error('Failed to upload file %s' % basename)

    if summary_file_path:
        final_summary = {}
        for file in summary:
            file_scope = file['did_scope']
            file_name = file['did_name']
            file_did_str = '%s:%s' % (file_scope, file_name)
            final_summary[file_did_str] = {'scope': file_scope,
                                           'name': file_name,
                                           'bytes': file['bytes'],
                                           'rse': file['rse'],
                                           'pfn': file['upload_result'].get('pfn', ''),
                                           'guid': file['meta']['guid']}
            for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                if checksum_name in file:
                    final_summary[file_did_str][checksum_name] = file[checksum_name]
        # BUGFIX: json.dump writes str, so the summary file must be opened in
        # text mode ('wb' raises TypeError on Python 3).
        with open(summary_file_path, 'w') as summary_file:
            json.dump(final_summary, summary_file, sort_keys=True, indent=1)

    if num_succeeded == 0:
        raise NoFilesUploaded()
    elif num_succeeded != len(files):
        raise NotAllFilesUploaded()
    return 0