def _split_did_str(self, did_str):
    """
    Splits a given DID string (e.g. 'scope1:name.file') into its scope and name part
    (This function is meant to be used as class internal only)

    :param did_str: the DID string that will be split

    :returns: the scope and name part of the given DID

    :raises InputValidationError: if the given DID string is not valid
    """
    did = did_str.split(':')
    if len(did) == 2:
        did_scope = did[0]
        did_name = did[1]
    elif len(did) == 1:
        did = did_str.split('.')
        did_scope = did[0]
        if did_scope == 'user' or did_scope == 'group':
            did_scope = '%s.%s' % (did[0], did[1])
        did_name = did_str
    else:
        raise InputValidationError('%s is not a valid DID. Too many colons.' % did_str)

    if did_name.endswith('/'):
        did_name = did_name[:-1]

    return did_scope, did_name
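# Illustrative sketch (not part of the upstream client): how _split_did_str resolves
# the two accepted DID forms. The DIDs below are made-up placeholders.
#
#   'user.jdoe:my.file.root'   -> ('user.jdoe', 'my.file.root')            # explicit scope
#   'user.jdoe.my.file.root'   -> ('user.jdoe', 'user.jdoe.my.file.root')  # scope derived from the 'user.<account>' prefix
#   'mc16.evt.root'            -> ('mc16', 'mc16.evt.root')                # first dot-separated token becomes the scope
#   'scope:a:b'                -> raises InputValidationError ('Too many colons.')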
def parse_replicas_metalink(root):
    """
    Transforms the metalink tree into a list of dictionaries
    where each dictionary describes a file with its replicas.
    Will be called by parse_replicas_from_file and parse_replicas_from_string.

    :param root: root node of the metalink tree

    :returns: a list with a dictionary for each file
    """
    files = []

    # metalink namespace
    ns = '{urn:ietf:params:xml:ns:metalink}'
    str_to_bool = {'true': True, 'True': True, 'false': False, 'False': False}

    # loop over all <file> tags of the metalink string
    for file_tag_obj in root.findall(ns + 'file'):
        # search for identity-tag
        identity_tag_obj = file_tag_obj.find(ns + 'identity')
        if not ElementTree.iselement(identity_tag_obj):
            raise InputValidationError('Failed to locate identity-tag inside %s'
                                       % ElementTree.tostring(file_tag_obj))

        cur_file = {'did': identity_tag_obj.text,
                    'adler32': None,
                    'md5': None,
                    'sources': []}

        parent_dids = set()
        parent_dids_tag_obj = file_tag_obj.find(ns + 'parents')
        if ElementTree.iselement(parent_dids_tag_obj):
            for did_tag_obj in parent_dids_tag_obj.findall(ns + 'did'):
                parent_dids.add(did_tag_obj.text)
        cur_file['parent_dids'] = parent_dids

        size_tag_obj = file_tag_obj.find(ns + 'size')
        cur_file['bytes'] = int(size_tag_obj.text) if ElementTree.iselement(size_tag_obj) else None

        for hash_tag_obj in file_tag_obj.findall(ns + 'hash'):
            hash_type = hash_tag_obj.get('type')
            if hash_type:
                cur_file[hash_type] = hash_tag_obj.text

        for url_tag_obj in file_tag_obj.findall(ns + 'url'):
            key_rename_map = {'location': 'rse'}
            src = {}
            for k, v in url_tag_obj.items():
                k = key_rename_map.get(k, k)
                src[k] = str_to_bool.get(v, v)
            src['pfn'] = url_tag_obj.text
            cur_file['sources'].append(src)

        files.append(cur_file)

    return files
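# Illustrative sketch (made-up input, not from the upstream test suite): a minimal
# metalink document and the dictionary parse_replicas_metalink would derive from it.
#
#   root = ElementTree.fromstring(
#       '<metalink xmlns="urn:ietf:params:xml:ns:metalink">'
#       ' <file name="file.root">'
#       '  <identity>user.jdoe:file.root</identity>'
#       '  <size>1048576</size>'
#       '  <hash type="adler32">0a1b2c3d</hash>'
#       '  <url location="MOCK-RSE" priority="1">root://mock.example.org//path/file.root</url>'
#       ' </file>'
#       '</metalink>')
#   parse_replicas_metalink(root)
#   # -> [{'did': 'user.jdoe:file.root', 'adler32': '0a1b2c3d', 'md5': None,
#   #      'bytes': 1048576, 'parent_dids': set(),
#   #      'sources': [{'rse': 'MOCK-RSE', 'priority': '1',
#   #                   'pfn': 'root://mock.example.org//path/file.root'}]}]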
def _collect_and_validate_file_info(self, items): """ Checks if there are any inconsistencies within the given input options and stores the output of _collect_file_info for every file (This function is meant to be used as class internal only) :param filepath: list of dictionaries with all input files and options :returns: a list of dictionaries containing all descriptions of the files to upload :raises InputValidationError: if an input option has a wrong format """ logger = self.logger files = [] for item in items: path = item.get('path') pfn = item.get('pfn') recursive = item.get('recursive') if not path: logger( logging.WARNING, 'Skipping source entry because the key "path" is missing') continue if not item.get('rse'): logger(logging.WARNING, 'Skipping file %s because no rse was given' % path) continue if pfn: item['force_scheme'] = pfn.split(':')[0] if os.path.isdir(path) and not recursive: dname, subdirs, fnames = next(os.walk(path)) for fname in fnames: file = self._collect_file_info(os.path.join(dname, fname), item) files.append(file) if not len(fnames) and not len(subdirs): logger(logging.WARNING, 'Skipping %s because it is empty.' % dname) elif not len(fnames): logger( logging.WARNING, 'Skipping %s because it has no files in it. Subdirectories are not supported.' % dname) elif os.path.isdir(path) and recursive: files.extend(self._recursive(item)) elif os.path.isfile(path) and not recursive: file = self._collect_file_info(path, item) files.append(file) elif os.path.isfile(path) and recursive: logger(logging.WARNING, 'Skipping %s because of --recursive flag' % path) else: logger(logging.WARNING, 'No such file or directory: %s' % path) if not len(files): raise InputValidationError('No valid input files given') return files
def collect_and_validate_file_info(self, sources_with_settings):
    logger = self.logger
    files = []
    for settings in sources_with_settings:
        path = settings.get('path')
        pfn = settings.get('pfn')
        if not path:
            logger.warning('Skipping source entry because the key "path" is missing')
            continue
        if not settings.get('rse'):
            logger.warning('Skipping file %s because no rse was given' % path)
            continue
        if pfn:
            if settings.get('no_register'):
                logger.warning('Upload with given pfn implies that no_register is True')
            settings['no_register'] = True
            scheme = settings.get('scheme')
            pfn_scheme = pfn.split(':')[0]
            if scheme and scheme != pfn_scheme:
                logger.warning('PFN scheme (%s) overrides given scheme (%s)' % (pfn_scheme, scheme))
                scheme = pfn_scheme
            settings['scheme'] = pfn_scheme
        if os.path.isdir(path):
            # next(os.walk(path)) works on Python 2 and 3; the original
            # os.walk(path).next() call is Python 2 only
            dname, subdirs, fnames = next(os.walk(path))
            for fname in fnames:
                file = self.collect_file_info(os.path.join(dname, fname), settings)
                files.append(file)
            if not len(fnames) and not len(subdirs):
                logger.warning('Skipping %s because it is empty.' % dname)
            elif not len(fnames):
                logger.warning('Skipping %s because it has no files in it. Subdirectories are not supported.' % dname)
        elif os.path.isfile(path):
            file = self.collect_file_info(path, settings)
            files.append(file)
        else:
            logger.warning('No such file or directory: %s' % path)

    if not len(files):
        raise InputValidationError('No valid input files given')
    return files
def _collect_and_validate_file_info(self, sources_with_settings): """ Checks if there are any inconsistencies within the given input options and stores the output of _collect_file_info for every file (This function is meant to be used as class internal only) :param filepath: list of dictionaries with all input files and options :returns: a list of dictionaries containing all descriptions of the files to upload :raises InputValidationError: if an input option has a wrong format """ logger = self.logger files = [] for settings in sources_with_settings: path = settings.get('path') pfn = settings.get('pfn') if not path: logger.warning('Skipping source entry because the key "path" is missing') continue if not settings.get('rse'): logger.warning('Skipping file %s because no rse was given' % path) continue if pfn: if settings.get('no_register'): logger.warning('Upload with given pfn implies that no_register is True') settings['no_register'] = True settings['force_scheme'] = pfn.split(':')[0] if os.path.isdir(path): dname, subdirs, fnames = next(os.walk(path)) for fname in fnames: file = self._collect_file_info(os.path.join(dname, fname), settings) files.append(file) if not len(fnames) and not len(subdirs): logger.warning('Skipping %s because it is empty.' % dname) elif not len(fnames): logger.warning('Skipping %s because it has no files in it. Subdirectories are not supported.' % dname) elif os.path.isfile(path): file = self._collect_file_info(path, settings) files.append(file) else: logger.warning('No such file or directory: %s' % path) if not len(files): raise InputValidationError('No valid input files given') return files
def _recursive(self, item):
    """
    If the --recursive flag is set, it replicates the folder structure recursively into collections
    A folder can only have either other folders or files inside it, but not both of them
    - If it has folders, the root folder will be a container
    - If it has files, the root folder will be a dataset
    - If it is empty, it does not create anything

    :param item: dictionary containing all descriptions of the files to upload
    """
    files = []
    datasets = []
    containers = []
    attach = []
    scope = item.get('did_scope') if item.get('did_scope') is not None else self.default_file_scope
    rse = item.get('rse')
    path = item.get('path')
    if path[-1] == '/':
        path = path[0:-1]
    i = 0
    path = os.path.abspath(path)
    for root, dirs, fnames in os.walk(path):
        if len(dirs) > 0 and len(fnames) > 0 and i == 0:
            self.logger(logging.ERROR, 'A container can only have either collections or files, not both')
            raise InputValidationError('Invalid input folder structure')
        if len(fnames) > 0:
            datasets.append({'scope': scope, 'name': root.split('/')[-1], 'rse': rse})
            self.logger(logging.DEBUG, 'Appended dataset with DID %s:%s' % (scope, path))
            for fname in fnames:
                file = self._collect_file_info(os.path.join(root, fname), item)
                file['dataset_scope'] = scope
                file['dataset_name'] = root.split('/')[-1]
                files.append(file)
                self.logger(logging.DEBUG, 'Appended file with DID %s:%s' % (scope, fname))
        elif len(dirs) > 0:
            containers.append({'scope': scope, 'name': root.split('/')[-1]})
            self.logger(logging.DEBUG, 'Appended container with DID %s:%s' % (scope, path))
            attach.extend([{'scope': scope, 'name': root.split('/')[-1], 'rse': rse,
                            'dids': {'scope': scope, 'name': dir_}} for dir_ in dirs])
        elif len(dirs) == 0 and len(fnames) == 0:
            self.logger(logging.WARNING, 'The folder %s is empty, skipping' % root)
            continue
        i += 1

    # if everything went ok, replicate the folder structure in Rucio storage
    for dataset in datasets:
        try:
            self.client.add_dataset(scope=dataset['scope'], name=dataset['name'], rse=dataset['rse'])
            self.logger(logging.INFO, 'Created dataset with DID %s:%s' % (dataset['scope'], dataset['name']))
        except RucioException as error:
            self.logger(logging.ERROR, error)
            self.logger(logging.ERROR, 'It was not possible to create dataset with DID %s:%s' % (dataset['scope'], dataset['name']))
    for container in containers:
        try:
            self.client.add_container(scope=container['scope'], name=container['name'])
            self.logger(logging.INFO, 'Created container with DID %s:%s' % (container['scope'], container['name']))
        except RucioException as error:
            self.logger(logging.ERROR, error)
            self.logger(logging.ERROR, 'It was not possible to create container with DID %s:%s' % (container['scope'], container['name']))
    for att in attach:
        try:
            self.client.attach_dids(scope=att['scope'], name=att['name'], dids=[att['dids']])
            self.logger(logging.INFO, 'DIDs attached to collection %s:%s' % (att['scope'], att['name']))
        except RucioException as error:
            self.logger(logging.ERROR, error)
            self.logger(logging.ERROR, 'It was not possible to attach to collection with DID %s:%s' % (att['scope'], att['name']))
    return files
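# Illustrative sketch (made-up layout): how _recursive maps a local folder tree onto
# Rucio collections, following the rule documented above (folders containing files
# become datasets, folders containing sub-folders become containers).
#
#   data/                 -> container  <scope>:data
#   data/run_a/           -> dataset    <scope>:run_a   (attached to 'data')
#   data/run_a/f1.root    -> file       <scope>:f1.root (dataset_name 'run_a')
#   data/run_b/           -> dataset    <scope>:run_b   (attached to 'data')
#   data/run_b/f2.root    -> file       <scope>:f2.root (dataset_name 'run_b')
#   data/empty_dir/       -> no collection is created; a warning is logged
#
# A root folder that mixes files and sub-folders raises InputValidationError.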
def upload(self, items, summary_file_path=None, traces_copy_out=None): """ :param items: List of dictionaries. Each dictionary describing a file to upload. Keys: path - path of the file that will be uploaded rse - rse expression/name (e.g. 'CERN-PROD_DATADISK') where to upload the file did_scope - Optional: custom did scope (Default: user.<account>) did_name - Optional: custom did name (Default: name of the file) dataset_scope - Optional: custom dataset scope dataset_name - Optional: custom dataset name force_scheme - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None) pfn - Optional: use a given PFN (this sets no_register to True, and no_register becomes mandatory) no_register - Optional: if True, the file will not be registered in the rucio catalogue register_after_upload - Optional: if True, the file will be registered after successful upload lifetime - Optional: the lifetime of the file after it was uploaded transfer_timeout - Optional: time after the upload will be aborted guid - Optional: guid of the file recursive - Optional: if set, parses the folder structure recursively into collections :param summary_file_path: Optional: a path where a summary in form of a json file will be stored :param traces_copy_out: reference to an external list, where the traces should be uploaded :returns: 0 on success :raises InputValidationError: if any input arguments are in a wrong format :raises RSEWriteBlocked: if a given RSE is not available for writing :raises NoFilesUploaded: if no files were successfully uploaded :raises NotAllFilesUploaded: if not all files were successfully uploaded """ # helper to get rse from rse_expression: def _pick_random_rse(rse_expression): rses = [r['rse'] for r in self.client.list_rses(rse_expression) ] # can raise InvalidRSEExpression random.shuffle(rses) return rses[0] logger = self.logger self.trace['uuid'] = generate_uuid() # check given sources, resolve dirs into files, and collect meta infos files = self._collect_and_validate_file_info(items) logger( logging.DEBUG, 'Num. of files that upload client is processing: {}'.format( len(files))) # check if RSE of every file is available for writing # and cache rse settings registered_dataset_dids = set() registered_file_dids = set() rse_expression = None for file in files: rse_expression = file['rse'] rse = self.rse_expressions.setdefault( rse_expression, _pick_random_rse(rse_expression)) if not self.rses.get(rse): rse_settings = self.rses.setdefault( rse, rsemgr.get_rse_info(rse, vo=self.client.vo)) if rse_settings['availability_write'] != 1: raise RSEWriteBlocked( '%s is not available for writing. 
No actions have been taken' % rse) dataset_scope = file.get('dataset_scope') dataset_name = file.get('dataset_name') file['rse'] = rse if dataset_scope and dataset_name: dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name)) file['dataset_did_str'] = dataset_did_str registered_dataset_dids.add(dataset_did_str) registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name'])) wrong_dids = registered_file_dids.intersection(registered_dataset_dids) if len(wrong_dids): raise InputValidationError( 'DIDs used to address both files and datasets: %s' % str(wrong_dids)) logger(logging.DEBUG, 'Input validation done.') # clear this set again to ensure that we only try to register datasets once registered_dataset_dids = set() num_succeeded = 0 summary = [] for file in files: basename = file['basename'] logger(logging.INFO, 'Preparing upload for file %s' % basename) no_register = file.get('no_register') register_after_upload = file.get( 'register_after_upload') and not no_register pfn = file.get('pfn') force_scheme = file.get('force_scheme') delete_existing = False trace = copy.deepcopy(self.trace) # appending trace to list reference, if the reference exists if traces_copy_out is not None: traces_copy_out.append(trace) rse = file['rse'] trace['scope'] = file['did_scope'] trace['datasetScope'] = file.get('dataset_scope', '') trace['dataset'] = file.get('dataset_name', '') trace['remoteSite'] = rse trace['filesize'] = file['bytes'] file_did = {'scope': file['did_scope'], 'name': file['did_name']} dataset_did_str = file.get('dataset_did_str') rse_settings = self.rses[rse] rse_sign_service = rse_settings.get('sign_url', None) is_deterministic = rse_settings.get('deterministic', True) if not is_deterministic and not pfn: logger(logging.ERROR, 'PFN has to be defined for NON-DETERMINISTIC RSE.') continue if pfn and is_deterministic: logger( logging.WARNING, 'Upload with given pfn implies that no_register is True, except non-deterministic RSEs' ) no_register = True # resolving local area networks domain = 'wan' rse_attributes = {} try: rse_attributes = self.client.list_rse_attributes(rse) except: logger(logging.WARNING, 'Attributes of the RSE: %s not available.' % rse) if (self.client_location and 'lan' in rse_settings['domain'] and 'site' in rse_attributes): if self.client_location['site'] == rse_attributes['site']: domain = 'lan' logger(logging.DEBUG, '{} domain is used for the upload'.format(domain)) if not no_register and not register_after_upload: self._register_file(file, registered_dataset_dids) # if register_after_upload, file should be overwritten if it is not registered # otherwise if file already exists on RSE we're done if register_after_upload: if rsemgr.exists(rse_settings, pfn if pfn else file_did, domain=domain, auth_token=self.auth_token, logger=logger): try: self.client.get_did(file['did_scope'], file['did_name']) logger(logging.INFO, 'File already registered. Skipping upload.') trace['stateReason'] = 'File already exists' continue except DataIdentifierNotFound: logger( logging.INFO, 'File already exists on RSE. Previous left overs will be overwritten.' ) delete_existing = True elif not is_deterministic and not no_register: if rsemgr.exists(rse_settings, pfn, domain=domain, auth_token=self.auth_token, logger=logger): logger( logging.INFO, 'File already exists on RSE with given pfn. Skipping upload. Existing replica has to be removed first.' 
) trace['stateReason'] = 'File already exists' continue elif rsemgr.exists(rse_settings, file_did, domain=domain, auth_token=self.auth_token, logger=logger): logger( logging.INFO, 'File already exists on RSE with different pfn. Skipping upload.' ) trace['stateReason'] = 'File already exists' continue else: if rsemgr.exists(rse_settings, pfn if pfn else file_did, domain=domain, auth_token=self.auth_token, logger=logger): logger(logging.INFO, 'File already exists on RSE. Skipping upload') trace['stateReason'] = 'File already exists' continue # protocol handling and upload protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme, domain=domain) protocols.reverse() success = False state_reason = '' logger(logging.DEBUG, str(protocols)) while not success and len(protocols): protocol = protocols.pop() cur_scheme = protocol['scheme'] logger(logging.INFO, 'Trying upload with %s to %s' % (cur_scheme, rse)) lfn = {} lfn['filename'] = basename lfn['scope'] = file['did_scope'] lfn['name'] = file['did_name'] for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS: if checksum_name in file: lfn[checksum_name] = file[checksum_name] lfn['filesize'] = file['bytes'] sign_service = None if cur_scheme == 'https': sign_service = rse_sign_service trace['protocol'] = cur_scheme trace['transferStart'] = time.time() logger(logging.DEBUG, 'Processing upload with the domain: {}'.format(domain)) try: pfn = self._upload_item( rse_settings=rse_settings, rse_attributes=rse_attributes, lfn=lfn, source_dir=file['dirname'], domain=domain, force_scheme=cur_scheme, force_pfn=pfn, transfer_timeout=file.get('transfer_timeout'), delete_existing=delete_existing, sign_service=sign_service) logger(logging.DEBUG, 'Upload done.') success = True file['upload_result'] = { 0: True, 1: None, 'success': True, 'pfn': pfn } # needs to be removed except (ServiceUnavailable, ResourceTemporaryUnavailable, RSEOperationNotSupported, RucioException) as error: logger(logging.WARNING, 'Upload attempt failed') logger(logging.INFO, 'Exception: %s' % str(error), exc_info=True) state_reason = str(error) if success: num_succeeded += 1 trace['transferEnd'] = time.time() trace['clientState'] = 'DONE' file['state'] = 'A' logger(logging.INFO, 'Successfully uploaded file %s' % basename) self._send_trace(trace) if summary_file_path: summary.append(copy.deepcopy(file)) if not no_register: if register_after_upload: self._register_file(file, registered_dataset_dids) replica_for_api = self._convert_file_for_api(file) if not self.client.update_replicas_states( rse, files=[replica_for_api]): logger(logging.WARNING, 'Failed to update replica state') # add file to dataset if needed if dataset_did_str and not no_register: try: self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did]) except Exception as error: logger(logging.WARNING, 'Failed to attach file to the dataset') logger(logging.DEBUG, 'Attaching to dataset {}'.format(str(error))) else: trace['clientState'] = 'FAILED' trace['stateReason'] = state_reason self._send_trace(trace) logger(logging.ERROR, 'Failed to upload file %s' % basename) if summary_file_path: logger(logging.DEBUG, 'Summary will be available at {}'.format(summary_file_path)) final_summary = {} for file in summary: file_scope = file['did_scope'] file_name = file['did_name'] file_did_str = '%s:%s' % (file_scope, file_name) final_summary[file_did_str] = { 'scope': file_scope, 'name': file_name, 'bytes': file['bytes'], 'rse': file['rse'], 'pfn': file['upload_result'].get('pfn', 
''), 'guid': file['meta']['guid'] } for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS: if checksum_name in file: final_summary[file_did_str][checksum_name] = file[ checksum_name] with open(summary_file_path, 'w') as summary_file: json.dump(final_summary, summary_file, sort_keys=True, indent=1) if num_succeeded == 0: raise NoFilesUploaded() elif num_succeeded != len(files): raise NotAllFilesUploaded() return 0
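# Illustrative usage sketch (not part of the client itself). The RSE, scope and paths
# are placeholders; `upload_client` is assumed to be an already-configured UploadClient
# instance. The keys correspond to those documented in the upload() docstring above.
#
#   items = [{'path': '/data/file1.root',
#             'rse': 'MOCK-RSE_DATADISK',
#             'did_scope': 'user.jdoe',
#             'dataset_scope': 'user.jdoe',
#             'dataset_name': 'user.jdoe.test.dataset',
#             'register_after_upload': True,
#             'lifetime': 86400}]
#   upload_client.upload(items, summary_file_path='upload_summary.json')
#   # returns 0 on success; raises NoFilesUploaded / NotAllFilesUploaded otherwise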
def _prepare_items_for_download(self, items):
    """
    Resolves wildcarded DIDs, get DID details (e.g. type), and collects
    the available replicas for each DID
    (This function is meant to be used as class internal only)

    :param items: list of dictionaries containing the items to prepare

    :returns: list of dictionaries, one dict for each file to download

    :raises InputValidationError: if the given input is not valid or incomplete
    """
    logger = self.logger
    logger.info('Processing %d item(s) for input' % len(items))
    resolved_items = []
    # resolve input: extend rse expression, resolve wildcards, get did type
    for item in items:
        did_str = item.get('did')
        if not did_str:
            raise InputValidationError('The key did is mandatory')

        logger.debug('Processing item %s' % did_str)
        new_item = copy.deepcopy(item)

        # extend RSE expression to exclude tape RSEs for non-admin accounts
        if not self.is_admin:
            rse = new_item.get('rse')
            new_item['rse'] = 'istape=False' if not rse else '(%s)&istape=False' % rse
            logger.debug('RSE-Expression: %s' % new_item['rse'])

        # resolve any wildcards in the input dids
        did_scope, did_name = self._split_did_str(did_str)
        logger.debug('Split DID: %s:%s' % (did_scope, did_name))
        new_item['scope'] = did_scope
        if '*' in did_name:
            logger.debug('Resolving wildcarded DID %s' % did_str)
            for dids in self.client.list_dids(did_scope, filters={'name': did_name}, type='all', long=True):
                logger.debug('%s - %s:%s' % (dids['did_type'], did_scope, dids['name']))
                new_item['type'] = dids['did_type'].upper()
                new_item['name'] = dids['name']
                new_item['did'] = '%s:%s' % (did_scope, dids['name'])
                resolved_items.append(new_item)
        else:
            new_item['type'] = self.client.get_did(did_scope, did_name)['type'].upper()
            new_item['name'] = did_name
            resolved_items.append(new_item)

    # this list will have one dict for each file to download
    file_items = []

    # get replicas for every file of the given dids
    logger.debug('%d DIDs after processing input' % len(resolved_items))
    for item in resolved_items:
        did_scope = item['scope']
        did_name = item['name']
        did_str = item['did']
        logger.debug('Processing: %s' % item)

        # since we are using metalink we need to explicitly
        # give all schemes (probably due to a bad server site implementation)
        force_scheme = item.get('force_scheme')
        if force_scheme:
            schemes = force_scheme if isinstance(force_scheme, list) else [force_scheme]
        else:
            schemes = ['davs', 'gsiftp', 'https', 'root', 'srm', 'file']

        # get PFNs of files and datasets
        metalink_str = self.client.list_replicas([{'scope': did_scope, 'name': did_name}],
                                                 schemes=schemes,
                                                 rse_expression=item.get('rse'),
                                                 client_location=self.client_location,
                                                 metalink=True)
        files_with_pfns = self._parse_list_replica_metalink(metalink_str)

        nrandom = item.get('nrandom')
        if nrandom:
            logger.info('Selecting %d random replicas from dataset %s' % (nrandom, did_str))
            random.shuffle(files_with_pfns)
            files_with_pfns = files_with_pfns[0:nrandom]

        for file_item in files_with_pfns:
            file_did_scope = file_item['scope']
            file_did_name = file_item['name']
            file_did_str = '%s:%s' % (file_did_scope, file_did_name)
            logger.debug('Queueing file: %s' % file_did_str)

            # put the input options from item into the file item
            file_item.update(item)

            dest_dir_name = file_did_scope
            if item['type'] != 'FILE':
                # if the did is a dataset, scope and name were updated wrongly
                file_item['scope'] = file_did_scope
                file_item['name'] = file_did_name
                file_item['did'] = file_did_str
                file_item['dataset_scope'] = did_scope
                file_item['dataset_name'] = did_name
                dest_dir_name = did_name

            dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'),
                                                   dest_dir_name, file_did_name,
                                                   item.get('no_subdir'))
            file_item['dest_dir_path'] = dest_dir_path

            dest_file_path = os.path.join(dest_dir_path, file_did_name)
            file_item['dest_file_path'] = dest_file_path
            file_item['temp_file_path'] = dest_file_path + '.part'

            file_items.append(file_item)

    return file_items
def download_file_from_archive(self, items, trace_custom_fields={}):
    """
    Downloads a file that is stored inside an archive DID. This function can only download files, not datasets.

    :param items: List of dictionaries. Each dictionary describing a file to download. Keys:
        did       - DID string of the file inside the archive (e.g. 'scope:file.name'). Wildcards are not allowed
        archive   - DID string of the archive from which the file should be extracted
        rse       - Optional: rse name (e.g. 'CERN-PROD_DATADISK'). RSE Expressions are allowed
        base_dir  - Optional: Base directory where the downloaded files will be stored. (Default: '.')
        no_subdir - Optional: If true, files are written directly into base_dir and existing files are overwritten. (Default: False)
    :param trace_custom_fields: Custom key value pairs to send with the traces

    :returns: a list of dictionaries with an entry for each file, containing the input options, the did, and the clientState
              clientState can be one of the following: ALREADY_DONE, DONE, FILE_NOT_FOUND, FAIL_VALIDATE, FAILED

    :raises InputValidationError: if one of the input items is in the wrong format
    :raises NoFilesDownloaded: if no files could be downloaded
    :raises NotAllFilesDownloaded: if not all files could be downloaded
    :raises SourceNotFound: if xrdcp was unable to find the PFN
    :raises ServiceUnavailable: if xrdcp failed
    :raises RucioException: if something unexpected went wrong during the download
    """
    logger = self.logger
    trace = copy.deepcopy(self.trace_tpl)
    trace['uuid'] = generate_uuid()
    log_prefix = 'Extracting files: '

    logger.info('Processing %d item(s) for input' % len(items))
    for item in items:
        archive = item.get('archive')
        file_extract = item.get('did')
        rse_name = item.get('rse')
        if not archive or not file_extract:
            raise InputValidationError('File DID and archive DID are mandatory')
        if '*' in archive:
            logger.debug(archive)
            raise InputValidationError('Cannot use PFN download with wildcard in DID')

        file_extract_scope, file_extract_name = self._split_did_str(file_extract)
        archive_scope, archive_name = self._split_did_str(archive)

        # listing all available replicas of the given archive file
        rse_expression = 'istape=False' if not rse_name else '(%s)&istape=False' % rse_name
        archive_replicas = self.client.list_replicas([{'scope': archive_scope, 'name': archive_name}],
                                                     schemes=['root'],
                                                     rse_expression=rse_expression,
                                                     unavailable=False,
                                                     client_location=self.client_location)

        # preparing trace
        trace['scope'] = archive_scope
        trace['dataset'] = archive_name
        trace['filename'] = file_extract

        # preparing output directories
        dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'),
                                               os.path.join(archive_scope, archive_name + '.extracted'),
                                               file_extract,
                                               item.get('no_subdir'))
        logger.debug('%sPreparing output destination %s' % (log_prefix, dest_dir_path))

        # validation and customisation of list of replicas
        archive_replicas = list(archive_replicas)
        if len(archive_replicas) != 1:
            raise RucioException('No replicas for DID found or dataset was given.')
        # wrapping in list() so that pop() below also works on Python 3,
        # where dict.keys() returns a view
        archive_pfns = list(archive_replicas[0]['pfns'].keys())
        if len(archive_pfns) == 0:
            raise InputValidationError('No PFNs for replicas of archive %s' % archive)

        # checking whether file already exists
        success = False
        dest_file_path = os.path.join(dest_dir_path, file_extract)
        if os.path.isfile(dest_file_path):
            logger.info('%s%s File exists already locally: %s' % (log_prefix, file_extract_name, dest_dir_path))
            trace['clientState'] = 'ALREADY_DONE'
            trace['transferStart'] = time.time()
            trace['transferEnd'] = time.time()
            send_trace(trace, self.client.host, self.client.user_agent)
            success = True

        # DOWNLOAD, iteration over different rses until success
        retry_counter = 0
        while not success and len(archive_pfns):
            retry_counter += 1
            pfn = archive_pfns.pop()
            trace['rse'] = archive_replicas[0]['pfns'][pfn]['rse']
            try:
                start_time = time.time()
                cmd = 'xrdcp -vf %s -z %s file://%s' % (pfn, file_extract_name, dest_dir_path)
                logger.debug('%sExecuting: %s' % (log_prefix, cmd))
                status, out, err = execute(cmd)
                end_time = time.time()
                trace['transferStart'] = start_time
                trace['transferEnd'] = end_time
                if status == 54:
                    trace['clientState'] = 'FAILED'
                    raise SourceNotFound(err)
                elif status != 0:
                    trace['clientState'] = 'FAILED'
                    raise RucioException(err)
                else:
                    success = True
                    item['clientState'] = 'DONE'
                    trace['clientState'] = 'DONE'
            except Exception as e:
                trace['clientState'] = 'FAILED'
                raise ServiceUnavailable(e)
            send_trace(trace, self.client.host, self.client.user_agent)
        if not success:
            raise RucioException('Failed to download file %s after %d retries' % (file_extract_name, retry_counter))

    return self._check_output(items)
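# Illustrative usage sketch (placeholders throughout; `download_client` is assumed to be
# an already-configured DownloadClient instance). The extraction is delegated to an
# external `xrdcp -z` call, so the xrootd client tools must be available locally.
#
#   items = [{'did': 'user.jdoe:hits.txt',
#             'archive': 'user.jdoe:archive.zip',
#             'base_dir': 'downloads'}]
#   download_client.download_file_from_archive(items)
#   # each returned item carries a clientState such as DONE or ALREADY_DONE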
def download_pfns(self, items, num_threads=2, trace_custom_fields={}): """ Download items with a given PFN. This function can only download files, no datasets. :param items: List of dictionaries. Each dictionary describing a file to download. Keys: pfn - PFN string of this file did - DID string of this file (e.g. 'scope:file.name'). Wildcards are not allowed rse - rse name (e.g. 'CERN-PROD_DATADISK'). RSE Expressions are not allowed base_dir - Optional: Base directory where the downloaded files will be stored. (Default: '.') no_subdir - Optional: If true, files are written directly into base_dir and existing files are overwritten. (Default: False) ignore_checksum - Optional: If true, the checksum validation is skipped (for pfn downloads the checksum must be given explicitly). (Default: True) transfer_timeout - Optional: Timeout time for the download protocols. (Default: None) :param num_threads: Suggestion of number of threads to use for the download. It will be lowered if it's too high. :param trace_custom_fields: Custom key value pairs to send with the traces :returns: a list of dictionaries with an entry for each file, containing the input options, the did, and the clientState clientState can be one of the following: ALREADY_DONE, DONE, FILE_NOT_FOUND, FAIL_VALIDATE, FAILED :raises InputValidationError: if one of the input items is in the wrong format :raises NoFilesDownloaded: if no files could be downloaded :raises NotAllFilesDownloaded: if not all files could be downloaded :raises RucioException: if something unexpected went wrong during the download """ logger = self.logger trace_custom_fields['uuid'] = generate_uuid() logger.info('Processing %d item(s) for input' % len(items)) input_items = [] for item in items: did_str = item.get('did') pfn = item.get('pfn') rse = item.get('rse') if not did_str or not pfn or not rse: logger.debug(item) raise InputValidationError('The keys did, pfn, and rse are mandatory') logger.debug('Preparing PFN download of %s (%s) from %s' % (did_str, pfn, rse)) if '*' in did_str: logger.debug(did_str) raise InputValidationError('Cannot use PFN download with wildcard in DID') did_scope, did_name = self._split_did_str(did_str) dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'), did_scope, did_name, item.get('no_subdir')) item['scope'] = did_scope item['name'] = did_name item['sources'] = [{'pfn': pfn, 'rse': rse}] dest_file_path = os.path.join(dest_dir_path, did_name) item['dest_dir_path'] = dest_dir_path item['dest_file_path'] = dest_file_path item['temp_file_path'] = dest_file_path + '.part' item.setdefault('ignore_checksum', True) input_items.append(item) num_files_in = len(input_items) output_items = self._download_multithreaded(input_items, num_threads, trace_custom_fields) num_files_out = len(output_items) if num_files_in != num_files_out: raise RucioException('%d items were in the input queue but only %d are in the output queue' % (num_files_in, num_files_out)) return self._check_output(output_items)
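# Illustrative usage sketch (placeholders; assumes a configured DownloadClient).
# For PFN downloads the checksum is not resolved from the catalogue and must be given
# explicitly, which is why ignore_checksum defaults to True here.
#
#   items = [{'did': 'user.jdoe:file1.root',
#             'pfn': 'root://mock.example.org//path/user.jdoe/file1.root',
#             'rse': 'MOCK-RSE_DATADISK',
#             'base_dir': 'downloads'}]
#   download_client.download_pfns(items, num_threads=2)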
def upload(self, items, summary_file_path=None): """ :param items: List of dictionaries. Each dictionary describing a file to upload. Keys: path - path of the file that will be uploaded rse - rse name (e.g. 'CERN-PROD_DATADISK') where to upload the file did_scope - Optional: custom did scope (Default: user.<account>) did_name - Optional: custom did name (Default: name of the file) dataset_scope - Optional: custom dataset scope dataset_name - Optional: custom dataset name force_scheme - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None) pfn - Optional: use a given PFN (this sets no_register to True) no_register - Optional: if True, the file will not be registered in the rucio catalogue lifetime - Optional: the lifetime of the file after it was uploaded transfer_timeout - Optional: time after the upload will be aborted guid - Optional: guid of the file :param summary_file_path: Optional: a path where a summary in form of a json file will be stored :returns: 0 on success :raises InputValidationError: if any input arguments are in a wrong format :raises RSEBlacklisted: if a given RSE is not available for writing :raises NoFilesUploaded: if no files were successfully uploaded :raises NotAllFilesUploaded: if not all files were successfully uploaded """ logger = self.logger self.trace['uuid'] = generate_uuid() # check given sources, resolve dirs into files, and collect meta infos files = self._collect_and_validate_file_info(items) # check if RSE of every file is available for writing # and cache rse settings registered_dataset_dids = set() registered_file_dids = set() for file in files: rse = file['rse'] if not self.rses.get(rse): rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse)) if rse_settings['availability_write'] != 1: raise RSEBlacklisted( '%s is blacklisted for writing. No actions have been taken' % rse) dataset_scope = file.get('dataset_scope') dataset_name = file.get('dataset_name') if dataset_scope and dataset_name: dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name)) file['dataset_did_str'] = dataset_did_str registered_dataset_dids.add(dataset_did_str) registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name'])) wrong_dids = registered_file_dids.intersection(registered_dataset_dids) if len(wrong_dids): raise InputValidationError( 'DIDs used to address both files and datasets: %s' % str(wrong_dids)) # clear this set again to ensure that we only try to register datasets once registered_dataset_dids = set() num_succeeded = 0 for file in files: basename = file['basename'] logger.info('Preparing upload for file %s' % basename) no_register = file.get('no_register') pfn = file.get('pfn') force_scheme = file.get('force_scheme') self.trace['scope'] = file['did_scope'] self.trace['datasetScope'] = file.get('dataset_scope', '') self.trace['dataset'] = file.get('dataset_name', '') self.trace['remoteSite'] = rse self.trace['filesize'] = file['bytes'] file_did = {'scope': file['did_scope'], 'name': file['did_name']} dataset_did_str = file.get('dataset_did_str') if not no_register: self._register_file(file, registered_dataset_dids) rse = file['rse'] rse_settings = self.rses[rse] # if file already exists on RSE we're done if rsemgr.exists(rse_settings, file_did): logger.info('File already exists on RSE. 
Skipping upload') continue protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme) protocols.reverse() success = False summary = [] while not success and len(protocols): protocol = protocols.pop() cur_scheme = protocol['scheme'] logger.info('Trying upload with %s to %s' % (cur_scheme, rse)) lfn = {} lfn['filename'] = basename lfn['scope'] = file['did_scope'] lfn['name'] = file['did_name'] lfn['adler32'] = file['adler32'] lfn['filesize'] = file['bytes'] self.trace['protocol'] = cur_scheme self.trace['transferStart'] = time.time() try: state = rsemgr.upload( rse_settings=rse_settings, lfns=lfn, source_dir=file['dirname'], force_scheme=cur_scheme, force_pfn=pfn, transfer_timeout=file.get('transfer_timeout')) success = True file['upload_result'] = state except (ServiceUnavailable, ResourceTemporaryUnavailable) as error: logger.warning('Upload attempt failed') logger.debug('Exception: %s' % str(error)) if success: num_succeeded += 1 self.trace['transferEnd'] = time.time() self.trace['clientState'] = 'DONE' file['state'] = 'A' logger.info('Successfully uploaded file %s' % basename) send_trace(self.trace, self.client.host, self.client.user_agent) if summary_file_path: summary.append(copy.deepcopy(file)) # add file to dataset if needed if dataset_did_str and not no_register: try: self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did]) except Exception as error: logger.warning('Failed to attach file to the dataset') logger.debug(error) if not no_register: replica_for_api = self._convert_file_for_api(file) if not self.client.update_replicas_states( rse, files=[replica_for_api]): logger.warning('Failed to update replica state') else: logger.error('Failed to upload file %s' % basename) if summary_file_path: final_summary = {} for file in summary: file_scope = file['did_scope'] file_name = file['did_name'] file_did_str = '%s:%s' % (file_scope, file_name) final_summary[file_did_str] = { 'scope': file['scope'], 'name': file['name'], 'bytes': file['bytes'], 'rse': file['rse'], 'pfn': file['upload_result']['pfn'], 'guid': file['meta']['guid'], 'adler32': file['adler32'], 'md5': file['md5'] } with open(summary_file_path, 'wb') as summary_file: json.dump(final_summary, summary_file, sort_keys=True, indent=1) if num_succeeded == 0: raise NoFilesUploaded() elif num_succeeded != len(files): raise NotAllFilesUploaded() return 0
def download_dids(self, items, num_threads=2, trace_custom_fields={}): """ Download items with given DIDs. This function can also download datasets and wildcarded DIDs. :param items: List of dictionaries. Each dictionary describing an item to download. Keys: did - DID string of this file (e.g. 'scope:file.name'). Wildcards are not allowed rse - Optional: rse name (e.g. 'CERN-PROD_DATADISK') or rse expression from where to download force_scheme - Optional: force a specific scheme to download this item. (Default: None) base_dir - Optional: base directory where the downloaded files will be stored. (Default: '.') no_subdir - Optional: If true, files are written directly into base_dir and existing files are overwritten. (Default: False) nrandom - Optional: if the DID addresses a dataset, nrandom files will be randomly choosen for download from the dataset ignore_checksum - Optional: If true, skips the checksum validation between the downloaded file and the rucio catalouge. (Default: False) transfer_timeout - Optional: Timeout time for the download protocols. (Default: None) :param num_threads: Suggestion of number of threads to use for the download. It will be lowered if it's too high. :param trace_custom_fields: Custom key value pairs to send with the traces :returns: a list of dictionaries with an entry for each file, containing the input options, the did, and the clientState :raises InputValidationError: if one of the input items is in the wrong format :raises NoFilesDownloaded: if no files could be downloaded :raises NotAllFilesDownloaded: if not all files could be downloaded :raises RucioException: if something unexpected went wrong during the download """ logger = self.logger trace_custom_fields['uuid'] = generate_uuid() logger.info('Processing %d item(s) for input' % len(items)) resolved_items = [] for item in items: did_str = item.get('did') if not did_str: raise InputValidationError('The key did is mandatory') logger.debug('Processing item %s' % did_str) new_item = copy.deepcopy(item) # extend RSE expression to exclude tape RSEs for non-admin accounts if not self.is_admin: rse = new_item.get('rse') new_item[ 'rse'] = 'istape=False' if not rse else '(%s)&istape=False' % rse logger.debug('RSE-Expression: %s' % new_item['rse']) # resolve any wildcards in the input dids did_scope, did_name = self._split_did_str(did_str) logger.debug('Splitted DID: %s:%s' % (did_scope, did_name)) new_item['scope'] = did_scope if '*' in did_name: logger.debug('Resolving wildcarded DID %s' % did_str) for dsn in self.client.list_dids(did_scope, filters={'name': did_name}, type='all'): logger.debug('%s:%s' % (did_scope, dsn)) new_item['name'] = dsn new_item['did'] = '%s:%s' % (did_scope, dsn) resolved_items.append(new_item) else: new_item['name'] = did_name resolved_items.append(new_item) input_items = [] # get replicas for every file of the given dids logger.debug('%d DIDs after processing input' % len(resolved_items)) for item in resolved_items: did_scope = item['scope'] did_name = item['name'] did_str = item['did'] logger.debug('Processing: %s' % item) # get type of given did did_type = self.client.get_did(did_scope, did_name)['type'].upper() logger.debug('Type: %s' % did_type) # get replicas (RSEs) with PFNs for each file (especially if its a dataset) files_with_replicas = self.client.list_replicas( [{ 'scope': did_scope, 'name': did_name }], schemes=item.get('force_scheme'), rse_expression=item.get('rse'), client_location=detect_client_location()) nrandom = item.get('nrandom') if nrandom: 
logger.info('Selecting %d random replicas from dataset %s' % (nrandom, did_str)) files_with_replicas = list(files_with_replicas) random.shuffle(files_with_replicas) files_with_replicas = files_with_replicas[0:nrandom] for file_item in files_with_replicas: file_did_scope = file_item['scope'] file_did_name = file_item['name'] file_did_str = '%s:%s' % (file_did_scope, file_did_name) logger.debug('Queueing file: %s' % file_did_str) # put the input options from item into the file item file_item.update(item) dest_dir_name = file_did_scope if did_type == 'DATASET': # if the did is a dataset, scope and name were updated wrongly file_item['scope'] = file_did_scope file_item['name'] = file_did_name file_item['did'] = file_did_str file_item['dataset_scope'] = did_scope file_item['dataset_name'] = did_name dest_dir_name = did_name dest_dir_path = self._prepare_dest_dir( item.get('base_dir', '.'), dest_dir_name, file_did_name, item.get('no_subdir')) file_item['dest_dir_path'] = dest_dir_path input_items.append(file_item) num_files_in = len(input_items) output_items = self._download_multithreaded(input_items, num_threads, trace_custom_fields) num_files_out = len(output_items) if num_files_in != num_files_out: raise RucioException( '%d items were in the input queue but only %d are in the output queue' % (num_files_in, num_files_out)) return self._check_output(output_items)
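# Illustrative usage sketch (placeholders; assumes a configured DownloadClient).
# Dataset DIDs and wildcarded DIDs are resolved to individual files before download;
# nrandom limits the download to a random subset of a dataset.
#
#   items = [{'did': 'user.jdoe:user.jdoe.test.dataset',
#             'rse': 'MOCK-RSE_DATADISK',
#             'nrandom': 5,
#             'base_dir': 'downloads'}]
#   download_client.download_dids(items, num_threads=4)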
def upload(self, sources_with_settings, summary_file_path=None): """ List of dictionaries of file descriptions. None means optional [{'path': 'file1', 'rse': 'rse_name1', 'did_scope': None, 'did_name': None, 'dataset_name': None, 'dataset_scope': None, 'scheme': None, 'pfn': None, 'no_register': None, 'lifetime': None }, {'path': 'file2', 'rse': 'rse_name2', 'did_scope': None, 'did_name': None, 'dataset_name': None, 'dataset_scope': None, 'scheme': None, 'pfn': None, 'no_register': None, 'lifetime': None }] raises InputValidationError raises RSEBlacklisted """ logger = self.logger self.trace['uuid'] = generate_uuid() # check given sources, resolve dirs into files, and collect meta infos files = self.collect_and_validate_file_info(sources_with_settings) # check if RSE of every file is available for writing # and cache rse settings registered_dataset_dids = set() registered_file_dids = set() for file in files: rse = file['rse'] if not self.rses.get(rse): rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse)) if rse_settings['availability_write'] != 1: raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse) dataset_scope = file.get('dataset_scope') dataset_name = file.get('dataset_name') if dataset_scope and dataset_name: dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name)) file['dataset_did_str'] = dataset_did_str registered_dataset_dids.add(dataset_did_str) registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name'])) wrong_dids = registered_file_dids.intersection(registered_dataset_dids) if len(wrong_dids): raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids)) # clear this set again to ensure that we only try to register datasets once registered_dataset_dids = set() for file in files: basename = file['basename'] logger.info('Preparing upload for file %s' % basename) no_register = file.get('no_register') pfn = file.get('pfn') scheme = file.get('scheme') self.trace['scope'] = file['did_scope'] self.trace['datasetScope'] = file.get('dataset_scope', '') self.trace['dataset'] = file.get('dataset_name', '') self.trace['remoteSite'] = rse self.trace['filesize'] = file['bytes'] file_scope = file['did_scope'] file_name = file['did_name'] file_did = {'scope': file_scope, 'name': file_name} file_did_str = '%s:%s' % (file_scope, file_name) dataset_did_str = file.get('dataset_did_str') rse = file['rse'] rse_settings = self.rses[rse] # register a dataset if we need to if dataset_did_str and dataset_did_str not in registered_dataset_dids and not no_register: registered_dataset_dids.add(dataset_did_str) try: self.client.add_dataset(scope=file['dataset_scope'], name=file['dataset_name'], rules=[{'account': self.account, 'copies': 1, 'rse_expression': rse, 'grouping': 'DATASET', 'lifetime': file['lifetime']}]) logger.info('Dataset %s successfully created' % dataset_did_str) except DataIdentifierAlreadyExists: # TODO: Need to check the rules thing!! 
logger.info("Dataset %s already exists" % dataset_did_str) replica_for_api = self.convert_file_for_api(file) try: # if the remote checksum is different this did must not be used meta = self.client.get_metadata(file_scope, file_name) logger.info('Comparing checksums of %s and %s' % (basename, file_did_str)) if meta['adler32'] != file['adler32']: logger.error('Local checksum %s does not match remote checksum %s' % (file['adler32'], meta['adler32'])) raise DataIdentifierAlreadyExists # add file to rse if it is not registered yet replicastate = list(self.client.list_replicas([file_did], all_states=True)) if rse not in replicastate[0]['rses'] and not no_register: logger.info('Adding replica at %s in Rucio catalog' % rse) self.client.add_replicas(rse=file['rse'], files=[replica_for_api]) except DataIdentifierNotFound: if not no_register: logger.info('Adding replica at %s in Rucio catalog' % rse) self.client.add_replicas(rse=file['rse'], files=[replica_for_api]) if not dataset_did_str: # only need to add rules for files if no dataset is given logger.info('Adding replication rule at %s' % rse) self.client.add_replication_rule([file_did], copies=1, rse_expression=rse, lifetime=file['lifetime']) # if file already exists on RSE we're done if not rsemgr.exists(rse_settings, file_did): protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=scheme) protocols.reverse() success = False summary = [] while not success and len(protocols): protocol = protocols.pop() logger.info('Trying upload to %s with protocol %s' % (rse, protocol['scheme'])) lfn = {} lfn['filename'] = file['basename'] lfn['scope'] = file['did_scope'] lfn['name'] = file['did_name'] lfn['adler32'] = file['adler32'] lfn['filesize'] = file['bytes'] self.trace['protocol'] = protocol['scheme'] self.trace['transferStart'] = time.time() try: state = rsemgr.upload(rse_settings=rse_settings, lfns=lfn, source_dir=file['dirname'], force_scheme=protocol['scheme'], force_pfn=pfn) success = True file['upload_result'] = state except (ServiceUnavailable, ResourceTemporaryUnavailable) as error: logger.warning('Upload attempt failed') logger.debug('Exception: %s' % str(error)) if success: self.trace['transferEnd'] = time.time() self.trace['clientState'] = 'DONE' file['state'] = 'A' logger.info('File %s successfully uploaded' % basename) send_trace(self.trace, self.client.host, self.user_agent, logger=logger) if summary_file_path: summary.append(copy.deepcopy(file)) else: logger.error('Failed to upload file %s' % basename) # TODO trace? continue # skip attach_did and update_states for this file else: logger.info('File already exists on RSE. 
Skipped upload') if not no_register: # add file to dataset if needed if dataset_did_str: try: logger.info('Attaching file to dataset %s' % dataset_did_str) self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did]) except Exception as error: logger.warning('Failed to attach file to the dataset') logger.warning(error) logger.info('Setting replica state to available') replica_for_api = self.convert_file_for_api(file) self.client.update_replicas_states(rse, files=[replica_for_api]) if summary_file_path: final_summary = {} for file in summary: file_scope = file['did_scope'] file_name = file['did_name'] file_did_str = '%s:%s' % (file_scope, file_name) final_summary[file_did_str] = {'scope': file['scope'], 'name': file['name'], 'bytes': file['bytes'], 'rse': file['rse'], 'pfn': file['upload_result']['pfn'], 'guid': file['meta']['guid'], 'adler32': file['adler32'], 'md5': file['md5']} with open(summary_file_path, 'wb') as summary_file: json.dump(final_summary, summary_file, sort_keys=True, indent=1)
def upload(self, items, summary_file_path=None, traces_copy_out=None): """ :param items: List of dictionaries. Each dictionary describing a file to upload. Keys: path - path of the file that will be uploaded rse - rse name (e.g. 'CERN-PROD_DATADISK') where to upload the file did_scope - Optional: custom did scope (Default: user.<account>) did_name - Optional: custom did name (Default: name of the file) dataset_scope - Optional: custom dataset scope dataset_name - Optional: custom dataset name force_scheme - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None) pfn - Optional: use a given PFN (this sets no_register to True, and no_register becomes mandatory) no_register - Optional: if True, the file will not be registered in the rucio catalogue register_after_upload - Optional: if True, the file will be registered after successful upload lifetime - Optional: the lifetime of the file after it was uploaded transfer_timeout - Optional: time after the upload will be aborted guid - Optional: guid of the file :param summary_file_path: Optional: a path where a summary in form of a json file will be stored :param traces_copy_out: reference to an external list, where the traces should be uploaded :returns: 0 on success :raises InputValidationError: if any input arguments are in a wrong format :raises RSEBlacklisted: if a given RSE is not available for writing :raises NoFilesUploaded: if no files were successfully uploaded :raises NotAllFilesUploaded: if not all files were successfully uploaded """ logger = self.logger from admix.helper import helper helper.global_dictionary['logger'].Info(' r: start') self.trace['uuid'] = generate_uuid() helper.global_dictionary['logger'].Info(' r: collect and validate') # check given sources, resolve dirs into files, and collect meta infos files = self._collect_and_validate_file_info(items) # check if RSE of every file is available for writing # and cache rse settings registered_dataset_dids = set() registered_file_dids = set() helper.global_dictionary['logger'].Info(' r: check if files are available') for file in files: rse = file['rse'] if not self.rses.get(rse): rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse)) if rse_settings['availability_write'] != 1: raise RSEBlacklisted('%s is blacklisted for writing. 
No actions have been taken' % rse) dataset_scope = file.get('dataset_scope') dataset_name = file.get('dataset_name') if dataset_scope and dataset_name: dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name)) file['dataset_did_str'] = dataset_did_str registered_dataset_dids.add(dataset_did_str) registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name'])) wrong_dids = registered_file_dids.intersection(registered_dataset_dids) if len(wrong_dids): raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids)) # clear this set again to ensure that we only try to register datasets once registered_dataset_dids = set() num_succeeded = 0 summary = [] helper.global_dictionary['logger'].Info(' r: Starting loop') for file in files: helper.global_dictionary['logger'].Info('Start') basename = file['basename'] logger.info('Preparing upload for file %s' % basename) no_register = file.get('no_register') register_after_upload = file.get('register_after_upload') and not no_register pfn = file.get('pfn') force_scheme = file.get('force_scheme') delete_existing = False helper.global_dictionary['logger'].Info('1') trace = copy.deepcopy(self.trace) helper.global_dictionary['logger'].Info('2') # appending trace to list reference, if the reference exists if traces_copy_out is not None: helper.global_dictionary['logger'].Info('3') traces_copy_out.append(trace) helper.global_dictionary['logger'].Info('4') trace['scope'] = file['did_scope'] trace['datasetScope'] = file.get('dataset_scope', '') trace['dataset'] = file.get('dataset_name', '') trace['remoteSite'] = rse trace['filesize'] = file['bytes'] file_did = {'scope': file['did_scope'], 'name': file['did_name']} dataset_did_str = file.get('dataset_did_str') rse = file['rse'] rse_settings = self.rses[rse] rse_sign_service = rse_settings.get('sign_url', None) is_deterministic = rse_settings.get('deterministic', True) if not is_deterministic and not pfn: logger.error('PFN has to be defined for NON-DETERMINISTIC RSE.') continue if pfn and is_deterministic: logger.warning('Upload with given pfn implies that no_register is True, except non-deterministic RSEs') no_register = True helper.global_dictionary['logger'].Info('5') if not no_register and not register_after_upload: helper.global_dictionary['logger'].Info('6') self._register_file(file, registered_dataset_dids,helper) helper.global_dictionary['logger'].Info('7') # if register_after_upload, file should be overwritten if it is not registered # otherwise if file already exists on RSE we're done # if register_after_upload: # helper.global_dictionary['logger'].Info('8') # if rsemgr.exists(rse_settings, pfn if pfn else file_did): # helper.global_dictionary['logger'].Info('9') # try: # helper.global_dictionary['logger'].Info('10') # self.client.get_did(file['did_scope'], file['did_name']) # logger.info('File already registered. Skipping upload.') # trace['stateReason'] = 'File already exists' # continue # except DataIdentifierNotFound: # logger.info('File already exists on RSE. Previous left overs will be overwritten.') # delete_existing = True # elif not is_deterministic and not no_register: # if rsemgr.exists(rse_settings, pfn): # logger.info('File already exists on RSE with given pfn. Skipping upload. Existing replica has to be removed first.') # trace['stateReason'] = 'File already exists' # continue # elif rsemgr.exists(rse_settings, file_did): # logger.info('File already exists on RSE with different pfn. 
Skipping upload.') # trace['stateReason'] = 'File already exists' # continue # else: # if rsemgr.exists(rse_settings, pfn if pfn else file_did): # logger.info('File already exists on RSE. Skipping upload') # trace['stateReason'] = 'File already exists' # continue helper.global_dictionary['logger'].Info('11') # resolving local area networks domain = 'wan' # rse_attributes = {} # try: # rse_attributes = self.client.list_rse_attributes(rse) # helper.global_dictionary['logger'].Info('12') # except: # logger.warning('Attributes of the RSE: %s not available.' % rse) # if (self.client_location and 'lan' in rse_settings['domain'] and 'site' in rse_attributes): # if self.client_location['site'] == rse_attributes['site']: # domain = 'lan' # protocol handling and upload protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme, domain=domain) helper.global_dictionary['logger'].Info('14') protocols.reverse() helper.global_dictionary['logger'].Info('15') success = False state_reason = '' while not success and len(protocols): helper.global_dictionary['logger'].Info('16') protocol = protocols.pop() cur_scheme = protocol['scheme'] logger.info('Trying upload with %s to %s' % (cur_scheme, rse)) lfn = {} lfn['filename'] = basename lfn['scope'] = file['did_scope'] lfn['name'] = file['did_name'] for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS: if checksum_name in file: lfn[checksum_name] = file[checksum_name] lfn['filesize'] = file['bytes'] sign_service = None if cur_scheme == 'https': sign_service = rse_sign_service trace['protocol'] = cur_scheme trace['transferStart'] = time.time() try: helper.global_dictionary['logger'].Info('17') state = rsemgr.upload(rse_settings=rse_settings, lfns=lfn, source_dir=file['dirname'], force_scheme=cur_scheme, force_pfn=pfn, transfer_timeout=file.get('transfer_timeout'), delete_existing=delete_existing, sign_service=sign_service) helper.global_dictionary['logger'].Info('18') success = state['success'] file['upload_result'] = state except (ServiceUnavailable, ResourceTemporaryUnavailable) as error: logger.warning('Upload attempt failed') logger.debug('Exception: %s' % str(error)) state_reason = str(error) helper.global_dictionary['logger'].Info('19') if success: num_succeeded += 1 trace['transferEnd'] = time.time() trace['clientState'] = 'DONE' file['state'] = 'A' logger.info('Successfully uploaded file %s' % basename) print('Successfully uploaded file %s' % basename) self._send_trace(trace) helper.global_dictionary['logger'].Info('Before if deepcopy') if summary_file_path: helper.global_dictionary['logger'].Info('Before deepcopy') summary.append(copy.deepcopy(file)) helper.global_dictionary['logger'].Info('Before if register') if not no_register: helper.global_dictionary['logger'].Info('Before if2 register') if register_after_upload: helper.global_dictionary['logger'].Info('Before register') self._register_file(file, registered_dataset_dids,helper) replica_for_api = self._convert_file_for_api(file) helper.global_dictionary['logger'].Info('Before if register2') if not self.client.update_replicas_states(rse, files=[replica_for_api]): helper.global_dictionary['logger'].Info('Before if register3') logger.warning('Failed to update replica state') # add file to dataset if needed helper.global_dictionary['logger'].Info('Before if attach') if dataset_did_str and not no_register: try: helper.global_dictionary['logger'].Info('Before attach') self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did]) 
helper.global_dictionary['logger'].Info('After attach') except Exception as error: helper.global_dictionary['logger'].Info('Failed to attach file to the dataset') logger.warning('Failed to attach file to the dataset') logger.debug(error) helper.global_dictionary['logger'].Info('Really finished') else: trace['clientState'] = 'FAILED' trace['stateReason'] = state_reason self._send_trace(trace) logger.error('Failed to upload file %s' % basename) if summary_file_path: final_summary = {} for file in summary: file_scope = file['did_scope'] file_name = file['did_name'] file_did_str = '%s:%s' % (file_scope, file_name) final_summary[file_did_str] = {'scope': file_scope, 'name': file_name, 'bytes': file['bytes'], 'rse': file['rse'], 'pfn': file['upload_result'].get('pfn', ''), 'guid': file['meta']['guid']} for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS: if checksum_name in file: final_summary[file_did_str][checksum_name] = file[checksum_name] with open(summary_file_path, 'wb') as summary_file: json.dump(final_summary, summary_file, sort_keys=True, indent=1) if num_succeeded == 0: raise NoFilesUploaded() elif num_succeeded != len(files): raise NotAllFilesUploaded() return 0