def test_put_mgr_ok_multi(self):
    """(RSE/PROTOCOLS): Put multiple files to storage (Success)"""
    # For the 'ssh1' host the files are described by their MD5 and the upload
    # is given self.impl; every other host is described by adler32.
    use_md5 = self.rse_settings['protocols'][0]['hostname'] == 'ssh1'
    scope = 'user.%s' % self.user

    lfns = []
    for name in ('1_rse_local_put.raw', '2_rse_local_put.raw'):
        path = '%s/%s' % (self.tmpdir, name)
        # st_size is the documented accessor; os.path.stat is only an
        # incidental re-export of the stat module.
        lfn = {'name': name, 'scope': scope, 'filesize': os.stat(path).st_size}
        if use_md5:
            lfn['md5'] = md5(path)
        else:
            lfn['adler32'] = adler32(path)
        lfns.append(lfn)

    if use_md5:
        result = mgr.upload(self.rse_settings, lfns, source_dir=self.tmpdir, vo=self.vo, impl=self.impl)
    else:
        result = mgr.upload(self.rse_settings, lfns, source_dir=self.tmpdir, vo=self.vo)

    status = result[0]
    details = result[1]
    # Both the global status and the per-file entries have to be truthy
    if not (status
            and details['user.%s:1_rse_local_put.raw' % self.user]
            and details['user.%s:2_rse_local_put.raw' % self.user]):
        raise Exception('Return not as expected: %s, %s' % (status, details))
def test_utils_md5(self):
    """(COMMON/UTILS): test calculating MD5 of a file"""
    ret = md5(self.temp_file_1.name)
    assert isinstance(ret, str), "Object returned by utils.md5 is not a string"
    # match() only anchors at the start, so anchor the end explicitly to
    # reject strings that are longer than 32 hex characters.
    assert match(r'[a-fA-F0-9]{32}$', ret) is not None, "String returned by utils.md5 is not a md5 hex digest"
    assert ret == '31d50dd6285b9ff9f8611d0762265d04', "Hex digest returned by utils.md5 is not the expected MD5 checksum"

    # A missing file must be reported with the full error message
    expected_error = r"FATAL - could not get MD5 checksum of file no_file - \[Errno 2\] No such file or directory: 'no_file'"
    with pytest.raises(Exception, match=expected_error):
        md5('no_file')
def test_utils_md5(self):
    """(COMMON/UTILS): test calculating MD5 of a file"""
    ret = md5(self.temp_file_1.name)
    assert_is_instance(ret, str, msg="Object returned by utils.md5 is not a string")
    # match() only anchors at the start, so anchor the end explicitly to
    # reject strings that are longer than 32 hex characters.
    assert_is_not_none(match(r'[a-fA-F0-9]{32}$', ret), msg="String returned by utils.md5 is not a md5 hex digest")
    assert_equal(ret, '31d50dd6285b9ff9f8611d0762265d04', msg="Hex digest returned by utils.md5 is not the expected MD5 checksum")

    with assert_raises(Exception) as e:
        md5('no_file')
    # str(exc) works on Python 2 and 3; the .message attribute is gone in Python 3
    assert_equal('FATAL - could not get MD5 checksum of file no_file - [Errno 2] No such file or directory: \'no_file\'', str(e.exception))
def _collect_file_info(self, filepath, item):
    """
    Gathers information (size, checksums, guid, ...) about a local file and
    merges it with the options given for that file.
    (This function is meant to be used as class internal only)

    :param filepath: path where the file is stored
    :param item: input options for the given file (left unmodified, a deep copy is extended)

    :returns: a dictionary containing all collected info and the input options
    """
    info = copy.deepcopy(item)
    info.update([
        ('path', filepath),
        ('dirname', os.path.dirname(filepath)),
        ('basename', os.path.basename(filepath)),
        ('bytes', os.stat(filepath).st_size),
        ('adler32', adler32(filepath)),
        ('md5', md5(filepath)),
    ])
    # the guid lookup receives the dictionary with the file facts already filled in
    info['meta'] = {'guid': self._get_file_guid(info)}
    info['state'] = 'C'

    # missing or empty DID fields fall back to the defaults
    info['did_scope'] = info.get('did_scope') or self.default_file_scope
    info['did_name'] = info.get('did_name') or info['basename']
    return info
def test_put_mgr_ok_single(self):
    """(RSE/PROTOCOLS): Put a single file to storage (Success)"""
    name = '3_rse_local_put.raw'
    path = '%s/%s' % (self.tmpdir, name)
    # st_size is the documented accessor; os.path.stat is only an
    # incidental re-export of the stat module.
    lfn = {'name': name, 'scope': 'user.%s' % self.user, 'filesize': os.stat(path).st_size}

    # For the 'ssh1' host the file is described by its MD5 and the upload is
    # given self.impl; every other host is described by adler32.
    if self.rse_settings['protocols'][0]['hostname'] == 'ssh1':
        lfn['md5'] = md5(path)
        mgr.upload(self.rse_settings, lfn, source_dir=self.tmpdir, vo=self.vo, impl=self.impl)
    else:
        lfn['adler32'] = adler32(path)
        mgr.upload(self.rse_settings, lfn, source_dir=self.tmpdir, vo=self.vo)
def _collect_file_info(self, filepath, settings):
    """
    Gathers information (size, checksums, guid, ...) about a local file and
    merges it with the options given for that file.
    (This function is meant to be used as class internal only)

    :param filepath: path where the file is stored
    :param settings: input options for the given file (left unmodified, a deep copy is extended)

    :returns: a dictionary containing all collected info and the input options
    """
    info = copy.deepcopy(settings)

    basename = os.path.basename(filepath)
    info['path'] = filepath
    info['dirname'] = os.path.dirname(filepath)
    info['basename'] = basename
    info['bytes'] = os.stat(filepath).st_size
    info['adler32'] = adler32(filepath)
    info['md5'] = md5(filepath)
    # the guid lookup receives the dictionary with the file facts already filled in
    info['meta'] = {'guid': self._get_file_guid(info)}
    info['state'] = 'C'

    # only keys that are absent from the input options receive a default
    defaults = (
        ('did_scope', self.default_file_scope),
        ('did_name', basename),
        ('lifetime', None),
    )
    for key, fallback in defaults:
        info.setdefault(key, fallback)
    return info
def test_download_succeeds_md5only(self):
    """CLIENT(USER): Rucio download succeeds MD5 only"""
    # user has a file to upload
    filename = file_generator()
    file_md5 = md5(filename)
    filesize = stat(filename).st_size
    # filename[:5] is used as the source directory and filename[5:] as the file name
    lfn = {'name': filename[5:], 'scope': self.user, 'bytes': filesize, 'md5': file_md5}
    # user uploads file
    self.replica_client.add_replicas(files=[lfn], rse=self.def_rse)
    rse_settings = rsemgr.get_rse_info(self.def_rse)
    protocol = rsemgr.create_protocol(rse_settings, 'write')
    protocol.connect()
    # dict views are not subscriptable on Python 3, so materialise them first
    pfn = list(protocol.lfns2pfns(lfn).values())[0]
    protocol.put(filename[5:], pfn, filename[:5])
    protocol.close()
    remove(filename)
    # download files
    cmd = 'rucio -v download --dir /tmp {0}:{1}'.format(self.user, filename[5:])
    print(self.marker + cmd)
    exitcode, out, err = execute(cmd)
    print(out, err)
    # search for the files with ls
    cmd = 'ls /tmp/{0}'.format(self.user)  # search in /tmp/
    print(self.marker + cmd)
    exitcode, out, err = execute(cmd)
    print(out, err)
    nose.tools.assert_not_equal(re.search(filename[5:], out), None)
    # best-effort cleanup: the directory may not exist at all
    try:
        for i in listdir('data13_hip'):
            unlink('data13_hip/%s' % i)
        rmdir('data13_hip')
    except OSError:
        pass
def test_utils_md5(self):
    """(COMMON/UTILS): test calculating MD5 of a file"""
    ret = md5(self.temp_file_1.name)
    assert_is_instance(ret, str, msg="Object returned by tools.md5 is not a string")
    # match() only anchors at the start; without the trailing '$' any string
    # that merely begins with 32 hex characters would be accepted.
    assert_is_not_none(
        match(r'[a-fA-F0-9]{32}$', ret),
        msg="String returned by tools.md5 is not a md5 hex digest")
def test_utils_md5(self):
    """(COMMON/UTILS): test calculating MD5 of a file"""
    ret = md5(self.temp_file_1.name)
    assert_is_instance(ret, str, msg="Object returned by utils.md5 is not a string")
    # match() only anchors at the start, so anchor the end explicitly to
    # reject strings that are longer than 32 hex characters.
    assert_is_not_none(
        match(r'[a-fA-F0-9]{32}$', ret),
        msg="String returned by utils.md5 is not a md5 hex digest")
    # the message is shown on failure, so it has to describe the failure
    assert_equal(
        ret,
        '31d50dd6285b9ff9f8611d0762265d04',
        msg="Hex digest returned by utils.md5 is not the expected MD5 checksum")
def test_download_fails_badmd5(self):
    """CLIENT(USER): Rucio download fails on MD5 mismatch"""
    # user has a file to upload
    filename = file_generator()
    file_md5 = md5(filename)
    filesize = stat(filename).st_size
    # the replica is registered with a deliberately wrong MD5
    lfn = {
        'name': filename[5:],
        'scope': self.user,
        'bytes': filesize,
        'md5': '0123456789abcdef0123456789abcdef'
    }
    # user uploads file
    self.replica_client.add_replicas(files=[lfn], rse=self.def_rse)
    rse_settings = rsemgr.get_rse_info(self.def_rse)
    protocol = rsemgr.create_protocol(rse_settings, 'write')
    protocol.connect()
    # dict views are not subscriptable on Python 3, so materialise them first
    pfn = list(protocol.lfns2pfns(lfn).values())[0]
    protocol.put(filename[5:], pfn, filename[:5])
    protocol.close()
    remove(filename)

    # download file
    cmd = 'rucio download --dir /tmp {0}:{1}'.format(self.user, filename[5:])
    print(self.marker + cmd)
    exitcode, out, err = execute(cmd)
    print(out, err)

    # a failure message 'Checksum mismatch : local _____ vs recorded _____' appears
    # (raw string: the backslashes belong to the regex, not to Python escapes)
    report = r'Checksum\ mismatch\ \:\ local\ {0}\ vs\ recorded\ 0123456789abcdef0123456789abcdef'.format(file_md5)
    print('searching', report, 'in', err)
    nose.tools.assert_not_equal(re.search(report, err), None)

    # The file should not exist
    cmd = 'ls /tmp/'  # search in /tmp/
    print(self.marker + cmd)
    exitcode, out, err = execute(cmd)
    print(out, err)
    nose.tools.assert_equal(re.search(filename[5:], out), None)

    # best-effort cleanup: the directory may not exist at all
    try:
        for i in listdir('data13_hip'):
            unlink('data13_hip/%s' % i)
        rmdir('data13_hip')
    except OSError:
        pass
def collect_file_info(self, filepath, settings):
    """
    Gathers information (size, checksums, guid, ...) about a local file and
    merges it with the options given for that file.

    :param filepath: path where the file is stored
    :param settings: input options for the given file (left unmodified, a deep copy is extended)

    :returns: a dictionary containing all collected info and the input options
    """
    info = copy.deepcopy(settings)
    info.update([
        ('path', filepath),
        ('dirname', os.path.dirname(filepath)),
        ('basename', os.path.basename(filepath)),
        ('bytes', os.stat(filepath).st_size),
        ('adler32', adler32(filepath)),
        ('md5', md5(filepath)),
    ])
    # the guid lookup receives the dictionary with the file facts already filled in
    info['meta'] = {'guid': self.get_file_guid(info)}
    info['state'] = 'C'

    # only keys that are absent from the input options receive a default
    default_scope = self.default_file_scope
    info.setdefault('did_scope', default_scope)
    info.setdefault('did_name', info['basename'])
    info.setdefault('lifetime', None)
    return info
def test_upload_adds_md5digest(self):
    """CLIENT(USER): Upload Checksums"""
    # user has a file to upload
    filename = file_generator()
    try:
        file_md5 = md5(filename)
        # user uploads file
        cmd = 'rucio upload --rse {0} --scope {1} {2}'.format(self.def_rse, self.user, filename)
        print(self.marker + cmd)
        exitcode, out, err = execute(cmd)
        print(out)
        print(err)
        # When inspecting the metadata of the new file the user finds the md5 checksum
        meta = self.did_client.get_metadata(scope=self.user, name=filename[5:])
        nose.tools.assert_in('md5', meta)
        nose.tools.assert_equal(meta['md5'], file_md5)
    finally:
        # remove the generated file even when the upload or an assertion fails
        remove(filename)
def check_storage(filepath):
    """
    Check size and checksum of a file on storage

    :param filepath: path of the file to inspect

    :returns: tuple (size, adler32 checksum, md5 checksum), or False when the
              size or one of the checksums could not be obtained
    """
    logging.info("Checking %s", filepath)
    try:
        size = os.stat(filepath).st_size
        # Left-pad to the canonical 8 hex digits; this also covers checksums
        # that lost more than one leading zero.
        adler_checksum = adler32(filepath).zfill(8)
        md5_checksum = md5(filepath)
        logging.info("Got size and checksum of file: %s size=%s adler32 checksum=%s md5 checksum=%s",
                     filepath, size, adler_checksum, md5_checksum)
    except Exception:
        # Any failure is still reported as "not found" to keep the return
        # contract, but KeyboardInterrupt/SystemExit are no longer swallowed.
        logging.warning("no file found at %s", filepath)
        return False
    return size, adler_checksum, md5_checksum
def download(rse_settings, files, dest_dir=None, force_scheme=None, ignore_checksum=False, printstatements=False, domain='wan', transfer_timeout=None):
    """
        Copy a file from the connected storage to the local file system.
        Providing a list indicates the bulk mode.

        :param rse_settings:    RSE to use
        :param files:           a single dict or a list with dicts containing 'scope' and 'name'
                                if LFNs are provided and additional 'pfn' if PFNs are provided.
                                Examples:
                                [
                                {'name': '2_rse_remote_get.raw', 'scope': 'user.jdoe'},
                                {'name':'3_rse_remote_get.raw', 'scope': 'user.jdoe', 'pfn': 'user/jdoe/5a/98/3_rse_remote_get.raw'}
                                ]
        :param dest_dir:        path to the directory where the downloaded files will be stored. If not given, each scope is represented by its own directory.
        :param force_scheme:    normally the scheme is dictated by the RSE object, when specifying the PFN it must be forced to the one specified in the PFN, overruling the RSE description.
        :param ignore_checksum: do not verify the checksum - caution: should only be used for rucio download --pfn
        :param printstatements: print progress messages to stdout
        :param domain:          domain handed over to create_protocol when the read protocol is selected
        :param transfer_timeout: set this timeout (in seconds) for the transfers, for protocols that support it

        :returns: if exactly one result was collected: True, otherwise a list [status, details] where
                  status is True only if every file succeeded and details maps 'scope:name' to
                  True or to the exception raised for that file

        :raises SourceNotFound: remote source file can not be found on storage
        :raises DestinationNotAccessible: local destination directory is not accessible
        :raises FileConsistencyMismatch: the checksum of the downloaded file does not match the provided one
        :raises ServiceUnavailable: for any other reason

    """
    ret = {}
    gs = True  # gs represents the global status which indicates if every operation worked in bulk mode

    protocol = create_protocol(rse_settings, 'read', scheme=force_scheme, domain=domain)
    protocol.connect()

    # The connection is released even if resolving a PFN raises
    try:
        files = files if isinstance(files, list) else [files]
        for f in files:
            pfn = f['pfn'] if 'pfn' in f else list(protocol.lfns2pfns(f).values())[0]
            target_dir = "./%s" % f['scope'] if dest_dir is None else dest_dir
            try:
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                # Each scope is stored into a separate folder
                finalfile = '%s/%s' % (target_dir, f['name'])
                # Check if the file already exists, if not download and validate it
                if not os.path.isfile(finalfile):
                    if 'adler32' in f:
                        # Download to a '.part' file first, so that only a
                        # validated file ends up under its final name
                        tempfile = '%s/%s.part' % (target_dir, f['name'])
                        if os.path.isfile(tempfile):
                            if printstatements:
                                print('%s already exists, probably from a failed attempt. Will remove it' % (tempfile))
                            os.unlink(tempfile)
                        protocol.get(pfn, tempfile, transfer_timeout=transfer_timeout)
                        if printstatements:
                            print('File downloaded. Will be validated')

                        if ignore_checksum:
                            if printstatements:
                                print('Skipping checksum validation')
                        else:
                            # An empty adler32 falls back to the md5 entry
                            ruciochecksum = f['adler32'] if f['adler32'] else f['md5']
                            localchecksum = utils.adler32(tempfile) if f['adler32'] else utils.md5(tempfile)
                            if localchecksum == ruciochecksum:
                                if printstatements:
                                    print('File validated')
                            else:
                                os.unlink(tempfile)
                                raise exception.FileConsistencyMismatch('Checksum mismatch : local %s vs recorded %s' % (str(localchecksum), str(ruciochecksum)))
                        os.rename(tempfile, finalfile)
                    else:
                        # Without an 'adler32' key the file is fetched directly and not validated
                        protocol.get(pfn, '%s/%s' % (target_dir, f['name']), transfer_timeout=transfer_timeout)
                    ret['%s:%s' % (f['scope'], f['name'])] = True
                else:
                    ret['%s:%s' % (f['scope'], f['name'])] = True
            except Exception as e:
                gs = False
                ret['%s:%s' % (f['scope'], f['name'])] = e
    finally:
        protocol.close()

    # A single collected result is handed back (or raised) directly
    if len(ret) == 1:
        for x in ret:
            if isinstance(ret[x], Exception):
                raise ret[x]
            else:
                return ret[x]
    return [gs, ret]
def _download_item(self, item, trace, log_prefix=''):
    """
    Downloads the given item and sends traces for success/failure.
    (This function is meant to be used as class internal only)

    The sources of the item are tried one after another; each source gets up
    to two download attempts. After a successful transfer the file is checked
    against the item's adler32 (or, if that is not set, md5) checksum unless
    the item has 'ignore_checksum' set. The file is written to
    item['temp_file_path'] and renamed to item['dest_file_path'] on success.

    :param item: dictionary that describes the item to download; read keys are 'scope', 'name',
                 'dest_file_path', 'temp_file_path', 'sources' and optionally 'dataset_scope',
                 'dataset_name', 'bytes', 'adler32', 'md5', 'ignore_checksum', 'transfer_timeout'
    :param trace: dictionary representing a pattern of trace that will be send (modified in place)
    :param log_prefix: string that will be put at the beginning of every log message

    :returns: dictionary with all attributes from the input item and a clientState attribute
              ('ALREADY_DONE', 'FILE_NOT_FOUND', 'FAILED' or 'DONE')
    """
    logger = self.logger

    did_scope = item['scope']
    did_name = item['name']
    did_str = '%s:%s' % (did_scope, did_name)

    logger.info('%sPreparing download of %s' % (log_prefix, did_str))

    trace['scope'] = did_scope
    trace['filename'] = did_name
    # setdefault: values already present in the trace pattern are kept
    trace.setdefault('datasetScope', item.get('dataset_scope', ''))
    trace.setdefault('dataset', item.get('dataset_name', ''))
    trace.setdefault('filesize', item.get('bytes'))

    # if file already exists, set state, send trace, and return
    dest_file_path = item['dest_file_path']
    if os.path.isfile(dest_file_path):
        logger.info('%sFile exists already locally: %s' % (log_prefix, did_str))
        item['clientState'] = 'ALREADY_DONE'

        trace['transferStart'] = time.time()
        trace['transferEnd'] = time.time()
        trace['clientState'] = 'ALREADY_DONE'
        send_trace(trace, self.client.host, self.client.user_agent)
        return item

    # check if file has replicas
    sources = item.get('sources')
    if not sources or not len(sources):
        logger.warning('%sNo available source found for file: %s' % (log_prefix, did_str))
        item['clientState'] = 'FILE_NOT_FOUND'

        trace['clientState'] = 'FILE_NOT_FOUND'
        send_trace(trace, self.client.host, self.client.user_agent)
        return item

    success = False
    # try different PFNs until one succeeded
    i = 0
    while not success and i < len(sources):
        pfn = sources[i]['pfn']
        rse_name = sources[i]['rse']
        i += 1
        # the scheme is whatever precedes the first ':' of the PFN
        scheme = pfn.split(':')[0]

        try:
            rse = rsemgr.get_rse_info(rse_name)
        except RSENotFound:
            # skip this source and go on with the next one
            logger.warning('%sCould not get info of RSE %s' % (log_prefix, rse_name))
            continue

        trace['remoteSite'] = rse_name
        trace['clientState'] = 'DOWNLOAD_ATTEMPT'
        trace['protocol'] = scheme

        logger.info('%sTrying to download with %s from %s: %s ' % (log_prefix, scheme, rse_name, did_str))

        try:
            protocol = rsemgr.create_protocol(rse, operation='read', scheme=scheme)
            protocol.connect()
        except Exception as error:
            # skip this source and go on with the next one
            logger.warning('%sFailed to create protocol for PFN: %s' % (log_prefix, pfn))
            logger.debug('scheme: %s, exception: %s' % (scheme, error))
            continue

        attempt = 0
        retries = 2
        # do some retries with the same PFN if the download fails
        while not success and attempt < retries:
            attempt += 1
            item['attemptnr'] = attempt

            # remove leftovers of an earlier attempt before downloading again
            temp_file_path = item['temp_file_path']
            if os.path.isfile(temp_file_path):
                logger.debug('%sDeleting existing temporary file: %s' % (log_prefix, temp_file_path))
                os.unlink(temp_file_path)

            start_time = time.time()

            try:
                protocol.get(pfn, temp_file_path, transfer_timeout=item.get('transfer_timeout'))
                success = True
            except Exception as error:
                # the exception class name is reported as the client state of this attempt
                logger.debug(error)
                trace['clientState'] = str(type(error).__name__)

            end_time = time.time()

            if success and not item.get('ignore_checksum', False):
                # adler32 is preferred; md5 is only used when no adler32 is set
                # NOTE(review): if the item carries neither checksum, rucio_checksum is None
                # and the comparison below presumably fails - confirm callers always set one
                rucio_checksum = item.get('adler32')
                local_checksum = None
                if not rucio_checksum:
                    rucio_checksum = item.get('md5')
                    local_checksum = md5(temp_file_path)
                else:
                    local_checksum = adler32(temp_file_path)

                if rucio_checksum != local_checksum:
                    success = False
                    os.unlink(temp_file_path)
                    logger.warning('%sChecksum validation failed for file: %s' % (log_prefix, did_str))
                    logger.debug('Local checksum: %s, Rucio checksum: %s' % (local_checksum, rucio_checksum))
                    # reporting the replica is best effort; a failure here is ignored
                    try:
                        self.client.declare_suspicious_file_replicas([pfn], reason='Corrupted')
                    except Exception:
                        pass
                    trace['clientState'] = 'FAIL_VALIDATE'
            if not success:
                # every failed attempt gets its own trace
                logger.warning('%sDownload attempt failed. Try %s/%s' % (log_prefix, attempt, retries))
                send_trace(trace, self.client.host, self.client.user_agent)

        protocol.close()

    if not success:
        logger.error('%sFailed to download file %s' % (log_prefix, did_str))
        item['clientState'] = 'FAILED'
        return item

    os.rename(temp_file_path, dest_file_path)

    # start_time/end_time stem from the last (successful) attempt
    trace['transferStart'] = start_time
    trace['transferEnd'] = end_time
    trace['clientState'] = 'DONE'
    item['clientState'] = 'DONE'
    send_trace(trace, self.client.host, self.client.user_agent)

    duration = round(end_time - start_time, 2)
    size = item.get('bytes')
    size_str = sizefmt(size, self.is_human_readable)
    # the rate is only logged when both size and duration are non-zero
    if size and duration:
        rate = round((size / duration) * 1e-6, 2)
        logger.info('%sFile %s successfully downloaded. %s in %s seconds = %s MBps' % (log_prefix, did_str, size_str, duration, rate))
    else:
        logger.info('%sFile %s successfully downloaded in %s seconds' % (log_prefix, did_str, duration))
    return item