def printResources(resources, scaled_units=True): """ E.g.: resources = [{u'total_capacity': 3774873600, u'name': u'dragproc_bandwidth:/data', u'type_id': 3, u'available_capacity': 3774873600, u'type_name': u'bandwidth', u'unit_id': 3, u'active': True, u'used_capacity': 0, u'id': 118, u'unit': u'bits/second', 'claimable_capacity': 3774873600}, ...] # this key was added (not from RADB); it can be negative! """ header = {'id': 'RId', 'name': 'Resource Name', 'active': 'Active', 'available_capacity': ' Avail. Capacity', 'claimable_capacity': ' Claimable Cap.', 'total_capacity': ' Total Capacity', 'unit': 'Unit'} print(('{id:4s} {name:24s} {active:6s} {available_capacity} {claimable_capacity} {total_capacity} {unit}'.format(**header))) print('===================================================================================================') resources.sort(key=lambda r: r['id']) # SQL could have done this better for res in resources: res['active'] = 'True' if res['active'] else 'False' # to solve bool formatting issue if scaled_units and (res['type_name'] == 'storage' or res['type_name'] == 'bandwidth'): unit_base = 1024 if res['type_name'] == 'storage' else 1000 # check type_name instead of unit as in printClaims() res['available_capacity'] = humanreadablesize(res['available_capacity'], '', unit_base) res['claimable_capacity'] = humanreadablesize(res['claimable_capacity'], '', unit_base) res['total_capacity'] = humanreadablesize(res['total_capacity'] , '', unit_base) cap_conv = '>16s' else: cap_conv = '16d' print((('{id:4d} {name:24s} {active:6s} {available_capacity:' + cap_conv + '} {claimable_capacity:' + cap_conv + '} {total_capacity:' + cap_conv + '} {unit}').format(**res))) if not resources: print('<no resources>')
def _updateCEP4CapacitiesInRADB(self): try: df_result = self.disk_usage.getDiskFreeSpace() if df_result['found']: #get the total used space, and update the resource availability in the radb radbrpc = self.disk_usage.path_resolver.radbrpc storage_resources = radbrpc.getResources( resource_types='storage', include_availability=True) cep4_storage_resource = next(x for x in storage_resources if 'CEP4' in x['name']) total_capacity = df_result.get('disk_size') used_capacity = df_result.get('disk_usage') available_capacity = df_result.get('disk_free') logger.info( 'updating capacities for resource \'%s\' (id=%s) in the RADB: total=%s, used=%s, available=%s', cep4_storage_resource['name'], cep4_storage_resource['id'], humanreadablesize(total_capacity), humanreadablesize(used_capacity), humanreadablesize(available_capacity)) radbrpc.updateResourceAvailability( cep4_storage_resource['id'], available_capacity=available_capacity, total_capacity=total_capacity) except Exception as e: logger.error('_updateCEP4CapacitiesInRADB: %s', e)
def index(): # TODO: serve html first, and let client request data via ajax usages = {} sites = db.sites() sites2 = [x for x in sites if x[1] != 'nikhef'] sites = [sites2[0], sites2[2], sites2[1]] total = 0.0 numFiles = 0L for site in sites: site_usage = float(db.totalFileSizeInSite(site[0])) usages[site[1]] = site_usage total += site_usage numFiles += db.numFilesInSite(site[0]) storagesitedata='[' + ', '.join(['''{name: "%s %s", y: %.1f}''' % (site[1], humanreadablesize(usages[site[1]]), 100.0*usages[site[1]]/total) for site in sites]) + ']' min_date, max_date = db.datetimeRangeOfFilesInTree() min_date = datetime(2012, 1, 1) month_ranges = monthRanges(min_date, max_date) # convert end-of-month timestamps to milliseconds since epoch epoch = datetime.utcfromtimestamp(0) datestamps=[('%d' % ((x[1] - epoch).total_seconds()*1000,)) for x in month_ranges] usage_per_month_series='[' deltas_per_month_series='[' for site in sites: cumulatives = [db.totalFileSizeInSite(site[0], to_date=mr[1]) for mr in month_ranges] data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, cumulatives)]) usage_per_month_series += '''{name: '%s', data: [%s]},\n''' % (site[1], data) deltas = [0] for i in range(1, len(cumulatives)): delta = cumulatives[i] - cumulatives[i-1] deltas.append(delta) data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, deltas)]) deltas_per_month_series += '''{name: '%s', data: [%s]},\n''' % (site[1], data) usage_per_month_series+=']' deltas_per_month_series+=']' return render_template('index.html', title='LTA storage overview', storagesitetitle='LTA Storage Site Usage', storagesitesubtitle='Total: %s #dataproducts: %s' % (humanreadablesize(total, 'B', 1000), humanreadablesize(numFiles, '', 1000)), storagesitedata=storagesitedata, usage_per_month_series=usage_per_month_series, deltas_per_month_series=deltas_per_month_series, data_gathered_timestamp=db.mostRecentVisitDate().strftime('%Y/%m/%d %H:%M:%S'))
def _log_job_notification(self, subject: str, job_dict: dict): if self._log_subject_filters and subject not in self._log_subject_filters: return try: if subject in [ 'JobProgress', 'TransferServiceStatus', 'TaskProgress', 'TaskFinished' ]: msg = "ingest %s " % subject else: msg = 'ingest job status changed to %s. ' % (subject, ) msg += 'project: %s export_id: %s type: %s server: %s' % ( job_dict.get('project'), job_dict.get('export_id'), job_dict.get('type'), job_dict.get('ingest_server')) if job_dict.get('archive_id'): msg += ' archive_id: %s' % job_dict.get('archive_id') if job_dict.get('dataproduct'): msg += ' dp: %s' % job_dict.get('dataproduct') if job_dict.get('lta_site'): msg += ' lta_site: %s' % job_dict.get('lta_site') if job_dict.get('otdb_id'): msg += ' otdb_id: %s' % job_dict.get('otdb_id') if job_dict.get('percentage_done') != None: msg += ' progress: %s%%' % (round( 10.0 * float(job_dict.get('percentage_done'))) / 10.0) if job_dict.get('total_bytes_transfered') != None: msg += ' transferred: %s' % humanreadablesize( job_dict.get('total_bytes_transfered')) if job_dict.get('average_speed') != None: msg += ' avg speed: %s' % humanreadablesize( job_dict.get('average_speed'), 'Bps') if job_dict.get('srm_url'): msg += ' srm_url: %s' % job_dict.get('srm_url') if job_dict.get('message'): msg += ' message: %s' % job_dict.get('message') logger.log( logging.WARNING if 'Failed' in subject else logging.INFO, msg) except Exception as e: logger.error(e)
def printClaims(claims, scaled_units=True): """ E.g.: claims = [{u'claim_size': 76441190400, u'endtime': datetime.datetime(2018, 6, 13, 17, 40), u'id': 67420, u'resource_id': 122, u'resource_name': u'drg01_storage:/data1', u'resource_type_id': 5, u'resource_type_name': u'storage', u'starttime': datetime.datetime(2017, 6, 13, 17, 30), u'status': u'claimed', u'status_id': 1, u'task_id': 75409, ...}, ...] """ header = {'id': 'ClId', 'resource_name': 'Resource Name', 'starttime': 'Start Time', 'endtime': 'End Time', 'claim_size': 'Claim Size', 'status': 'Status'} print(('{id:7s} {resource_name:24s} {starttime:19s} {endtime:19s} {claim_size:16s} {status:8s}'.format(**header))) print('===================================================================================================') claims.sort(key=lambda c: c['id']) # secondary sorting key; SQL could have done this better claims.sort(key=lambda c: c['starttime']) # primary sorting key (stable sort) for claim in claims: if scaled_units and (claim['resource_type_name'] == 'storage' or claim['resource_type_name'] == 'bandwidth'): unit_base = 1024 if claim['resource_type_name'] == 'storage' else 1000 # no unit name here, so check type_name claim['claim_size'] = humanreadablesize(claim['claim_size'], '', unit_base) size_conv = '>16s' else: size_conv = '16d' print((('{id:7d} {resource_name:24s} {starttime} {endtime} {claim_size:' + size_conv + '} {status:8s}').format(**claim))) if not claims: print('<no claims on specified resources and time range>')
def SendSIP(self, job_id, sip, storage_ticket): try: logger.info("LTAClient.SendSIP sending sip of %s for %s ...", humanreadablesize(len(sip)), job_id) start = time.time() #check sip before we upload it the the LTA if not validateSIPAgainstSchema(sip, log_prefix=str(job_id)): raise Exception('SIP for %s does not validate against schema' % job_id) if not checkSIPContent(sip, log_prefix=str(job_id), storage_ticket=storage_ticket): raise Exception('SIP for %s does has invalid content' % job_id) # sip ok, do upload result = self.__rpc.TransmitSIP(sip, storage_ticket) if time.time() - start > 2: logger.info("LTAClient.SendSIP for %s took %ds", job_id, time.time() - start) except xmlrpc.client.Fault as err: logger.error('LTAClient.SendSIP Received XML-RPC Fault: %s %s' % (err.faultCode, self.__hidePassword(err.faultString))) raise if result['result'] == 'ok': logger.info( 'LTAClient.SendSIP: Successfully sent SIP of %s for %s to the LTA', humanreadablesize(len(sip)), job_id) elif result.get('error'): if "could not use SIP" in result['error']: if 'is already in database with isValid 1 and Ticket state 40' in result[ 'error']: #sip is already in lta. issue warning, and continue, don't fail logger.warning(result['error']) else: msg = 'LTAClient.SendSIP: Invalid SIP according to LTA catalog for %s: %s' % ( job_id, result['error']) logger.error(msg) raise Exception(msg) else: raise Exception( 'LTAClient.SendSIP: Got Tier 1 TransmitSIP error for %s: %s' % (job_id, result['error']))
def getDiskFreeSpaceForMountpoint(mountpoint=CEP4_DATA_MOUNTPOINT): logger.info('getDiskFreeSpaceForMountpoint(\'%s\')', mountpoint) result = {'found': False, 'mountpoint': mountpoint} cmd = ['df', mountpoint] cmd = wrap_command_in_cep4_head_node_ssh_call_if_needed(cmd) logger.info(' '.join(cmd) + ' ...waiting for result...') proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = communicate_returning_strings(proc) if proc.returncode != 0: logger.error(out + err) result['message'] = out return result # example of out # Filesystem 1K-blocks Used Available Use% Mounted on # 10.134.233.65@o2ib:10.134.233.66@o2ib:/cep4-fs 3369564904320 1460036416928 1737591103048 46% /data #parse out lines = [l.strip() for l in out.split('\n')] data_line = next(l for l in lines if mountpoint in l) if data_line: parts = [p.strip() for p in data_line.split(' ')] result['found'] = True result['disk_size'] = 1024 * int(parts[1]) result['disk_usage'] = 1024 * int(parts[2]) result['disk_free'] = 1024 * int(parts[3]) result['disk_size_readable'] = humanreadablesize(result['disk_size']) result['disk_usage_readable'] = humanreadablesize(result['disk_usage']) result['disk_free_readable'] = humanreadablesize(result['disk_free']) logger.info('getDiskFreeSpaceForMountpoint(\'%s\') returning: %s', mountpoint, result) return result
def getDiskUsageForPath_du(path): logger.info('getDiskUsageForPath_du(\'%s\')', path) result = { 'found': False, 'path': path, 'disk_usage': None, 'name': path.split('/')[-1] } cmd = ['du', '-bcs', path] cmd = wrap_command_in_cep4_head_node_ssh_call_if_needed(cmd) logger.info(' '.join(cmd)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = communicate_returning_strings(proc) if proc.returncode == 0: # example of out # 7025510839 /data/projects/HOLOG_WINDMILL_TESTS/L662734/uv/ # 7025510839 total #parse out lines = [l.strip() for l in out.split('\n')] total_lines = [l for l in lines if 'total' in l] if total_lines: parts = [p.strip() for p in total_lines[0].split()] if len(parts) == 2: result['found'] = True result['disk_usage'] = int(parts[0]) result['nr_of_files'] = None else: result['message'] = out + err result['found'] = False if 'No such file or directory' in err: logger.warning('No such file or directory: %s', path) result['disk_usage'] = 0 else: logger.error(out + err) result['disk_usage_readable'] = humanreadablesize(result['disk_usage']) otdb_id = getOTDBIdFromPath(path) if otdb_id: result['otdb_id'] = otdb_id logger.info('getDiskUsageForPath_du(\'%s\') returning: %s', path, result) return result
def get_sites_usages(): sites = {'sites_usages': db.sites()} for site in sites['sites_usages']: rootDirs = db.rootDirectoriesForSite(site['id']) site_usage = 0 for rootDir in rootDirs: usage = int(db.totalFileSizeInTree(rootDir['dir_id'])) site_usage += usage site['usage'] = site_usage site['usage_hr'] = humanreadablesize(site_usage) return json.jsonify(sites)
def get_sites_usages(): sites = {'sites_usages': [{'id': x[0], 'name': x[1]} for x in db.sites()]} for site in sites['sites_usages']: rootDirs = db.rootDirectoriesForSite(site['id']) site_usage = 0L for rootDir in rootDirs: usage = long(db.totalFileSizeInTree(rootDir[0])) site_usage += usage site['usage'] = site_usage site['usage_hr'] = humanreadablesize(site_usage) return json.jsonify(sites)
def _updateCache(self, du_result, send_notification=True): if not 'path' in du_result: return path = du_result['path'] otdb_id = du_result.get('otdb_id') with self._cacheLock: path_cache = self._cache['path_du_results'] otdb_id2path_cache = self._cache['otdb_id2path'] if otdb_id is None: # try to look up the otdb in the path cache if path in path_cache: otdb_id = du_result.get('otdb_id') # if still None, try to get the id from the path if otdb_id is None: otdb_id = getOTDBIdFromPath(path) if not path in path_cache or path_cache[path][ 'disk_usage'] != du_result['disk_usage']: # update the cache entry, even when no du result found, # cause that will save disk queries next time. logger.info('updating cache entry: %s', du_result) path_cache[path] = du_result if otdb_id != None: otdb_id2path_cache[otdb_id] = path if not du_result['found']: # even when the du for the path is not found, # keep a copy in the cache for fast lookup by clients # Make sure the size is 0 du_result['disk_usage'] = 0 du_result['disk_usage_readable'] = humanreadablesize(0) path_cache[path]['cache_timestamp'] = datetime.datetime.utcnow() path_cache[path]['needs_update'] = False self._writeCacheToDisk() if send_notification: self._sendDiskUsageChangedNotification(path, du_result['disk_usage'], otdb_id)
def _handleDiskUsageChange(self, disk_usage, otdb_id): if otdb_id != None: task = self._radbrpc.getTask(otdb_id=otdb_id) if task: du_readable = humanreadablesize(disk_usage) logger.info( 'disk_usage change: otdb_id %s radb_id %s disk_usage %s %s', otdb_id, task['id'], disk_usage, du_readable) task['disk_usage'] = disk_usage task['disk_usage_readable'] = du_readable task_change = { 'changeType': CHANGE_UPDATE_TYPE, 'objectType': 'task', 'value': task } self._handleChange(task_change)
def _sendDiskUsageChangedNotification(self, path, disk_usage, otdb_id=None): try: msg = EventMessage(subject='%s.DiskUsageChanged' % DEFAULT_DM_NOTIFICATION_PREFIX, content={ 'path': path, 'disk_usage': disk_usage, 'disk_usage_readable': humanreadablesize(disk_usage), 'otdb_id': otdb_id }) logger.info('Sending notification with subject %s to %s: %s', msg.subject, self.event_bus.exchange, msg.content) self.event_bus.send(msg) except Exception as e: logger.error(str(e))
def run(self): with self.event_bus: while True: try: self.__clearFinishedJobs() with self.__lock: starting_threads = [ job_thread_dict['thread'] for job_thread_dict in list(self.__running_jobs.values()) if 'pipeline' not in job_thread_dict ] pipelines = [ job_thread_dict['pipeline'] for job_thread_dict in list(self.__running_jobs.values()) if 'pipeline' in job_thread_dict ] initializing_pipelines = [ pipeline for pipeline in pipelines if pipeline.status == IngestPipeline.STATUS_INITIALIZING ] transferring_pipelines = [ pipeline for pipeline in pipelines if pipeline.status == IngestPipeline.STATUS_TRANSFERRING ] finalizing_pipelines = [ pipeline for pipeline in pipelines if pipeline.status == IngestPipeline.STATUS_FINALIZING ] finished_pipelines = [ pipeline for pipeline in pipelines if pipeline.status == IngestPipeline.STATUS_FINISHED ] log_interval = 5 if self.__running_jobs else 60 if datetime.utcnow( ) - self.__running_jobs_log_timestamp > timedelta( seconds=log_interval): status_log_line = "status: running %s jobs: #starting=%d, #transferring=%d, #finalizing=%d, #finished=%d, bandwith used on network interface(s) %s %s (%s), load=%.1f" % ( len(self.__running_jobs), len(initializing_pipelines) + len(starting_threads), len(transferring_pipelines), len(finalizing_pipelines), len(finished_pipelines), NET_IF_TO_MONITOR, humanreadablesize(self.__prev_used_bandwidth, 'bps'), humanreadablesize(self.__prev_used_bandwidth / 8, 'Bps'), os.getloadavg()[0]) logger.info(status_log_line) self.__running_jobs_log_timestamp = datetime.utcnow() msg = EventMessage(subject="%s.%s" % (INGEST_NOTIFICATION_PREFIX, 'TransferServiceStatus'), content={ 'ingest_server': socket.gethostname(), 'message': status_log_line }) msg.ttl = 3600 # remove message from queue's when not picked up within 1 hours self.event_bus.send(msg) time.sleep(5) except KeyboardInterrupt: break except Exception as e: logger.error(e)
def __str__(self): return self.filename + " " + humanreadablesize(self.size) + " " + str(self.created_at)
def __str__(self): return "LocationResult: path=%s # subdirs=%d # files=%d totalFileSizeOfDir=%s" % (self.location.path(), self.nrOfSubDirs(), self.nrOfFiles(), humanreadablesize(self.totalFileSizeOfDir()))
def index(): # TODO: serve html first, and let client request data via ajax usages = {} colors = { 'sara': { 'used': '#90ed7d', 'free': '#c5f6bc' }, 'juelich': { 'used': '#494950', 'free': '#a1a1aa' }, 'poznan': { 'used': '#7cb5ec', 'free': '#bcdaf5' } } sites = db.sites() sitesDict = {s['name']: s for s in sites} sites = [ sitesDict[sitename] for sitename in ['poznan', 'juelich', 'sara'] if sitename in sitesDict ] total_lta_size = 0.0 total_lta_num_files = 0 for site in sites: totals = db.totalFileSizeAndNumFilesInSite(site['id']) total_lta_size += totals['tree_total_file_size'] total_lta_num_files += totals['tree_num_files'] usages[site['name']] = totals['tree_total_file_size'] if total_lta_size > 0: storagesitedata = '[' + ', '.join([ '''{name: "%s %s", color:'%s', y: %.2f}''' % (site['name'], humanreadablesize( usages[site['name']]), colors[site['name']]['used'], 100.0 * usages[site['name']] / total_lta_size) for site in sites ]) + ']' else: storagesitedata = '[]' min_date, max_date = db.datetimeRangeOfFilesInTree() logger.info("datetimeRangeOfFilesInTree: min_date=%s, max_date=%s", min_date, max_date) if min_date is None: min_date = datetime(2012, 1, 1) if max_date is None: max_date = datetime.utcnow() min_date = max(datetime(2012, 1, 1), min_date) logger.info("month_ranges: min_date=%s, max_date=%s", min_date, max_date) month_ranges = monthRanges(min_date, max_date, 1) # convert end-of-month timestamps to milliseconds since epoch epoch = datetime.utcfromtimestamp(0) datestamps = [('%d' % ((x[1] - epoch).total_seconds() * 1000, )) for x in month_ranges] usage_per_month_series = '[' deltas_per_month_series = '[' for site in sites: deltas_per_month = [ db.totalFileSizeInSite(site['id'], from_date=mr[0], to_date=mr[1]) for mr in month_ranges ] data = ', '.join([ '[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, deltas_per_month) ]) deltas_per_month_series += '''{name: '%s', color:'%s', data: [%s]},\n''' % ( site['name'], colors[site['name']]['used'], data) cumulatives = [deltas_per_month[0]] for delta in deltas_per_month[1:]: cumulative = cumulatives[-1] + delta cumulatives.append(cumulative) data = ', '.join([ '[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, cumulatives) ]) usage_per_month_series += '''{name: '%s', color:'%s', data: [%s]},\n''' % ( site['name'], colors[site['name']]['used'], data) usage_per_month_series += ']' deltas_per_month_series += ']' quota_dir_stats = db.siteQuotaRootDirStats() quota_dir_stats = sorted(quota_dir_stats, reverse=True, key=lambda x: x['tree_total_file_size']) site_usages_per_site = {} latest_usages_per_site = {} for site_usage in db.siteQuotaUsages(): site_name = site_usage['site_name'] if site_name not in site_usages_per_site: site_usages_per_site[site_name] = [] site_usages_per_site[site_name].append(site_usage) if site_name not in latest_usages_per_site: latest_usages_per_site[site_name] = site_usage if site_usage['valid_until_date'] > latest_usages_per_site[site_name][ 'valid_until_date']: latest_usages_per_site[site_name] = site_usage quota_series = '[' storagesite_free_space = '[' site_tape_usages_table = '<table>\n' site_tape_usages_table += '<tr><th style="text-align: left;">site</th><th style="text-align: left;">directory</th><th>total #files</th><th>total file size</th><th>quota</th><th>free</th><th>expiration</th></tr>\n' total_lta_free_space = sum(u['space_left'] for u in list(latest_usages_per_site.values()) if u['space_left'] > 0) total_lta_quota = sum(u['quota'] for u in list(latest_usages_per_site.values())) for site_name in ['sara', 'juelich', 'poznan']: if site_name in latest_usages_per_site: latest_usage = latest_usages_per_site[site_name] site_tape_usages_table += '<tr style="font-weight: bold;"><td style="text-align: left;">%s</td><td style="text-align: left;">%s</td><td>%s</td><td>%s</td><td>%s</td><td %s>%s</td><td>%s</td></tr>\n' % ( latest_usage['site_name'], '', latest_usage['num_files'], humanreadablesize(latest_usage['total_file_size']), humanreadablesize(latest_usage['quota']), 'style="color: red;"' if latest_usage['space_left'] < 0 else '', humanreadablesize(latest_usage['space_left']), latest_usage['valid_until_date']) for qds in quota_dir_stats: if qds['site_name'] == site_name: site_tape_usages_table += '<tr><td style="text-align: left;">%s</td><td style="text-align: left;">%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % ( '', qds['dir_name'], qds['tree_num_files'], humanreadablesize( qds['tree_total_file_size']), '', '', '') storagesite_free_space += '''{name: "%s %s", color:'%s', y: %.2f}, ''' % ( site_name, humanreadablesize( latest_usage['space_left']), colors[site_name]['free'], max(0, 100.0 * latest_usage['space_left']) / total_lta_free_space) site_tape_usages_table += '</table>\n' for site_name in ['poznan', 'juelich', 'sara']: if site_name in site_usages_per_site: site_usages_for_site = site_usages_per_site[site_name] site_usages_for_site = sorted(site_usages_for_site, key=lambda x: x['valid_until_date']) data = ','.join('[%d, %s]' % ((su['valid_until_date'] - epoch).total_seconds() * 1000, su['space_left']) for su in site_usages_for_site) quota_series += '''{ name:'%s_free', stack:'%s', color:'%s', data:[%s] },''' % ( site_name, site_name, colors[site_name]['free'], data) data = ','.join('[%d, %s]' % ((su['valid_until_date'] - epoch).total_seconds() * 1000, su['total_file_size']) for su in site_usages_for_site) quota_series += '''{ name:'%s_used', stack:'%s', color:'%s', data:[%s] },''' % ( site_name, site_name, colors[site_name]['used'], data) quota_series += ']' storagesite_free_space += ']' return render_template( 'index.html', title='LTA storage overview', storagesitetitle='LTA Storage Site Usage', storagesitesubtitle='Total: %s #dataproducts: %s' % (humanreadablesize(total_lta_size, 'B', 1000), humanreadablesize(total_lta_num_files, '', 1000)), storagesite_free_space_title='LTA Storage Site Free Space', storagesite_free_space_subtitle= 'Total free space: %s Current total quota: %s' % (humanreadablesize(total_lta_free_space, 'B', 1000), humanreadablesize(total_lta_quota, 'B', 1000)), storagesitedata=storagesitedata, storagesite_free_space=storagesite_free_space, usage_per_month_series=usage_per_month_series, deltas_per_month_series=deltas_per_month_series, quota_series=quota_series, site_tape_usages=site_tape_usages_table, data_gathered_timestamp=db.mostRecentVisitDate().strftime( '%Y/%m/%d %H:%M:%S'))
def run(self): with self.event_bus: try: logger.info("starting ingestpipeline for %s" % self.JobId) start = time.time() self.__sendNotification('JobStarted') self.GetStorageTicket() self.CheckForValidSIP() self.TransferFile() self.SendChecksumsToLTA() self.GetSIP() self.SendSIPToLTA() self.SendStatusToLTA(IngestSuccessful) avgSpeed = 0 elapsed = time.time() - start try: avgSpeed = float(self.FileSize) / elapsed logger.info("Ingest Pipeline finished for %s in %d sec with average speed of %s for %s including all overhead", self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), humanreadablesize(float(self.FileSize), 'B')) except Exception: logger.info("Ingest Pipeline finished for %s in %d sec", self.JobId, elapsed) self.__sendNotification('JobFinished', average_speed=avgSpeed, total_bytes_transfered=int(self.FileSize)) except PipelineError as pe: logger.log(logging.WARNING if pe.type == PipelineAlreadyInLTAError else logging.ERROR, 'Encountered PipelineError for %s : %s', self.JobId, str(pe)) if pe.type == PipelineNoSourceError: self.__sendNotification('JobTransferFailed', 'data not transfered because it was not on disk') elif pe.type == PipelineAlreadyInLTAError: self.__sendNotification('JobFinished', 'data was already in the LTA', average_speed=0, total_bytes_transfered=0) else: self.RollBack() # by default the error_message for the notification is the exception error_message = str(pe) # for known messsages in the exception, make a nice readable error_message if 'MoM returned login screen instead of SIP' in error_message: error_message = 'MoM returned login screen instead of SIP' self.__sendNotification('JobTransferFailed', error_message) try: if pe.type != PipelineAlreadyInLTAError: self.SendStatusToLTA(IngestFailed) except Exception as e: logger.error('SendStatusToLTA failed for %s: %s', self.JobId, e) except Exception as e: logger.error('Encountered unexpected error for %s: %s', self.JobId, e) # by default the error_message for the notification is the exception error_message = str(e) # for known messsages in the exception, make a nice readable error_message if 'ltacp' in error_message and ('file listing failed' in error_message or 'du failed' in error_message): error_message = 'dataproduct %s not found at location %s:%s' % (self.DataProduct, self.HostLocation, self.Location) elif 'does not validate against schema' in error_message: error_message = 'invalid SIP does not validate against schema' try: self.RollBack() except Exception as rbe: logger.error('RollBack failed for %s: %s', self.JobId, rbe) try: self.SendStatusToLTA(IngestFailed) except Exception as sse: logger.error('SendStatusToLTA failed for %s: %s', self.JobId, sse) try: self.__sendNotification('JobTransferFailed', error_message) except Exception as sne: logger.error('sendNotification failed for %s: %s', self.JobId, sne) finally: self.status = IngestPipeline.STATUS_FINISHED
def transfer(self, force=False, dereference=False): starttime = datetime.utcnow() # for cleanup self.started_procs = {} self.fifos = [] try: if not self.source_exists(): if self.can_logon_to_source_host(): if self.source_mounted(): raise LtacpException( "ltacp %s: source path %s:%s does not exist" % (self.logId, self.src_host, self.src_path)) else: raise LtacpException( "ltacp %s: the disk of source path %s:%s does not seem to be mounted" % (self.logId, self.src_host, self.src_path)) else: raise LtacpException( "ltacp %s: cannot login to %s@%s" % (self.logId, self.src_user, self.src_host)) # determine if input is file input_is_file = self.is_soure_single_file() if not input_is_file: # make sure the file extension is .tar or .tar.gz missing_suffix = "" if self.gzip: if not (self.dst_surl.endswith(".tar.gz") or self.dst_surl.endswith(".tgz")): if self.dst_surl.endswith(".tar"): missing_suffix = ".gz" else: missing_suffix = ".tar.gz" else: if not self.dst_surl.endswith(".tar"): missing_suffix = ".tar" if missing_suffix: self.dst_surl += missing_suffix logger.info( "ltacp %s: appending missing suffix %s to surl: %s", self.logId, missing_suffix, self.dst_surl) dst_turl = convert_surl_to_turl(self.dst_surl) logger.info( 'ltacp %s: initiating transfer of %s:%s to surl=%s turl=%s' % (self.logId, self.src_host, self.src_path, self.dst_surl, dst_turl)) # get input datasize du_items = self.src_path if input_is_file or isinstance( self.src_path, str) else ' '.join(self.src_path) cmd_remote_du = self.ssh_cmd + [ 'du -b %s --max-depth=0 %s' % ("--dereference" if dereference else "", du_items) ] logger.info('ltacp %s: remote getting datasize. executing: %s' % (self.logId, ' '.join(cmd_remote_du))) p_remote_du = Popen(cmd_remote_du, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_du] = cmd_remote_du # block until du is finished output_remote_du = communicate_returning_strings(p_remote_du) del self.started_procs[p_remote_du] if p_remote_du.returncode != 0: raise LtacpException( 'ltacp %s: remote du failed: \nstdout: %s\nstderr: %s' % (self.logId, output_remote_du[0], output_remote_du[1])) # compute various parameters for progress logging if input_is_file: input_datasize = int(output_remote_du[0].split()[0]) else: input_datasize = sum([ int(line.strip().split()[0]) for line in output_remote_du[0].split('\n') if line.strip() ]) logger.info('ltacp %s: input datasize: %d bytes, %s' % (self.logId, input_datasize, humanreadablesize(input_datasize))) estimated_tar_size = 512 * ( input_datasize // 512 ) + 3 * 512 #512byte header, 2*512byte ending, 512byte modulo data logger.info('ltacp %s: estimated_tar_size: %d bytes, %s' % (self.logId, estimated_tar_size, humanreadablesize(estimated_tar_size))) #--- # Server part #--- # we'll randomize ports # to minimize initial collision, randomize based on path and time random.seed(hash(self.logId) ^ hash(time.time())) p_data_in, port_data = self._ncListen('data') self.local_data_fifo = '/tmp/ltacp_datapipe_%s_%s' % ( self.src_host, self.logId) logger.info('ltacp %s: creating data fifo: %s' % (self.logId, self.local_data_fifo)) if os.path.exists(self.local_data_fifo): os.remove(self.local_data_fifo) os.mkfifo(self.local_data_fifo) if not os.path.exists(self.local_data_fifo): raise LtacpException("ltacp %s: Could not create fifo: %s" % (self.logId, self.local_data_fifo)) # transfer incomming data stream via md5a32bc to compute md5, adler32 and byte_count # data is written to fifo, which is then later fed into globus-url-copy # on stdout we can monitor progress # set progress message step 0f 0.5% of estimated_tar_size cmd_md5a32bc = [ 'md5a32bc', '-p', str(min(1000000, estimated_tar_size // 200)), self.local_data_fifo ] logger.info( 'ltacp %s: processing data stream for md5, adler32 and byte_count. executing: %s' % ( self.logId, ' '.join(cmd_md5a32bc), )) p_md5a32bc = Popen(cmd_md5a32bc, stdin=p_data_in.stdout, stdout=PIPE, stderr=PIPE, universal_newlines=True) self.started_procs[p_md5a32bc] = cmd_md5a32bc # start copy fifo stream to globus-url-copy guc_options = [ '-cd', #create remote directories if missing '-p 4', #number of parallel ftp connections '-bs 131072', #buffer size '-b', # binary '-nodcau', # turn off data channel authentication for ftp transfers ] cmd_data_out = [ '/bin/bash', '-c', 'globus-url-copy %s file://%s %s' % (' '.join(guc_options), self.local_data_fifo, dst_turl) ] logger.info( 'ltacp %s: copying data stream into globus-url-copy. executing: %s' % (self.logId, ' '.join(cmd_data_out))) p_data_out = Popen(cmd_data_out, stdout=PIPE, stderr=PIPE) self.started_procs[p_data_out] = cmd_data_out # Check if receiver side is set up correctly # and all processes are still waiting for input from client finished_procs = dict( (p, cl) for (p, cl) in list(self.started_procs.items()) if p.poll() is not None) if len(finished_procs): msg = '' for p, cl in list(finished_procs.items()): o, e = communicate_returning_strings(p) msg += " process pid:%d exited prematurely with exit code %d. cmdline: %s\nstdout: %s\nstderr: %s\n" % ( p.pid, p.returncode, cl, o, e) raise LtacpException( "ltacp %s: %d local process(es) exited prematurely\n%s" % (self.logId, len(finished_procs), msg)) #--- # Client part #--- # start remote copy on src host: # 1) create fifo # 2) send tar stream of data/dir + tee to fifo for 3) # 3) simultaneously to 2), calculate checksum of fifo stream # 4) break fifo self.remote_data_fifo = '/tmp/ltacp_md5_pipe_%s' % (self.logId, ) #make sure there is no old remote fifo self._removeRemoteFifo() cmd_remote_mkfifo = self.ssh_cmd + [ 'mkfifo %s' % (self.remote_data_fifo, ) ] logger.info('ltacp %s: remote creating fifo. executing: %s' % (self.logId, ' '.join(cmd_remote_mkfifo))) p_remote_mkfifo = Popen(cmd_remote_mkfifo, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_mkfifo] = cmd_remote_mkfifo # block until fifo is created output_remote_mkfifo = communicate_returning_strings( p_remote_mkfifo) del self.started_procs[p_remote_mkfifo] if p_remote_mkfifo.returncode != 0: raise LtacpException( 'ltacp %s: remote fifo creation failed: \nstdout: %s\nstderr: %s' % (self.logId, output_remote_mkfifo[0], output_remote_mkfifo[1])) with open(os.devnull, 'r') as devnull: # start sending remote data, tee to fifo if input_is_file: cmd_remote_data = self.ssh_cmd + [ 'cat %s %s| tee %s | %s %s %s' % (self.src_path, '| gzip --stdout --fast ' if self.gzip else '', self.remote_data_fifo, self.remoteNetCatCmd, self.localIPAddress, port_data) ] else: if isinstance(self.src_path, str): #src_path is dir src_path_parent, src_path_child = os.path.split( self.src_path) else: #src_path is list if paths dirs = set([os.path.dirname(p) for p in self.src_path]) if len(dirs) > 1: raise LtacpException( 'ltacp %s: cannot combine multiple files from different directories in one tarbal' % self.logId) files = set( [os.path.basename(p) for p in self.src_path]) src_path_parent = list(dirs)[0] src_path_child = ' '.join(files) cmd_remote_data = self.ssh_cmd + [ 'cd %s && tar c %s -O %s %s| tee %s | %s %s %s' % (src_path_parent, src_path_child, '--dereference' if dereference else '', '| gzip --stdout --fast ' if self.gzip else '', self.remote_data_fifo, self.remoteNetCatCmd, self.localIPAddress, port_data) ] logger.info( 'ltacp %s: remote starting transfer. executing: %s' % (self.logId, ' '.join(cmd_remote_data))) p_remote_data = Popen(cmd_remote_data, stdin=devnull, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_data] = cmd_remote_data # start computation of checksum on remote fifo stream cmd_remote_checksum = self.ssh_cmd + [ 'md5sum %s' % (self.remote_data_fifo, ) ] logger.info( 'ltacp %s: remote starting computation of md5 checksum. executing: %s' % (self.logId, ' '.join(cmd_remote_checksum))) p_remote_checksum = Popen(cmd_remote_checksum, stdin=devnull, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_checksum] = cmd_remote_checksum # waiting for output, comparing checksums, etc. logger.info( 'ltacp %s: transfering... waiting for progress...' % self.logId) transfer_start_time = datetime.utcnow() prev_progress_time = datetime.utcnow() prev_bytes_transfered = 0 with PipeReader(p_md5a32bc.stdout, self.logId) as pipe_reader: # wait and poll for progress while all processes are runnning while len([ p for p in list(self.started_procs.keys()) if p.poll() is not None ]) == 0: try: current_progress_time = datetime.utcnow() elapsed_secs_since_prev = totalSeconds( current_progress_time - prev_progress_time) if elapsed_secs_since_prev > 900: raise LtacpException( 'ltacp %s: transfer stalled for 15min.' % (self.logId)) # read and process md5a32bc stdout lines to create progress messages lines = pipe_reader.readlines(1) nextline = lines[-1].strip() if lines else '' if len(nextline) > 0: try: logger.debug('ltacp %s: transfering... %s', self.logId, nextline) total_bytes_transfered = int( nextline.split()[0].strip()) percentage_done = ( 100.0 * float(total_bytes_transfered) ) / float(estimated_tar_size) elapsed_secs_since_start = totalSeconds( current_progress_time - transfer_start_time) if percentage_done > 0 and elapsed_secs_since_start > 0 and elapsed_secs_since_prev > 0: avg_speed = total_bytes_transfered / elapsed_secs_since_start current_bytes_transfered = total_bytes_transfered - prev_bytes_transfered current_speed = current_bytes_transfered / elapsed_secs_since_prev if elapsed_secs_since_prev > 120 or current_bytes_transfered > 0.333 * estimated_tar_size: prev_progress_time = current_progress_time prev_bytes_transfered = total_bytes_transfered percentage_to_go = 100.0 - percentage_done time_to_go = elapsed_secs_since_start * percentage_to_go / percentage_done try: if self.progress_callback: self.progress_callback( percentage_done= percentage_done, current_speed= current_speed, total_bytes_transfered= total_bytes_transfered) except Exception as e: logger.error(e) logger.info( 'ltacp %s: transfered %s %.1f%% in %s at avgSpeed=%s (%s) curSpeed=%s (%s) to_go=%s to %s' % (self.logId, humanreadablesize( total_bytes_transfered), percentage_done, timedelta(seconds=int( round( elapsed_secs_since_start ))), humanreadablesize( avg_speed, 'Bps'), humanreadablesize( avg_speed * 8, 'bps'), humanreadablesize( current_speed, 'Bps'), humanreadablesize( current_speed * 8, 'bps'), timedelta(seconds=int( round(time_to_go))), dst_turl)) except Exception as e: msg = 'ltacp %s: error while parsing md5a32bc loglines: error=%s. line=%s' % ( self.logId, e, nextline) logger.error(msg) self.cleanup() raise LtacpException(msg) time.sleep(0.05) except KeyboardInterrupt: self.cleanup() except LtacpException as e: logger.error('ltacp %s: %s' % (self.logId, str(e))) self.cleanup() raise except Exception as e: logger.error('ltacp %s: %s' % (self.logId, str(e))) def waitForSubprocess(proc, timeout=timedelta(seconds=60), proc_log_name='', loglevel=logging.DEBUG): logger.log( loglevel, 'ltacp %s: waiting at most %s for %s to finish...', self.logId, timeout, proc_log_name) start_wait = datetime.now() while datetime.now() - start_wait < timeout: if proc.poll() is not None: break time.sleep(1) if proc.poll() is None: raise LtacpException( 'ltacp %s: %s did not finish within %s.' % (self.logId, proc_log_name, timeout)) waitForSubprocess(p_data_out, timedelta(seconds=self.globus_timeout), 'globus-url-copy', logging.INFO) output_data_out = communicate_returning_strings(p_data_out) if p_data_out.returncode != 0: if 'file exist' in output_data_out[1].lower(): raise LtacpDestinationExistsException( 'ltacp %s: data transfer via globus-url-copy to LTA failed, file already exists. turl=%s.' % (self.logId, dst_turl)) raise LtacpException( 'ltacp %s: transfer via globus-url-copy to LTA failed. turl=%s error=%s' % (self.logId, dst_turl, output_data_out[0].strip() + output_data_out[1].strip())) logger.info( 'ltacp %s: data transfer via globus-url-copy to LTA complete.' % self.logId) waitForSubprocess(p_remote_data, timedelta(seconds=60), 'remote data transfer') output_remote_data = communicate_returning_strings( p_remote_data) if p_remote_data.returncode != 0: raise LtacpException( 'ltacp %s: Error in remote data transfer: %s' % (self.logId, output_remote_data[1])) logger.debug('ltacp %s: remote data transfer finished...' % self.logId) waitForSubprocess(p_remote_checksum, timedelta(seconds=60), 'remote md5 checksum computation') output_remote_checksum = communicate_returning_strings( p_remote_checksum) if p_remote_checksum.returncode != 0: raise LtacpException( 'ltacp %s: Error in remote md5 checksum computation: %s' % (self.logId, output_remote_checksum[1])) logger.debug( 'ltacp %s: remote md5 checksum computation finished.' % self.logId) waitForSubprocess( p_md5a32bc, timedelta(seconds=60), 'local computation of md5 adler32 and byte_count') output_md5a32bc_local = communicate_returning_strings( p_md5a32bc) if p_md5a32bc.returncode != 0: raise LtacpException( 'ltacp %s: Error while computing md5 adler32 and byte_count: %s' % (self.logId, output_md5a32bc_local[1])) logger.debug( 'ltacp %s: computed local md5 adler32 and byte_count.' % self.logId) # process remote md5 checksums try: md5_checksum_remote = output_remote_checksum[0].split( ' ')[0] except Exception as e: logger.error( 'ltacp %s: error while parsing remote md5: %s\n%s' % (self.logId, output_remote_checksum[0], output_remote_checksum[1])) raise # process local md5 adler32 and byte_count try: items = output_md5a32bc_local[1].splitlines()[-1].split( ' ') md5_checksum_local = items[0].strip() a32_checksum_local = items[1].strip().zfill(8) byte_count = int(items[2].strip()) except Exception as e: logger.error( 'ltacp %s: error while parsing md5 adler32 and byte_count outputs: %s' % (self.logId, output_md5a32bc_local[0])) raise # check transfered number of bytes if (byte_count == 0 and input_datasize > 0): raise LtacpException( 'ltacp %s: did not transfer any bytes of the expected %s. Something is wrong in the datastream setup.' % (self.logId, humanreadablesize(estimated_tar_size))) logger.info( 'ltacp %s: byte count of datastream is %d %s' % (self.logId, byte_count, humanreadablesize(byte_count))) # compare local and remote md5 if (md5_checksum_remote != md5_checksum_local): raise LtacpException( 'md5 checksum reported by client (%s) does not match local checksum of incoming data stream (%s)' % (self.logId, md5_checksum_remote, md5_checksum_local)) logger.info( 'ltacp %s: remote and local md5 checksums are equal: %s' % ( self.logId, md5_checksum_local, )) logger.info('ltacp %s: fetching adler32 checksum from LTA...' % self.logId) srm_ok, srm_file_size, srm_a32_checksum = get_srm_size_and_a32_checksum( self.dst_surl, 'ltacp %s:' % self.logId) if not srm_ok: raise LtacpException( 'ltacp %s: Could not get srm adler32 checksum for: %s' % (self.logId, self.dst_surl)) if srm_a32_checksum != a32_checksum_local: raise LtacpException( 'ltacp %s: adler32 checksum reported by srm (%s) does not match original data checksum (%s)' % (self.logId, srm_a32_checksum, a32_checksum_local)) logger.info('ltacp %s: adler32 checksums are equal: %s' % ( self.logId, a32_checksum_local, )) if int(srm_file_size) != int(byte_count): raise LtacpException( 'ltacp %s: file size reported by srm (%s) does not match datastream byte count (%s)' % (self.logId, srm_file_size, byte_count)) logger.info( 'ltacp %s: srm file size and datastream byte count are equal: %s bytes (%s)' % (self.logId, srm_file_size, humanreadablesize(srm_file_size))) logger.info('ltacp %s: transfer to LTA completed successfully.' % (self.logId)) except LtacpDestinationExistsException as e: logger.log(logging.WARN if force else logging.ERROR, str(e)) if force: self.cleanup() srmrm(self.dst_surl, 'ltacp %s ' % self.logId) return self.transfer(force=False) else: # re-raise the exception to the caller raise except Exception as e: # Something went wrong logger.error('ltacp %s: Error in transfer: %s' % (self.logId, str(e))) # re-raise the exception to the caller raise finally: # cleanup self.cleanup() total_time = round( 10 * totalSeconds(datetime.utcnow() - starttime)) / 10 logger.info( 'ltacp %s: successfully completed transfer of %s:%s to %s in %ssec for %s at avg speed of %s or %s', self.logId, self.src_host, self.src_path, self.dst_surl, total_time, humanreadablesize(int(srm_file_size)), humanreadablesize(int(srm_file_size) / total_time, 'Bps'), humanreadablesize(8 * int(srm_file_size) / total_time, 'bps')) return (md5_checksum_local, a32_checksum_local, str(byte_count))
def getDiskUsageForPath_rbh_du(path): logger.info('getDiskUsageForPath_rbh_du(\'%s\')', path) result = { 'found': False, 'path': path, 'disk_usage': None, 'name': path.split('/')[-1] } cmd = ['rbh-du', '-bd', path] cmd = wrap_command_in_cep4_head_node_ssh_call_if_needed(cmd) logger.info(' '.join(cmd)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = communicate_returning_strings(proc) if proc.returncode == 0: # example of out # Using config file '/etc/robinhood.d/tmpfs/tmp_fs_mgr_basic.conf'. # /data/projects/2016LOFAROBS/L522380 # dir count:3906, size:16048128, spc_used:16052224 # file count:17568, size:42274164368, spc_used:42327519232 #parse out lines = [l.strip() for l in out.split('\n')] file_lines = [l for l in lines if 'file count' in l] if file_lines: parts = [p.strip() for p in file_lines[0].split(',')] partsDict = { p.split(':')[0].strip(): p.split(':')[1].strip() for p in parts } result['found'] = True if 'size' in partsDict: result['disk_usage'] = int(partsDict['size']) if 'file count' in partsDict: result['nr_of_files'] = int(partsDict['file count']) else: dir_lines = [l for l in lines if 'dir count' in l] if dir_lines: result['found'] = True result['disk_usage'] = 0 result['nr_of_files'] = None else: logger.error(out + err) result['message'] = out result['disk_usage_readable'] = humanreadablesize(result['disk_usage']) otdb_id = getOTDBIdFromPath(path) if otdb_id: result['otdb_id'] = otdb_id logger.info('getDiskUsageForPath_rbh_du(\'%s\') returning: %s', path, result) return result
def uploadDataAndGetSIP(self, archive_id, storage_ticket, filename, uri, filesize, md5_checksum, adler32_checksum, validate=True): try: # mom is very reluctant in providing sips # often it returns a login page, even when you're logged in # so, upon error, retry a couple of times with a pause, else just return for mom_retry in range(self.MAX_MOM_RETRIES): start = time.time() logger.info( "MoMClient.uploadDataAndGetSIP with archiveId %s - StorageTicket %s - FileName %s - Uri %s", archive_id, storage_ticket, filename, uri) self.login() xmlcontent = """<?xml version="1.0" encoding="UTF-8"?> <lofar:DataProduct archiveId="%s" xmlns:lofar="http://www.astron.nl/MoM2-Lofar"> <locations> <location> <uri>lta://%s/%s/%s</uri> </location> </locations> <storageTicket>%s</storageTicket> <fileSize>%s</fileSize> <checksums> <checksum> <algorithm>MD5</algorithm> <value>%s</value> </checksum> <checksum> <algorithm>Adler32</algorithm> <value>%s</value> </checksum> </checksums> </lofar:DataProduct>""" % ( archive_id, storage_ticket, filename, uri, storage_ticket, filesize, md5_checksum, adler32_checksum) # sanitize, make compact xmlcontent = xmlcontent.replace('\n', ' ') while ' ' in xmlcontent: xmlcontent = xmlcontent.replace(' ', ' ') params = { "command": "get-sip-with-input", "xmlcontent": xmlcontent } response = self.session.get(self.__momURLgetSIP, params=params) result = response.text result = result.replace( '<stationType>Europe</stationType>', '<stationType>International</stationType>') if 'DOCTYPE HTML PUBLIC' in result: logger.error( 'MoM returned login screen instead of SIP for %s %s using url %s and params %s, full url=%s', archive_id, filename, self.__momURLgetSIP, params, response.url) # logout, even though we think we should be logged in properly # it's mom who thinks we should login again, even though we have a proper session. # next retry, we'll login automatically again self.logout() if mom_retry == (self.MAX_MOM_RETRIES - 1): # for some reason mom cannot handle the uploadDataAndGetSIP # let's give it a final try with just GetSip # we'll miss some data in the SIP, which we can add ourselves, so the LTA catalogue is up-to-date # but MoM will miss these parameters. tough luck. logger.warning( "MoMClient.uploadDataAndGetSIP with archiveId %s - StorageTicket %s - FileName %s - Uri %s failed %s times. Trying normal GetSip without uploading data to MoM.", archive_id, storage_ticket, filename, uri, mom_retry) result = self.getSIP(archive_id, validate=False) # add ingest info to sip result = addIngestInfoToSIP(result, storage_ticket, filesize, md5_checksum, adler32_checksum) else: wait_secs = (mom_retry + 1) * (mom_retry + 1) * 10 logger.info( 'Retrying to uploadDataAndGetSIP for archiveId %s in %s seconds', archive_id, wait_secs) time.sleep(wait_secs) continue # jump back to for mom_retry in range(self.MAX_MOM_RETRIES) if validate: if not validateSIPAgainstSchema(result, log_prefix=str(filename)): logger.error('Invalid SIP:\n%s', result) raise Exception( 'SIP for %s does not validate against schema' % filename) # MoM sometimes provides SIPs which validate against the schema # but which has incorrect content, e.g. an incorrect archive_id. # check it! if not checkSIPContent(result, log_prefix=str(filename), archive_id=archive_id, filename=filename, filesize=filesize, storage_ticket=storage_ticket, md5_checksum=md5_checksum, adler32_checksum=adler32_checksum): raise Exception('SIP for %s does has invalid content' % filename) if time.time() - start > 2: logger.debug( "MoMClient.uploadDataAndGetSIP for %s took %ds", filename, time.time() - start) logger.info( "MoMClient.uploadDataAndGetSIP for %s retreived SIP of %s: %s...", filename, humanreadablesize(len(result)), result[:512].replace('\n', '')) return result except Exception as e: self.logout() raise Exception("getting SIP from MoM failed: " + str(e)) return ''
def getDiskUsageForPath(self, path, force_update=False): logger.info("cache.getDiskUsageForPath('%s', force_update=%s)", path, force_update) needs_cache_update = False if not force_update: with self._cacheLock: needs_cache_update |= path not in self._cache[ 'path_du_results'] if needs_cache_update or force_update: logger.info("cache update needed for %s", path) # check if some other thread is already doing a du call for this path... with self._du_threading_events_lock: path_threading_event = self._du_threading_events.get(path) need_to_do_du_call = path_threading_event is None if need_to_do_du_call: # no other thread is currently du'ing/updating this path # so create a threading Event and store it in the dict, # so other threads can wait for this event. logger.info("updating the cache for %s current_thread=%s", path, current_thread().name) path_threading_event = Event() self._du_threading_events[path] = path_threading_event if need_to_do_du_call: # no other thread is currently du'ing/updating this path # so we need to do it here. result = du_getDiskUsageForPath(path) self._updateCache(result) # signal threads waiting for this same path du call # and do bookkeeping with self._du_threading_events_lock: logger.info( "signaling other threads that the cache was updated for %s current_thread=%s", path, current_thread().name) path_threading_event.set() del self._du_threading_events[path] else: logger.info( "waiting for du call on other thread that will update the cache for %s current_thread=%s", path, current_thread().name) path_threading_event.wait() logger.info( "another thread just updated the cache for %s current_thread=%s", path, current_thread().name) with self._cacheLock: if path in self._cache['path_du_results']: result = self._cache['path_du_results'][path] else: result = { 'found': False, 'path': path, 'message': 'unknown error' } if not self.disk_usage.path_resolver.pathExists(path): result['message'] = 'No such path: %s' % path result['disk_usage_readable'] = humanreadablesize( result.get('disk_usage', 0)) logger.info('cache.getDiskUsageForPath(\'%s\') result: %s', path, result) return result
def GetSIP(self): try: if self.Type == "MoM": with self.momClient: self.SIP = self.momClient.uploadDataAndGetSIP(self.ArchiveId, self.ticket, self.FileName, self.PrimaryUri, self.FileSize, self.MD5Checksum, self.Adler32Checksum, validate=True) elif 'SIPLocation' in self.job: # job file might know where the sip is when it is not a MoM job try: sip_host = self.job['SIPLocation'].split(':')[0] sip_path = self.job['SIPLocation'].split(':')[1] cmd = ['ssh', '-tt', '-n', '-x', '-q', '%s@%s' % (self.user, sip_host), 'cat %s' % sip_path] logger.info("GetSIP for %s at SIPLocation %s - cmd %s" % (self.JobId, self.job['SIPLocation'], ' ' .join(cmd))) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = communicate_returning_strings(p) if p.returncode != 0: raise PipelineError('GetSIP error getting EoR SIP for %s: %s' % (self.JobId, out + err)) self.SIP = out self.SIP = addIngestInfoToSIP(self.SIP, self.ticket, self.FileSize, self.MD5Checksum, self.Adler32Checksum) self.SIP = self.SIP.replace('<stationType>Europe</stationType>','<stationType>International</stationType>') #make sure the source in the SIP is the same as the type of the storageticket self.SIP = re.compile('<source>eor</source>', re.IGNORECASE).sub('<source>%s</source>' % (self.Type,), self.SIP) if not validateSIPAgainstSchema(self.SIP): logger.error('Invalid SIP:\n%s', self.SIP) raise Exception('SIP for %s does not validate against schema' % self.JobId) except: logger.exception('Getting SIP from SIPLocation %s failed', self.job['SIPLocation']) raise logger.info('SIP received for %s from SIPLocation %s with size %d (%s): \n%s' % (self.JobId, self.job['SIPLocation'], len(self.SIP), humanreadablesize(len(self.SIP)), self.SIP[0:1024])) else: self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) self.FileType = FILE_TYPE_UNSPECIFIED except Exception as e: if self.minimal_SIP: logger.info('making minimal SIP for %s', self.JobId) self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type) logger.info('minimal SIP for %s: \n%s', self.JobId, self.SIP) self.FileType = FILE_TYPE_UNSPECIFIED else: raise
def main(argv): dbpath = argv[0] if argv else 'ltastorageoverview.sqlite' print 'Report for ' + dbpath db = store.LTAStorageDb(dbpath) sites = db.sites() numFilesTotal = sum([db.numFilesInSite(s[0]) for s in sites]) totalFileSize = sum([db.totalFileSizeInSite(s[0]) for s in sites]) print '\n*** TOTALS *** #files=%s total_size=%s' % (humanreadablesize(numFilesTotal, ''), humanreadablesize(totalFileSize)) for site in sites: numFilesInSite = db.numFilesInSite(site[0]) totalFileSizeInSite = db.totalFileSizeInSite(site[0]) print '\n--- %s --- #files=%s total_size=%s' % (site[1], humanreadablesize(numFilesInSite, ''), humanreadablesize(totalFileSizeInSite)) root_dirs = db.rootDirectoriesForSite(site[0]) for root_dir in root_dirs: numFilesInTree = db.numFilesInTree(root_dir[0]) totalFileSizeInTree = db.totalFileSizeInTree(root_dir[0]) print " %s #files=%d total_size=%s" % (root_dir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree)) subdirs = db.subDirectories(root_dir[0], 1, False) subdirs = sorted(subdirs, key=lambda x: x[1]) for subdir in subdirs: numFilesInTree = db.numFilesInTree(subdir[0]) totalFileSizeInTree = db.totalFileSizeInTree(subdir[0]) print " %s #files=%d total_size=%s" % (subdir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree)) if subdir[1].endswith('projects/'): projectsSubDirs = db.subDirectories(subdir[0], 1, False) projectsSubDirs = sorted(projectsSubDirs, key=lambda x: x[1]) for projectsSubDir in projectsSubDirs: numFilesInTree = db.numFilesInTree(projectsSubDir[0]) totalFileSizeInTree = db.totalFileSizeInTree(projectsSubDir[0]) print " %s #files=%d total_size=%s" % (projectsSubDir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree)) utcnow = datetime.utcnow() monthbegin = datetime(utcnow.year, utcnow.month, 1) monthend = datetime(utcnow.year, utcnow.month+1, 1) - timedelta(milliseconds=1) print '\n\n*** CHANGES THIS MONTH %s ***' % monthbegin.strftime('%Y/%m') for site in sites: root_dirs = db.rootDirectoriesForSite(site[0]) numChangedFilesInSite = db.numFilesInSite(site[0], monthbegin, monthend) if numChangedFilesInSite == 0: print '\n--- %s --- None' % (site[1],) continue totalChangedFileSizeInSite = db.totalFileSizeInSite(site[0], monthbegin, monthend) print '\n--- %s --- #files=%d total_size=%s' % (site[1], numChangedFilesInSite, humanreadablesize(totalChangedFileSizeInSite)) for root_dir in root_dirs: changedFiles = db.filesInTree(root_dir[0], monthbegin, monthend) if len(changedFiles) > 0: numFilesInTree = db.numFilesInTree(root_dir[0], monthbegin, monthend) totalFileSizeInTree = db.totalFileSizeInTree(root_dir[0], monthbegin, monthend) print " %s #files=%d total_size=%s" % (root_dir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree)) # filter unique dirs containing changed files dirsWithChangedFiles = set([(x[0], x[1]) for x in changedFiles]) # sort by name dirsWithChangedFiles = sorted(dirsWithChangedFiles, key=lambda x: x[1]) for dir in dirsWithChangedFiles: numFilesInTree = db.numFilesInTree(dir[0], monthbegin, monthend) totalFileSizeInTree = db.totalFileSizeInTree(dir[0], monthbegin, monthend) print " %s #files=%d total_size=%s" % (dir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree)) print '\n\n*** CHANGES PER MONTH ***' min_date, max_date = db.datetimeRangeOfFilesInTree() month_ranges = monthRanges(min_date, max_date) for site in sites: print '\n--- %s ---' % site[1] for month_range in month_ranges: numFilesInSite = db.numFilesInSite(site[0], month_range[0], month_range[1]) totalFileSizeInSite = db.totalFileSizeInSite(site[0], month_range[0], month_range[1]) print " %s %s %s #files=%d total_size=%s" % (site[1], month_range[0], month_range[1], numFilesInSite, humanreadablesize(totalFileSizeInSite))
def enoughResourcesAvailable(self): try: now = datetime.utcnow() bytes_sent = _getBytesSent() if bytes_sent >= self.__prev_bytes_sent: # bytes_sent might wrap around zero # compute current speed in Gbps speed = 8 * (bytes_sent - self.__prev_bytes_sent ) / totalSeconds(now - self.__prev_bytes_sent_timestamp) # running average for used_bandwidth used_bandwidth = 0.5 * speed + 0.5 * self.__prev_used_bandwidth logger.debug("resources: current used_bandwidth = %s", humanreadablesize(used_bandwidth, 'bps')) # store for next iteration self.__prev_bytes_sent = bytes_sent self.__prev_bytes_sent_timestamp = now self.__prev_used_bandwidth = used_bandwidth # only start new jobs if we have some bandwith available # note that this is a 'soft' limit. # we cannot control the actual bandwith used by the running transfers # we can only not start new jobs if we already exceed the MAX_USED_BANDWITH_TO_START_NEW_JOBS if used_bandwidth > MAX_USED_BANDWITH_TO_START_NEW_JOBS: logger.warning( 'resources: not enough bandwith available to start new jobs, using %s, max %s' % (humanreadablesize(used_bandwidth, 'bps'), humanreadablesize( MAX_USED_BANDWITH_TO_START_NEW_JOBS, 'bps'))) return False else: # wrapped around 0, just store for next iteration, do not compute anything self.__prev_bytes_sent = bytes_sent self.__prev_bytes_sent_timestamp = now # only start new jobs if we have some cpu time available idle_cpu_percentage = psutil.cpu_times_percent().idle logger.debug("resources: current idle_cpu_percentage = %s%%", idle_cpu_percentage) if idle_cpu_percentage < 5: logger.warning( 'resources: not enough cpu power available to start new jobs, cpu_idle %s%%' % idle_cpu_percentage) return False # only start new jobs if system load is not too high short_load_avg = os.getloadavg()[0] cpu_count = psutil.cpu_count() allowed_load = 1.5 * cpu_count logger.debug( "resources: current short term load = %s #cpu's = %s allowed_load = %s", short_load_avg, cpu_count, allowed_load) if short_load_avg > allowed_load: logger.warning( 'resources: system load too high (%s > %s), cannot start new jobs' % (short_load_avg, allowed_load)) return False # only allow 1 job at the time if swapping swap_memory_percentage = psutil.swap_memory().percent logger.debug("resources: current swap_memory_percentage = %s%%", swap_memory_percentage) if swap_memory_percentage > 5 and len(self.__running_jobs) > 0: logger.warning( 'resources: system swapping. not enough memory available to start new jobs' ) return False # only start new jobs if number of processes is not too high try: current_user = getpass.getuser() current_user_procs = [ p for p in psutil.process_iter() if p.username() == current_user ] current_num_user_procs = len(current_user_procs) allowed_num_user_procs = 64 * cpu_count logger.debug( "resources: current num_user_procs = %s allowed_num_user_procs = %s", current_num_user_procs, allowed_num_user_procs) if current_num_user_procs > allowed_num_user_procs: logger.warning( 'resources: number of processes by %s too high (%s > %s), cannot start new jobs' % (current_user, current_num_user_procs, allowed_num_user_procs)) return False except Exception as e: logger.exception(e) pass # limit total number of parallel transferring jobs to self.max_nr_of_parallel_jobs with self.__lock: starting_threads = [ job_thread_dict['thread'] for job_thread_dict in list(self.__running_jobs.values()) if 'pipeline' not in job_thread_dict ] pipelines = [ job_thread_dict['pipeline'] for job_thread_dict in list(self.__running_jobs.values()) if 'pipeline' in job_thread_dict ] initializing_pipelines = [ pipeline for pipeline in pipelines if pipeline.status == IngestPipeline.STATUS_INITIALIZING ] transferring_pipelines = [ pipeline for pipeline in pipelines if pipeline.status == IngestPipeline.STATUS_TRANSFERRING ] finalizing_pipelines = [ pipeline for pipeline in pipelines if pipeline.status == IngestPipeline.STATUS_FINALIZING ] num_busy_transfers = len(starting_threads) + len( initializing_pipelines) + len(transferring_pipelines) num_finalizing_transfers = len(finalizing_pipelines) logger.debug( "resources: current num_busy_transfers = %s num_finalizing_transfers = %s max_nr_of_parallel_jobs = %s", num_busy_transfers, num_finalizing_transfers, self.max_nr_of_parallel_jobs) if num_busy_transfers >= self.max_nr_of_parallel_jobs: logger.warning( 'resources: already running %d parallel jobs (#starting=%d, #transferring=%d) limiting the total number of transferring jobs to %d' % (len(self.__running_jobs), len(initializing_pipelines) + len(starting_threads), len(transferring_pipelines), self.max_nr_of_parallel_jobs)) return False if num_finalizing_transfers >= 2 * self.max_nr_of_parallel_jobs: logger.warning( 'resources: already waiting for %d jobs to finish (updating status/SIP to MoM and LTA). not starting new jobs until some jobs finished...' % (len(finalizing_pipelines), )) return False except Exception as e: logger.exception( "error while checking for available resources: %s", e) num_running_jobs = len(self.__running_jobs) if num_running_jobs <= 4: logger.info("running %d jobs, assuming we can run 1 more: ", num_running_jobs) return True else: logger.warning( "already running %d jobs, assuming for safety we cannot run more jobs...", num_running_jobs) return False return True
def _removeTaskData(self, otdb_id, delete_is=True, delete_cs=True, delete_uv=True, delete_im=True, delete_img=True, delete_pulp=True, delete_scratch=True, force=False): logger.info("Remove task data for otdb_id %s, force=%s" % (otdb_id, force)) if not isinstance(otdb_id, int): message = "Provided otdb_id is not an int" logger.error(message) return {'deleted': False, 'message': message} self._sendNotification(subject='TaskDeleting', content={'otdb_id': otdb_id}) if _isTaskDataPinned(otdb_id): message = "Task otdb_id=%s is pinned. Not deleting data." % ( otdb_id) logger.error(message) self._sendNotification(subject='TaskDeleted', content={ 'deleted': False, 'otdb_id': otdb_id, 'message': message }) return {'deleted': False, 'message': message} radbrpc = self.path_resolver.radbrpc task = radbrpc.getTask(otdb_id=otdb_id) if task: suc_tasks = radbrpc.getTasks(task_ids=task['successor_ids']) unfinished_scu_tasks = [ t for t in suc_tasks if not (t['status'] == 'finished' or t['status'] == 'obsolete') ] if unfinished_scu_tasks: message = "Task otdb_id=%s has unfinished successor tasks (otdb_ids: %s). Not deleting data." % ( task['otdb_id'], [t['otdb_id'] for t in unfinished_scu_tasks]) logger.error(message) self._sendNotification(subject='TaskDeleted', content={ 'deleted': False, 'otdb_id': otdb_id, 'message': message }) return {'deleted': False, 'message': message} momrpc = self.path_resolver.momrpc dataproducts = momrpc.getDataProducts(task['mom_id']).get( task['mom_id']) ingestable_dataproducts = [ dp for dp in dataproducts if dp['status'] not in [None, 'has_data', 'no_data', 'populated'] ] ingested_dataproducts = [ dp for dp in ingestable_dataproducts if dp['status'] == 'ingested' ] if not force: if len(ingestable_dataproducts) > 0 and len( ingested_dataproducts) < len(ingestable_dataproducts): uningested_dataproducts = [ dp for dp in ingestable_dataproducts if dp['status'] != 'ingested' ] message = "Task otdb_id=%s has un-ingested dataproducts. Not deleting data." % ( task['otdb_id'], ) logger.error(message) self._sendNotification(subject='TaskDeleted', content={ 'deleted': False, 'otdb_id': otdb_id, 'message': message }) return {'deleted': False, 'message': message} path_result = self.path_resolver.getPathForOTDBId(otdb_id) if path_result['found']: rm_results = [] if delete_is and delete_cs and delete_uv and delete_im and delete_img and delete_pulp: rm_results.append(self._removePath(path_result['path'])) else: if delete_is and self.path_resolver.pathExists( os.path.join(path_result['path'], 'is')): rm_results.append( self._removePath( os.path.join(path_result['path'], 'is'))) if delete_cs and self.path_resolver.pathExists( os.path.join(path_result['path'], 'cs')): rm_results.append( self._removePath( os.path.join(path_result['path'], 'cs'))) if delete_uv and self.path_resolver.pathExists( os.path.join(path_result['path'], 'uv')): rm_results.append( self._removePath( os.path.join(path_result['path'], 'uv'))) if delete_im and self.path_resolver.pathExists( os.path.join(path_result['path'], 'im')): rm_results.append( self._removePath( os.path.join(path_result['path'], 'im'))) if delete_img and self.path_resolver.pathExists( os.path.join(path_result['path'], 'img')): rm_results.append( self._removePath( os.path.join(path_result['path'], 'img'))) if delete_pulp and self.path_resolver.pathExists( os.path.join(path_result['path'], 'pulp')): rm_results.append( self._removePath( os.path.join(path_result['path'], 'pulp'))) if delete_scratch and 'scratch_paths' in path_result: for scratch_path in path_result['scratch_paths']: rm_results.append(self._removePath(scratch_path)) rm_result = { 'deleted': all(x['deleted'] for x in rm_results), 'paths': [x.get('path') for x in rm_results], 'message': '', 'size': sum([x.get('size', 0) or 0 for x in rm_results]) } combined_message = '\n'.join( x.get('message', '') for x in rm_results) if rm_result[ 'deleted'] and not 'does not exist' in combined_message: task_type = task.get('type', 'task') if task else 'task' rm_result[ 'message'] = 'Deleted %s of data from disk for %s with otdb_id %s\n' % ( humanreadablesize( rm_result['size']), task_type, otdb_id) rm_result['message'] += combined_message self._sendNotification(subject='TaskDeleted', content={ 'deleted': rm_result['deleted'], 'otdb_id': otdb_id, 'paths': rm_result['paths'], 'message': rm_result['message'], 'size': rm_result['size'], 'size_readable': humanreadablesize(rm_result['size']) }) self._endStorageResourceClaim(otdb_id) return rm_result return {'deleted': False, 'message': path_result['message']}
def TransferFile(self): try: logger.info('Starting file transfer for %s ' % self.JobId) start = time.time() self.status = IngestPipeline.STATUS_TRANSFERRING self.__sendNotification('JobProgress', message='transfer starting', percentage_done=0.0, total_bytes_transfered=0) local_ip = hostnameToIp(self.hostname) if 'cep4' in self.HostLocation.lower() or 'cpu' in self.HostLocation.lower(): self.HostLocation = 'localhost' if 'locus' in self.HostLocation.lower() and not '.' in self.HostLocation.lower(): self.HostLocation += '.cep2.lofar' def progress_callback(percentage_done, current_speed, total_bytes_transfered): self.__sendNotification('JobProgress', percentage_done=min(100.0, round(10.0*percentage_done)/10.0), current_speed=current_speed, total_bytes_transfered=total_bytes_transfered) if (os.path.splitext(self.Location)[-1] == '.h5' and os.path.splitext(os.path.basename(self.Location))[0].endswith('_bf')): logger.info('dataproduct is a beamformed h5 file. adding raw file to the transfer') self.Location = [self.Location, self.Location.replace('.h5', '.raw')] if self.DataProduct not in self.Location and 'Source' in self.job: # old hack, is needed to support dynspec / pulsar archiving scripts self.Location = os.path.join(self.Location, self.job['Source']) cp = LtaCp(self.HostLocation, self.Location, self.PrimaryUri, self.user, local_ip=local_ip, globus_timeout=self.globus_timeout, progress_callback=progress_callback) transfer_result = cp.transfer(force=True) self.status = IngestPipeline.STATUS_FINALIZING if not transfer_result: msg = 'error while transferring %s with ltacp' % (self.JobId) logger.error(msg) raise Exception(msg) self.MD5Checksum = transfer_result[0] self.Adler32Checksum = transfer_result[1] self.FileSize = transfer_result[2] if self.MD5Checksum and self.Adler32Checksum and self.FileSize: logger.debug('valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s', self.JobId, self.FileSize, humanreadablesize(int(self.FileSize), 'B'), self.MD5Checksum, self.Adler32Checksum) else: msg = 'no valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s' % (self.JobId, self.FileSize, humanreadablesize(int(self.FileSize), 'B'), self.MD5Checksum, self.Adler32Checksum) logger.error(msg) raise Exception(msg) try: self.__sendNotification('JobProgress', message='transfer finished', percentage_done=100.0, total_bytes_transfered=int(self.FileSize)) except ValueError: pass elapsed = time.time() - start try: if int(self.FileSize) > 0: avgSpeed = float(self.FileSize) / elapsed logger.info("Finished file transfer for %s in %d sec with an average speed of %s for %s including ltacp overhead" % (self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), humanreadablesize(float(self.FileSize), 'B'))) except Exception: logger.info('Finished file transfer of %s in %s' % (self.JobId, elapsed)) except Exception as exp: if isinstance(exp, LtacpException): if '550 File not found' in exp.value: logger.error('Destination directory does not exist. Creating %s in LTA for %s' % (self.PrimaryUri, self.JobId)) if create_missing_directories(self.PrimaryUri) == 0: logger.info('Created path %s in LTA for %s' % (self.PrimaryUri, self.JobId)) elif 'source path' in exp.value and 'does not exist' in exp.value: raise PipelineError(exp.value, PipelineNoSourceError) raise Exception('transfer failed for %s: %s' % (self.JobId, str(exp)))