Example #1
def printResources(resources, scaled_units=True):
    """ E.g.: resources = [{u'total_capacity': 3774873600, u'name': u'dragproc_bandwidth:/data', u'type_id': 3,
                            u'available_capacity': 3774873600, u'type_name': u'bandwidth', u'unit_id': 3,
                            u'active': True, u'used_capacity': 0, u'id': 118, u'unit': u'bits/second',
                            'claimable_capacity': 3774873600}, ...]  # this key was added (not from RADB); it can be negative!
    """
    header = {'id': 'RId', 'name': 'Resource Name', 'active': 'Active',
              'available_capacity': ' Avail. Capacity', 'claimable_capacity': '  Claimable Cap.',
              'total_capacity': '  Total Capacity', 'unit': 'Unit'}
    print('{id:4s} {name:24s} {active:6s} {available_capacity} {claimable_capacity} {total_capacity} {unit}'.format(**header))
    print('===================================================================================================')
    resources.sort(key=lambda r: r['id'])  # SQL could have done this better
    for res in resources:
        res['active'] = 'True' if res['active'] else 'False'  # to solve bool formatting issue

        if scaled_units and (res['type_name'] == 'storage' or res['type_name'] == 'bandwidth'):
            unit_base = 1024 if res['type_name'] == 'storage' else 1000  # check type_name instead of unit as in printClaims()
            res['available_capacity'] = humanreadablesize(res['available_capacity'], '', unit_base)
            res['claimable_capacity'] = humanreadablesize(res['claimable_capacity'], '', unit_base)
            res['total_capacity']     = humanreadablesize(res['total_capacity']    , '', unit_base)
            cap_conv = '>16s'
        else:
            cap_conv = '16d'

        print(('{id:4d} {name:24s} {active:6s} {available_capacity:' + cap_conv +
               '} {claimable_capacity:' + cap_conv + '} {total_capacity:' + cap_conv + '} {unit}').format(**res))
    if not resources:
        print('<no resources>')
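
Note: every example on this page calls a humanreadablesize helper that is not shown in the snippets themselves. Its real implementation is not part of this listing; judging only from how it is called (a value, an optional unit suffix, and an optional base of 1000 or 1024), a minimal sketch could look like this (an illustration under those assumptions, not the LOFAR implementation):

def humanreadablesize(num, unit='B', base=1000):
    """ Illustrative sketch: scale num by powers of base and append an SI-style prefix plus the unit. """
    try:
        for prefix in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
            if abs(num) < float(base):
                return '%.1f%s%s' % (num, prefix, unit)
            num /= float(base)
        return '%.1f%s%s' % (num, 'Y', unit)
    except TypeError:
        return str(num)
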
Example #2
    def _updateCEP4CapacitiesInRADB(self):
        try:
            df_result = self.disk_usage.getDiskFreeSpace()
            if df_result['found']:
                #get the total used space, and update the resource availability in the radb
                radbrpc = self.disk_usage.path_resolver.radbrpc
                storage_resources = radbrpc.getResources(
                    resource_types='storage', include_availability=True)
                cep4_storage_resource = next(x for x in storage_resources
                                             if 'CEP4' in x['name'])

                total_capacity = df_result.get('disk_size')
                used_capacity = df_result.get('disk_usage')
                available_capacity = df_result.get('disk_free')

                logger.info(
                    'updating capacities for resource \'%s\' (id=%s) in the RADB: total=%s, used=%s, available=%s',
                    cep4_storage_resource['name'], cep4_storage_resource['id'],
                    humanreadablesize(total_capacity),
                    humanreadablesize(used_capacity),
                    humanreadablesize(available_capacity))

                radbrpc.updateResourceAvailability(
                    cep4_storage_resource['id'],
                    available_capacity=available_capacity,
                    total_capacity=total_capacity)
        except Exception as e:
            logger.error('_updateCEP4CapacitiesInRADB: %s', e)
Example #3
def index():
    # TODO: serve html first, and let client request data via ajax
    usages = {}

    sites = db.sites()
    sites2 = [x for x in sites if x[1] != 'nikhef']
    sites = [sites2[0], sites2[2], sites2[1]]

    total = 0.0
    numFiles = 0
    for site in sites:
        site_usage = float(db.totalFileSizeInSite(site[0]))
        usages[site[1]] = site_usage
        total += site_usage
        numFiles += db.numFilesInSite(site[0])

    storagesitedata='[' + ', '.join(['''{name: "%s %s", y: %.1f}''' % (site[1], humanreadablesize(usages[site[1]]), 100.0*usages[site[1]]/total) for site in sites]) + ']'

    min_date, max_date = db.datetimeRangeOfFilesInTree()
    min_date = datetime(2012, 1, 1)
    month_ranges = monthRanges(min_date, max_date)

    # convert end-of-month timestamps to milliseconds since epoch
    epoch = datetime.utcfromtimestamp(0)
    datestamps=[('%d' % ((x[1] - epoch).total_seconds()*1000,)) for x in month_ranges]

    usage_per_month_series='['
    deltas_per_month_series='['
    for site in sites:
        cumulatives = [db.totalFileSizeInSite(site[0], to_date=mr[1]) for mr in month_ranges]

        data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, cumulatives)])
        usage_per_month_series += '''{name: '%s', data: [%s]},\n''' % (site[1], data)

        deltas = [0]
        for i in range(1, len(cumulatives)):
            delta = cumulatives[i] - cumulatives[i-1]
            deltas.append(delta)

        data = ', '.join(['[%s, %s]' % (x[0], str(x[1])) for x in zip(datestamps, deltas)])
        deltas_per_month_series += '''{name: '%s', data: [%s]},\n''' % (site[1], data)


    usage_per_month_series+=']'
    deltas_per_month_series+=']'

    return render_template('index.html',
                           title='LTA storage overview',
                           storagesitetitle='LTA Storage Site Usage',
                           storagesitesubtitle='Total: %s #dataproducts: %s' % (humanreadablesize(total, 'B', 1000), humanreadablesize(numFiles, '', 1000)),
                           storagesitedata=storagesitedata,
                           usage_per_month_series=usage_per_month_series,
                           deltas_per_month_series=deltas_per_month_series,
                           data_gathered_timestamp=db.mostRecentVisitDate().strftime('%Y/%m/%d %H:%M:%S'))
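
Examples 3 and 17 also rely on a monthRanges helper that is not included in this listing. Going only by how its result is used (each element behaves as a (month_start, month_end) datetime pair, and Example 17 passes an extra step argument), a rough, hypothetical sketch might be:

from datetime import datetime, timedelta

def monthRanges(min_date, max_date, month_step=1):
    """ Hypothetical sketch: return (start, end) datetime tuples covering whole
        months (or month_step-sized blocks) from min_date up to max_date. """
    ranges = []
    year, month = min_date.year, min_date.month
    while datetime(year, month, 1) <= max_date:
        start = datetime(year, month, 1)
        # first day of the next block, minus one second, is the block end
        next_month_index = (month - 1) + month_step
        next_year = year + next_month_index // 12
        next_month = next_month_index % 12 + 1
        end = datetime(next_year, next_month, 1) - timedelta(seconds=1)
        ranges.append((start, end))
        year, month = next_year, next_month
    return ranges
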
Example #4
    def _log_job_notification(self, subject: str, job_dict: dict):
        if self._log_subject_filters and subject not in self._log_subject_filters:
            return

        try:
            if subject in [
                    'JobProgress', 'TransferServiceStatus', 'TaskProgress',
                    'TaskFinished'
            ]:
                msg = "ingest %s " % subject
            else:
                msg = 'ingest job status changed to %s. ' % (subject, )

            msg += 'project: %s export_id: %s type: %s server: %s' % (
                job_dict.get('project'), job_dict.get('export_id'),
                job_dict.get('type'), job_dict.get('ingest_server'))
            if job_dict.get('archive_id'):
                msg += ' archive_id: %s' % job_dict.get('archive_id')

            if job_dict.get('dataproduct'):
                msg += ' dp: %s' % job_dict.get('dataproduct')

            if job_dict.get('lta_site'):
                msg += ' lta_site: %s' % job_dict.get('lta_site')

            if job_dict.get('otdb_id'):
                msg += ' otdb_id: %s' % job_dict.get('otdb_id')

            if job_dict.get('percentage_done') is not None:
                msg += ' progress: %s%%' % (round(
                    10.0 * float(job_dict.get('percentage_done'))) / 10.0)

            if job_dict.get('total_bytes_transfered') is not None:
                msg += ' transferred: %s' % humanreadablesize(
                    job_dict.get('total_bytes_transfered'))

            if job_dict.get('average_speed') is not None:
                msg += ' avg speed: %s' % humanreadablesize(
                    job_dict.get('average_speed'), 'Bps')

            if job_dict.get('srm_url'):
                msg += ' srm_url: %s' % job_dict.get('srm_url')

            if job_dict.get('message'):
                msg += ' message: %s' % job_dict.get('message')

            logger.log(
                logging.WARNING if 'Failed' in subject else logging.INFO, msg)
        except Exception as e:
            logger.error(e)
Example #5
def printClaims(claims, scaled_units=True):
    """ E.g.: claims = [{u'claim_size': 76441190400, u'endtime': datetime.datetime(2018, 6, 13, 17, 40),
                         u'id': 67420, u'resource_id': 122, u'resource_name': u'drg01_storage:/data1',
                         u'resource_type_id': 5, u'resource_type_name': u'storage',
                         u'starttime': datetime.datetime(2017, 6, 13, 17, 30),
                         u'status': u'claimed', u'status_id': 1, u'task_id': 75409, ...}, ...]
    """
    header = {'id': 'ClId', 'resource_name': 'Resource Name', 'starttime': 'Start Time', 'endtime': 'End Time',
              'claim_size': 'Claim Size', 'status': 'Status'}
    print('{id:7s} {resource_name:24s} {starttime:19s} {endtime:19s} {claim_size:16s} {status:8s}'.format(**header))
    print('===================================================================================================')
    claims.sort(key=lambda c: c['id'])         # secondary sorting key; SQL could have done this better
    claims.sort(key=lambda c: c['starttime'])  # primary sorting key (stable sort)
    for claim in claims:
        if scaled_units and (claim['resource_type_name'] == 'storage' or claim['resource_type_name'] == 'bandwidth'):
            unit_base = 1024 if claim['resource_type_name'] == 'storage' else 1000  # no unit name here, so check type_name
            claim['claim_size'] = humanreadablesize(claim['claim_size'], '', unit_base)
            size_conv = '>16s'
        else:
            size_conv = '16d'

        print(('{id:7d} {resource_name:24s} {starttime} {endtime} {claim_size:' + size_conv +
               '} {status:8s}').format(**claim))
    if not claims:
        print('<no claims on specified resources and time range>')
Example #6
    def SendSIP(self, job_id, sip, storage_ticket):
        try:
            logger.info("LTAClient.SendSIP sending sip of %s for %s ...",
                        humanreadablesize(len(sip)), job_id)
            start = time.time()
            # check the SIP before we upload it to the LTA
            if not validateSIPAgainstSchema(sip, log_prefix=str(job_id)):
                raise Exception('SIP for %s does not validate against schema' %
                                job_id)

            if not checkSIPContent(sip,
                                   log_prefix=str(job_id),
                                   storage_ticket=storage_ticket):
                raise Exception('SIP for %s has invalid content' % job_id)

            # sip ok, do upload
            result = self.__rpc.TransmitSIP(sip, storage_ticket)
            if time.time() - start > 2:
                logger.info("LTAClient.SendSIP for %s took %ds", job_id,
                            time.time() - start)
        except xmlrpc.client.Fault as err:
            logger.error('LTAClient.SendSIP Received XML-RPC Fault: %s %s' %
                         (err.faultCode, self.__hidePassword(err.faultString)))
            raise
        if result['result'] == 'ok':
            logger.info(
                'LTAClient.SendSIP: Successfully sent SIP of %s for %s to the LTA',
                humanreadablesize(len(sip)), job_id)
        elif result.get('error'):
            if "could not use SIP" in result['error']:
                if 'is already in database with isValid 1 and Ticket state 40' in result[
                        'error']:
                    #sip is already in lta. issue warning, and continue, don't fail
                    logger.warning(result['error'])
                else:
                    msg = 'LTAClient.SendSIP: Invalid SIP according to LTA catalog for %s: %s' % (
                        job_id, result['error'])
                    logger.error(msg)
                    raise Exception(msg)
            else:
                raise Exception(
                    'LTAClient.SendSIP: Got Tier 1 TransmitSIP error for %s: %s'
                    % (job_id, result['error']))
Example #7
def getDiskFreeSpaceForMountpoint(mountpoint=CEP4_DATA_MOUNTPOINT):
    logger.info('getDiskFreeSpaceForMountpoint(\'%s\')', mountpoint)

    result = {'found': False, 'mountpoint': mountpoint}

    cmd = ['df', mountpoint]
    cmd = wrap_command_in_cep4_head_node_ssh_call_if_needed(cmd)
    logger.info(' '.join(cmd) + ' ...waiting for result...')

    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = communicate_returning_strings(proc)

    if proc.returncode != 0:
        logger.error(out + err)
        result['message'] = out
        return result

    # example of out
    # Filesystem                                         1K-blocks          Used     Available Use% Mounted on
    # 10.134.233.65@o2ib:10.134.233.66@o2ib:/cep4-fs 3369564904320 1460036416928 1737591103048  46% /data

    #parse out
    lines = [l.strip() for l in out.split('\n')]
    data_line = next((l for l in lines if mountpoint in l), None)
    if data_line:
        parts = data_line.split()  # split on any whitespace; the df columns are padded with multiple spaces

        result['found'] = True
        result['disk_size'] = 1024 * int(parts[1])
        result['disk_usage'] = 1024 * int(parts[2])
        result['disk_free'] = 1024 * int(parts[3])

        result['disk_size_readable'] = humanreadablesize(result['disk_size'])
        result['disk_usage_readable'] = humanreadablesize(result['disk_usage'])
        result['disk_free_readable'] = humanreadablesize(result['disk_free'])

    logger.info('getDiskFreeSpaceForMountpoint(\'%s\') returning: %s',
                mountpoint, result)
    return result
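
For reference, feeding the sample df output quoted in the comments above through this parsing would yield capacities of 1024 times the reported 1K-block counts (illustrative numbers taken from that sample only):

# illustrative values, derived from the sample 'df' output in the comments above
# result['disk_size']  == 1024 * 3369564904320   # ~3.4 PB total
# result['disk_usage'] == 1024 * 1460036416928   # ~1.5 PB used
# result['disk_free']  == 1024 * 1737591103048   # ~1.8 PB free
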
Example #8
def getDiskUsageForPath_du(path):
    logger.info('getDiskUsageForPath_du(\'%s\')', path)

    result = {
        'found': False,
        'path': path,
        'disk_usage': None,
        'name': path.split('/')[-1]
    }

    cmd = ['du', '-bcs', path]
    cmd = wrap_command_in_cep4_head_node_ssh_call_if_needed(cmd)
    logger.info(' '.join(cmd))

    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = communicate_returning_strings(proc)

    if proc.returncode == 0:
        # example of out
        # 7025510839      /data/projects/HOLOG_WINDMILL_TESTS/L662734/uv/
        # 7025510839      total

        #parse out
        lines = [l.strip() for l in out.split('\n')]
        total_lines = [l for l in lines if 'total' in l]
        if total_lines:
            parts = [p.strip() for p in total_lines[0].split()]
            if len(parts) == 2:
                result['found'] = True
                result['disk_usage'] = int(parts[0])
                result['nr_of_files'] = None
    else:
        result['message'] = out + err
        result['found'] = False

        if 'No such file or directory' in err:
            logger.warning('No such file or directory: %s', path)
            result['disk_usage'] = 0
        else:
            logger.error(out + err)

    result['disk_usage_readable'] = humanreadablesize(result['disk_usage'])

    otdb_id = getOTDBIdFromPath(path)
    if otdb_id:
        result['otdb_id'] = otdb_id

    logger.info('getDiskUsageForPath_du(\'%s\') returning: %s', path, result)
    return result
Example #9
def get_sites_usages():
    sites = {'sites_usages': db.sites()}

    for site in sites['sites_usages']:
        rootDirs = db.rootDirectoriesForSite(site['id'])

        site_usage = 0
        for rootDir in rootDirs:
            usage = int(db.totalFileSizeInTree(rootDir['dir_id']))
            site_usage += usage
        site['usage'] = site_usage
        site['usage_hr'] = humanreadablesize(site_usage)

    return json.jsonify(sites)
Example #10
def get_sites_usages():
    sites = {'sites_usages': [{'id': x[0],
                               'name': x[1]} for x in db.sites()]}

    for site in sites['sites_usages']:
        rootDirs = db.rootDirectoriesForSite(site['id'])

        site_usage = 0
        for rootDir in rootDirs:
            usage = int(db.totalFileSizeInTree(rootDir[0]))
            site_usage += usage
        site['usage'] = site_usage
        site['usage_hr'] = humanreadablesize(site_usage)

    return json.jsonify(sites)
Example #11
    def _updateCache(self, du_result, send_notification=True):
        if 'path' not in du_result:
            return

        path = du_result['path']
        otdb_id = du_result.get('otdb_id')

        with self._cacheLock:
            path_cache = self._cache['path_du_results']
            otdb_id2path_cache = self._cache['otdb_id2path']

            if otdb_id is None:
                # try to look up the otdb_id in the path cache
                if path in path_cache:
                    otdb_id = path_cache[path].get('otdb_id')

                # if still None, try to get the id from the path
                if otdb_id is None:
                    otdb_id = getOTDBIdFromPath(path)

            if path not in path_cache or path_cache[path]['disk_usage'] != du_result['disk_usage']:
                # update the cache entry, even when no du result was found,
                # because that will save disk queries next time.
                logger.info('updating cache entry: %s', du_result)
                path_cache[path] = du_result

            if otdb_id is not None:
                otdb_id2path_cache[otdb_id] = path

            if not du_result['found']:
                # even when the du for the path is not found,
                # keep a copy in the cache for fast lookup by clients
                # Make sure the size is 0
                du_result['disk_usage'] = 0
                du_result['disk_usage_readable'] = humanreadablesize(0)

            path_cache[path]['cache_timestamp'] = datetime.datetime.utcnow()
            path_cache[path]['needs_update'] = False

        self._writeCacheToDisk()

        if send_notification:
            self._sendDiskUsageChangedNotification(path,
                                                   du_result['disk_usage'],
                                                   otdb_id)
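
The cache layout this method works against (a dict of du-results keyed by path, plus an otdb_id-to-path index) is not shown in the snippet. Based purely on the keys used above, an empty cache would be initialised roughly like this sketch (an assumption, not the actual service code):

self._cache = {'path_du_results': {},   # path -> du_result dict ('disk_usage', 'cache_timestamp', 'needs_update', ...)
               'otdb_id2path': {}}      # otdb_id -> path
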
Example #12
    def _handleDiskUsageChange(self, disk_usage, otdb_id):
        if otdb_id is not None:
            task = self._radbrpc.getTask(otdb_id=otdb_id)
            if task:
                du_readable = humanreadablesize(disk_usage)
                logger.info(
                    'disk_usage change: otdb_id %s radb_id %s disk_usage %s %s',
                    otdb_id, task['id'], disk_usage, du_readable)
                task['disk_usage'] = disk_usage
                task['disk_usage_readable'] = du_readable

                task_change = {
                    'changeType': CHANGE_UPDATE_TYPE,
                    'objectType': 'task',
                    'value': task
                }
                self._handleChange(task_change)
Example #13
 def _sendDiskUsageChangedNotification(self,
                                       path,
                                       disk_usage,
                                       otdb_id=None):
     try:
         msg = EventMessage(subject='%s.DiskUsageChanged' % DEFAULT_DM_NOTIFICATION_PREFIX,
                            content={'path': path,
                                     'disk_usage': disk_usage,
                                     'disk_usage_readable': humanreadablesize(disk_usage),
                                     'otdb_id': otdb_id})
         logger.info('Sending notification with subject %s to %s: %s',
                     msg.subject, self.event_bus.exchange, msg.content)
         self.event_bus.send(msg)
     except Exception as e:
         logger.error(str(e))
Example #14
    def run(self):
        with self.event_bus:
            while True:
                try:
                    self.__clearFinishedJobs()

                    with self.__lock:
                        starting_threads = [
                            job_thread_dict['thread'] for job_thread_dict in
                            list(self.__running_jobs.values())
                            if 'pipeline' not in job_thread_dict
                        ]
                        pipelines = [
                            job_thread_dict['pipeline'] for job_thread_dict in
                            list(self.__running_jobs.values())
                            if 'pipeline' in job_thread_dict
                        ]
                        initializing_pipelines = [
                            pipeline for pipeline in pipelines
                            if pipeline.status ==
                            IngestPipeline.STATUS_INITIALIZING
                        ]
                        transferring_pipelines = [
                            pipeline for pipeline in pipelines
                            if pipeline.status ==
                            IngestPipeline.STATUS_TRANSFERRING
                        ]
                        finalizing_pipelines = [
                            pipeline for pipeline in pipelines if
                            pipeline.status == IngestPipeline.STATUS_FINALIZING
                        ]
                        finished_pipelines = [
                            pipeline for pipeline in pipelines if
                            pipeline.status == IngestPipeline.STATUS_FINISHED
                        ]
                        log_interval = 5 if self.__running_jobs else 60

                    if datetime.utcnow() - self.__running_jobs_log_timestamp > timedelta(seconds=log_interval):
                        status_log_line = "status: running %s jobs: #starting=%d, #transferring=%d, #finalizing=%d, #finished=%d, bandwidth used on network interface(s) %s %s (%s), load=%.1f" % (
                            len(self.__running_jobs),
                            len(initializing_pipelines) +
                            len(starting_threads), len(transferring_pipelines),
                            len(finalizing_pipelines), len(finished_pipelines),
                            NET_IF_TO_MONITOR,
                            humanreadablesize(self.__prev_used_bandwidth,
                                              'bps'),
                            humanreadablesize(self.__prev_used_bandwidth / 8,
                                              'Bps'), os.getloadavg()[0])

                        logger.info(status_log_line)
                        self.__running_jobs_log_timestamp = datetime.utcnow()

                        msg = EventMessage(subject="%s.%s" %
                                           (INGEST_NOTIFICATION_PREFIX,
                                            'TransferServiceStatus'),
                                           content={
                                               'ingest_server':
                                               socket.gethostname(),
                                               'message':
                                               status_log_line
                                           })
                        msg.ttl = 3600  # remove message from the queues when not picked up within 1 hour
                        self.event_bus.send(msg)

                    time.sleep(5)
                except KeyboardInterrupt:
                    break
                except Exception as e:
                    logger.error(e)
Example #15
 def __str__(self):
     return self.filename + " " + humanreadablesize(self.size) + " " + str(self.created_at)
Example #16
 def __str__(self):
     return "LocationResult: path=%s # subdirs=%d # files=%d totalFileSizeOfDir=%s" % (self.location.path(), self.nrOfSubDirs(), self.nrOfFiles(), humanreadablesize(self.totalFileSizeOfDir()))
Example #17
def index():
    # TODO: serve html first, and let client request data via ajax
    usages = {}

    colors = {
        'sara': {
            'used': '#90ed7d',
            'free': '#c5f6bc'
        },
        'juelich': {
            'used': '#494950',
            'free': '#a1a1aa'
        },
        'poznan': {
            'used': '#7cb5ec',
            'free': '#bcdaf5'
        }
    }

    sites = db.sites()
    sitesDict = {s['name']: s for s in sites}
    sites = [
        sitesDict[sitename] for sitename in ['poznan', 'juelich', 'sara']
        if sitename in sitesDict
    ]

    total_lta_size = 0.0
    total_lta_num_files = 0
    for site in sites:
        totals = db.totalFileSizeAndNumFilesInSite(site['id'])
        total_lta_size += totals['tree_total_file_size']
        total_lta_num_files += totals['tree_num_files']
        usages[site['name']] = totals['tree_total_file_size']

    if total_lta_size > 0:
        storagesitedata = '[' + ', '.join([
            '''{name: "%s %s", color:'%s', y: %.2f}''' %
            (site['name'], humanreadablesize(
                usages[site['name']]), colors[site['name']]['used'],
             100.0 * usages[site['name']] / total_lta_size) for site in sites
        ]) + ']'
    else:
        storagesitedata = '[]'

    min_date, max_date = db.datetimeRangeOfFilesInTree()
    logger.info("datetimeRangeOfFilesInTree: min_date=%s, max_date=%s",
                min_date, max_date)
    if min_date is None:
        min_date = datetime(2012, 1, 1)
    if max_date is None:
        max_date = datetime.utcnow()
    min_date = max(datetime(2012, 1, 1), min_date)
    logger.info("month_ranges: min_date=%s, max_date=%s", min_date, max_date)
    month_ranges = monthRanges(min_date, max_date, 1)

    # convert end-of-month timestamps to milliseconds since epoch
    epoch = datetime.utcfromtimestamp(0)
    datestamps = [('%d' % ((x[1] - epoch).total_seconds() * 1000, ))
                  for x in month_ranges]

    usage_per_month_series = '['
    deltas_per_month_series = '['
    for site in sites:
        deltas_per_month = [
            db.totalFileSizeInSite(site['id'], from_date=mr[0], to_date=mr[1])
            for mr in month_ranges
        ]
        data = ', '.join([
            '[%s, %s]' % (x[0], str(x[1]))
            for x in zip(datestamps, deltas_per_month)
        ])
        deltas_per_month_series += '''{name: '%s', color:'%s', data: [%s]},\n''' % (
            site['name'], colors[site['name']]['used'], data)

        cumulatives = [deltas_per_month[0]]
        for delta in deltas_per_month[1:]:
            cumulative = cumulatives[-1] + delta
            cumulatives.append(cumulative)

        data = ', '.join([
            '[%s, %s]' % (x[0], str(x[1]))
            for x in zip(datestamps, cumulatives)
        ])
        usage_per_month_series += '''{name: '%s', color:'%s', data: [%s]},\n''' % (
            site['name'], colors[site['name']]['used'], data)

    usage_per_month_series += ']'
    deltas_per_month_series += ']'

    quota_dir_stats = db.siteQuotaRootDirStats()
    quota_dir_stats = sorted(quota_dir_stats,
                             reverse=True,
                             key=lambda x: x['tree_total_file_size'])

    site_usages_per_site = {}
    latest_usages_per_site = {}
    for site_usage in db.siteQuotaUsages():
        site_name = site_usage['site_name']
        if site_name not in site_usages_per_site:
            site_usages_per_site[site_name] = []
        site_usages_per_site[site_name].append(site_usage)
        if site_name not in latest_usages_per_site:
            latest_usages_per_site[site_name] = site_usage
        if site_usage['valid_until_date'] > latest_usages_per_site[site_name][
                'valid_until_date']:
            latest_usages_per_site[site_name] = site_usage

    quota_series = '['
    storagesite_free_space = '['
    site_tape_usages_table = '<table>\n'
    site_tape_usages_table += '<tr><th style="text-align: left;">site</th><th style="text-align: left;">directory</th><th>total #files</th><th>total file size</th><th>quota</th><th>free</th><th>expiration</th></tr>\n'
    total_lta_free_space = sum(u['space_left']
                               for u in list(latest_usages_per_site.values())
                               if u['space_left'] > 0)
    total_lta_quota = sum(u['quota']
                          for u in list(latest_usages_per_site.values()))

    for site_name in ['sara', 'juelich', 'poznan']:
        if site_name in latest_usages_per_site:
            latest_usage = latest_usages_per_site[site_name]
            site_tape_usages_table += '<tr style="font-weight: bold;"><td style="text-align: left;">%s</td><td style="text-align: left;">%s</td><td>%s</td><td>%s</td><td>%s</td><td %s>%s</td><td>%s</td></tr>\n' % (
                latest_usage['site_name'], '', latest_usage['num_files'],
                humanreadablesize(latest_usage['total_file_size']),
                humanreadablesize(latest_usage['quota']), 'style="color: red;"'
                if latest_usage['space_left'] < 0 else '',
                humanreadablesize(latest_usage['space_left']),
                latest_usage['valid_until_date'])

            for qds in quota_dir_stats:
                if qds['site_name'] == site_name:
                    site_tape_usages_table += '<tr><td style="text-align: left;">%s</td><td style="text-align: left;">%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % (
                        '', qds['dir_name'], qds['tree_num_files'],
                        humanreadablesize(
                            qds['tree_total_file_size']), '', '', '')

            storagesite_free_space += '''{name: "%s %s", color:'%s', y: %.2f}, ''' % (
                site_name, humanreadablesize(
                    latest_usage['space_left']), colors[site_name]['free'],
                max(0, 100.0 * latest_usage['space_left']) /
                total_lta_free_space)

    site_tape_usages_table += '</table>\n'

    for site_name in ['poznan', 'juelich', 'sara']:
        if site_name in site_usages_per_site:
            site_usages_for_site = site_usages_per_site[site_name]
            site_usages_for_site = sorted(site_usages_for_site,
                                          key=lambda x: x['valid_until_date'])
            data = ','.join('[%d, %s]' %
                            ((su['valid_until_date'] - epoch).total_seconds() *
                             1000, su['space_left'])
                            for su in site_usages_for_site)
            quota_series += '''{ name:'%s_free', stack:'%s', color:'%s', data:[%s] },''' % (
                site_name, site_name, colors[site_name]['free'], data)
            data = ','.join('[%d, %s]' %
                            ((su['valid_until_date'] - epoch).total_seconds() *
                             1000, su['total_file_size'])
                            for su in site_usages_for_site)
            quota_series += '''{ name:'%s_used', stack:'%s', color:'%s', data:[%s] },''' % (
                site_name, site_name, colors[site_name]['used'], data)

    quota_series += ']'
    storagesite_free_space += ']'

    return render_template(
        'index.html',
        title='LTA storage overview',
        storagesitetitle='LTA Storage Site Usage',
        storagesitesubtitle='Total: %s #dataproducts: %s' %
        (humanreadablesize(total_lta_size, 'B', 1000),
         humanreadablesize(total_lta_num_files, '', 1000)),
        storagesite_free_space_title='LTA Storage Site Free Space',
        storagesite_free_space_subtitle=
        'Total free space: %s Current total quota: %s' %
        (humanreadablesize(total_lta_free_space, 'B', 1000),
         humanreadablesize(total_lta_quota, 'B', 1000)),
        storagesitedata=storagesitedata,
        storagesite_free_space=storagesite_free_space,
        usage_per_month_series=usage_per_month_series,
        deltas_per_month_series=deltas_per_month_series,
        quota_series=quota_series,
        site_tape_usages=site_tape_usages_table,
        data_gathered_timestamp=db.mostRecentVisitDate().strftime(
            '%Y/%m/%d %H:%M:%S'))
Example #18
    def run(self):
        with self.event_bus:
            try:
                logger.info("starting ingestpipeline for %s" % self.JobId)
                start = time.time()
                self.__sendNotification('JobStarted')

                self.GetStorageTicket()
                self.CheckForValidSIP()
                self.TransferFile()
                self.SendChecksumsToLTA()
                self.GetSIP()
                self.SendSIPToLTA()
                self.SendStatusToLTA(IngestSuccessful)

                avgSpeed = 0
                elapsed = time.time() - start
                try:
                    avgSpeed = float(self.FileSize) / elapsed
                    logger.info("Ingest Pipeline finished for %s in %d sec with average speed of %s for %s including all overhead",
                                self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), humanreadablesize(float(self.FileSize), 'B'))
                except Exception:
                    logger.info("Ingest Pipeline finished for %s in %d sec", self.JobId, elapsed)

                self.__sendNotification('JobFinished',
                                        average_speed=avgSpeed,
                                        total_bytes_transfered=int(self.FileSize))

            except PipelineError as pe:
                logger.log(logging.WARNING if pe.type == PipelineAlreadyInLTAError else logging.ERROR,
                           'Encountered PipelineError for %s : %s', self.JobId, str(pe))
                if pe.type == PipelineNoSourceError:
                    self.__sendNotification('JobTransferFailed', 'data not transferred because it was not on disk')
                elif pe.type == PipelineAlreadyInLTAError:
                    self.__sendNotification('JobFinished', 'data was already in the LTA',
                                            average_speed=0,
                                            total_bytes_transfered=0)
                else:
                    self.RollBack()

                    # by default the error_message for the notification is the exception
                    error_message = str(pe)
                    # for known messages in the exception, make a nice readable error_message
                    if 'MoM returned login screen instead of SIP' in error_message:
                        error_message = 'MoM returned login screen instead of SIP'

                    self.__sendNotification('JobTransferFailed', error_message)

                try:
                    if pe.type != PipelineAlreadyInLTAError:
                        self.SendStatusToLTA(IngestFailed)
                except Exception as e:
                    logger.error('SendStatusToLTA failed for %s: %s', self.JobId, e)

            except Exception as e:
                logger.error('Encountered unexpected error for %s: %s', self.JobId, e)

                # by default the error_message for the notification is the exception
                error_message = str(e)
                # for known messages in the exception, make a nice readable error_message
                if 'ltacp' in error_message and ('file listing failed' in error_message or 'du failed' in error_message):
                    error_message = 'dataproduct %s not found at location %s:%s' % (self.DataProduct, self.HostLocation, self.Location)
                elif 'does not validate against schema' in error_message:
                    error_message = 'invalid SIP does not validate against schema'

                try:
                    self.RollBack()
                except Exception as rbe:
                    logger.error('RollBack failed for %s: %s', self.JobId, rbe)
                try:
                    self.SendStatusToLTA(IngestFailed)
                except Exception as sse:
                    logger.error('SendStatusToLTA failed for %s: %s', self.JobId, sse)
                try:
                    self.__sendNotification('JobTransferFailed', error_message)
                except Exception as sne:
                    logger.error('sendNotification failed for %s: %s', self.JobId, sne)
            finally:
                self.status = IngestPipeline.STATUS_FINISHED
Example #19
    def transfer(self, force=False, dereference=False):
        starttime = datetime.utcnow()

        # for cleanup
        self.started_procs = {}
        self.fifos = []

        try:
            if not self.source_exists():
                if self.can_logon_to_source_host():
                    if self.source_mounted():
                        raise LtacpException(
                            "ltacp %s: source path %s:%s does not exist" %
                            (self.logId, self.src_host, self.src_path))
                    else:
                        raise LtacpException(
                            "ltacp %s: the disk of source path %s:%s does not seem to be mounted"
                            % (self.logId, self.src_host, self.src_path))
                else:
                    raise LtacpException(
                        "ltacp %s: cannot login to %s@%s" %
                        (self.logId, self.src_user, self.src_host))

            # determine if input is file
            input_is_file = self.is_soure_single_file()

            if not input_is_file:
                # make sure the file extension is .tar or .tar.gz
                missing_suffix = ""
                if self.gzip:
                    if not (self.dst_surl.endswith(".tar.gz")
                            or self.dst_surl.endswith(".tgz")):
                        if self.dst_surl.endswith(".tar"):
                            missing_suffix = ".gz"
                        else:
                            missing_suffix = ".tar.gz"
                else:
                    if not self.dst_surl.endswith(".tar"):
                        missing_suffix = ".tar"

                if missing_suffix:
                    self.dst_surl += missing_suffix
                    logger.info(
                        "ltacp %s: appending missing suffix %s to surl: %s",
                        self.logId, missing_suffix, self.dst_surl)

            dst_turl = convert_surl_to_turl(self.dst_surl)
            logger.info(
                'ltacp %s: initiating transfer of %s:%s to surl=%s turl=%s' %
                (self.logId, self.src_host, self.src_path, self.dst_surl,
                 dst_turl))

            # get input datasize
            du_items = self.src_path if input_is_file or isinstance(
                self.src_path, str) else ' '.join(self.src_path)
            cmd_remote_du = self.ssh_cmd + [
                'du -b %s --max-depth=0 %s' %
                ("--dereference" if dereference else "", du_items)
            ]
            logger.info('ltacp %s: remote getting datasize. executing: %s' %
                        (self.logId, ' '.join(cmd_remote_du)))
            p_remote_du = Popen(cmd_remote_du, stdout=PIPE, stderr=PIPE)
            self.started_procs[p_remote_du] = cmd_remote_du

            # block until du is finished
            output_remote_du = communicate_returning_strings(p_remote_du)
            del self.started_procs[p_remote_du]
            if p_remote_du.returncode != 0:
                raise LtacpException(
                    'ltacp %s: remote du failed: \nstdout: %s\nstderr: %s' %
                    (self.logId, output_remote_du[0], output_remote_du[1]))

            # compute various parameters for progress logging
            if input_is_file:
                input_datasize = int(output_remote_du[0].split()[0])
            else:
                input_datasize = sum([
                    int(line.strip().split()[0])
                    for line in output_remote_du[0].split('\n')
                    if line.strip()
                ])

            logger.info('ltacp %s: input datasize: %d bytes, %s' %
                        (self.logId, input_datasize,
                         humanreadablesize(input_datasize)))
            estimated_tar_size = 512 * (
                input_datasize // 512
            ) + 3 * 512  #512byte header, 2*512byte ending, 512byte modulo data
            logger.info('ltacp %s: estimated_tar_size: %d bytes, %s' %
                        (self.logId, estimated_tar_size,
                         humanreadablesize(estimated_tar_size)))

            #---
            # Server part
            #---

            # we'll randomize ports
            # to minimize initial collision, randomize based on path and time
            random.seed(hash(self.logId) ^ hash(time.time()))

            p_data_in, port_data = self._ncListen('data')

            self.local_data_fifo = '/tmp/ltacp_datapipe_%s_%s' % (
                self.src_host, self.logId)

            logger.info('ltacp %s: creating data fifo: %s' %
                        (self.logId, self.local_data_fifo))
            if os.path.exists(self.local_data_fifo):
                os.remove(self.local_data_fifo)
            os.mkfifo(self.local_data_fifo)
            if not os.path.exists(self.local_data_fifo):
                raise LtacpException("ltacp %s: Could not create fifo: %s" %
                                     (self.logId, self.local_data_fifo))

            # transfer incoming data stream via md5a32bc to compute md5, adler32 and byte_count
            # data is written to the fifo, which is later fed into globus-url-copy
            # on stdout we can monitor progress
            # set progress message step of 0.5% of estimated_tar_size
            cmd_md5a32bc = [
                'md5a32bc', '-p',
                str(min(1000000, estimated_tar_size // 200)),
                self.local_data_fifo
            ]
            logger.info(
                'ltacp %s: processing data stream for md5, adler32 and byte_count. executing: %s'
                % (
                    self.logId,
                    ' '.join(cmd_md5a32bc),
                ))
            p_md5a32bc = Popen(cmd_md5a32bc,
                               stdin=p_data_in.stdout,
                               stdout=PIPE,
                               stderr=PIPE,
                               universal_newlines=True)
            self.started_procs[p_md5a32bc] = cmd_md5a32bc

            # start copy fifo stream to globus-url-copy
            guc_options = [
                '-cd',  #create remote directories if missing
                '-p 4',  #number of parallel ftp connections
                '-bs 131072',  #buffer size
                '-b',  # binary
                '-nodcau',  # turn off data channel authentication for ftp transfers
            ]
            cmd_data_out = [
                '/bin/bash', '-c',
                'globus-url-copy %s file://%s %s' %
                (' '.join(guc_options), self.local_data_fifo, dst_turl)
            ]
            logger.info(
                'ltacp %s: copying data stream into globus-url-copy. executing: %s'
                % (self.logId, ' '.join(cmd_data_out)))
            p_data_out = Popen(cmd_data_out, stdout=PIPE, stderr=PIPE)
            self.started_procs[p_data_out] = cmd_data_out

            # Check if receiver side is set up correctly
            # and all processes are still waiting for input from client
            finished_procs = dict(
                (p, cl) for (p, cl) in list(self.started_procs.items())
                if p.poll() is not None)

            if len(finished_procs):
                msg = ''
                for p, cl in list(finished_procs.items()):
                    o, e = communicate_returning_strings(p)
                    msg += "  process pid:%d exited prematurely with exit code %d. cmdline: %s\nstdout: %s\nstderr: %s\n" % (
                        p.pid, p.returncode, cl, o, e)
                raise LtacpException(
                    "ltacp %s: %d local process(es) exited prematurely\n%s" %
                    (self.logId, len(finished_procs), msg))

            #---
            # Client part
            #---

            # start remote copy on src host:
            # 1) create fifo
            # 2) send tar stream of data/dir + tee to fifo for 3)
            # 3) simultaneously to 2), calculate checksum of fifo stream
            # 4) break fifo

            self.remote_data_fifo = '/tmp/ltacp_md5_pipe_%s' % (self.logId, )
            #make sure there is no old remote fifo
            self._removeRemoteFifo()
            cmd_remote_mkfifo = self.ssh_cmd + [
                'mkfifo %s' % (self.remote_data_fifo, )
            ]
            logger.info('ltacp %s: remote creating fifo. executing: %s' %
                        (self.logId, ' '.join(cmd_remote_mkfifo)))
            p_remote_mkfifo = Popen(cmd_remote_mkfifo,
                                    stdout=PIPE,
                                    stderr=PIPE)
            self.started_procs[p_remote_mkfifo] = cmd_remote_mkfifo

            # block until fifo is created
            output_remote_mkfifo = communicate_returning_strings(
                p_remote_mkfifo)
            del self.started_procs[p_remote_mkfifo]
            if p_remote_mkfifo.returncode != 0:
                raise LtacpException(
                    'ltacp %s: remote fifo creation failed: \nstdout: %s\nstderr: %s'
                    % (self.logId, output_remote_mkfifo[0],
                       output_remote_mkfifo[1]))

            with open(os.devnull, 'r') as devnull:
                # start sending remote data, tee to fifo
                if input_is_file:
                    cmd_remote_data = self.ssh_cmd + [
                        'cat %s %s| tee %s | %s %s %s' %
                        (self.src_path, '| gzip --stdout --fast '
                         if self.gzip else '', self.remote_data_fifo,
                         self.remoteNetCatCmd, self.localIPAddress, port_data)
                    ]
                else:
                    if isinstance(self.src_path, str):
                        #src_path is dir
                        src_path_parent, src_path_child = os.path.split(
                            self.src_path)
                    else:
                        # src_path is a list of paths
                        dirs = set([os.path.dirname(p) for p in self.src_path])

                        if len(dirs) > 1:
                            raise LtacpException(
                                'ltacp %s: cannot combine multiple files from different directories in one tarball'
                                % self.logId)

                        files = set(
                            [os.path.basename(p) for p in self.src_path])
                        src_path_parent = list(dirs)[0]
                        src_path_child = ' '.join(files)

                    cmd_remote_data = self.ssh_cmd + [
                        'cd %s && tar c %s -O %s %s| tee %s | %s %s %s' %
                        (src_path_parent, src_path_child, '--dereference'
                         if dereference else '', '| gzip --stdout --fast '
                         if self.gzip else '', self.remote_data_fifo,
                         self.remoteNetCatCmd, self.localIPAddress, port_data)
                    ]
                logger.info(
                    'ltacp %s: remote starting transfer. executing: %s' %
                    (self.logId, ' '.join(cmd_remote_data)))
                p_remote_data = Popen(cmd_remote_data,
                                      stdin=devnull,
                                      stdout=PIPE,
                                      stderr=PIPE)
                self.started_procs[p_remote_data] = cmd_remote_data

                # start computation of checksum on remote fifo stream
                cmd_remote_checksum = self.ssh_cmd + [
                    'md5sum %s' % (self.remote_data_fifo, )
                ]
                logger.info(
                    'ltacp %s: remote starting computation of md5 checksum. executing: %s'
                    % (self.logId, ' '.join(cmd_remote_checksum)))
                p_remote_checksum = Popen(cmd_remote_checksum,
                                          stdin=devnull,
                                          stdout=PIPE,
                                          stderr=PIPE)
                self.started_procs[p_remote_checksum] = cmd_remote_checksum

                # waiting for output, comparing checksums, etc.
                logger.info(
                    'ltacp %s: transferring... waiting for progress...' %
                    self.logId)
                transfer_start_time = datetime.utcnow()
                prev_progress_time = datetime.utcnow()
                prev_bytes_transfered = 0

                with PipeReader(p_md5a32bc.stdout, self.logId) as pipe_reader:
                    # wait and poll for progress while all processes are running
                    while len([
                            p for p in list(self.started_procs.keys())
                            if p.poll() is not None
                    ]) == 0:
                        try:
                            current_progress_time = datetime.utcnow()
                            elapsed_secs_since_prev = totalSeconds(
                                current_progress_time - prev_progress_time)

                            if elapsed_secs_since_prev > 900:
                                raise LtacpException(
                                    'ltacp %s: transfer stalled for 15min.' %
                                    (self.logId))

                            # read and process md5a32bc stdout lines to create progress messages
                            lines = pipe_reader.readlines(1)
                            nextline = lines[-1].strip() if lines else ''

                            if len(nextline) > 0:
                                try:
                                    logger.debug('ltacp %s: transferring... %s',
                                                 self.logId, nextline)
                                    total_bytes_transfered = int(
                                        nextline.split()[0].strip())
                                    percentage_done = (
                                        100.0 * float(total_bytes_transfered)
                                    ) / float(estimated_tar_size)
                                    elapsed_secs_since_start = totalSeconds(
                                        current_progress_time -
                                        transfer_start_time)
                                    if percentage_done > 0 and elapsed_secs_since_start > 0 and elapsed_secs_since_prev > 0:
                                        avg_speed = total_bytes_transfered / elapsed_secs_since_start
                                        current_bytes_transfered = total_bytes_transfered - prev_bytes_transfered
                                        current_speed = current_bytes_transfered / elapsed_secs_since_prev
                                        if elapsed_secs_since_prev > 120 or current_bytes_transfered > 0.333 * estimated_tar_size:
                                            prev_progress_time = current_progress_time
                                            prev_bytes_transfered = total_bytes_transfered
                                            percentage_to_go = 100.0 - percentage_done
                                            time_to_go = elapsed_secs_since_start * percentage_to_go / percentage_done

                                            try:
                                                if self.progress_callback:
                                                    self.progress_callback(
                                                        percentage_done=
                                                        percentage_done,
                                                        current_speed=
                                                        current_speed,
                                                        total_bytes_transfered=
                                                        total_bytes_transfered)
                                            except Exception as e:
                                                logger.error(e)

                                            logger.info(
                                                'ltacp %s: transfered %s %.1f%% in %s at avgSpeed=%s (%s) curSpeed=%s (%s) to_go=%s to %s'
                                                %
                                                (self.logId,
                                                 humanreadablesize(
                                                     total_bytes_transfered),
                                                 percentage_done,
                                                 timedelta(seconds=int(
                                                     round(
                                                         elapsed_secs_since_start
                                                     ))),
                                                 humanreadablesize(
                                                     avg_speed, 'Bps'),
                                                 humanreadablesize(
                                                     avg_speed * 8, 'bps'),
                                                 humanreadablesize(
                                                     current_speed, 'Bps'),
                                                 humanreadablesize(
                                                     current_speed * 8, 'bps'),
                                                 timedelta(seconds=int(
                                                     round(time_to_go))),
                                                 dst_turl))
                                except Exception as e:
                                    msg = 'ltacp %s: error while parsing md5a32bc loglines: error=%s. line=%s' % (
                                        self.logId, e, nextline)
                                    logger.error(msg)
                                    self.cleanup()
                                    raise LtacpException(msg)
                            time.sleep(0.05)
                        except KeyboardInterrupt:
                            self.cleanup()
                        except LtacpException as e:
                            logger.error('ltacp %s: %s' % (self.logId, str(e)))
                            self.cleanup()
                            raise
                        except Exception as e:
                            logger.error('ltacp %s: %s' % (self.logId, str(e)))

                def waitForSubprocess(proc,
                                      timeout=timedelta(seconds=60),
                                      proc_log_name='',
                                      loglevel=logging.DEBUG):
                    logger.log(
                        loglevel,
                        'ltacp %s: waiting at most %s for %s to finish...',
                        self.logId, timeout, proc_log_name)
                    start_wait = datetime.now()
                    while datetime.now() - start_wait < timeout:
                        if proc.poll() is not None:
                            break
                        time.sleep(1)

                    if proc.poll() is None:
                        raise LtacpException(
                            'ltacp %s: %s did not finish within %s.' %
                            (self.logId, proc_log_name, timeout))

                waitForSubprocess(p_data_out,
                                  timedelta(seconds=self.globus_timeout),
                                  'globus-url-copy', logging.INFO)
                output_data_out = communicate_returning_strings(p_data_out)
                if p_data_out.returncode != 0:
                    if 'file exist' in output_data_out[1].lower():
                        raise LtacpDestinationExistsException(
                            'ltacp %s: data transfer via globus-url-copy to LTA failed, file already exists. turl=%s.'
                            % (self.logId, dst_turl))
                    raise LtacpException(
                        'ltacp %s: transfer via globus-url-copy to LTA failed. turl=%s error=%s'
                        % (self.logId, dst_turl, output_data_out[0].strip() +
                           output_data_out[1].strip()))
                logger.info(
                    'ltacp %s: data transfer via globus-url-copy to LTA complete.'
                    % self.logId)

                waitForSubprocess(p_remote_data, timedelta(seconds=60),
                                  'remote data transfer')
                output_remote_data = communicate_returning_strings(
                    p_remote_data)
                if p_remote_data.returncode != 0:
                    raise LtacpException(
                        'ltacp %s: Error in remote data transfer: %s' %
                        (self.logId, output_remote_data[1]))
                logger.debug('ltacp %s: remote data transfer finished...' %
                             self.logId)

                waitForSubprocess(p_remote_checksum, timedelta(seconds=60),
                                  'remote md5 checksum computation')
                output_remote_checksum = communicate_returning_strings(
                    p_remote_checksum)
                if p_remote_checksum.returncode != 0:
                    raise LtacpException(
                        'ltacp %s: Error in remote md5 checksum computation: %s'
                        % (self.logId, output_remote_checksum[1]))
                logger.debug(
                    'ltacp %s: remote md5 checksum computation finished.' %
                    self.logId)

                waitForSubprocess(
                    p_md5a32bc, timedelta(seconds=60),
                    'local computation of md5 adler32 and byte_count')
                output_md5a32bc_local = communicate_returning_strings(
                    p_md5a32bc)
                if p_md5a32bc.returncode != 0:
                    raise LtacpException(
                        'ltacp %s: Error while computing md5 adler32 and byte_count: %s'
                        % (self.logId, output_md5a32bc_local[1]))
                logger.debug(
                    'ltacp %s: computed local md5 adler32 and byte_count.' %
                    self.logId)

                # process remote md5 checksums
                try:
                    md5_checksum_remote = output_remote_checksum[0].split(
                        ' ')[0]
                except Exception as e:
                    logger.error(
                        'ltacp %s: error while parsing remote md5: %s\n%s' %
                        (self.logId, output_remote_checksum[0],
                         output_remote_checksum[1]))
                    raise

                # process local md5 adler32 and byte_count
                try:
                    items = output_md5a32bc_local[1].splitlines()[-1].split(
                        ' ')
                    md5_checksum_local = items[0].strip()
                    a32_checksum_local = items[1].strip().zfill(8)
                    byte_count = int(items[2].strip())
                except Exception as e:
                    logger.error(
                        'ltacp %s: error while parsing md5 adler32 and byte_count outputs: %s'
                        % (self.logId, output_md5a32bc_local[0]))
                    raise

                # check the transferred number of bytes
                if (byte_count == 0 and input_datasize > 0):
                    raise LtacpException(
                        'ltacp %s: did not transfer any bytes of the expected %s. Something is wrong in the datastream setup.'
                        % (self.logId, humanreadablesize(estimated_tar_size)))

                logger.info(
                    'ltacp %s: byte count of datastream is %d %s' %
                    (self.logId, byte_count, humanreadablesize(byte_count)))

                # compare local and remote md5
                if (md5_checksum_remote != md5_checksum_local):
                    raise LtacpException(
                        'ltacp %s: md5 checksum reported by client (%s) does not match local checksum of incoming data stream (%s)'
                        % (self.logId, md5_checksum_remote, md5_checksum_local))
                logger.info(
                    'ltacp %s: remote and local md5 checksums are equal: %s' %
                    (
                        self.logId,
                        md5_checksum_local,
                    ))

            logger.info('ltacp %s: fetching adler32 checksum from LTA...' %
                        self.logId)
            srm_ok, srm_file_size, srm_a32_checksum = get_srm_size_and_a32_checksum(
                self.dst_surl, 'ltacp %s:' % self.logId)

            if not srm_ok:
                raise LtacpException(
                    'ltacp %s: Could not get srm adler32 checksum for: %s' %
                    (self.logId, self.dst_surl))

            if srm_a32_checksum != a32_checksum_local:
                raise LtacpException(
                    'ltacp %s: adler32 checksum reported by srm (%s) does not match original data checksum (%s)'
                    % (self.logId, srm_a32_checksum, a32_checksum_local))

            logger.info('ltacp %s: adler32 checksums are equal: %s' % (
                self.logId,
                a32_checksum_local,
            ))

            if int(srm_file_size) != int(byte_count):
                raise LtacpException(
                    'ltacp %s: file size reported by srm (%s) does not match datastream byte count (%s)'
                    % (self.logId, srm_file_size, byte_count))

            logger.info(
                'ltacp %s: srm file size and datastream byte count are equal: %s bytes (%s)'
                %
                (self.logId, srm_file_size, humanreadablesize(srm_file_size)))
            logger.info('ltacp %s: transfer to LTA completed successfully.' %
                        (self.logId))

        except LtacpDestinationExistsException as e:
            logger.log(logging.WARN if force else logging.ERROR, str(e))
            if force:
                self.cleanup()
                srmrm(self.dst_surl, 'ltacp %s ' % self.logId)
                return self.transfer(force=False)
            else:
                # re-raise the exception to the caller
                raise
        except Exception as e:
            # Something went wrong
            logger.error('ltacp %s: Error in transfer: %s' %
                         (self.logId, str(e)))
            # re-raise the exception to the caller
            raise
        finally:
            # cleanup
            self.cleanup()

        total_time = round(
            10 * totalSeconds(datetime.utcnow() - starttime)) / 10
        logger.info(
            'ltacp %s: successfully completed transfer of %s:%s to %s in %ssec for %s at avg speed of %s or %s',
            self.logId, self.src_host, self.src_path, self.dst_surl,
            total_time, humanreadablesize(int(srm_file_size)),
            humanreadablesize(int(srm_file_size) / total_time, 'Bps'),
            humanreadablesize(8 * int(srm_file_size) / total_time, 'bps'))

        return (md5_checksum_local, a32_checksum_local, str(byte_count))
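
# The local adler32 value parsed above is zero-padded to 8 hex characters (zfill(8)) before it is
# compared against the checksum reported by srm. Below is a minimal sketch of how such an
# 8-character adler32 hex string can be computed with Python's standard zlib module; the helper
# name and chunk size are illustrative and not taken from the original code.
import zlib

def adler32_hex(path, chunk_size=64 * 1024):
    """Return the adler32 checksum of a file as an 8-character, zero-padded hex string."""
    value = 1  # adler32 checksums start from 1
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            value = zlib.adler32(chunk, value)
    return '%08x' % (value & 0xffffffff)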
Exemplo n.º 20
0
def getDiskUsageForPath_rbh_du(path):
    logger.info('getDiskUsageForPath_rbh_du(\'%s\')', path)

    result = {
        'found': False,
        'path': path,
        'disk_usage': None,
        'name': path.split('/')[-1]
    }

    cmd = ['rbh-du', '-bd', path]
    cmd = wrap_command_in_cep4_head_node_ssh_call_if_needed(cmd)
    logger.info(' '.join(cmd))

    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = communicate_returning_strings(proc)

    if proc.returncode == 0:
        # example of out
        # Using config file '/etc/robinhood.d/tmpfs/tmp_fs_mgr_basic.conf'.
        # /data/projects/2016LOFAROBS/L522380
        # dir count:3906, size:16048128, spc_used:16052224
        # file count:17568, size:42274164368, spc_used:42327519232

        # parse the output
        lines = [l.strip() for l in out.split('\n')]
        file_lines = [l for l in lines if 'file count' in l]
        if file_lines:
            parts = [p.strip() for p in file_lines[0].split(',')]
            partsDict = {
                p.split(':')[0].strip(): p.split(':')[1].strip()
                for p in parts
            }

            result['found'] = True

            if 'size' in partsDict:
                result['disk_usage'] = int(partsDict['size'])

            if 'file count' in partsDict:
                result['nr_of_files'] = int(partsDict['file count'])
        else:
            dir_lines = [l for l in lines if 'dir count' in l]
            if dir_lines:
                result['found'] = True
                result['disk_usage'] = 0
                result['nr_of_files'] = None
    else:
        logger.error(out + err)
        result['message'] = out

    result['disk_usage_readable'] = humanreadablesize(result['disk_usage'])

    otdb_id = getOTDBIdFromPath(path)
    if otdb_id:
        result['otdb_id'] = otdb_id

    logger.info('getDiskUsageForPath_rbh_du(\'%s\') returning: %s', path,
                result)
    return result
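
# A small standalone check of the 'file count' line parsing used above, fed with the sample
# rbh-du output quoted in the comment; this snippet is purely illustrative.
sample_line = "file count:17568, size:42274164368, spc_used:42327519232"
sample_dict = {p.split(':')[0].strip(): p.split(':')[1].strip() for p in sample_line.split(',')}
assert int(sample_dict['size']) == 42274164368
assert int(sample_dict['file count']) == 17568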
Exemplo n.º 21
0
    def uploadDataAndGetSIP(self,
                            archive_id,
                            storage_ticket,
                            filename,
                            uri,
                            filesize,
                            md5_checksum,
                            adler32_checksum,
                            validate=True):
        try:
            # MoM is very reluctant to provide SIPs:
            # it often returns a login page, even when you're logged in.
            # So, upon error, retry a couple of times with a pause, else just return.
            for mom_retry in range(self.MAX_MOM_RETRIES):
                start = time.time()
                logger.info(
                    "MoMClient.uploadDataAndGetSIP with archiveId %s - StorageTicket %s - FileName %s - Uri %s",
                    archive_id, storage_ticket, filename, uri)

                self.login()

                xmlcontent = """<?xml version="1.0" encoding="UTF-8"?>
                <lofar:DataProduct archiveId="%s" xmlns:lofar="http://www.astron.nl/MoM2-Lofar">
                    <locations>
                        <location>
                            <uri>lta://%s/%s/%s</uri>
                        </location>
                    </locations>
                    <storageTicket>%s</storageTicket>
                    <fileSize>%s</fileSize>
                    <checksums>
                        <checksum>
                            <algorithm>MD5</algorithm>
                            <value>%s</value>
                        </checksum>
                        <checksum>
                            <algorithm>Adler32</algorithm>
                            <value>%s</value>
                        </checksum>
                    </checksums>
                </lofar:DataProduct>""" % (
                    archive_id, storage_ticket, filename, uri, storage_ticket,
                    filesize, md5_checksum, adler32_checksum)

                # sanitize, make compact
                xmlcontent = xmlcontent.replace('\n', ' ')
                while '  ' in xmlcontent:
                    xmlcontent = xmlcontent.replace('  ', ' ')

                params = {
                    "command": "get-sip-with-input",
                    "xmlcontent": xmlcontent
                }
                response = self.session.get(self.__momURLgetSIP, params=params)
                result = response.text
                result = result.replace(
                    '<stationType>Europe</stationType>',
                    '<stationType>International</stationType>')

                if 'DOCTYPE HTML PUBLIC' in result:
                    logger.error(
                        'MoM returned login screen instead of SIP for %s %s using url %s and params %s, full url=%s',
                        archive_id, filename, self.__momURLgetSIP, params,
                        response.url)

                    # logout, even though we think we should be logged in properly
                    # it's mom who thinks we should login again, even though we have a proper session.
                    # next retry, we'll login automatically again
                    self.logout()

                    if mom_retry == (self.MAX_MOM_RETRIES - 1):
                        # for some reason mom cannot handle the uploadDataAndGetSIP
                        # let's give it a final try with just GetSip
                        # we'll miss some data in the SIP, which we can add ourselves, so the LTA catalogue is up-to-date
                        # but MoM will miss these parameters. tough luck.
                        logger.warning(
                            "MoMClient.uploadDataAndGetSIP with archiveId %s - StorageTicket %s - FileName %s - Uri %s failed %s times. Trying normal GetSip without uploading data to MoM.",
                            archive_id, storage_ticket, filename, uri,
                            mom_retry)
                        result = self.getSIP(archive_id, validate=False)
                        # add ingest info to sip
                        result = addIngestInfoToSIP(result, storage_ticket,
                                                    filesize, md5_checksum,
                                                    adler32_checksum)
                    else:
                        wait_secs = (mom_retry + 1) * (mom_retry + 1) * 10
                        logger.info(
                            'Retrying to uploadDataAndGetSIP for archiveId %s in %s seconds',
                            archive_id, wait_secs)
                        time.sleep(wait_secs)
                        continue  # jump back to for mom_retry in range(self.MAX_MOM_RETRIES)

                if validate:
                    if not validateSIPAgainstSchema(result,
                                                    log_prefix=str(filename)):
                        logger.error('Invalid SIP:\n%s', result)
                        raise Exception(
                            'SIP for %s does not validate against schema' %
                            filename)

                    # MoM sometimes provides SIPs which validate against the schema
                    # but which has incorrect content, e.g. an incorrect archive_id.
                    # check it!
                    if not checkSIPContent(result,
                                           log_prefix=str(filename),
                                           archive_id=archive_id,
                                           filename=filename,
                                           filesize=filesize,
                                           storage_ticket=storage_ticket,
                                           md5_checksum=md5_checksum,
                                           adler32_checksum=adler32_checksum):
                        raise Exception('SIP for %s has invalid content' %
                                        filename)

                if time.time() - start > 2:
                    logger.debug(
                        "MoMClient.uploadDataAndGetSIP for %s took %ds",
                        filename,
                        time.time() - start)

                logger.info(
                    "MoMClient.uploadDataAndGetSIP for %s retreived SIP of %s: %s...",
                    filename, humanreadablesize(len(result)),
                    result[:512].replace('\n', ''))
                return result
        except Exception as e:
            self.logout()
            raise Exception("getting SIP from MoM failed: " + str(e))
        return ''
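
# The retry pause above grows quadratically with the attempt number: (mom_retry + 1)^2 * 10 seconds.
# A tiny sketch of the resulting schedule, assuming MAX_MOM_RETRIES == 3 (an assumption for
# illustration; the real constant is defined elsewhere on the class).
MAX_MOM_RETRIES_ASSUMED = 3
wait_schedule = [(n + 1) * (n + 1) * 10 for n in range(MAX_MOM_RETRIES_ASSUMED)]
print(wait_schedule)  # [10, 40, 90]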
Exemplo n.º 22
0
    def getDiskUsageForPath(self, path, force_update=False):
        logger.info("cache.getDiskUsageForPath('%s', force_update=%s)", path,
                    force_update)
        needs_cache_update = False
        if not force_update:
            with self._cacheLock:
                needs_cache_update |= path not in self._cache[
                    'path_du_results']

        if needs_cache_update or force_update:
            logger.info("cache update needed for %s", path)

            # check if some other thread is already doing a du call for this path...
            with self._du_threading_events_lock:
                path_threading_event = self._du_threading_events.get(path)
                need_to_do_du_call = path_threading_event is None

                if need_to_do_du_call:
                    # no other thread is currently du'ing/updating this path
                    # so create a threading Event and store it in the dict,
                    # so other threads can wait for this event.
                    logger.info("updating the cache for %s current_thread=%s",
                                path,
                                current_thread().name)
                    path_threading_event = Event()
                    self._du_threading_events[path] = path_threading_event

            if need_to_do_du_call:
                # no other thread is currently du'ing/updating this path
                # so we need to do it here.
                result = du_getDiskUsageForPath(path)
                self._updateCache(result)

                # signal threads waiting for this same path du call
                # and do bookkeeping
                with self._du_threading_events_lock:
                    logger.info(
                        "signaling other threads that the cache was updated for %s current_thread=%s",
                        path,
                        current_thread().name)
                    path_threading_event.set()
                    del self._du_threading_events[path]
            else:
                logger.info(
                    "waiting for du call on other thread that will update the cache for %s current_thread=%s",
                    path,
                    current_thread().name)
                path_threading_event.wait()
                logger.info(
                    "another thread just updated the cache for %s current_thread=%s",
                    path,
                    current_thread().name)

        with self._cacheLock:
            if path in self._cache['path_du_results']:
                result = self._cache['path_du_results'][path]
            else:
                result = {
                    'found': False,
                    'path': path,
                    'message': 'unknown error'
                }
                if not self.disk_usage.path_resolver.pathExists(path):
                    result['message'] = 'No such path: %s' % path

        result['disk_usage_readable'] = humanreadablesize(
            result.get('disk_usage', 0))
        logger.info('cache.getDiskUsageForPath(\'%s\') result: %s', path,
                    result)
        return result
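
# A minimal, self-contained sketch of the "first caller computes, other callers wait on a
# threading.Event" pattern used above; all names below are illustrative and not part of the
# original cache class.
from threading import Event, Lock

_pending_events = {}
_pending_lock = Lock()
_results = {}

def get_or_compute(key, compute_func):
    # decide, under the lock, whether this thread does the work or waits for another thread
    with _pending_lock:
        event = _pending_events.get(key)
        is_first = event is None
        if is_first:
            event = Event()
            _pending_events[key] = event

    if is_first:
        try:
            _results[key] = compute_func(key)
        finally:
            # wake up any waiters and remove the bookkeeping entry
            with _pending_lock:
                event.set()
                del _pending_events[key]
    else:
        event.wait()

    return _results.get(key)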
Exemplo n.º 23
0
    def GetSIP(self):
        try:
            if self.Type == "MoM":
                with self.momClient:
                    self.SIP = self.momClient.uploadDataAndGetSIP(self.ArchiveId,
                                                                self.ticket,
                                                                self.FileName,
                                                                self.PrimaryUri,
                                                                self.FileSize,
                                                                self.MD5Checksum,
                                                                self.Adler32Checksum,
                                                                validate=True)
            elif 'SIPLocation' in self.job: # job file might know where the sip is when it is not a MoM job
                try:
                    sip_host = self.job['SIPLocation'].split(':')[0]
                    sip_path = self.job['SIPLocation'].split(':')[1]

                    cmd = ['ssh', '-tt', '-n', '-x', '-q', '%s@%s' % (self.user, sip_host), 'cat %s' % sip_path]
                    logger.info("GetSIP for %s at SIPLocation %s - cmd %s" % (self.JobId, self.job['SIPLocation'], ' ' .join(cmd)))
                    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out, err = communicate_returning_strings(p)
                    if p.returncode != 0:
                        raise PipelineError('GetSIP error getting EoR SIP for %s: %s' % (self.JobId, out + err))

                    self.SIP = out

                    self.SIP = addIngestInfoToSIP(self.SIP, self.ticket, self.FileSize, self.MD5Checksum, self.Adler32Checksum)

                    self.SIP = self.SIP.replace('<stationType>Europe</stationType>','<stationType>International</stationType>')

                    #make sure the source in the SIP is the same as the type of the storageticket
                    self.SIP = re.compile('<source>eor</source>', re.IGNORECASE).sub('<source>%s</source>' % (self.Type,), self.SIP)

                    if not validateSIPAgainstSchema(self.SIP):
                        logger.error('Invalid SIP:\n%s', self.SIP)
                        raise Exception('SIP for %s does not validate against schema' % self.JobId)

                except Exception:
                    logger.exception('Getting SIP from SIPLocation %s failed', self.job['SIPLocation'])
                    raise

                logger.info('SIP received for %s from SIPLocation %s with size %d (%s): \n%s' % (self.JobId,
                                                                                                self.job['SIPLocation'],
                                                                                                len(self.SIP),
                                                                                                humanreadablesize(len(self.SIP)),
                                                                                                self.SIP[0:1024]))
            else:
                self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type)
                self.FileType = FILE_TYPE_UNSPECIFIED
        except Exception as e:
            if self.minimal_SIP:
                logger.info('making minimal SIP for %s', self.JobId)
                self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type)
                logger.info('minimal SIP for %s: \n%s', self.JobId, self.SIP)
                self.FileType = FILE_TYPE_UNSPECIFIED
            else:
                raise
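
# A small standalone illustration of the case-insensitive <source> substitution used above;
# the sample XML fragment below is made up for the example.
import re

sample_sip = '<sip><source>EoR</source></sip>'  # illustrative fragment, not a real SIP
print(re.compile('<source>eor</source>', re.IGNORECASE).sub('<source>MoM</source>', sample_sip))
# -> <sip><source>MoM</source></sip>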
Exemplo n.º 24
0
def main(argv):
    dbpath = argv[0] if argv else 'ltastorageoverview.sqlite'
    print('Report for ' + dbpath)

    db = store.LTAStorageDb(dbpath)

    sites = db.sites()

    numFilesTotal = sum([db.numFilesInSite(s[0]) for s in sites])
    totalFileSize = sum([db.totalFileSizeInSite(s[0]) for s in sites])

    print('\n*** TOTALS *** #files=%s total_size=%s' % (humanreadablesize(numFilesTotal, ''),
                                                         humanreadablesize(totalFileSize)))

    for site in sites:
        numFilesInSite = db.numFilesInSite(site[0])
        totalFileSizeInSite = db.totalFileSizeInSite(site[0])

        print('\n--- %s ---  #files=%s total_size=%s' % (site[1],
                                                          humanreadablesize(numFilesInSite, ''),
                                                          humanreadablesize(totalFileSizeInSite)))

        root_dirs = db.rootDirectoriesForSite(site[0])

        for root_dir in root_dirs:
            numFilesInTree = db.numFilesInTree(root_dir[0])
            totalFileSizeInTree = db.totalFileSizeInTree(root_dir[0])

            print "  %s #files=%d total_size=%s" % (root_dir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree))

            subdirs = db.subDirectories(root_dir[0], 1, False)
            subdirs = sorted(subdirs, key=lambda x: x[1])

            for subdir in subdirs:
                numFilesInTree = db.numFilesInTree(subdir[0])
                totalFileSizeInTree = db.totalFileSizeInTree(subdir[0])

                print "    %s #files=%d total_size=%s" % (subdir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree))

                if subdir[1].endswith('projects/'):
                    projectsSubDirs = db.subDirectories(subdir[0], 1, False)
                    projectsSubDirs = sorted(projectsSubDirs, key=lambda x: x[1])

                    for projectsSubDir in projectsSubDirs:
                        numFilesInTree = db.numFilesInTree(projectsSubDir[0])
                        totalFileSizeInTree = db.totalFileSizeInTree(projectsSubDir[0])

                        print "      %s #files=%d total_size=%s" % (projectsSubDir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree))


    utcnow = datetime.utcnow()
    monthbegin = datetime(utcnow.year, utcnow.month, 1)
    # first day of the next month, handling the December -> January wrap-around
    next_month_begin = datetime(utcnow.year + (utcnow.month // 12), (utcnow.month % 12) + 1, 1)
    monthend = next_month_begin - timedelta(milliseconds=1)
    print('\n\n*** CHANGES THIS MONTH %s ***' % monthbegin.strftime('%Y/%m'))

    for site in sites:
        root_dirs = db.rootDirectoriesForSite(site[0])

        numChangedFilesInSite = db.numFilesInSite(site[0],
                                                  monthbegin,
                                                  monthend)

        if numChangedFilesInSite == 0:
            print('\n--- %s --- None' % (site[1],))
            continue

        totalChangedFileSizeInSite = db.totalFileSizeInSite(site[0],
                                                            monthbegin,
                                                            monthend)

        print('\n--- %s --- #files=%d total_size=%s' % (site[1],
                                                         numChangedFilesInSite,
                                                         humanreadablesize(totalChangedFileSizeInSite)))

        for root_dir in root_dirs:
            changedFiles = db.filesInTree(root_dir[0], monthbegin, monthend)

            if len(changedFiles) > 0:
                numFilesInTree = db.numFilesInTree(root_dir[0],
                                                   monthbegin,
                                                   monthend)
                totalFileSizeInTree = db.totalFileSizeInTree(root_dir[0],
                                                             monthbegin,
                                                             monthend)

                print "  %s #files=%d total_size=%s" % (root_dir[1],
                                                        numFilesInTree,
                                                        humanreadablesize(totalFileSizeInTree))

                # filter unique dirs containing changed files
                dirsWithChangedFiles = set([(x[0], x[1]) for x in changedFiles])

                # sort by name
                dirsWithChangedFiles = sorted(dirsWithChangedFiles, key=lambda x: x[1])

                for dir in dirsWithChangedFiles:
                    numFilesInTree = db.numFilesInTree(dir[0], monthbegin, monthend)
                    totalFileSizeInTree = db.totalFileSizeInTree(dir[0], monthbegin, monthend)

                    print "    %s #files=%d total_size=%s" % (dir[1], numFilesInTree, humanreadablesize(totalFileSizeInTree))

    print('\n\n*** CHANGES PER MONTH ***')

    min_date, max_date = db.datetimeRangeOfFilesInTree()
    month_ranges = monthRanges(min_date, max_date)

    for site in sites:
        print('\n--- %s ---' % site[1])

        for month_range in month_ranges:
            numFilesInSite = db.numFilesInSite(site[0], month_range[0], month_range[1])
            totalFileSizeInSite = db.totalFileSizeInSite(site[0], month_range[0], month_range[1])

            print "  %s %s %s #files=%d total_size=%s" % (site[1], month_range[0], month_range[1], numFilesInSite, humanreadablesize(totalFileSizeInSite))
Exemplo n.º 25
0
    def enoughResourcesAvailable(self):
        try:
            now = datetime.utcnow()
            bytes_sent = _getBytesSent()

            if bytes_sent >= self.__prev_bytes_sent:  # bytes_sent might wrap around zero
                # compute current speed in Gbps
                speed = 8 * (bytes_sent - self.__prev_bytes_sent
                             ) / totalSeconds(now -
                                              self.__prev_bytes_sent_timestamp)

                # running average for used_bandwidth
                used_bandwidth = 0.5 * speed + 0.5 * self.__prev_used_bandwidth

                logger.debug("resources: current used_bandwidth = %s",
                             humanreadablesize(used_bandwidth, 'bps'))

                # store for next iteration
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now
                self.__prev_used_bandwidth = used_bandwidth

                # only start new jobs if we have some bandwidth available
                # note that this is a 'soft' limit:
                # we cannot control the actual bandwidth used by the running transfers,
                # we can only refrain from starting new jobs once we already exceed MAX_USED_BANDWITH_TO_START_NEW_JOBS
                if used_bandwidth > MAX_USED_BANDWITH_TO_START_NEW_JOBS:
                    logger.warning(
                        'resources: not enough bandwidth available to start new jobs, using %s, max %s'
                        % (humanreadablesize(used_bandwidth, 'bps'),
                           humanreadablesize(
                               MAX_USED_BANDWITH_TO_START_NEW_JOBS, 'bps')))
                    return False
            else:
                # wrapped around 0, just store for next iteration, do not compute anything
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now

            # only start new jobs if we have some cpu time available
            idle_cpu_percentage = psutil.cpu_times_percent().idle
            logger.debug("resources: current idle_cpu_percentage = %s%%",
                         idle_cpu_percentage)
            if idle_cpu_percentage < 5:
                logger.warning(
                    'resources: not enough cpu power available to start new jobs, cpu_idle %s%%'
                    % idle_cpu_percentage)
                return False

            # only start new jobs if system load is not too high
            short_load_avg = os.getloadavg()[0]
            cpu_count = psutil.cpu_count()
            allowed_load = 1.5 * cpu_count
            logger.debug(
                "resources: current short term load = %s #cpu's = %s allowed_load = %s",
                short_load_avg, cpu_count, allowed_load)
            if short_load_avg > allowed_load:
                logger.warning(
                    'resources: system load too high (%s > %s), cannot start new jobs'
                    % (short_load_avg, allowed_load))
                return False

            # only allow 1 job at the time if swapping
            swap_memory_percentage = psutil.swap_memory().percent
            logger.debug("resources: current swap_memory_percentage = %s%%",
                         swap_memory_percentage)
            if swap_memory_percentage > 5 and len(self.__running_jobs) > 0:
                logger.warning(
                    'resources: system swapping. not enough memory available to start new jobs'
                )
                return False

            # only start new jobs if number of processes is not too high
            try:
                current_user = getpass.getuser()
                current_user_procs = [
                    p for p in psutil.process_iter()
                    if p.username() == current_user
                ]
                current_num_user_procs = len(current_user_procs)
                allowed_num_user_procs = 64 * cpu_count

                logger.debug(
                    "resources: current num_user_procs = %s allowed_num_user_procs = %s",
                    current_num_user_procs, allowed_num_user_procs)

                if current_num_user_procs > allowed_num_user_procs:
                    logger.warning(
                        'resources: number of processes by %s too high (%s > %s), cannot start new jobs'
                        % (current_user, current_num_user_procs,
                           allowed_num_user_procs))
                    return False
            except Exception as e:
                logger.exception(e)

            # limit total number of parallel transferring jobs to self.max_nr_of_parallel_jobs
            with self.__lock:
                starting_threads = [
                    job_thread_dict['thread']
                    for job_thread_dict in list(self.__running_jobs.values())
                    if 'pipeline' not in job_thread_dict
                ]
                pipelines = [
                    job_thread_dict['pipeline']
                    for job_thread_dict in list(self.__running_jobs.values())
                    if 'pipeline' in job_thread_dict
                ]
                initializing_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_INITIALIZING
                ]
                transferring_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_TRANSFERRING
                ]
                finalizing_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_FINALIZING
                ]

                num_busy_transfers = len(starting_threads) + len(
                    initializing_pipelines) + len(transferring_pipelines)
                num_finalizing_transfers = len(finalizing_pipelines)

                logger.debug(
                    "resources: current num_busy_transfers = %s num_finalizing_transfers = %s max_nr_of_parallel_jobs = %s",
                    num_busy_transfers, num_finalizing_transfers,
                    self.max_nr_of_parallel_jobs)

                if num_busy_transfers >= self.max_nr_of_parallel_jobs:
                    logger.warning(
                        'resources: already running %d parallel jobs (#starting=%d, #transferring=%d) limiting the total number of transferring jobs to %d'
                        % (len(self.__running_jobs),
                           len(initializing_pipelines) + len(starting_threads),
                           len(transferring_pipelines),
                           self.max_nr_of_parallel_jobs))
                    return False

                if num_finalizing_transfers >= 2 * self.max_nr_of_parallel_jobs:
                    logger.warning(
                        'resources: already waiting for %d jobs to finish (updating status/SIP to MoM and LTA). not starting new jobs until some jobs finished...'
                        % (len(finalizing_pipelines), ))
                    return False

        except Exception as e:
            logger.exception(
                "error while checking for available resources: %s", e)

            num_running_jobs = len(self.__running_jobs)
            if num_running_jobs <= 4:
                logger.info("running %d jobs, assuming we can run 1 more: ",
                            num_running_jobs)
                return True
            else:
                logger.warning(
                    "already running %d jobs, assuming for safety we cannot run more jobs...",
                    num_running_jobs)
                return False

        return True
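
# A minimal sketch of a _getBytesSent()-like helper based on psutil, as one plausible way to
# obtain the counter used above; the real helper (and any NIC filtering it does) lives elsewhere
# in the module.
import psutil

def get_bytes_sent_sketch(nic=None):
    if nic is None:
        return psutil.net_io_counters().bytes_sent
    return psutil.net_io_counters(pernic=True)[nic].bytes_sent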
Exemplo n.º 26
0
    def _removeTaskData(self,
                        otdb_id,
                        delete_is=True,
                        delete_cs=True,
                        delete_uv=True,
                        delete_im=True,
                        delete_img=True,
                        delete_pulp=True,
                        delete_scratch=True,
                        force=False):
        logger.info("Remove task data for otdb_id %s, force=%s" %
                    (otdb_id, force))

        if not isinstance(otdb_id, int):
            message = "Provided otdb_id is not an int"
            logger.error(message)
            return {'deleted': False, 'message': message}

        self._sendNotification(subject='TaskDeleting',
                               content={'otdb_id': otdb_id})

        if _isTaskDataPinned(otdb_id):
            message = "Task otdb_id=%s is pinned. Not deleting data." % (
                otdb_id)
            logger.error(message)
            self._sendNotification(subject='TaskDeleted',
                                   content={
                                       'deleted': False,
                                       'otdb_id': otdb_id,
                                       'message': message
                                   })
            return {'deleted': False, 'message': message}

        radbrpc = self.path_resolver.radbrpc
        task = radbrpc.getTask(otdb_id=otdb_id)
        if task:
            suc_tasks = radbrpc.getTasks(task_ids=task['successor_ids'])
            unfinished_scu_tasks = [
                t for t in suc_tasks
                if not (t['status'] == 'finished' or t['status'] == 'obsolete')
            ]
            if unfinished_scu_tasks:
                message = "Task otdb_id=%s has unfinished successor tasks (otdb_ids: %s). Not deleting data." % (
                    task['otdb_id'],
                    [t['otdb_id'] for t in unfinished_scu_tasks])
                logger.error(message)
                self._sendNotification(subject='TaskDeleted',
                                       content={
                                           'deleted': False,
                                           'otdb_id': otdb_id,
                                           'message': message
                                       })
                return {'deleted': False, 'message': message}

            momrpc = self.path_resolver.momrpc
            dataproducts = momrpc.getDataProducts(task['mom_id']).get(
                task['mom_id'])
            ingestable_dataproducts = [
                dp for dp in dataproducts if dp['status'] not in
                [None, 'has_data', 'no_data', 'populated']
            ]
            ingested_dataproducts = [
                dp for dp in ingestable_dataproducts
                if dp['status'] == 'ingested'
            ]

            if not force:
                if len(ingestable_dataproducts) > 0 and len(
                        ingested_dataproducts) < len(ingestable_dataproducts):
                    uningested_dataproducts = [
                        dp for dp in ingestable_dataproducts
                        if dp['status'] != 'ingested'
                    ]
                    message = "Task otdb_id=%s has un-ingested dataproducts. Not deleting data." % (
                        task['otdb_id'], )
                    logger.error(message)
                    self._sendNotification(subject='TaskDeleted',
                                           content={
                                               'deleted': False,
                                               'otdb_id': otdb_id,
                                               'message': message
                                           })
                    return {'deleted': False, 'message': message}

        path_result = self.path_resolver.getPathForOTDBId(otdb_id)
        if path_result['found']:
            rm_results = []
            if delete_is and delete_cs and delete_uv and delete_im and delete_img and delete_pulp:
                rm_results.append(self._removePath(path_result['path']))
            else:
                if delete_is and self.path_resolver.pathExists(
                        os.path.join(path_result['path'], 'is')):
                    rm_results.append(
                        self._removePath(
                            os.path.join(path_result['path'], 'is')))
                if delete_cs and self.path_resolver.pathExists(
                        os.path.join(path_result['path'], 'cs')):
                    rm_results.append(
                        self._removePath(
                            os.path.join(path_result['path'], 'cs')))
                if delete_uv and self.path_resolver.pathExists(
                        os.path.join(path_result['path'], 'uv')):
                    rm_results.append(
                        self._removePath(
                            os.path.join(path_result['path'], 'uv')))
                if delete_im and self.path_resolver.pathExists(
                        os.path.join(path_result['path'], 'im')):
                    rm_results.append(
                        self._removePath(
                            os.path.join(path_result['path'], 'im')))
                if delete_img and self.path_resolver.pathExists(
                        os.path.join(path_result['path'], 'img')):
                    rm_results.append(
                        self._removePath(
                            os.path.join(path_result['path'], 'img')))
                if delete_pulp and self.path_resolver.pathExists(
                        os.path.join(path_result['path'], 'pulp')):
                    rm_results.append(
                        self._removePath(
                            os.path.join(path_result['path'], 'pulp')))

            if delete_scratch and 'scratch_paths' in path_result:
                for scratch_path in path_result['scratch_paths']:
                    rm_results.append(self._removePath(scratch_path))

            rm_result = {
                'deleted': all(x['deleted'] for x in rm_results),
                'paths': [x.get('path') for x in rm_results],
                'message': '',
                'size': sum([x.get('size', 0) or 0 for x in rm_results])
            }

            combined_message = '\n'.join(
                x.get('message', '') for x in rm_results)

            if rm_result['deleted'] and 'does not exist' not in combined_message:
                task_type = task.get('type', 'task') if task else 'task'
                rm_result[
                    'message'] = 'Deleted %s of data from disk for %s with otdb_id %s\n' % (
                        humanreadablesize(
                            rm_result['size']), task_type, otdb_id)

            rm_result['message'] += combined_message

            self._sendNotification(subject='TaskDeleted',
                                   content={
                                       'deleted':
                                       rm_result['deleted'],
                                       'otdb_id':
                                       otdb_id,
                                       'paths':
                                       rm_result['paths'],
                                       'message':
                                       rm_result['message'],
                                       'size':
                                       rm_result['size'],
                                       'size_readable':
                                       humanreadablesize(rm_result['size'])
                                   })

            self._endStorageResourceClaim(otdb_id)

            return rm_result

        return {'deleted': False, 'message': path_result['message']}
Exemplo n.º 27
0
    def TransferFile(self):
        try:
            logger.info('Starting file transfer for %s ' % self.JobId)
            start = time.time()
            self.status = IngestPipeline.STATUS_TRANSFERRING

            self.__sendNotification('JobProgress',
                                    message='transfer starting',
                                    percentage_done=0.0,
                                    total_bytes_transfered=0)

            local_ip = hostnameToIp(self.hostname)

            if 'cep4' in self.HostLocation.lower() or 'cpu' in self.HostLocation.lower():
                self.HostLocation = 'localhost'

            if 'locus' in self.HostLocation.lower() and not '.' in self.HostLocation.lower():
                self.HostLocation += '.cep2.lofar'

            def progress_callback(percentage_done, current_speed, total_bytes_transfered):
                self.__sendNotification('JobProgress',
                                        percentage_done=min(100.0, round(10.0*percentage_done)/10.0),
                                        current_speed=current_speed,
                                        total_bytes_transfered=total_bytes_transfered)

            if (os.path.splitext(self.Location)[-1] == '.h5' and
                os.path.splitext(os.path.basename(self.Location))[0].endswith('_bf')):
                logger.info('dataproduct is a beamformed h5 file. adding raw file to the transfer')
                self.Location = [self.Location, self.Location.replace('.h5', '.raw')]

            if self.DataProduct not in self.Location and 'Source' in self.job:
                # old hack, is needed to support dynspec / pulsar archiving scripts
                self.Location = os.path.join(self.Location, self.job['Source'])

            cp = LtaCp(self.HostLocation,
                       self.Location,
                       self.PrimaryUri,
                       self.user,
                       local_ip=local_ip,
                       globus_timeout=self.globus_timeout,
                       progress_callback=progress_callback)

            transfer_result = cp.transfer(force=True)

            self.status = IngestPipeline.STATUS_FINALIZING

            if not transfer_result:
                msg = 'error while transferring %s with ltacp' % (self.JobId)
                logger.error(msg)
                raise Exception(msg)

            self.MD5Checksum = transfer_result[0]
            self.Adler32Checksum = transfer_result[1]
            self.FileSize = transfer_result[2]

            if self.MD5Checksum and self.Adler32Checksum and self.FileSize:
                logger.debug('valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s', self.JobId,
                                                                                                         self.FileSize,
                                                                                                         humanreadablesize(int(self.FileSize), 'B'),
                                                                                                         self.MD5Checksum,
                                                                                                         self.Adler32Checksum)
            else:
                msg = 'no valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s' % (self.JobId,
                                                                                                       self.FileSize,
                                                                                                       humanreadablesize(int(self.FileSize), 'B'),
                                                                                                       self.MD5Checksum,
                                                                                                       self.Adler32Checksum)
                logger.error(msg)
                raise Exception(msg)

            try:
                self.__sendNotification('JobProgress',
                                        message='transfer finished',
                                        percentage_done=100.0,
                                        total_bytes_transfered=int(self.FileSize))
            except ValueError:
                pass
            elapsed = time.time() - start

            try:
                if int(self.FileSize) > 0 and elapsed > 0:
                    avgSpeed = float(self.FileSize) / elapsed
                    logger.info("Finished file transfer for %s in %d sec with an average speed of %s for %s including ltacp overhead" % (self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), humanreadablesize(float(self.FileSize), 'B')))
                else:
                    logger.info('Finished file transfer of %s in %s sec' % (self.JobId, elapsed))
            except Exception:
                logger.info('Finished file transfer of %s in %s sec' % (self.JobId, elapsed))

        except Exception as exp:
            if isinstance(exp, LtacpException):
                if '550 File not found' in exp.value:
                    logger.error('Destination directory does not exist. Creating %s in LTA for %s' % (self.PrimaryUri, self.JobId))

                    if create_missing_directories(self.PrimaryUri) == 0:
                        logger.info('Created path %s in LTA for %s' % (self.PrimaryUri, self.JobId))
                elif 'source path' in exp.value and 'does not exist' in exp.value:
                    raise PipelineError(exp.value, PipelineNoSourceError)

            raise Exception('transfer failed for %s: %s' % (self.JobId, str(exp)))