예제 #1
0
    def _recommend_ram(self):
        """Recommend an economical RAM constraint for this job.

        Nodes that are advertised as "8 gibibytes" actually have what
        we might call "8 nearlygibs" of memory available for jobs.
        Here, we calculate a whole number of nearlygibs that would
        have sufficed to run the job, then recommend requesting a node
        with that number of nearlygibs (expressed as mebibytes).

        Requesting a node with "nearly 8 gibibytes" is our best hope
        of getting a node that actually has nearly 8 gibibytes
        available.  If the node manager is smart enough to account for
        the discrepancy itself when choosing/creating a node, we'll
        get an 8 GiB node with nearly 8 GiB available.  Otherwise, the
        advertised size of the next-size-smaller node (say, 6 GiB)
        will be too low to satisfy our request, so we will effectively
        get rounded up to 8 GiB.

        For example, if we need 7500 MiB, we can ask for 7500 MiB, and
        we will generally get a node that is advertised as "8 GiB" and
        has at least 7500 MiB available.  However, asking for 8192 MiB
        would either result in an unnecessarily expensive 12 GiB node
        (if node manager knows about the discrepancy), or an 8 GiB
        node which has less than 8192 MiB available and is therefore
        considered by crunch-dispatch to be too small to meet our
        constraint.

        When node manager learns how to predict the available memory
        for each node type such that crunch-dispatch always agrees
        that a node is big enough to run the job it was brought up
        for, all this will be unnecessary.  We'll just ask for exactly
        the memory we want -- even if that happens to be 8192 MiB.
        """

        constraint_key = self._map_runtime_constraint('ram')
        used_bytes = self.stats_max['mem']['rss']
        if used_bytes == float('-Inf'):
            logger.warning('%s: no memory usage data', self.label)
            return
        used_mib = math.ceil(float(used_bytes) / MB)
        asked_mib = self.existing_constraints.get(constraint_key)

        nearlygibs = lambda mebibytes: mebibytes / AVAILABLE_RAM_RATIO / 1024
        if used_mib > 0 and (
                asked_mib is None or
            (math.ceil(nearlygibs(used_mib)) < nearlygibs(asked_mib))):
            yield ('#!! {} max RSS was {} MiB -- '
                   'try reducing runtime_constraints to "{}":{}').format(
                       self.label, int(used_mib), constraint_key,
                       int(
                           math.ceil(nearlygibs(used_mib)) *
                           AVAILABLE_RAM_RATIO * 1024 * (MB) /
                           self._runtime_constraint_mem_unit()))
예제 #2
0
    def _recommend_ram(self):
        """Recommend an economical RAM constraint for this job.

        Nodes that are advertised as "8 gibibytes" actually have what
        we might call "8 nearlygibs" of memory available for jobs.
        Here, we calculate a whole number of nearlygibs that would
        have sufficed to run the job, then recommend requesting a node
        with that number of nearlygibs (expressed as mebibytes).

        Requesting a node with "nearly 8 gibibytes" is our best hope
        of getting a node that actually has nearly 8 gibibytes
        available.  If the node manager is smart enough to account for
        the discrepancy itself when choosing/creating a node, we'll
        get an 8 GiB node with nearly 8 GiB available.  Otherwise, the
        advertised size of the next-size-smaller node (say, 6 GiB)
        will be too low to satisfy our request, so we will effectively
        get rounded up to 8 GiB.

        For example, if we need 7500 MiB, we can ask for 7500 MiB, and
        we will generally get a node that is advertised as "8 GiB" and
        has at least 7500 MiB available.  However, asking for 8192 MiB
        would either result in an unnecessarily expensive 12 GiB node
        (if node manager knows about the discrepancy), or an 8 GiB
        node which has less than 8192 MiB available and is therefore
        considered by crunch-dispatch to be too small to meet our
        constraint.

        When node manager learns how to predict the available memory
        for each node type such that crunch-dispatch always agrees
        that a node is big enough to run the job it was brought up
        for, all this will be unnecessary.  We'll just ask for exactly
        the memory we want -- even if that happens to be 8192 MiB.
        """

        constraint_key = self._map_runtime_constraint('ram')
        used_bytes = self.stats_max['mem']['rss']
        if used_bytes == float('-Inf'):
            logger.warning('%s: no memory usage data', self.label)
            return
        used_mib = math.ceil(float(used_bytes) / 1048576)
        asked_mib = self.existing_constraints.get(constraint_key)

        nearlygibs = lambda mebibytes: mebibytes/AVAILABLE_RAM_RATIO/1024
        if asked_mib is None or (
                math.ceil(nearlygibs(used_mib)) < nearlygibs(asked_mib)):
            yield (
                '#!! {} max RSS was {} MiB -- '
                'try runtime_constraints "{}":{}'
            ).format(
                self.label,
                int(used_mib),
                constraint_key,
                int(math.ceil(nearlygibs(used_mib))*AVAILABLE_RAM_RATIO*1024*(2**20)/self._runtime_constraint_mem_unit()))
예제 #3
0
    def _recommend_cpu(self):
        """Recommend asking for 4 cores if max CPU usage was 333%"""

        cpu_max_rate = self.stats_max['cpu']['user+sys__rate']
        if cpu_max_rate == float('-Inf'):
            logger.warning('%s: no CPU usage data', self.label)
            return
        used_cores = max(1, int(math.ceil(cpu_max_rate)))
        asked_cores = self.existing_constraints.get('min_cores_per_node')
        if asked_cores is None or used_cores < asked_cores:
            yield ('#!! {} max CPU usage was {}% -- '
                   'try runtime_constraints "min_cores_per_node":{}').format(
                       self.label, int(math.ceil(cpu_max_rate * 100)),
                       int(used_cores))
예제 #4
0
    def __init__(self, root, skip_child_jobs=False, **kwargs):
        arv = arvados.api('v1', model=OrderedJsonModel())

        label = kwargs.pop('label', None) or root.get('name') or root['uuid']
        root['name'] = label

        children = collections.OrderedDict()
        todo = collections.deque((root, ))
        while len(todo) > 0:
            current = todo.popleft()
            label = current['name']
            sort_key = current['created_at']
            if current['uuid'].find('-xvhdp-') > 0:
                current = arv.containers().get(
                    uuid=current['container_uuid']).execute()

            summer = ContainerSummarizer(current, label=label, **kwargs)
            summer.sort_key = sort_key
            children[current['uuid']] = summer

            page_filters = []
            while True:
                child_crs = arv.container_requests().index(
                    order=['uuid asc'],
                    filters=page_filters +
                    [['requesting_container_uuid', '=', current['uuid']]],
                ).execute()
                if not child_crs['items']:
                    break
                elif skip_child_jobs:
                    logger.warning(
                        '%s: omitting stats from %d child containers'
                        ' because --skip-child-jobs flag is on', label,
                        child_crs['items_available'])
                    break
                page_filters = [['uuid', '>', child_crs['items'][-1]['uuid']]]
                for cr in child_crs['items']:
                    if cr['container_uuid']:
                        logger.debug('%s: container req %s', current['uuid'],
                                     cr['uuid'])
                        cr['name'] = cr.get('name') or cr['uuid']
                        todo.append(cr)
        sorted_children = collections.OrderedDict()
        for uuid in sorted(list(children.keys()),
                           key=lambda uuid: children[uuid].sort_key):
            sorted_children[uuid] = children[uuid]
        super(ContainerTreeSummarizer, self).__init__(children=sorted_children,
                                                      label=root['name'],
                                                      **kwargs)
예제 #5
0
 def __init__(self, pipeline_instance_uuid, **kwargs):
     arv = arvados.api('v1', model=OrderedJsonModel())
     instance = arv.pipeline_instances().get(
         uuid=pipeline_instance_uuid).execute()
     self.summarizers = collections.OrderedDict()
     for cname, component in instance['components'].iteritems():
         if 'job' not in component:
             logger.warning("%s: skipping component with no job assigned",
                            cname)
         else:
             logger.info("%s: job %s", cname, component['job']['uuid'])
             summarizer = JobSummarizer(component['job'], **kwargs)
             summarizer.label = '{} {}'.format(cname,
                                               component['job']['uuid'])
             self.summarizers[cname] = summarizer
     self.label = pipeline_instance_uuid
예제 #6
0
 def __init__(self, process, label=None, **kwargs):
     rdr = None
     self.process = process
     if label is None:
         label = self.process.get('name', self.process['uuid'])
     if self.process.get('log'):
         try:
             rdr = crunchstat_summary.reader.CollectionReader(self.process['log'])
         except arvados.errors.NotFoundError as e:
             logger.warning("Trying event logs after failing to read "
                            "log collection %s: %s", self.process['log'], e)
     if rdr is None:
         rdr = crunchstat_summary.reader.LiveLogReader(self.process['uuid'])
         label = label + ' (partial)'
     super(ProcessSummarizer, self).__init__(rdr, label=label, **kwargs)
     self.existing_constraints = self.process.get('runtime_constraints', {})
예제 #7
0
    def __init__(self, root, skip_child_jobs=False, **kwargs):
        arv = arvados.api('v1', model=OrderedJsonModel())

        label = kwargs.pop('label', None) or root.get('name') or root['uuid']
        root['name'] = label

        children = collections.OrderedDict()
        todo = collections.deque((root, ))
        while len(todo) > 0:
            current = todo.popleft()
            label = current['name']
            sort_key = current['created_at']
            if current['uuid'].find('-xvhdp-') > 0:
                current = arv.containers().get(uuid=current['container_uuid']).execute()

            summer = ContainerSummarizer(current, label=label, **kwargs)
            summer.sort_key = sort_key
            children[current['uuid']] = summer

            page_filters = []
            while True:
                child_crs = arv.container_requests().index(
                    order=['uuid asc'],
                    filters=page_filters+[
                        ['requesting_container_uuid', '=', current['uuid']]],
                ).execute()
                if not child_crs['items']:
                    break
                elif skip_child_jobs:
                    logger.warning('%s: omitting stats from %d child containers'
                                   ' because --skip-child-jobs flag is on',
                                   label, child_crs['items_available'])
                    break
                page_filters = [['uuid', '>', child_crs['items'][-1]['uuid']]]
                for cr in child_crs['items']:
                    if cr['container_uuid']:
                        logger.debug('%s: container req %s', current['uuid'], cr['uuid'])
                        cr['name'] = cr.get('name') or cr['uuid']
                        todo.append(cr)
        sorted_children = collections.OrderedDict()
        for uuid in sorted(children.keys(), key=lambda uuid: children[uuid].sort_key):
            sorted_children[uuid] = children[uuid]
        super(ContainerTreeSummarizer, self).__init__(
            children=sorted_children,
            label=root['name'],
            **kwargs)
예제 #8
0
 def __init__(self, instance, **kwargs):
     children = collections.OrderedDict()
     for cname, component in instance['components'].iteritems():
         if 'job' not in component:
             logger.warning(
                 "%s: skipping component with no job assigned", cname)
         else:
             logger.info(
                 "%s: job %s", cname, component['job']['uuid'])
             summarizer = JobTreeSummarizer(component['job'], label=cname, **kwargs)
             summarizer.label = '{} {}'.format(
                 cname, component['job']['uuid'])
             children[cname] = summarizer
     super(PipelineSummarizer, self).__init__(
         children=children,
         label=instance['uuid'],
         **kwargs)
예제 #9
0
    def _recommend_cpu(self):
        """Recommend asking for 4 cores if max CPU usage was 333%"""

        cpu_max_rate = self.stats_max['cpu']['user+sys__rate']
        if cpu_max_rate == float('-Inf'):
            logger.warning('%s: no CPU usage data', self.label)
            return
        used_cores = max(1, int(math.ceil(cpu_max_rate)))
        asked_cores = self.existing_constraints.get('min_cores_per_node')
        if asked_cores is None or used_cores < asked_cores:
            yield (
                '#!! {} max CPU usage was {}% -- '
                'try runtime_constraints "min_cores_per_node":{}'
            ).format(
                self.label,
                int(math.ceil(cpu_max_rate*100)),
                int(used_cores))
예제 #10
0
 def __init__(self, instance, **kwargs):
     children = collections.OrderedDict()
     for cname, component in instance['components'].items():
         if 'job' not in component:
             logger.warning("%s: skipping component with no job assigned",
                            cname)
         else:
             logger.info("%s: job %s", cname, component['job']['uuid'])
             summarizer = JobTreeSummarizer(component['job'],
                                            label=cname,
                                            **kwargs)
             summarizer.label = '{} {}'.format(cname,
                                               component['job']['uuid'])
             children[cname] = summarizer
     super(PipelineSummarizer, self).__init__(children=children,
                                              label=instance['uuid'],
                                              **kwargs)
예제 #11
0
 def __init__(self, pipeline_instance_uuid, **kwargs):
     arv = arvados.api('v1', model=OrderedJsonModel())
     instance = arv.pipeline_instances().get(
         uuid=pipeline_instance_uuid).execute()
     self.summarizers = collections.OrderedDict()
     for cname, component in instance['components'].iteritems():
         if 'job' not in component:
             logger.warning(
                 "%s: skipping component with no job assigned", cname)
         else:
             logger.info(
                 "%s: job %s", cname, component['job']['uuid'])
             summarizer = JobSummarizer(component['job'], **kwargs)
             summarizer.label = '{} {}'.format(
                 cname, component['job']['uuid'])
             self.summarizers[cname] = summarizer
     self.label = pipeline_instance_uuid
예제 #12
0
    def _recommend_cpu(self):
        """Recommend asking for 4 cores if max CPU usage was 333%"""

        constraint_key = self._map_runtime_constraint('vcpus')
        cpu_max_rate = self.stats_max['cpu']['user+sys__rate']
        if cpu_max_rate == float('-Inf') or cpu_max_rate == 0.0:
            logger.warning('%s: no CPU usage data', self.label)
            return
        # TODO Don't necessarily want to recommend on isolated max peak
        # take average CPU usage into account as well or % time at max
        used_cores = max(1, int(math.ceil(cpu_max_rate)))
        asked_cores = self.existing_constraints.get(constraint_key)
        if asked_cores is None:
            asked_cores = 1
        # TODO: This should be more nuanced in cases where max >> avg
        if used_cores < asked_cores:
            yield ('#!! {} max CPU usage was {}% -- '
                   'try reducing runtime_constraints to "{}":{}').format(
                       self.label, math.ceil(cpu_max_rate * 100),
                       constraint_key, int(used_cores))
예제 #13
0
 def __init__(self, job, **kwargs):
     arv = arvados.api('v1')
     if isinstance(job, basestring):
         self.job = arv.jobs().get(uuid=job).execute()
     else:
         self.job = job
     rdr = None
     if self.job.get('log'):
         try:
             rdr = crunchstat_summary.reader.CollectionReader(self.job['log'])
         except arvados.errors.NotFoundError as e:
             logger.warning("Trying event logs after failing to read "
                            "log collection %s: %s", self.job['log'], e)
         else:
             label = self.job['uuid']
     if rdr is None:
         rdr = crunchstat_summary.reader.LiveLogReader(self.job['uuid'])
         label = self.job['uuid'] + ' (partial)'
     super(JobSummarizer, self).__init__(rdr, **kwargs)
     self.label = label
     self.existing_constraints = self.job.get('runtime_constraints', {})
예제 #14
0
 def __init__(self, process, label=None, **kwargs):
     rdr = None
     self.process = process
     if label is None:
         label = self.process.get('name', self.process['uuid'])
     # Pre-Arvados v1.4 everything is in 'log'
     # For 1.4+ containers have no logs and container_requests have them in 'log_uuid', not 'log'
     log_collection = self.process.get('log', self.process.get('log_uuid'))
     if log_collection and self.process.get(
             'state') != 'Uncommitted':  # arvados.util.CR_UNCOMMITTED:
         try:
             rdr = crunchstat_summary.reader.CollectionReader(
                 log_collection)
         except arvados.errors.NotFoundError as e:
             logger.warning(
                 "Trying event logs after failing to read "
                 "log collection %s: %s", self.process['log'], e)
     if rdr is None:
         uuid = self.process.get('container_uuid', self.process.get('uuid'))
         rdr = crunchstat_summary.reader.LiveLogReader(uuid)
         label = label + ' (partial)'
     super(ProcessSummarizer, self).__init__(rdr, label=label, **kwargs)
     self.existing_constraints = self.process.get('runtime_constraints', {})
예제 #15
0
    def _run(self, logdata):
        self.detected_crunch1 = False
        for line in logdata:
            if not self.detected_crunch1 and '-8i9sb-' in line:
                self.detected_crunch1 = True

            if self.detected_crunch1:
                m = re.search(
                    r'^\S+ \S+ \d+ (?P<seq>\d+) job_task (?P<task_uuid>\S+)$',
                    line)
                if m:
                    seq = int(m.group('seq'))
                    uuid = m.group('task_uuid')
                    self.seq_to_uuid[seq] = uuid
                    logger.debug('%s: seq %d is task %s', self.label, seq,
                                 uuid)
                    continue

                m = re.search(
                    r'^\S+ \S+ \d+ (?P<seq>\d+) (success in|failure \(#., permanent\) after) (?P<elapsed>\d+) seconds',
                    line)
                if m:
                    task_id = self.seq_to_uuid[int(m.group('seq'))]
                    elapsed = int(m.group('elapsed'))
                    self.task_stats[task_id]['time'] = {'elapsed': elapsed}
                    if elapsed > self.stats_max['time']['elapsed']:
                        self.stats_max['time']['elapsed'] = elapsed
                    continue

                m = re.search(
                    r'^\S+ \S+ \d+ (?P<seq>\d+) stderr Queued job (?P<uuid>\S+)$',
                    line)
                if m:
                    uuid = m.group('uuid')
                    if self._skip_child_jobs:
                        logger.warning(
                            '%s: omitting stats from child job %s'
                            ' because --skip-child-jobs flag is on',
                            self.label, uuid)
                        continue
                    logger.debug('%s: follow %s', self.label, uuid)
                    child_summarizer = ProcessSummarizer(uuid)
                    child_summarizer.stats_max = self.stats_max
                    child_summarizer.task_stats = self.task_stats
                    child_summarizer.tasks = self.tasks
                    child_summarizer.starttime = self.starttime
                    child_summarizer.run()
                    logger.debug('%s: done %s', self.label, uuid)
                    continue

                # 2017-12-02_17:15:08 e51c5-8i9sb-mfp68stkxnqdd6m 63676 0 stderr crunchstat: keepcalls 0 put 2576 get -- interval 10.0000 seconds 0 put 2576 get
                m = re.search(
                    r'^(?P<timestamp>[^\s.]+)(\.\d+)? (?P<job_uuid>\S+) \d+ (?P<seq>\d+) stderr (?P<crunchstat>crunchstat: )(?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n$',
                    line)
                if not m:
                    continue
            else:
                # crunch2
                # 2017-12-01T16:56:24.723509200Z crunchstat: keepcalls 0 put 3 get -- interval 10.0000 seconds 0 put 3 get
                m = re.search(
                    r'^(?P<timestamp>\S+) (?P<crunchstat>crunchstat: )?(?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n$',
                    line)
                if not m:
                    continue

            if self.label is None:
                try:
                    self.label = m.group('job_uuid')
                except IndexError:
                    self.label = 'container'
            if m.group('category').endswith(':'):
                # "stderr crunchstat: notice: ..."
                continue
            elif m.group('category') in ('error', 'caught'):
                continue
            elif m.group('category') in ('read', 'open', 'cgroup', 'CID',
                                         'Running'):
                # "stderr crunchstat: read /proc/1234/net/dev: ..."
                # (old logs are less careful with unprefixed error messages)
                continue

            if self.detected_crunch1:
                task_id = self.seq_to_uuid[int(m.group('seq'))]
            else:
                task_id = 'container'
            task = self.tasks[task_id]

            # Use the first and last crunchstat timestamps as
            # approximations of starttime and finishtime.
            timestamp = m.group('timestamp')
            if timestamp[10:11] == '_':
                timestamp = datetime.datetime.strptime(timestamp,
                                                       '%Y-%m-%d_%H:%M:%S')
            elif timestamp[10:11] == 'T':
                timestamp = datetime.datetime.strptime(timestamp[:19],
                                                       '%Y-%m-%dT%H:%M:%S')
            else:
                raise ValueError(
                    "Cannot parse timestamp {!r}".format(timestamp))

            if task.starttime is None:
                logger.debug('%s: task %s starttime %s', self.label, task_id,
                             timestamp)
            if task.starttime is None or timestamp < task.starttime:
                task.starttime = timestamp
            if task.finishtime is None or timestamp > task.finishtime:
                task.finishtime = timestamp

            if self.starttime is None or timestamp < task.starttime:
                self.starttime = timestamp
            if self.finishtime is None or timestamp < task.finishtime:
                self.finishtime = timestamp

            if (
                    not self.detected_crunch1
            ) and task.starttime is not None and task.finishtime is not None:
                elapsed = (task.finishtime - task.starttime).seconds
                self.task_stats[task_id]['time'] = {'elapsed': elapsed}
                if elapsed > self.stats_max['time']['elapsed']:
                    self.stats_max['time']['elapsed'] = elapsed

            this_interval_s = None
            for group in ['current', 'interval']:
                if not m.group(group):
                    continue
                category = m.group('category')
                words = m.group(group).split(' ')
                stats = {}
                try:
                    for val, stat in zip(words[::2], words[1::2]):
                        if '.' in val:
                            stats[stat] = float(val)
                        else:
                            stats[stat] = int(val)
                except ValueError as e:
                    # If the line doesn't start with 'crunchstat:' we
                    # might have mistaken an error message for a
                    # structured crunchstat line.
                    if m.group("crunchstat") is None or m.group(
                            "category") == "crunchstat":
                        logger.warning("%s: log contains message\n  %s",
                                       self.label, line)
                    else:
                        logger.warning(
                            '%s: Error parsing value %r (stat %r, category %r): %r',
                            self.label, val, stat, category, e)
                        logger.warning('%s', line)
                    continue
                if 'user' in stats or 'sys' in stats:
                    stats['user+sys'] = stats.get('user', 0) + stats.get(
                        'sys', 0)
                if 'tx' in stats or 'rx' in stats:
                    stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
                for stat, val in stats.items():
                    if group == 'interval':
                        if stat == 'seconds':
                            this_interval_s = val
                            continue
                        elif not (this_interval_s > 0):
                            logger.error(
                                "BUG? interval stat given with duration {!r}".
                                format(this_interval_s))
                            continue
                        else:
                            stat = stat + '__rate'
                            val = val / this_interval_s
                            if stat in ['user+sys__rate', 'tx+rx__rate']:
                                task.series[category, stat].append(
                                    (timestamp - self.starttime, val))
                    else:
                        if stat in ['rss']:
                            task.series[category, stat].append(
                                (timestamp - self.starttime, val))
                        self.task_stats[task_id][category][stat] = val
                    if val > self.stats_max[category][stat]:
                        self.stats_max[category][stat] = val
        logger.debug('%s: done parsing', self.label)

        self.job_tot = collections.defaultdict(
            functools.partial(collections.defaultdict, int))
        for task_id, task_stat in self.task_stats.items():
            for category, stat_last in task_stat.items():
                for stat, val in stat_last.items():
                    if stat in ['cpus', 'cache', 'swap', 'rss']:
                        # meaningless stats like 16 cpu cores x 5 tasks = 80
                        continue
                    self.job_tot[category][stat] += val
        logger.debug('%s: done totals', self.label)
예제 #16
0
    def _run(self, logdata):
        self.detected_crunch1 = False
        for line in logdata:
            if not self.detected_crunch1 and '-8i9sb-' in line:
                self.detected_crunch1 = True

            if self.detected_crunch1:
                m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) job_task (?P<task_uuid>\S+)$', line)
                if m:
                    seq = int(m.group('seq'))
                    uuid = m.group('task_uuid')
                    self.seq_to_uuid[seq] = uuid
                    logger.debug('%s: seq %d is task %s', self.label, seq, uuid)
                    continue

                m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) (success in|failure \(#., permanent\) after) (?P<elapsed>\d+) seconds', line)
                if m:
                    task_id = self.seq_to_uuid[int(m.group('seq'))]
                    elapsed = int(m.group('elapsed'))
                    self.task_stats[task_id]['time'] = {'elapsed': elapsed}
                    if elapsed > self.stats_max['time']['elapsed']:
                        self.stats_max['time']['elapsed'] = elapsed
                    continue

                m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) stderr Queued job (?P<uuid>\S+)$', line)
                if m:
                    uuid = m.group('uuid')
                    if self._skip_child_jobs:
                        logger.warning('%s: omitting stats from child job %s'
                                       ' because --skip-child-jobs flag is on',
                                       self.label, uuid)
                        continue
                    logger.debug('%s: follow %s', self.label, uuid)
                    child_summarizer = ProcessSummarizer(uuid)
                    child_summarizer.stats_max = self.stats_max
                    child_summarizer.task_stats = self.task_stats
                    child_summarizer.tasks = self.tasks
                    child_summarizer.starttime = self.starttime
                    child_summarizer.run()
                    logger.debug('%s: done %s', self.label, uuid)
                    continue

                # 2017-12-02_17:15:08 e51c5-8i9sb-mfp68stkxnqdd6m 63676 0 stderr crunchstat: keepcalls 0 put 2576 get -- interval 10.0000 seconds 0 put 2576 get
                m = re.search(r'^(?P<timestamp>[^\s.]+)(\.\d+)? (?P<job_uuid>\S+) \d+ (?P<seq>\d+) stderr (?P<crunchstat>crunchstat: )(?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n$', line)
                if not m:
                    continue
            else:
                # crunch2
                # 2017-12-01T16:56:24.723509200Z crunchstat: keepcalls 0 put 3 get -- interval 10.0000 seconds 0 put 3 get
                m = re.search(r'^(?P<timestamp>\S+) (?P<crunchstat>crunchstat: )?(?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n$', line)
                if not m:
                    continue

            if self.label is None:
                try:
                    self.label = m.group('job_uuid')
                except IndexError:
                    self.label = 'container'
            if m.group('category').endswith(':'):
                # "stderr crunchstat: notice: ..."
                continue
            elif m.group('category') in ('error', 'caught'):
                continue
            elif m.group('category') in ('read', 'open', 'cgroup', 'CID', 'Running'):
                # "stderr crunchstat: read /proc/1234/net/dev: ..."
                # (old logs are less careful with unprefixed error messages)
                continue

            if self.detected_crunch1:
                task_id = self.seq_to_uuid[int(m.group('seq'))]
            else:
                task_id = 'container'
            task = self.tasks[task_id]

            # Use the first and last crunchstat timestamps as
            # approximations of starttime and finishtime.
            timestamp = m.group('timestamp')
            if timestamp[10:11] == '_':
                timestamp = datetime.datetime.strptime(
                    timestamp, '%Y-%m-%d_%H:%M:%S')
            elif timestamp[10:11] == 'T':
                timestamp = datetime.datetime.strptime(
                    timestamp[:19], '%Y-%m-%dT%H:%M:%S')
            else:
                raise ValueError("Cannot parse timestamp {!r}".format(
                    timestamp))

            if task.starttime is None:
                logger.debug('%s: task %s starttime %s',
                             self.label, task_id, timestamp)
            if task.starttime is None or timestamp < task.starttime:
                task.starttime = timestamp
            if task.finishtime is None or timestamp > task.finishtime:
                task.finishtime = timestamp

            if self.starttime is None or timestamp < task.starttime:
                self.starttime = timestamp
            if self.finishtime is None or timestamp < task.finishtime:
                self.finishtime = timestamp

            if (not self.detected_crunch1) and task.starttime is not None and task.finishtime is not None:
                elapsed = (task.finishtime - task.starttime).seconds
                self.task_stats[task_id]['time'] = {'elapsed': elapsed}
                if elapsed > self.stats_max['time']['elapsed']:
                    self.stats_max['time']['elapsed'] = elapsed

            this_interval_s = None
            for group in ['current', 'interval']:
                if not m.group(group):
                    continue
                category = m.group('category')
                words = m.group(group).split(' ')
                stats = {}
                try:
                    for val, stat in zip(words[::2], words[1::2]):
                        if '.' in val:
                            stats[stat] = float(val)
                        else:
                            stats[stat] = int(val)
                except ValueError as e:
                    # If the line doesn't start with 'crunchstat:' we
                    # might have mistaken an error message for a
                    # structured crunchstat line.
                    if m.group("crunchstat") is None or m.group("category") == "crunchstat":
                        logger.warning("%s: log contains message\n  %s", self.label, line)
                    else:
                        logger.warning(
                            '%s: Error parsing value %r (stat %r, category %r): %r',
                            self.label, val, stat, category, e)
                        logger.warning('%s', line)
                    continue
                if 'user' in stats or 'sys' in stats:
                    stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
                if 'tx' in stats or 'rx' in stats:
                    stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
                for stat, val in stats.iteritems():
                    if group == 'interval':
                        if stat == 'seconds':
                            this_interval_s = val
                            continue
                        elif not (this_interval_s > 0):
                            logger.error(
                                "BUG? interval stat given with duration {!r}".
                                format(this_interval_s))
                            continue
                        else:
                            stat = stat + '__rate'
                            val = val / this_interval_s
                            if stat in ['user+sys__rate', 'tx+rx__rate']:
                                task.series[category, stat].append(
                                    (timestamp - self.starttime, val))
                    else:
                        if stat in ['rss']:
                            task.series[category, stat].append(
                                (timestamp - self.starttime, val))
                        self.task_stats[task_id][category][stat] = val
                    if val > self.stats_max[category][stat]:
                        self.stats_max[category][stat] = val
        logger.debug('%s: done parsing', self.label)

        self.job_tot = collections.defaultdict(
            functools.partial(collections.defaultdict, int))
        for task_id, task_stat in self.task_stats.iteritems():
            for category, stat_last in task_stat.iteritems():
                for stat, val in stat_last.iteritems():
                    if stat in ['cpus', 'cache', 'swap', 'rss']:
                        # meaningless stats like 16 cpu cores x 5 tasks = 80
                        continue
                    self.job_tot[category][stat] += val
        logger.debug('%s: done totals', self.label)
예제 #17
0
    def run(self):
        logger.debug("%s: parsing logdata %s", self.label, self._logdata)
        for line in self._logdata:
            m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) job_task (?P<task_uuid>\S+)$', line)
            if m:
                seq = int(m.group('seq'))
                uuid = m.group('task_uuid')
                self.seq_to_uuid[seq] = uuid
                logger.debug('%s: seq %d is task %s', self.label, seq, uuid)
                continue

            m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) (success in|failure \(#., permanent\) after) (?P<elapsed>\d+) seconds', line)
            if m:
                task_id = self.seq_to_uuid[int(m.group('seq'))]
                elapsed = int(m.group('elapsed'))
                self.task_stats[task_id]['time'] = {'elapsed': elapsed}
                if elapsed > self.stats_max['time']['elapsed']:
                    self.stats_max['time']['elapsed'] = elapsed
                continue

            m = re.search(r'^\S+ \S+ \d+ (?P<seq>\d+) stderr Queued job (?P<uuid>\S+)$', line)
            if m:
                uuid = m.group('uuid')
                if self._skip_child_jobs:
                    logger.warning('%s: omitting stats from child job %s'
                                   ' because --skip-child-jobs flag is on',
                                   self.label, uuid)
                    continue
                logger.debug('%s: follow %s', self.label, uuid)
                child_summarizer = JobSummarizer(uuid)
                child_summarizer.stats_max = self.stats_max
                child_summarizer.task_stats = self.task_stats
                child_summarizer.tasks = self.tasks
                child_summarizer.starttime = self.starttime
                child_summarizer.run()
                logger.debug('%s: done %s', self.label, uuid)
                continue

            m = re.search(r'^(?P<timestamp>[^\s.]+)(\.\d+)? (?P<job_uuid>\S+) \d+ (?P<seq>\d+) stderr crunchstat: (?P<category>\S+) (?P<current>.*?)( -- interval (?P<interval>.*))?\n', line)
            if not m:
                continue

            if self.label is None:
                self.label = m.group('job_uuid')
                logger.debug('%s: using job uuid as label', self.label)
            if m.group('category').endswith(':'):
                # "stderr crunchstat: notice: ..."
                continue
            elif m.group('category') in ('error', 'caught'):
                continue
            elif m.group('category') == 'read':
                # "stderr crunchstat: read /proc/1234/net/dev: ..."
                # (crunchstat formatting fixed, but old logs still say this)
                continue
            task_id = self.seq_to_uuid[int(m.group('seq'))]
            task = self.tasks[task_id]

            # Use the first and last crunchstat timestamps as
            # approximations of starttime and finishtime.
            timestamp = datetime.datetime.strptime(
                m.group('timestamp'), '%Y-%m-%d_%H:%M:%S')
            if not task.starttime:
                task.starttime = timestamp
                logger.debug('%s: task %s starttime %s',
                             self.label, task_id, timestamp)
            task.finishtime = timestamp

            if not self.starttime:
                self.starttime = timestamp
            self.finishtime = timestamp

            this_interval_s = None
            for group in ['current', 'interval']:
                if not m.group(group):
                    continue
                category = m.group('category')
                words = m.group(group).split(' ')
                stats = {}
                for val, stat in zip(words[::2], words[1::2]):
                    try:
                        if '.' in val:
                            stats[stat] = float(val)
                        else:
                            stats[stat] = int(val)
                    except ValueError as e:
                        raise ValueError(
                            'Error parsing {} stat in "{}": {!r}'.format(
                                stat, line, e))
                if 'user' in stats or 'sys' in stats:
                    stats['user+sys'] = stats.get('user', 0) + stats.get('sys', 0)
                if 'tx' in stats or 'rx' in stats:
                    stats['tx+rx'] = stats.get('tx', 0) + stats.get('rx', 0)
                for stat, val in stats.iteritems():
                    if group == 'interval':
                        if stat == 'seconds':
                            this_interval_s = val
                            continue
                        elif not (this_interval_s > 0):
                            logger.error(
                                "BUG? interval stat given with duration {!r}".
                                format(this_interval_s))
                            continue
                        else:
                            stat = stat + '__rate'
                            val = val / this_interval_s
                            if stat in ['user+sys__rate', 'tx+rx__rate']:
                                task.series[category, stat].append(
                                    (timestamp - self.starttime, val))
                    else:
                        if stat in ['rss']:
                            task.series[category, stat].append(
                                (timestamp - self.starttime, val))
                        self.task_stats[task_id][category][stat] = val
                    if val > self.stats_max[category][stat]:
                        self.stats_max[category][stat] = val
        logger.debug('%s: done parsing', self.label)

        self.job_tot = collections.defaultdict(
            functools.partial(collections.defaultdict, int))
        for task_id, task_stat in self.task_stats.iteritems():
            for category, stat_last in task_stat.iteritems():
                for stat, val in stat_last.iteritems():
                    if stat in ['cpus', 'cache', 'swap', 'rss']:
                        # meaningless stats like 16 cpu cores x 5 tasks = 80
                        continue
                    self.job_tot[category][stat] += val
        logger.debug('%s: done totals', self.label)