Example #1
    def update(self, master_info=None, state_json=None):
        """
        Get new node list from master.
        If master_info is set (during registration and reregistration),
        a new master url will be set.
        """
        if master_info is not None:
            self.master_info = master_info

        # We have no way to update; abort
        if state_json is None and self.master_info is None:
            return

        # For testing, allow caller to give state_json.
        if state_json is None and self.master_info is not None:
            state_endpoint = "http://" + self.master_info.hostname + ":" + \
                str(self.master_info.port) + "/state.json"

            state_json = json_helper.from_url(state_endpoint)

        # Get node list.
        new_targets = []
        for slave in state_json['slaves']:
            new_targets.append(slave['pid'].split('@')[1])

        print "New targets: %s" % new_targets

        # Make copy of current targets, to identify deactivated slaves
        # TODO(nnielsen): Find lighter weight way of doing this.
        inactive_slaves = copy.deepcopy(self.targets)

        for new_target in new_targets:
            if new_target not in self.targets:
                slave = Agent(new_target)

                # TODO(nnielsen): Persist map id -> host to zookeeper.

                self.monitor[slave.task_id] = slave
                self.targets[new_target] = slave

            if new_target in inactive_slaves:
                print "Don't remove %s" % new_target
                del inactive_slaves[new_target]

        if len(inactive_slaves) > 0:
            print "%d slaves to be unmonitored" % len(inactive_slaves)
            for inactive_slave, slave in inactive_slaves.iteritems():
                print "inactive_slave: %s" % inactive_slave
                # TODO(nnielsen): Remove from monitor queue as well.
                self.unmonitor[slave.task_id] = inactive_slave

                if slave.task_id in self.monitor:
                    # Don't try to schedule it for monitoring if we've decided the slave is gone.
                    del self.monitor[slave.task_id]

                    # And no longer a target.
                    if slave.hostname in self.targets:
                        del self.targets[slave.hostname]

        self.stats()
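
# --- Sketch: assumed shape of the master's /state.json consumed by update() above ---
# The entries below are illustrative only; update() relies on each slave entry having a
# 'pid' field of the form "slave(N)@host:port" and extracts the "host:port" part.
state_json = {
    'slaves': [
        {'pid': 'slave(1)@10.0.0.5:5051', 'hostname': 'node-1'},
        {'pid': 'slave(1)@10.0.0.6:5051', 'hostname': 'node-2'},
    ]
}

new_targets = [slave['pid'].split('@')[1] for slave in state_json['slaves']]
assert new_targets == ['10.0.0.5:5051', '10.0.0.6:5051']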
Example #2
    def update(self, master_info=None):
        """
        Get new node list from master.
        If master_info is set (during registration and reregistration), a new master url will be set.
        """
        if master_info is not None:
            self.master_info = master_info

        state_endpoint = "http://" + self.master_info.hostname + ":" + str(self.master_info.port) + "/state.json"

        state_json = json_helper.from_url(state_endpoint)

        # Get node list
        new_targets = []
        for slave in state_json['slaves']:
            new_targets.append(slave['pid'].split('@')[1])

        # Make copy of current targets, to identify deactivated slaves.
        inactive_slaves = dict(self.targets)

        for new_target in new_targets:
            if new_target not in self.targets:
                slave = Slave(new_target)

                # TODO(nnielsen): Persist map id -> host to zookeeper.

                self.monitor[slave.id] = slave
                self.targets[new_target] = slave

            # Target is still present in the master's node list; keep monitoring it.
            if new_target in inactive_slaves:
                del inactive_slaves[new_target]

        if len(inactive_slaves) > 0:
            print "%d slaves to be unmonitored" % len(inactive_slaves)
            for hostname, slave in inactive_slaves.iteritems():
                # TODO(nnielsen): Remove from monitor queue as well.
                self.unmonitor[slave.id] = slave
Example #3
def resolve_slave_id(slave_location):
    """
    Helper to look up slave id from slave endpoint.

    :param slave_location: Address of slave (for example, localhost:5051).
    :return: ID of slave.
    """
    state_endpoint = 'http://%s/state.json' % slave_location
    slave_state = json_helper.from_url(state_endpoint)
    slave_id = slave_state['id']
    print "Resolved slave id: %s" % slave_id

    return slave_id
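
# --- Sketch: the json_helper.from_url dependency used above (assumed implementation) ---
# The real helper lives elsewhere in this project; this is only a plausible stand-in
# that fetches an endpoint and decodes the JSON body (Python 2, urllib2).
import json
import urllib2


def from_url(url):
    """Fetch `url` and return the decoded JSON document."""
    return json.loads(urllib2.urlopen(url).read())


# Hypothetical usage, assuming a Mesos agent is listening on localhost:5051:
#   slave_id = resolve_slave_id('localhost:5051')
#   # e.g. '20150322-163427-16842879-5050-21095-S0'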
Example #4
        def run_task():
            print "Running task %s" % task.task_id.value
            update = mesos_pb2.TaskStatus()
            update.task_id.value = task.task_id.value
            update.state = mesos_pb2.TASK_RUNNING
            driver.sendStatusUpdate(update)

            def fail(message):
                update = mesos_pb2.TaskStatus()
                update.task_id.value = task.task_id.value
                update.state = mesos_pb2.TASK_FAILED
                update.message = message
                driver.sendStatusUpdate(update)

            print "Validating task.data..."

            # Validate task.data
            try:
                if task.data is None:
                    return fail('Data field not set for task; cannot monitor slave')
             
                task_json = json.loads(task.data)

                if 'slave_location' not in task_json:
                    return fail('slave_location not found in task json')
            except Exception:
                return fail('Data field could not be parsed for task; cannot monitor slave')

            print "Task.data validated"

            print task_json

            slave_location = task_json['slave_location']
            monitor_endpoint = 'http://%s/monitor/statistics.json' % slave_location
            state_endpoint = 'http://%s/state.json' % slave_location
            metrics_endpoint = 'http://%s/metrics/snapshot' % slave_location

            slave_state = json_helper.from_url(state_endpoint)
            slave_id = slave_state['id']

            print "Resolved slave id: %s" % slave_id

            # One second sample rate.
            sample_rate = 1

            samples = {}
            sample_count = 0

            print "Start sample loop..."

            # Sample loop.
            while True:
                # Poor man's GC: we lose one sample per framework every 10,000 iterations.
                sample_count += 1
                if sample_count > 10000:
                    print "Cleaning samples..."
                    sample_count = 0
                    samples = {}

                stellar_samples = []

                print "Collecting sample for %s" % monitor_endpoint

                metrics_snapshot = json_helper.from_url(metrics_endpoint)
                cpus_total = metrics_snapshot['slave/cpus_total']
                cpus_used = metrics_snapshot['slave/cpus_used']
                cpus_allocation_slack = cpus_total - cpus_used

                mem_total = metrics_snapshot['slave/mem_total']
                mem_used = metrics_snapshot['slave/mem_used']
                mem_allocation_slack = mem_total - mem_used

                # TODO(nnielsen): If slave is unreachable after a certain number of retries, send TASK_FAILED and abort.
                # Collect the latest resource usage statistics.
                # TODO(nnielsen): Make sample rate configurable.
                # TODO(nnielsen): Batch samples.
                for sample in json_helper.from_url(monitor_endpoint):
                    print 'Collecting sample at \'%s\'' % monitor_endpoint
                    if 'statistics' in sample and 'timestamp' not in sample['statistics']:
                        sample['statistics']['timestamp'] = time.time()

                    # Validate sample
                    if not metrics.validate_statistics_sample(sample):
                        print "Warning: partial sample %s" % sample
                        continue

                    framework_id = sample['framework_id']
                    executor_id = sample['executor_id']

                    if framework_id not in samples:
                        samples[framework_id] = {}

                    if executor_id not in samples[framework_id]:
                        samples[framework_id][executor_id] = None

                    if samples[framework_id][executor_id] is not None:
                        # We need two samples to compute the cpu usage.
                        prev = samples[framework_id][executor_id]

                        interval = sample['statistics']['timestamp'] - prev['statistics']['timestamp']

                        user_time = sample['statistics']['cpus_user_time_secs'] - prev['statistics']['cpus_user_time_secs']
                        system_time = sample['statistics']['cpus_system_time_secs'] - prev['statistics']['cpus_system_time_secs']
                        cpu_usage = float(user_time + system_time) / float(interval)

                        # Compute slack CPU.
                        cpu_slack = sample['statistics']['cpus_limit'] - cpu_usage

                        # Compute slack memory.
                        mem_usage = sample['statistics']['mem_rss_bytes']
                        mem_slack = sample['statistics']['mem_limit_bytes'] - mem_usage

                        # TODO(nnielsen): Hang off task id's for this executor.
                        stellar_samples.append({
                            'slave_id': slave_id,
                            'framework_id': framework_id,
                            'executor_id': executor_id,
                            'cpu_usage_slack': cpu_slack,
                            'cpu_usage': cpu_usage,
                            'cpu_allocation_slack': cpus_allocation_slack,
                            'mem_usage_slack': mem_slack,
                            'mem_usage': mem_usage,
                            'mem_allocation_slack': mem_allocation_slack,
                            'timestamp': sample['statistics']['timestamp']
                        })

                    samples[framework_id][executor_id] = sample

                # Send samples if collected.
                if len(stellar_samples) > 0:
                    json_out = json.dumps(stellar_samples)
                    update = mesos_pb2.TaskStatus()
                    update.task_id.value = task.task_id.value
                    update.state = mesos_pb2.TASK_RUNNING
                    update.data = json_out
                    driver.sendStatusUpdate(update)

                time.sleep(sample_rate)
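
# --- Worked example: the two-sample CPU usage calculation from the loop above ---
# The numbers are illustrative; only the arithmetic mirrors the code.
prev = {'statistics': {'timestamp': 100.0,
                       'cpus_user_time_secs': 4.0,
                       'cpus_system_time_secs': 1.0}}
curr = {'statistics': {'timestamp': 105.0,
                       'cpus_user_time_secs': 6.5,
                       'cpus_system_time_secs': 1.5,
                       'cpus_limit': 2.0}}

interval = curr['statistics']['timestamp'] - prev['statistics']['timestamp']   # 5.0 s
user_time = (curr['statistics']['cpus_user_time_secs'] -
             prev['statistics']['cpus_user_time_secs'])                        # 2.5 s
system_time = (curr['statistics']['cpus_system_time_secs'] -
               prev['statistics']['cpus_system_time_secs'])                    # 0.5 s

cpu_usage = float(user_time + system_time) / float(interval)                   # 0.6 cpus used
cpu_slack = curr['statistics']['cpus_limit'] - cpu_usage                       # 1.4 cpus of usage slack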
Example #5
def run_task(executor_driver, task):
    """
    Entry for collector thread.

    :return: False on failure
    """
    print "Running task %s" % task.task_id.value
    running(executor_driver, task)

    slave_location = validate_task_info(executor_driver, task)['slave_location']

    slave_id = resolve_slave_id(slave_location)

    monitor_endpoint = 'http://%s/monitor/statistics.json' % slave_location

    samples = {}
    sample_count = 0

    print "Start sample loop..."

    # Sample loop.
    while True:
        # Poor man's GC: we lose one sample per framework every 10,000 iterations.
        sample_count += 1
        if sample_count > 10000:
            print "Cleaning samples..."
            sample_count = 0
            samples = {}

        stellar_samples = []

        # Compute slave global allocation slacks.
        metrics_snapshot = json_helper.from_url('http://%s/metrics/snapshot' % slave_location)

        cpus_allocation_slack = metrics_snapshot['slave/cpus_total'] - \
            metrics_snapshot['slave/cpus_used']

        mem_allocation_slack = metrics_snapshot['slave/mem_total'] - \
            metrics_snapshot['slave/mem_used']

        # TODO(nnielsen): If slave is unreachable after a certain number of retries,
        #                 send TASK_FAILED and abort.
        # Collect the latest resource usage statistics.
        # TODO(nnielsen): Make sample rate configurable.
        # TODO(nnielsen): Batch samples.
        # TODO(nnielsen): We can adjust sample rate based on time of previous request.
        for sample in json_helper.from_url(monitor_endpoint):
            print 'Collecting sample at \'%s\'' % monitor_endpoint
            if 'statistics' in sample and 'timestamp' not in sample['statistics']:
                sample['statistics']['timestamp'] = time.time()

            # Validate sample
            if not validate_statistics_sample(sample):
                print "Warning: partial sample %s" % sample
                continue

            framework_id = sample['framework_id']
            executor_id = sample['executor_id']

            # Initialize 2-level deep map of framework -> executor -> sample.
            if framework_id not in samples:
                samples[framework_id] = {}

            if executor_id not in samples[framework_id]:
                samples[framework_id][executor_id] = None

            if samples[framework_id][executor_id] is not None:
                # We need two samples to compute the cpu usage.
                stellar_sample = process_sample(samples[framework_id][executor_id], sample)

                # Add global metrics.
                stellar_sample['slave_id'] = slave_id
                stellar_sample['cpu_allocation_slack'] = cpus_allocation_slack
                stellar_sample['mem_allocation_slack'] = mem_allocation_slack

                stellar_samples.append(stellar_sample)

            # Save current sample for next sample processing.
            samples[framework_id][executor_id] = sample

        # Send samples if collected.
        if len(stellar_samples) > 0:
            running(executor_driver, task, json.dumps(stellar_samples))

        # One second sample rate.
        time.sleep(1)
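
# --- Sketch: the helpers assumed by run_task() above ---
# Neither running() nor process_sample() is shown here; the versions below are
# plausible reconstructions based on the inline logic of the previous example.


def running(executor_driver, task, data=None):
    """Send a TASK_RUNNING status update, optionally carrying sample data."""
    update = mesos_pb2.TaskStatus()
    update.task_id.value = task.task_id.value
    update.state = mesos_pb2.TASK_RUNNING
    if data is not None:
        update.data = data
    executor_driver.sendStatusUpdate(update)


def process_sample(prev, sample):
    """Derive usage and slack figures from two consecutive statistics samples."""
    interval = sample['statistics']['timestamp'] - prev['statistics']['timestamp']

    user_time = (sample['statistics']['cpus_user_time_secs'] -
                 prev['statistics']['cpus_user_time_secs'])
    system_time = (sample['statistics']['cpus_system_time_secs'] -
                   prev['statistics']['cpus_system_time_secs'])
    cpu_usage = float(user_time + system_time) / float(interval)

    mem_usage = sample['statistics']['mem_rss_bytes']

    return {
        'framework_id': sample['framework_id'],
        'executor_id': sample['executor_id'],
        'cpu_usage': cpu_usage,
        'cpu_usage_slack': sample['statistics']['cpus_limit'] - cpu_usage,
        'mem_usage': mem_usage,
        'mem_usage_slack': sample['statistics']['mem_limit_bytes'] - mem_usage,
        'timestamp': sample['statistics']['timestamp'],
    }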