Example #1
def test_extreme():
    digest = TDigest()
    digest.add(10, 3)
    digest.add(20, 1)
    digest.add(40, 5)
    values = [5., 10., 15., 20., 30., 35., 40., 45., 50.]
    for q in [1.5 / 9, 3.5 / 9, 6.5 / 9]:
        assert abs(quantile(values, q) - digest.quantile(q)) < 0.01
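These snippets assume the usual test imports (random, pickle, math, functools.reduce) along with the TDigest class under test, plus an undefined reference helper quantile(values, q). A minimal sketch of that helper, assuming linear interpolation between order statistics of the sorted list (the exact interpolation rule is our guess, not taken from the source):

def quantile(sorted_values, q):
    # Empirical quantile with linear interpolation between order statistics.
    if not sorted_values:
        raise ValueError('quantile of an empty sequence')
    pos = q * (len(sorted_values) - 1)  # fractional index into the sorted data
    lo = int(pos)
    hi = min(lo + 1, len(sorted_values) - 1)
    frac = pos - lo
    return sorted_values[lo] * (1 - frac) + sorted_values[hi] * frac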
Example #2
def test_sorted():
    digest = TDigest()
    for i in range(10000):
        digest.add(random.random(), 1 + random.randint(0, 10))

    prev = None
    for c in digest.centroids:
        assert prev is None or prev.mean <= c.mean
        prev = c
Example #3
def test_monotonicity():
    digest = TDigest()
    for i in range(10000):
        digest.add(random.random())

    for i in range(int(1e4) - 1):
        q1 = i * 1e-4
        q2 = (i + 1) * 1e-4
        assert digest.quantile(q1) <= digest.quantile(q2)
        assert digest.cdf(q1) <= digest.cdf(q2)
Example #4
def test_more_than_2billion_values():
    digest = TDigest()
    for i in range(1000):
        digest.add(random.random())

    for i in range(10):
        digest.add(random.random(), 1 << 28)

    assert len(digest) == 1000 + 10 * (1 << 28)
    prev = None
    for q in sorted([0, 0.1, 0.5, 0.9, 1, random.random()]):
        v = digest.quantile(q)
        assert prev is None or v >= prev
        prev = v
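What the test above pins down: add() accepts an optional integer weight, len(digest) is the total weight rather than the number of calls, and Python's arbitrary-precision integers keep the count correct past the int32 range. The same contract in miniature (a sketch, reusing the TDigest class from these examples):

digest = TDigest()
digest.add(0.5, 1 << 30)  # one centroid carrying 2**30 points
digest.add(0.7, 1 << 30)
assert len(digest) == 1 << 31  # total weight, already past int32's maximum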
Example #5
def test_singleton_in_a_crowd():
    compression = 100
    digest = TDigest(compression=compression)
    for i in range(10000):
        digest.add(10)

    digest.add(20)
    digest.compress()

    assert digest.quantile(0) == 10.0
    assert digest.quantile(0.5) == 10.0
    assert digest.quantile(0.8) == 10.0
    assert digest.quantile(0.9) == 10.0
    assert digest.quantile(0.99) == 10.0
    assert digest.quantile(1) == 20.0
Example #6
def test_fill():
    def q_to_k(q):
        return asin(2 * min(1, q) - 1) / pi + 0.5

    delta = 300
    digest = TDigest(delta)
    for i in range(100000):
        digest.add(random.gauss(0, 1))

    q0 = 0.
    for c in digest.centroids:
        q1 = q0 + float(c.count) / len(digest)
        dk = delta * (q_to_k(q1) - q_to_k(q0))
        assert dk <= 1
        q0 = q1
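The helper q_to_k above is the t-digest scale function k(q) = asin(2q - 1)/pi + 1/2, and the assertion checks the size invariant: each centroid may span at most 1/delta of the k-scale. Since dk/dq = 1/(pi * sqrt(q * (1 - q))), centroids near the tails are forced to hold far fewer points than those near the median. A small numeric illustration of that bound (our own sketch, not part of the test suite):

from math import pi, sqrt

delta = 300
for q in [0.5, 0.1, 0.01, 0.001]:
    # approximate quantile width allowed for a centroid centered near q:
    # dq ~ (1/delta) / (dk/dq) = pi * sqrt(q * (1 - q)) / delta
    width = pi * sqrt(q * (1 - q)) / delta
    print('q=%.3f  max centroid width ~ %.1e' % (q, width))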
Example #7
def test_serialization():
    digest = TDigest()
    for i in range(100):
        digest.add(random.random())

    digest2 = pickle.loads(pickle.dumps(digest))

    assert len(digest) == len(digest2)
    assert len(digest.centroids) == len(digest2.centroids)
    for c1, c2 in zip(digest.centroids, digest2.centroids):
        assert c1.mean == c2.mean
        assert c1.count == c2.count

    for q in range(10000):
        assert digest.quantile(q / 10000.) == digest2.quantile(q / 10000.)
        assert digest.cdf(q / 10000.) == digest2.cdf(q / 10000.)
Example #8
def test_repeated_values():
    digest = TDigest()
    data = [rint(random.uniform(0, 1) * 10) / 10. for _ in range(10000)]

    for d in data:
        digest.add(d)

    assert len(digest.centroids) < 10 * 1000.
    for i in range(10):
        z = i / 10.
        for delta in [0.01, 0.02, 0.03, 0.07, 0.08, 0.09]:
            q = z + delta
            cdf = digest.cdf(q)
            assert abs(z + 0.05 - cdf) < 0.02

            estimate = digest.quantile(q)
            assert abs(rint(q * 10) / 10. - estimate) < 0.001
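rint() is another helper the snippets leave undefined; from its use it rounds to the nearest integer, presumably numpy.rint or an equivalent. A pure-Python stand-in (an assumption; tie-breaking never matters for the inputs above):

def rint(x):
    return int(round(x))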
Example #9
def test_few_values():
    digest = TDigest()
    length = random.randint(1, 10)
    values = []
    for i in range(length):
        if i == 0 or random.random() < 0.5:
            value = random.random() * 100
        else:
            value = values[-1]

        digest.add(value)
        values.append(value)

    values = sorted(values)
    assert len(digest.centroids) == len(values)
    for q in [0, 1e-10, random.random(), 0.5, 1 - 1e-10, 1]:
        q1 = quantile(values, q)
        q2 = digest.quantile(q)
        assert abs(q1 - q2) < 0.03
Example #10
def test_merge():
    for parts in [2, 5, 10, 20, 50, 100]:
        data = []
        digest = TDigest()
        subs = [TDigest() for _ in range(parts)]
        cnt = [0] * parts

        for i in range(10000):
            x = random.random()
            data.append(x)
            digest.add(x)
            subs[i % parts].add(x)
            cnt[i % parts] += 1

        digest.compress()
        data = sorted(data)

        k = 0
        for i, d in enumerate(subs):
            assert cnt[i] == len(d)
            k2 = sum(c.count for c in d.centroids)
            assert cnt[i] == k2
            k += k2

        assert k == len(data)

        digest2 = reduce(lambda x, y: x + y, subs)

        for q in [0.001, 0.01, 0.1, 0.2, 0.3, 0.5]:
            z = quantile(data, q)
            e2 = digest2.quantile(q) - z

            assert abs(e2) / q < 0.3
            assert abs(e2) < 0.015

        for q in [0.001, 0.01, 0.1, 0.2, 0.3, 0.5]:
            z = cdf(data, q)
            e2 = digest2.cdf(q) - z

            assert abs(e2) / q < 0.3
            assert abs(e2) < 0.015
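The merge test also leans on an undefined reference cdf(data, x). A minimal sketch consistent with the tolerances above (the half-weight treatment of ties is our assumption, following the usual convention in t-digest reference tests):

def cdf(sorted_values, x):
    # fraction of points below x, counting points equal to x as half
    below = sum(1 for v in sorted_values if v < x)
    equal = sum(1 for v in sorted_values if v == x)
    return (below + 0.5 * equal) / len(sorted_values)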
Example #11
def test_three_point_example():
    digest = TDigest()
    x0 = 0.18615591526031494
    x1 = 0.4241943657398224
    x2 = 0.8813006281852722
    digest.add(x0)
    digest.add(x1)
    digest.add(x2)

    p10 = digest.quantile(0.1)
    p50 = digest.quantile(0.5)
    p90 = digest.quantile(0.9)
    p95 = digest.quantile(0.95)
    p99 = digest.quantile(0.99)

    assert p10 <= p50
    assert p50 <= p90
    assert p90 <= p95
    assert p95 <= p99

    assert x0 == p10
    assert x2 == p90
Example #12
def test_nan():
    digest = TDigest()
    iters = random.randint(0, 10)
    for i in range(iters):
        digest.add(random.random(), 1 + random.randint(0, 10))

    try:
        if random.random() < 0.5:
            digest.add(float('nan'))
        else:
            digest.add(float('nan'), 1)

        assert False
    except ValueError:
        pass
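The behaviour this test pins down: add() must reject NaN up front, because a NaN value would silently poison every centroid mean it is merged into. The implied guard is a one-liner at the top of add() (a sketch, not the actual source):

import math

def _reject_nan(value):
    if math.isnan(value):
        raise ValueError('cannot add NaN to a t-digest')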
Example #13
def test_single_value():
    digest = TDigest()
    value = random.random() * 1000
    digest.add(value)
    for q in [0, random.random(), 1]:
        assert abs(value - digest.quantile(q)) < 1e-3
Example #14
def test_small_count_quantile():
    digest = TDigest(200)
    for d in [15.0, 20.0, 32.0, 60.0]:
        digest.add(d)

    assert abs(digest.quantile(0.4) - 21.2) < 1e-10
Example #15
class TaskSet(object):
    """ A TaskSet runs a set of tasks of a Stage with retry.

        - Task_id seen by TaskSet not include task.num_try
        - Each task try four times before abort.
        - Enlarge task.mem if fail for OOM.
        - Retry for lagging tasks.
    """
    def __init__(self,
                 sched,
                 tasks,
                 cpus=1,
                 mem=100,
                 gpus=0,
                 task_host_manager=None):
        self.start = time.time()
        self.sched = sched
        self.tasks = tasks
        self.id = tasks[0].taskset_id
        self.ttids = set()

        for t in self.tasks:
            t.status = None
            t.num_try = 0
            t.time_used = 0
            t.cpus = cpus
            t.mem = mem
            t.gpus = gpus

        self.launched = [False] * len(tasks)
        self.finished = [False] * len(tasks)
        self.numFailures = [0] * len(tasks)
        self.running_hosts = [[] for _ in range(len(tasks))]
        self.tidToIndex = {}
        self.numTasks = len(tasks)
        self.tasksLaunched = 0
        self.tasksFinished = 0
        self.total_time_used = 0

        self.lastPreferredLaunchTime = time.time()

        self.pendingTasksForHost = {}
        self.pendingTasksWithNoPrefs = []
        self.allPendingTasks = []

        self.reasons = set()
        self.failed = False
        self.causeOfFailure = ''
        self.last_check = 0

        for i in range(len(tasks)):
            self._addPendingTask(i)
        self.host_cache = {}
        self.task_host_manager = task_host_manager if task_host_manager is not None \
            else TaskHostManager()
        self.id_retry_host = {}
        self.task_local_set = set()
        self.mem_digest = TDigest()
        self.mem90 = 0  # TODO: move to stage

    @property
    def taskAverageTime(self):
        if not self.tasksFinished:
            return 10
        return max(self.total_time_used / self.tasksFinished, 5)

    def _addPendingTask(self, i):
        loc = self.tasks[i].preferredLocations()
        if not loc:
            self.pendingTasksWithNoPrefs.append(i)
        else:
            for host in loc:
                self.pendingTasksForHost.setdefault(host, []).append(i)
        self.allPendingTasks.append(i)

    def _getPendingTasksForHostWithCache(self, host):
        tasks = self.host_cache.get(host)
        if tasks:
            return tasks
        else:
            tasks = self._getPendingTasksForHost(host)
            self.host_cache[host] = tasks
            return tasks

    def _getPendingTasksForHost(self, host):
        try:
            h, hs, ips = socket.gethostbyname_ex(host)
        except Exception:
            h, hs, ips = host, [], []
        tasks = sum(
            (self.pendingTasksForHost.get(h, []) for h in [h] + hs + ips), [])
        st = {}
        for t in tasks:
            st[t] = st.get(t, 0) + 1
        ts = sorted(list(st.items()), key=itemgetter(1), reverse=True)
        return [t for t, _ in ts]

    def _findTaskFromList(self, l, host, cpus, mem, gpus):
        for i in l:
            if self.launched[i] or self.finished[i]:
                continue
            if host in self.running_hosts[i]:
                continue
            t = self.tasks[i]
            if self.task_host_manager.task_failed_on_host(t.id, host):
                continue
            if t.cpus <= cpus + 1e-4 and t.mem <= mem and t.gpus <= gpus:
                return i

    def taskOffer(self, host_offers, cpus, mems, gpus):
        prefer_list = []
        for host in host_offers:
            i, o = host_offers[host]
            local_task = self._findTaskFromList(
                self._getPendingTasksForHostWithCache(host), host, cpus[i],
                mems[i], gpus[i])
            if local_task is not None:
                result_tuple = self._try_update_task_offer(
                    local_task, i, o, cpus, mems, gpus)
                if result_tuple is None:
                    continue
                prefer_list.append(result_tuple)
        if prefer_list:
            return prefer_list
        for idx in range(len(self.tasks)):
            if not self.launched[idx] and not self.finished[idx]:
                i, o = self.task_host_manager.offer_choice(
                    self.tasks[idx].id, host_offers, self.running_hosts[idx])
                if i is None:
                    continue
                result_tuple = self._try_update_task_offer(
                    idx, i, o, cpus, mems, gpus)
                if result_tuple:
                    return [result_tuple]
        return []

    def _try_update_task_offer(self, task_idx, i, o, cpus, mem, gpus):
        t = self.tasks[task_idx]
        if t.cpus <= cpus[i] + 1e-4 and t.mem <= mem[i] and t.gpus <= gpus[i]:
            t.status = 'TASK_STAGING'
            t.start = time.time()
            t.host = o.hostname
            t.num_try += 1
            self.id_retry_host[(t.id, t.num_try)] = o.hostname
            logger.debug('Starting task %s on slave %s', t.try_id, o.hostname)
            self.tidToIndex[t.id] = task_idx
            self.launched[task_idx] = True
            self.tasksLaunched += 1
            self.running_hosts[task_idx].append(o.hostname)
            host_set = set(self.tasks[task_idx].preferredLocations())
            if o.hostname in host_set:
                self.task_local_set.add(t.id)
            return i, o, t
        return None

    def statusUpdate(self,
                     task_id,
                     num_try,
                     status,
                     reason=None,
                     result=None,
                     update=None,
                     stats=None):
        logger.debug('taskset status update %s, status %s, reason %s', task_id,
                     status, reason)
        if task_id not in self.tidToIndex:
            logger.error('invalid task_id: %s, status %s, reason %s', task_id,
                         status, reason)
            return
        i = self.tidToIndex[task_id]
        if self.finished[i]:
            if status == 'TASK_FINISHED':
                logger.debug('Task %s is already finished, ignore it', task_id)
            return

        task = self.tasks[i]
        task.status = status
        # during a timeout check, the task may have been marked as not launched
        if not self.launched[i]:
            self.launched[i] = True
            self.tasksLaunched += 1

        if status == 'TASK_FINISHED':
            self._task_finished(task_id, num_try, result, update, stats)
        elif status in ('TASK_LOST', 'TASK_FAILED', 'TASK_KILLED'):
            self._task_lost(task_id, num_try, status, reason)

        task.start = time.time()
        if stats:
            self.mem_digest.add(stats.bytes_max_rss / (1024.**2))

    def progress(self, ending=''):
        n = self.numTasks
        ratio = self.tasksFinished * 1. / n
        bar = make_progress_bar(ratio)
        if self.tasksFinished:
            elapsed = time.time() - self.start
            avg = self.total_time_used / self.tasksFinished
            eta = (n - self.tasksFinished) * elapsed / self.tasksFinished
            m, s = divmod(int(eta), 60)
            h, m = divmod(m, 60)

            tmpl = 'taskset:%4s {{GREEN}}%s{{RESET}}%5.1f%% (% {width}s/% {width}s) ETA:% 2d:%02d:%02d AVG:%.1fs\x1b[K%s'
            fmt = tmpl.format(width=int(math.log10(self.numTasks)) + 1)

            msg = fmt % (self.id, bar, ratio * 100, self.tasksFinished, n, h,
                         m, s, avg, ending)
            msg = msg.ljust(80)
            logger.info(msg)
        else:

            tmpl = 'taskset:%4s {{GREEN}}%s{{RESET}}%5.1f%% (% {width}s/% {width}s) ETA:--:--:-- AVG:N/A\x1b[K%s'
            fmt = tmpl.format(width=int(math.log10(self.numTasks)) + 1)

            msg = fmt % (self.id, bar, ratio * 100, self.tasksFinished, n,
                         ending)
            msg = msg.ljust(80)
            logger.info(msg)

    def _task_finished(self, task_id, num_try, result, update, stats):
        i = self.tidToIndex[task_id]
        self.finished[i] = True
        self.tasksFinished += 1
        task = self.tasks[i]
        hostname = self.id_retry_host[(task.id, num_try)] \
            if (task.id, num_try) in self.id_retry_host else task.host
        task.time_used += time.time() - task.start
        self.total_time_used += task.time_used
        if getattr(self.sched, 'color', False):
            title = 'taskset %s: task %s finished in %.1fs (%d/%d)     ' % (
                self.id, task_id, task.time_used, self.tasksFinished,
                self.numTasks)
            msg = '\x1b]2;%s\x07\x1b[1A' % title
            logger.info(msg)

        from dpark.schedule import Success
        self.sched.taskEnded(task, Success(), result, update, stats)
        self.running_hosts[i] = []
        self.task_host_manager.task_succeed(task.id, hostname, Success())

        for t in range(task.num_try):
            if t + 1 != num_try:
                self.sched.killTask(task.id, t + 1)

        if self.tasksFinished == self.numTasks:
            ts = [t.time_used for t in self.tasks]
            num_try = [t.num_try for t in self.tasks]
            elapsed = time.time() - self.start
            logger.info(
                'taskset %s finished in %.1fs: min=%.1fs, '
                'avg=%.1fs, max=%.1fs, maxtry=%d, speedup=%.1f, local=%.1f%%',
                self.id, elapsed, min(ts),
                sum(ts) / len(ts), max(ts), max(num_try),
                self.total_time_used / elapsed,
                len(self.task_local_set) * 100. / len(self.tasks))
            self.sched.tasksetFinished(self)

    def _task_lost(self, task_id, num_try, status, reason):
        index = self.tidToIndex[task_id]

        from dpark.schedule import FetchFailed
        if isinstance(reason, FetchFailed) and self.numFailures[index] >= 1:
            logger.warning('Cancel task %s after fetch fail twice from %s',
                           task_id, reason.serverUri)
            self.sched.taskEnded(self.tasks[index], reason, None, None)
            # cancel tasks
            if not self.finished[index]:
                self.finished[index] = True
                self.tasksFinished += 1
            for i in range(len(self.finished)):
                if not self.launched[i]:
                    self.launched[i] = True
                    self.tasksLaunched += 1
                    self.finished[i] = True
                    self.tasksFinished += 1
            if self.tasksFinished == self.numTasks:
                self.sched.tasksetFinished(self)  # cancel taskset
            return

        task = self.tasks[index]
        hostname = self.id_retry_host[(task.id, num_try)] \
            if (task.id, num_try) in self.id_retry_host else task.host

        if status == 'TASK_KILLED' or str(reason).startswith(
                'Memory limit exceeded:'):
            task.mem = min(task.mem * 2, MAX_TASK_MEMORY)
            logger.info("task %s oom, enlarge memory limit to %d, origin %d",
                        task.id, task.mem, task.rdd.mem)

            mem90 = self.mem_digest.quantile(0.9)
            if not math.isnan(mem90):
                mem90 = int(mem90)
                if mem90 > self.mem90:
                    logger.info(
                        "enlarge memory limit of remaining task from >%d to >%d (mem90)",
                        self.mem90, mem90)
                    self.mem90 = mem90
                    for i, t in enumerate(self.tasks):
                        if not self.launched[i]:
                            t.mem = max(mem90, t.mem)

        elif status == 'TASK_FAILED':
            _logger = logger.error if self.numFailures[index] == MAX_TASK_FAILURES \
                else logger.warning
            if reason not in self.reasons:
                _logger('task %s failed @ %s: %s : %s', task.id, hostname,
                        task, reason)
                self.reasons.add(reason)
            else:
                _logger('task %s failed @ %s: %s', task.id, hostname, task)

        elif status == 'TASK_LOST':
            logger.warning('Lost Task %s try %s at %s, reason %s', task_id,
                           num_try, task.host, reason)

        self.numFailures[index] += 1
        if self.numFailures[index] > MAX_TASK_FAILURES:
            logger.error('Task %s failed more than %d times; aborting taskset',
                         self.tasks[index].id, MAX_TASK_FAILURES)
            self._abort('Task %s failed more than %d times' %
                        (self.tasks[index].id, MAX_TASK_FAILURES))
        self.task_host_manager.task_failed(task.id, hostname, reason)
        self.launched[index] = False
        if self.tasksLaunched == self.numTasks:
            self.sched.requestMoreResources()
        self.running_hosts[index] = []
        self.tasksLaunched -= 1

    def check_task_timeout(self):
        now = time.time()
        if self.last_check + 5 > now:
            return False
        self.last_check = now

        n = self.launched.count(True)
        if n != self.tasksLaunched:
            logger.warning('bug: tasksLaunched(%d) != %d', self.tasksLaunched,
                           n)
            self.tasksLaunched = n

        for i in range(self.numTasks):
            task = self.tasks[i]
            if (self.launched[i] and task.status == 'TASK_STAGING'
                    and task.start + WAIT_FOR_RUNNING < now):
                logger.info('task %s timeout %.1f (at %s), re-assign it',
                            task.id, now - task.start, task.host)
                self.launched[i] = False
                self.tasksLaunched -= 1

        if self.tasksFinished > self.numTasks * 2.0 / 3:
            scale = 1.0 * self.numTasks / self.tasksFinished
            avg = max(self.taskAverageTime, 10)
            tasks = sorted((task.start, i, task)
                           for i, task in enumerate(self.tasks)
                           if self.launched[i] and not self.finished[i])
            for _t, idx, task in tasks:
                time_used = now - task.start
                if time_used > avg * (2**task.num_try) * scale:
                    # re-submit timeout task
                    if task.num_try <= MAX_TASK_FAILURES:
                        logger.info(
                            're-submit task %s for timeout %.1f, '
                            'try %d', task.id, time_used, task.num_try)
                        task.time_used += time_used
                        task.start = now
                        self.launched[idx] = False
                        self.tasksLaunched -= 1
                    else:
                        logger.error('task %s timeout, aborting taskset %s',
                                     task, self.id)
                        self._abort('task %s timeout' % task)
                else:
                    break
        return self.tasksLaunched < n

    def _abort(self, message):
        logger.error('abort the taskset: %s', message)
        tasks = ' '.join(
            str(i) for i in range(len(self.finished)) if not self.finished[i])
        logger.error('not finished tasks: %s', tasks)
        self.failed = True
        self.causeOfFailure = message
        self.sched.tasksetFinished(self)
        self.sched.abort()
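One detail worth pulling out of the class above: mem_digest collects each finished task's peak RSS (statusUpdate feeds it stats.bytes_max_rss converted to MB), and after an OOM the 90th-percentile value is used to pre-emptively raise the memory limit of tasks that have not launched yet. The same pattern reduced to a standalone sketch (attribute names follow the code above; the function itself is ours):

import math

def raise_pending_mem_limits(mem_digest, tasks, launched, current_mem90):
    """Return the new mem90 after bumping unlaunched tasks to the 90th percentile."""
    mem90 = mem_digest.quantile(0.9)
    if math.isnan(mem90):  # the digest is empty: nothing has finished yet
        return current_mem90
    mem90 = int(mem90)
    if mem90 > current_mem90:
        for i, t in enumerate(tasks):
            if not launched[i]:
                t.mem = max(mem90, t.mem)
        return mem90
    return current_mem90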
Example #16
class TaskSet(object):
    """ A TaskSet runs a set of tasks of a Stage with retry.

        - Task_id seen by TaskSet not include task.num_try
        - Each task try four times before abort.
        - Enlarge task.mem if fail for OOM.
        - Retry for lagging tasks.
    """

    def __init__(self, sched, tasks, cpus=1, mem=100, gpus=0,
                 task_host_manager=None):
        self.start_time = time.time()
        self.sched = sched
        self.tasks = tasks
        self.id = tasks[0].taskset_id
        self.ttids = set()

        for t in self.tasks:
            t.status = None
            t.num_try = 0
            t.time_used = 0
            t.cpus = cpus
            t.mem = mem
            t.gpus = gpus

        self.launched = [False] * len(tasks)
        self.finished = [False] * len(tasks)
        self.numFailures = [0] * len(tasks)
        self.running_hosts = [[] for _ in range(len(tasks))]
        self.tidToIndex = {}
        self.counter = TaskCounter(len(tasks))

        self.total_time_used = 0
        self.max_task_time = 0

        self.lastPreferredLaunchTime = time.time()

        self.pendingTasksForHost = {}
        self.pendingTasksWithNoPrefs = []
        self.allPendingTasks = []

        self.reasons = set()
        self.failed = False
        self.causeOfFailure = ''
        self.last_check = 0

        for i in range(len(tasks)):
            self._addPendingTask(i)
        self.host_cache = {}
        self.task_host_manager = task_host_manager if task_host_manager is not None \
            else TaskHostManager()
        self.id_retry_host = {}
        self.task_local_set = set()
        self.mem_digest = TDigest()
        self.max_stage_time = 0
        self.mem90 = 0  # TODO: move to stage

    @property
    def taskAverageTime(self):
        if not self.counter.finished:
            return 10
        return max(self.total_time_used / self.counter.finished, 5)

    def _addPendingTask(self, i):
        loc = self.tasks[i].preferredLocations()
        if not loc:
            self.pendingTasksWithNoPrefs.append(i)
        else:
            for host in loc:
                self.pendingTasksForHost.setdefault(host, []).append(i)
        self.allPendingTasks.append(i)

    def _getPendingTasksForHostWithCache(self, host):
        tasks = self.host_cache.get(host)
        if tasks:
            return tasks
        else:
            tasks = self._getPendingTasksForHost(host)
            self.host_cache[host] = tasks
            return tasks

    def _getPendingTasksForHost(self, host):
        try:
            h, hs, ips = socket.gethostbyname_ex(host)
        except Exception:
            h, hs, ips = host, [], []
        tasks = sum((self.pendingTasksForHost.get(h, [])
                     for h in [h] + hs + ips), [])
        st = {}
        for t in tasks:
            st[t] = st.get(t, 0) + 1
        ts = sorted(list(st.items()), key=itemgetter(1), reverse=True)
        return [t for t, _ in ts]

    def _findTaskFromList(self, l, host, cpus, mem, gpus):
        for i in l:
            if self.launched[i] or self.finished[i]:
                continue
            if host in self.running_hosts[i]:
                continue
            t = self.tasks[i]
            if self.task_host_manager.task_failed_on_host(t.id, host):
                continue
            if t.cpus <= cpus + 1e-4 and t.mem <= mem and t.gpus <= gpus:
                return i

    def taskOffer(self, host_offers, cpus, mems, gpus):
        prefer_list = []
        for host in host_offers:
            i, o = host_offers[host]
            local_task = self._findTaskFromList(
                self._getPendingTasksForHostWithCache(host), host,
                cpus[i], mems[i], gpus[i])
            if local_task is not None:
                result_tuple = self._try_update_task_offer(local_task, i, o, cpus, mems, gpus)
                if result_tuple is None:
                    continue
                prefer_list.append(result_tuple)
        if prefer_list:
            return prefer_list
        for idx in range(len(self.tasks)):
            if not self.launched[idx] and not self.finished[idx]:
                i, o = self.task_host_manager.offer_choice(self.tasks[idx].id, host_offers,
                                                           self.running_hosts[idx])
                if i is None:
                    continue
                result_tuple = self._try_update_task_offer(idx, i, o, cpus, mems, gpus)
                if result_tuple:
                    return [result_tuple]
        return []

    def _try_update_task_offer(self, task_idx, i, o, cpus, mem, gpus):
        t = self.tasks[task_idx]
        if t.cpus <= cpus[i] + 1e-4 and t.mem <= mem[i] and t.gpus <= gpus[i]:
            t.status = TaskState.staging
            t.host = o.hostname
            t.try_next()
            self.id_retry_host[(t.id, t.num_try)] = o.hostname
            logger.debug('Starting task %s on slave %s',
                         t.try_id, o.hostname)
            self.tidToIndex[t.id] = task_idx
            self.launched[task_idx] = True
            self.counter.launched += 1
            self.running_hosts[task_idx].append(o.hostname)
            host_set = set(self.tasks[task_idx].preferredLocations())
            if o.hostname in host_set:
                self.task_local_set.add(t.id)
            return i, o, t
        return None

    def statusUpdate(self, task_id, num_try, status, reason=None, message=None,
                     result=None, update=None, stats=None):
        logger.debug('taskset status update %s, status %s, reason %s', task_id, status, reason)
        if task_id not in self.tidToIndex:
            logger.error('invalid task_id: %s, status %s, reason %s', task_id, status, reason)
            return
        i = self.tidToIndex[task_id]
        task = self.tasks[i]
        task.update_status(status, num_try)

        if self.finished[i]:
            if status == TaskState.finished:
                logger.debug('Task %s is already finished, ignore it', task_id)
            return

        # during a timeout check, the task may have been marked as not launched
        if not self.launched[i]:
            self.launched[i] = True
            self.counter.launched += 1

        if status == TaskState.running:
            task.start_time = time.time()
            self.max_stage_time = max(self.max_stage_time, task.start_time - task.stage_time)
        elif status == TaskState.finished:
            if stats:
                self.mem_digest.add(stats.bytes_max_rss / (1024. ** 2))
            if task.tries[num_try].reason in (TaskReason.run_timeout, TaskReason.stage_timeout):
                logger.warning("task timeout works: try %s finshed. History: %s",
                               num_try, ". ".join(map(str, task.tries.values())))

            self._task_finished(task_id, num_try, result, update, stats)
        else:  # failed, killed, lost, error
            self._task_lost(task_id, num_try, status, reason, message, exception=result)

    def progress(self, ending=''):
        n = self.counter.n
        ratio = self.counter.finished * 1. / n
        bar = make_progress_bar(ratio)
        if self.counter.finished:
            elapsed = time.time() - self.start_time
            avg = self.total_time_used / self.counter.finished
            eta = (n - self.counter.finished) * elapsed / self.counter.finished
            m, s = divmod(int(eta), 60)
            h, m = divmod(m, 60)

            tmpl = 'taskset:%4s {{GREEN}}%s{{RESET}}%5.1f%% (% {width}s/% {width}s) ETA:% 2d:%02d:%02d AVG:%.1fs\x1b[K%s'
            fmt = tmpl.format(width=int(math.log10(self.counter.n)) + 1)

            msg = fmt % (
                self.id, bar, ratio * 100, self.counter.finished, n, h, m, s,
                avg, ending
            )
            msg = msg.ljust(80)
            logger.info(msg)
        else:

            tmpl = 'taskset:%4s {{GREEN}}%s{{RESET}}%5.1f%% (% {width}s/% {width}s) ETA:--:--:-- AVG:N/A\x1b[K%s'
            fmt = tmpl.format(width=int(math.log10(self.counter.n)) + 1)

            msg = fmt % (self.id, bar, ratio * 100, self.counter.finished, n, ending)
            msg = msg.ljust(80)
            logger.info(msg)

    def _task_finished(self, task_id, num_try, result, update, stats):
        i = self.tidToIndex[task_id]
        self.finished[i] = True
        self.counter.finished += 1
        task = self.tasks[i]
        hostname = self.id_retry_host[(task.id, num_try)] \
            if (task.id, num_try) in self.id_retry_host else task.host
        task.time_used += time.time() - task.start_time
        self.total_time_used += task.time_used
        self.max_task_time = max(self.max_task_time, task.time_used)
        if getattr(self.sched, 'color', False):
            title = 'taskset %s: task %s finished in %.1fs (%d/%d)     ' % (
                self.id, task_id, task.time_used, self.counter.finished, self.counter.n)
            msg = '\x1b]2;%s\x07\x1b[1A' % title
            logger.info(msg)

        self.sched.taskEnded(task, TaskEndReason.success, result, update, stats)
        self.running_hosts[i] = []
        self.task_host_manager.task_succeed(task.id, hostname,
                                            TaskEndReason.success)

        for t in range(task.num_try):
            if t + 1 != num_try:
                self.sched.killTask(task.id, t + 1)

        if self.counter.finished == self.counter.n:
            ts = [t.time_used for t in self.tasks]
            num_try = [t.num_try for t in self.tasks]
            elapsed = time.time() - self.start_time
            logger.info('taskset %s finished in %.1fs: min=%.1fs, '
                        'avg=%.1fs, max=%.1fs, maxtry=%d, speedup=%.1f, local=%.1f%%',
                        self.id, elapsed, min(ts), sum(ts) / len(ts), max(ts),
                        max(num_try), self.total_time_used / elapsed,
                        len(self.task_local_set) * 100. / len(self.tasks)
                        )
            self.sched.tasksetFinished(self)

    def _task_lost(self, task_id, num_try, status, reason, message, exception=None):
        index = self.tidToIndex[task_id]
        task = self.tasks[index]

        if reason == TaskEndReason.fetch_failed and self.numFailures[index] >= 1:
            self.counter.fail_fetch += 1
            logger.warning('Cancel task %s after fetch fail twice from %s',
                           task_id, exception.serverUri)
            self.sched.taskEnded(self.tasks[index], reason, exception, None)
            # cancel tasks
            if not self.finished[index]:
                self.finished[index] = True
                self.counter.finished += 1
            for i in range(len(self.finished)):
                if not self.launched[i]:
                    self.launched[i] = True
                    self.counter.launched += 1
                    self.finished[i] = True
                    self.counter.finished += 1
            if self.counter.finished == self.counter.n:
                self.sched.tasksetFinished(self)  # cancel taskset
            return

        hostname = self.id_retry_host[(task.id, num_try)] \
            if (task.id, num_try) in self.id_retry_host else task.host

        abort = (self.numFailures[index] >= MAX_TASK_FAILURES)

        if TaskEndReason.maybe_oom(reason):
            self.counter.fail_oom += 1
            task.mem = min(task.mem * 2, MAX_TASK_MEMORY)
            logger.info("task %s oom, enlarge memory limit to %d, origin %d", task.id, task.mem, task.rdd.mem)

            mem90 = self.mem_digest.quantile(0.9)
            if not math.isnan(mem90):
                mem90 = int(mem90)
                if mem90 > self.mem90:
                    logger.info("enlarge memory limit of remaining task from >%d to >%d (mem90)", self.mem90, mem90)
                    self.mem90 = mem90
                    for i, t in enumerate(self.tasks):
                        if not self.launched[i]:
                            t.mem = max(mem90, t.mem)

        else:
            _logger = logger.error if abort else logger.warning

            err_msg_simple = '{} id: {}, host: {}, reason: {}'.format(status, task.id, hostname, reason)
            err_msg = "{} message: {}".format(err_msg_simple, message)

            if status == TaskState.failed:
                if reason is not None:  # for tests with master=local
                    if reason.startswith("FATAL_EXCEPTION"):
                        self._abort('Job abort without retry for {}.'.format(reason))
                if reason not in self.reasons:
                    self.reasons.add(reason)
                elif not abort:
                    err_msg = err_msg_simple
            _logger(err_msg)

        self.counter.fail_all += 1
        self.numFailures[index] += 1

        if abort:
            self._abort('Task %s failed more than %d times' % (self.tasks[index].id, MAX_TASK_FAILURES))

        task.reason_next = "fail"

        self.task_host_manager.task_failed(task.id, hostname, reason)
        self.launched[index] = False
        if self.counter.launched == self.counter.n:
            self.sched.requestMoreResources()
        self.running_hosts[index] = []
        self.counter.launched -= 1

    def check_task_timeout(self):
        """In lock, so be fast!"""

        now = time.time()
        if self.last_check + 5 > now:
            return False
        self.last_check = now

        n = self.launched.count(True)
        if n != self.counter.launched:
            logger.warning(
                'bug: counter.launched(%d) != %d',
                self.counter.launched,
                n)
            self.counter.launched = n

        # staged but not run for too long
        # mesos may be busy.
        num_resubmit = 0
        for i in range(self.counter.n):
            task = self.tasks[i]
            if (self.launched[i] and task.status == TaskState.staging
                    and task.stage_time + self.max_stage_time + WAIT_FOR_RUNNING < now):
                logger.warning('task %s staging timeout %.1f (at %s), re-assign it',
                               task.id, now - task.stage_time, task.host)
                self.counter.fail_staging_timeout += 1
                task.reason_next = TaskReason.stage_timeout
                self.launched[i] = False
                self.counter.launched -= 1
                num_resubmit += 1
                if num_resubmit > 3:
                    break

        # running for too long
        num_resubmit = 0
        if self.counter.finished > self.counter.n * 0.8:
            scale = 1.0 * self.counter.n / self.counter.finished
            tasks = sorted((task.start_time, i, task)
                           for i, task in enumerate(self.tasks)
                           if self.launched[i] and not self.finished[i] and task.status == TaskState.running)
            for _t, idx, task in tasks:
                time_used = now - task.start_time
                if time_used > self.max_task_time * (4 ** task.num_try) * scale:  # num_try starts from 1
                    # re-submit timeout task
                    self.counter.fail_run_timeout += 1
                    if task.num_try <= MAX_TASK_FAILURES:
                        logger.info('re-submit task %s for run timeout %.1f, max finished = %d, try %d',
                                    task.id, time_used, int(self.max_task_time), task.num_try)
                        task.time_used += time_used
                        task.stage_time = 0
                        task.start_time = 0
                        self.launched[idx] = False
                        self.counter.launched -= 1
                        task.reason_next = TaskReason.run_timeout
                    else:
                        logger.error('task %s timeout, aborting taskset %s',
                                     task, self.id)
                        self._abort('task %s timeout' % task)
                else:
                    break
                num_resubmit += 1
                if num_resubmit > 3:
                    break
        return self.counter.launched < n

    def _abort(self, message):
        logger.error('abort the taskset: %s', message)
        tasks = ' '.join(str(i) for i in range(len(self.finished))
                         if not self.finished[i])
        logger.error('not finished tasks: %s', tasks)
        self.failed = True
        self.causeOfFailure = message
        self.sched.tasksetFinished(self)
        self.sched.abort()