Example #1
def test_sorted():
    digest = TDigest()
    for i in range(10000):
        digest.add(random.random(), 1 + random.randint(0, 10))

    prev = None
    for c in digest.centroids:
        assert prev is None or prev.mean <= c.mean
        prev = c
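
These tests exercise the TDigest implementation that ships with dpark. As a minimal sketch of the API they rely on (the import path below is an assumption based on the dpark project layout; the extracted snippets omit their imports):

# Minimal usage sketch; dpark.utils.tdigest is an assumed import path.
import random

from dpark.utils.tdigest import TDigest

digest = TDigest()               # default compression
for _ in range(1000):
    digest.add(random.random())  # add(value) or add(value, weight)

print(digest.quantile(0.5))      # approximate median
print(digest.cdf(0.5))           # approximate CDF at x = 0.5
print(len(digest))               # total weight added so far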
Example #2
def test_fill():
    def q_to_k(q):
        return asin(2 * min(1, q) - 1) / pi + 0.5

    delta = 300
    digest = TDigest(delta)
    for i in range(100000):
        digest.add(random.gauss(0, 1))

    q0 = 0.
    for c in digest.centroids:
        q1 = q0 + float(c.count) / len(digest)
        dk = delta * (q_to_k(q1) - q_to_k(q0))
        assert dk <= 1
        q0 = q1
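
The q_to_k above is the t-digest scale function k(q) = asin(2q - 1)/pi + 1/2 (scaled by delta); the assertion dk <= 1 says that no centroid may span more than one k-unit, which is exactly what keeps centroids small near the tails. A short sketch of how unevenly k is spent, under the same definition:

# Sketch: asin(2q - 1) changes fastest as q -> 0 or q -> 1, so a
# "dk <= 1 per centroid" budget forces very fine resolution in the tails.
from math import asin, pi

def q_to_k(q):
    return asin(2 * min(1, q) - 1) / pi + 0.5

delta = 300
for q0, q1 in [(0.0, 0.001), (0.4995, 0.5005), (0.999, 1.0)]:
    print(q0, q1, delta * (q_to_k(q1) - q_to_k(q0)))  # tails >> middle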
Example #3
def test_monocity():
    digest = TDigest()
    for i in range(10000):
        digest.add(random.random())

    for i in range(int(1e4) - 1):
        q1 = i * 1e-4
        q2 = (i + 1) * 1e-4
        assert digest.quantile(q1) <= digest.quantile(q2)
        assert digest.cdf(q1) <= digest.cdf(q2)
Example #4
def test_extreme():
    digest = TDigest()
    digest.add(10, 3)
    digest.add(20, 1)
    digest.add(40, 5)
    values = [5., 10., 15., 20., 30., 35., 40., 45., 50.]
    for q in [1.5 / 9, 3.5 / 9, 6.5 / 9]:
        assert abs(quantile(values, q) - digest.quantile(q)) < 0.01
Example #5
def test_few_values():
    digest = TDigest()
    length = random.randint(1, 10)
    values = []
    for i in range(length):
        if i == 0 or random.random() < 0.5:
            value = random.random() * 100
        else:
            value = values[-1]

        digest.add(value)
        values.append(value)

    values = sorted(values)
    assert len(digest.centroids) == len(values)
    for q in [0, 1e-10, random.random(), 0.5, 1 - 1e-10, 1]:
        q1 = quantile(values, q)
        q2 = digest.quantile(q)
        assert abs(q1 - q2) < 0.03
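
Several tests compare the digest against quantile and cdf reference helpers whose definitions are not included in these extracts. Judging from the expected values in Example #4 above (the probed q values fall exactly on centroid midpoints), quantile appears to follow the midpoint convention, where the i-th of n sorted values sits at quantile (i + 0.5)/n. A hypothetical reconstruction:

# Hypothetical reference helpers, inferred from how the tests use them;
# the real definitions live in the test module and are not shown here.
import bisect

def quantile(sorted_values, q):
    # midpoint convention: value i sits at quantile (i + 0.5) / n
    n = len(sorted_values)
    if n == 0:
        return float('nan')
    pos = q * n - 0.5
    if pos <= 0:
        return sorted_values[0]
    if pos >= n - 1:
        return sorted_values[-1]
    lo = int(pos)
    frac = pos - lo
    return sorted_values[lo] * (1 - frac) + sorted_values[lo + 1] * frac

def cdf(sorted_values, x):
    # fraction of the distribution at or below x, splitting ties
    n = len(sorted_values)
    lo = bisect.bisect_left(sorted_values, x)
    hi = bisect.bisect_right(sorted_values, x)
    return (lo + hi) / (2.0 * n)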
Example #6
def test_narrow_normal():
    digest = TDigest()
    data = []
    for i in range(10000):
        if random.random() < 0.5:
            data.append(random.gauss(0, 1e-5))
        else:
            data.append(random.uniform(-1, 1))

    _test_distribution(digest, data, [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999],
                       100)
Example #7
def test_nan():
    digest = TDigest()
    iters = random.randint(0, 10)
    for i in range(iters):
        digest.add(random.random(), 1 + random.randint(0, 10))

    try:
        if random.random() < 0.5:
            digest.add(float('nan'))
        else:
            digest.add(float('nan'), 1)

        assert False
    except ValueError:
        pass
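
add() is expected to reject NaN with a ValueError whether or not a weight is supplied. Under pytest, the try/assert False/except pattern above has a more idiomatic equivalent; a sketch, assuming the pytest runner:

# Equivalent check with pytest.raises.
import pytest

from dpark.utils.tdigest import TDigest  # assumed import path, as above

def test_nan_idiomatic():
    digest = TDigest()
    with pytest.raises(ValueError):
        digest.add(float('nan'))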
Example #8
def test_more_than_2billion_values():
    digest = TDigest()
    for i in range(1000):
        digest.add(random.random())

    for i in range(10):
        digest.add(random.random(), 1 << 28)

    assert len(digest) == 1000 + 10 * (1 << 28)
    prev = None
    for q in sorted([0, 0.1, 0.5, 0.9, 1, random.random()]):
        v = digest.quantile(q)
        assert prev is None or v >= prev
        prev = v
Example #9
def test_three_point_example():
    digest = TDigest()
    x0 = 0.18615591526031494
    x1 = 0.4241943657398224
    x2 = 0.8813006281852722
    digest.add(x0)
    digest.add(x1)
    digest.add(x2)

    p10 = digest.quantile(0.1)
    p50 = digest.quantile(0.5)
    p90 = digest.quantile(0.9)
    p95 = digest.quantile(0.95)
    p99 = digest.quantile(0.99)

    assert p10 <= p50
    assert p50 <= p90
    assert p90 <= p95
    assert p95 <= p99

    assert x0 == p10
    assert x2 == p90
Example #10
def test_serialization():
    digest = TDigest()
    for i in range(100):
        digest.add(random.random())

    digest2 = pickle.loads(pickle.dumps(digest))

    assert len(digest) == len(digest2)
    assert len(digest.centroids) == len(digest2.centroids)
    for c1, c2 in zip(digest.centroids, digest2.centroids):
        assert c1.mean == c2.mean
        assert c1.count == c2.count

    for q in range(10000):
        assert digest.quantile(q / 10000.) == digest2.quantile(q / 10000.)
        assert digest.cdf(q / 10000.) == digest2.cdf(q / 10000.)
Example #11
def test_repeated_values():
    digest = TDigest()
    data = [rint(random.uniform(0, 1) * 10) / 10. for _ in range(10000)]

    for d in data:
        digest.add(d)

    assert len(digest.centroids) < 10 * 1000.
    for i in range(10):
        z = i / 10.
        for delta in [0.01, 0.02, 0.03, 0.07, 0.08, 0.09]:
            q = z + delta
            cdf = digest.cdf(q)
            assert abs(z + 0.05 - cdf) < 0.02

            estimate = digest.quantile(q)
            assert abs(rint(q * 10) / 10. - estimate) < 0.001
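
rint is another helper the extracts never define; given that the data is snapped onto a 0.1 grid and the quantile estimate is compared against rint(q * 10) / 10., it is presumably round-to-nearest-integer. A hypothetical definition:

# Hypothetical helper, inferred from usage; the real definition is not
# shown in the extracted snippets.
def rint(x):
    return int(round(x))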
Example #12
def test_merge():
    for parts in [2, 5, 10, 20, 50, 100]:
        data = []
        digest = TDigest()
        subs = [TDigest() for _ in range(parts)]
        cnt = [0] * parts

        for i in range(10000):
            x = random.random()
            data.append(x)
            digest.add(x)
            subs[i % parts].add(x)
            cnt[i % parts] += 1

        digest.compress()
        data = sorted(data)

        k = 0
        for i, d in enumerate(subs):
            assert cnt[i] == len(d)
            k2 = sum(c.count for c in d.centroids)
            assert cnt[i] == k2
            k += k2

        assert k == len(data)

        digest2 = reduce(lambda x, y: x + y, subs)

        for q in [0.001, 0.01, 0.1, 0.2, 0.3, 0.5]:
            z = quantile(data, q)
            e2 = digest2.quantile(q) - z

            assert abs(e2) / q < 0.3
            assert abs(e2) < 0.015

        for q in [0.001, 0.01, 0.1, 0.2, 0.3, 0.5]:
            z = cdf(data, q)
            e2 = digest2.cdf(q) - z

            assert abs(e2) / q < 0.3
            assert abs(e2) < 0.015
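
The merge test relies on TDigest supporting +, which makes map-reduce style aggregation natural: build one digest per data partition, then fold the partial digests together. A sketch of that pattern (the partitioning itself is illustrative):

# Sketch: per-partition digests folded into one, mirroring the test's
# reduce(lambda x, y: x + y, subs).
from functools import reduce
import operator

from dpark.utils.tdigest import TDigest  # assumed import path, as above

def merged_digest(partitions):
    digests = []
    for part in partitions:
        d = TDigest()
        for x in part:
            d.add(x)
        digests.append(d)
    return reduce(operator.add, digests)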
Example #13
def test_empty():
    digest = TDigest()
    q = random.random()
    assert isnan(digest.quantile(q))
Example #14
File: job.py Project: uxlsl/dpark
class SimpleJob(Job):
    def __init__(self,
                 sched,
                 tasks,
                 cpus=1,
                 mem=100,
                 gpus=0,
                 task_host_manager=None):
        Job.__init__(self)
        self.sched = sched
        self.tasks = tasks

        for t in self.tasks:
            t.status = None
            t.tried = 0
            t.used = 0
            t.cpus = cpus
            t.mem = mem
            t.gpus = gpus

        self.launched = [False] * len(tasks)
        self.finished = [False] * len(tasks)
        self.numFailures = [0] * len(tasks)
        self.running_hosts = [[] for i in range(len(tasks))]
        self.tidToIndex = {}
        self.numTasks = len(tasks)
        self.tasksLaunched = 0
        self.tasksFinished = 0
        self.total_used = 0

        self.lastPreferredLaunchTime = time.time()

        self.pendingTasksForHost = {}
        self.pendingTasksWithNoPrefs = []
        self.allPendingTasks = []

        self.reasons = set()
        self.failed = False
        self.causeOfFailure = ''
        self.last_check = 0

        for i in range(len(tasks)):
            self.addPendingTask(i)
        self.host_cache = {}
        self.task_host_manager = task_host_manager if task_host_manager is not None\
            else TaskHostManager()
        self.id_retry_host = {}
        self.task_local_set = set()
        self.mem_digest = TDigest()
        self.mem90 = 0

    @property
    def taskEverageTime(self):
        if not self.tasksFinished:
            return 10
        return max(self.total_used / self.tasksFinished, 5)

    def addPendingTask(self, i):
        loc = self.tasks[i].preferredLocations()
        if not loc:
            self.pendingTasksWithNoPrefs.append(i)
        else:
            for host in loc:
                self.pendingTasksForHost.setdefault(host, []).append(i)
        self.allPendingTasks.append(i)

    def getPendingTasksForHost(self, host):
        try:
            return self.host_cache[host]
        except KeyError:
            v = self._getPendingTasksForHost(host)
            self.host_cache[host] = v
            return v

    def _getPendingTasksForHost(self, host):
        try:
            h, hs, ips = socket.gethostbyname_ex(host)
        except Exception:
            h, hs, ips = host, [], []
        tasks = sum(
            (self.pendingTasksForHost.get(h, []) for h in [h] + hs + ips), [])
        st = {}
        for t in tasks:
            st[t] = st.get(t, 0) + 1
        ts = sorted(list(st.items()), key=itemgetter(1), reverse=True)
        return [t for t, _ in ts]

    def findTaskFromList(self, l, host, cpus, mem, gpus):
        for i in l:
            if self.launched[i] or self.finished[i]:
                continue
            if host in self.running_hosts[i]:
                continue
            t = self.tasks[i]
            if self.task_host_manager.task_failed_on_host(t.id, host):
                continue
            if t.cpus <= cpus + 1e-4 and t.mem <= mem and t.gpus <= gpus:
                return i

    def taskOffer(self, host_offers, cpus, mems, gpus):
        prefer_list = []
        for host in host_offers:
            i, o = host_offers[host]
            local_task = self.findTaskFromList(
                self.getPendingTasksForHost(host), host, cpus[i], mems[i],
                gpus[i])
            if local_task is not None:
                result_tuple = self._try_update_task_offer(
                    local_task, i, o, cpus, mems, gpus)
                if result_tuple is None:
                    continue
                prefer_list.append(result_tuple)
        if prefer_list:
            return prefer_list
        for idx in range(len(self.tasks)):
            if not self.launched[idx] and not self.finished[idx]:
                i, o = self.task_host_manager.offer_choice(
                    self.tasks[idx].id, host_offers, self.running_hosts[idx])
                if i is None:
                    continue
                result_tuple = self._try_update_task_offer(
                    idx, i, o, cpus, mems, gpus)
                if result_tuple:
                    return [result_tuple]
        return []

    def _try_update_task_offer(self, task_idx, i, o, cpus, mem, gpus):
        t = self.tasks[task_idx]
        if t.cpus <= cpus[i] + 1e-4 and t.mem <= mem[i] and t.gpus <= gpus[i]:
            t.status = 'TASK_STAGING'
            t.start = time.time()
            t.host = o.hostname
            t.tried += 1
            self.id_retry_host[(t.id, t.tried)] = o.hostname
            logger.debug('Starting task %d:%d as TID %s on slave %s', self.id,
                         task_idx, t, o.hostname)
            self.tidToIndex[t.id] = task_idx
            self.launched[task_idx] = True
            self.tasksLaunched += 1
            self.running_hosts[task_idx].append(o.hostname)
            host_set = set(self.tasks[task_idx].preferredLocations())
            if o.hostname in host_set:
                self.task_local_set.add(t.id)
            return i, o, t
        return None

    def statusUpdate(self,
                     tid,
                     tried,
                     status,
                     reason=None,
                     result=None,
                     update=None,
                     stats=None):
        logger.debug('job status update %s %s %s', tid, status, reason)
        if tid not in self.tidToIndex:
            logger.error('invalid tid: %s', tid)
            return
        i = self.tidToIndex[tid]
        if self.finished[i]:
            if status == 'TASK_FINISHED':
                logger.debug('Task %d is already finished, ignore it', tid)
            return

        task = self.tasks[i]
        task.status = status
        # check_task_timeout may have marked this task as not launched; re-mark it
        if not self.launched[i]:
            self.launched[i] = True
            self.tasksLaunched += 1

        if status == 'TASK_FINISHED':
            self.taskFinished(tid, tried, result, update, stats)
        elif status in ('TASK_LOST', 'TASK_FAILED', 'TASK_KILLED'):
            self.taskLost(tid, tried, status, reason)

        task.start = time.time()
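        # track each task's peak RSS in MB; taskLost reads the 90th
        # percentile of this digest to raise memory limits after an OOM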
        if stats:
            self.mem_digest.add(stats.bytes_max_rss / (1024.**2))

    def progress(self, ending=''):
        n = self.numTasks
        ratio = self.tasksFinished * 1. / n
        bar = make_progress_bar(ratio)
        if self.tasksFinished:
            elapsed = time.time() - self.start
            avg = self.total_used / self.tasksFinished
            eta = (n - self.tasksFinished) * elapsed / self.tasksFinished
            m, s = divmod(int(eta), 60)
            h, m = divmod(m, 60)

            fmt = 'Job:%4s {{GREEN}}%s{{RESET}}%5.1f%% (% {width}s/% {width}s)' \
                ' ETA:% 2d:%02d:%02d AVG:%.1fs\x1b[K%s'.format(
                    width=int(math.log10(self.numTasks)) + 1,
                )

            msg = fmt % (self.id, bar, ratio * 100, self.tasksFinished, n, h,
                         m, s, avg, ending)
            msg = msg.ljust(80)
            logger.info(msg)
        else:
            fmt = 'Job:%4s {{GREEN}}%s{{RESET}}%5.1f%% (% {width}s/% {width}s)' \
                ' ETA:--:--:-- AVG:N/A\x1b[K%s'.format(
                    width=int(math.log10(self.numTasks)) + 1,
                )

            msg = fmt % (self.id, bar, ratio * 100, self.tasksFinished, n,
                         ending)
            msg = msg.ljust(80)
            logger.info(msg)

    def taskFinished(self, tid, tried, result, update, stats):
        i = self.tidToIndex[tid]
        self.finished[i] = True
        self.tasksFinished += 1
        task = self.tasks[i]
        hostname = self.id_retry_host[(task.id, tried)] \
            if (task.id, tried) in self.id_retry_host else task.host
        task.used += time.time() - task.start
        self.total_used += task.used
        if getattr(self.sched, 'color', False):
            title = 'Job %d: task %s finished in %.1fs (%d/%d)     ' % (
                self.id, tid, task.used, self.tasksFinished, self.numTasks)
            msg = '\x1b]2;%s\x07\x1b[1A' % title
            logger.info(msg)

        from dpark.schedule import Success
        self.sched.taskEnded(task, Success(), result, update, stats)
        self.running_hosts[i] = []
        self.task_host_manager.task_succeed(task.id, hostname, Success())

        for t in range(task.tried):
            if t + 1 != tried:
                self.sched.killTask(self.id, task.id, t + 1)

        if self.tasksFinished == self.numTasks:
            ts = [t.used for t in self.tasks]
            tried = [t.tried for t in self.tasks]
            elapsed = time.time() - self.start
            logger.info(
                'Job %d finished in %.1fs: min=%.1fs, '
                'avg=%.1fs, max=%.1fs, maxtry=%d, speedup=%.1f, local=%.1f%%',
                self.id, elapsed, min(ts),
                sum(ts) / len(ts), max(ts), max(tried),
                self.total_used / elapsed,
                len(self.task_local_set) * 100. / len(self.tasks))
            self.sched.jobFinished(self)

    def taskLost(self, tid, tried, status, reason):
        index = self.tidToIndex[tid]

        from dpark.schedule import FetchFailed
        if isinstance(reason, FetchFailed) and self.numFailures[index] >= 1:
            logger.warning('Cancelling task %s after fetch failed twice from %s',
                           tid, reason.serverUri)
            self.sched.taskEnded(self.tasks[index], reason, None, None)
            # cancel tasks
            if not self.finished[index]:
                self.finished[index] = True
                self.tasksFinished += 1
            for i in range(len(self.finished)):
                if not self.launched[i]:
                    self.launched[i] = True
                    self.tasksLaunched += 1
                    self.finished[i] = True
                    self.tasksFinished += 1
            if self.tasksFinished == self.numTasks:
                self.sched.jobFinished(self)  # cancel job
            return

        task = self.tasks[index]
        hostname = self.id_retry_host[(task.id, tried)] \
            if (task.id, tried) in self.id_retry_host else task.host

        if status == 'TASK_KILLED' or str(reason).startswith(
                'Memory limit exceeded:'):
            task.mem = min(task.mem * 2, MAX_TASK_MEMORY)
            logger.info("task %s OOM, enlarging memory limit to %d (was %d)",
                        task.id, task.mem, task.rdd.mem)

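            # mem_digest holds per-task peak RSS samples in MB (fed by
            # statusUpdate); its 90th percentile estimates how much memory
            # this job's tasks actually need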
            mem90 = self.mem_digest.quantile(0.9)
            if not math.isnan(mem90):
                mem90 = int(mem90)
                if mem90 > self.mem90:
                    logger.info(
                        "enlarging memory limit of remaining tasks from >%d to >%d (mem90)",
                        self.mem90, mem90)
                    self.mem90 = mem90
                    for i, t in enumerate(self.tasks):
                        if not self.launched[i]:
                            t.mem = max(mem90, t.mem)

        elif status == 'TASK_FAILED':
            _logger = logger.error if self.numFailures[index] == MAX_TASK_FAILURES\
                else logger.warning
            if reason not in self.reasons:
                _logger('task %s failed @ %s: %s : %s', task.id, hostname,
                        task, reason)
                self.reasons.add(reason)
            else:
                _logger('task %s failed @ %s: %s', task.id, hostname, task)

        elif status == 'TASK_LOST':
            logger.warning('Lost Task %d (task %d:%d:%s) %s at %s', index,
                           self.id, tid, tried, reason, task.host)

        self.numFailures[index] += 1
        if self.numFailures[index] > MAX_TASK_FAILURES:
            logger.error('Task %d failed more than %d times; aborting job',
                         self.tasks[index].id, MAX_TASK_FAILURES)
            self.abort('Task %d failed more than %d times' %
                       (self.tasks[index].id, MAX_TASK_FAILURES))
        self.task_host_manager.task_failed(task.id, hostname, reason)
        self.launched[index] = False
        if self.tasksLaunched == self.numTasks:
            self.sched.requestMoreResources()
        self.running_hosts[index] = []
        self.tasksLaunched -= 1

    def check_task_timeout(self):
        now = time.time()
        if self.last_check + 5 > now:
            return False
        self.last_check = now

        n = self.launched.count(True)
        if n != self.tasksLaunched:
            logger.warning('bug: tasksLaunched(%d) != %d', self.tasksLaunched,
                           n)
            self.tasksLaunched = n

        for i in range(self.numTasks):
            task = self.tasks[i]
            if (self.launched[i] and task.status == 'TASK_STAGING'
                    and task.start + WAIT_FOR_RUNNING < now):
                logger.info('task %d timeout %.1f (at %s), re-assign it',
                            task.id, now - task.start, task.host)
                self.launched[i] = False
                self.tasksLaunched -= 1

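        # speculative re-execution: once two thirds of the tasks have
        # finished, re-submit stragglers that have run much longer than
        # the scaled average task time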
        if self.tasksFinished > self.numTasks * 2.0 / 3:
            scale = 1.0 * self.numTasks / self.tasksFinished
            avg = max(self.taskEverageTime, 10)
            tasks = sorted((task.start, i, task)
                           for i, task in enumerate(self.tasks)
                           if self.launched[i] and not self.finished[i])
            for _t, idx, task in tasks:
                used = now - task.start
                if used > avg * (2**task.tried) * scale:
                    # re-submit timeout task
                    if task.tried <= MAX_TASK_FAILURES:
                        logger.info(
                            're-submit task %s for timeout %.1f, '
                            'try %d', task.id, used, task.tried)
                        task.used += used
                        task.start = now
                        self.launched[idx] = False
                        self.tasksLaunched -= 1
                    else:
                        logger.error('task %s timeout, aborting job %s', task,
                                     self.id)
                        self.abort('task %s timeout' % task)
                else:
                    break
        return self.tasksLaunched < n

    def abort(self, message):
        logger.error('abort the job: %s', message)
        tasks = ' '.join(
            str(i) for i in range(len(self.finished)) if not self.finished[i])
        logger.error('not finished tasks: %s', tasks)
        self.failed = True
        self.causeOfFailure = message
        self.sched.jobFinished(self)
        self.sched.abort()
Example #15
def test_uniform():
    digest = TDigest()
    _test_distribution(digest, [random.uniform(0, 1) for _ in range(10000)],
                       [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999], 100)
Example #16
def test_empty_digest():
    digest = TDigest()
    assert len(digest.centroids) == 0
Example #17
def test_single_value():
    digest = TDigest()
    value = random.random() * 1000
    digest.add(value)
    for q in [0, random.random(), 1]:
        assert abs(value - digest.quantile(q)) < 1e-3
Example #18
def test_small_count_quantile():
    digest = TDigest(200)
    for d in [15.0, 20.0, 32.0, 60.0]:
        digest.add(d)

    assert abs(digest.quantile(0.4) - 21.2) < 1e-10
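
The expected 21.2 follows from midpoint interpolation: the four unit-weight centroids 15, 20, 32 and 60 sit at quantiles 0.125, 0.375, 0.625 and 0.875, so q = 0.4 lands 10% of the way from 20 to 32. A quick check of the arithmetic:

# Worked check of the interpolation behind the 21.2 expectation.
means = [15.0, 20.0, 32.0, 60.0]
q_lo, q_hi = 0.375, 0.625               # quantiles of the 2nd and 3rd centroids
frac = (0.4 - q_lo) / (q_hi - q_lo)     # = 0.1
print(means[1] + frac * (means[2] - means[1]))  # 21.2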
Example #19
def test_gamma():
    digest = TDigest()
    _test_distribution(digest,
                       [random.gammavariate(0.1, 0.1) for _ in range(10000)],
                       [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999], 100)
Example #20
def test_sequential_points():
    digest = TDigest()
    data = [i * pi * 1e-5 for i in range(10000)]

    _test_distribution(digest, data, [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999],
                       100)
Example #21
def test_singleton_in_a_crowd():
    compression = 100
    digest = TDigest(compression=compression)
    for i in range(10000):
        digest.add(10)

    digest.add(20)
    digest.compress()

    assert digest.quantile(0) == 10.0
    assert digest.quantile(0.5) == 10.0
    assert digest.quantile(0.8) == 10.0
    assert digest.quantile(0.9) == 10.0
    assert digest.quantile(0.99) == 10.0
    assert digest.quantile(1) == 20.0