예제 #1
0
파일: ring.py 프로젝트: NewpTone/swift
    def get_more_nodes(self, part):
        """
        Generator to get extra nodes for a partition for hinted handoff.

        :param part: partition to get handoff nodes for
        :returns: generator of node dicts

        See :func:`get_nodes` for a description of the node dicts.
        """
        if time() > self._rtime:
            self._reload()
        used_tiers = set()
        for part2dev_id in self._replica2part2dev_id:
            for tier in tiers_for_dev(self.devs[part2dev_id[part]]):
                used_tiers.add(tier)

        for level in self.tiers_by_length:
            tiers = list(level)
            while tiers:
                tier = tiers.pop(part % len(tiers))
                if tier in used_tiers:
                    continue
                for i in xrange(len(self.tier2devs[tier])):
                    dev = self.tier2devs[tier][(part + i) %
                                               len(self.tier2devs[tier])]
                    if not dev.get('weight'):
                        continue
                    yield dev
                    used_tiers.update(tiers_for_dev(dev))
                    break
예제 #2
0
파일: test_utils.py 프로젝트: bigdig/swift
 def test_tiers_for_dev(self):
     self.assertEqual(
         tiers_for_dev(self.test_dev),
         ((1,),
          (1, 1),
          (1, 1, '192.168.1.1'),
          (1, 1, '192.168.1.1', 0)))
예제 #3
0
    def _rebuild_tier_data(self):
        self.tier2devs = defaultdict(list)
        for dev in self._devs:
            if not dev:
                continue
            for tier in tiers_for_dev(dev):
                self.tier2devs[tier].append(dev)

        tiers_by_length = defaultdict(list)
        for tier in self.tier2devs:
            tiers_by_length[len(tier)].append(tier)
        self.tiers_by_length = sorted(tiers_by_length.values(), key=lambda x: len(x[0]))
        for tiers in self.tiers_by_length:
            tiers.sort()
예제 #4
0
 def test_normalized_device_tier_names(self):
     rb = ring.RingBuilder(8, 3, 0)
     rb.add_dev({
         'region': 1,
         'zone': 1,
         'ip': '127.0.0.1',
         'port': 6011,
         'device': 'd1',
         'weight': 0.0,
     })
     dev = rb.devs[0]
     expected = 'r1z1-127.0.0.1/d1'
     self.assertEqual(expected, get_tier_name(tiers_for_dev(dev)[-1], rb))
     self.assertEqual(expected, pretty_dev(dev))
예제 #5
0
파일: builder.py 프로젝트: CiscoAS/swift
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different regions are considered the
        farthest-apart things, followed by zones, then different ip/port pairs
        within a zone; the least-far-apart things are different devices with
        the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)

        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            for tier in tiers_for_dev(dev):
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2sort_key[tier].append(dev['sort_key'])
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(
                    lambda t: tier2sort_key[t][-1], child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            unique_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in tiers_for_dev(dev):
                        other_replicas[tier] += 1
                        unique_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas, pick the
                    # tier with the hungriest drive and then continue
                    # searching in that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator will balance things as
                    # evenly as possible at each level of the device
                    # layout. If your layout is extremely unbalanced,
                    # this may produce poor results.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.
                    candidate_tiers = tier2children[tier]
                    candidates_with_replicas = \
                        unique_tiers_by_tier_len[len(tier) + 1]
                    if len(candidate_tiers) > len(candidates_with_replicas):
                        # There exists at least one tier with 0 other replicas,
                        # so work backward among the candidates, accepting the
                        # first which isn't in other_replicas.
                        #
                        # This optimization is to avoid calling the min()
                        # below, which is expensive if you've got thousands of
                        # drives.
                        for t in reversed(candidate_tiers):
                            if other_replicas[t] == 0:
                                tier = t
                                break
                    else:
                        min_count = min(other_replicas[t]
                                        for t in candidate_tiers)
                        tier = (t for t in reversed(candidate_tiers)
                                if other_replicas[t] == min_count).next()
                    depth += 1
                dev = tier2devs[tier][-1]
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2sort_key[tier].insert(new_index, new_sort_key)

                    # Now jiggle tier2children values to keep them sorted
                    new_last_sort_key = tier2sort_key[tier][-1]
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(
                        new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
예제 #6
0
파일: builder.py 프로젝트: CiscoAS/swift
    def _gather_reassign_parts(self):
        """
        Returns a list of (partition, replicas) pairs to be reassigned by
        gathering from removed devices, insufficiently-far-apart replicas, and
        overweight drives.
        """
        # inline memoization of tiers_for_dev() results (profiling reveals it
        # as a hot-spot).
        tfd = {}

        # First we gather partitions from removed devices. Since removed
        # devices usually indicate device failures, we have no choice but to
        # reassign these partitions. However, we mark them as moved so later
        # choices will skip other replicas of the same partition if possible.
        removed_dev_parts = defaultdict(list)
        if self._remove_devs:
            dev_ids = [d['id'] for d in self._remove_devs if d['parts']]
            if dev_ids:
                for part, replica in self._each_part_replica():
                    dev_id = self._replica2part2dev[replica][part]
                    if dev_id in dev_ids:
                        self._last_part_moves[part] = 0
                        removed_dev_parts[part].append(replica)

        # Now we gather partitions that are "at risk" because they aren't
        # currently sufficient spread out across the cluster.
        spread_out_parts = defaultdict(list)
        max_allowed_replicas = self._build_max_replicas_by_tier()
        for part in xrange(self.parts):
            # Only move one replica at a time if possible.
            if part in removed_dev_parts:
                continue

            # First, add up the count of replicas at each tier for each
            # partition.
            # replicas_at_tier was a "lambda: 0" defaultdict, but profiling
            # revealed the lambda invocation as a significant cost.
            replicas_at_tier = {}
            for dev in self._devs_for_part(part):
                if dev['id'] not in tfd:
                    tfd[dev['id']] = tiers_for_dev(dev)
                for tier in tfd[dev['id']]:
                    if tier not in replicas_at_tier:
                        replicas_at_tier[tier] = 1
                    else:
                        replicas_at_tier[tier] += 1

            # Now, look for partitions not yet spread out enough and not
            # recently moved.
            for replica in self._replicas_for_part(part):
                dev = self.devs[self._replica2part2dev[replica][part]]
                removed_replica = False
                if dev['id'] not in tfd:
                    tfd[dev['id']] = tiers_for_dev(dev)
                for tier in tfd[dev['id']]:
                    rep_at_tier = 0
                    if tier in replicas_at_tier:
                        rep_at_tier = replicas_at_tier[tier]
                    if (rep_at_tier > max_allowed_replicas[tier] and
                            self._last_part_moves[part] >=
                            self.min_part_hours):
                        self._last_part_moves[part] = 0
                        spread_out_parts[part].append(replica)
                        dev['parts_wanted'] += 1
                        dev['parts'] -= 1
                        removed_replica = True
                        break
                if removed_replica:
                    if dev['id'] not in tfd:
                        tfd[dev['id']] = tiers_for_dev(dev)
                    for tier in tfd[dev['id']]:
                        replicas_at_tier[tier] -= 1

        # Last, we gather partitions from devices that are "overweight" because
        # they have more partitions than their parts_wanted.
        reassign_parts = defaultdict(list)

        # We randomly pick a new starting point in the "circular" ring of
        # partitions to try to get a better rebalance when called multiple
        # times.

        start = self._last_part_gather_start / 4
        start += random.randint(0, self.parts / 2)  # GRAH PEP8!!!

        self._last_part_gather_start = start
        for replica, part2dev in enumerate(self._replica2part2dev):
            # If we've got a partial replica, start may be out of
            # range. Scale it down so that we get a similar movement
            # pattern (but scaled down) on sequential runs.
            this_start = int(float(start) * len(part2dev) / self.parts)

            for part in itertools.chain(xrange(this_start, len(part2dev)),
                                        xrange(0, this_start)):
                if self._last_part_moves[part] < self.min_part_hours:
                    continue
                if part in removed_dev_parts or part in spread_out_parts:
                    continue
                dev = self.devs[part2dev[part]]
                if dev['parts_wanted'] < 0:
                    self._last_part_moves[part] = 0
                    dev['parts_wanted'] += 1
                    dev['parts'] -= 1
                    reassign_parts[part].append(replica)

        reassign_parts.update(spread_out_parts)
        reassign_parts.update(removed_dev_parts)

        reassign_parts_list = list(reassign_parts.iteritems())
        # We shuffle the partitions to reassign so we get a more even
        # distribution later. There has been discussion of trying to distribute
        # partitions more "regularly" because that would actually reduce risk
        # but 1) it is really difficult to do this with uneven clusters and 2)
        # it would concentrate load during failure recovery scenarios
        # (increasing risk). The "right" answer has yet to be debated to
        # conclusion, but working code wins for now.
        random.shuffle(reassign_parts_list)
        return reassign_parts_list
예제 #7
0
파일: builder.py 프로젝트: sriramnrn/swift
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different regions are considered the
        farthest-apart things, followed by zones, then different ip/port pairs
        within a zone; the least-far-apart things are different devices with
        the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev["sort_key"] = self._sort_key_for(dev)

        available_devs = sorted((d for d in self._iter_devs() if d["weight"]), key=lambda x: x["sort_key"])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(tuple)
        tier2dev_sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            dev["tiers"] = tiers_for_dev(dev)
            for tier in dev["tiers"]:
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2dev_sort_key[tier].append(dev["sort_key"])
                tier2sort_key[tier] = dev["sort_key"]
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=tier2sort_key.__getitem__)
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(tier2sort_key.__getitem__, child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            unique_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in dev["tiers"]:
                        other_replicas[tier] += 1
                        unique_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas, pick the
                    # tier with the hungriest drive and then continue
                    # searching in that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator will balance things as
                    # evenly as possible at each level of the device
                    # layout. If your layout is extremely unbalanced,
                    # this may produce poor results.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.

                    # We sort the tiers here so that, when we look for a tier
                    # with the lowest number of replicas, the first one we
                    # find is the one with the hungriest drive (i.e. drive
                    # with the largest sort_key value). This lets us
                    # short-circuit the search while still ensuring we get the
                    # right tier.
                    candidates_with_replicas = unique_tiers_by_tier_len[len(tier) + 1]
                    # Find a tier with the minimal replica count and the
                    # hungriest drive among all the tiers with the minimal
                    # replica count.
                    if len(tier2children[tier]) > len(candidates_with_replicas):
                        # There exists at least one tier with 0 other replicas
                        tier = max(
                            (t for t in tier2children[tier] if other_replicas[t] == 0), key=tier2sort_key.__getitem__
                        )
                    else:
                        tier = max(tier2children[tier], key=lambda t: (-other_replicas[t], tier2sort_key[t]))
                    depth += 1
                dev = tier2devs[tier][-1]
                dev["parts_wanted"] -= 1
                dev["parts"] += 1
                old_sort_key = dev["sort_key"]
                new_sort_key = dev["sort_key"] = self._sort_key_for(dev)
                for tier in dev["tiers"]:
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier], old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2dev_sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2dev_sort_key[tier], new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                    new_last_sort_key = tier2dev_sort_key[tier][-1]
                    tier2sort_key[tier] = new_last_sort_key

                    # Now jiggle tier2children values to keep them sorted
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(tier2children_sort_key[parent_tier], old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(tier2children_sort_key[parent_tier], new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev["id"]

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev["sort_key"]
            dev.pop("tiers", None)  # May be absent for devices w/o weight
예제 #8
0
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different zones are considered the
        farthest-apart things, followed by different ip/port pairs within a
        zone; the least-far-apart things are different devices with the same
        ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)

        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            for tier in tiers_for_dev(dev):
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2sort_key[tier].append(dev['sort_key'])
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(
                    lambda t: tier2sort_key[t][-1], child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (zones, ip_ports, and devices) the
            # replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            unique_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in tiers_for_dev(dev):
                        other_replicas[tier] += 1
                        unique_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas, pick the
                    # tier with the hungriest drive and then continue
                    # searching in that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator will balance things as
                    # evenly as possible at each level of the device
                    # layout. If your layout is extremely unbalanced,
                    # this may produce poor results.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.
                    candidate_tiers = tier2children[tier]
                    candidates_with_replicas = \
                        unique_tiers_by_tier_len[len(tier) + 1]
                    if len(candidate_tiers) > len(candidates_with_replicas):
                        # There exists at least one tier with 0 other replicas,
                        # so work backward among the candidates, accepting the
                        # first which isn't in other_replicas.
                        #
                        # This optimization is to avoid calling the min()
                        # below, which is expensive if you've got thousands of
                        # drives.
                        for t in reversed(candidate_tiers):
                            if other_replicas[t] == 0:
                                tier = t
                                break
                    else:
                        min_count = min(other_replicas[t]
                                        for t in candidate_tiers)
                        tier = (t for t in reversed(candidate_tiers)
                                if other_replicas[t] == min_count).next()
                    depth += 1
                dev = tier2devs[tier][-1]
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2sort_key[tier].insert(new_index, new_sort_key)

                    # Now jiggle tier2children values to keep them sorted
                    new_last_sort_key = tier2sort_key[tier][-1]
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier], old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier], new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(
                        new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
예제 #9
0
    def _gather_reassign_parts(self):
        """
        Returns a list of (partition, replicas) pairs to be reassigned by
        gathering from removed devices, insufficiently-far-apart replicas, and
        overweight drives.
        """
        # inline memoization of tiers_for_dev() results (profiling reveals it
        # as a hot-spot).
        tfd = {}

        # First we gather partitions from removed devices. Since removed
        # devices usually indicate device failures, we have no choice but to
        # reassign these partitions. However, we mark them as moved so later
        # choices will skip other replicas of the same partition if possible.
        removed_dev_parts = defaultdict(list)
        if self._remove_devs:
            dev_ids = [d['id'] for d in self._remove_devs if d['parts']]
            if dev_ids:
                for part, replica in self._each_part_replica():
                    dev_id = self._replica2part2dev[replica][part]
                    if dev_id in dev_ids:
                        self._last_part_moves[part] = 0
                        removed_dev_parts[part].append(replica)

        # Now we gather partitions that are "at risk" because they aren't
        # currently sufficient spread out across the cluster.
        spread_out_parts = defaultdict(list)
        max_allowed_replicas = self._build_max_replicas_by_tier()
        for part in xrange(self.parts):
            # Only move one replica at a time if possible.
            if part in removed_dev_parts:
                continue

            # First, add up the count of replicas at each tier for each
            # partition.
            # replicas_at_tier was a "lambda: 0" defaultdict, but profiling
            # revealed the lambda invocation as a significant cost.
            replicas_at_tier = {}
            for dev in self._devs_for_part(part):
                if dev['id'] not in tfd:
                    tfd[dev['id']] = tiers_for_dev(dev)
                for tier in tfd[dev['id']]:
                    if tier not in replicas_at_tier:
                        replicas_at_tier[tier] = 1
                    else:
                        replicas_at_tier[tier] += 1

            # Now, look for partitions not yet spread out enough and not
            # recently moved.
            for replica in self._replicas_for_part(part):
                dev = self.devs[self._replica2part2dev[replica][part]]
                removed_replica = False
                if dev['id'] not in tfd:
                    tfd[dev['id']] = tiers_for_dev(dev)
                for tier in tfd[dev['id']]:
                    rep_at_tier = 0
                    if tier in replicas_at_tier:
                        rep_at_tier = replicas_at_tier[tier]
                    if (rep_at_tier > max_allowed_replicas[tier]
                            and self._last_part_moves[part] >=
                            self.min_part_hours):
                        self._last_part_moves[part] = 0
                        spread_out_parts[part].append(replica)
                        dev['parts_wanted'] += 1
                        dev['parts'] -= 1
                        removed_replica = True
                        break
                if removed_replica:
                    if dev['id'] not in tfd:
                        tfd[dev['id']] = tiers_for_dev(dev)
                    for tier in tfd[dev['id']]:
                        replicas_at_tier[tier] -= 1

        # Last, we gather partitions from devices that are "overweight" because
        # they have more partitions than their parts_wanted.
        reassign_parts = defaultdict(list)

        # We randomly pick a new starting point in the "circular" ring of
        # partitions to try to get a better rebalance when called multiple
        # times.

        start = self._last_part_gather_start / 4
        start += random.randint(0, self.parts / 2)  # GRAH PEP8!!!

        self._last_part_gather_start = start
        for replica, part2dev in enumerate(self._replica2part2dev):
            # If we've got a partial replica, start may be out of
            # range. Scale it down so that we get a similar movement
            # pattern (but scaled down) on sequential runs.
            this_start = int(float(start) * len(part2dev) / self.parts)

            for part in itertools.chain(xrange(this_start, len(part2dev)),
                                        xrange(0, this_start)):
                if self._last_part_moves[part] < self.min_part_hours:
                    continue
                if part in removed_dev_parts or part in spread_out_parts:
                    continue
                dev = self.devs[part2dev[part]]
                if dev['parts_wanted'] < 0:
                    self._last_part_moves[part] = 0
                    dev['parts_wanted'] += 1
                    dev['parts'] -= 1
                    reassign_parts[part].append(replica)

        reassign_parts.update(spread_out_parts)
        reassign_parts.update(removed_dev_parts)

        reassign_parts_list = list(reassign_parts.iteritems())
        # We shuffle the partitions to reassign so we get a more even
        # distribution later. There has been discussion of trying to distribute
        # partitions more "regularly" because that would actually reduce risk
        # but 1) it is really difficult to do this with uneven clusters and 2)
        # it would concentrate load during failure recovery scenarios
        # (increasing risk). The "right" answer has yet to be debated to
        # conclusion, but working code wins for now.
        random.shuffle(reassign_parts_list)
        return reassign_parts_list
예제 #10
0
파일: builder.py 프로젝트: AsherBond/swift
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different regions are considered the
        farthest-apart things, followed by zones, then different ip/port pairs
        within a zone; the least-far-apart things are different devices with
        the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        parts_available_in_tier = defaultdict(int)
        for dev in self._iter_devs():
            dev["sort_key"] = self._sort_key_for(dev)
            tiers = tiers_for_dev(dev)
            dev["tiers"] = tiers
            for tier in tiers:
                # Note: this represents how many partitions may be assigned to
                # a given tier (region/zone/server/disk). It does not take
                # into account how many partitions a given tier wants to shed.
                #
                # If we did not do this, we could have a zone where, at some
                # point during assignment, number-of-parts-to-gain equals
                # number-of-parts-to-shed. At that point, no further placement
                # into that zone would occur since its parts_available_in_tier
                # would be 0. This would happen any time a zone had any device
                # with partitions to shed, which is any time a device is being
                # removed, which is a pretty frequent operation.
                parts_available_in_tier[tier] += max(dev["parts_wanted"], 0)

        available_devs = sorted((d for d in self._iter_devs() if d["weight"]), key=lambda x: x["sort_key"])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(tuple)
        tier2dev_sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            for tier in dev["tiers"]:
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2dev_sort_key[tier].append(dev["sort_key"])
                tier2sort_key[tier] = dev["sort_key"]
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=tier2sort_key.__getitem__)
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(tier2sort_key.__getitem__, child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            occupied_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in dev["tiers"]:
                        other_replicas[tier] += 1
                        occupied_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                # Find a new home for this replica
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas and that have
                    # room to accept more partitions, pick the tier with
                    # the hungriest drive and then continue searching in
                    # that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator prioritizes even device
                    # filling over dispersion, so if your layout is
                    # extremely unbalanced, you may not get the replica
                    # dispersion that you expect, and your durability
                    # may be lessened.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.

                    # We sort the tiers here so that, when we look for a tier
                    # with the lowest number of replicas, the first one we
                    # find is the one with the hungriest drive (i.e. drive
                    # with the largest sort_key value). This lets us
                    # short-circuit the search while still ensuring we get the
                    # right tier.
                    candidates_with_replicas = occupied_tiers_by_tier_len[len(tier) + 1]

                    # Among the tiers with room for more partitions,
                    # find one with the smallest possible number of
                    # replicas already in it, breaking ties by which one
                    # has the hungriest drive.
                    candidates_with_room = [t for t in tier2children[tier] if parts_available_in_tier[t] > 0]

                    if len(candidates_with_room) > len(candidates_with_replicas):
                        # There exists at least one tier with room for
                        # another partition and 0 other replicas already
                        # in it, so we can use a faster search. The else
                        # branch's search would work here, but it's
                        # significantly slower.
                        tier = max(
                            (t for t in candidates_with_room if other_replicas[t] == 0), key=tier2sort_key.__getitem__
                        )
                    else:
                        tier = max(candidates_with_room, key=lambda t: (-other_replicas[t], tier2sort_key[t]))
                    depth += 1
                dev = tier2devs[tier][-1]
                dev["parts_wanted"] -= 1
                dev["parts"] += 1
                old_sort_key = dev["sort_key"]
                new_sort_key = dev["sort_key"] = self._sort_key_for(dev)
                for tier in dev["tiers"]:
                    parts_available_in_tier[tier] -= 1
                    other_replicas[tier] += 1
                    occupied_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier], old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2dev_sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2dev_sort_key[tier], new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                    new_last_sort_key = tier2dev_sort_key[tier][-1]
                    tier2sort_key[tier] = new_last_sort_key

                    # Now jiggle tier2children values to keep them sorted
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(tier2children_sort_key[parent_tier], old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(tier2children_sort_key[parent_tier], new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev["id"]

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev["sort_key"]
            del dev["tiers"]
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different zones are considered the
        farthest-apart things, followed by different ip/port pairs within a
        zone; the least-far-apart things are different devices with the same
        ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)
        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2children = build_tier_tree(available_devs)

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(list)
        tiers_by_depth = defaultdict(set)
        for dev in available_devs:
            for tier in tiers_for_dev(dev):
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2sort_key[tier].append(dev['sort_key'])
                tiers_by_depth[len(tier)].add(tier)

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (zones, ip_ports, and devices) the
            # replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(lambda: 0)
            for replica in xrange(self.replicas):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in tiers_for_dev(dev):
                        other_replicas[tier] += 1

            def find_home_for_replica(tier=(), depth=1):
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                candidate_tiers = tier2children[tier]
                min_count = min(other_replicas[t] for t in candidate_tiers)
                candidate_tiers = [t for t in candidate_tiers
                                   if other_replicas[t] == min_count]
                candidate_tiers.sort(
                    key=lambda t: tier2sort_key[t][-1])

                if depth == max(tiers_by_depth.keys()):
                    return tier2devs[candidate_tiers[-1]][-1]

                return find_home_for_replica(tier=candidate_tiers[-1],
                                             depth=depth + 1)

            for replica in replace_replicas:
                dev = find_home_for_replica()
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1

                    index = bisect.bisect_left(tier2sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2sort_key[tier].insert(new_index, new_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
    def _gather_reassign_parts(self):
        """
        Returns a list of (partition, replicas) pairs to be reassigned by
        gathering from removed devices, insufficiently-far-apart replicas, and
        overweight drives.
        """
        # First we gather partitions from removed devices. Since removed
        # devices usually indicate device failures, we have no choice but to
        # reassign these partitions. However, we mark them as moved so later
        # choices will skip other replicas of the same partition if possible.
        removed_dev_parts = defaultdict(list)
        if self._remove_devs:
            dev_ids = [d['id'] for d in self._remove_devs if d['parts']]
            if dev_ids:
                for replica in xrange(self.replicas):
                    part2dev = self._replica2part2dev[replica]
                    for part in xrange(self.parts):
                        if part2dev[part] in dev_ids:
                            self._last_part_moves[part] = 0
                            removed_dev_parts[part].append(replica)

        # Now we gather partitions that are "at risk" because they aren't
        # currently sufficient spread out across the cluster.
        spread_out_parts = defaultdict(list)
        max_allowed_replicas = self._build_max_replicas_by_tier()
        for part in xrange(self.parts):
            # Only move one replica at a time if possible.
            if part in removed_dev_parts:
                continue

            # First, add up the count of replicas at each tier for each
            # partition.
            replicas_at_tier = defaultdict(lambda: 0)
            for replica in xrange(self.replicas):
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in tiers_for_dev(dev):
                    replicas_at_tier[tier] += 1

            # Now, look for partitions not yet spread out enough and not
            # recently moved.
            for replica in xrange(self.replicas):
                dev = self.devs[self._replica2part2dev[replica][part]]
                removed_replica = False
                for tier in tiers_for_dev(dev):
                    if (replicas_at_tier[tier] > max_allowed_replicas[tier] and
                           self._last_part_moves[part] >= self.min_part_hours):
                        self._last_part_moves[part] = 0
                        spread_out_parts[part].append(replica)
                        dev['parts_wanted'] += 1
                        dev['parts'] -= 1
                        removed_replica = True
                        break
                if removed_replica:
                    for tier in tiers_for_dev(dev):
                        replicas_at_tier[tier] -= 1

        # Last, we gather partitions from devices that are "overweight" because
        # they have more partitions than their parts_wanted.
        reassign_parts = defaultdict(list)

        # We randomly pick a new starting point in the "circular" ring of
        # partitions to try to get a better rebalance when called multiple
        # times.
        start = self._last_part_gather_start / 4 + randint(0, self.parts / 2)
        self._last_part_gather_start = start
        for replica in xrange(self.replicas):
            part2dev = self._replica2part2dev[replica]
            for part in itertools.chain(xrange(start, self.parts),
                                        xrange(0, start)):
                if self._last_part_moves[part] < self.min_part_hours:
                    continue
                if part in removed_dev_parts or part in spread_out_parts:
                    continue
                dev = self.devs[part2dev[part]]
                if dev['parts_wanted'] < 0:
                    self._last_part_moves[part] = 0
                    dev['parts_wanted'] += 1
                    dev['parts'] -= 1
                    reassign_parts[part].append(replica)

        reassign_parts.update(spread_out_parts)
        reassign_parts.update(removed_dev_parts)

        reassign_parts_list = list(reassign_parts.iteritems())
        # We shuffle the partitions to reassign so we get a more even
        # distribution later. There has been discussion of trying to distribute
        # partitions more "regularly" because that would actually reduce risk
        # but 1) it is really difficult to do this with uneven clusters and 2)
        # it would concentrate load during failure recovery scenarios
        # (increasing risk). The "right" answer has yet to be debated to
        # conclusion, but working code wins for now.
        shuffle(reassign_parts_list)
        return reassign_parts_list
예제 #13
0
 def test_tiers_for_dev(self):
     self.assertEqual(tiers_for_dev(self.test_dev),
                      ((1, ), (1, 1), (1, 1, '192.168.1.1'),
                       (1, 1, '192.168.1.1', 0)))
예제 #14
0
 def test_tiers_for_dev(self):
     self.assertEqual(
         tiers_for_dev(self.test_dev), ((1,), (1, 1), (1, 1, "192.168.1.1:6000"), (1, 1, "192.168.1.1:6000", 0))
     )
예제 #15
0
파일: builder.py 프로젝트: sudowork/swift
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different zones are considered the
        farthest-apart things, followed by different ip/port pairs within a
        zone; the least-far-apart things are different devices with the same
        ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)
        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2children = build_tier_tree(available_devs)

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(list)
        tiers_by_depth = defaultdict(set)
        for dev in available_devs:
            for tier in tiers_for_dev(dev):
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2sort_key[tier].append(dev['sort_key'])
                tiers_by_depth[len(tier)].add(tier)

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (zones, ip_ports, and devices) the
            # replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(lambda: 0)
            for replica in xrange(self.replicas):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in tiers_for_dev(dev):
                        other_replicas[tier] += 1

            def find_home_for_replica(tier=(), depth=1):
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                candidate_tiers = tier2children[tier]
                min_count = min(other_replicas[t] for t in candidate_tiers)
                candidate_tiers = [
                    t for t in candidate_tiers
                    if other_replicas[t] == min_count
                ]
                candidate_tiers.sort(
                    key=lambda t: tier2devs[t][-1]['parts_wanted'])

                if depth == max(tiers_by_depth.keys()):
                    return tier2devs[candidate_tiers[-1]][-1]

                return find_home_for_replica(tier=candidate_tiers[-1],
                                             depth=depth + 1)

            for replica in replace_replicas:
                dev = find_home_for_replica()
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1

                    index = bisect.bisect_left(tier2sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2sort_key[tier].insert(new_index, new_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
예제 #16
0
파일: builder.py 프로젝트: sudowork/swift
    def _gather_reassign_parts(self):
        """
        Returns a list of (partition, replicas) pairs to be reassigned by
        gathering from removed devices, insufficiently-far-apart replicas, and
        overweight drives.
        """
        # First we gather partitions from removed devices. Since removed
        # devices usually indicate device failures, we have no choice but to
        # reassign these partitions. However, we mark them as moved so later
        # choices will skip other replicas of the same partition if possible.
        removed_dev_parts = defaultdict(list)
        if self._remove_devs:
            dev_ids = [d['id'] for d in self._remove_devs if d['parts']]
            if dev_ids:
                for replica in xrange(self.replicas):
                    part2dev = self._replica2part2dev[replica]
                    for part in xrange(self.parts):
                        if part2dev[part] in dev_ids:
                            self._last_part_moves[part] = 0
                            removed_dev_parts[part].append(replica)

        # Now we gather partitions that are "at risk" because they aren't
        # currently sufficient spread out across the cluster.
        spread_out_parts = defaultdict(list)
        max_allowed_replicas = self._build_max_replicas_by_tier()
        for part in xrange(self.parts):
            # Only move one replica at a time if possible.
            if part in removed_dev_parts:
                continue

            # First, add up the count of replicas at each tier for each
            # partition.
            replicas_at_tier = defaultdict(lambda: 0)
            for replica in xrange(self.replicas):
                dev = self.devs[self._replica2part2dev[replica][part]]
                for tier in tiers_for_dev(dev):
                    replicas_at_tier[tier] += 1

            # Now, look for partitions not yet spread out enough and not
            # recently moved.
            for replica in xrange(self.replicas):
                dev = self.devs[self._replica2part2dev[replica][part]]
                removed_replica = False
                for tier in tiers_for_dev(dev):
                    if (replicas_at_tier[tier] > max_allowed_replicas[tier]
                            and self._last_part_moves[part] >=
                            self.min_part_hours):
                        self._last_part_moves[part] = 0
                        spread_out_parts[part].append(replica)
                        dev['parts_wanted'] += 1
                        dev['parts'] -= 1
                        removed_replica = True
                        break
                if removed_replica:
                    for tier in tiers_for_dev(dev):
                        replicas_at_tier[tier] -= 1

        # Last, we gather partitions from devices that are "overweight" because
        # they have more partitions than their parts_wanted.
        reassign_parts = defaultdict(list)

        # We randomly pick a new starting point in the "circular" ring of
        # partitions to try to get a better rebalance when called multiple
        # times.
        start = self._last_part_gather_start / 4 + randint(0, self.parts / 2)
        self._last_part_gather_start = start
        for replica in xrange(self.replicas):
            part2dev = self._replica2part2dev[replica]
            for part in itertools.chain(xrange(start, self.parts),
                                        xrange(0, start)):
                if self._last_part_moves[part] < self.min_part_hours:
                    continue
                if part in removed_dev_parts or part in spread_out_parts:
                    continue
                dev = self.devs[part2dev[part]]
                if dev['parts_wanted'] < 0:
                    self._last_part_moves[part] = 0
                    dev['parts_wanted'] += 1
                    dev['parts'] -= 1
                    reassign_parts[part].append(replica)

        reassign_parts.update(spread_out_parts)
        reassign_parts.update(removed_dev_parts)

        reassign_parts_list = list(reassign_parts.iteritems())
        # We shuffle the partitions to reassign so we get a more even
        # distribution later. There has been discussion of trying to distribute
        # partitions more "regularly" because that would actually reduce risk
        # but 1) it is really difficult to do this with uneven clusters and 2)
        # it would concentrate load during failure recovery scenarios
        # (increasing risk). The "right" answer has yet to be debated to
        # conclusion, but working code wins for now.
        shuffle(reassign_parts_list)
        return reassign_parts_list