Example #1
    def distributeSlices(self, root, increase_slices):
        disks = []

        self.getAllDisks(root, disks)
        self.total_slices += increase_slices
        groups = self.divideDisksIntoGroups(disks)

        full_disk_count = 0
        for i in xrange(self.total_slices - increase_slices,
                        self.total_slices):
            group = choice(groups)
            self.slice_locations.append(group)
            for disk in group:
                if len(disk.getChildren()) > self.conf.max_chunks_per_disk:
                    full_disk_count += self.n
                    groups.remove(group)
                    error_logger.error(
                        "A Partition is completely full, full disk count is " +
                        str(full_disk_count))
                    break
                disk.addChild(i)

            self._my_assert(len(self.slice_locations[i]) == self.n)

        self._my_assert(len(self.slice_locations) == self.total_slices)
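The method above leans on the simulator's own classes (self.conf, getAllDisks, divideDisksIntoGroups). As a rough, self-contained sketch of the same placement pattern, with hypothetical stand-in names (Disk, place_slices, max_chunks_per_disk), it might be reduced to:

from random import choice

class Disk(object):
    def __init__(self, name):
        self.name = name
        self.children = []                # slice indexes stored on this disk

def place_slices(groups, num_slices, max_chunks_per_disk):
    slice_locations = []
    for slice_index in range(num_slices):
        group = choice(groups)            # uniform random group, as above
        slice_locations.append(group)
        for disk in group:
            if len(disk.children) >= max_chunks_per_disk:
                groups.remove(group)      # retire a group once a disk fills up
                break
            disk.children.append(slice_index)
    return slice_locations

# usage: three groups of three disks each, ten slices, four chunks per disk
groups = [[Disk("d%d%d" % (g, d)) for d in range(3)] for g in range(3)]
locations = place_slices(groups, 10, 4)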
Example #2
    def distributeSliceToDisk(self, slice_index, disks, available_racks,
                              separate_racks):
        retry_count = 0
        same_rack_count = 0
        same_disk_count = 0
        full_disk_count = 0
        while True:
            retry_count += 1
            # choose disk from the right rack
            if len(available_racks) == 0:
                raise Exception("No racks left")
            prev_racks_index = randint(0, len(available_racks) - 1)
            rack_disks = available_racks[prev_racks_index]

            disk_index_in_rack = randint(0, len(rack_disks) - 1)
            disk = rack_disks[disk_index_in_rack]
            if disk.getMetadata().slice_count >= self.conf.chunks_per_disk:
                full_disk_count += 1
                rack_disks.remove(disk)

                if len(rack_disks) == 0:
                    error_logger.error(
                        "One rack is completely full" +
                        str(disk.getParent().getParent().getID()))
                    available_racks.remove(rack_disks)
                    disks.remove(rack_disks)
                if retry_count > 100:
                    error_logger.error("Unable to distribute slice " +
                                       str(slice_index) +
                                       "; picked full disk " +
                                       str(full_disk_count) +
                                       " times, same rack " +
                                       str(same_rack_count) +
                                       " times, and same disk " +
                                       str(same_disk_count) + " times")
                    raise Exception("Disk distribution failed")
                continue

            available_racks.remove(rack_disks)

            m = disk.getMetadata()
            if m.slices == []:
                m.slices = [-10] * self.conf.chunks_per_disk

            # LZR
            self.slice_locations[slice_index].append(disk)
            # add the slice index to the disk's children list
            disk.addChild(slice_index)
            m.slices[m.slice_count] = slice_index
            m.slice_count += 1
            break
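Stripped of the simulator's metadata bookkeeping, the retry loop above amounts to: pick a random rack, pick a random disk in it, retire full disks (and emptied racks), and give up after 100 attempts. A minimal sketch with hypothetical names (pick_disk, slice_count_of) might look like:

from random import randint

def pick_disk(available_racks, slice_count_of, chunks_per_disk,
              max_retries=100):
    retries = 0
    while True:
        retries += 1
        if len(available_racks) == 0:
            raise Exception("No racks left")
        rack_disks = available_racks[randint(0, len(available_racks) - 1)]
        disk = rack_disks[randint(0, len(rack_disks) - 1)]
        if slice_count_of(disk) >= chunks_per_disk:
            rack_disks.remove(disk)                  # retire the full disk
            if len(rack_disks) == 0:
                available_racks.remove(rack_disks)   # retire the emptied rack
            if retries > max_retries:
                raise Exception("Disk distribution failed")
            continue
        available_racks.remove(rack_disks)   # one chunk per rack for this slice
        return disk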
Example #3
    def distributeSliceToDisk(self, slice_index, machines):
        retry_count = 0
        full_disk_count = 0
        full_machine_count = 0
        locations = []

        if len(machines) < self.r:
            raise Exception("No enough racks left")

        retry_flag = True
        while retry_flag:
            racks_for_slice = sample(machines, self.r)

            retry_flag = False
            for i, rack in enumerate(racks_for_slice):
                if len(rack) < self.slices_chunks_on_racks[i]:
                    retry_flag = True
                    retry_count += 1
                    if retry_count > 100:
                        error_logger.error("Unable to distribute slice " +
                                           str(slice_index))
                        raise Exception("Data distribution failed")
                    else:
                        break

        # choose machines from the right rack
        for i, rack in enumerate(racks_for_slice):
            machines_for_slice = sample(rack, self.slices_chunks_on_racks[i])
            for machine in machines_for_slice:
                disk = choice(machine.getChildren())
                locations.append(disk)
                disk.addChild(slice_index)
                slice_count = len(disk.getChildren())
                if slice_count >= self.conf.max_chunks_per_disk:
                    full_disk_count += 1
                    error_logger.info("One disk is completely full " +
                                      str(disk.toString()))
                    rack.remove(machine)

                if len(rack) == 0:
                    error_logger.error("One rack is completely full" +
                                       str(machine.getParent().getID()))
                    machines.remove(rack)
        # LZR
        self.slice_locations[slice_index] = locations
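The two-level draw above (r racks, then the required number of machines per rack, then one random disk per machine) can be sketched independently of the simulator, with hypothetical names (place_on_racks, chunks_per_rack) and racks modelled as plain nested lists:

from random import sample, choice

def place_on_racks(racks, chunks_per_rack, max_retries=100):
    # racks is a list of racks; a rack is a list of machines; a machine is a
    # list of disks. chunks_per_rack[i] chunks go to the i-th chosen rack.
    r = len(chunks_per_rack)
    if len(racks) < r:
        raise Exception("Not enough racks left")
    for _ in range(max_retries):
        chosen = sample(racks, r)
        if all(len(rack) >= need
               for rack, need in zip(chosen, chunks_per_rack)):
            break
    else:
        raise Exception("Data distribution failed")
    locations = []
    for rack, need in zip(chosen, chunks_per_rack):
        for machine in sample(rack, need):
            locations.append(choice(machine))    # one random disk per machine
    return locations

# usage: two racks of two machines with two disks each, one chunk per rack
racks = [[["r%dm%dd%d" % (r, m, d) for d in range(2)]
          for m in range(2)] for r in range(2)]
print(place_on_racks(racks, [1, 1]))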
Example #4
    def handleEagerRecoveryStart(self, u, time, e, queue):
        self._my_assert(isinstance(u, Machine))
        self.total_eager_machine_repairs += 1
        u.setLastFailureTime(e.getTime())
        original_failure_time = e.getTime()

        # Eager recovery begins now, and ends at time e.next_recovery_time
        # (which is when the machine recovers). Recovery rate will be
        # (recoveryBandwidthCap - currentRecoveryBandwidth) MB/hr. Therefore,
        # total number of chunks that can be recovered = eager recovery
        # duration * recovery rate. This happens in installments, of
        # installmentSize number of chunks each. The last installment will
        # have (total num chunks % installmentSize) number of chunks
        self._my_assert(e.next_recovery_time - e.getTime() > 0)
        self._my_assert(self.current_recovery_bandwidth >= 0)
        recovery_rate = self.recovery_bandwidth_cap - \
            self.current_recovery_bandwidth
        if recovery_rate <= 0:
            return

        num_chunks_to_recover = int((recovery_rate / self.conf.chunk_size) *
                                    (e.next_recovery_time - e.getTime()))
        if num_chunks_to_recover < 1:
            return

        recovery_rate = num_chunks_to_recover*self.conf.chunk_size / \
            (e.next_recovery_time-e.getTime())
        self._my_assert(recovery_rate >= 0)
        self.current_recovery_bandwidth += recovery_rate
        self._my_assert(self.current_recovery_bandwidth >= 0)

        curr_installment_size = self.conf.installment_size
        if num_chunks_to_recover < self.conf.installment_size:
            curr_installment_size = num_chunks_to_recover

        try:
            slice_installment = SliceSet("SliceSet-" + u.toString(), [])
            slice_installment.setLastFailureTime(u.getLastFailureTime())
            slice_installment.setOriginalFailureTime(original_failure_time)
        except Exception as exc:
            error_logger.error("Error in eager recovery: " + str(exc))
            return
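The comment block above fixes the whole calculation, so a worked example with made-up numbers may help: suppose a 10 GB/hr recovery bandwidth cap of which 4 GB/hr is already in use, 256 MB chunks, a 2-hour window until the machine recovers, and installments of 20 chunks.

recovery_rate = 10 * 1024 - 4 * 1024      # 6144 MB/hr still available
window = 2.0                              # hours until e.next_recovery_time
chunk_size = 256                          # MB per chunk
num_chunks_to_recover = int((recovery_rate / chunk_size) * window)   # 48
installment_size = 20
full_installments = num_chunks_to_recover // installment_size   # 2 x 20 chunks
last_installment = num_chunks_to_recover % installment_size     # plus 8 chunks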
Example #5
    def distributeSlices(self, root, increase_slices):
        full_disk_count = 0
        full_machine_count = 0
        disks_per_machine = self.returnDisksPerMachine()
        self.total_slices += increase_slices
        machines_in_racks = self.getAllMachines()
        copy_sets = self.divideMachinesIntoSets(machines_in_racks)

        full_disk_indexes = [[[] for i in xrange(self.s)]
                             for j in xrange(len(copy_sets))]
        full_machine_indexes = [[] for j in xrange(len(copy_sets))]
        copysets_index = [i for i in xrange(len(copy_sets))]
        for i in xrange(self.total_slices - increase_slices,
                        self.total_slices):
            locations = []
            retry_count = 0
            while retry_count <= 100:
                copy_set_index = choice(copysets_index)
                copy_set = copy_sets[copy_set_index]
                machine_indexes = self._getMachinesFromCopyset(
                    copy_set, full_machine_indexes[copy_set_index])
                if machine_indexes is None:
                    copysets_index.remove(copy_set_index)
                    retry_count += 1
                    continue
                else:
                    break

            for machine_index in machine_indexes:
                machine = copy_set[machine_index]

                disk_indexes = [j for j in xrange(self.conf.disks_per_machine)]
                # use a distinct name here: `i` is the slice index of the
                # outer loop and is stored via disk.addChild(i) below
                for full_index in full_disk_indexes[copy_set_index][
                        machine_index]:
                    disk_indexes.remove(full_index)
                try:
                    disk_index = choice(disk_indexes)
                except IndexError:
                    raise Exception("full machine is " + machine.toString())
                disk = machine.getChildren()[disk_index]
                disk.addChild(i)
                locations.append(disk)

                if len(disk.getChildren()) >= self.conf.max_chunks_per_disk:
                    full_disk_count += 1
                    full_disk_indexes[copy_set_index][machine_index].append(
                        disk_index)
                    error_logger.error(
                        "A disk is completely full, full disk is " +
                        disk.toString())
                    if len(full_disk_indexes[copy_set_index]
                           [machine_index]) == disks_per_machine:
                        full_machine_count += 1
                        full_machine_indexes[copy_set_index].append(
                            machine_index)
                        error_logger.error(
                            "A machine is completely full, full machine is " +
                            machine.toString())

            self.slice_locations.append(locations)

            self._my_assert(len(self.slice_locations[i]) == self.n)
        self._my_assert(len(self.slice_locations) == self.total_slices)
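A condensed sketch of the per-machine bookkeeping above, with a hypothetical helper name (pick_disk_index): the disk is drawn uniformly from the indexes not yet recorded in the machine's full-disk list, and an empty candidate list means the machine itself is full.

from random import choice

def pick_disk_index(disks_per_machine, full_disk_indexes):
    candidates = [j for j in range(disks_per_machine)
                  if j not in full_disk_indexes]
    if not candidates:
        raise Exception("machine is completely full")
    return choice(candidates)

# e.g. a machine with 4 disks whose disks 0 and 2 are already full:
print(pick_disk_index(4, [0, 2]))    # prints 1 or 3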
Example #6
    def __init__(self):
        error_logger.error("Starting simulation ")
Example #7
    def handleEagerRecoveryInstallment(self, u, time, e):
        self._my_assert(isinstance(u, SliceSet))
        transfer_required = 0.0
        if u.getLastBandwidthNeed() != -1:
            self.current_recovery_bandwidth -= u.getLastBandwidthNeed()
            if self.current_recovery_bandwidth < 0 and \
               self.current_recovery_bandwidth > -1:
                self.current_recovery_bandwidth = 0
                self._my_assert(self.current_recovery_bandwidth >= 0)

            for slice_index in u.slices:
                # slice_index = s.intValue()
                if self.status[slice_index] == self.lost_slice:
                    if slice_index in self.unavailable_slice_durations.keys() and \
                        len(self.unavailable_slice_durations[slice_index][-1]) == 1:
                        self.unavailable_slice_durations[slice_index][
                            -1].append(time)
                    continue

                threshold_crossed = False
                actual_threshold = self.recovery_threshold
                # need uc = u?
                actual_threshold = self.conf.getAvailableLazyThreshold(
                    e.getTime() - u.getOriginalFailureTime())

                if self.durableCount(slice_index) <= actual_threshold:
                    threshold_crossed = True

                if self.availability_counts_for_recovery:
                    if self.availableCount(slice_index) <= actual_threshold:
                        threshold_crossed = True

                if threshold_crossed:
                    if self.isLost(slice_index):
                        self.status[slice_index] = self.lost_slice
                        continue
                    if not self.isRepairable(slice_index):
                        continue
                    self.total_eager_slice_repairs += 1
                    if self.lazy_recovery:
                        chunks_recovered = self.parallelRepair(slice_index)
                        # self.handleSliceRecovery(slice_index, e, False)
                        self._my_assert(
                            self.availableCount(slice_index) == self.n
                            and self.durableCount(slice_index) == self.n)
                        if self.durableCount(slice_index) != self.n:
                            self.sliceRecovered(slice_index)
                        else:
                            self.sliceRecoveredAvailability(slice_index)
                        transfer_required += self.k - 1 + chunks_recovered
                    else:
                        if self.availableCount(slice_index) < self.n:
                            try:
                                index = self.status[slice_index].index(0)
                            except ValueError:
                                error_logger.error("No block crash in slice " +
                                                   str(slice_index))
                                continue
                            rc = self.repair(slice_index, index)
                            transfer_required += rc
                            if self.durableCount(slice_index) != self.n:
                                self.sliceRecovered(slice_index)
                            else:
                                self.sliceRecoveredAvailability(slice_index)

            u.setLastFailureTime(e.getTime())
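The threshold test buried in the loop above can be isolated as a small predicate; a minimal sketch with illustrative names (needs_repair), assuming the same semantics for the durable and available counts:

def needs_repair(durable_count, available_count, threshold,
                 availability_counts_for_recovery):
    # repair once durability drops to the threshold, or, when availability
    # also counts toward recovery, once availability drops to it
    if durable_count <= threshold:
        return True
    if availability_counts_for_recovery and available_count <= threshold:
        return True
    return False

# e.g. n = 9 chunks with a lazy threshold of 7:
print(needs_repair(9, 7, 7, True))     # True: availability triggers repair
print(needs_repair(9, 7, 7, False))    # False: durability alone is still fine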