def distributeSlices(self, root, increase_slices):
    disks = []
    self.getAllDisks(root, disks)
    self.total_slices += increase_slices
    groups = self.divideDisksIntoGroups(disks)
    full_disk_count = 0

    for i in xrange(self.total_slices - increase_slices, self.total_slices):
        group = choice(groups)
        self.slice_locations.append(group)
        for disk in group:
            if len(disk.getChildren()) > self.conf.max_chunks_per_disk:
                full_disk_count += self.n
                groups.remove(group)
                error_logger.error(
                    "A Partition is completely full, full disk count is " +
                    str(full_disk_count))
                break
            disk.addChild(i)
        self._my_assert(len(self.slice_locations[i]) == self.n)

    self._my_assert(len(self.slice_locations) == self.total_slices)
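# Illustrative note (interpretation, not part of the original source): with
# n == 3 and divideDisksIntoGroups() returning groups of exactly n disks,
# slice i is mapped to a whole group, so afterwards
#     self.slice_locations[i] -> [disk_a, disk_b, disk_c]
# and each of those disks lists i among its children.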
def distributeSliceToDisk(self, slice_index, disks, available_racks,
                          separate_racks):
    retry_count = 0
    same_rack_count = 0
    same_disk_count = 0
    full_disk_count = 0
    while True:
        retry_count += 1
        # choose a disk from the right rack
        if len(available_racks) == 0:
            raise Exception("No racks left")
        prev_racks_index = randint(0, len(available_racks) - 1)
        rack_disks = available_racks[prev_racks_index]
        disk_index_in_rack = randint(0, len(rack_disks) - 1)
        disk = rack_disks[disk_index_in_rack]

        if disk.getMetadata().slice_count >= self.conf.chunks_per_disk:
            full_disk_count += 1
            rack_disks.remove(disk)
            if len(rack_disks) == 0:
                error_logger.error(
                    "One rack is completely full" +
                    str(disk.getParent().getParent().getID()))
                available_racks.remove(rack_disks)
                disks.remove(rack_disks)
            if retry_count > 100:
                error_logger.error(
                    "Unable to distribute slice " + str(slice_index) +
                    "; picked full disk " + str(full_disk_count) +
                    " times, same rack " + str(same_rack_count) +
                    " times, and same disk " + str(same_disk_count) +
                    " times")
                raise Exception("Disk distribution failed")
            continue

        available_racks.remove(rack_disks)
        m = disk.getMetadata()
        if m.slices == []:
            m.slices = [-10] * self.conf.chunks_per_disk

        # LZR
        self.slice_locations[slice_index].append(disk)
        # add the slice index to the children list of the disk
        disk.addChild(slice_index)

        m.slices[m.slice_count] = slice_index
        m.slice_count += 1
        break
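# Note (reading of the code above, not from the original author): m.slices is
# lazily pre-allocated as chunks_per_disk entries of the sentinel value -10;
# slot m.slice_count is then overwritten with the real slice index, so -10
# simply marks an unused slot rather than a valid slice id.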
def distributeSliceToDisk(self, slice_index, machines):
    retry_count = 0
    full_disk_count = 0
    full_machine_count = 0
    locations = []
    if len(machines) < self.r:
        raise Exception("Not enough racks left")

    retry_flag = True
    while retry_flag:
        racks_for_slice = sample(machines, self.r)
        retry_flag = False
        for i, rack in enumerate(racks_for_slice):
            if len(rack) < self.slices_chunks_on_racks[i]:
                retry_flag = True
                retry_count += 1
                if retry_count > 100:
                    error_logger.error("Unable to distribute slice " +
                                       str(slice_index))
                    raise Exception("Data distribution failed")
                else:
                    break

    # choose machines from the right rack
    for i, rack in enumerate(racks_for_slice):
        machines_for_slice = sample(rack, self.slices_chunks_on_racks[i])
        for machine in machines_for_slice:
            disk = choice(machine.getChildren())
            locations.append(disk)
            disk.addChild(slice_index)
            slice_count = len(disk.getChildren())
            if slice_count >= self.conf.max_chunks_per_disk:
                full_disk_count += 1
                error_logger.info("One disk is completely full " +
                                  str(disk.toString()))
                rack.remove(machine)
                if len(rack) == 0:
                    error_logger.error("One rack is completely full" +
                                       str(machine.getParent().getID()))
                    machines.remove(rack)

    # LZR
    self.slice_locations[slice_index] = locations
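# Illustrative sketch (assumed numbers, not from the original source): with
# self.r == 3 and self.slices_chunks_on_racks == [2, 2, 1], a slice of
# n == 5 chunks is spread over 3 sampled racks: 2 machines from each of the
# first two racks, 1 from the third, and one randomly chosen disk per machine,
# so len(locations) == 5.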
def handleEagerRecoveryStart(self, u, time, e, queue):
    self._my_assert(isinstance(u, Machine))
    self.total_eager_machine_repairs += 1
    u.setLastFailureTime(e.getTime())
    original_failure_time = e.getTime()

    # Eager recovery begins now and ends at e.next_recovery_time (when the
    # machine itself recovers). The recovery rate is
    # (recovery_bandwidth_cap - current_recovery_bandwidth) MB/hr, so the
    # total number of chunks that can be recovered is the eager recovery
    # duration times that rate. Recovery happens in installments of
    # installment_size chunks each; the last installment holds
    # (total num chunks % installment_size) chunks.
    self._my_assert(e.next_recovery_time - e.getTime() > 0)
    self._my_assert(self.current_recovery_bandwidth >= 0)
    recovery_rate = self.recovery_bandwidth_cap - \
        self.current_recovery_bandwidth
    if recovery_rate <= 0:
        return

    num_chunks_to_recover = int((recovery_rate / self.conf.chunk_size) *
                                (e.next_recovery_time - e.getTime()))
    if num_chunks_to_recover < 1:
        return

    recovery_rate = num_chunks_to_recover * self.conf.chunk_size / \
        (e.next_recovery_time - e.getTime())
    self._my_assert(recovery_rate >= 0)
    self.current_recovery_bandwidth += recovery_rate
    self._my_assert(self.current_recovery_bandwidth >= 0)

    curr_installment_size = self.conf.installment_size
    if num_chunks_to_recover < self.conf.installment_size:
        curr_installment_size = num_chunks_to_recover

    try:
        slice_installment = SliceSet("SliceSet-" + u.toString(), [])
        slice_installment.setLastFailureTime(u.getLastFailureTime())
        slice_installment.setOriginalFailureTime(original_failure_time)
    except Exception as e:
        error_logger.error("Error in eager recovery: " + str(e))
        return
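# Worked example (illustrative numbers only, assuming the bandwidth values are
# floats so true division applies, chunk_size shares units with the cap, e.g.
# MB and MB/hr, and times are in hours): with recovery_bandwidth_cap = 1000,
# current_recovery_bandwidth = 400, chunk_size = 256 and a 10-hour window,
#     num_chunks_to_recover = int((600 / 256.0) * 10) = 23
# and the rate is re-derived from the rounded chunk count as
#     recovery_rate = 23 * 256 / 10.0 = 588.8
# which is the amount actually added to current_recovery_bandwidth.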
def distributeSlices(self, root, increase_slices):
    full_disk_count = 0
    full_machine_count = 0
    disks_per_machine = self.returnDisksPerMachine()
    self.total_slices += increase_slices
    machines_in_racks = self.getAllMachines()
    copy_sets = self.divideMachinesIntoSets(machines_in_racks)
    full_disk_indexes = [[[] for i in xrange(self.s)]
                         for j in xrange(len(copy_sets))]
    full_machine_indexes = [[] for j in xrange(len(copy_sets))]
    copysets_index = [i for i in xrange(len(copy_sets))]

    for i in xrange(self.total_slices - increase_slices, self.total_slices):
        locations = []
        retry_count = 0
        while retry_count <= 100:
            copy_set_index = choice(copysets_index)
            copy_set = copy_sets[copy_set_index]
            machine_indexes = self._getMachinesFromCopyset(
                copy_set, full_machine_indexes[copy_set_index])
            if machine_indexes is None:
                # this copyset is exhausted; drop it and retry with another
                copysets_index.remove(copy_set_index)
                retry_count += 1
                continue
            else:
                break

        for machine_index in machine_indexes:
            machine = copy_set[machine_index]
            disk_indexes = [j for j in xrange(self.conf.disks_per_machine)]
            for full_index in full_disk_indexes[copy_set_index][machine_index]:
                disk_indexes.remove(full_index)
            try:
                disk_index = choice(disk_indexes)
            except IndexError:
                raise Exception("full machine is " + machine.toString())
            disk = machine.getChildren()[disk_index]
            disk.addChild(i)
            locations.append(disk)
            if len(disk.getChildren()) >= self.conf.max_chunks_per_disk:
                full_disk_count += 1
                full_disk_indexes[copy_set_index][machine_index].append(
                    disk_index)
                error_logger.error(
                    "A disk is completely full, full disk is " +
                    disk.toString())
                if len(full_disk_indexes[copy_set_index]
                       [machine_index]) == disks_per_machine:
                    full_machine_count += 1
                    full_machine_indexes[copy_set_index].append(
                        machine_index)
                    error_logger.error(
                        "A machine is completely full, full machine is " +
                        machine.toString())

        self.slice_locations.append(locations)
        self._my_assert(len(self.slice_locations[i]) == self.n)

    self._my_assert(len(self.slice_locations) == self.total_slices)
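# Illustrative note (assumed shapes, not from the original source): with
# self.s == 2 machines per copyset and 3 disks per machine, full_disk_indexes
# is a per-copyset, per-machine list of disk slots that can no longer accept
# chunks, e.g. full_disk_indexes[cs][m] == [0, 2] leaves only disk index 1
# selectable on machine m of copyset cs; once all disks_per_machine slots are
# listed, machine_index is added to full_machine_indexes[cs] and skipped by
# _getMachinesFromCopyset().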
def __init__(self):
    error_logger.error("Starting simulation ")
def handleEagerRecoveryInstallment(self, u, time, e):
    self._my_assert(isinstance(u, SliceSet))
    transfer_required = 0.0
    if u.getLastBandwidthNeed() != -1:
        self.current_recovery_bandwidth -= u.getLastBandwidthNeed()
        if self.current_recovery_bandwidth < 0 and \
           self.current_recovery_bandwidth > -1:
            self.current_recovery_bandwidth = 0
    self._my_assert(self.current_recovery_bandwidth >= 0)

    for slice_index in u.slices:
        # slice_index = s.intValue()
        if self.status[slice_index] == self.lost_slice:
            if slice_index in self.unavailable_slice_durations.keys() and \
               len(self.unavailable_slice_durations[slice_index][-1]) == 1:
                self.unavailable_slice_durations[slice_index][
                    -1].append(time)
            continue

        threshold_crossed = False
        actual_threshold = self.recovery_threshold
        # need uc = u?
        actual_threshold = self.conf.getAvailableLazyThreshold(
            e.getTime() - u.getOriginalFailureTime())
        if self.durableCount(slice_index) <= actual_threshold:
            threshold_crossed = True

        if self.availability_counts_for_recovery:
            if self.availableCount(slice_index) <= actual_threshold:
                threshold_crossed = True

        if threshold_crossed:
            if self.isLost(slice_index):
                self.status[slice_index] = self.lost_slice
                continue
            if not self.isRepairable(slice_index):
                continue

            self.total_eager_slice_repairs += 1
            if self.lazy_recovery:
                chunks_recovered = self.parallelRepair(slice_index)
                # self.handleSliceRecovery(slice_index, e, False)
                self._my_assert(
                    self.availableCount(slice_index) == self.n and
                    self.durableCount(slice_index) == self.n)
                if self.durableCount(slice_index) != self.n:
                    self.sliceRecovered(slice_index)
                else:
                    self.sliceRecoveredAvailability(slice_index)
                transfer_required += self.k - 1 + chunks_recovered
            else:
                if self.availableCount(slice_index) < self.n:
                    try:
                        index = self.status[slice_index].index(0)
                    except ValueError:
                        error_logger.error("No block crash in slice " +
                                           str(slice_index))
                        continue
                    rc = self.repair(slice_index, index)
                    transfer_required += rc
                    if self.durableCount(slice_index) != self.n:
                        self.sliceRecovered(slice_index)
                    else:
                        self.sliceRecoveredAvailability(slice_index)

    u.setLastFailureTime(e.getTime())
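# Illustrative example (assumed parameters): for an (n, k) = (9, 6) code with
# getAvailableLazyThreshold() returning 8, a slice whose durable or available
# chunk count has fallen to 8 or below crosses the threshold; with
# lazy_recovery enabled it is brought back to n chunks in one parallelRepair()
# pass, otherwise a single missing chunk (status value 0) is repaired.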