def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate this component's failure/recovery events for a time window.

    Failures are drawn from ``failure_generator``; each one is repaired after
    the delay drawn from ``recovery_generator`` plus ``disk_repair_time``.
    Every drawn failure is reconciled with the pending correlated-failure
    intervals in ``self.failure_intervals``: an overlapping interval
    stretches the drawn failure, a non-overlapping earlier one is emitted as
    its own Failure/Recovered pair, and every visited interval is removed
    from the pending list.  Once a drawn failure lies beyond ``end_time``
    all intervals still pending are handed to ``addCorrelatedFailures``.

    Args:
        result_events: Event sink; only its ``addEvent`` method is used.
        start_time: Requested window start, raised to ``self.start_time``
            when it lies before it.
        end_time: Window end.  A recovery past it is still emitted, after
            which generation stops.
        reset: Not read by this implementation.
    """
    clock = max(start_time, self.start_time)

    while True:
        self.failure_generator.reset(clock)
        failure_time = self.failure_generator.generateNextEvent(clock)

        if failure_time > end_time:
            # Walk a snapshot so the pending list may be edited meanwhile
            # (presumably addCorrelatedFailures removes what it emits).
            for begin, finish, lost in deepcopy(self.failure_intervals):
                self.addCorrelatedFailures(result_events, begin, finish, lost)
            return

        self.recovery_generator.reset(failure_time)
        recovery_time = self.recovery_generator.generateNextEvent(failure_time)
        assert (recovery_time > failure_time)
        # The recovery generator only yields the failure identification
        # time; the data transfer time is added on top of it.
        recovery_time += self.disk_repair_time

        for interval in deepcopy(self.failure_intervals):
            begin, finish, _lost = interval
            if recovery_time < begin:
                break
            # Merge the correlated failure with the component failure.
            covers_failure = begin < failure_time <= finish
            if covers_failure:
                failure_time = begin
            covers_recovery = begin < recovery_time <= finish
            if covers_recovery:
                recovery_time = finish
            if not (covers_failure or covers_recovery):
                correlated_failure = Event(Event.EventType.Failure, begin,
                                           self)
                correlated_failure.next_recovery_time = finish
                result_events.addEvent(correlated_failure)
                result_events.addEvent(
                    Event(Event.EventType.Recovered, finish, self))
            self.failure_intervals.remove(interval)

        fail_event = Event(Event.EventType.Failure, failure_time, self)
        result_events.addEvent(fail_event)
        fail_event.next_recovery_time = recovery_time

        # The recovery is recorded even when it falls outside the window.
        result_events.addEvent(
            Event(Event.EventType.Recovered, recovery_time, self))
        if recovery_time > end_time:
            return
        clock = recovery_time
def generateLatentErrors(self, result_events, start_time, end_time):
    """Generate latent-defect events and their scrub recoveries for a window.

    Latent error times are drawn repeatedly from ``latent_error_generator``.
    For each accepted error a ``LatentDefect`` event is added and its
    recovery time is drawn from ``scrub_generator``; a ``LatentRecovered``
    event is added when that recovery lies strictly before ``end_time``.
    Generation stops when the generator returns infinity, when an error
    falls after ``end_time`` or inside one of ``self.failure_intervals``,
    or when a recovery reaches ``end_time``.

    Args:
        result_events: Event sink; only its ``addEvent`` method is used.
        start_time: Time from which the first latent error is drawn.
        end_time: End of the window.

    Raises:
        Exception: If ``start_time`` or ``end_time`` is infinite or NaN, or
            if the generator yields NaN as a latent error time.
    """
    if isinf(start_time) or isnan(start_time):
        raise Exception("start time = Inf or NAN")
    if isinf(end_time) or isnan(end_time):
        raise Exception("end time = Inf or NaN")

    current_time = start_time
    while True:
        latent_error_time = self.latent_error_generator.generateNextEvent(
            current_time)
        # An infinite time means the generator has no further error.
        if isinf(latent_error_time):
            break
        # Infinity was handled above, so NaN is the only invalid value
        # left; report it under its own name instead of "current time".
        if isnan(latent_error_time):
            raise Exception("latent error time is NaN")

        # A latent error inside a correlated failure interval ends the
        # generation, exactly like one that falls after the window.
        in_correlated_failure = any(
            fail_time <= latent_error_time <= recover_time
            for fail_time, recover_time, _lost in self.failure_intervals)

        current_time = latent_error_time
        if current_time > end_time or in_correlated_failure:
            break

        defect_event = Event(Event.EventType.LatentDefect, current_time, self)
        result_events.addEvent(defect_event)

        latent_recovery_time = self.scrub_generator.generateNextEvent(
            current_time)
        defect_event.next_recovery_time = latent_recovery_time
        if latent_recovery_time >= end_time:
            break
        result_events.addEvent(
            Event(Event.EventType.LatentRecovered, latent_recovery_time,
                  self))
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure/recovery events for this unit and its children.

    Without a failure generator the unit never fails itself and simply
    delegates the whole window to its children.  Otherwise failures are
    drawn from ``failure_generator`` and recoveries from
    ``recovery_generator``; the children generate their own events for each
    stretch between the last recovery and the next failure (or the end of
    the window).

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to the children.
        start_time: Start of the window.
        end_time: End of the window.  A recovery past it is not emitted, but
            the failure event still records it as ``next_recovery_time``.
        reset: When true, the failure generator is reset to the current time
            before every draw.  Children are always called with ``True``.
    """
    current_time = start_time
    last_recover_time = start_time

    if self.failure_generator is None:
        for u in self.children:
            u.generateEvents(result_events, start_time, end_time, True)
        return

    while True:
        if reset:
            self.failure_generator.reset(current_time)
        failure_time = self.failure_generator.generateNextEvent(current_time)
        current_time = failure_time
        if current_time > end_time:
            # No further failure in the window: children cover the rest.
            for u in self.children:
                u.generateEvents(result_events, last_recover_time, end_time,
                                 True)
            break

        fail_event = Event(Event.EventType.Failure, current_time, self)
        result_events.addEvent(fail_event)
        if self.fast_forward:
            fail_event.ignore = True

        # Children generate events for the stretch this unit was running.
        for u in self.children:
            u.generateEvents(result_events, last_recover_time, current_time,
                             True)

        self.recovery_generator.reset(current_time)
        recovery_time = self.recovery_generator.generateNextEvent(
            current_time)
        assert (recovery_time > failure_time)
        current_time = recovery_time
        fail_event.next_recovery_time = recovery_time

        if current_time > end_time:
            break
        if self.fast_forward:
            # Was ``Event.EventType.Recoverd``, a misspelling of the member
            # used everywhere else, which fails on attribute lookup.
            result_events.addEvent(
                Event(Event.EventType.Recovered, current_time, self,
                      ignore=True))
        else:
            result_events.addEvent(
                Event(Event.EventType.Recovered, current_time, self))
        last_recover_time = current_time
def addCorrelatedFailures(self, result_events, failure_time, recovery_time,
                          lost_flag):
    """Emit the Failure/Recovered pair of one correlated failure interval.

    The failure type passed to both events is 3 when ``lost_flag`` is set,
    otherwise 1 when the outage lasts at most ``self.fail_timeout`` and 2
    when it lasts longer.  If the interval is still listed in
    ``self.failure_intervals`` it is removed from that list.

    Args:
        result_events: Event sink; only its ``addEvent`` method is used.
        failure_time: Start of the correlated failure.
        recovery_time: End of the correlated failure.
        lost_flag: Truthy selects failure type 3.

    Returns:
        The Failure event that was added.
    """
    if lost_flag:
        failure_type = 3
    elif recovery_time - failure_time <= self.fail_timeout:
        failure_type = 1
    else:
        failure_type = 2

    fail_event = Event(Event.EventType.Failure, failure_time, self,
                       failure_type)
    fail_event.next_recovery_time = recovery_time
    recovery_event = Event(Event.EventType.Recovered, recovery_time, self,
                           failure_type)
    for event in (fail_event, recovery_event):
        result_events.addEvent(event)

    # The interval has been emitted, so it must not stay pending.
    interval = [failure_time, recovery_time, lost_flag]
    if interval in self.failure_intervals:
        self.failure_intervals.remove(interval)
    return fail_event
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure, recovery and latent-error events for a disk.

    Failures are drawn from ``failure_generator`` and recoveries from
    ``recovery_generator``.  When a latent error generator is configured,
    ``generateLatentErrors`` is invoked for every stretch between the last
    recovery and the next failure (or the end of the window).

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to ``generateLatentErrors``.
        start_time: Start of the window.
        end_time: End of the window.  A recovery past it is still emitted,
            after which generation stops.
        reset: Not read by this implementation.

    Raises:
        Exception: If the disk has any children.
    """
    if self.children is not None and len(self.children) > 0:
        raise Exception("Disk should not have any children!")

    clock = start_time
    healthy_since = start_time
    while True:
        self.failure_generator.reset(clock)
        failure_time = self.failure_generator.generateNextEvent(clock)

        if failure_time > end_time:
            # No more failures in the window; latent errors may still
            # appear until its end.
            if self.latent_error_generator is not None:
                self.generateLatentErrors(result_events, healthy_since,
                                          end_time)
            return

        fail_event = Event(Event.EventType.Failure, failure_time, self)
        result_events.addEvent(fail_event)
        if self.latent_error_generator is not None:
            self.generateLatentErrors(result_events, healthy_since,
                                      failure_time)

        self.recovery_generator.reset(failure_time)
        recovery_time = self.recovery_generator.generateNextEvent(
            failure_time)
        assert (recovery_time > failure_time)
        fail_event.next_recovery_time = recovery_time

        # The recovery is recorded even when it falls outside the window.
        result_events.addEvent(
            Event(Event.EventType.Recovered, recovery_time, self))
        if recovery_time > end_time:
            return
        clock = healthy_since = recovery_time
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure/recovery events for this unit and its children.

    Besides the unit's own failures (drawn from ``failure_generator`` and
    ``recovery_generator``) the pending correlated-failure intervals in
    ``self.failure_intervals`` are taken into account: an interval that
    overlaps a drawn failure stretches it, a non-overlapping earlier one is
    emitted as its own Failure/Recovered pair, and every visited interval is
    removed from the pending list.  Children generate their own events for
    each stretch between the last recovery and the next failure (or the end
    of the window).

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to the children.
        start_time: Requested window start, raised to ``self.start_time``
            when it lies before it.
        end_time: End of the window.  A recovery past it is not emitted, but
            the failure event still records it as ``next_recovery_time``.
        reset: When true, the failure generator is reset to the current time
            before every draw.  Children are always called with ``True``.
    """
    if start_time < self.start_time:
        start_time = self.start_time
    current_time = start_time
    last_recover_time = start_time

    if self.failure_generator is None:
        # The unit never fails by itself: only correlated failures and the
        # children's own events remain.
        for fail_time, recover_time, flag in deepcopy(self.failure_intervals):
            self.addCorrelatedFailures(result_events, fail_time,
                                       recover_time, flag)
        for u in self.children:
            u.generateEvents(result_events, start_time, end_time, True)
        return

    while True:
        if reset:
            self.failure_generator.reset(current_time)
        failure_time = self.failure_generator.generateNextEvent(current_time)
        current_time = failure_time
        # The recovery is drawn before the window check, as in the original
        # statement order, so the generators are consumed identically.
        self.recovery_generator.reset(current_time)
        recovery_time = self.recovery_generator.generateNextEvent(
            current_time)
        assert (recovery_time > failure_time)

        if current_time > end_time:
            for fail_time, recover_time, flag in deepcopy(
                    self.failure_intervals):
                self.addCorrelatedFailures(result_events, fail_time,
                                           recover_time, flag)
            for u in self.children:
                u.generateEvents(result_events, last_recover_time, end_time,
                                 True)
            break

        # Iterate over a snapshot: the loop body removes entries from
        # self.failure_intervals, and removing from the list being iterated
        # makes the iterator skip the element that follows each removal.
        for fail_time, recover_time, _bool in deepcopy(
                self.failure_intervals):
            if recovery_time < fail_time:
                break
            remove_flag = True
            # combine the correlated failure with component failure
            if fail_time < failure_time <= recover_time:
                failure_time = fail_time
                remove_flag = False
            if fail_time < recovery_time <= recover_time:
                recovery_time = recover_time
                remove_flag = False
            if remove_flag:
                result_events.addEvent(
                    Event(Event.EventType.Failure, fail_time, self))
                result_events.addEvent(
                    Event(Event.EventType.Recovered, recover_time, self))
            self.failure_intervals.remove([fail_time, recover_time, _bool])

        fail_event = Event(Event.EventType.Failure, failure_time, self)
        result_events.addEvent(fail_event)
        if self.fast_forward:
            fail_event.ignore = True

        for u in self.children:
            u.generateEvents(result_events, last_recover_time, failure_time,
                             True)

        current_time = recovery_time
        fail_event.next_recovery_time = recovery_time
        if current_time > end_time:
            break
        if self.fast_forward:
            result_events.addEvent(
                Event(Event.EventType.Recovered, current_time, self,
                      ignore=True))
        else:
            result_events.addEvent(
                Event(Event.EventType.Recovered, current_time, self))
        last_recover_time = current_time
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure, recovery and latent-error events for a disk.

    Failures are always drawn relative to ``self.last_recovery_time`` rather
    than to the start of the window, so the method can be called for
    consecutive sub-windows.  Each drawn failure is reconciled with the
    pending correlated-failure intervals in ``self.failure_intervals``.
    The latent error generator is optional: when it is ``None`` no latent
    errors are generated and nothing is reset.

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to the helper methods.
        start_time: Requested window start, raised to ``self.start_time``
            when it lies before it.  A start of exactly 0 re-initialises
            ``self.last_recovery_time`` and the latent error generator.
        end_time: End of the window.
        reset: Not read by this implementation.

    Raises:
        Exception: If a window bound is infinite or NaN, or if a recovery
            time turns out negative.
    """
    if start_time < self.start_time:
        start_time = self.start_time
    if isnan(start_time) or isinf(start_time):
        raise Exception("start_time = Inf or NAN")
    if isnan(end_time) or isinf(end_time):
        raise Exception("end_time = Inf or NAN")

    current_time = start_time

    if start_time == 0:
        self.last_recovery_time = 0
        # The generator is optional (see the end-of-window branch below),
        # so it must not be dereferenced unconditionally.
        if self.latent_error_generator is not None:
            self.latent_error_generator.reset(0)

    while True:
        if self.last_recovery_time < 0:
            raise Exception("Negative last recover time")

        # The redraw loop below avoids a spurious amplification of disk
        # failures when machines fail.  When generateEvents covers the whole
        # simulation in one call (no machine failures) the loop never runs.
        # When machines do fail, the method is called once per interval
        # between a machine recovery and the next machine failure.  A disk
        # failure drawn in one call may land after the machine failure and
        # is then discarded; in the call for the next interval a fresh draw
        # may land before the start of that interval.  Rounding such an
        # event up to the interval start would make many disks fail at the
        # same instant, so instead the failure is redrawn until it lands
        # inside the interval, consistently with the discarded one.
        failure_time = self.failure_generator.generateNextEvent(
            self.last_recovery_time)
        while failure_time < start_time:
            failure_time = self.failure_generator.generateNextEvent(
                self.last_recovery_time)

        if failure_time > end_time:
            # No own failure left in the window: flush the correlated ones.
            for fail_time, recover_time, flag in deepcopy(
                    self.failure_intervals):
                self.addCorrelatedFailures(result_events, fail_time,
                                           recover_time, flag)
            if self.latent_error_generator is not None:
                self.generateLatentErrors(result_events, current_time,
                                          end_time)
            break

        if failure_time < start_time or failure_time > end_time:
            raise Exception("Wrong time range.")

        # NOTE(review): self.last_recovery_time is read below but never
        # assigned inside the loop; presumably generateRecoveryEvent
        # advances it - confirm against that method.
        recovery_time = self.generateRecoveryEvent(result_events,
                                                   failure_time, end_time)
        if recovery_time < 0:
            raise Exception("recovery time is negative")

        # Iterate over a snapshot because entries are removed in the loop.
        for fail_time, recover_time, _bool in deepcopy(
                self.failure_intervals):
            if recovery_time < fail_time:
                break
            remove_flag = True
            # combine the correlated failure with component failure
            if fail_time < failure_time <= recover_time:
                failure_time = fail_time
                remove_flag = False
            if fail_time < recovery_time <= recover_time:
                recovery_time = recover_time
                remove_flag = False
            if remove_flag:
                disk_fail_event = Event(Event.EventType.Failure, fail_time,
                                        self)
                disk_fail_event.next_recovery_time = recover_time
                result_events.addEvent(disk_fail_event)
                result_events.addEvent(
                    Event(Event.EventType.Recovered, recover_time, self))
            self.failure_intervals.remove([fail_time, recover_time, _bool])

        fail_event = Event(Event.EventType.Failure, failure_time, self)
        result_events.addEvent(fail_event)
        fail_event.next_recovery_time = recovery_time

        if self.latent_error_generator is not None:
            # generate latent errors from the current time to the time of
            # the generated failure.
            self.generateLatentErrors(result_events, current_time,
                                      failure_time)
            # lifetime of a latent error starts when the disk is
            # reconstructed
            self.latent_error_generator.reset(recovery_time)

        # move the clocks, next iteration starts from the next recovery
        current_time = self.last_recovery_time
        if current_time < 0:
            raise Exception("current recovery time is negative")
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure, recovery and latent-error events for a disk.

    Failures are always drawn relative to ``self.last_recovery_time`` rather
    than to the start of the window, so the method can be called for
    consecutive sub-windows.  Latent errors are generated for every stretch
    between the current time and the next failure (or the end of the
    window).

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to the helper methods.
        start_time: Start of the window.  A start of exactly 0
            re-initialises ``self.last_recovery_time`` and the latent error
            generator.
        end_time: End of the window.
        reset: Not read by this implementation.

    Raises:
        Exception: If a window bound is infinite or NaN, if the disk has
            children, or if a recovery time turns out negative.
    """
    if isinf(start_time) or isnan(start_time):
        raise Exception("start_time = Inf or NAN")
    if isinf(end_time) or isnan(end_time):
        raise Exception("end_time = Inf or NAN")

    if self.children != [] or len(self.children):
        raise Exception("Disk should not have any children")

    if start_time == 0:
        self.last_recovery_time = 0
        self.latent_error_generator.reset(0)

    window_cursor = start_time
    while True:
        if self.last_recovery_time < 0:
            raise Exception("Negative last recover time")

        # Redrawing until the failure lands inside the window avoids a
        # spurious amplification of disk failures when machines fail.  With
        # a single call for the whole simulation (no machine failures) no
        # redraw ever happens.  With machine failures the method is called
        # once per interval between a machine recovery and the next machine
        # failure: a disk failure drawn in one call may land after the
        # machine failure and is discarded, and the fresh draw made for the
        # next interval may land before that interval starts.  Rounding it
        # up to the interval start would make many disks fail at once, so
        # the failure is redrawn instead, consistently with the discarded
        # one.
        while True:
            failure_time = self.failure_generator.generateNextEvent(
                self.last_recovery_time)
            if not (failure_time < start_time):
                break

        if failure_time > end_time:
            # No failure left in the window; latent errors can still occur
            # until its end.
            self.generateLatentErrors(result_events, window_cursor, end_time)
            return

        if failure_time < start_time or failure_time > end_time:
            raise Exception("Wrong time range.")

        fail_event = Event(Event.EventType.Failure, failure_time, self)
        result_events.addEvent(fail_event)

        # NOTE(review): self.last_recovery_time is read below but never
        # assigned inside the loop; presumably generateRecoveryEvent
        # advances it - confirm against that method.
        recovery_time = self.generateRecoveryEvent(result_events,
                                                   failure_time, end_time)
        if recovery_time < 0:
            raise Exception("recovery time is negative")
        fail_event.next_recovery_time = recovery_time

        # Latent errors cover the stretch up to the failure just generated.
        self.generateLatentErrors(result_events, window_cursor, failure_time)

        # The lifetime of a latent error starts once the disk is
        # reconstructed.
        self.latent_error_generator.reset(recovery_time)

        # Advance the clock: the next round starts at the latest recovery.
        window_cursor = self.last_recovery_time
        if window_cursor < 0:
            raise Exception("current recovery time is negative")
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure/recovery events for this machine and its children.

    The machine's own failures come from ``failure_generator`` and
    ``recovery_generator`` (either may be a ``Trace``); they are reconciled
    with the pending correlated-failure intervals in
    ``self.failure_intervals``.  Unless ``fast_forward`` is set, each
    failure is classified as type 3 (with probability
    ``Machine.fail_fraction``; its recovery is then redrawn from
    ``recovery_generator2`` plus ``machine_repair_time``), type 1 (outage of
    at most ``fail_timeout``) or type 2 (longer outage, optionally with an
    ``EagerRecoveryStart`` event).  Children generate their own events for
    each stretch between the last recovery and the next failure (or the end
    of the window).

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to helpers and children.
        start_time: Requested window start, raised to ``self.start_time``
            when it lies before it.
        end_time: End of the window.  Drawn recoveries are capped at
            ``end_time - 1E-5``.
        reset: When true, the failure generator is reset to the current time
            before every draw.  Children are always called with ``True``.
    """
    start_time = max(start_time, self.start_time)

    if self.failure_generator is None:
        # The machine never fails by itself: only correlated failures and
        # the children's own events remain.
        for begin, finish, lost in deepcopy(self.failure_intervals):
            self.addCorrelatedFailures(result_events, begin, finish, lost)
        for child in self.children:
            child.generateEvents(result_events, start_time, end_time, True)
        return

    # Trace generators are pointed at this machine's id.
    if isinstance(self.failure_generator, Trace):
        self.failure_generator.setCurrentMachine(self.my_id)
    if isinstance(self.recovery_generator, Trace):
        self.recovery_generator.setCurrentMachine(self.my_id)

    latest_recovery = end_time - (1E-5)
    clock = start_time
    last_recover_time = start_time
    while True:
        if reset:
            self.failure_generator.reset(clock)

        if isinstance(self.failure_generator, Trace):
            # The next trace event requested is an event start.
            self.failure_generator.setCurrentEventType(True)

        failure_time = self.failure_generator.generateNextEvent(clock)
        if failure_time > end_time:
            for begin, finish, lost in deepcopy(self.failure_intervals):
                self.addCorrelatedFailures(result_events, begin, finish,
                                           lost)
            for child in self.children:
                child.generateEvents(result_events, last_recover_time,
                                     end_time, True)
            return

        if isinstance(self.failure_generator, Trace):
            # The event start has been accepted.
            self.failure_generator.eventAccepted()

        if isinstance(self.recovery_generator, Trace):
            self.recovery_generator.setCurrentEventType(False)
        self.recovery_generator.reset(failure_time)
        recovery_time = self.recovery_generator.generateNextEvent(
            failure_time)
        assert (recovery_time > failure_time)

        for interval in deepcopy(self.failure_intervals):
            begin, finish, lost = interval
            if recovery_time < begin:
                break
            # Merge the correlated failure with the component failure.
            covers_failure = begin < failure_time <= finish
            if covers_failure:
                failure_time = begin
            covers_recovery = begin < recovery_time <= finish
            if covers_recovery:
                recovery_time = finish
            if covers_failure or covers_recovery:
                # Absorbed into the drawn failure: just drop the interval.
                self.failure_intervals.remove(interval)
            else:
                self.addCorrelatedFailures(result_events, begin, finish,
                                           lost)

        for child in self.children:
            child.generateEvents(result_events, last_recover_time,
                                 failure_time, True)

        recovery_time = min(recovery_time, latest_recovery)

        # Drawn even in fast-forward mode, as in the original statement
        # order, so the random stream is consumed identically.
        roll = random()
        if not self.fast_forward:
            # Failure types: tempAndShort=1, tempAndLong=2, permanent=3.
            if roll < Machine.fail_fraction:
                failure_type = 3
                # Detection and identification time come from
                # recovery_generator2.
                recovery_time = self.recovery_generator2.generateNextEvent(
                    failure_time) + self.machine_repair_time
            elif recovery_time - failure_time <= self.fail_timeout:
                # Transient failure, machine is back very soon.
                failure_type = 1
            else:
                # Transient failure that lasts long.
                failure_type = 2
                if self.eager_recovery_enabled:
                    eager_start_event = Event(
                        Event.EventType.EagerRecoveryStart,
                        failure_time + self.fail_timeout, self)
                    eager_start_event.next_recovery_time = recovery_time
                    result_events.addEvent(eager_start_event)
                    # Ensure machine recovery happens after the last eager
                    # recovery installment.
                    recovery_time += 1E-5

        if isinstance(self.failure_generator, Trace):
            self.failure_generator.eventAccepted()

        if self.fast_forward:
            result_events.addEvent(
                Event(Event.EventType.Failure, failure_time, self, True))
            result_events.addEvent(
                Event(Event.EventType.Recovered, recovery_time, self, True))
        else:
            fail_event = Event(Event.EventType.Failure, failure_time, self,
                               failure_type)
            fail_event.next_recovery_time = recovery_time
            result_events.addEvent(fail_event)
            result_events.addEvent(
                Event(Event.EventType.Recovered, recovery_time, self,
                      failure_type))

        clock = last_recover_time = recovery_time
        if clock >= latest_recovery:
            return
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure, recovery and latent-error events for a disk.

    Failures are drawn from ``failure_generator``; each one is repaired
    after the delay drawn from ``recovery_generator`` plus
    ``disk_repair_time``.  Every drawn failure is reconciled with the
    pending correlated-failure intervals in ``self.failure_intervals``: an
    overlapping interval stretches the drawn failure, a non-overlapping
    earlier one is emitted as its own Failure/Recovered pair, and every
    visited interval is removed from the pending list.  When a latent error
    generator is configured, ``generateLatentErrors`` is invoked for every
    stretch between the last recovery and the next failure (or the end of
    the window).

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to the helper methods.
        start_time: Requested window start, raised to ``self.start_time``
            when it lies before it.
        end_time: End of the window.  A recovery past it is still emitted,
            after which generation stops.
        reset: Not read by this implementation.
    """
    if start_time < self.start_time:
        start_time = self.start_time
    current_time = start_time
    last_recover_time = start_time

    while True:
        self.failure_generator.reset(current_time)
        failure_time = self.failure_generator.generateNextEvent(current_time)
        current_time = failure_time
        if current_time > end_time:
            # Iterate over a snapshot: the pending list may shrink while it
            # is being flushed (presumably addCorrelatedFailures removes the
            # interval it emits), and removing from a list under iteration
            # makes the iterator skip the following element.
            for fail_time, recover_time, flag in list(
                    self.failure_intervals):
                self.addCorrelatedFailures(result_events, fail_time,
                                           recover_time, flag)
            if self.latent_error_generator is not None:
                self.generateLatentErrors(result_events, last_recover_time,
                                          end_time)
            break

        self.recovery_generator.reset(current_time)
        recovery_time = self.recovery_generator.generateNextEvent(
            current_time)
        assert (recovery_time > failure_time)
        # For disk repair, detection and identification are given by the
        # recovery generator, so the data transferring time is added here.
        recovery_time += self.disk_repair_time

        # Snapshot again: the loop body itself removes visited intervals.
        for fail_time, recover_time, _bool in list(self.failure_intervals):
            if recovery_time < fail_time:
                break
            remove_flag = True
            # combine the correlated failure with component failure
            if fail_time < failure_time <= recover_time:
                failure_time = fail_time
                remove_flag = False
            if fail_time < recovery_time <= recover_time:
                recovery_time = recover_time
                remove_flag = False
            if remove_flag:
                disk_fail_event = Event(Event.EventType.Failure, fail_time,
                                        self)
                disk_fail_event.next_recovery_time = recover_time
                result_events.addEvent(disk_fail_event)
                result_events.addEvent(
                    Event(Event.EventType.Recovered, recover_time, self))
            self.failure_intervals.remove([fail_time, recover_time, _bool])

        current_time = failure_time
        fail_event = Event(Event.EventType.Failure, current_time, self)
        result_events.addEvent(fail_event)

        if self.latent_error_generator is not None:
            self.generateLatentErrors(result_events, last_recover_time,
                                      current_time)
        fail_event.next_recovery_time = recovery_time
        current_time = recovery_time

        # The recovery is recorded even when it falls outside the window.
        result_events.addEvent(
            Event(Event.EventType.Recovered, current_time, self))
        if current_time > end_time:
            break
        last_recover_time = current_time
def generateEvents(self, result_events, start_time, end_time, reset):
    """Generate failure/recovery events for this machine and its children.

    The machine's own failures come from ``failure_generator`` and
    ``recovery_generator`` (either may be a ``Trace``).  Unless
    ``fast_forward`` is set, each failure is classified as type 3 (with
    probability ``Machine.fail_fraction``; every child then fails right
    after the machine and the machine recovers right after the last child),
    type 1 (outage of at most ``fail_timeout``) or type 2 (longer outage,
    optionally with an ``EagerRecoveryStart`` event).  Children generate
    their own events for each stretch between the last recovery and the
    next failure (or the end of the window).

    Args:
        result_events: Event sink; only its ``addEvent`` method is used here
            and it is passed on to the children.
        start_time: Start of the window.
        end_time: End of the window.  Drawn recoveries are capped at
            ``end_time - 1E-5``.
        reset: When true, the failure generator is reset to the current time
            before every draw.  Children are always called with ``True``.
    """
    if self.failure_generator is None:
        # The machine never fails by itself: the children cover the window.
        for child in self.children:
            child.generateEvents(result_events, start_time, end_time, True)
        return

    # Trace generators are pointed at this machine's id.
    if isinstance(self.failure_generator, Trace):
        self.failure_generator.setCurrentMachine(self.my_id)
    if isinstance(self.recovery_generator, Trace):
        self.recovery_generator.setCurrentMachine(self.my_id)

    latest_recovery = end_time - (1E-5)
    clock = start_time
    last_recover_time = start_time
    while True:
        if reset:
            self.failure_generator.reset(clock)

        if isinstance(self.failure_generator, Trace):
            # The next trace event requested is an event start.
            self.failure_generator.setCurrentEventType(True)

        failure_time = self.failure_generator.generateNextEvent(clock)
        if failure_time > end_time:
            for child in self.children:
                child.generateEvents(result_events, last_recover_time,
                                     end_time, True)
            return

        if isinstance(self.failure_generator, Trace):
            # The event start has been accepted.
            self.failure_generator.eventAccepted()

        for child in self.children:
            child.generateEvents(result_events, last_recover_time,
                                 failure_time, True)

        if isinstance(self.recovery_generator, Trace):
            self.recovery_generator.setCurrentEventType(False)
        self.recovery_generator.reset(failure_time)
        recovery_time = self.recovery_generator.generateNextEvent(
            failure_time)
        assert (recovery_time > failure_time)
        recovery_time = min(recovery_time, latest_recovery)

        # Drawn even in fast-forward mode, as in the original statement
        # order, so the random stream is consumed identically.
        roll = random()
        if not self.fast_forward:
            # Failure types: tempAndShort=1, tempAndLong=2, permanent=3.
            if roll < Machine.fail_fraction:
                failure_type = 3
                # Each child fails just after the machine does.
                disk_fail_time = failure_time + 1E-5
                last_disk_recovery = recovery_time
                for disk in self.children:
                    disk_fail_event = Event(Event.EventType.Failure,
                                            disk_fail_time, disk)
                    result_events.addEvent(disk_fail_event)
                    disk_recovery_time = disk.generateRecoveryEvent(
                        result_events, disk_fail_time, latest_recovery)
                    disk_fail_event.next_recovery_time = disk_recovery_time
                    if disk_recovery_time > last_disk_recovery:
                        last_disk_recovery = disk_recovery_time
                # Machine recovery must coincide with the last disk
                # recovery.
                recovery_time = last_disk_recovery + (1E-5)
            elif recovery_time - failure_time <= self.fail_timeout:
                # Transient failure, machine is back very soon.
                failure_type = 1
            else:
                # Transient failure that lasts long.
                failure_type = 2
                if self.eager_recovery_enabled:
                    eager_start_event = Event(
                        Event.EventType.EagerRecoveryStart,
                        failure_time + self.fail_timeout, self)
                    eager_start_event.next_recovery_time = recovery_time
                    result_events.addEvent(eager_start_event)
                    # Ensure machine recovery happens after the last eager
                    # recovery installment.
                    recovery_time += 1E-5

        if isinstance(self.failure_generator, Trace):
            self.failure_generator.eventAccepted()

        if self.fast_forward:
            result_events.addEvent(
                Event(Event.EventType.Failure, failure_time, self, True))
            result_events.addEvent(
                Event(Event.EventType.Recovered, recovery_time, self, True))
        else:
            result_events.addEvent(
                Event(Event.EventType.Failure, failure_time, self,
                      failure_type))
            result_events.addEvent(
                Event(Event.EventType.Recovered, recovery_time, self,
                      failure_type))

        clock = last_recover_time = recovery_time
        if clock >= latest_recovery:
            return