def __init__(self, name, parent, parameters):
    """Initialize a Machine component.

    Assigns a unique id from the class-level counter, chains to the base
    component constructor, and reads failure/recovery tuning values from
    *parameters*.

    Args:
        name: component name, passed through to the base class.
        parent: enclosing component (e.g. a rack), passed to the base class.
        parameters: dict-like configuration source; keys read here are
            "fail_fraction", "fail_timeout", "fast_forward" and
            "eager_recovery_enabled".
    """
    self.my_id = Machine.id_counter
    Machine.id_counter += 1
    super(Machine, self).__init__(name, parent, parameters)

    # Recovery generator for permanent machine failure; created lazily.
    self.recovery_generator2 = None

    # FIX: the original assigned self.fail_timeout = -1 and immediately
    # tested "if self.fail_timeout == -1:", a guard that was always true.
    # The dead conditional has been removed; behaviour is unchanged.
    #
    # Fraction of machine failures that are permanent.  Stored on the
    # class (as in the original), so every instance overwrites the shared
    # value with the last parameters it saw.
    Machine.fail_fraction = float(parameters.get("fail_fraction", 0.008))
    # Amount of time after which a machine failure is treated as permanent,
    # and eager disk recovery is begun, if eager_recovery_enabled is True.
    self.fail_timeout = float(parameters.get("fail_timeout", 0.25))

    # If True, machine failure and recovery durations will be generated
    # but ignored.
    self.fast_forward = bool(parameters.get("fast_forward"))
    self.eager_recovery_enabled = bool(
        parameters.get("eager_recovery_enabled"))
    # History of this machine's failure durations.
    self.fail_durations = []

    conf = Configuration()
    self.machine_repair_time = conf.node_repair_time
def run(self):
    """Run one simulation iteration and return the handler's result.

    Builds the distributer from the configuration, schedules optional
    upgrade / correlated-failure / scaling events, generates component
    failure events over the whole simulated time, then drains the event
    queue through the event handler.
    """
    conf = Configuration(self.conf_path)
    xml = XMLParser(conf)
    # Hierarchical vs. flat data placement is chosen by the config.
    if conf.hier:
        self.distributer = HierSSSDistribute(xml)
    else:
        self.distributer = SSSDistribute(xml)
    self.conf = self.distributer.returnConf()
    # Class, not instance: instantiated per-iteration further down.
    self.event_handler = EventHandler
    self.distributer.start()
    events_handled = 0
    events = EventQueue()

    if self.conf.system_upgrade:
        for info in self.conf.system_upgrade_infos:
            # info[0] == 1 appears to select one upgrade style;
            # other styles are ignored here -- confirm against config docs.
            if info[0] == 1:
                upgrade_start_times = self.addSystemUpgrade(
                    info, self.conf.total_time)
                # Last element, when present, drives extra check events.
                if info[-1] is not None:
                    self.addUpgradeCheckEvents(
                        events, upgrade_start_times, info[-1])

    if self.conf.correlated_failures:
        for info in self.conf.correlated_failures_infos:
            # Replicate each correlated-failure burst once per year for
            # 10 years: 8760 = hours in a year, so info[0] is presumably
            # a start time in hours -- TODO confirm units.
            for i in xrange(10):
                cf_info = deepcopy(list(info))
                cf_info[0] += i * 8760
                print "correlated_failures info:", cf_info
                self.addCorrelatedFailures(cf_info)

    if self.conf.system_scaling:
        for info in self.conf.system_scaling_infos:
            self.addSystemScaling(info)

    info_logger.info("disk usage is: " +
                     str(self.distributer.diskUsage()*100) + "%\n")
    self.distributer.getRoot().printAll()
    root = self.distributer.getRoot()
    # Generate failure/recovery events for the full simulated horizon.
    root.generateEvents(events, 0, self.conf.total_time, True)

    # Full-system check events at configured timestamps (priority 6).
    for ts in self.conf.upgrade_ts:
        full_system_check_event = Event(
            Event.EventType.UpgradeCheck, ts, root, 6)
        events.addEvent(full_system_check_event)

    # NOTE(review): "!= None" should idiomatically be "is not None".
    if self.conf.event_file != None:
        # Dump the generated event list, tagged with the run timestamp.
        events_file = self.conf.event_file + '-' + self.ts
        events.printAll(events_file,
                        "Iteration number: "+str(self.iteration_times))
    self.iteration_times += 1

    handler = self.event_handler(self.distributer)
    print "total slices:", handler.total_slices

    # Drain the queue: handle, fetch next, count -- exactly one increment
    # per handled event.
    e = events.removeFirst()
    while e is not None:
        handler.handleEvent(e, events)
        e = events.removeFirst()
        events_handled += 1
    self.total_events_handled += events_handled

    result = handler.end()
    info_logger.info(result.toString())
    return result
def __init__(self, durations):
    """Set up duration analysis state.

    Args:
        durations: queue-like collection of failure durations to analyse.

    Derives the fault tolerance ``self.ft`` from the configured data
    redundancy scheme: for MDS codes it is n - k; for the supported LRC
    variant (l = 2) one fewer, since more than n - k - 1 failures must
    be considered.
    """
    self.durations = durations
    self.conf = Configuration()
    self.drs_handler = self.conf.getDRSHandler()
    self.isMDS = self.drs_handler.isMDS
    tolerance = self.drs_handler.n - self.drs_handler.k
    # LRC (l = 2) tolerates one failure fewer than an MDS code would.
    self.ft = tolerance if self.isMDS else tolerance - 1
    # Counters accumulated by later analysis passes.
    self.concurrent_count = 0
    self.lost_concurrent_count = 0
    self.total_failure_slice_count = 0
def __init__(self, name, parent, parameters):
    """Create a Disk component below *parent*.

    Capacity and repair-time settings are pulled from the global
    configuration; error/scrub generators start out unset and are
    created elsewhere on demand.
    """
    super(Disk, self).__init__(name, parent, parameters)
    config = Configuration()
    # Sizing and repair parameters from the global configuration.
    self.disk_capacity = config.disk_capacity
    self.disk_repair_time = config.disk_repair_time
    self.chunk_repair_time = config.chunk_repair_time
    # Generators for latent sector errors and scrubbing; lazily created.
    self.latent_error_generator = None
    self.scrub_generator = None
    # Slices on this disk that a latent sector error has hit so far.
    self.slices_hit_by_LSE = []
def __init__(self, conf_path):
    """Prepare a simulation run from the configuration at *conf_path*.

    Records a run timestamp, loads the configuration, and constructs
    the appropriate (flat or hierarchical) distributer.
    """
    self.iteration_times = 1
    # Timestamp tag used to name per-run output files.
    self.ts = strftime("%Y%m%d.%H.%M.%S")
    self.total_events_handled = 0
    self.conf = Configuration(conf_path)
    xml = XMLParser(self.conf)
    # Pick the distributer class according to the hierarchy flag.
    distributer_cls = HierSSSDistribute if self.conf.hier else SSSDistribute
    self.distributer = distributer_cls(xml)
def run(self):
    """Run one simulation iteration and return the handler's result.

    Builds the distributer chosen by the data-placement config, selects
    the (RAFI or plain) event handler, generates failure events for the
    whole simulated time, and drains the event queue.
    """
    conf = Configuration(self.conf_path)
    xml = XMLParser(conf)
    # Distributer class is resolved from placement + hierarchy settings.
    distributer_class = returnDistributer(conf.data_placement,
                                          conf.hierarchical)
    self.distributer = distributer_class(xml)
    self.conf = self.distributer.returnConf()
    # RAFI recovery uses its own handler class; note these are classes,
    # instantiated per-iteration below.
    if self.conf.rafi_recovery:
        self.event_handler = RAFIEventHandler
    else:
        self.event_handler = EventHandler
    self.distributer.start()
    # self.distributer.printGroupsToFile()
    info_logger.info("disk usage is: " +
                     str(self.distributer.diskUsage() * 100) + "%\n")
    self.distributer.getRoot().printAll()

    events_handled = 0
    events = EventQueue()
    root = self.distributer.getRoot()
    # Generate failure/recovery events over the full simulated horizon.
    root.generateEvents(events, 0, self.conf.total_time, True)

    # if False:
    # NOTE(review): "!= None" should idiomatically be "is not None".
    if self.conf.event_file != None:
        # Dump the generated event list, tagged with the run timestamp.
        events_file = self.conf.event_file + '-' + self.ts
        events.printAll(events_file,
                        "Iteration number: " + str(self.iteration_times))
    self.iteration_times += 1

    handler = self.event_handler(self.distributer)
    print "total slices:", handler.total_slices

    # Drain the queue: handle, fetch next, count -- exactly one increment
    # per handled event.
    e = events.removeFirst()
    while e is not None:
        handler.handleEvent(e, events)
        e = events.removeFirst()
        events_handled += 1
    self.total_events_handled += events_handled

    result = handler.end()
    info_logger.info(result.toString())
    return result
def returnLayerArch(layer):
    """Build and return the full system topology tree.

    The tree has the shape SYS -> DC* -> R* -> M* -> H*, with fan-outs
    taken from the global configuration (datacenters, racks per DC,
    machines per rack, disks per machine).

    NOTE(review): the *layer* argument is never used; it is kept only
    for signature compatibility with existing callers.
    """
    config = Configuration()
    tree = Tree("SYS")
    for dc in xrange(config.datacenters):
        dc_node = tree.addChild("DC" + str(dc))
        for rack in xrange(config.racks):
            rack_node = dc_node.addChild("R" + str(rack))
            for machine in xrange(config.machines_per_rack):
                machine_node = rack_node.addChild("M" + str(machine))
                # SSD media are not modelled; every leaf is an HDD ("H").
                for disk in xrange(config.disks_per_machine):
                    machine_node.addChild("H" + str(disk))
    return tree
    CopySet(Cidon et al. 2013)
    """
    pass


# NOTE(review): the class header for the docstring fragment above lies
# outside this chunk; only its tail is visible here.
class RandomDistributeSameRack(RandomDistribute):
    """
    HDFS: three replicas, two of them on different machines of same rack,
          the third one on a different rack
    QFS: n blocks, several of them on same rack
    """


class RandomDistributeDRC(RandomDistribute):
    """
    DRC: Double Regenerating Codes.(Hu et al. 2017)
    """
    pass


if __name__ == "__main__":
    # Smoke-test driver: build an SSS distribution, print/dump it,
    # apply one scaling step, then print/dump again for comparison.
    conf = Configuration()
    xml = XMLParser(conf)
    sss = RandomDistributeSSS(xml)
    sss.start()
    sss.printTest()
    sss.printToFile()
    # Scaling arguments are positional; semantics not visible from this
    # chunk -- confirm against systemScaling's definition.
    sss.systemScaling(1000, 0.1, 20000, 3, 9000, True)
    sss.printTest()
    sss.printToFile()
def returnLayers():
    """Return a Layer object for every configured tier (1..tier_num)."""
    conf = Configuration()
    # Tiers are 1-indexed in the configuration.
    return [Layer(tier) for tier in xrange(1, conf.tier_num + 1)]
def __init__(self, name, parent, parameters):
    """Create a Disk component below *parent*.

    Capacity and repair-time settings come from the global configuration.
    """
    super(Disk, self).__init__(name, parent, parameters)
    config = Configuration()
    # Sizing and repair parameters from the global configuration.
    self.disk_capacity = config.disk_capacity
    self.disk_repair_time = config.disk_repair_time
    # Slices on this disk that a latent sector error has hit so far.
    self.slices_hit_by_LSE = []
    # NOTE(review): the enclosing def for this tail lies outside this
    # chunk; it ends by returning the generated copy sets.
    # Per-rack/per-machine membership counts across all copy sets.
    machines_set = [[0 for i in xrange(len(machines[0]))]
                    for j in xrange(len(machines))]
    for i, copy_set in enumerate(copy_sets):
        # NOTE(review): format_output is built but never printed or
        # returned -- dead except as a debugging leftover.
        format_output = "copy set " + str(i) + ": "
        for machine in copy_set:
            format_output += " " + machine.toString()
            # Locate the machine's rack/slot and bump its usage count.
            for j, rack_machines in enumerate(machines):
                if machine in rack_machines:
                    machine_index = rack_machines.index(machine)
                    machines_set[j][machine_index] += 1
    # NOTE(review): machines_set is likewise computed but unused here.
    return copy_sets


if __name__ == "__main__":
    # Manual driver: build a copyset distribution from a fixed config
    # path and dump the slice placement to file.
    conf = Configuration("/root/CR-SIM/conf/cr-sim.conf")
    xml = XMLParser(conf)
    distribute = COPYSETDistribute(xml)
    machines = distribute.getAllMachines()
    distribute.distributeSlices(distribute.getRoot(),
                                distribute.conf.total_slices)
    distribute.printToFile()
    # Disabled alternative driver kept as a string literal in the
    # original; preserved verbatim.
    """
    total_slices = 419431
    distribute.distributeSlices(distribute.getRoot(), total_slices)
    for i in xrange(total_slices):
        format_output = "slice index " + str(i) + ": "
        for disk in distribute.slice_locations[i]:
            format_output += " " + disk.toString()
        print format_output"""
def __init__(self, layer_id):
    """Load the XML description for layer *layer_id*.

    Parses CONF_PATH/layer_<id>.xml and keeps both the parsed tree and
    its root element, plus the global configuration.
    """
    # Layer files live next to the configuration, one per layer id.
    layer_file = "layer_" + str(layer_id) + ".xml"
    layer_path = CONF_PATH + os.sep + layer_file
    self.tree = ET.parse(layer_path)
    self.root = self.tree.getroot()
    self.conf = Configuration()
class HandleDuration(object):
    """Analyse failure durations for concurrent (overlapping) failures.

    Given a queue of failure durations, finds windows in which more
    units are simultaneously failed than the redundancy scheme tolerates
    (``self.ft``), and estimates how many slices are lost in each window.
    """

    def __init__(self, durations):
        """*durations* is a queue-like collection of failure durations."""
        self.durations = durations
        self.conf = Configuration()
        self.drs_handler = self.conf.getDRSHandler()
        self.isMDS = self.drs_handler.isMDS
        ft = self.drs_handler.n - self.drs_handler.k
        if self.isMDS:
            self.ft = ft
        else:
            # We only consider LRC with l = 2, so we
            # need to consider failures more than n-k-1
            self.ft = ft - 1
        self.concurrent_count = 0
        self.lost_concurrent_count = 0
        self.total_failure_slice_count = 0

    def returnConcurrentCount(self):
        # Number of concurrent-failure windows found by findConcurrent().
        return self.concurrent_count

    def returnFailureSliceCount(self):
        # Total failed-slice count accumulated by process().
        return self.total_failure_slice_count

    def isHandleLost(self):
        # NOTE(review): self.handle_only_lost is never set in __init__;
        # unless a subclass or caller assigns it, this raises
        # AttributeError -- confirm intended usage.
        return self.handle_only_lost

    # Return concurrent durations
    # format: {(start time, end time):[list of units], ...}
    def findConcurrent(self):
        """Sweep durations in start-time order and collect windows where
        more than self.ft units are failed at once.

        Returns (lost_concurrent_durations, concurrent_durations), both
        mapping (start, end) periods to the list of failed units.
        """
        # Active set: durations still in progress at the sweep time.
        tmp_durations = []
        concurrent_durations = {}
        lost_concurrent_durations = {}
        last_concurrent_period = None
        # Work on a clone so the caller's queue is untouched.
        durations = self.durations.clone()
        print "duration size:", durations.size()
        while durations.size() != 0:
            d = durations.removeFirst()
            current_time = d.getStartTime()
            # Expire durations that ended before the new one starts.
            # Iterating a reversed copy-view while removing from the
            # underlying list -- order-sensitive; left as written.
            for tmp_d in reversed(tmp_durations):
                if tmp_d.getEndTime() <= current_time:
                    tmp_durations.remove(tmp_d)
            tmp_durations.append(d)
            # Only windows exceeding the fault tolerance matter.
            if len(tmp_durations) <= self.ft:
                continue
            # Intersection of all active durations: latest start to
            # earliest end.
            concurrent_period = (max([
                tmp.getStartTime() for tmp in tmp_durations
            ]), min([tmp.getEndTime() for tmp in tmp_durations]))
            if last_concurrent_period is None:
                last_concurrent_period = concurrent_period
            else:
                # If the new window overlaps the previous one, split the
                # previous entry so periods do not double-count time.
                if concurrent_period[0] < last_concurrent_period[1]:
                    pop_units = concurrent_durations.pop(
                        last_concurrent_period)
                    concurrent_durations[(last_concurrent_period[0],
                                          concurrent_period[0])] = pop_units
                    if concurrent_period[1] < last_concurrent_period[1]:
                        concurrent_durations[(
                            concurrent_period[1],
                            last_concurrent_period[1])] = pop_units
            concurrent_units = [tmp.getUnit() for tmp in tmp_durations]
            concurrent_durations[concurrent_period] = concurrent_units
            # A window triggered by a Loss-type duration is tracked
            # separately.
            if d.getType() == Duration.DurationType.Loss:
                lost_concurrent_durations[concurrent_period] = concurrent_units
                self.lost_concurrent_count += 1
            last_concurrent_period = concurrent_period
            self.concurrent_count += 1
        print "lost concurrent count:", self.lost_concurrent_count
        print "concurrent count:", self.concurrent_count
        return lost_concurrent_durations, concurrent_durations

    def process(self, concurrent_durations, distributer):
        """Estimate slice failures for each concurrent window.

        Returns a 4-tuple: (number of windows with at least one failed
        slice, total failed-slice count, summed length of failing
        windows, slice-count-weighted summed window length).
        """
        total_failure_times = 0
        self.total_failure_slice_count = 0
        total_failure_period = 0.0
        # failure_period * failure_slice_count
        failure_period_with_weight = 0.0
        periods = concurrent_durations.keys()
        for period in periods:
            # key:slice_index, value:failure count in slice
            slice_failures = {}
            f_units = concurrent_durations[period]
            for u in f_units:
                if isinstance(u, Sector):
                    r = random()
                    # if no chunk is hited by sector error
                    if r > distributer.diskUsage():
                        continue
                    # A sector error hits one randomly chosen slice of
                    # the parent disk.
                    all_slices = u.parent.getChildren()
                    slice_index = choice(all_slices)
                    # pop-then-setdefault acts as an increment on the
                    # per-slice failure counter.
                    slice_failures.setdefault(
                        slice_index, slice_failures.pop(slice_index, 0) + 1)
                else:
                    # Expand the failed unit to its constituent disks.
                    disks = []
                    if isinstance(u, Rack):
                        distributer.getAllDisksInRack(u, disks)
                    elif isinstance(u, Machine):
                        disks += u.getChildren()
                    elif isinstance(u, Disk):
                        disks.append(u)
                    else:
                        raise Exception("Invalid unit")
                    for disk in disks:
                        slices = disk.getChildren()
                        for slice_index in slices:
                            slice_failures.setdefault(
                                slice_index,
                                slice_failures.pop(slice_index, 0) + 1)
            failure_slice_count = 0
            slice_failure_in_period_flag = False
            failure_nums = slice_failures.values()
            for num in failure_nums:
                # MDS: any slice with more than ft failures is lost.
                if self.isMDS and num > self.ft:
                    slice_failure_in_period_flag = True
                    failure_slice_count += 1
                # Non-MDS (LRC): exactly ft+1 failures are lost only with
                # probability drs_handler.threshold; more are always lost.
                if not self.isMDS and num > self.ft:
                    if num == self.ft + 1:
                        r2 = random()
                        if r2 < self.drs_handler.threshold:
                            slice_failure_in_period_flag = True
                            failure_slice_count += 1
                            print "random:%f, failures:%d,period:%f, threshold:%f" % (
                                r2, failure_slice_count,
                                period[1] - period[0],
                                self.drs_handler.threshold)
                    else:
                        slice_failure_in_period_flag = True
                        failure_slice_count += 1
            self.total_failure_slice_count += failure_slice_count
            failure_period_with_weight += failure_slice_count * \
                (period[1] - period[0])
            if slice_failure_in_period_flag:
                total_failure_times += 1
                total_failure_period += period[1] - period[0]
        return (total_failure_times, self.total_failure_slice_count,
                total_failure_period, failure_period_with_weight)

    # Get concurrent durations which contain $num$ failed units
    def getConcurrent(self, concurrent_durations, num):
        """Filter windows down to those with exactly *num* failed units."""
        res = {}
        periods = concurrent_durations.keys()
        for period in periods:
            if len(concurrent_durations[period]) == num:
                res[period] = concurrent_durations[period]
        return res

    def printAll(self, concurrent_durations):
        """Print each window (sorted by period) and its failed units."""
        i = 0
        periods = concurrent_durations.keys()
        periods.sort()
        for period in periods:
            format_string = str(period[0]) + " " + str(period[1]) + " "
            for u in concurrent_durations[period]:
                format_string += " " + u.toString()
            print format_string

    def printToFile(self, file_path, concurrent_durations):
        # Placeholder; not implemented.
        pass