def multiTest(self):
    """
    sanity check a multi-site RADOS simulation

    Builds a two-copy RADOS pool on top of a Disk, wraps it in a
    MultiSite together with a Site, recomputes the MultiSite model
    for one to four sites over a period of one YEAR, and hands each
    simulated value plus a hand-computed ballpark estimate to
    self.test (test IDs "5A" through "5V").

    Reads from self: fits, nre, size, full, pgs, recovery, objsize,
    s_fits, s_replace, s_recovery, s_latency.

    No value is returned.
    """

    # the ballpark estimates get pretty complicated in this one

    # one disk failing during a year
    D_fail = float(self.fits) * YEAR / 1000000000

    # time for RADOS to recover from one disk failure
    D_recover = float(self.full) * self.size / \
        (self.recovery * self.pgs * 3600)

    # one disk experiencing an NRE during recovery
    # (the factor of 8 is presumably bits per byte -- TODO confirm)
    D_nre = self.nre * self.full * self.size * 8

    # probability of losing a site
    S_fail = float(self.s_fits) * YEAR / 1000000000

    # size of, and time to recover, a placement group from another site
    # NOTE(review): PG_recover is computed but never read in this method
    PG_size = self.size * self.full / (2 * self.pgs)
    PG_recover = float(self.full) * self.size / \
        (self.s_recovery * self.pgs * 3600)

    # time to recover an object from another site
    # NOTE(review): O_recover is computed but never read in this method
    O_recover = float(self.objsize) / (self.s_recovery * 3600)

    # instantiate the simulations
    e_disk = Disk(size=self.size, fits=self.fits, nre=self.nre,
                  desc="test")
    rados = RADOS(e_disk, pg=self.pgs, speed=self.recovery,
                  nre_model="fail", fullness=self.full,
                  objsize=self.objsize, delay=0, stripe=1, copies=2)
    site = Site(fits=self.s_fits, rplc=self.s_replace)
    multi = MultiSite(rados, site, speed=self.s_recovery,
                      latency=0, sites=1)

    # sanity check and calibration single site P(sitefail)
    # when self.test returns a true value (presumably "check passed"),
    # the estimate is replaced by the simulated value so that later
    # expectations are built on the simulation's own number
    multi.sites = 1
    multi.compute(period=YEAR)
    v = multi.P_site
    exp = S_fail
    if self.test("5A", "P(1 site failure)", v, exp):
        S_fail = multi.P_site

    # sanity check and calibration single site P(nre)
    # NOTE(review): the recalibrated D_nre is not read again below
    v = multi.P_nre
    exp = 2 * D_fail * 2 * D_nre
    if self.test("5B", "Pnre(one site)", v, exp, slop=0.02):
        D_nre = multi.P_nre / (2 * 2 * D_fail)

    # single site L(sitefail)
    v = multi.L_site
    exp = site.size
    self.test("5C", "L(1 site failure)", v, exp)

    # ballpark probability of losing a site during site recovery
    S_fail2 = S_fail * self.s_replace / YEAR

    # two site P(sitefail), also used to calibrate S_fail2
    multi.sites = 2
    multi.compute(period=YEAR)
    v = multi.P_site
    exp = 2 * S_fail * S_fail2
    if self.test("5D", "P(2 site failure)", v, exp):
        S_fail2 = multi.P_site / (2 * S_fail)

    # two site L(sitefail)
    v = multi.L_site
    exp = site.size
    self.test("5E", "L(2 site failure)", v, exp)

    # three site P(sitefail)
    multi.sites = 3
    multi.compute(period=YEAR)
    v = multi.P_site
    exp = 3 * S_fail * 2 * S_fail2 * S_fail2
    self.test("5F", "P(3 site failure)", v, exp)

    # three site L(sitefail)
    v = multi.L_site
    exp = site.size
    self.test("5G", "L(3 site failure)", v, exp)

    # four site P(sitefail)
    multi.sites = 4
    multi.compute(period=YEAR)
    v = multi.P_site
    exp = 4 * S_fail * 3 * 2 * S_fail2 * S_fail2 * S_fail2
    self.test("5H", "P(4 site failure)", v, exp)

    # four site L(sitefail)
    v = multi.L_site
    exp = site.size
    self.test("5I", "L(4 site failure)", v, exp)

    # ballpark probability of losing both disks before recovering
    D_2fail = (2 * D_fail) * (self.pgs * (D_fail * D_recover / YEAR))

    # one site P(drivefail), also used to calibrate D_2fail
    multi.sites = 1
    multi.compute(period=YEAR)
    v = multi.P_drive
    exp = D_2fail
    if self.test("5J", "P(1 site drive failure)", v, exp, slop=0.01):
        D_2fail = multi.P_drive

    # single site durability including drive failure
    v = 1.0 - multi.dur     # complement for better precision
    exp = S_fail
    exp += D_2fail
    exp += 0                # Pnre is in the noise
    self.test("5K", "Durability(rados 1x2)", v, exp)

    # one site L(drivefail)
    v = multi.L_drive
    exp = PG_size
    self.test("5L", "L(1 site drive failure)", v, exp)

    # next site has a much shorter time in which to fail
    D_2more = 2 * ((D_fail * D_recover / YEAR) ** 2)

    # two site P(drivefail)
    multi.sites = 2
    multi.compute(period=YEAR)
    v = multi.P_drive
    exp = 0     # enumerating all the cases for clarity
    exp += (2 * D_2fail) * D_2more  # both sites lose all drives
    exp += (2 * S_fail) * D_2more   # lose one site, then all drives
    exp += (2 * D_2fail) * S_fail2  # lose all drives, then a site
    self.test("5M", "P(2 site drive failure)", v, exp)

    # dual site durability including drive failure
    # (the P(drivefail) cases above plus the both-sites-fail term)
    v = 1.0 - multi.dur     # complement for better precision
    exp = 2 * S_fail * S_fail2
    exp += (2 * D_2fail) * D_2more  # both sites lose all drives
    exp += (2 * S_fail) * D_2more   # lose one site, then all drives
    exp += (2 * D_2fail) * S_fail2  # lose all drives, then a site
    exp += 0                # Pnre is in the noise
    self.test("5N", "Durability(rados 2x2)", v, exp)

    # two site L(drivefail)
    v = multi.L_drive
    exp = PG_size
    self.test("5O", "L(2 site drive failure)", v, exp)

    # three site P(drivefail)
    # in the case labels below, "c" marks a drive-loss factor
    # (D_2fail / D_2more) and "s" a site-loss factor (S_fail / S_fail2);
    # the all-site case (s, s, s) is not part of this sum
    multi.sites = 3
    multi.compute(period=YEAR)
    v = multi.P_drive
    exp = 0     # enumerating all the cases for clarity
    exp += (3 * D_2fail) * (2 * D_2more) * D_2more  # c, c, c
    exp += (3 * D_2fail) * (2 * D_2more) * S_fail2  # c, c, s
    exp += (3 * D_2fail) * (2 * S_fail2) * D_2more  # c, s, c
    exp += (3 * D_2fail) * (2 * S_fail2) * S_fail2  # c, s, s
    exp += (3 * S_fail) * (2 * D_2more) * D_2more   # s, c, c
    exp += (3 * S_fail) * (2 * D_2more) * S_fail2   # s, c, s
    exp += (3 * S_fail) * (2 * S_fail2) * D_2more   # s, s, c
    self.test("5P", "P(3 site drive failure)", v, exp)

    # triple site durability including drive failure
    # (starts from the s, s, s term, then adds the seven cases above)
    v = 1.0 - multi.dur     # complement for better precision
    exp = 3 * S_fail * 2 * S_fail2 * S_fail2
    exp += (3 * D_2fail) * (2 * D_2more) * D_2more  # c, c, c
    exp += (3 * D_2fail) * (2 * D_2more) * S_fail2  # c, c, s
    exp += (3 * D_2fail) * (2 * S_fail2) * D_2more  # c, s, c
    exp += (3 * D_2fail) * (2 * S_fail2) * S_fail2  # c, s, s
    exp += (3 * S_fail) * (2 * D_2more) * D_2more   # s, c, c
    exp += (3 * S_fail) * (2 * D_2more) * S_fail2   # s, c, s
    exp += (3 * S_fail) * (2 * S_fail2) * D_2more   # s, s, c
    exp += 0                # Pnre is in the noise
    self.test("5Q", "Durability(rados 3x2)", v, exp)

    # three site L(drivefail)
    v = multi.L_drive
    exp = PG_size
    self.test("5R", "L(3 site drive failure)", v, exp)

    # P(sitefail) vs site replacement time
    # site.replace is doubled only for this compute, then restored
    multi.sites = 2
    site.replace *= 2
    multi.compute(period=YEAR)
    site.replace /= 2
    v = multi.P_site
    exp = 2 * (2 * S_fail * S_fail2)
    self.test("5S", "2*replacement ->2*P(sitefail) ", v, exp)

    # P(rep)
    # multi.sites is still 2 here and the latency stays set for the
    # remaining checks
    multi.latency = self.s_latency
    multi.compute(period=YEAR)
    exp = S_fail * multi.sites
    v = multi.P_rep
    self.test("5T", "PL(repfail)", v, exp)

    # L(rep) vs latency
    exp = self.s_latency * self.s_recovery / (2 * SECOND)
    v = multi.L_rep
    self.test("5U", "L(repfail) vs latency", v, exp)

    # L(rep) vs remote recovery rate
    # multi.speed is doubled only for this compute, then restored
    # NOTE(review): the label says "L(repfail)/2" but the expectation
    # is twice the 5U value -- confirm which one is intended
    multi.speed *= 2
    multi.compute(period=YEAR)
    multi.speed /= 2
    exp = 2 * self.s_latency * self.s_recovery / (2 * SECOND)
    v = multi.L_rep
    self.test("5V", "2*speed -> L(repfail)/2", v, exp)
def defaultTests(cfg):
    """
    run a standard set of interesting simulations

    The list handed to Run is, in order: the bare disk, the four
    RAID models, single-site RADOS with 1, 2 and 3 copies, a None
    entry, the site model, and then a MultiSite model for every
    combination of 1-4 sites and 1-3 copies.

        cfg -- default configuration values
    """

    disk = Disk(size=cfg.disk_size,
                fits=cfg.disk_fit,
                nre=cfg.disk_nre,
                desc="Disk: %s" % (cfg.disk_type))

    # every RAID flavor shares these settings; only the class and
    # the number of volumes differ
    raid_opts = dict(nre_model=cfg.nre_model,
                     recovery=cfg.raid_recover,
                     delay=cfg.raid_replace,
                     objsize=cfg.obj_size)
    raid0 = RAID0(disk, volumes=2, **raid_opts)
    raid1 = RAID1(disk, volumes=2, **raid_opts)
    raid5 = RAID5(disk, volumes=4, **raid_opts)
    raid6 = RAID6(disk, volumes=8, **raid_opts)

    def new_rados(copies):
        """ build a RADOS model with the requested number of copies """
        return RADOS(disk,
                     pg=cfg.rados_decluster,
                     copies=copies,
                     speed=cfg.rados_recover,
                     fullness=cfg.rados_fullness,
                     objsize=cfg.obj_size,
                     stripe=cfg.stripe_length,
                     nre_model=cfg.nre_model,
                     delay=cfg.rados_markout)

    # note: RAID-5 is listed ahead of RAID-1
    tests = [disk, raid0, raid5, raid1, raid6]

    # single site RADOS
    tests.extend(new_rados(copies) for copies in (1, 2, 3))

    # multi-site RADOS
    # (the None entry is passed through to Run as-is; presumably a
    # separator between the two groups -- TODO confirm)
    tests.append(None)
    site = Site(fits=cfg.majeure, rplc=cfg.site_recover)
    tests.append(site)
    tests.extend(
        MultiSite(new_rados(copies), site,
                  speed=cfg.remote_recover,
                  latency=cfg.remote_latency,
                  sites=num_sites)
        for num_sites in (1, 2, 3, 4)
        for copies in (1, 2, 3)
    )

    # and run them all
    Run(tests, period=cfg.period, verbosity=cfg.verbose)
def siteTest(self):
    """
    sanity check a site reliability simulation

    Creates a Site from self.s_fits and self.s_size, computes it
    for several periods/multipliers, and passes each simulated
    value with its ballpark expectation to self.test
    (test IDs "4A" through "4G", plus "4Z").
    """

    billion = 1000000000
    check = self.test

    # ball park estimates
    p_fail = float(self.s_fits) * YEAR / billion
    mean_ttf = billion / self.s_fits

    sim = Site(fits=self.s_fits, size=self.s_size)

    # basic site FIT rates
    sim.compute(period=YEAR)
    if check("4A", "1Y site fail rate", sim.P_site, p_fail):
        # from here on, work with the simulation's more accurate value
        p_fail = sim.P_site

    # single year durability (complement for better precision)
    check("4B", "1Y Durability(site)", 1.0 - sim.dur, p_fail)

    # single year availability w/o repair
    check("4C", "1Y availability", sim.availability(), 1.0 - p_fail)

    # double the period, double the failure probability
    sim.compute(period=2 * YEAR)
    check("4D", "2Y fail rate", sim.P_site, 2 * p_fail)

    # two year durability (complement for better precision)
    check("4E", "2Y Durability(site)", 1.0 - sim.dur, 2 * p_fail)

    # effects of multiples on the failure probability
    sim.compute(period=YEAR, mult=3)
    check("4F", "3x 1Y fail rate", sim.P_site, p_fail * 3, slop=0.01)

    # site recovery rates and availability: with mttr = mttf the
    # expected availability is one half
    sim.replace = mean_ttf
    check("4G", "long term availability", sim.availability(), 0.5)

    # site raw size
    check("4Z", "per petabyte", sim.rawsize, self.s_size)
def oneTest(cfg, which):
    """
    run a single simulation (call-back from the GUI)

        cfg -- configuration values to use
        which -- type of simulation to be run; the values handled
                 here are "disk", "raid", "rados" and "multi"
                 (any other value runs nothing)

    Raises ValueError if which is "raid" and cfg.raid_type is not
    one of "RAID-0", "RAID-1", "RAID-5" or "RAID-6".
    """

    # everybody needs a disk simulation
    disk = Disk(size=cfg.disk_size,
                fits=cfg.disk_fit, fits2=cfg.disk_fit2,
                nre=cfg.disk_nre,
                desc="Disk: %s" % (cfg.disk_type))

    if which == "disk":
        Run([disk], period=cfg.period, verbosity=cfg.verbose)
        return

    if which == "raid":
        # all RAID models take the same arguments, so pick the class
        # by name and fail loudly on a name we do not know (an
        # unrecognized type used to surface as an UnboundLocalError
        # on the Run call below)
        raid_models = {
            "RAID-0": RAID0,
            "RAID-1": RAID1,
            "RAID-5": RAID5,
            "RAID-6": RAID6,
        }
        raid_model = raid_models.get(cfg.raid_type)
        if raid_model is None:
            raise ValueError("unsupported RAID type: %r (expected one of %s)"
                             % (cfg.raid_type,
                                ", ".join(sorted(raid_models))))
        raid = raid_model(disk, volumes=cfg.raid_vols,
                          nre_model=cfg.nre_model,
                          recovery=cfg.raid_recover,
                          delay=cfg.raid_replace,
                          objsize=cfg.obj_size)
        Run([raid], period=cfg.period, verbosity=cfg.verbose)
        return

    # both remaining simulations are built on a RADOS model
    rados = RADOS(disk, pg=cfg.rados_decluster,
                  copies=cfg.rados_copies,
                  speed=cfg.rados_recover,
                  fullness=cfg.rados_fullness,
                  objsize=cfg.obj_size,
                  stripe=cfg.stripe_length,
                  nre_model=cfg.nre_model,
                  delay=cfg.rados_markout)

    if which == "rados":
        Run([rados], period=cfg.period, verbosity=cfg.verbose)
        return

    if which == "multi":
        site = Site(fits=cfg.majeure, rplc=cfg.site_recover)
        multi = MultiSite(rados, site,
                          speed=cfg.remote_recover,
                          latency=cfg.remote_latency,
                          sites=cfg.remote_sites)
        Run([multi], period=cfg.period, verbosity=cfg.verbose)
        return