Пример #1
0
    def multiTest(self):
        """
        Sanity check a multi-site RADOS simulation.

        Builds Disk, RADOS (two copies, stripe of one, no delay), Site and
        MultiSite simulations from the parameters stored on this object,
        then compares values computed by MultiSite for one to four sites
        against hand-computed ballpark estimates.  Every comparison is
        reported through self.test() under the IDs "5A" through "5V".

        Several checks double as calibration steps: when self.test()
        returns a true value, the ballpark estimate is replaced by (or
        re-derived from) the simulated value so that later expectations
        build on it.  The checks are therefore order dependent.
        """

        # the ballpark estimates get pretty complicated in this one

        # one disk failing during a year
        D_fail = float(self.fits) * YEAR / 1000000000
        # time for RADOS to recover from one disk failure
        D_recover = float(self.full) * self.size / \
                            (self.recovery * self.pgs * 3600)
        # one disk experiencing an NRE during recovery
        D_nre = self.nre * self.full * self.size * 8
        # probability of losing a site
        S_fail = float(self.s_fits) * YEAR / 1000000000

        # size of the placement group data used as the expected loss
        # in the L(drive failure) checks (5L, 5O, 5R)
        PG_size = self.size * self.full / (2 * self.pgs)
        # time to recover a placement group from another site
        # NOTE(review): PG_recover is not referenced again in this method
        PG_recover = float(self.full) * self.size / \
                            (self.s_recovery * self.pgs * 3600)
        # time to recover an object from another site
        # NOTE(review): O_recover is not referenced again in this method
        O_recover = float(self.objsize) / (self.s_recovery * 3600)

        # instantiate the simulations
        e_disk = Disk(size=self.size, fits=self.fits, nre=self.nre,
                    desc="test")
        rados = RADOS(e_disk, pg=self.pgs, speed=self.recovery,
                        nre_model="fail", fullness=self.full,
                        objsize=self.objsize, delay=0, stripe=1, copies=2)
        site = Site(fits=self.s_fits, rplc=self.s_replace)
        multi = MultiSite(rados, site, speed=self.s_recovery,
                        latency=0, sites=1)

        # sanity check and calibration single site P(sitefail)
        multi.sites = 1
        multi.compute(period=YEAR)
        v = multi.P_site
        exp = S_fail
        if self.test("5A", "P(1 site failure)", v, exp):
            # calibrate: continue with the simulated value
            S_fail = multi.P_site

        # sanity check and calibration single site P(nre)
        v = multi.P_nre
        exp = 2 * D_fail * 2 * D_nre
        if self.test("5B", "Pnre(one site)", v, exp, slop=0.02):
            # calibrate: back D_nre out of the simulated value
            D_nre = multi.P_nre / (2 * 2 * D_fail)

        # single site L(sitefail)
        v = multi.L_site
        exp = site.size
        self.test("5C", "L(1 site failure)", v, exp)

        # ballpark probability of losing a site during site recovery
        S_fail2 = S_fail * self.s_replace / YEAR

        # two site P(sitefail)
        multi.sites = 2
        multi.compute(period=YEAR)
        v = multi.P_site
        exp = 2 * S_fail * S_fail2
        if self.test("5D", "P(2 site failure)", v, exp):
            # calibrate: back S_fail2 out of the simulated value
            S_fail2 = multi.P_site / (2 * S_fail)

        # two site L(sitefail)
        v = multi.L_site
        exp = site.size
        self.test("5E", "L(2 site failure)", v, exp)

        # three site P(sitefail)
        multi.sites = 3
        multi.compute(period=YEAR)
        v = multi.P_site
        exp = 3 * S_fail * 2 * S_fail2 * S_fail2
        self.test("5F", "P(3 site failure)", v, exp)

        # three site L(sitefail)
        v = multi.L_site
        exp = site.size
        self.test("5G", "L(3 site failure)", v, exp)

        # four site P(sitefail)
        multi.sites = 4
        multi.compute(period=YEAR)
        v = multi.P_site
        exp = 4 * S_fail * 3 * 2 * S_fail2 * S_fail2 * S_fail2
        self.test("5H", "P(4 site failure)", v, exp)

        # four site L(sitefail)
        v = multi.L_site
        exp = site.size
        self.test("5I", "L(4 site failure)", v, exp)

        # ballpark probability of losing both disks before recovering
        D_2fail = (2 * D_fail) * (self.pgs * (D_fail * D_recover / YEAR))

        # one site P(drivefail)
        multi.sites = 1
        multi.compute(period=YEAR)
        v = multi.P_drive
        exp = D_2fail
        if self.test("5J", "P(1 site drive failure)", v, exp, slop=0.01):
            # calibrate: continue with the simulated value
            D_2fail = multi.P_drive

        # single site durability including drive failure
        v = 1.0 - multi.dur     # complement for better precision
        exp = S_fail
        exp += D_2fail
        exp += 0                # Pnre is in the noise
        self.test("5K", "Durability(rados 1x2)", v, exp)

        # one site L(drivefail)
        v = multi.L_drive
        exp = PG_size
        self.test("5L", "L(1 site drive failure)", v, exp)

        # next site has a much shorter time in which to fail
        D_2more = 2 * ((D_fail * D_recover / YEAR) ** 2)

        # two site P(drivefail)
        multi.sites = 2
        multi.compute(period=YEAR)
        v = multi.P_drive
        exp = 0     # enumerating all the cases for clarity
        exp += (2 * D_2fail) * D_2more  # both sites lose all drives
        exp += (2 * S_fail) * D_2more   # lose one site, then all drives
        exp += (2 * D_2fail) * S_fail2  # lose all drives, then a site
        self.test("5M", "P(2 site drive failure)", v, exp)

        # dual site durability including drive failure
        # (the 5D site-loss term plus the three 5M drive-loss terms)
        v = 1.0 - multi.dur     # complement for better precision
        exp = 2 * S_fail * S_fail2
        exp += (2 * D_2fail) * D_2more  # both sites lose all drives
        exp += (2 * S_fail) * D_2more   # lose one site, then all drives
        exp += (2 * D_2fail) * S_fail2  # lose all drives, then a site
        exp += 0                        # Pnre is in the noise
        self.test("5N", "Durability(rados 2x2)", v, exp)

        # two site L(drivefail)
        v = multi.L_drive
        exp = PG_size
        self.test("5O", "L(2 site drive failure)", v, exp)

        # three site P(drivefail)
        # (in the case labels below, "c" marks a D_2fail/D_2more factor
        # and "s" marks an S_fail/S_fail2 factor; the all-"s" case is the
        # pure site-failure term checked in 5F)
        multi.sites = 3
        multi.compute(period=YEAR)
        v = multi.P_drive
        exp = 0     # enumerating all the cases for clarity
        exp += (3 * D_2fail) * (2 * D_2more) * D_2more  # c, c, c
        exp += (3 * D_2fail) * (2 * D_2more) * S_fail2  # c, c, s
        exp += (3 * D_2fail) * (2 * S_fail2) * D_2more  # c, s, c
        exp += (3 * D_2fail) * (2 * S_fail2) * S_fail2  # c, s, s
        exp += (3 * S_fail) * (2 * D_2more) * D_2more   # s, c, c
        exp += (3 * S_fail) * (2 * D_2more) * S_fail2   # s, c, s
        exp += (3 * S_fail) * (2 * S_fail2) * D_2more   # s, s, c
        self.test("5P", "P(3 site drive failure)", v, exp)

        # triple site durability including drive failure
        # (the 5F site-loss term plus the seven 5P drive-loss terms)
        v = 1.0 - multi.dur     # complement for better precision
        exp = 3 * S_fail * 2 * S_fail2 * S_fail2
        exp += (3 * D_2fail) * (2 * D_2more) * D_2more  # c, c, c
        exp += (3 * D_2fail) * (2 * D_2more) * S_fail2  # c, c, s
        exp += (3 * D_2fail) * (2 * S_fail2) * D_2more  # c, s, c
        exp += (3 * D_2fail) * (2 * S_fail2) * S_fail2  # c, s, s
        exp += (3 * S_fail) * (2 * D_2more) * D_2more   # s, c, c
        exp += (3 * S_fail) * (2 * D_2more) * S_fail2   # s, c, s
        exp += (3 * S_fail) * (2 * S_fail2) * D_2more   # s, s, c
        exp += 0          # Pnre is in the noise
        self.test("5Q", "Durability(rados 3x2)", v, exp)

        # three site L(drivefail)
        v = multi.L_drive
        exp = PG_size
        self.test("5R", "L(3 site drive failure)", v, exp)

        # P(sitefail) vs site replacement time
        # (the replacement time is doubled only for this one compute()
        # call and restored immediately afterwards)
        multi.sites = 2
        site.replace *= 2
        multi.compute(period=YEAR)
        site.replace /= 2
        v = multi.P_site
        exp = 2 * (2 * S_fail * S_fail2)
        self.test("5S", "2*replacement ->2*P(sitefail) ", v, exp)

        # P(rep)
        # (from here on the simulation runs with a non-zero latency;
        # multi.sites is still 2 from the previous check)
        multi.latency = self.s_latency
        multi.compute(period=YEAR)
        exp = S_fail * multi.sites
        v = multi.P_rep
        self.test("5T", "PL(repfail)", v, exp)

        # L(rep) vs latency
        exp = self.s_latency * self.s_recovery / (2 * SECOND)
        v = multi.L_rep
        self.test("5U", "L(repfail) vs latency", v, exp)

        # L(rep) vs remote recovery rate
        # (the speed is doubled only for this one compute() call and
        # restored immediately afterwards)
        # NOTE(review): the label passed to self.test() says
        # "L(repfail)/2", but the expected value is twice the 5U
        # expectation -- confirm which of the two is intended
        multi.speed *= 2
        multi.compute(period=YEAR)
        multi.speed /= 2
        exp = 2 * self.s_latency * self.s_recovery / (2 * SECOND)
        v = multi.L_rep
        self.test("5V", "2*speed -> L(repfail)/2", v, exp)
Пример #2
0
def defaultTests(cfg):
    """
    run the standard battery of interesting simulations
        cfg -- default configuration values

    The battery is, in order: the bare disk, RAID-0 (2 volumes),
    RAID-5 (4 volumes), RAID-1 (2 volumes), RAID-6 (8 volumes),
    single-site RADOS with 1, 2 and 3 copies, a None entry, the site
    itself, and multi-site RADOS for 1 to 4 sites with 1, 2 and 3
    copies each.  Everything is handed to Run() in a single call.
    """

    def new_rados(copies):
        """ build a RADOS simulation with the given number of copies """
        return RADOS(disk, pg=cfg.rados_decluster,
                     copies=copies,
                     speed=cfg.rados_recover,
                     fullness=cfg.rados_fullness,
                     objsize=cfg.obj_size,
                     stripe=cfg.stripe_length,
                     nre_model=cfg.nre_model,
                     delay=cfg.rados_markout)

    disk = Disk(size=cfg.disk_size,
                fits=cfg.disk_fit,
                nre=cfg.disk_nre,
                desc="Disk: %s" % (cfg.disk_type))

    # every RAID flavour shares these settings, only the width differs
    raid_args = {
        "nre_model": cfg.nre_model,
        "recovery": cfg.raid_recover,
        "delay": cfg.raid_replace,
        "objsize": cfg.obj_size,
    }
    raid0 = RAID0(disk, volumes=2, **raid_args)
    raid1 = RAID1(disk, volumes=2, **raid_args)
    raid5 = RAID5(disk, volumes=4, **raid_args)
    raid6 = RAID6(disk, volumes=8, **raid_args)

    # note the order in which they are run: RAID-5 precedes RAID-1
    simulations = [disk, raid0, raid5, raid1, raid6]

    # single site RADOS
    simulations.extend(new_rados(copies) for copies in (1, 2, 3))

    # multi-site RADOS (preceded by a None entry and the site itself)
    simulations.append(None)
    site = Site(fits=cfg.majeure, rplc=cfg.site_recover)
    simulations.append(site)
    for num_sites in (1, 2, 3, 4):
        # each multi-site simulation gets its own RADOS instance
        simulations.extend(
            MultiSite(new_rados(copies), site,
                      speed=cfg.remote_recover,
                      latency=cfg.remote_latency,
                      sites=num_sites)
            for copies in (1, 2, 3))

    # and run them all
    Run(simulations, period=cfg.period, verbosity=cfg.verbose)
Пример #3
0
    def siteTest(self):
        """
        Sanity check a site reliability simulation.

        Builds a Site from self.s_fits and self.s_size and compares its
        computed failure probability, durability, availability and raw
        size against ballpark estimates, reporting each comparison
        through self.test() under the IDs "4A" through "4Z".

        Check 4A doubles as a calibration step: when self.test() returns
        a true value, the ballpark Pfail is replaced by the simulated
        value and the later expectations are built on that.
        """

        # ball park estimates
        Pfail = float(self.s_fits) * YEAR / 1000000000
        # force true division so that an integer FIT rate cannot truncate
        # the mean time to failure (matters under Python 2)
        mttf = 1000000000 / float(self.s_fits)

        site = Site(fits=self.s_fits, size=self.s_size)

        # basic site FIT rates
        site.compute(period=YEAR)
        exp = Pfail
        v = site.P_site
        if self.test("4A", "1Y site fail rate", v, exp):
            Pfail = site.P_site     # move to more accurate value

        # single year durability
        v = 1.0 - site.dur      # complement for better precision
        exp = Pfail
        self.test("4B", "1Y Durability(site)", v, exp)

        # single year availability w/o repair
        exp = 1.0 - Pfail
        v = site.availability()
        self.test("4C", "1Y availability", v, exp)

        # double the period, double Pfail
        site.compute(period=2 * YEAR)
        exp = 2 * Pfail
        v = site.P_site
        self.test("4D", "2Y fail rate", v, exp)

        # two year durability
        v = 1.0 - site.dur      # complement for better precision
        exp = 2 * Pfail
        self.test("4E", "2Y Durability(site)", v, exp)

        # effects of multiples on P_fail
        site.compute(period=YEAR, mult=3)
        exp = Pfail * 3
        v = site.P_site
        self.test("4F", "3x 1Y fail rate", v, exp, slop=0.01)

        # site recovery rates and availability
        # (availability() is queried without another compute() call)
        site.replace = mttf
        exp = 0.5       # mttr = mttf
        v = site.availability()
        self.test("4G", "long term availability", v, exp)

        # site raw size should be the size the site was created with
        v = site.rawsize
        exp = self.s_size
        self.test("4Z", "per petabyte", v, exp)
Пример #4
0
def oneTest(cfg, which):
    """
    run a single simulation (call-back from the GUI)
        cfg -- configuration values to use
        which -- type of simulation to be run:
                 "disk", "raid", "rados" or "multi"

    Any other value of which runs no simulation and returns quietly.

    Raises ValueError if which is "raid" and cfg.raid_type is not one
    of "RAID-0", "RAID-1", "RAID-5" or "RAID-6".
    """

    # everybody needs a disk simulation
    disk = Disk(size=cfg.disk_size,
                        fits=cfg.disk_fit, fits2=cfg.disk_fit2,
                        nre=cfg.disk_nre,
                        desc="Disk: %s" % (cfg.disk_type))

    if which == "disk":
        Run([disk], period=cfg.period, verbosity=cfg.verbose)
        return

    if which == "raid":
        # pick the simulation class; all four take the same arguments
        if cfg.raid_type == "RAID-0":
            raid_class = RAID0
        elif cfg.raid_type == "RAID-1":
            raid_class = RAID1
        elif cfg.raid_type == "RAID-5":
            raid_class = RAID5
        elif cfg.raid_type == "RAID-6":
            raid_class = RAID6
        else:
            # fail with a clear message rather than an unbound local
            raise ValueError("unsupported RAID type: %r" % (cfg.raid_type,))

        raid = raid_class(disk, volumes=cfg.raid_vols,
                          nre_model=cfg.nre_model,
                          recovery=cfg.raid_recover,
                          delay=cfg.raid_replace,
                          objsize=cfg.obj_size)
        Run([raid], period=cfg.period, verbosity=cfg.verbose)
        return

    # both the "rados" and the "multi" simulations are built on this
    rados = RADOS(disk, pg=cfg.rados_decluster,
                    copies=cfg.rados_copies,
                    speed=cfg.rados_recover,
                    fullness=cfg.rados_fullness,
                    objsize=cfg.obj_size,
                    stripe=cfg.stripe_length,
                    nre_model=cfg.nre_model,
                    delay=cfg.rados_markout)
    if which == "rados":
        Run([rados], period=cfg.period, verbosity=cfg.verbose)
        return

    if which == "multi":
        site = Site(fits=cfg.majeure, rplc=cfg.site_recover)
        multi = MultiSite(rados, site,
                speed=cfg.remote_recover,
                latency=cfg.remote_latency,
                sites=cfg.remote_sites)
        Run([multi], period=cfg.period, verbosity=cfg.verbose)
        return