Example #1
    def reset(self):
        # Reset clocks and state for each disk
        for disk in self.disks:
            disk.init_clock(0)
            disk.init_state()

        # Reset clocks and state for each node
        for node in self.nodes:
            node.init_clock(0)
            node.init_state()

        # Reset clocks and state for each rack
        for rack in self.racks:
            rack.init_state()

        # Reset system state
        self.state = State(self.num_disks, self.num_nodes)

        # Reset repair queue
        self.repair_queue = []

        # Regenerate new placement
        self.placement = Placement(self.num_racks, self.nodes_per_rack,
                                   self.disks_per_node, self.capacity_per_disk,
                                   self.num_stripes, self.chunk_size,
                                   self.code_type, self.n, self.k,
                                   self.place_type, self.chunk_rack_config,
                                   self.l)
        # Reset LR
        self.lr = float(1.)

        self.total_failure_rate = 0.
        self.total_failrue_rate_cnt = 0
        self.total_repair_rate = 0.
        self.total_repair_rate_cnt = 0
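
Usage sketch (illustrative, not from the source): a Monte Carlo driver would call reset() at the top of every iteration so that clocks, system state, placement, and the likelihood ratio all start fresh. The driver and run_one_iteration below are hypothetical names.

    def estimate_data_loss_prob(sim, num_iterations):
        # Hypothetical driver around the reset() contract shown above.
        losses = 0
        for ite in range(num_iterations):
            sim.reset()  # fresh clocks, state, placement, LR
            losses += sim.run_one_iteration(ite)  # assumed: returns 1 on data loss
        return losses / float(num_iterations)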
Example #2
 def __init__(self, mission_time, plus_one, num_servers,
              num_disks_per_server, num_spares_per_server, k, m, fb,
              dp_type, failure_type, mtbf, failure_percent, rebuildIO,
              slaTime, copybackIO, diskCap, useRatio):
     #---------------------------
     # compressed time window
     #---------------------------
     self.mission_time = mission_time
     #---------------------------
     # system and placement
     #---------------------------
     self.sys = Campaign(plus_one, num_servers, num_disks_per_server,
                         num_spares_per_server, k, m, fb, dp_type, diskCap,
                         useRatio)
     self.place = Placement(self.sys)
     #--------------------------------------
     # fast rebuild + copyback phases
     #--------------------------------------
     self.rebuild = Rebuild(self.sys, rebuildIO)
     self.copyback = Copyback(copybackIO, slaTime)
     #--------------------------------------
     # failures distribution and mtbf
     #--------------------------------------
     self.mtbf = mtbf
     self.failure_type = failure_type
     self.failure_percent = failure_percent
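
Construction sketch (every value below is invented for illustration; the enclosing class name is assumed to be Simulator, since the snippet shows only its __init__):

    sim = Simulator(mission_time=87600,  # 10 years in hours, illustrative
                    plus_one=True, num_servers=16,
                    num_disks_per_server=30, num_spares_per_server=2,
                    k=10, m=4, fb=1, dp_type='raid', failure_type='exp',
                    mtbf=1200000, failure_percent=0.02,
                    rebuildIO=100, slaTime=24, copybackIO=50,
                    diskCap=8000, useRatio=0.7)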
Example #3
 def testRedundancyMultipleOccurence(self):
     tiles = ['s', 'a', 'f', 'e', None, None, 's', 'a', 'f', None]
     nodes = self.create_horizontal_nodes(tiles)
     placements = Placement.placements('safe', nodes[0])
     placement = placements[0]
     self.assertFalse(Collision.safe(placement))
     placements = Placement.placements('safe', nodes[6])
     placement = placements[0]
     self.assertTrue(Collision.safe(placement))
Example #4
    def create_cylinder(self,key,density,length,radius,pos,base=0,rot=0,R=0.):
        """Creates a cylinder body and corresponding geom.
        
        Arguments:
        key : number id to assign to the cylinder
        density : density of the given body
        length : length of the cylinder
        radius : radius of the cylinder
        pos : position of the center of the cylinder (x,y,z list)
        base : place new object at negative end of base object
        rot : rotation applied through form_rotation (optional)
        R : rotation matrix applied directly via setRotation (optional)

        """
        # Auto-assign the body key when -1 is given.
        key = len(self.bodies) if key == -1 else key

        # create cylinder body (aligned along the z-axis so that it matches the
        #   GeomCylinder created below, which is aligned along the z-axis by
        #   default)
        body = ode.Body(self.world)
        M = ode.Mass()
        M.setCylinder(density, 3, radius, length)
        body.setMass(M)
        
        # create a cylinder geom for collision detection
        geom = ode.GeomCylinder(self.space, radius, length)
        geom.setBody(body)
        
        # set the position of the cylinder
        body.setPosition((pos[0],pos[1],pos[2]))

        # set parameters for drawing the body
        body.shape = "cylinder"
        body.length = length
        body.radius = radius

        # set the rotation of the cylinder
        if(rot):
            body.setRotation(self.form_rotation(rot))
       
        # set the rotation of the cylinder directly
        if R:
            body.setRotation(R)

        self.bodies[key] = body
        self.geoms[key] = geom

        if(base):
            Placement.place_object(self.bodies[base],body)

        if(self.fluid_dynamics):
            self.create_surfaces(key,1.)  

        return key
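
Usage sketch (hedged: assumes a host object exposing world, space, bodies, geoms, and fluid_dynamics, as the method body implies; all values are illustrative):

    # A 0.5 m long, 5 cm radius cylinder centered 1 m above the origin.
    key = sim.create_cylinder(key=-1,  # -1 lets the method auto-assign a key
                              density=1000.0, length=0.5, radius=0.05,
                              pos=[0.0, 0.0, 1.0])
    # Passing base=<existing key> would instead attach the new body at the
    # negative end of that base object via Placement.place_object().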
Example #5
 def testRedundancy(self):
     # Run the test again, but without any empty space.
     tiles = ['s', 'a', 'f', 'e', None, None]
     nodes = self.create_horizontal_nodes(tiles)
     placements = Placement.placements('safe', nodes[0])
     placement = placements[0]
     self.assertFalse(Collision.safe(placement))
     tiles = ['a', 'b', 's', 'a', 'f', 'e', 't', 'b', 'z', 'x', 'h']
     nodes = self.create_horizontal_nodes(tiles)
     placements = Placement.placements('safe', nodes[2])
     placement = placements[0]
     self.assertEqual(placement.node(0), nodes[2])
     self.assertEqual(placement.node(0).letter, 's')
     self.assertFalse(Collision.safe(placement))
Example #6
    def record_tourney(self, tournament, player):
        """
        Record the results from a tournament.

        Needs the tournament json object and the player json object.
        Player json object needs to be from the same tournament.
        """
        self.placings.append(Placement(self.name, tournament, player))
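
Usage sketch (the tournament and player JSON shapes are not shown in the source, so the dicts and the entrant object below are hypothetical stand-ins):

    tournament = {'id': 42, 'name': 'Weekly #42'}  # hypothetical shape
    player = {'tournament_id': 42, 'tag': 'alice', 'final_rank': 3}
    entrant.record_tourney(tournament, player)  # appends a Placement to placings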
Example #7
 def testHorizontalCollision(self):
     tiles = [None, 's', 'a', 'e', None]
     nodes = self.create_horizontal_nodes(tiles)
     placements = Placement.placements('safe', nodes[1])
     # We expect the vertical and horizontal placements.
     placement = placements[0]
     self.assertEqual(placement.node(0), nodes[1])
     self.assertFalse(Collision.safe(placement))
Example #8
    def create_capsule(self,key,density,length,radius,pos,base=0,rot=0):
        """Creates a capsule body and corresponding geom.
        
        Arguments:
        key : number id to assign to the capsule
        density : density of the given body
        length : length of the capsule
        radius : radius of the capsule
        pos : position of the center of the capsule (x,y,z list)
        base : place new object at negative end of base object
        rot : rotation applied through form_rotation (optional)

        """

        # create capsule body (aligned along the z-axis so that it matches the
        #   GeomCCylinder created below, which is aligned along the z-axis by
        #   default)
        body = ode.Body(self.world)
        M = ode.Mass()
        M.setCapsule(density, 3, radius, length)
        body.setMass(M)
        
        # create a capsule geom for collision detection
        geom = ode.GeomCCylinder(self.space, radius, length)
        geom.setBody(body)
        
        # set the position of the capsule
        body.setPosition((pos[0],pos[1],pos[2]))

        # set parameters for drawing the body
        body.shape = "capsule"
        body.length = length
        body.radius = radius

        # set the rotation of the capsule
        if(rot):
            body.setRotation(self.form_rotation(rot))
        
        self.bodies[key] = body
        self.geoms[key] = geom

        if(base):
            Placement.place_object(self.bodies[base],body)
Example #9
    def create_capsule(self, key, density, length, radius, pos, base=0, rot=0):
        """Creates a capsule body and corresponding geom.
        
        Arguments:
        key : number id to assign to the capsule
        density : density of the given body
        length : length of the capsule
        radius : radius of the capsule
        pos : position of the center of the capsule (x,y,z list)
        base : place new object at negative end of base object
        rot : rotation applied through form_rotation (optional)

        """

        # create capsule body (aligned along the z-axis so that it matches the
        #   GeomCCylinder created below, which is aligned along the z-axis by
        #   default)
        body = ode.Body(self.world)
        M = ode.Mass()
        M.setCapsule(density, 3, radius, length)
        body.setMass(M)

        # create a capsule geom for collision detection
        geom = ode.GeomCCylinder(self.space, radius, length)
        geom.setBody(body)

        # set the position of the capsule
        body.setPosition((pos[0], pos[1], pos[2]))

        # set parameters for drawing the body
        body.shape = "capsule"
        body.length = length
        body.radius = radius

        # set the rotation of the capsule
        if (rot):
            body.setRotation(self.form_rotation(rot))

        self.bodies[key] = body
        self.geoms[key] = geom

        if (base):
            Placement.place_object(self.bodies[base], body)
Example #10
 def testNodesMatchingLetter(self):
     tiles = [['', '', '', 'x', 'z', 's', '', 'f', '']]
     board = Board(tiles)
     first_row = board.nodes[0]
     placements = Placement.placements('safe', first_row[5])
     placement = placements[0]
     nodes = Collision.nodes_matching_letter(placement, 'f')
     self.assertEqual(len(nodes), 1)
     f_node = nodes[0]
     self.assertEqual(f_node.letter, 'f')
     self.assertTrue(f_node.placed)
Example #11
    def initPlacements(self):
        """
        Initialises the placement objects for this board (1 for each player)
        :return: List of placement objects
        """
        player1placements = Placement(playerNumber=1)
        player2placements = Placement(playerNumber=2)
        player3placements = None
        player4placements = None
        if self.playerCount >= 3:
            player3placements = Placement(playerNumber=3)
        if self.playerCount == 4:
            player4placements = Placement(playerNumber=4)

        placements = []
        for p in [
                player1placements, player2placements, player3placements,
                player4placements
        ]:
            if p is not None:
                placements.append(p)

        return placements
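
For comparison, a more compact formulation that should behave identically for playerCount between 2 and 4 (a sketch, not the source's code):

    def initPlacements(self):
        # One Placement per seated player; players 1 and 2 always exist.
        return [Placement(playerNumber=n)
                for n in range(1, self.playerCount + 1)]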
Example #12
 def testPreexistingLetters(self):
     tiles = [None, None, None, 'x', 'z', 's', None, 'f', None]
     nodes = self.create_vertical_nodes(tiles)
     placements = Placement.placements('safe', nodes[5])
     self.assertEqual(len(placements), 2)
     placement = placements[1]
     self.assertTrue(Collision.safe(placement))
     # The preexisting letters should contain the letters 's' and 'f' because
     # when placing the word 'safe' across from the letter 's' the letter 'f'
     # would be used as well.
     letters = Collision.preexisting_letters(placement)
     self.assertEqual(len(letters), 2)
     self.assertEqual(letters[0], 's')
     self.assertEqual(letters[1], 'f')
Example #13
def runjob(mission_time,
           num_racks, node_per_rack, disks_per_node, onumgroup, numgroup,
           capacity_per_disk, chunk_size, num_stripes,
           bandwidth,
           code_n, code_k, code_m, use_ratio,
           weibull, ssd_fail):
    placement = Placement(disks_per_node, node_per_rack, num_racks, onumgroup,
                          numgroup, num_stripes, code_n, code_k)
    placement.generate_palcement()

    network = Network(num_racks, num_racks * node_per_rack, onumgroup, numgroup, node_per_rack,
                      disks_per_node, bandwidth, capacity_per_disk*use_ratio, chunk_size,
                      code_n, code_k, code_m)
    sim = Simulation(weibull, ssd_fail, placement, network, onumgroup, disks_per_node, node_per_rack,
                     num_racks, mission_time)
    res = sim.run()
    print res
    '''
    file =open("result", "a+")
    fcntl.flock(file.fileno(), fcntl.LOCK_EX)
    file.write(str(res)+"\n")
    file.close()
    '''
    return res
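
Invocation sketch (all parameter values invented; they match the positional signature above, here passed as keywords for readability):

    res = runjob(mission_time=87600, num_racks=9, node_per_rack=16,
                 disks_per_node=15, onumgroup=1, numgroup=1,
                 capacity_per_disk=4000, chunk_size=256, num_stripes=100000,
                 bandwidth=1024, code_n=9, code_k=6, code_m=3, use_ratio=0.7,
                 weibull=True, ssd_fail=None)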
Example #14
    def testHorizontalSafe(self):
        tiles = [None, 's', None, 'f', None]
        nodes = self.create_horizontal_nodes(tiles)
        placements = Placement.placements('safe', nodes[1])
        self.assertEqual(len(placements), 2)

        placement = placements[0]
        # Test that we have the right start node.
        self.assertEqual(placement.node(0), nodes[1])

        # Make sure that there is no collision.
        self.assertTrue(Collision.safe(placement))

        vertical_placement = placements[1]
        self.assertEqual(vertical_placement.node(0), nodes[1])
        self.assertFalse(Collision.safe(vertical_placement))
Example #15
 def testVerticalSafe(self):
     tiles = [None, 's', None, 'f', None]
     nodes = self.create_vertical_nodes(tiles)
     placements = Placement.placements('safe', nodes[1])
     # There should be two placement objects returned from the previous static
     # method.
     # One would attempt to place it to the right, and the other downward. Of
     # course the downward placement would fall right off the board, and should
     # not pass the collision safety test.
     self.assertEqual(len(placements), 2)
     placement = placements[1]
     # Test that we have the right start node.
     self.assertEqual(placement.node(0), nodes[1])
     # Make sure that there is no collision.
     self.assertTrue(Collision.safe(placement))
     horizontal_placement = placements[0]
     self.assertEqual(horizontal_placement.node(0), nodes[1])
     self.assertFalse(Collision.safe(horizontal_placement))
Example #16
class RegularSimulation(Simulation):
    ##
    # __init__() from Simulation
    #

    ##
    # Initialize the simulation
    #
    def init(self):
        # Initialize the state of the system
        self.state = State(self.num_disks)

        # Employ priority queue to keep all the failures and repairs
        # The element in the queue is (event_time, event_type, device_id)
        self.events_queue = []

        # Keep failed disks awaiting repair
        self.wait_repair_queue = []

        # Keep delayed stripes due to unavailable nodes
        # Key is the disk_idx delayed, value is the list of delayed stripes
        self.delayed_repair_dict = dict()

        self.enable_transient_failures = False

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.ERROR)
        # self.logger.setLevel(logging.INFO)
        self.logger.addHandler(console)
        self.logger.propagate = False

    ##
    # Reset the simulation
    #
    def reset(self, ite=0):
        # Generate node transient and permanent failure events from trace
        if self.use_trace:
            for i in xrange(self.num_nodes):
                self.nodes[i] = Node(None, None, None,
                                     Trace(self.trace_id, i, 'p'),
                                     Trace(self.trace_id, i, 't'),
                                     Trace(self.trace_id, i, 'r'))

        self.state = State(self.num_disks)

        for disk in self.disks:
            disk.init_clock(0)
            disk.init_state()
        for node in self.nodes:
            node.init_state()
        for rack in self.racks:
            rack.init_state()

        self.events_queue = []
        self.wait_repair_queue = []
        self.delayed_repair_dict = dict()

        # generate disk failures and put them into events_queue
        for disk_id in xrange(len(self.disks)):
            disk_fail_time = self.disk_fail_dists.draw()
            if disk_fail_time <= self.mission_time:
                self.events_queue.append(
                    (disk_fail_time, Disk.EVENT_DISK_FAIL, disk_id))
        # generate node failures and push them into events_queue
        for node_id in xrange(self.num_nodes):
            if not self.use_trace:
                self.events_queue.append((self.node_fail_dists.draw(),
                                          Node.EVENT_NODE_FAIL, node_id))
                if self.enable_transient_failures:
                    self.events_queue.append(
                        (self.node_transient_fail_dists.draw(),
                         Node.EVENT_NODE_TRANSIENT_FAIL, node_id))
            else:
                for node_failure_time in self.nodes[
                        node_id].node_fail_trace.get_trace_ls():
                    # push node failure event to event_queue
                    self.events_queue.append(
                        (node_failure_time, Node.EVENT_NODE_FAIL, node_id))
                node_transient_failure_ls = self.nodes[
                    node_id].node_transient_fail_trace.get_trace_ls()
                node_transient_repair_ls = self.nodes[
                    node_id].node_transient_repair_trace.get_trace_ls()
                for ls_idx in xrange(len(node_transient_failure_ls)):
                    node_transient_failure_time = node_transient_failure_ls[
                        ls_idx]
                    node_transient_repair_time = node_transient_repair_ls[
                        ls_idx]
                    self.events_queue.append(
                        (node_transient_failure_time,
                         Node.EVENT_NODE_TRANSIENT_FAIL, node_id))
                    self.events_queue.append(
                        (node_transient_failure_time +
                         node_transient_repair_time,
                         Node.EVENT_NODE_TRANSIENT_REPAIR, node_id))

        # generate rack failures and push them into events_queue
        if not self.use_power_outage and self.enable_transient_failures:
            for rack_id in xrange(len(self.racks)):
                self.events_queue.append((self.rack_fail_dists.draw(),
                                          Rack.EVENT_RACK_FAIL, rack_id))

        # correlated failures caused by power outage
        if (not self.use_trace) and self.use_power_outage:
            for rack_id in xrange(self.num_racks):
                occur_time = float(0) + self.power_outage_dist.draw()
                while occur_time < self.mission_time:
                    self.events_queue.append(
                        (occur_time, Rack.EVENT_RACK_FAIL, rack_id))
                    occur_time += random.expovariate(
                        (1 / float(self.power_outage_duration)))
                    self.events_queue.append(
                        (occur_time, Rack.EVENT_RACK_REPAIR, rack_id))
                    for i in xrange(self.nodes_per_rack):
                        # draw a bernoulli distribution
                        if nprandom.binomial(1, 0.01):
                            self.events_queue.append(
                                (occur_time, Node.EVENT_NODE_FAIL,
                                 (self.nodes_per_rack * rack_id + i)))
                    occur_time += self.power_outage_dist.draw()

        heapify(self.events_queue)
        self.placement = Placement(self.num_racks, self.nodes_per_rack,
                                   self.disks_per_node, self.capacity_per_disk,
                                   self.num_stripes, self.chunk_size,
                                   self.code_type, self.n, self.k,
                                   self.place_type, self.chunk_rack_config,
                                   self.l)

        self.network = Network(self.num_racks, self.nodes_per_rack,
                               self.network_setting)

        self.num_stripes_repaired = 0
        self.num_stripes_repaired_single_chunk = 0
        self.num_stripes_delayed = 0

    ##
    # Generate permanent disk failure event
    #
    def set_disk_fail(self, disk_idx, curr_time):
        heappush(self.events_queue, (self.disk_fail_dists.draw() + curr_time,
                                     Disk.EVENT_DISK_FAIL, disk_idx))

    ##
    # Generate repair event for permanent disk failure
    #
    def set_disk_repair(self, disk_idx, curr_time):
        if not self.use_network:
            # get the repair time from a pre-defined repair distribution
            heappush(self.events_queue,
                     (self.disk_repair_dists.draw() + curr_time,
                      Disk.EVENT_DISK_REPAIR, disk_idx))
        else:
            # repair time = cross-rack repair traffic / available cross-rack bandwidth
            rack_id = disk_idx / (self.nodes_per_rack * self.disks_per_node)

            # If there is no available bandwidth or the rack is under transient failure
            if self.network.get_avail_cross_rack_repair_bwth() == 0 or \
                self.racks[rack_id].get_curr_state() != Rack.STATE_RACK_NORMAL:
                heappush(self.wait_repair_queue, (curr_time, disk_idx))
            else:
                cross_rack_download = 0
                stripes_to_repair = self.placement.get_stripes_to_repair(
                    disk_idx)
                self.num_stripes_repaired += len(stripes_to_repair)
                stripes_to_delay = []

                # for each stripe to repair
                for stripe_id in stripes_to_repair:
                    num_failed_chunk = 0
                    num_alive_chunk_same_rack = 0
                    num_unavail_chunk = 0
                    idx = 0
                    fail_idx = 0
                    alive_chunk_same_rack = []

                    # check the status of each chunk in the stripe
                    for disk_id in self.placement.get_stripe_location(
                            stripe_id):
                        # get the total number of unavailable chunks (due to permanent/transient failures) in this stripe
                        if self.disks[disk_id].state != Disk.STATE_NORMAL:
                            num_unavail_chunk += 1

                        # for RS, DRC
                        if self.placement.code_type != Placement.CODE_TYPE_LRC:
                            if self.disks[disk_id].get_curr_state(
                            ) == Disk.STATE_CRASHED:
                                num_failed_chunk += 1
                            elif (disk_id / (self.nodes_per_rack *
                                             self.disks_per_node)) == rack_id:
                                num_alive_chunk_same_rack += 1
                        # for LRC
                        else:
                            if self.disks[disk_id].get_curr_state(
                            ) == Disk.STATE_CRASHED:
                                num_failed_chunk += 1
                                if disk_idx == disk_id:
                                    fail_idx = idx
                            elif (disk_id / (self.nodes_per_rack *
                                             self.disks_per_node)) == rack_id:
                                num_alive_chunk_same_rack += 1
                                alive_chunk_same_rack.append(idx)
                            idx += 1

                    # this is a single-chunk repair
                    if num_failed_chunk == 1:
                        self.num_stripes_repaired_single_chunk += 1
                    # the repair for this stripe is delayed
                    if num_unavail_chunk > (self.n - self.k):
                        stripes_to_delay.append(stripe_id)

                    # RS
                    if self.placement.code_type == Placement.CODE_TYPE_RS:
                        if num_alive_chunk_same_rack < self.k:
                            cross_rack_download += (self.k -
                                                    num_alive_chunk_same_rack)
                    # LRC
                    elif self.placement.code_type == Placement.CODE_TYPE_LRC:
                        if num_failed_chunk == 1:
                            # global parity
                            if fail_idx in self.placement.lrc_global_parity:
                                if num_alive_chunk_same_rack < self.k:
                                    cross_rack_download += self.k - num_alive_chunk_same_rack
                            # data chunk or local parity
                            else:
                                # find which group the failed chunk is in
                                fail_gid = 0
                                for gid in xrange(self.l):
                                    if fail_idx in self.placement.lrc_data_group[gid] or \
                                        fail_idx == self.placement.lrc_local_parity[gid]:
                                        fail_gid = gid
                                        break
                                # find how many chunks in the same rack can be used for repair
                                num_alive_chunk_same_rack = 0
                                for each in alive_chunk_same_rack:
                                    if each in self.placement.lrc_data_group[fail_gid] or \
                                        each == self.placement.lrc_local_parity[fail_gid]:
                                        num_alive_chunk_same_rack += 1
                                if num_alive_chunk_same_rack < self.k / self.l:
                                    cross_rack_download += self.k / self.l - num_alive_chunk_same_rack
                        else:
                            if num_alive_chunk_same_rack < self.k:
                                cross_rack_download += (
                                    self.k - num_alive_chunk_same_rack)
                    # DRC
                    elif self.placement.code_type == Placement.CODE_TYPE_DRC:
                        if num_failed_chunk == 1:
                            if self.k == 5 and self.n == 9:
                                cross_rack_download += 1.0
                            elif self.k == 6 and self.n == 9:
                                cross_rack_download += 2.0
                            else:
                                print "Only support DRC - (9,6,3), (9,5,3)"
                        else:
                            if num_alive_chunk_same_rack < self.k:
                                cross_rack_download += (
                                    self.k - num_alive_chunk_same_rack)
                    else:
                        print "Not correct code type in set_disk_repair()!"

                repair_bwth = self.network.get_avail_cross_rack_repair_bwth()
                self.network.update_avail_cross_rack_repair_bwth(0)
                repair_time = cross_rack_download * self.chunk_size / float(
                    repair_bwth)  # seconds
                repair_time /= float(3600)  # hours

                if len(stripes_to_delay) != 0:
                    self.num_stripes_delayed += len(stripes_to_delay)
                    self.delayed_repair_dict[disk_idx] = stripes_to_delay

                self.logger.debug("repair_time = %d, repair_bwth = %d" %
                                  (repair_time, repair_bwth))
                heappush(self.events_queue,
                         (repair_time + curr_time, Disk.EVENT_DISK_REPAIR,
                          disk_idx, repair_bwth))

    ##
    # Generate permanent node failure event
    #
    def set_node_fail(self, node_idx, curr_time):
        heappush(self.events_queue, (self.node_fail_dists.draw() + curr_time,
                                     Node.EVENT_NODE_FAIL, node_idx))

    ##
    # Generate repair event for permanent node failure
    # The repair for the failed node is conducted by the repair for the failed disks on that node
    #
    def set_node_repair(self, node_idx, curr_time):
        for i in xrange(self.disks_per_node):
            disk_idx = node_idx * self.disks_per_node + i
            self.set_disk_repair(disk_idx, curr_time)

    ##
    # Generate transient node failure event
    #
    def set_node_transient_fail(self, node_idx, curr_time):
        heappush(self.events_queue,
                 (self.nodes[node_idx].node_transient_fail_distr.draw() +
                  curr_time, Node.EVENT_NODE_TRANSIENT_FAIL, node_idx))

    ##
    # Generate repair event for transient node failure
    #
    def set_node_transient_repair(self, node_idx, curr_time):
        heappush(self.events_queue,
                 (self.nodes[node_idx].node_transient_repair_distr.draw() +
                  curr_time, Node.EVENT_NODE_TRANSIENT_REPAIR, node_idx))

    ##
    # Generate transient rack failure
    #
    def set_rack_fail(self, rack_idx, curr_time):
        heappush(self.events_queue, (self.rack_fail_dists.draw() + curr_time,
                                     Rack.EVENT_RACK_FAIL, rack_idx))

    ##
    # Generate repair for transient rack failure
    #
    def set_rack_repair(self, rack_idx, curr_time):
        heappush(self.events_queue, (self.rack_repair_dists.draw() + curr_time,
                                     Rack.EVENT_RACK_REPAIR, rack_idx))

    ##
    # Get the next event from the event queue
    #
    def get_next_event(self, curr_time):
        self.logger.debug(
            "len(delayed_repair_dict) = %d, len(wait_repair_queue) = %d" %
            (len(self.delayed_repair_dict), len(self.wait_repair_queue)))
        # If there are some stripes delayed
        if len(self.delayed_repair_dict) != 0:
            items_to_remove = []  # keep the key of the items to remove
            for key in self.delayed_repair_dict:
                tmp_dict_value = []
                for stripe_id in self.delayed_repair_dict[key]:
                    repair_delay = False
                    num_unavail_chunk = 0
                    for disk_idx in self.placement.get_stripe_location(
                            stripe_id):
                        if self.disks[disk_idx].state != Disk.STATE_NORMAL:
                            num_unavail_chunk += 1
                        if num_unavail_chunk > (self.n - self.k):
                            repair_delay = True
                            break
                    if repair_delay:  # stripe whose repair is delayed
                        tmp_dict_value.append(stripe_id)
                if len(tmp_dict_value) == 0:
                    items_to_remove.append(key)
                else:
                    self.delayed_repair_dict[key] = tmp_dict_value
            for key in items_to_remove:
                self.delayed_repair_dict.pop(key)

        # If there are some failed disks awaiting repair
        if len(self.wait_repair_queue) != 0:
            disk_id = self.wait_repair_queue[0][1]
            rack_id = disk_id / (self.nodes_per_rack * self.disks_per_node)
            if self.use_network and self.network.get_avail_cross_rack_repair_bwth() != 0 and \
                self.network.get_avail_intra_rack_repair_bwth(rack_id) != 0 and \
                self.racks[rack_id].get_curr_state() == Rack.STATE_RACK_NORMAL:
                heappop(self.wait_repair_queue)
                self.set_disk_repair(disk_id, curr_time)

        next_event = heappop(self.events_queue)
        next_event_time = next_event[0]
        next_event_type = next_event[1]
        if next_event_time > self.mission_time:
            return (next_event_time, None, None)

        device_idx_set = []
        device_idx_set.append(next_event[2])
        repair_bwth_set = []
        # If use network bandwidth to calculate repair_time
        if self.use_network and next_event_type == Disk.EVENT_DISK_REPAIR:
            repair_bwth_set.append(next_event[3])

        # Gather the events with the same occurring time and event type
        while self.events_queue[0][0] == next_event_time and self.events_queue[
                0][1] == next_event_type:
            next_event = heappop(self.events_queue)
            device_idx_set.append(next_event[2])
            if self.use_network and next_event_type == Disk.EVENT_DISK_REPAIR:
                repair_bwth_set.append(next_event[3])

        # disk permanent failure
        if next_event_type == Disk.EVENT_DISK_FAIL:
            fail_time = next_event_time
            for device_idx in device_idx_set:
                # avoid the case that this disk is under repair
                if self.disks[device_idx].get_curr_state(
                ) != Disk.STATE_CRASHED:
                    if self.delayed_repair_dict.has_key(device_idx):
                        self.delayed_repair_dict.pop(device_idx)
                    # update the state of the disk
                    self.disks[device_idx].fail_disk(fail_time)
                    # generate the repair event
                    self.set_disk_repair(device_idx, fail_time)
            return (fail_time, Disk.EVENT_DISK_FAIL, device_idx_set)

        # node permanent failure
        elif next_event_type == Node.EVENT_NODE_FAIL:
            failed_disks_set = set([])
            fail_time = next_event_time
            for device_idx in device_idx_set:
                # avoid the case that the node is under repair
                if self.nodes[device_idx].get_curr_state(
                ) != Node.STATE_NODE_CRASHED:
                    # update the state of node
                    self.nodes[device_idx].fail_node(fail_time)
                    for i in xrange(self.disks_per_node):
                        disk_idx = device_idx * self.disks_per_node + i
                        failed_disks_set.add(disk_idx)
                        # avoid the case that the disk is under repair
                        if self.disks[disk_idx].get_curr_state(
                        ) != Disk.STATE_CRASHED:
                            if self.delayed_repair_dict.has_key(device_idx):
                                self.delayed_repair_dict.pop(device_idx)
                            # update the state of the disk
                            self.disks[disk_idx].fail_disk(fail_time)
                            # generate the repair event
                            self.set_disk_repair(disk_idx, fail_time)
            return (fail_time, Node.EVENT_NODE_FAIL, failed_disks_set)

        # node transient failure
        elif next_event_type == Node.EVENT_NODE_TRANSIENT_FAIL:
            fail_time = next_event_time
            for device_idx in device_idx_set:
                if self.nodes[device_idx].get_curr_state(
                ) == Node.STATE_NODE_NORMAL:
                    # update the state of node
                    self.nodes[device_idx].offline_node()
                    for i in xrange(self.disks_per_node):
                        disk_id = device_idx * self.disks_per_node + i
                        if self.disks[disk_id].get_curr_state(
                        ) == Disk.STATE_NORMAL:
                            # update the state of disk
                            self.disks[disk_id].offline_disk(fail_time)
                # generate the repair event
                if not self.use_trace:
                    self.set_node_transient_repair(device_idx, fail_time)

            return (fail_time, Node.EVENT_NODE_TRANSIENT_FAIL, None)

        # transient rack failure
        elif next_event_type == Rack.EVENT_RACK_FAIL:
            fail_time = next_event_time
            for device_idx in device_idx_set:
                if self.racks[device_idx].get_curr_state(
                ) == Rack.STATE_RACK_NORMAL:
                    # update the state of the rack
                    self.racks[device_idx].fail_rack(fail_time)
                    for i in xrange(self.nodes_per_rack):
                        # update the state of the node
                        node_idx = device_idx * self.nodes_per_rack + i
                        if self.nodes[node_idx].get_curr_state(
                        ) == Node.STATE_NODE_NORMAL:
                            self.nodes[node_idx].offline_node()
                            for j in xrange(self.disks_per_node):
                                # update the state of the disk
                                disk_idx = node_idx * self.disks_per_node + j
                                if self.disks[disk_idx].get_curr_state(
                                ) == Disk.STATE_NORMAL:
                                    self.disks[disk_idx].offline_disk(
                                        fail_time)
                # generate the repair event
                if not self.use_power_outage:
                    self.set_rack_repair(device_idx, fail_time)

            return (fail_time, Rack.EVENT_RACK_FAIL, None)

        # repair for permanent disk failure
        elif next_event_type == Disk.EVENT_DISK_REPAIR:
            repair_time = next_event_time
            for repair_disk_idx in device_idx_set:
                if self.disks[repair_disk_idx].get_curr_state(
                ) == Disk.STATE_CRASHED:
                    # update the state of the disk
                    self.disks[repair_disk_idx].repair_disk(repair_time)
                    # generate next permanent disk failure
                    self.set_disk_fail(repair_disk_idx, repair_time)

                # if the repair event is caused by permanent node failure
                node_idx = repair_disk_idx / self.disks_per_node
                if self.nodes[node_idx].get_curr_state(
                ) == Node.STATE_NODE_CRASHED:
                    all_disk_ok = True
                    for i in xrange(self.disks_per_node):
                        disk = self.disks[node_idx * self.disks_per_node + i]
                        if disk.get_curr_state() != disk.STATE_NORMAL:
                            all_disk_ok = False
                            break
                    if all_disk_ok:
                        # update the state of the node
                        self.nodes[node_idx].repair_node()
                        # generate next permanent node failure
                        if not self.use_trace:
                            self.set_node_fail(node_idx, repair_time)
            # update the network status
            if self.use_network:
                idx = 0
                for repair_disk_idx in device_idx_set:
                    repair_bwth = repair_bwth_set[idx]
                    self.network.update_avail_cross_rack_repair_bwth(
                        self.network.get_avail_cross_rack_repair_bwth() +
                        repair_bwth)
                    idx += 1

            # return the set of repaired disks
            return (repair_time, Disk.EVENT_DISK_REPAIR, device_idx_set)

        # repair for node transient failure
        elif next_event_type == Node.EVENT_NODE_TRANSIENT_REPAIR:
            repair_time = next_event_time
            for repair_node_idx in device_idx_set:
                # update the state of the node
                if self.nodes[repair_node_idx].get_curr_state(
                ) == Node.STATE_NODE_UNAVAILABLE:
                    self.nodes[repair_node_idx].online_node()
                    # update the state of the disk
                    for i in xrange(self.disks_per_node):
                        disk_id = repair_node_idx * self.disks_per_node + i
                        if self.disks[disk_id].get_curr_state(
                        ) == Disk.STATE_UNAVAILABLE:
                            self.disks[disk_id].online_disk(repair_time)
                # generate the next transient node failure
                if not self.use_trace:
                    self.set_node_transient_fail(repair_node_idx, repair_time)
            return (repair_time, Node.EVENT_NODE_TRANSIENT_REPAIR, None)

        # repair for rack transient failure
        elif next_event_type == Rack.EVENT_RACK_REPAIR:
            repair_time = next_event_time
            for repair_rack_idx in device_idx_set:
                if self.racks[repair_rack_idx].get_curr_state(
                ) == Rack.STATE_RACK_UNAVAILABLE:
                    # update the state of the rack
                    self.racks[repair_rack_idx].repair_rack()
                    for i in xrange(self.nodes_per_rack):
                        node_idx = repair_rack_idx * self.nodes_per_rack + i
                        # update the state of the node
                        if self.nodes[node_idx].get_curr_state(
                        ) == Node.STATE_NODE_UNAVAILABLE:
                            self.nodes[node_idx].online_node()
                            for j in xrange(self.disks_per_node):
                                disk_idx = node_idx * self.disks_per_node + j
                                # update the state of the disk
                                if self.disks[disk_idx].get_curr_state(
                                ) == Disk.STATE_UNAVAILABLE:
                                    self.disks[disk_idx].online_disk(
                                        repair_time)
                # generate the next transient rack failure
                if not self.use_power_outage:
                    self.set_rack_fail(repair_rack_idx, repair_time)

            return (repair_time, Rack.EVENT_RACK_REPAIR, None)

        else:
            self.logger.error('Wrong type of next_event in get_next_event()!')
            return None

    ##
    # Run an iteration of the simulator
    #
    def run_iteration(self, ite=0):
        self.reset()
        curr_time = 0

        self.logger.info(
            "Regular Simulator: begin an iteration %d, num_failed_disks = %d, "
            "avail_cross_rack_bwth = %d" %
            (ite, len(self.state.get_failed_disks()),
             self.network.get_avail_cross_rack_repair_bwth()))

        while True:
            (event_time, event_type,
             disk_id_set) = self.get_next_event(curr_time)
            curr_time = event_time
            if curr_time > self.mission_time:
                break
            # update the whole status
            if not self.state.update_state(event_type, disk_id_set):
                self.logger.error('update_state failed!')
            if event_type != None:
                self.logger.debug(
                    "Time %s, Event type: %s, Number of failed disks: %s\n" %
                    (event_time, event_type,
                     self.state.get_num_failed_disks()))

            # Check durability when disk_failure/node_failure happens
            if event_type == Disk.EVENT_DISK_FAIL or event_type == Node.EVENT_NODE_FAIL:
                if ite == 1:
                    self.logger.info(
                        "Time %s, Event type: %s, Number of failed disks: %s\n"
                        % (event_time, event_type,
                           self.state.get_num_failed_disks()))
                failed_disks = self.state.get_failed_disks()
                if self.placement.check_data_loss(failed_disks):
                    # the number of failed stripes and the number of lost chunks
                    (num_failed_stripes, num_lost_chunks
                     ) = self.placement.get_num_failed_status(failed_disks)
                    # Count in the delayed stripes
                    if len(self.delayed_repair_dict) != 0:
                        for key in self.delayed_repair_dict:
                            num_failed_stripes += len(
                                self.delayed_repair_dict[key])
                            num_lost_chunks += len(
                                self.delayed_repair_dict[key])
                    # Calculate blocked ratio
                    sum_unavail_time = 0
                    for disk_id in xrange(self.num_disks):
                        sum_unavail_time += self.disks[disk_id].get_unavail_time(curr_time) * \
                                            self.placement.get_num_chunks_per_disk(disk_id)
                    blocked_ratio = sum_unavail_time / (
                        self.placement.num_chunks * curr_time)
                    # Calculate the single-chunk repair ratio
                    single_chunk_repair_ratio = 0
                    self.logger.info(
                        "num_stripes_repaired_single_chunk = %d, num_stripes_repaired = %d"
                        % (self.num_stripes_repaired_single_chunk,
                           self.num_stripes_repaired))

                    if self.num_stripes_repaired != 0:
                        single_chunk_repair_ratio = float(self.num_stripes_repaired_single_chunk) / \
                                                    float(self.num_stripes_repaired)

                    return (1, "(%d, %d, %f, %f)" %
                            (num_failed_stripes, num_lost_chunks,
                             blocked_ratio, single_chunk_repair_ratio))

        # No data loss
        # Calculate blocked ratio
        sum_unavail_time = 0
        for disk_id in xrange(self.num_disks):
            sum_unavail_time += self.disks[disk_id].get_unavail_time(self.mission_time) * \
                                self.placement.get_num_chunks_per_disk(disk_id)
        blocked_ratio = sum_unavail_time / (self.placement.num_chunks *
                                            self.mission_time)
        # Calculate the single-chunk repair ratio
        single_chunk_repair_ratio = 0
        if self.num_stripes_repaired != 0:
            single_chunk_repair_ratio = float(self.num_stripes_repaired_single_chunk) / \
                                        float(self.num_stripes_repaired)

        return (0,
                "(0, 0, %f, %f)" % (blocked_ratio, single_chunk_repair_ratio))
Example #17

if False: ####### Placement routine.
    object_height = 0.1
    #SCALE = 1
    resolution = [.01*SCALE, .01*SCALE]  #sets resolution of occupancy grid
    print 'NOTE: Resolution is ',100*resolution[0], 'cm' ###
    polygon = label_object()
    polygon.add_point([0,0])
    polygon.add_point([0,5*SCALE])
    polygon.add_point([10*SCALE,5*SCALE])
    polygon.add_point([10*SCALE,0])
    ###object_height = 0.1

    print 'creating placement object'
    pl = Placement(pc, resolution)  ###REPLACE WITH MY OWN CLASS DEFINITION WITH FUNCTIONs

    if displayOn:
        placement_point = pl.test_placement(polygon, object_height) 
    else:
        placement_point = pl.find_placement(polygon, object_height)#Add param True to get debug popups
        
    placement_point -= pc.scan_dataset.ground_plane_translation
    
    #Assumes 'codyRobot'==ROBOT
    #This should be optional
    ### Formerly, the robot would reach out and place object at this point
    #import mekabot.coord_frames as mcf
    #placement_point_global = mcf.thok0Tglobal(placement_point)
    #print 'placement point in global coordinate frame:', placement_point_global.T
        
Example #18
class UnifBFBSimulation(Simulation):
    ##
    # __init__() from Simulation
    #

    ##
    # Initialize UnifBFBSimulation
    #
    def init(self):
        self.logger = logging.getLogger(__name__)
        # self.logger.setLevel(logging.ERROR)
        self.logger.setLevel(logging.INFO)
        # self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(console)
        self.logger.propagate = False

        # Failure biasing prob
        self.fb_prob = float(self.is_parms.fb_prob)
        # Arrival rate of homogeneous Poisson process, beta
        self.poisson_rate = float(self.is_parms.beta)
        # Likelihood ratio
        self.lr = float(1.)

        self.logger.debug(
            "UnifBFBSimulation init() - fb_prob = %.6f, poisson_rate = %.6f",
            self.fb_prob, self.poisson_rate)

    ##
    # Reset the simulator
    #
    def reset(self):
        # Reset clocks and state for each disk
        for disk in self.disks:
            disk.init_clock(0)
            disk.init_state()

        # Reset clocks and state for each node
        for node in self.nodes:
            node.init_clock(0)
            node.init_state()

        # Reset clocks and state for each rack
        for rack in self.racks:
            rack.init_state()

        # Reset system state
        self.state = State(self.num_disks, self.num_nodes)

        # Reset repair queue
        self.repair_queue = []

        # Regenerate new placement
        self.placement = Placement(self.num_racks, self.nodes_per_rack,
                                   self.disks_per_node, self.capacity_per_disk,
                                   self.num_stripes, self.chunk_size,
                                   self.code_type, self.n, self.k,
                                   self.place_type, self.chunk_rack_config,
                                   self.l)
        # Reset LR
        self.lr = float(1.)

        self.total_failure_rate = 0.
        self.total_failrue_rate_cnt = 0
        self.total_repair_rate = 0.
        self.total_repair_rate_cnt = 0

    ##
    # Get failure rate
    #
    def get_failure_rate(self):
        fail_rate = float(0)

        for disk in self.disks:
            fail_rate += disk.curr_disk_fail_rate()

        for node in self.nodes:
            fail_rate += node.curr_node_fail_rate()

        # self.logger.debug("get_failure_rate(): fail_rate = %.6f", fail_rate)
        # print("get_failure_rate(): fail_rate = %.6f" % fail_rate)
        return fail_rate

    ##
    # Get the probability of node failure
    # To decide whether a failure event is node failure or disk failure
    #
    def get_node_failure_prob(self):
        comp_fail_rate = float(0)
        node_fail_rate = float(0)
        for disk in self.disks:
            comp_fail_rate += disk.curr_disk_fail_rate()
        for node in self.nodes:
            node_fail_rate += node.curr_node_fail_rate()

        return node_fail_rate / (node_fail_rate + comp_fail_rate)
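
    # Toy numeric check of the ratio above (all rates invented for
    # illustration): 100 disks at 0.001/h plus 10 nodes at 0.005/h give
    #   node_fail_rate / (node_fail_rate + comp_fail_rate)
    #       = 0.05 / (0.05 + 0.10) = 1/3,
    # i.e. one failure draw in three is classified as a node failure.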

    ##
    # Calculate the repair time for a failed component
    # The repair time = the amount of cross_rack data to download / cross_rack bandwidth
    #
    def get_disk_repair_duration(self, disk_idx):
        if not self.use_network:
            # get the repair time from a pre-defined repair distribution
            return self.disk_repair_dists.draw()
        else:
            # repair time = cross-rack repair traffic / available cross-rack bandwidth
            rack_id = disk_idx / (self.nodes_per_rack * self.disks_per_node)
            cross_rack_download = 0
            stripes_to_repair = self.placement.get_stripes_to_repair(disk_idx)
            # self.num_stripes_repaired += len(stripes_to_repair)
            # stripes_to_delay = []

            # print("len(stripes_to_repair) = %d" % len(stripes_to_repair))
            # for each stripe to repair
            for stripe_id in stripes_to_repair:
                num_failed_chunk = 0
                num_alive_chunk_same_rack = 0
                idx = 0
                fail_idx = 0
                alive_chunk_same_rack = []

                # check the status of each chunk in the stripe
                for disk_id in self.placement.get_stripe_location(stripe_id):

                    # for RS, DRC
                    if self.placement.code_type != Placement.CODE_TYPE_LRC:
                        if self.disks[disk_id].get_curr_state(
                        ) == Disk.STATE_CRASHED:
                            num_failed_chunk += 1
                        elif (disk_id / (self.nodes_per_rack *
                                         self.disks_per_node)) == rack_id:
                            num_alive_chunk_same_rack += 1
                    # for LRC
                    else:
                        if self.disks[disk_id].get_curr_state(
                        ) == Disk.STATE_CRASHED:
                            num_failed_chunk += 1
                            if disk_idx == disk_id:
                                fail_idx = idx
                        elif (disk_id / (self.nodes_per_rack *
                                         self.disks_per_node)) == rack_id:
                            num_alive_chunk_same_rack += 1
                            alive_chunk_same_rack.append(idx)
                        idx += 1

                # # this is a single-chunk repair
                # if num_failed_chunk == 1:
                #     self.num_stripes_repaired_single_chunk += 1

                # RS
                if self.placement.code_type == Placement.CODE_TYPE_RS:
                    if num_alive_chunk_same_rack < self.k:
                        cross_rack_download += (self.k -
                                                num_alive_chunk_same_rack)
                # LRC
                elif self.placement.code_type == Placement.CODE_TYPE_LRC:
                    if num_failed_chunk == 1:
                        # global parity
                        if fail_idx in self.placement.lrc_global_parity:
                            if num_alive_chunk_same_rack < self.k:
                                cross_rack_download += self.k - num_alive_chunk_same_rack
                        # data chunk or local parity
                        else:
                            # find which group the failed chunk is in
                            fail_gid = 0
                            for gid in xrange(self.l):
                                if fail_idx in self.placement.lrc_data_group[gid] or \
                                        fail_idx == self.placement.lrc_local_parity[gid]:
                                    fail_gid = gid
                                    break
                            # find how many chunks in the same rack can be used for repair
                            num_alive_chunk_same_rack = 0
                            for each in alive_chunk_same_rack:
                                if each in self.placement.lrc_data_group[fail_gid] or \
                                        each == self.placement.lrc_local_parity[fail_gid]:
                                    num_alive_chunk_same_rack += 1
                            if num_alive_chunk_same_rack < self.k / self.l:
                                cross_rack_download += self.k / self.l - num_alive_chunk_same_rack
                    else:
                        if num_alive_chunk_same_rack < self.k:
                            cross_rack_download += (self.k -
                                                    num_alive_chunk_same_rack)
                # DRC
                elif self.placement.code_type == Placement.CODE_TYPE_DRC:
                    if num_failed_chunk == 1:
                        if self.k == 5 and self.n == 9:
                            cross_rack_download += 1.0
                        elif self.k == 6 and self.n == 9:
                            cross_rack_download += 2.0
                        else:
                            print "Only support DRC - (9,6,3), (9,5,3)"
                    else:
                        if num_alive_chunk_same_rack < self.k:
                            cross_rack_download += (self.k -
                                                    num_alive_chunk_same_rack)
                else:
                    print "Not correct code type in set_disk_repair()!"

            repair_duration = cross_rack_download * self.chunk_size / \
                              float(self.network.get_avail_cross_rack_repair_bwth()) # seconds
            # print "repair_time = %.1f" % (repair_duration / 3600.)
            # print("repair_duration = %.10f, cross_rack_download=%d" % \
            #        (repair_duration / 3600., cross_rack_download))

            if repair_duration != 0:
                self.total_repair_rate += 3600. / repair_duration
                self.total_repair_rate_cnt += 1

            return repair_duration / 3600.  # hours
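
    # Worked unit conversion for the return value above (numbers are
    # hypothetical): 1000 cross-rack chunks * 0.25 GB/chunk at 1 GB/s of
    # available cross-rack bandwidth gives 250 seconds, i.e.
    # 250 / 3600. ~= 0.07 hours.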

    def get_earliest_repair_time(self, curr_time):
        earliest_repair_time = curr_time
        if len(self.repair_queue) > 0:
            for repair_event in self.repair_queue:
                repair_event_time = repair_event[0]
                if repair_event_time > earliest_repair_time:
                    earliest_repair_time = repair_event_time

        return earliest_repair_time

    ##
    # Set next repair time for disk indexed with disk_index
    #
    def set_disk_repair(self, disk_idx, curr_time):
        heappush(self.repair_queue, (self.get_disk_repair_duration(disk_idx) +
                                     self.get_earliest_repair_time(curr_time),
                                     Disk.EVENT_DISK_REPAIR, disk_idx))

    ##
    # Set new node repair time for node node_idx
    #
    def set_node_repair(self, node_idx, curr_time):
        node_repair_duration = 0
        # Get the repair duration of each disk on this node
        for i in xrange(self.disks_per_node):
            disk_idx = self.disks_per_node * node_idx + i
            node_repair_duration += self.get_disk_repair_duration(disk_idx)

        heappush(
            self.repair_queue,
            (node_repair_duration + self.get_earliest_repair_time(curr_time),
             Node.EVENT_NODE_REPAIR, node_idx))

    ##
    # Get the next event in UnifBFBSimulation
    #
    def get_next_event(self, curr_time):
        # Update clock for each disk
        for disk in self.disks:
            disk.update_clock(curr_time)

        # Update clock for each node
        for node in self.nodes:
            node.update_clock(curr_time)

        # If not in a failed state, then draw for next failure
        if self.state.get_sys_state() == self.state.CURR_STATE_OK:
            failure_queue = []

            for each_disk in range(self.num_disks):
                failure_queue.append(
                    (self.disks[each_disk].disk_fail_distr.
                     draw_inverse_transform(self.disks[each_disk].read_clock())
                     + curr_time, Disk.EVENT_DISK_FAIL, each_disk))

            for each_node in range(self.num_nodes):
                failure_queue.append(
                    (self.nodes[each_node].node_fail_distr.
                     draw_inverse_transform(self.nodes[each_node].read_clock())
                     + curr_time, Node.EVENT_NODE_FAIL, each_node))

            heapify(failure_queue)
            (next_event_time, next_event_type,
             next_event_subsystem) = heappop(failure_queue)

            if next_event_type == Disk.EVENT_DISK_FAIL:
                self.disks[next_event_subsystem].fail_disk(next_event_time)
                self.set_disk_repair(next_event_subsystem, next_event_time)
            elif next_event_type == Node.EVENT_NODE_FAIL:
                self.nodes[next_event_subsystem].fail_node(next_event_time)
                for each_disk_on_this_node in range(
                        next_event_subsystem * self.disks_per_node,
                        (next_event_subsystem + 1) * self.disks_per_node):
                    self.disks[each_disk_on_this_node].fail_disk(
                        next_event_time)
                self.set_node_repair(next_event_subsystem, next_event_time)
            else:
                self.logger.error(
                    "UnifBFBSimulation - get_next_event(): wrong next_event_type!"
                )

            return (next_event_time, next_event_type, next_event_subsystem)

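        # In the degraded state the simulator follows uniformization with
        # failure biasing (the "UnifBFB" in the class name): candidate
        # events arrive at rate poisson_rate, each candidate is accepted as
        # a real failure with probability fb_prob, and the likelihood ratio
        # self.lr corrects the final estimate for this sampling bias.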
        elif self.state.get_sys_state() == self.state.CURR_STATE_DEGRADED:
            if not self.repair_queue:
                self.logger.error(
                    "UnifBFBSimulation - get_next_event(): repair_queue is empty!"
                )
                sys.exit(2)

            (repair_time, repair_event, subsystem_idx) = self.repair_queue[0]
            # numpy's exponential() takes the scale (mean inter-arrival
            # time); assuming poisson_rate is the uniformization rate, as
            # its use in the lr updates below suggests, pass its inverse.
            next_event_time = nprandom.exponential(
                1. / self.poisson_rate) + curr_time

            if repair_time <= next_event_time:
                heappop(self.repair_queue)
                if repair_event == Disk.EVENT_DISK_REPAIR:
                    self.disks[subsystem_idx].repair_disk(repair_time)
                    return (repair_time, Disk.EVENT_DISK_REPAIR, subsystem_idx)
                elif repair_event == Node.EVENT_NODE_REPAIR:
                    self.nodes[subsystem_idx].repair_node()
                    for i in range(self.disks_per_node):
                        disk_idx = subsystem_idx * self.disks_per_node + i
                        self.disks[disk_idx].repair_disk(repair_time)
                    return (repair_time, Node.EVENT_NODE_REPAIR, subsystem_idx)
                else:
                    self.logger.error(
                        "UnifBFBSimulation - get_next_event(): wrong repair_event!"
                    )
                    sys.exit(2)
            for disk in self.disks:
                disk.update_clock(next_event_time)
            for node in self.nodes:
                node.update_clock(next_event_time)

            self.total_failure_rate += self.get_failure_rate()
            self.total_failrue_rate_cnt += 1

            draw = nprandom.uniform()
            # Determine whether it is a "real" event or "pseudo" event
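                # Under uniformization the true probability of a pseudo
                # event (no real failure at this candidate arrival) is
                # 1 - failure_rate / poisson_rate, while the sampler uses
                # 1 - fb_prob; the likelihood ratio is multiplied by their
                # quotient, and the real-event branches below apply the
                # analogous correction.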
            if draw > self.fb_prob:
                # It is a pseudo event
                old_lr = self.lr
                self.lr *= (1. - self.get_failure_rate() /
                            self.poisson_rate) / (1. - self.fb_prob)
                self.logger.debug(
                    "get_next_event(): pseudo event - old_lr = %.10f, update, lr = %.10f",
                    old_lr, self.lr)
                # Return nothing because we are staying in the current state
                return (next_event_time, None, None)

            else:
                # Randomly fail a disk or node
                # prob_node_failure = self.get_node_failure_prob()
                if nprandom.uniform() > self.get_node_failure_prob():
                    # disk failure
                    avail_disks = self.state.get_avail_disks()
                    fail_disk_idx = random.choice(avail_disks)

                    old_lr = self.lr
                    # self.lr *= (self.disks[fail_disk_idx].curr_disk_fail_rate() / self.poisson_rate) \
                    #            / (self.fb_prob * (1 - prob_node_failure) / len(avail_disks))
                    # The above equation equals to the following
                    self.lr *= (self.get_failure_rate() /
                                self.poisson_rate) / self.fb_prob
                    self.logger.debug(
                        "get_next_event(): disk failure event - old_lr = %.10f, update, lr = %.10f",
                        old_lr, self.lr)

                    self.disks[fail_disk_idx].fail_disk(next_event_time)
                    self.set_disk_repair(fail_disk_idx, next_event_time)

                    return (next_event_time, Disk.EVENT_DISK_FAIL,
                            fail_disk_idx)

                else:
                    avail_nodes = self.state.get_avail_nodes()
                    fail_node_idx = random.choice(avail_nodes)

                    old_lr = self.lr
                    # self.lr *= (self.nodes[fail_node_idx].curr_node_fail_rate() / self.poisson_rate) \
                    #            / (self.fb_prob * prob_node_failure / len(avail_nodes))
                    # The above equation equals to the following
                    self.lr *= (self.get_failure_rate() /
                                self.poisson_rate) / self.fb_prob
                    self.logger.debug(
                        "get_next_event(): node failure event - old_lr = %.10f, update, lr = %.10f",
                        old_lr, self.lr)

                    # Update internal node state
                    self.nodes[fail_node_idx].fail_node(next_event_time)
                    for each_disk_on_failed_node in range(
                            fail_node_idx * self.disks_per_node,
                            (fail_node_idx + 1) * self.disks_per_node):
                        self.disks[each_disk_on_failed_node].fail_disk(
                            next_event_time)

                    # Schedule repair for the failed node
                    self.set_node_repair(fail_node_idx, next_event_time)

                    return (next_event_time, Node.EVENT_NODE_FAIL,
                            fail_node_idx)

    ##
    # Run an iteration in UnifBFBSimulation
    #
    def run_iteration(self, ite=0):
        self.reset()
        curr_time = 0
        self.logger.info(
            "UnifBFBSimulator: begin an iteration %d, num_failed_disks = %d, "
            "avail_cross_rack_bwth = %d" %
            (ite, len(self.state.get_failed_disks()),
             self.network.get_avail_cross_rack_repair_bwth()))

        while True:
            (event_time, event_type,
             subsystem_idx) = self.get_next_event(curr_time)
            curr_time = event_time

            if event_time > self.mission_time:
                break

            if event_type is not None:
                self.logger.debug(
                    "Time: %.3f, event = %s, subsystem = %d, "
                    "number_failed_disks = %d, number_failed_nodes = %d" %
                    (event_time, event_type, subsystem_idx,
                     self.state.get_num_failed_disks(),
                     self.state.get_num_failed_nodes()))

                if not self.state.update_state_unifbfb(event_type,
                                                       subsystem_idx):
                    self.logger.error('update_state_unifbfb() failed!')

            # Check durability when disk failure or node failure happens
            if event_type == Disk.EVENT_DISK_FAIL or event_type == Node.EVENT_NODE_FAIL:
                failed_disks = self.state.get_failed_disks()
                if self.placement.check_data_loss(failed_disks):
                    self.logger.debug(
                        "===== END of one iteration, self.lr = %.10f",
                        min(self.lr, 1))
                    (num_failed_stripes, num_lost_chunks
                     ) = self.placement.get_num_failed_status(failed_disks)
                    self.logger.info("avg_failure_rate = %.6f" %
                                     (self.total_failure_rate /
                                      self.total_failrue_rate_cnt))
                    self.logger.info(
                        "avg_repair_rate = %.6f" %
                        (self.total_repair_rate / self.total_repair_rate_cnt))
                    return (min(self.lr, 1), "(%d, %d, 0, 0)" %
                            (num_failed_stripes, num_lost_chunks))

        # No data loss
        self.logger.debug(
            "END of one iteration, self.lr = 0 because no data loss")
        return (0, "(0, 0, 0, 0)")
Example No. 19
                    print "# ble", index, "pin", pin, ":", selection, "(", subblock[
                        pin], ")"
                    inputs[index * self.bitgen.inputs + pin - 1] = selection

        self.bitgen.gen_lb(inputs, functions, flops)


if __name__ == '__main__':
    import sys

    if len(sys.argv) != 6:
        sys.stderr.write(
            "usage: {:s} <placement.out> <routing.out> <netlist.net> <logic.blif> <tracks>\n"
            .format(sys.argv[0]))
        sys.exit(1)

    placement = Placement(sys.argv[1])
    routing = Routing(sys.argv[2])
    netlist = NET(sys.argv[3])
    blif = BLIF(sys.argv[4])
    tracks = int(sys.argv[5]) / 2

    bitgen = Bitgen(cluster_size=4,
                    ble_inputs=6,
                    lb_inputs_per_side=4,
                    tracks_per_direction=tracks,
                    mux_size=5)

    fpga = FPGA(placement, routing, netlist, blif, bitgen)
    fpga.generate()
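A hypothetical invocation, assuming this script is saved as bitgen.py (the name is a placeholder) and given VPR-style output files; the last argument is the total number of routing tracks, which the script halves into tracks per direction:

python bitgen.py placement.out routing.out netlist.net logic.blif 12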
Example No. 20
    def reset(self, ite=0):
        # Generate node transient and permanent failure events from trace
        if self.use_trace:
            for i in xrange(self.num_nodes):
                self.nodes[i] = Node(None, None, None,
                                     Trace(self.trace_id, i, 'p'),
                                     Trace(self.trace_id, i, 't'),
                                     Trace(self.trace_id, i, 'r'))

        self.state = State(self.num_disks)

        for disk in self.disks:
            disk.init_clock(0)
            disk.init_state()
        for node in self.nodes:
            node.init_state()
        for rack in self.racks:
            rack.init_state()

        self.events_queue = []
        self.wait_repair_queue = []
        self.delayed_repair_dict = dict()

        # generate disk failures and put them into events_queue
        for disk_id in xrange(len(self.disks)):
            disk_fail_time = self.disk_fail_dists.draw()
            if disk_fail_time <= self.mission_time:
                self.events_queue.append(
                    (disk_fail_time, Disk.EVENT_DISK_FAIL, disk_id))
        # generate node failures and push them into events_queue
        for node_id in xrange(self.num_nodes):
            if not self.use_trace:
                self.events_queue.append((self.node_fail_dists.draw(),
                                          Node.EVENT_NODE_FAIL, node_id))
                if self.enable_transient_failures:
                    self.events_queue.append(
                        (self.node_transient_fail_dists.draw(),
                         Node.EVENT_NODE_TRANSIENT_FAIL, node_id))
            else:
                node = self.nodes[node_id]
                # push each traced node failure event to events_queue
                for node_failure_time in node.node_fail_trace.get_trace_ls():
                    self.events_queue.append(
                        (node_failure_time, Node.EVENT_NODE_FAIL, node_id))
                node_transient_failure_ls = \
                    node.node_transient_fail_trace.get_trace_ls()
                node_transient_repair_ls = \
                    node.node_transient_repair_trace.get_trace_ls()
                for ls_idx in xrange(len(node_transient_failure_ls)):
                    node_transient_failure_time = node_transient_failure_ls[ls_idx]
                    node_transient_repair_time = node_transient_repair_ls[ls_idx]
                    self.events_queue.append(
                        (node_transient_failure_time,
                         Node.EVENT_NODE_TRANSIENT_FAIL, node_id))
                    self.events_queue.append(
                        (node_transient_failure_time +
                         node_transient_repair_time,
                         Node.EVENT_NODE_TRANSIENT_REPAIR, node_id))

        # generate rack failures and push them into events_queue
        if not self.use_power_outage and self.enable_transient_failures:
            for rack_id in xrange(len(self.racks)):
                self.events_queue.append((self.rack_fail_dists.draw(),
                                          Rack.EVENT_RACK_FAIL, rack_id))

        # correlated failures caused by power outage
        if (not self.use_trace) and self.use_power_outage:
            for rack_id in xrange(self.num_racks):
                occur_time = self.power_outage_dist.draw()
                while occur_time < self.mission_time:
                    self.events_queue.append(
                        (occur_time, Rack.EVENT_RACK_FAIL, rack_id))
                    occur_time += random.expovariate(
                        (1 / float(self.power_outage_duration)))
                    self.events_queue.append(
                        (occur_time, Rack.EVENT_RACK_REPAIR, rack_id))
                    for i in xrange(self.nodes_per_rack):
                        # draw a bernoulli distribution
                        if nprandom.binomial(1, 0.01):
                            self.events_queue.append(
                                (occur_time, Node.EVENT_NODE_FAIL,
                                 (self.nodes_per_rack * rack_id + i)))
                    occur_time += self.power_outage_dist.draw()

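        # Appending all pre-generated events and heapifying once is O(n),
        # versus O(n log n) for pushing each event onto the heap separately.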
        heapify(self.events_queue)
        self.placement = Placement(self.num_racks, self.nodes_per_rack,
                                   self.disks_per_node, self.capacity_per_disk,
                                   self.num_stripes, self.chunk_size,
                                   self.code_type, self.n, self.k,
                                   self.place_type, self.chunk_rack_config,
                                   self.l)

        self.network = Network(self.num_racks, self.nodes_per_rack,
                               self.network_setting)

        self.num_stripes_repaired = 0
        self.num_stripes_repaired_single_chunk = 0
        self.num_stripes_delayed = 0
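The queues in these examples store events as (time, event_type, subsystem_id) tuples, so heappop() always yields the earliest event and ties break on the event-type field. A small standalone illustration of this convention, with placeholder event codes:

from heapq import heapify, heappop

events = [(7.5, 'DISK_FAIL', 3), (2.1, 'NODE_FAIL', 0), (2.1, 'DISK_FAIL', 1)]
heapify(events)
print heappop(events)  # (2.1, 'DISK_FAIL', 1): earliest time, tie broken by type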
Example No. 21
class Simulate:
    def __init__(self, mission_time, plus_one, num_servers,
                 num_disks_per_server, num_spares_per_server, k, m, fb,
                 dp_type, failure_type, mtbf, failure_percent, rebuildIO,
                 slaTime, copybackIO, diskCap, useRatio):
        #---------------------------
        # compressed time window
        #---------------------------
        self.mission_time = mission_time
        #---------------------------
        # system and placement
        #---------------------------
        self.sys = Campaign(plus_one, num_servers, num_disks_per_server,
                            num_spares_per_server, k, m, fb, dp_type, diskCap,
                            useRatio)
        self.place = Placement(self.sys)
        #--------------------------------------
        # fast rebuild + copyback phases
        #--------------------------------------
        self.rebuild = Rebuild(self.sys, rebuildIO)
        self.copyback = Copyback(copybackIO, slaTime)
        #--------------------------------------
        # failures distribution and mtbf
        #--------------------------------------
        self.mtbf = mtbf
        self.failure_type = failure_type
        self.failure_percent = failure_percent

    def reset(self):
        #----------------------------------------------
        # failures arrive by using poisson distribution
        #----------------------------------------------
        if self.failure_type == 0:
            trace = Poisson(self.sys.num_disks, self.failure_percent,
                            self.mtbf)
        elif self.failure_type == 1:
            trace = Exponential(self.sys.num_disks, self.failure_percent,
                                self.mtbf)
        elif self.failure_type == 2:
            trace = Batch(self.sys.num_disks,
                          self.failure_percent,
                          self.mtbf,
                          cascade_factor=10.0)
        else:
            raise ValueError("unknown failure_type: %d" % self.failure_type)
        self.trace_entry = trace.generate_failures()
        #------------------------------------------
        # put the disk failures in the event queue
        #------------------------------------------
        self.events_queue = []
        for disk_fail_time, diskId in self.trace_entry:
            heappush(self.events_queue,
                     (disk_fail_time, Disk.EVENT_FAIL, diskId))
            print ">>>>> reset disk", diskId, Disk.EVENT_FAIL, "@", disk_fail_time
            self.mission_time = disk_fail_time
        print " - system mission time - ", self.mission_time
        #------------------------------
        # initialize the system state
        #------------------------------
        self.state = State(self.sys, self.rebuild, self.copyback,
                           self.events_queue)

    def get_next_wait_events(self):
        events = []
        #---------------------------------------------------------------------------------------
        if self.sys.dp_type in (0, 1, 2):
            #---------------------------------------------------------------------------------------
            for serverId in self.sys.servers:
                wait_queue = self.state.servers[serverId].wait_queue
                if wait_queue:
                    avail_spares = self.state.servers[serverId].avail_spares
                    while avail_spares and wait_queue:
                        print "\n@wait_queue in server [", serverId, "] avail spares:", avail_spares
                        deviceset = []
                        next_event = heappop(wait_queue)
                        #------------------------------------------
                        next_event_time = next_event[0]
                        next_event_type = next_event[1]
                        deviceset.append(next_event[2])
                        avail_spares -= 1
                        #-----------------------------------------------
                        # gather simultaneous events of the same type,
                        # as long as spare devices remain on this server
                        #-----------------------------------------------
                        while wait_queue and avail_spares > 0 and \
                                wait_queue[0][0] == next_event_time and \
                                wait_queue[0][1] == next_event_type:
                            simultaneous_event = heappop(wait_queue)
                            deviceset.append(simultaneous_event[2])
                            avail_spares -= 1
                        print ">>>>> pop server wait disk", deviceset, next_event_type, " - time - ", next_event_time
                        events.append(
                            (next_event_time, next_event_type, deviceset))
        # Return at function level so unsupported dp_types yield an empty
        # list instead of None.
        return events

    def get_next_events(self):
        #--------------------------------------------------------------
        wait_events = self.get_next_wait_events()
        if wait_events:
            return wait_events
        #--------------------------------------------------------------
        if self.events_queue:
            deviceset = []
            next_event = heappop(self.events_queue)
            #------------------------------------------
            next_event_time = next_event[0]
            next_event_type = next_event[1]
            deviceset.append(next_event[2])
            #----------------------------------------------
            # gather the simultaneous failure/repair events
            #----------------------------------------------
            while self.events_queue and \
                    self.events_queue[0][0] == next_event_time and \
                    self.events_queue[0][1] == next_event_type:
                simultaneous_event = heappop(self.events_queue)
                deviceset.append(simultaneous_event[2])
            print "\n\n>>>>> pop next event -", deviceset, next_event_type, next_event_time
            return [(next_event_time, next_event_type, deviceset)]
        else:
            return [(None, None, None)]

    def run_simulation(self, iterations_per_worker, traces_per_worker):
        results = []
        for one_iter in range(iterations_per_worker):
            results.append(self.run_iteration(one_iter))
        return results

    def run_iteration(self, num_iter):
        self.reset()
        curr_time = 0
        loss = 0
        loopflag = True
        eventDL = 0
        while loopflag:
            for each_event in self.get_next_events():
                (event_time, event_type, deviceset) = each_event
                #-----------------------------
                # if invalid event, then exit
                #-----------------------------
                if event_time is None:
                    loopflag = False
                    break
                #----------------------------------
                # update the system time and state
                #----------------------------------
                if curr_time < event_time:
                    curr_time = event_time
                #---------------------------
                # exceed mission-time, exit
                #---------------------------
                if curr_time > self.mission_time:
                    loopflag = False
                    loss = self.place.calculate_dataloss(self.state)
                    break
                #----------------------------------
                self.state.update_clock(event_type, curr_time)
                self.state.update_state(event_type, deviceset)
                self.state.update_event(event_type, deviceset)
                #-------------------------------------------------------
                # degraded rebuild or copyback event, continue
                #-------------------------------------------------------
                if event_type == Disk.EVENT_DEGRADEDREBUILD or event_type == Disk.EVENT_COPYBACK:
                    continue
                #------------------------------------------
                # check the PDL according to failure events
                #------------------------------------------
                if event_type == Disk.EVENT_FAIL:
                    eventDL = eventDL + 1
                    if self.place.check_global_dataloss(self.state, deviceset):
                        print "############### data loss ##############", eventDL, "deviceset", deviceset, curr_time, ">>> unrecoverables - ", self.state.MTTDL, "\n"
        return (self.state.MTTDL, loss)
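A hypothetical driver for this example (every constructor argument below is a placeholder; the original listing does not show how Simulate is configured):

sim = Simulate(mission_time=87600, plus_one=1, num_servers=4,
               num_disks_per_server=30, num_spares_per_server=2,
               k=8, m=2, fb=2, dp_type=0, failure_type=1, mtbf=1.2e6,
               failure_percent=0.02, rebuildIO=100, slaTime=24,
               copybackIO=50, diskCap=4000, useRatio=0.8)
results = sim.run_simulation(iterations_per_worker=100, traces_per_worker=1)
# Each entry in results is an (MTTDL, loss) pair from one iteration.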