Пример #1
0
    def printAll(self):
        """Log the default configuration, plus placement/hierarchy extras
        and any recovery-specific settings on a second line."""
        parts = [
            "Default Configurations: \t total_time: ", str(self.total_time),
            ", disk capacity: ", str(self.disk_capacity), "TB",
            ", disks per machine: ", str(self.disks_per_machine),
            ", machines per rack: ", str(self.machines_per_rack),
            ", rack count: ", str(self.rack_count),
            ", chunk size: ", str(self.chunk_size), "MB",
            ", total active storage: ", str(self.total_active_storage), "PB",
            ", data redundancy: ", self.data_redundancy,
            ", data placement: ", self.data_placement,
            ", recovery bandwidth cross rack: ", str(self.recovery_bandwidth_cross_rack),
            ", installment size: ", str(self.installment_size),
            ", event file path: ", self.event_file,
            ", outputs: ", str(self.outputs),
            ", auto repair: ", str(self.auto_repair),
            ", hierarchical: ", str(self.hierarchical),
            ", parallel repair: ", str(self.parallel_repair),
            ", lazy recovery flag: ", str(self.lazy_recovery),
            ", lazy only available: ", str(self.lazy_only_available),
            ", recovery threshold: ", str(self.recovery_threshold),
            ", rafi recovery flag: ", str(self.rafi_recovery),
        ]

        # "copyset" placement and hierarchical mode each carry an extra knob.
        if self.data_placement == "copyset":
            parts += [", scatter width: ", str(self.scatter_width)]
        if self.hierarchical:
            parts += [", distinct racks: ", str(self.distinct_racks)]

        info_logger.info("".join(parts))

        # Recovery settings go on their own log line (empty when only
        # lazy recovery is enabled, matching long-standing behavior).
        recovery_parts = []
        if self.rafi_recovery:
            recovery_parts.append(" detect intervals: " + str(self.detect_intervals))
        if self.lazy_recovery or self.rafi_recovery:
            info_logger.info("".join(recovery_parts))
Пример #2
0
    def run(self):
        conf = Configuration(self.conf_path)
        xml = XMLParser(conf)
        if conf.hier:
            self.distributer = HierSSSDistribute(xml)
        else:
            self.distributer = SSSDistribute(xml)
        self.conf = self.distributer.returnConf()

        self.event_handler = EventHandler
        self.distributer.start()
        events_handled = 0
        events = EventQueue()

        if self.conf.system_upgrade:
            for info in self.conf.system_upgrade_infos:
                if info[0] == 1:
                    upgrade_start_times = self.addSystemUpgrade(info, self.conf.total_time)
                    if info[-1] is not None:
                        self.addUpgradeCheckEvents(events, upgrade_start_times, info[-1])
        if self.conf.correlated_failures:
            for info in self.conf.correlated_failures_infos:
                for i in xrange(10):
                    cf_info = deepcopy(list(info))
                    cf_info[0] += i * 8760
                    print "correlated_failures info:", cf_info
                    self.addCorrelatedFailures(cf_info)
        if self.conf.system_scaling:
            for info in self.conf.system_scaling_infos:
                self.addSystemScaling(info)

        info_logger.info("disk usage is: " + str(self.distributer.diskUsage()*100) + "%\n")
        self.distributer.getRoot().printAll()

        root = self.distributer.getRoot()
        root.generateEvents(events, 0, self.conf.total_time, True)
        for ts in self.conf.upgrade_ts:
            full_system_check_event = Event(Event.EventType.UpgradeCheck, ts, root, 6)
            events.addEvent(full_system_check_event)

        if self.conf.event_file != None:
            events_file = self.conf.event_file + '-' + self.ts
            events.printAll(events_file, "Iteration number: "+str(self.iteration_times))
        self.iteration_times += 1

        handler = self.event_handler(self.distributer)

        print "total slices:", handler.total_slices
        e = events.removeFirst()
        while e is not None:
            handler.handleEvent(e, events)
            e = events.removeFirst()
            events_handled += 1

        self.total_events_handled += events_handled

        result = handler.end()
        info_logger.info(result.toString())
        return result
Пример #3
0
    def handleLatentDefect(self, u, time, e):
        """Apply a latent sector error (LSE) to one randomly chosen slice
        stored on disk *u*, updating durability/availability bookkeeping.

        Raises for any unit that is not a Disk; early-returns (without
        touching the degraded-stat lists) when the disk is empty, the hit
        slice is out of range, already lost, or already corrupted/latent.
        """
        current_total_slices = self.calCurrentTotalSlices(time)

        # Latent defects are only meaningful at disk granularity.
        if not isinstance(u, Disk):
            raise Exception("Latent defect should only happen for disk")

        slice_count = len(u.getChildren())
        if slice_count == 0:
            return
        self._my_assert(slice_count > 10)

        slice_index = choice(u.getChildren())
        if slice_index >= current_total_slices:
            return

        if self.status[slice_index] == self.lost_slice:
            self.total_skipped_latent += 1
            return

        repairable_before = self.isRepairable(slice_index)

        index = self.slice_locations[slice_index].index(u)
        # A LSE cannot hit lost blocks or a same block multiple times
        if self.status[slice_index][index] in (UnitState.Corrupted,
                                               UnitState.LatentError):
            self.total_skipped_latent += 1
            return

        self._my_assert(self.durableCount(slice_index) >= 0)
        self.sliceDegraded(slice_index)

        # Record the hit unit and account for the latent failure.
        self.status[slice_index][index] = UnitState.LatentError
        u.slices_hit_by_LSE.append(slice_index)
        self.total_latent_failures += 1

        # Transitioning repairable -> unrepairable opens an unavailability window.
        if repairable_before and not self.isRepairable(slice_index):
            self.unavailable_slice_count += 1
            self.startUnavailable(slice_index, time)

        if self.isLost(slice_index):
            info_logger.info(
                str(time) + " slice: " + str(slice_index) + " durCount: " +
                str(self.durableCount(slice_index)) + " latDefect " +
                str(True) + "  due to ===latent=== error " + " on disk " +
                str(u.getID()))
            self.undurable_slice_count += 1
            self.endUnavailable(slice_index, time)
            self.status[slice_index] = self.lost_slice

        # Only reached when the LSE was actually applied (no early return).
        self.slices_degraded_list.append(
            (e.getTime(), self.current_slice_degraded))
        self.slices_degraded_avail_list.append(
            (e.getTime(), self.current_avail_slice_degraded))
Пример #4
0
 def printPerYearStart(self, per_day_start, description):
     """Log the yearly average of a per-day counter series.

     Index 0 of *per_day_start* is skipped (days appear to be 1-based).
     After each complete 365-day year the accumulated sum is averaged and
     logged; whatever remains after the last full year is logged at the end.
     NOTE(review): the trailing remainder is also divided by 365 rather than
     by the number of leftover days -- confirm this annualisation is intended.
     """
     year = 365
     running = 0
     for day, value in enumerate(per_day_start[1:], 1):
         running += value
         if day % year == 0:
             running /= 365
             info_logger.info(description + " " + str(day / year) + " " +
                              str(running))
             running = 0
     info_logger.info(description + " " + str(len(per_day_start) / year) +
                      " " + str(running / 365))
Пример #5
0
    def end(self):
        """Finalize the simulation: compute summary probabilities, publish
        them, log the aggregate counters, and return a Result.

        NOTE(review): the metrics are written to *class* attributes of
        Result rather than to the returned instance ``ret`` -- presumably
        Result exposes them class-wide; confirm against Result's definition.
        """
        ret = Result()

        # data loss probability and data unvailable probability
        data_loss_prob = format(
            float(self.undurable_slice_count) / self.total_slices, ".4e")
        unavailable_prob = self.calUnavailProb()

        Result.undurable_count = self.undurable_slice_count
        Result.unavailable_durations = self.unavailable_durations
        Result.data_loss_prob = data_loss_prob
        Result.unavailable_prob = unavailable_prob
        # repair bandwidth in GBs
        Result.total_repair_transfers = format(
            float(self.total_repair_transfers) / 1024, ".4e")

        # Single consolidated counter dump.  The backslash continuations are
        # INSIDE the string literal, so the next line's leading spaces are
        # part of the logged output -- do not re-indent this literal.
        info_logger.info(
            "anomalous available count: %d, total latent failure: %d,\
             total scrubs: %d, total scrubs repairs: %d, \
             total disk failures:%d, total disk repairs:%d, \
             total machine failures:%d, total machine repairs:%d, \
             total permanent machine failures:%d, \
             total short temperary machine failures:%d, \
             total long temperary machine failures:%d, \
             total machine failures due to rack failures:%d, \
             total eager machine repairs:%d, total eager slice repairs:%d, \
             total skipped latent:%d, total incomplete recovery:%d\n \
             max recovery bandwidth:%f\n \
             undurable_slice_count:%d\n \
             total repairs:%d, total optimal repairs:%d" %
            (self.anomalous_available_count, self.total_latent_failures,
             self.total_scrubs, self.total_scrub_repairs,
             self.total_disk_failures, self.total_disk_repairs,
             self.total_machine_failures, self.total_machine_repairs,
             self.total_perm_machine_failures,
             self.total_short_temp_machine_failures,
             self.total_long_temp_machine_failures,
             self.total_machine_failures_due_to_rack_failures,
             self.total_eager_machine_repairs, self.total_eager_slice_repairs,
             self.total_skipped_latent,
             self.total_incomplete_recovery_attempts,
             self.max_recovery_bandwidth, self.undurable_slice_count,
             self.total_repairs, self.total_optimal_repairs))

        # Per-period degraded-slice statistics (durability and availability).
        self.printDegradedStat(self.slices_degraded_list,
                               "Avg_durable_degraded_", "slices")
        self.printDegradedStat(self.slices_degraded_avail_list,
                               "Avg_available_degraded_", "slices")

        self.analyzeBandwidth()

        return ret
Пример #6
0
    def handleLatentDefect(self, u, time, e):
        """Apply a latent sector error (LSE) to one randomly chosen slice
        stored on disk *u*.

        Status codes: -1 and -2 appear to mean corrupted and latent-error
        respectively (matching how this method writes them) -- confirm
        against the file's status constants.
        """
        # Latent defects are only meaningful at disk granularity.
        if not isinstance(u, Disk):
            raise Exception("Latent defect should only happen for disk")

        slice_count = len(u.getChildren())
        if slice_count == 0:
            return
        self._my_assert(slice_count > 10)

        slice_index = choice(u.getChildren())
        if slice_index >= self.total_slices:
            return

        if self.status[slice_index] == self.lost_slice:
            self.total_skipped_latent += 1
            return

        repairable_before = self.isRepairable(slice_index)

        index = self.slice_locations[slice_index].index(u)
        # A LSE cannot hit lost blocks or a same block multiple times
        if self.status[slice_index][index] in (-1, -2):
            self.total_skipped_latent += 1
            return

        self._my_assert(self.durableCount(slice_index) >= 0)
        self.sliceDegraded(slice_index)

        # Mark the hit unit latent-errored and account for it.
        self.status[slice_index][index] = -2
        u.slices_hit_by_LSE.append(slice_index)
        self.total_latent_failures += 1

        # Transitioning repairable -> unrepairable opens an unavailability
        # window: record its start timestamp for this slice.
        if repairable_before and not self.isRepairable(slice_index):
            self.unavailable_slice_count += 1
            self.unavailable_slice_durations.setdefault(
                slice_index, []).append([time])

        if self.isLost(slice_index):
            info_logger.info(
                str(time) + " slice: " + str(slice_index) + " durCount: " +
                str(self.durableCount(slice_index)) + " latDefect " +
                str(True) + "  due to ===latent=== error " + " on disk " +
                str(u.getID()))
            self.undurable_slice_count += 1
            self.undurable_slice_infos.append(
                (slice_index, time, "LSE " + str(u.getID())))
            self.status[slice_index] = self.lost_slice
Пример #7
0
    def printAll(self):
        """Log the default configuration, then any upgrade and
        correlated-failure settings on separate lines."""
        parts = [
            "Default Configurations: \t total_time: ", str(self.total_time),
            ", disk capacity: ", str(self.disk_capacity), "TB",
            ", disks per machine: ", str(self.disks_per_machine),
            ", machines per rack: ", str(self.machines_per_rack),
            ", rack count: ", str(self.rack_count),
            ", chunk size: ", str(self.chunk_size), "MB",
            ", total active storage: ", str(self.total_active_storage), "PB",
            ", data redundancy: ", str(self.data_redundancy),
            ", hierarchical:", str(self.hier),
            ", recovery bandwidth cross rack: ", str(self.recovery_bandwidth_cross_rack),
            ", xml file path: ", self.xml_file_path,
            ", event file path: ", self.event_file,
            ", parallel repair: ", str(self.parallel_repair),
            ", upgrade flag: ", str(self.upgrades),
            ", correlated failures flag: ", str(self.correlated_failures),
        ]
        info_logger.info("".join(parts))

        # Hard and soft upgrade specs are logged as two separate lines.
        if self.upgrades:
            info_logger.info("Upgrade Configurations: " + str(self.hard_upgrade_infos))
            info_logger.info("Upgrade Configurations: " + str(self.soft_upgrade_infos))

        if self.correlated_failures:
            info_logger.info("Correlated Failures Configurations: " + str(self.correlated_failures_infos))
Пример #8
0
    def printAll(self):
        """Log the default configuration, then scaling / upgrade /
        correlated-failure settings when the corresponding flags are set."""
        # %s applies str() to each value, so the output matches the old
        # concatenation form byte-for-byte.
        default_infos = (
            "Default Configurations: \t total_time: %s"
            ", disk capacity: %sTB"
            ", disks per machine: %s"
            ", machines per rack: %s"
            ", rack count: %s"
            ", chunk size: %sMB"
            ", total active storage: %sPB"
            ", data redundancy: %s"
            ", hierarchical:%s"
            ", recovery bandwidth cross rack: %s"
            ", xml file path: %s"
            ", event file path: %s"
            ", outputs: %s"
            ", parallel repair: %s"
            ", system Scaling flag: %s"
            ", system upgrade flag: %s"
            ", correlated failures flag: %s"
        ) % (self.total_time, self.disk_capacity, self.disks_per_machine,
             self.machines_per_rack, self.rack_count, self.chunk_size,
             self.total_active_storage, self.data_redundancy, self.hier,
             self.recovery_bandwidth_cross_rack, self.xml_file_path,
             self.event_file, self.outputs, self.parallel_repair,
             self.system_scaling, self.system_upgrade,
             self.correlated_failures)

        info_logger.info(default_infos)

        if self.system_scaling:
            info_logger.info("System scaling Configurations: " +
                             str(self.system_scaling_infos))

        if self.system_upgrade:
            info_logger.info("System upgrade Configurations: " +
                             str(self.system_upgrade_infos))

        if self.correlated_failures:
            info_logger.info("Correlated Failures Configurations: " +
                             str(self.correlated_failures_infos))
Пример #9
0
    def run(self):
        conf = Configuration(self.conf_path)
        xml = XMLParser(conf)
        distributer_class = returnDistributer(conf.data_placement,
                                              conf.hierarchical)
        self.distributer = distributer_class(xml)
        self.conf = self.distributer.returnConf()

        if self.conf.rafi_recovery:
            self.event_handler = RAFIEventHandler
        else:
            self.event_handler = EventHandler
        self.distributer.start()
        # self.distributer.printGroupsToFile()

        info_logger.info("disk usage is: " +
                         str(self.distributer.diskUsage() * 100) + "%\n")
        self.distributer.getRoot().printAll()

        events_handled = 0
        events = EventQueue()

        root = self.distributer.getRoot()
        root.generateEvents(events, 0, self.conf.total_time, True)

        # if False:
        if self.conf.event_file != None:
            events_file = self.conf.event_file + '-' + self.ts
            events.printAll(events_file,
                            "Iteration number: " + str(self.iteration_times))
        self.iteration_times += 1

        handler = self.event_handler(self.distributer)

        print "total slices:", handler.total_slices
        e = events.removeFirst()
        while e is not None:
            handler.handleEvent(e, events)
            e = events.removeFirst()
            events_handled += 1

        self.total_events_handled += events_handled

        result = handler.end()
        info_logger.info(result.toString())
        return result
Пример #10
0
    def main(self, conf_path):
        events_handled = 0
        ge = GenerateEvents(conf_path)
        distributer = ge.getDistributer()
        events = ge.main()
        handler = NormalDistributeEventHandler(distributer)

        print "total slices:", handler.total_slices
        e = events.removeFirst()
        while e is not None:
            handler.handleEvent(e, events)
            e = events.removeFirst()
            events_handled += 1

        result = handler.end()
        info_logger.info(result.toString())
        info_logger.info("Events handled: %d" % events_handled)
Пример #11
0
    def handleFailure(self, u, time, e, queue):
        """Handle a failure event on unit *u* at simulation *time*.

        - Machine: count the failure by type (e.info: 3 permanent, 1 short
          temporary, 2 long temporary, otherwise rack-induced); for
          non-permanent failures mark affected slice units Crashed, track
          availability transitions, and feed slices into RAFI detection.
        - Disk: mark affected units Corrupted, detect lost slices, and add
          the projected recovery bandwidth over the repair window.
        - Any other unit: recurse into its children.

        Bug fix: the two RAFI scheduling loops below previously indexed
        ``self.detect_intervals`` with the stale ``unavailable`` value left
        over from the preceding grouping loop, so every group received the
        detect interval of whichever slice happened to be bucketed last.
        Group i holds slices with i+1 unavailable units, so the correct
        interval is ``detect_intervals[i]``.
        """
        if e.ignore:
            return

        UnfinishRAFIEvents.queue = queue
        # slice_index -> FailedSlice entering a RAFI event for the first time
        outtoin_slices = {}
        # slice_index -> FailedSlice escalating to a higher-risk RAFI event
        intoin_slices = {}

        current_total_slices = self.calCurrentTotalSlices(time)
        if isinstance(u, Machine):
            self.total_machine_failures += 1
            u.setLastFailureTime(e.getTime())

            if e.info == 3:
                # Permanent machine failure: only counted here; the slice
                # damage is presumably accounted by the inherited per-disk
                # failure events (see the inherit_lost check in the Disk
                # branch) -- confirm against the event generator.
                self.total_perm_machine_failures += 1
            else:
                if e.info == 1:
                    self.total_short_temp_machine_failures += 1
                elif e.info == 2:
                    self.total_long_temp_machine_failures += 1
                else:
                    self.total_machine_failures_due_to_rack_failures += 1
                    # Classify by outage length relative to the fail timeout.
                    if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                        self.total_short_temp_machine_failures += 1
                    else:
                        self.total_long_temp_machine_failures += 1

                disks = u.getChildren()
                for child in disks:
                    slice_indexes = child.getChildren()
                    for slice_index in slice_indexes:
                        if slice_index >= current_total_slices:
                            continue
                        if self.status[slice_index] == self.lost_slice:
                            continue
                        self.sliceDegradedAvailability(slice_index)

                        repairable_before = self.isRepairable(slice_index)
                        index = self.slice_locations[slice_index].index(child)
                        # A temporary machine failure only crashes healthy units.
                        if self.status[slice_index][index] == UnitState.Normal:
                            self.status[slice_index][index] = UnitState.Crashed
                        self._my_assert(self.availableCount(slice_index) >= 0)

                        repairable_current = self.isRepairable(slice_index)
                        if repairable_before and not repairable_current:
                            self.unavailable_slice_count += 1
                            self.startUnavailable(slice_index, time)

                        # rafi start
                        unavailable = self.n - self.availableCount(slice_index)
                        fs = FailedSlice()
                        fs.addInfo(time, e.next_recovery_time)
                        self.failed_slices[slice_index] = fs

                        rafi_flag = fs.check(time)
                        # slice from not in rafi event to in a rafi event
                        if rafi_flag == FailedSlice.RAFITransition.OutToIn:
                            outtoin_slices[slice_index] = fs
                        # slice from lower risk rafi event to higher risk rafi event
                        elif rafi_flag == FailedSlice.RAFITransition.InToIn:
                            intoin_slices[slice_index] = fs
                        else:  # don't care other two situations
                            pass

                outtoin_slice_indexes = outtoin_slices.keys()
                intoin_slice_indexes = intoin_slices.keys()
                new_rafi_slices = []
                upgraded_rafi_slices = []
                # Only slices at or below the recovery threshold trigger RAFI.
                for slice_index in outtoin_slice_indexes:
                    if self.availableCount(slice_index) <= self.recovery_threshold:
                        new_rafi_slices.append(slice_index)
                for slice_index in intoin_slice_indexes:
                    if self.availableCount(slice_index) <= self.recovery_threshold:
                        upgraded_rafi_slices.append(slice_index)

                if new_rafi_slices != []:
                    # Bucket by failure count: groups_in_new[i] holds slices
                    # with i+1 unavailable units.
                    groups_in_new = [[] for i in xrange(self.n - self.k)]
                    for slice_index in new_rafi_slices:
                        unavailable = outtoin_slices[slice_index].failedNum()
                        groups_in_new[unavailable-1].append(slice_index)
                    for group_index, group in enumerate(groups_in_new):
                        if group != []:
                            # timestamp of data starts to recover(ts+detect time+identify time)
                            # FIX: use the group's own index, not the stale
                            # loop variable `unavailable`.
                            recover_time = ceil(time/self.node_state_check)*self.node_state_check + \
                                           self.detect_intervals[group_index]
                            self.unfinished_rafi_events.addEvent(group, recover_time)

                if upgraded_rafi_slices != []:
                    groups_in_upgraded = [[] for i in xrange(self.n - self.k)]
                    for slice_index in upgraded_rafi_slices:
                        unavailable = intoin_slices[slice_index].failedNum()
                        groups_in_upgraded[unavailable-1].append(slice_index)
                    for group_index, group in enumerate(groups_in_upgraded):
                        if group != []:
                            # FIX: same stale-index correction as above.
                            recover_time = ceil(time/self.node_state_check)*self.node_state_check + \
                                           self.detect_intervals[group_index]
                            self.unfinished_rafi_events.updateEvent(group, recover_time)

                self.slices_degraded_avail_list.append((e.getTime(), self.current_avail_slice_degraded))

        elif isinstance(u, Disk):
            self.total_disk_failures += 1
            u.setLastFailureTime(e.getTime())
            # need to compute projected reovery b/w needed
            projected_bandwidth_need = 0.0

            slice_indexes = u.getChildren()
            for slice_index in slice_indexes:
                if slice_index >= current_total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue

                self.sliceDegraded(slice_index)
                repairable_before = self.isRepairable(slice_index)

                index = self.slice_locations[slice_index].index(u)
                if self.status[slice_index][index] == UnitState.Corrupted:
                    continue
                self.status[slice_index][index] = UnitState.Corrupted

                self._my_assert(self.durableCount(slice_index) >= 0)

                repairable_current = self.isRepairable(slice_index)
                # exclude the disk lost caused by node lost, it has already considered in node lost
                if e.info != self.inherit_lost and repairable_before and not repairable_current:
                    self.unavailable_slice_count += 1
                    self.startUnavailable(slice_index, time)

                if self.isLost(slice_index):
                    info_logger.info(
                        "time: " + str(time) + " slice:" + str(slice_index) +
                        " durCount:" + str(self.durableCount(slice_index)) +
                        " due to disk " + str(u.getID()))
                    self.status[slice_index] = self.lost_slice
                    self.undurable_slice_count += 1
                    self.endUnavailable(slice_index, time)
                    continue

                # is this slice one that needs recovering? if so, how much
                # data to recover?
                if self.status[slice_index] != self.lost_slice:
                    threshold_crossed = False
                    num_undurable = self.n - self.durableCount(slice_index)
                    if num_undurable >= self.n - self.recovery_threshold:
                        threshold_crossed = True

                    num_unavailable = 0
                    if self.availability_counts_for_recovery:
                        num_unavailable = self.n - \
                            self.availableCount(slice_index)
                        if num_unavailable >= self.n - self.recovery_threshold:
                            threshold_crossed = True
                    if threshold_crossed:
                        # Repair reads k-1 peers plus rewrites every
                        # non-Normal unit of the slice.
                        projected_bandwidth_need += self.k - 1 + \
                            (self.n - self.status[slice_index].count(UnitState.Normal))

            # current recovery bandwidth goes up by projected bandwidth need
            projected_bandwidth_need /= (e.next_recovery_time -
                                         e.getTime())
            u.setLastBandwidthNeed(projected_bandwidth_need)
            self._my_assert(self.current_recovery_bandwidth >= 0)
            self.current_recovery_bandwidth += projected_bandwidth_need
            self._my_assert(self.current_recovery_bandwidth >= 0)
            if self.current_recovery_bandwidth > self.max_recovery_bandwidth:
                self.max_recovery_bandwidth = self.current_recovery_bandwidth
            self._my_assert(self.current_recovery_bandwidth >= 0)

            self.slices_degraded_list.append((e.getTime(),
                                              self.current_slice_degraded))
            self.slices_degraded_avail_list.append(
                (e.getTime(), self.current_avail_slice_degraded))

        else:
            # Rack or higher-level unit: propagate the failure to children.
            for child in u.getChildren():
                self.handleFailure(child, time, e, queue)
Пример #12
0
    def handleFailure(self, u, time, e, queue):
        if e.ignore:
            return

        UnfinishRAFIEvents.queue = queue
        outtoin_slices = {}
        intoin_slices = {}

        if isinstance(u, Machine):
            self.total_machine_failures += 1
            u.setLastFailureTime(e.getTime())

            if e.info == 3:
                self.total_perm_machine_failures += 1
            else:
                if e.info == 1:
                    self.total_short_temp_machine_failures += 1
                elif e.info == 2:
                    self.total_long_temp_machine_failures += 1
                else:
                    self.total_machine_failures_due_to_rack_failures += 1
                    if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                        self.total_short_temp_machine_failures += 1
                    else:
                        self.total_long_temp_machine_failures += 1

            disks = u.getChildren()
            for child in disks:
                slice_indexes = child.getChildren()
                for slice_index in slice_indexes:
                    if slice_index >= self.total_slices:
                        continue
                    if self.status[slice_index] == self.lost_slice:
                        continue

                    if e.info == 3:
                        self.sliceDegraded(slice_index)
                    else:
                        self.sliceDegradedAvailability(slice_index)

                    repairable_before = self.isRepairable(slice_index)
                    index = self.slice_locations[slice_index].index(child)
                    if self.status[slice_index][index] == -1:
                        continue
                    if e.info == 3:
                        self.status[slice_index][index] == -1
                        self._my_assert(self.durableCount(slice_index) >= 0)
                    else:
                        if self.status[slice_index][index] == 1:
                            self.status[slice_index][index] = 0
                        self._my_assert(self.availableCount(slice_index) >= 0)

                    repairable_current = self.isRepairable(slice_index)
                    if repairable_before and not repairable_current:
                        self.unavailable_slice_count += 1
                        if slice_index in self.unavailable_slice_durations.keys(
                        ):
                            self.unavailable_slice_durations[
                                slice_index].append([time])
                        else:
                            self.unavailable_slice_durations[slice_index] = [[
                                time
                            ]]

                    # rafi start
                    unavailable = self.n - self.availableCount(slice_index)
                    fs = FailedSlice()
                    fs.addInfo(time, e.next_recovery_time)
                    self.failed_slices[slice_index] = fs

                    rafi_flag = fs.check(time)
                    # slice from not in rafi event to in a rafi event
                    if rafi_flag == FailedSlice.RAFITransition.OutToIn:
                        outtoin_slices[slice_index] = fs
                    # slice from lower risk rafi event to higher risk rafi event
                    elif rafi_flag == FailedSlice.RAFITransition.InToIn:
                        intoin_slices[slice_index] = fs
                    else:  # don't care other two situations
                        pass

                    if e.info == 3:
                        # lost stripes have been recorded in unavailable_slice_durations
                        if self.isLost(slice_index):
                            info_logger.info(
                                "time: " + str(time) + " slice:" +
                                str(slice_index) + " durCount:" +
                                str(self.durableCount(slice_index)) +
                                " due to machine " + str(u.getID()))
                            self.status[slice_index] = self.lost_slice
                            self.undurable_slice_count += 1
                            self.undurable_slice_infos.append(
                                (slice_index, time,
                                 "machine " + str(u.getID())))
                            continue

            outtoin_slice_indexes = outtoin_slices.keys()
            intoin_slice_indexes = intoin_slices.keys()
            new_rafi_slices = []
            upgraded_rafi_slices = []
            for slice_index in outtoin_slice_indexes:
                if self.availableCount(slice_index) <= self.recovery_threshold:
                    new_rafi_slices.append(slice_index)
            for slice_index in intoin_slice_indexes:
                if self.availableCount(slice_index) <= self.recovery_threshold:
                    upgraded_rafi_slices.append(slice_index)

            if new_rafi_slices != []:
                groups_in_new = [[] for i in xrange(self.n - self.k)]
                for slice_index in new_rafi_slices:
                    unavailable = outtoin_slices[slice_index].failedNum()
                    groups_in_new[unavailable - 1].append(slice_index)
                for group in groups_in_new:
                    if group != []:
                        # timestamp of data starts to recover(ts+detect time)
                        recover_time = time + self.detect_intervals[unavailable
                                                                    - 1]
                        self.unfinished_rafi_events.addEvent(
                            group, recover_time)

            if upgraded_rafi_slices != []:
                groups_in_upgraded = [[] for i in xrange(self.n - self.k)]
                for slice_index in upgraded_rafi_slices:
                    unavailable = intoin_slices[slice_index].failedNum()
                    groups_in_upgraded[unavailable - 1].append(slice_index)
                for group in groups_in_upgraded:
                    if group != []:
                        recover_time = time + self.detect_intervals[unavailable
                                                                    - 1]
                        self.unfinished_rafi_events.updateEvent(
                            group, recover_time)
        elif isinstance(u, Disk):
            self.total_disk_failures += 1
            u.setLastFailureTime(e.getTime())
            # need to compute projected reovery b/w needed
            projected_bandwidth_need = 0.0

            slice_indexes = u.getChildren()
            for slice_index in slice_indexes:
                if slice_index >= self.total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue

                self.sliceDegraded(slice_index)
                repairable_before = self.isRepairable(slice_index)

                index = self.slice_locations[slice_index].index(u)
                if self.status[slice_index][index] == -1:
                    continue
                self.status[slice_index][index] = -1

                self._my_assert(self.durableCount(slice_index) >= 0)

                repairable_current = self.isRepairable(slice_index)
                if repairable_before and not repairable_current:
                    self.unavailable_slice_count += 1
                    if slice_index in self.unavailable_slice_durations.keys():
                        self.unavailable_slice_durations[slice_index].append(
                            [time])
                    else:
                        self.unavailable_slice_durations[slice_index] = [[
                            time
                        ]]

                if self.isLost(slice_index):
                    info_logger.info("time: " + str(time) + " slice:" +
                                     str(slice_index) + " durCount:" +
                                     str(self.durableCount(slice_index)) +
                                     " due to disk " + str(u.getID()))
                    self.status[slice_index] = self.lost_slice
                    self.undurable_slice_count += 1
                    self.undurable_slice_infos.append(
                        (slice_index, time, "disk " + str(u.getID())))
                    continue
        else:
            for child in u.getChildren():
                self.handleFailure(child, time, e, queue)
Пример #13
0
    def handleFailure(self, u, time, e, queue):
        """Record failure event ``e`` hitting unit ``u`` at simulation ``time``.

        Dispatches on the concrete unit type:

        * ``Machine`` -- bumps the machine-failure counters.  ``e.info``
          encodes the failure class (1 = short transient, 2 = long transient,
          3 = permanent; any other value = failure induced by a rack failure,
          further split into short/long by ``u.fail_timeout``).  For the
          non-permanent classes every slice stored on the machine's disks is
          marked Crashed (availability loss only), and slices that become
          unrepairable start an unavailability interval.
          NOTE(review): permanent failures (info == 3) do not update slice
          state in this branch -- presumably handled elsewhere; confirm.
        * ``Disk`` -- marks the disk's slice units Corrupted (durability
          loss), records permanently lost slices, and charges the projected
          repair bandwidth for slices that crossed the recovery threshold.
        * anything else (e.g. a rack or higher layer) -- recurses into the
          unit's children.

        ``queue`` is only passed through to the recursive calls.
        """
        if e.ignore:
            return

        # slices created after `time` do not exist yet and must be skipped
        current_total_slices = self.calCurrentTotalSlices(time)
        if isinstance(u, Machine):
            self.total_machine_failures += 1
            u.setLastFailureTime(e.getTime())

            if e.info == 3:
                self.total_perm_machine_failures += 1
            else:
                if e.info == 1:
                    self.total_short_temp_machine_failures += 1
                elif e.info == 2:
                    self.total_long_temp_machine_failures += 1
                else:
                    # machine went down because its rack failed; classify the
                    # outage length against the machine's failure timeout
                    self.total_machine_failures_due_to_rack_failures += 1
                    if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                        self.total_short_temp_machine_failures += 1
                    else:
                        self.total_long_temp_machine_failures += 1

                # transient machine failure: every slice unit on this
                # machine's disks becomes unavailable (but stays durable)
                disks = u.getChildren()
                for child in disks:
                    slice_indexes = child.getChildren()
                    for slice_index in slice_indexes:
                        if slice_index >= current_total_slices:
                            continue
                        if self.status[slice_index] == self.lost_slice:
                            continue
                        self.sliceDegradedAvailability(slice_index)

                        repairable_before = self.isRepairable(slice_index)
                        index = self.slice_locations[slice_index].index(child)
                        if self.status[slice_index][index] == UnitState.Normal:
                            self.status[slice_index][index] = UnitState.Crashed
                        self._my_assert(self.availableCount(slice_index) >= 0)

                        # the slice just crossed from repairable to
                        # unrepairable: open an unavailability interval
                        repairable_current = self.isRepairable(slice_index)
                        if repairable_before and not repairable_current:
                            self.unavailable_slice_count += 1
                            self.startUnavailable(slice_index, time)

                self.slices_degraded_avail_list.append(
                    (e.getTime(), self.current_avail_slice_degraded))

        elif isinstance(u, Disk):
            self.total_disk_failures += 1
            u.setLastFailureTime(e.getTime())
            # need to compute projected recovery b/w needed
            projected_bandwidth_need = 0.0

            slice_indexes = u.getChildren()
            for slice_index in slice_indexes:
                if slice_index >= current_total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue

                self.sliceDegraded(slice_index)
                repairable_before = self.isRepairable(slice_index)

                index = self.slice_locations[slice_index].index(u)
                if self.status[slice_index][index] == UnitState.Corrupted:
                    continue
                self.status[slice_index][index] = UnitState.Corrupted

                self._my_assert(self.durableCount(slice_index) >= 0)

                repairable_current = self.isRepairable(slice_index)
                # exclude the disk lost caused by node lost, it has already considered in node lost
                if e.info != self.inherit_lost and repairable_before and not repairable_current:
                    self.unavailable_slice_count += 1
                    self.startUnavailable(slice_index, time)

                if self.isLost(slice_index):
                    # too few durable units remain: the slice is permanently
                    # lost; close its unavailability interval and stop
                    # tracking it
                    info_logger.info("time: " + str(time) + " slice:" +
                                     str(slice_index) + " durCount:" +
                                     str(self.durableCount(slice_index)) +
                                     " due to disk " + str(u.getID()))
                    self.status[slice_index] = self.lost_slice
                    self.undurable_slice_count += 1
                    self.endUnavailable(slice_index, time)
                    continue

                # is this slice one that needs recovering? if so, how much
                # data to recover?
                if self.status[slice_index] != self.lost_slice:
                    threshold_crossed = False
                    num_undurable = self.n - self.durableCount(slice_index)
                    if num_undurable >= self.n - self.recovery_threshold:
                        threshold_crossed = True

                    num_unavailable = 0
                    if self.availability_counts_for_recovery:
                        # optionally count unavailable (not just corrupted)
                        # units toward the recovery-trigger threshold
                        num_unavailable = self.n - \
                            self.availableCount(slice_index)
                        if num_unavailable + num_undurable >= self.n - \
                           self.recovery_threshold:
                            threshold_crossed = True
                    if threshold_crossed:
                        # per-slice transfer estimate: presumably k-1 reads
                        # plus one write per non-Normal unit -- confirm
                        # against the repair model
                        projected_bandwidth_need += self.k - 1 + \
                            (self.n - self.status[slice_index].count(UnitState.Normal))

            # current recovery bandwidth goes up by projected bandwidth need,
            # spread over the repair window (transfer volume / duration)
            projected_bandwidth_need /= (e.next_recovery_time - e.getTime())
            u.setLastBandwidthNeed(projected_bandwidth_need)
            self._my_assert(self.current_recovery_bandwidth >= 0)
            self.current_recovery_bandwidth += projected_bandwidth_need
            self._my_assert(self.current_recovery_bandwidth >= 0)
            if self.current_recovery_bandwidth > self.max_recovery_bandwidth:
                self.max_recovery_bandwidth = self.current_recovery_bandwidth
            self._my_assert(self.current_recovery_bandwidth >= 0)

            self.slices_degraded_list.append(
                (e.getTime(), self.current_slice_degraded))
            self.slices_degraded_avail_list.append(
                (e.getTime(), self.current_avail_slice_degraded))

        else:
            # non-leaf unit (e.g. rack): propagate the failure to children
            for child in u.getChildren():
                self.handleFailure(child, time, e, queue)
Пример #14
0
    def printDegradedStat(self, degraded, description, unit):
        """Log windowed statistics for a degraded-slice time series.

        ``degraded`` is an iterable of ``(timestamp_hours, value)`` pairs in
        chronological order.  The series is resampled into per-minute slots
        over consecutive 24-hour windows; each window's mean becomes one
        entry of ``day_samples``.  The overall mean, sample standard
        deviation and maximum are written to ``info_logger``, then the
        per-year breakdown is delegated to ``printPerYearStart``.

        ``description`` and ``unit`` only label the log line.
        """
        current_sample_average = 0
        current_time = 0

        # one sampling window = 24 hours, resampled per minute,
        # so each window holds sampling_period * 60 slots
        sampling_period = 24
        samples = int(self.conf.total_time / 24)
        if self.conf.total_time % 24 != 0:
            samples += 1
        samples += 1  # extra slot so the final partial window always fits

        day_samples = [0] * samples
        previous_window_value = 0
        avg_of_avgs = 0
        avg_count = 0
        max_v = 0

        it = iter(degraded)
        try:
            # next() builtin instead of it.next(): works on Python 2.6+ and 3
            t = next(it)
        except StopIteration:
            t = None

        while t is not None:
            values_per_sample = [0] * (sampling_period * 60)
            for i in xrange(sampling_period * 60):
                if t is None:
                    break
                per_sample_count = 0
                while True:
                    # i // 60 converts the minute index to an hour offset
                    # (explicit floor division; '/' on ints was already
                    # floored under Python 2)
                    if t[0] > current_time + i // 60:
                        # event lies beyond this minute: carry the last
                        # observed value forward
                        per_sample_count = 0
                        values_per_sample[i] = previous_window_value
                        break
                    else:
                        # running mean of all events landing in this minute
                        values_per_sample[i] = (values_per_sample[i] *
                                                per_sample_count+t[1]) /\
                                               (per_sample_count+1)
                        previous_window_value = t[1]
                        per_sample_count += 1
                        try:
                            t = next(it)
                        except StopIteration:
                            t = None
                            break

            current_sample_average = 0
            for i in xrange(sampling_period * 60):
                current_sample_average += values_per_sample[i]
                if max_v < values_per_sample[i]:
                    max_v = values_per_sample[i]
            current_sample_average /= (sampling_period * 60)

            if int(current_time / 24) >= samples:
                break
            day_samples[int(current_time / 24)] = current_sample_average
            current_time += sampling_period
            avg_of_avgs += current_sample_average
            avg_count += 1

        if avg_count == 0:
            # empty input series: the previous code divided by zero here
            info_logger.info("%s_per_%dh_%s: no samples" %
                             (description, sampling_period, unit))
            return

        avg_of_avgs /= avg_count
        stdev = 0.0
        for val in day_samples:
            stdev += (val - avg_of_avgs) * (val - avg_of_avgs)

        # sample (n-1) standard deviation; guard the degenerate 1-sample case
        denom = len(day_samples) - 1
        stdev_value = sqrt(stdev / denom) if denom > 0 else 0.0

        info_logger.info("%s_per_%dh_%s %d stdev:%f max:%d" %
                         (description, sampling_period, unit, avg_of_avgs,
                          stdev_value, max_v))

        self.printPerYearStart(day_samples, description)
Пример #15
0
    def end(self):
        """Finalize the simulation and publish aggregate results.

        Computes the durability/availability probabilities and cost metrics,
        stores them on the ``Result`` class, logs queueing statistics (when
        the contention model is enabled) and a summary of all failure/repair
        counters, then returns a ``Result`` instance.

        NOTE(review): metrics are assigned to *class* attributes of
        ``Result``, not to the returned instance ``ret`` -- concurrent runs
        would overwrite each other; confirm single-run use is intended.
        """
        ret = Result()

        # data loss probability and data unavailable probability
        data_loss_prob = format(
            float(self.undurable_slice_count) / (self.total_slices * self.n),
            ".4e")

        Result.undurable_count = self.undurable_slice_count
        Result.unavailable_count = self.unavailable_slice_count
        Result.undurable_infos = self.undurable_slice_infos
        Result.unavailable_slice_durations = self.unavailable_slice_durations
        Result.PDL = data_loss_prob

        # unavailability derived from time-to-failure / time-to-repair pairs
        TTFs, TTRs = self.processDuration()
        Result.PUA = self.calUA(TTFs, TTRs)
        # Result.unavailable_prob1 = self.calUADowntime(TTRs)

        Result.undurable_count_details = self.calUndurableDetails()
        Result.NOMDL = self.NOMDL()

        # total repair cost in PiBs
        # NOTE(review): 2**30 scale factor -- presumably converting the
        # transfer counter's unit up to PiB; confirm the counter's unit.
        Result.TRC = format(
            float(self.total_repair_transfers) / pow(2, 30), ".2e")

        # 8760 = 24 * 365; the simulation clock appears to be in hours
        years = self.end_time / 8760
        # total storage cost in PiB*year (raw capacity = data * n/k)
        Result.TSC = format(
            float(self.conf.total_active_storage) * self.n / self.k * years,
            ".2e")

        if not self.queue_disable:
            queue_times, avg_queue_time = self.contention_model.statistics()
            Result.queue_times = queue_times
            Result.avg_queue_time = format(avg_queue_time, ".4f")
            info_logger.info(
                "total times of queuing: %d, average queue time: %f" %
                (queue_times, avg_queue_time))

        info_logger.info(
            "anomalous available count: %d, total latent failure: %d,\
             total scrubs: %d, total scrubs repairs: %d, \
             total disk failures:%d, total disk repairs:%d, \
             total machine failures:%d, total machine repairs:%d, \
             total permanent machine failures:%d, \
             total short temperary machine failures:%d, \
             total long temperary machine failures:%d, \
             total machine failures due to rack failures:%d, \
             total eager machine repairs:%d, total eager slice repairs:%d, \
             total skipped latent:%d, total incomplete recovery:%d\n \
             max recovery bandwidth:%f\n \
             undurable_slice_count:%d\n \
             total repairs:%d, total optimal repairs:%d" %
            (self.anomalous_available_count, self.total_latent_failures,
             self.total_scrubs, self.total_scrub_repairs,
             self.total_disk_failures, self.total_disk_repairs,
             self.total_machine_failures, self.total_machine_repairs,
             self.total_perm_machine_failures,
             self.total_short_temp_machine_failures,
             self.total_long_temp_machine_failures,
             self.total_machine_failures_due_to_rack_failures,
             self.total_eager_machine_repairs, self.total_eager_slice_repairs,
             self.total_skipped_latent,
             self.total_incomplete_recovery_attempts,
             self.max_recovery_bandwidth, self.undurable_slice_count,
             self.total_repairs, self.total_optimal_repairs))

        return ret
Пример #16
0
    def handleFailure(self, u, time, e, queue):
        """Record failure event ``e`` hitting unit ``u`` at simulation ``time``.

        Variant using an integer slice-unit status encoding -- apparently
        1 = available, 0 = temporarily unavailable, -1 = durably lost
        (confirm against the status initialization elsewhere in the file).

        * ``Machine`` -- counts the failure.  ``e.info`` encodes the class
          (1 = short transient, 2 = long transient, 3 = permanent, anything
          else = induced by a rack failure, split by ``u.fail_timeout``).
          Permanent failures mark slice units as lost (-1); transient ones
          only flip 1 -> 0 (availability loss).
        * ``Disk`` -- marks the disk's slice units as lost (-1) and records
          permanently lost slices.
          NOTE(review): ``projected_bandwidth_need`` is initialized but
          never updated or used in this variant -- likely leftover from the
          bandwidth-tracking version; confirm.
        * anything else -- recurses into the unit's children.

        ``queue`` is only passed through to the recursive calls.
        """
        if e.ignore:
            return

        if isinstance(u, Machine):
            self.total_machine_failures += 1
            u.setLastFailureTime(e.getTime())

            if e.info == 3:
                self.total_perm_machine_failures += 1
            else:
                if e.info == 1:
                    self.total_short_temp_machine_failures += 1
                elif e.info == 2:
                    self.total_long_temp_machine_failures += 1
                else:
                    # machine went down because its rack failed; classify
                    # the outage length against the machine's failure timeout
                    self.total_machine_failures_due_to_rack_failures += 1
                    if e.next_recovery_time - e.getTime() <= u.fail_timeout:
                        self.total_short_temp_machine_failures += 1
                    else:
                        self.total_long_temp_machine_failures += 1

            disks = u.getChildren()
            for child in disks:
                slice_indexes = child.getChildren()
                for slice_index in slice_indexes:
                    if slice_index >= self.total_slices:
                        continue
                    if self.status[slice_index] == self.lost_slice:
                        continue

                    # permanent failure degrades durability; transient ones
                    # only degrade availability
                    if e.info == 3:
                        self.sliceDegraded(slice_index)
                    else:
                        self.sliceDegradedAvailability(slice_index)

                    repairable_before = self.isRepairable(slice_index)
                    index = self.slice_locations[slice_index].index(child)
                    if self.status[slice_index][index] == -1:
                        # already durably lost: nothing further to record
                        continue
                    if e.info == 3:
                        self.status[slice_index][index] = -1
                        self._my_assert(self.durableCount(slice_index) >= 0)
                    else:
                        if self.status[slice_index][index] == 1:
                            self.status[slice_index][index] = 0
                        self._my_assert(self.availableCount(slice_index) >= 0)

                    # the slice just crossed from repairable to unrepairable:
                    # open a new unavailability interval at `time`
                    repairable_current = self.isRepairable(slice_index)
                    if repairable_before and not repairable_current:
                        self.unavailable_slice_count += 1
                        if slice_index in self.unavailable_slice_durations.keys(
                        ):
                            self.unavailable_slice_durations[
                                slice_index].append([time])
                        else:
                            self.unavailable_slice_durations[slice_index] = [[
                                time
                            ]]

                    if e.info == 3:
                        # lost stripes have been recorded in unavailable_slice_durations
                        if self.isLost(slice_index):
                            info_logger.info(
                                "time: " + str(time) + " slice:" +
                                str(slice_index) + " durCount:" +
                                str(self.durableCount(slice_index)) +
                                " due to machine " + str(u.getID()))
                            self.status[slice_index] = self.lost_slice
                            self.undurable_slice_count += 1
                            self.undurable_slice_infos.append(
                                (slice_index, time,
                                 "machine " + str(u.getID())))
                            continue
        elif isinstance(u, Disk):
            self.total_disk_failures += 1
            u.setLastFailureTime(e.getTime())
            # need to compute projected recovery b/w needed
            projected_bandwidth_need = 0.0

            slice_indexes = u.getChildren()
            for slice_index in slice_indexes:
                if slice_index >= self.total_slices:
                    continue
                if self.status[slice_index] == self.lost_slice:
                    continue

                self.sliceDegraded(slice_index)
                repairable_before = self.isRepairable(slice_index)

                index = self.slice_locations[slice_index].index(u)
                if self.status[slice_index][index] == -1:
                    continue
                self.status[slice_index][index] = -1

                self._my_assert(self.durableCount(slice_index) >= 0)

                # the slice just crossed from repairable to unrepairable:
                # open a new unavailability interval at `time`
                repairable_current = self.isRepairable(slice_index)
                if repairable_before and not repairable_current:
                    self.unavailable_slice_count += 1
                    if slice_index in self.unavailable_slice_durations.keys():
                        self.unavailable_slice_durations[slice_index].append(
                            [time])
                    else:
                        self.unavailable_slice_durations[slice_index] = [[
                            time
                        ]]

                if self.isLost(slice_index):
                    # too few durable units remain: slice is permanently lost
                    info_logger.info("time: " + str(time) + " slice:" +
                                     str(slice_index) + " durCount:" +
                                     str(self.durableCount(slice_index)) +
                                     " due to disk " + str(u.getID()))
                    self.status[slice_index] = self.lost_slice
                    self.undurable_slice_count += 1
                    self.undurable_slice_infos.append(
                        (slice_index, time, "disk " + str(u.getID())))
                    continue
        else:
            # non-leaf unit (e.g. rack): propagate the failure to children
            for child in u.getChildren():
                self.handleFailure(child, time, e, queue)