예제 #1
0
    def _process_msg_pull(self, addr, msg):
        """
        Handles one message received from a host while the controller is in pull mode

        Nothing is injected in this mode: the message is stored in the execution log of the sender (unless
        output is suppressed) and a human-readable summary is emitted through the controller's logger.

        :param addr: The address of the sender
        :param msg: The message dictionary
        """
        is_status, status = MessageClient.is_status_message(msg)
        record = not self._suppressOutput

        # Connection status notifications only leave a trace in the execution log of the host
        if is_status and status == MessageClient.CONNECTION_LOST_MSG:
            if record:
                lost_entry = MessageBuilder.status_connection(time())
                self._writers[addr].write_entry(lost_entry)
            return
        if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
            if record:
                restored_entry = MessageBuilder.status_connection(time(), restored=True)
                self._writers[addr].write_entry(restored_entry)
            return

        # Every other message is stored as it is before being summarized on the terminal
        if record:
            self._writers[addr].write_entry(msg)

        kind = msg[MessageBuilder.FIELD_TYPE]
        if kind == MessageBuilder.STATUS_START:
            text = "Task %s started on host %s" % (
                msg[MessageBuilder.FIELD_DATA], formatipport(addr))
            InjectorController.logger.info(text)
        elif kind == MessageBuilder.STATUS_RESTART:
            text = "Task %s restarted on host %s" % (
                msg[MessageBuilder.FIELD_DATA], formatipport(addr))
            InjectorController.logger.info(text)
        elif kind == MessageBuilder.STATUS_END:
            text = "Task %s terminated successfully on host %s" % (
                msg[MessageBuilder.FIELD_DATA], formatipport(addr))
            InjectorController.logger.info(text)
            # Terminated tasks are additionally handed to the task output writer
            if record:
                self._write_task_output(addr, msg)
        elif kind == MessageBuilder.STATUS_ERR:
            text = "Task %s terminated with error code %s on host %s" % (
                msg[MessageBuilder.FIELD_DATA],
                str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr))
            InjectorController.logger.error(text)
            if record:
                self._write_task_output(addr, msg)
        elif kind == MessageBuilder.STATUS_GREET:
            # The greeting reply carries the session flag and the number of active tasks
            if msg[MessageBuilder.FIELD_ISF]:
                status_string = 'An injection session is in progress'
            else:
                status_string = 'No injection session is in progress'
            text = "Greetings. Engine %s is alive with %s currently active tasks. %s" % (
                formatipport(addr), str(msg[MessageBuilder.FIELD_DATA]), status_string)
            InjectorController.logger.info(text)
예제 #2
0
    def _pull(self):
        """
        Runs the controller in pull mode

        No workload is injected: the client is started, a greet command is broadcast, and every message
        received afterwards is passed to _process_msg_pull. The method returns early only when no host is
        registered; otherwise it never returns.
        """
        self._client.start()

        if self._client.get_n_registered_hosts() == 0:
            InjectorController.logger.warning(
                "No connected hosts for pulling information. Aborting...")
            return

        # Every registered host is asked to report its current state
        self._client.broadcast_msg(MessageBuilder.command_greet(0))

        hosts = self._client.get_registered_hosts()
        self._writers = {}
        self._outputsDirs = {}
        for host in hosts:
            out_dir = format_output_directory(self._resultsDir, host)
            self._outputsDirs[host] = out_dir
            if self._suppressOutput:
                # With output suppressed nothing is deleted and no log writer is created
                continue
            # Outputs left by a previous session are removed before the new one starts
            if isdir(out_dir):
                rmtree(out_dir, ignore_errors=True)
            # Each connected host gets its own execution log writer
            self._writers[host] = ExecutionLogWriter(
                format_injection_filename(self._resultsDir, host))

        # Endless listening loop: users stop it by killing the process
        while True:
            sender, payload = self._client.pop_msg_queue()
            self._process_msg_pull(sender, payload)
예제 #3
0
 def _process_result(self, task, timestamp, rcode, outdata=''):
     """
     Broadcasts the outcome of a terminated task to all connected hosts

     An error status message is built when rcode is not 0, an end status message otherwise. Nothing is sent
     if the builder returns None or if the current thread reports that it has to terminate.

     :param task: The msg related to the task that has terminated
     :param timestamp: The timestamp related to the termination time
     :param rcode: The return code of the task's execution
     :param outdata: the shell output of the task, if it is a benchmark
     """
     task.timestamp = timestamp
     # The output is kept only when output logging is enabled, the task is not a fault, and there is
     # some data to forward
     keep_output = self._log_outputs and not task.isFault and len(outdata) != 0
     if not keep_output:
         outdata = None
     report = (MessageBuilder.status_error(task, rcode, outdata) if rcode != 0
               else MessageBuilder.status_end(task, outdata))
     if report is None:
         return
     if not current_thread().has_to_terminate():
         self._server.broadcast_msg(report)
예제 #4
0
 def _inform_start(self, task, timestamp):
     """
     Broadcasts a start status message to all connected hosts when a task is started

     The task's timestamp is overwritten with the given one before the message is built. Nothing is sent
     if the builder returns None.

     :param task: The msg related to the task that has been started
     :param timestamp: The timestamp related to the starting time
     """
     task.timestamp = timestamp
     start_notice = MessageBuilder.status_start(task)
     if start_notice is None:
         return
     self._server.broadcast_msg(start_notice)
예제 #5
0
    def _inform_restart(self, task, timestamp, rcode):
        """
        Broadcasts a restart status message to all connected hosts when a task is restarted

        The task's timestamp is overwritten with the given one before the message is built. Nothing is sent
        if the builder returns None.

        :param task: The msg related to the task that has terminated
        :param timestamp: The timestamp related to the termination time
        :param rcode: The return code of the task's execution
        """
        task.timestamp = timestamp
        # A return code of 0 is passed to the message builder as "no error"
        error = rcode
        if rcode == 0:
            error = None
        restart_notice = MessageBuilder.status_restart(task, error)
        if restart_notice is None:
            return
        self._server.broadcast_msg(restart_notice)
예제 #6
0
    def _end_session(self):
        """
        Terminates the injection session for all connected hosts

        A session end command is broadcast, then the input queue is polled until a positive ack has been
        received for each host that was registered at broadcast time, or until self._sessionWait seconds
        have elapsed. Finally, all execution log writers are closed (unless output is suppressed).

        All the messages available in the queue are processed at each polling iteration, so that the time
        needed to collect the acks does not grow with the number of hosts multiplied by the sleep period.
        """
        msg_end = MessageBuilder.command_session(time(), end=True)
        self._client.broadcast_msg(msg_end)

        session_closed = 0
        session_sent = self._client.get_n_registered_hosts()
        session_check_start = time()
        session_check_now = time()
        while session_check_now - session_check_start < self._sessionWait and session_closed < session_sent:
            # We wait until we have received an ack for the termination from all of the connected hosts, or we time out
            # The queue is drained before sleeping: popping a single message per sleep period could make
            # us hit the timeout while acks are still waiting in the queue
            while session_closed < session_sent and self._client.peek_msg_queue() > 0:
                addr, msg = self._client.pop_msg_queue()
                if msg[MessageBuilder.FIELD_TYPE] == MessageBuilder.ACK_YES:
                    InjectorController.logger.info(
                        "Injection session closed with engine %s" %
                        formatipport(addr))
                    if not self._suppressOutput:
                        self._writers[addr].write_entry(
                            MessageBuilder.command_session(
                                msg[MessageBuilder.FIELD_TIME], end=True))
                    session_closed += 1
                else:
                    # If we receive a message that is not an ack after all tasks have terminated, something is wrong
                    InjectorController.logger.error(
                        "Ack expected from engine %s, got %s" %
                        (formatipport(addr), msg[MessageBuilder.FIELD_TYPE]))
            # There is no point in sleeping once all the expected acks have been collected
            if session_closed < session_sent:
                sleep(self._sleepPeriod)
            session_check_now = time()

        # All of the execution log writers are closed, and the session finishes
        if not self._suppressOutput:
            for writer in self._writers.values():
                writer.close()
예제 #7
0
 def _update_session(self, addr, msg):
     """
     Handles a session start/end command and replies to the sender with an ack

     The master is the host that currently owns the injection session. An end command is accepted only from
     the master. A start command is accepted when there is no master, when the master is no longer among the
     registered hosts, or when the sender is the master itself. In every other case a negative ack is sent.

     :param addr: The (ip, port) address of the sender host
     :param msg: The message dictionary
     """
     command = msg[MessageBuilder.FIELD_TYPE]
     accepted = False
     error = None
     if command == MessageBuilder.COMMAND_END_SESSION and addr == self._master:
         # The current master closed its session: the session state is cleared
         self._master = None
         self._session_timestamp = -1
         accepted = True
         InjectorEngine.logger.info(
             'Injection session terminated with controller %s' %
             formatipport(addr))
     elif command == MessageBuilder.COMMAND_START_SESSION:
         requested_ts = msg[MessageBuilder.FIELD_TIME]
         known_hosts = self._server.get_registered_hosts()
         master_missing = self._master is None or self._master not in known_hosts
         if master_missing or self._master == addr:
             # A session is resumed only when message re-sending is enabled, the timestamp matches the
             # stored one and a master is set. Otherwise it is a brand new session, and the thread pool
             # is restarted so that no orphan task from the previous session keeps running; the ack then
             # carries an error value of -1
             resuming = (self._server.reSendMsgs
                         and self._session_timestamp == requested_ts
                         and self._master is not None)
             if not resuming:
                 self._pool.stop(kill_abruptly=True)
                 self._pool.start()
                 error = -1
             # The sender becomes (or remains) the master of the session
             self._master = addr
             self._session_timestamp = requested_ts
             accepted = True
             InjectorEngine.logger.info(
                 'Injection session started with controller %s' %
                 formatipport(addr))
         else:
             InjectorEngine.logger.info(
                 'Injection session rejected with controller %s' %
                 formatipport(addr))
     # An ack (positive or negative) is always sent back to the sender host
     reply = MessageBuilder.ack(time(), accepted, error)
     self._server.send_msg(addr, reply)
예제 #8
0
 def listen(self):
     """
     Starts the engine and serves incoming commands in an endless loop

     The SIGINT and SIGTERM handlers are installed, then the subprocesses, the server and the thread pool
     are started. Each message popped from the server queue is dispatched according to its type; time and
     task start commands are honored only when they come from the current session master.
     """
     InjectorEngine.logger.info("FINJ Injection Engine v%s started" %
                                VER_ID)
     for signum in (signal.SIGINT, signal.SIGTERM):
         signal.signal(signum, self._signalhandler)
     self._subman.start_subprocesses()
     self._server.start()
     self._pool.start()
     while True:
         # Waiting for a new request to arrive
         addr, msg = self._server.pop_msg_queue()
         msg_type = msg[MessageBuilder.FIELD_TYPE]
         sent_by_master = addr == self._master
         master_is_set = self._master is not None
         if msg_type in (MessageBuilder.COMMAND_START_SESSION,
                         MessageBuilder.COMMAND_END_SESSION):
             # Session commands are handled separately
             self._update_session(addr, msg)
         elif msg_type == MessageBuilder.COMMAND_SET_TIME and master_is_set and sent_by_master:
             # The set time is sent by the master after a successful ack and defines when the 'workload' is started
             self._pool.reset_session(msg[MessageBuilder.FIELD_TIME],
                                      time())
         elif msg_type == MessageBuilder.COMMAND_CORRECT_TIME and master_is_set and sent_by_master:
             # Clock correction request issued by the master
             self._pool.correct_time(msg[MessageBuilder.FIELD_TIME])
         elif msg_type == MessageBuilder.COMMAND_TERMINATE:
             # Termination commands are checked by a dedicated method
             self._check_for_termination(addr, msg)
         elif sent_by_master and msg_type == MessageBuilder.COMMAND_START:
             # A task start command issued by the session master is added to the thread pool queue
             self._pool.submit_task(Task.msg_to_task(msg))
         elif msg_type == MessageBuilder.COMMAND_GREET:
             # Any host can ask for the engine state: active tasks and whether a session is open
             reply = MessageBuilder.status_greet(time(),
                                                 self._pool.active_tasks(),
                                                 self._master is not None)
             self._server.send_msg(addr, reply)
         else:
             InjectorEngine.logger.warning(
                 'Invalid command sent from non-master host %s',
                 formatipport(addr))
예제 #9
0
    def _process_msg_inject(self, addr, msg):
        """
        Handles one message received from a host during an injection session

        Connection status notifications are handled first; any other message is stored in the execution log
        of the sender (acks excluded, and only if output is not suppressed), summarized through the logger,
        and used to update the set of pending tasks of the host.

        :param addr: The address of the sender
        :param msg: The message dictionary
        """
        is_status, status = MessageClient.is_status_message(msg)

        if is_status and status == MessageClient.CONNECTION_LOST_MSG:
            # A lost connection is only recorded in the execution log of the host
            if not self._suppressOutput:
                lost_entry = MessageBuilder.status_connection(time())
                self._writers[addr].write_entry(lost_entry)
            return

        if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
            # On a restored connection, the session start command and the current workload time are
            # sent again to the host
            resume_cmd = MessageBuilder.command_session(self._session_id)
            self._client.send_msg(addr, resume_cmd)
            clock_cmd = MessageBuilder.command_set_time(self._get_timestamp(time()))
            self._client.send_msg(addr, clock_cmd)
            return

        if is_status and status == MessageClient.CONNECTION_FINALIZED_MSG:
            self._pendingTasks.pop(addr, None)
            # Once no host is tracked anymore, the injection is marked as finished and the reader closed
            if not self._pendingTasks:
                self._endReached = True
                self._reader.close()
            return

        msg_type = msg[MessageBuilder.FIELD_TYPE]
        log_to_file = not self._suppressOutput
        # Ack messages are not written to the output log
        if msg_type not in (MessageBuilder.ACK_YES, MessageBuilder.ACK_NO) and log_to_file:
            self._writers[addr].write_entry(msg)

        # The content of the message is summarized on the terminal
        if msg_type == MessageBuilder.STATUS_START:
            text = "Task %s started on host %s" % (
                msg[MessageBuilder.FIELD_DATA], formatipport(addr))
            InjectorController.logger.info(text)
        elif msg_type == MessageBuilder.STATUS_RESTART:
            text = "Task %s restarted on host %s" % (
                msg[MessageBuilder.FIELD_DATA], formatipport(addr))
            InjectorController.logger.info(text)
        elif msg_type == MessageBuilder.STATUS_END:
            text = "Task %s terminated successfully on host %s" % (
                msg[MessageBuilder.FIELD_DATA], formatipport(addr))
            InjectorController.logger.info(text)
            # A terminated task is no longer pending for the host
            self._pendingTasks[addr].discard(
                msg[MessageBuilder.FIELD_SEQNUM])
            if log_to_file:
                self._write_task_output(addr, msg)
        elif msg_type == MessageBuilder.STATUS_ERR:
            text = "Task %s terminated with error code %s on host %s" % (
                msg[MessageBuilder.FIELD_DATA],
                str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr))
            InjectorController.logger.error(text)
            self._pendingTasks[addr].discard(
                msg[MessageBuilder.FIELD_SEQNUM])
            if log_to_file:
                self._write_task_output(addr, msg)
        elif msg_type == MessageBuilder.ACK_YES:
            # ACK messages after the initialization phase are received ONLY when a connection is restored,
            # and the session must be resumed
            InjectorController.logger.warning(
                "Session resumed with engine %s" % formatipport(addr))
            if log_to_file:
                restored_entry = MessageBuilder.status_connection(time(), restored=True)
                self._writers[addr].write_entry(restored_entry)
            # If the ack msg contains an error, it means all previously running tasks have been lost
            if MessageBuilder.FIELD_ERR in msg:
                self._pendingTasks[addr] = set()
                if log_to_file:
                    reset_entry = MessageBuilder.status_reset(
                        msg[MessageBuilder.FIELD_TIME])
                    self._writers[addr].write_entry(reset_entry)
        elif msg_type == MessageBuilder.ACK_NO:
            # The engine refused to resume the session: the host is dropped
            InjectorController.logger.warning(
                "Session cannot be resumed with engine %s" %
                formatipport(addr))
            self._client.remove_host(addr)
예제 #10
0
    def _init_session(self, workload_name):
        """
        Initializes the injection session for all connected hosts

        A session start command is broadcast, then the input queue is polled until every host that was
        registered at broadcast time has replied with an ack (positive or negative), or until
        self._sessionWait seconds have elapsed. Hosts that reject the session are removed right away; after a
        timeout, every registered host that has not accepted is removed as well.

        All the messages available in the queue are processed at each polling iteration, so that the time
        needed to collect the acks does not grow with the number of hosts multiplied by the sleep period.

        :param workload_name: The name of the workload to be injected
        :return: the number of hosts that have accepted the injection start command, and the timestamp ID of the session
        """
        session_start_timestamp = time()
        msg_start = MessageBuilder.command_session(session_start_timestamp)
        self._client.broadcast_msg(msg_start)

        self._writers = {}
        self._outputsDirs = {}
        self._pendingTasks = {}
        session_accepted = set()
        session_replied = 0
        session_sent = self._client.get_n_registered_hosts()
        session_check_start = time()
        session_check_now = time()
        while session_check_now - session_check_start < self._sessionWait and session_replied < session_sent:
            # We wait until we receive an ack (positive or negative) from all connected hosts, or either we time out
            # The queue is drained before sleeping: popping a single message per sleep period could make
            # us hit the timeout, and discard hosts, while their acks are still waiting in the queue
            while session_replied < session_sent and self._client.peek_msg_queue() > 0:
                addr, msg = self._client.pop_msg_queue()
                if msg[MessageBuilder.FIELD_TYPE] == MessageBuilder.ACK_YES:
                    # If an host replies to the injection start command with a positive ack, its log writer is
                    # instantiated, together with its entry in the pendingTasks dictionary
                    InjectorController.logger.info(
                        "Injection session started with engine %s" %
                        formatipport(addr))
                    session_accepted.add(addr)
                    session_replied += 1
                    self._outputsDirs[addr] = format_output_directory(
                        self._resultsDir, addr, workload_name)
                    # The outputs directory needs to be flushed before starting the new injection session
                    if not self._suppressOutput:
                        if isdir(self._outputsDirs[addr]):
                            rmtree(self._outputsDirs[addr], ignore_errors=True)
                        self._writers[addr] = ExecutionLogWriter(
                            format_injection_filename(self._resultsDir, addr,
                                                      workload_name))
                        self._writers[addr].write_entry(
                            MessageBuilder.command_session(
                                msg[MessageBuilder.FIELD_TIME]))
                    self._pendingTasks[addr] = set()
                elif msg[MessageBuilder.FIELD_TYPE] == MessageBuilder.ACK_NO:
                    # If an host rejects the injection start command, we discard it
                    InjectorController.logger.warning(
                        "Injection session request rejected by engine %s" %
                        formatipport(addr))
                    session_replied += 1
                    self._client.remove_host(addr)
            # There is no point in sleeping once all the expected replies have been collected
            if session_replied < session_sent:
                sleep(self._sleepPeriod)
            session_check_now = time()

        if session_check_now - session_check_start >= self._sessionWait:
            # If we have reached the time out, it means that not all of the connected hosts have replied. This is
            # highly unlikely, but could still happen. In this case, we remove all hosts that have not replied
            InjectorController.logger.warning(
                "Injection session startup reached the timeout limit")
            # We iterate over a snapshot, since hosts are removed from the client while looping
            for addr in list(self._client.get_registered_hosts()):
                if addr not in session_accepted:
                    self._client.remove_host(addr)

        return len(session_accepted), session_start_timestamp
예제 #11
0
    def _inject(self, reader, max_tasks=None):
        """
        Starts the injection process with a given workload, issuing commands to start tasks on remote hosts and
        collecting their result

        The reader is closed by this method on every exit path: when the workload is empty, when no host
        accepts the session, and when the end of the workload (or max_tasks) is reached.

        :param reader: a valid Reader object
        :param max_tasks: The maximum number of tasks to be processed before terminating. Useful for debugging
        """
        self._reader = reader
        assert isinstance(
            reader, Reader), '_inject method only supports Reader objects!'
        task = reader.read_entry()
        if task is None:
            InjectorController.logger.warning(
                "Input workload appears to be empty. Aborting...")
            # The reader must not be left open when aborting
            reader.close()
            return

        self._client.start()

        # Initializing the injection session
        session_accepted, session_id = self._init_session(
            workload_name=splitext(basename(reader.get_path()))[0])
        if session_accepted == 0:
            InjectorController.logger.warning(
                "No valid hosts for injection detected. Aborting...")
            # The reader must not be left open when aborting
            reader.close()
            return

        self._session_id = session_id
        # Determines if we have reached the end of the workload
        self._endReached = False
        read_tasks = 0

        # Start timestamp for the workload, computed from its first entry, minus the specified padding value
        self._start_timestamp = task.timestamp - self._workloadPadding
        # Synchronizes the time with all of the connected hosts
        self._client.broadcast_msg(
            MessageBuilder.command_set_time(self._start_timestamp))
        # Absolute timestamp associated to the workload's starting timestamp
        self._start_timestamp_abs = time()
        # Timestamp of the last correction that was applied to the clock of remote hosts
        last_clock_correction = self._start_timestamp_abs

        while not self._endReached or self._tasks_are_pending():
            # While some tasks are still running, and there are tasks from the workload that still need to be read, we
            # keep looping
            while self._client.peek_msg_queue() > 0:
                # We process all messages in the input queue, and write their content to the execution log for the
                # given host
                addr, msg = self._client.pop_msg_queue()
                self._process_msg_inject(addr, msg)

            # We compute the new "virtual" timestamp, in function of the workload's starting time
            now_timestamp_abs = time()
            now_timestamp = self._get_timestamp(now_timestamp_abs)

            # We perform periodically a correction of the clock of the remote hosts. This has impact only when there
            # is a very large drift between the clocks, of several minutes
            # If the sliding window for the task injection is disabled there is no need to perform clock correction
            if now_timestamp_abs - last_clock_correction > self._clockCorrectionPeriod and self._preSendInterval >= 0:
                msg = MessageBuilder.command_correct_time(now_timestamp)
                self._client.broadcast_msg(msg)
                last_clock_correction = now_timestamp_abs

            while not self._endReached and (
                    task.timestamp < now_timestamp + self._preSendInterval
                    or self._preSendInterval < 0):
                # We read all entries from the workload that correspond to tasks scheduled to start in the next
                # minutes (specified by presendinterval), and issue the related commands. This supposes that the
                # workload entries are ordered by their timestamp
                # A negative presendinterval disables the window: all entries are issued right away
                msg = MessageBuilder.command_start(task)
                self._client.broadcast_msg(msg)
                for s in self._pendingTasks.values():
                    s.add(task.seqNum)
                task = reader.read_entry()
                read_tasks += 1
                if task is None or (max_tasks is not None
                                    and read_tasks >= max_tasks):
                    self._endReached = True
                    reader.close()

            # This is a busy loop, with a short sleep period of roughly one second
            sleep(self._sleepPeriod)

        self._end_session()