Пример #1
0
def test_current_events_will_only_return_non_cancelled_reservations(tables, new_reservation, new_reservation_2):
    """Check that ``Reservation.current_events()`` skips cancelled reservations.

    The first reservation is made to span the present moment and is asserted
    to be listed. After it is flagged as cancelled (and the second reservation
    is saved) the listing must contain the second reservation only.
    """
    # Arrange: started ten minutes ago, ends an hour from now.
    since_start = timedelta(minutes=10)
    until_end = timedelta(minutes=60)
    new_reservation.start = datetime.utcnow() - since_start
    new_reservation.end = datetime.utcnow() + until_end
    new_reservation.save()

    # Sanity check: an active, non-cancelled reservation is reported.
    assert new_reservation in Reservation.current_events()

    # Act: cancel the first reservation and persist the second one.
    new_reservation.is_cancelled = True
    new_reservation.save()
    new_reservation_2.save()

    # Assert: only the non-cancelled reservation is still reported.
    listed = Reservation.current_events()
    assert new_reservation not in listed
    assert new_reservation_2 in listed
Пример #2
0
 def log_current_usage(self):
     '''Refresh the log file of every current reservation.

     For each reservation returned by ``Reservation.current_events()`` the
     GPU data matching its ``protected_resource_id`` is extracted from the
     infrastructure held by ``self.infrastructure_manager`` and saved to
     ``self.log_dir / '<reservation id>.json'``.

     Any exception raised while extracting or saving the data is passed to
     ``log.error`` and the loop moves on to the next reservation.
     '''
     active_reservations = Reservation.current_events()
     infra_snapshot = self.infrastructure_manager.infrastructure
     for active in active_reservations:
         # One log file per reservation, named after the reservation id.
         target_path = self.log_dir / '{id}.json'.format(id=active.id)
         try:
             usage = self.extract_specific_gpu_data(
                 uuid=active.protected_resource_id,
                 infrastructure=infra_snapshot)
             Log(data=usage).save(out_path=target_path)
         except Exception as err:
             # Best effort: a failure for one reservation must not stop
             # the others from being logged.
             log.error(err)
Пример #3
0
    def do_run(self):
        """Run one protection pass over all current reservations, then wait.

        For every reservation returned by ``Reservation.current_events()``:

        1. the reserved GPU's hostname and the reservation owner are looked
           up; the reservation is skipped (with a warning) if either is
           missing,
        2. the owners of processes on the reserved GPU are collected via
           ``self.gpu_users``; everyone except the reservation owner is
           treated as an intruder,
        3. every handler in ``self.violation_handlers`` is triggered once per
           intruder with a ``violation_data`` dict.

        A failing handler is logged and does not prevent the remaining
        handlers or reservations from being processed.

        After the pass, the method sleeps with ``gevent.sleep`` for whatever
        is left of ``self.interval`` and logs the timing.
        """
        time_func = time.perf_counter
        start_time = time_func()

        # 1. Get list of current reservations
        current_reservations = Reservation.current_events()

        # FIXME DEBUG ONLY
        log.debug(
            json.dumps([r.as_dict() for r in current_reservations], indent=4))

        for reservation in current_reservations:
            # 1. Extract reservation info
            uuid = reservation.resource_id
            hostname = self.find_hostname(uuid)
            user = User.get(reservation.user_id)
            # A missing user record is handled like a missing username
            # instead of raising AttributeError and aborting the whole pass.
            username = user.username if user is not None else None
            if hostname is None or username is None:
                log.warning(
                    'Unable to process the reservation ({}@{}), skipping...'.
                    format(username, hostname))
                continue

            # 2. Establish connection to node and find all tty sessions
            node_connection = self.connection_manager.single_connection(
                hostname)
            node_sessions = self.node_tty_sessions(node_connection)
            node_processes = self.node_gpu_processes(hostname)
            gpu_process_owners = self.gpu_users(node_processes, uuid)

            # The reservation owner is privileged. Filtering (rather than a
            # single ``list.remove``) drops every occurrence of the owner and
            # leaves the list returned by ``gpu_users`` untouched.
            intruders = [
                owner for owner in gpu_process_owners if owner != username
            ]
            if not intruders:
                continue

            # Sessions are selected AFTER the owner has been filtered out, so
            # the owner's own terminals are never reported as intruder ttys.
            intruder_ttys = [
                sess for sess in node_sessions if sess['USER'] in intruders
            ]

            # These values are the same for every intruder on this GPU.
            reservation_end = utc2local(reservation.end)
            gpu_name = self.gpu_attr(hostname, uuid, attribute='name')
            gpu_index = self.gpu_attr(hostname, uuid, attribute='index')

            # 3. Execute protection handlers
            for intruder in intruders:
                violation_data = {
                    'INTRUDER_USERNAME': intruder,
                    'RESERVATION_OWNER_USERNAME': username,
                    'RESERVATION_OWNER_EMAIL': user.email,
                    'RESERVATION_END': reservation_end,
                    'UUID': uuid,
                    'GPU_NAME': gpu_name,
                    'GPU_ID': gpu_index,
                    'HOSTNAME': hostname,
                    'TTY_SESSIONS': intruder_ttys,
                    'SSH_CONNECTION': node_connection
                }
                for handler in self.violation_handlers:
                    try:
                        handler.trigger_action(violation_data)
                    except Exception as e:
                        # One broken handler must not stop the others.
                        log.warning('Error in violation handler: {}'.format(e))

        end_time = time_func()
        execution_time = end_time - start_time

        # Hold on until next interval
        if execution_time < self.interval:
            gevent.sleep(self.interval - execution_time)
        waiting_time = time_func() - end_time
        total_time = execution_time + waiting_time
        log.debug(
            'ProtectionService loop took: {:.2f}s (waiting {:.2f}) = {:.2f}'.
            format(execution_time, waiting_time, total_time))
Пример #4
0
    def do_run(self):
        """Run one protection pass over every node with GPU processes, then wait.

        For each GPU reported by
        ``self.infrastructure_manager.all_nodes_with_gpu_processes()`` that
        has at least one process, the processes are checked against the first
        current reservation of that GPU:

        * if a reservation exists, every process whose ``'owner'`` differs
          from the reservation user's username is recorded through
          ``self.store_violation``;
        * if no reservation exists and ``self.strict_reservations`` is set,
          every process is recorded, with ``None`` as the reservation.

        Violations are collected per node. Each collected entry is then
        extended with ``'SSH_CONNECTIONS'``, ``'GPUS'`` and ``'OWNERS'`` and
        passed to every handler in ``self.violation_handlers``; handler
        errors are logged and do not interrupt the pass.

        Finally the method sleeps with ``gevent.sleep`` for whatever is left
        of ``self.interval`` and logs the timing.
        """
        time_func = time.perf_counter
        start_time = time_func()

        current_infrastructure = (
            self.infrastructure_manager.all_nodes_with_gpu_processes())
        for hostname in current_infrastructure:
            violations = {}  # type: Dict[str, Dict]
            node_gpus = current_infrastructure[hostname]
            for gpu_id in node_gpus:
                processes = node_gpus[gpu_id]
                # Without processes there is nothing that could violate a
                # reservation. Skipping here also keeps strict mode from
                # iterating over ``None`` (a TypeError) and from querying
                # reservations for an idle GPU.
                if processes is None or len(processes) == 0:
                    continue

                current_gpu_reservations = Reservation.current_events(gpu_id)
                if current_gpu_reservations:
                    reservation = current_gpu_reservations[0]
                    if hostname is None or reservation.user is None:
                        continue

                    owner_username = reservation.user.username
                    for process in processes:
                        if process['owner'] != owner_username:
                            self.store_violation(violations, process,
                                                 hostname, reservation,
                                                 gpu_id)
                elif self.strict_reservations:
                    # Strict mode: any process on an unreserved GPU counts as
                    # a violation; there is no reservation to attach.
                    for process in processes:
                        self.store_violation(violations, process, hostname,
                                             None, gpu_id)

            for violation_data in violations.values():
                reservations = violation_data['RESERVATIONS']
                hostnames = {entry['HOSTNAME'] for entry in reservations}
                violation_data['SSH_CONNECTIONS'] = {
                    node: self.connection_manager.single_connection(node)
                    for node in hostnames
                }
                violation_data['GPUS'] = ',\n'.join(
                    '{} - GPU{}: {}'.format(data['HOSTNAME'], data['GPU_ID'],
                                            data['GPU_NAME'])
                    for data in reservations)
                violation_data['OWNERS'] = ', '.join(
                    '{} ({})'.format(data['OWNER_USERNAME'],
                                     data['OWNER_EMAIL'])
                    for data in reservations)

                for handler in self.violation_handlers:
                    try:
                        handler.trigger_action(violation_data)
                    except Exception as e:
                        # One broken handler must not stop the others.
                        log.warning('Error in violation handler: {}'.format(e))

        end_time = time_func()
        execution_time = end_time - start_time

        # Hold on until next interval
        if execution_time < self.interval:
            gevent.sleep(self.interval - execution_time)
        waiting_time = time_func() - end_time
        total_time = execution_time + waiting_time
        log.debug(
            'ProtectionService loop took: {:.2f}s (waiting {:.2f}) = {:.2f}'.
            format(execution_time, waiting_time, total_time))