def test_current_events_will_only_return_non_cancelled_reservations(tables, new_reservation, new_reservation_2):
    '''Cancelled reservations must disappear from ``Reservation.current_events()``.

    The first reservation is moved to a window surrounding the present moment,
    verified to be current, then cancelled. The second reservation is saved
    untouched and is expected to still be reported (presumably its fixture
    already covers the present moment - confirm against the fixture).
    '''
    # Make the first reservation span "now": started 10 minutes ago, ends in an hour.
    ten_minutes = timedelta(minutes=10)
    one_hour = timedelta(minutes=60)
    new_reservation.start = datetime.utcnow() - ten_minutes
    new_reservation.end = datetime.utcnow() + one_hour
    new_reservation.save()
    assert new_reservation in Reservation.current_events()

    # Cancel the first reservation and persist the second one.
    new_reservation.is_cancelled = True
    new_reservation.save()
    new_reservation_2.save()

    # Only the non-cancelled reservation may be reported now.
    still_active = Reservation.current_events()
    assert new_reservation not in still_active
    assert new_reservation_2 in still_active
def log_current_usage(self):
    '''Write one JSON log file for every reservation that is currently active.

    For each reservation returned by ``Reservation.current_events()`` the GPU
    data matching ``reservation.protected_resource_id`` is extracted from the
    infrastructure snapshot and saved to ``<log_dir>/<reservation id>.json``.
    A failure while extracting or saving is reported through ``log.error`` and
    does not stop the remaining reservations from being processed.
    '''
    active_reservations = Reservation.current_events()
    snapshot = self.infrastructure_manager.infrastructure

    for active in active_reservations:
        # Each reservation gets its own file, named after its id.
        target_path = self.log_dir / '{}.json'.format(active.id)
        try:
            usage = self.extract_specific_gpu_data(
                uuid=active.protected_resource_id,
                infrastructure=snapshot)
            entry = Log(data=usage)
            entry.save(out_path=target_path)
        except Exception as e:
            # Best effort: report the problem and continue with the next one.
            log.error(e)
def do_run(self):
    '''Run one protection pass over all current reservations, then wait out the interval.

    For every reservation returned by ``Reservation.current_events()``:

    1. the reserved GPU's hostname and the reservation owner are resolved
       (the reservation is skipped with a warning when either is missing),
    2. the users owning processes on the reserved GPU are collected and the
       reservation owner is excluded from that list,
    3. every remaining user is reported to each handler in
       ``self.violation_handlers`` via ``trigger_action(violation_data)``.

    Afterwards the method sleeps (``gevent.sleep``) for whatever is left of
    ``self.interval`` and logs timing information at debug level.

    Fixes compared to the previous implementation:

    * tty sessions are now matched against the users left AFTER the
      reservation owner has been excluded; before, the filter ran first, so
      the owner's own sessions were handed to handlers as intruder sessions,
    * every occurrence of the owner is excluded, not just the first one,
    * an exception raised by one handler is logged and no longer aborts the
      remaining handlers, intruders and reservations.
    '''
    time_func = time.perf_counter
    start_time = time_func()

    # 1. Get list of current reservations
    current_reservations = Reservation.current_events()

    # FIXME DEBUG ONLY
    log.debug(
        json.dumps([r.as_dict() for r in current_reservations], indent=4))

    for reservation in current_reservations:
        # 1. Extract reservation info
        uuid = reservation.resource_id
        hostname = self.find_hostname(uuid)
        user = User.get(reservation.user_id)
        username = user.username
        if hostname is None or username is None:
            log.warning(
                'Unable to process the reservation ({}@{}), skipping...'.
                format(username, hostname))
            continue

        # 2. Establish connection to node and find all tty sessions
        node_connection = self.connection_manager.single_connection(hostname)
        node_sessions = self.node_tty_sessions(node_connection)
        node_processes = self.node_gpu_processes(hostname)
        reserved_gpu_process_owners = self.gpu_users(node_processes, uuid)

        # The privileged user (reservation owner) is not an intruder. Exclude
        # them BEFORE matching tty sessions so that the owner's own sessions
        # never end up in TTY_SESSIONS.
        unprivileged_gpu_process_owners = [
            owner for owner in reserved_gpu_process_owners
            if owner != username
        ]
        intruder_ttys = [
            sess for sess in node_sessions
            if sess['USER'] in unprivileged_gpu_process_owners
        ]

        # 3. Execute protection handlers
        for intruder in unprivileged_gpu_process_owners:
            violation_data = {
                'INTRUDER_USERNAME': intruder,
                'RESERVATION_OWNER_USERNAME': username,
                'RESERVATION_OWNER_EMAIL': user.email,
                'RESERVATION_END': utc2local(reservation.end),
                'UUID': uuid,
                'GPU_NAME': self.gpu_attr(hostname, uuid, attribute='name'),
                'GPU_ID': self.gpu_attr(hostname, uuid, attribute='index'),
                'HOSTNAME': hostname,
                'TTY_SESSIONS': intruder_ttys,
                'SSH_CONNECTION': node_connection
            }
            for handler in self.violation_handlers:
                try:
                    handler.trigger_action(violation_data)
                except Exception as e:
                    # One faulty handler must not stop the protection pass.
                    log.warning('Error in violation handler: {}'.format(e))

    end_time = time_func()
    execution_time = end_time - start_time

    # Hold on until next interval
    if execution_time < self.interval:
        gevent.sleep(self.interval - execution_time)
    waiting_time = time_func() - end_time
    total_time = execution_time + waiting_time
    log.debug(
        'ProtectionService loop took: {:.2f}s (waiting {:.2f}) = {:.2f}'.
        format(execution_time, waiting_time, total_time))
def do_run(self):
    '''Run one protection pass over every node with GPU processes, then wait out the interval.

    The mapping returned by
    ``self.infrastructure_manager.all_nodes_with_gpu_processes()`` is walked
    as ``hostname -> gpu_id -> processes``. For each GPU that has processes:

    * if a current reservation exists (the first one returned by
      ``Reservation.current_events(gpu_id)`` is used), every process whose
      ``'owner'`` differs from the reservation user's username is recorded
      through ``self.store_violation``,
    * otherwise, when ``self.strict_reservations`` is set, every process is
      recorded as a violation (with ``reservation`` being ``None``).

    Violations collected for a host are then enriched with the
    ``'SSH_CONNECTIONS'``, ``'GPUS'`` and ``'OWNERS'`` entries and passed to
    each handler in ``self.violation_handlers``; handler errors are logged
    and do not interrupt the pass.

    Finally the method sleeps (``gevent.sleep``) for the remainder of
    ``self.interval`` and logs timing information at debug level.

    Fix compared to the previous implementation: a GPU whose process list is
    ``None`` used to raise ``TypeError`` when ``strict_reservations`` was
    enabled, because the guard let it through to ``for process in processes``.
    GPUs without processes (``None`` or empty) are now skipped up front,
    which also avoids a pointless reservation lookup for them.
    '''
    time_func = time.perf_counter
    start_time = time_func()

    current_infrastructure = self.infrastructure_manager.all_nodes_with_gpu_processes()
    for hostname in current_infrastructure:
        violations = {}  # type: Dict[str, Dict]
        for gpu_id in current_infrastructure[hostname]:
            processes = current_infrastructure[hostname][gpu_id]
            if not processes:
                # Nothing is running on this GPU (None or empty), so there
                # is nothing that could violate a reservation.
                continue

            current_gpu_reservations = Reservation.current_events(gpu_id)
            reservation = None
            if len(current_gpu_reservations):
                reservation = current_gpu_reservations[0]
                if hostname is None or reservation.user is None:
                    continue

                # Reserved GPU: only processes of other users are violations.
                for process in processes:
                    if process['owner'] != reservation.user.username:
                        self.store_violation(violations, process, hostname,
                                             reservation, gpu_id)
            elif self.strict_reservations:
                # Unreserved GPU in strict mode: every process is a violation.
                for process in processes:
                    self.store_violation(violations, process, hostname,
                                         reservation, gpu_id)

        for intruder in violations:
            violation_data = violations[intruder]
            reservations = violation_data['RESERVATIONS']
            hostnames = {
                reservation_data['HOSTNAME']
                for reservation_data in reservations
            }
            # The comprehension variable is named differently on purpose so
            # it cannot be confused with the outer loop's ``hostname``.
            violation_data['SSH_CONNECTIONS'] = {
                host: self.connection_manager.single_connection(host)
                for host in hostnames
            }
            violation_data['GPUS'] = ',\n'.join([
                '{} - GPU{}: {}'.format(data['HOSTNAME'], data['GPU_ID'],
                                        data['GPU_NAME'])
                for data in reservations
            ])
            violation_data['OWNERS'] = ', '.join([
                '{} ({})'.format(data['OWNER_USERNAME'], data['OWNER_EMAIL'])
                for data in reservations
            ])

            for handler in self.violation_handlers:
                try:
                    handler.trigger_action(violation_data)
                except Exception as e:
                    # One faulty handler must not stop the protection pass.
                    log.warning('Error in violation handler: {}'.format(e))

    end_time = time_func()
    execution_time = end_time - start_time

    # Hold on until next interval
    if execution_time < self.interval:
        gevent.sleep(self.interval - execution_time)
    waiting_time = time_func() - end_time
    total_time = execution_time + waiting_time
    log.debug(
        'ProtectionService loop took: {:.2f}s (waiting {:.2f}) = {:.2f}'.
        format(execution_time, waiting_time, total_time))