async def request_status(ws: str, work: str, columns: Set[str], all_cols: bool = False) -> None:
    """Request the status of an odin job over web-sockets

    :param ws: The web socket
    :param work: The job name
    :param columns: A set of columns to include in the output
    :param all_cols: Should we show all columns? If true then `columns` is ignored
    """
    async with websockets.connect(ws) as websocket:
        await websocket.send(json.dumps({APIField.COMMAND: 'STATUS', APIField.REQUEST: work}))

        results = json.loads(await websocket.recv())
        if results[APIField.STATUS] == APIStatus.ERROR:
            LOGGER.error(results)
            return
        if results[APIField.STATUS] == APIStatus.OK:
            results = results[APIField.RESPONSE]
            for result in results:
                rows = [Row(**r) for r in result['task_statuses']]
                show_status(Pipeline(**result['pipeline_status']), rows, columns, all_cols)

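# Usage sketch: the request_* coroutines in this module are driven from
# synchronous code with asyncio, the same way `main` below drives
# `schedule_pipeline`. The host/port here are illustrative.
def _example_request_status() -> None:
    asyncio.get_event_loop().run_until_complete(
        request_status('ws://localhost:9003', 'my-pipeline', columns={'task', 'status'}))
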
async def request_logs(ws: str, resource: str, namespace: str, container: str, follow: bool, lines: Optional[int] = None) -> None:
    """Make the websocket request to get the logs.

    :param ws: The websocket host
    :param resource: The thing to get logs from
    :param namespace: The namespace the pod is in
    :param container: The container to get the logs from, only needed when there are multiple containers in a pod
    :param follow: Should we fetch all the logs available right now or stream them as they come in?
    :param lines: How many lines of the log to fetch; fetch everything if `None`
    """
    work = {'resource': resource, 'namespace': namespace, 'follow': follow}
    if container is not None:
        work['container'] = container
    if lines is not None:
        work['lines'] = lines
    async with websockets.connect(ws) as websocket:
        await websocket.send(json.dumps({APIField.COMMAND: 'LOGS', APIField.REQUEST: work}))

        line = json.loads(await websocket.recv())
        while line[APIField.STATUS] != APIStatus.END:
            if line[APIField.STATUS] == APIStatus.ERROR:
                LOGGER.error(line)
                break
            LOGGER.info(line[APIField.RESPONSE])
            line = json.loads(await websocket.recv())

async def request_events(url: str, resource: str, namespace: str = 'default') -> None:
    """Get k8s events for some resource.

    :param url: The location of the server
    :param resource: The name of the resource you are asking about.
    :param namespace: The namespace of the resource you are asking about.
    """
    async with websockets.connect(url) as websocket:
        await websocket.send(
            json.dumps({APIField.COMMAND: 'EVENTS', APIField.REQUEST: {'resource': resource, 'namespace': namespace}}))
        resp = json.loads(await websocket.recv())
        if resp[APIField.STATUS] == APIStatus.ERROR:
            LOGGER.error(resp)
            return
        if resp[APIField.STATUS] == APIStatus.OK:
            rows = [Event(**r) for r in resp[APIField.RESPONSE]]
            print_table(rows)

def authenticate_user(url: str, token_path: str, username: str, password: str) -> None:
    """Authenticate a user over HTTP

    :param url: The base URL
    :param token_path: The file location of the JWT token
    :param username: The user ID
    :param password: The password
    """
    if os.path.exists(token_path):
        os.remove(token_path)
    jwt_token = get_jwt_token(url, token_path, username, password)
    LOGGER.info(jwt_token)

def cleanup(
    work: str,
    store: Store,
    sched: Optional[TaskManager] = None,
    purge_db: bool = False,
    purge_fs: bool = False,
    data_dir: Optional[str] = None,
) -> List[Cleaned]:
    """Clean up a job

    :param work: The name of the job
    :param store: The job store
    :param sched: The scheduler used to kill jobs.
    :param purge_db: Should the pipeline also be removed from the job db?
    :param purge_fs: Should the pipeline also be removed from the file system?
    :param data_dir: A directory that, combined with `work`, is where all artifacts produced by the pipeline live.
    :returns: The list of jobs and whether each was removed from k8s, the job db, and the file system.
    """
    if not work:
        return []
    sched = sched if sched else KubernetesTaskManager(store)
    parent_details = store.get(work)
    children = list(chain(parent_details[Store.EXECUTED], parent_details[Store.EXECUTING]))
    cleaned = set()
    purged = set()
    removed = set()
    if purge_fs:
        if data_dir is None:
            LOGGER.warning("Requested removal from the file system but no data directory provided.")
        else:
            shutil.rmtree(os.path.join(data_dir, work), ignore_errors=True)
            removed = set(chain([work], children))
    for job in children:
        try:
            sched.kill(job)
            cleaned.add(job)
        except:  # pylint: disable=bare-except
            pass
        if purge_db:
            if store.remove(job):
                purged.add(job)
    # Remove the work entry from the db last so that if there is an error beforehand we can still use the db entry.
    if purge_db:
        if store.remove(work):
            purged.add(work)
    return [Cleaned(j, done(j, cleaned), done(j, purged), done(j, removed)) for j in chain([work], children)]

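# Usage sketch: kill a pipeline's jobs and purge every trace of it, assuming
# `store` is a concrete `Store` backed by the job db. Each returned `Cleaned`
# entry records whether the job was killed, purged from the db, and removed
# from the file system, per the constructor call above. The paths are
# illustrative.
def _example_cleanup(store: Store) -> None:
    for entry in cleanup('my-pipeline', store, purge_db=True, purge_fs=True, data_dir='/data/pipelines'):
        LOGGER.info(entry)
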
async def ping(uri: str, message: str) -> None:
    """Ping odin at `uri` and send `message`.

    :param uri: The location of the server
    :param message: The message you expect to see back
    :raises RuntimeError: If the server returns an error
    """
    async with websockets.connect(uri) as websocket:
        await websocket.send(json.dumps({APIField.COMMAND: 'PING', APIField.REQUEST: message}))
        resp = json.loads(await websocket.recv())
        if resp[APIField.STATUS] == APIStatus.ERROR:
            LOGGER.error(resp)
            raise RuntimeError(resp)
        LOGGER.info(resp[APIField.RESPONSE])

async def request_cleanup(ws: str, work: str, purge_db: bool = False, purge_fs: bool = False) -> None:
    """Request that the work is cleaned up by the server."""
    async with websockets.connect(ws) as websocket:
        args = {'work': work, 'purge_db': purge_db, 'purge_fs': purge_fs}
        await websocket.send(json.dumps({APIField.COMMAND: 'CLEANUP', APIField.REQUEST: args}))
        results = json.loads(await websocket.recv())
        if results[APIField.STATUS] == APIStatus.ERROR:
            LOGGER.error(results)
            return
        if results[APIField.STATUS] == APIStatus.OK:
            cleaned = results[APIField.RESPONSE]
            print("Results of this request:")
            print_table([Cleaned(**c) for c in cleaned])

async def request_generate_config(ws, config):
    """Use async to open a connection to serve.py and generate a config."""
    async with websockets.connect(ws) as websocket:
        await websocket.send(json.dumps({APIField.COMMAND: 'GENERATE', APIField.REQUEST: config}))
        result = json.loads(await websocket.recv())
        if result[APIField.STATUS] == APIStatus.ERROR:
            LOGGER.error(result)
            return
        if result[APIField.STATUS] == APIStatus.OK:
            LOGGER.info('Generated pipeline is called %s', result[APIField.RESPONSE])

def submit(self, task: Task) -> str:
    """Submit a multi-worker PyTorchJob Task

    :param task: The task definition
    :type task: Task
    :return: A string handle name
    :rtype: str
    """
    secrets = self._reference_secrets(task)
    configmaps = self._generate_configmaps(task)
    # Pin each worker to a single GPU before building the pod spec
    task.num_gpus = 1
    pod_spec = task_to_pod_spec(task, container_name="pytorch-elasticjob", secrets=secrets, configmaps=configmaps)
    template_metadata = client.V1ObjectMeta(name=task.name)
    template = client.V1PodTemplateSpec(metadata=template_metadata, spec=pod_spec)

    worker_replica_spec = {}
    worker_replica_spec['replicas'] = task.num_workers
    worker_replica_spec['restartPolicy'] = PyTorchElasticJobHandler.EXIT_CODE
    worker_replica_spec['template'] = template

    spec = {}
    spec['replicaSpecs'] = {}
    spec['replicaSpecs']['Worker'] = worker_replica_spec
    spec['minReplicas'] = task.num_workers
    spec['maxReplicas'] = task.num_workers

    etcd_svc = getenv('PYTORCH_ELASTIC_ETCD_SVC')
    if not etcd_svc:
        LOGGER.warning("No environment variable set for etcd service, looking for first available in elastic-job namespace")
        api = client.CoreV1Api()
        etcd_svc = [x for x in api.list_namespaced_service('elastic-job').items
                    if x.metadata.name == 'etcd-service'][0].spec.cluster_ip
    LOGGER.info("Using etcd service on %s:%d", etcd_svc, PyTorchElasticJobHandler.ETCD_PORT)
    spec['rdzvEndpoint'] = f'{etcd_svc}:{PyTorchElasticJobHandler.ETCD_PORT}'

    pytorch_job_spec = {}
    pytorch_job_spec['kind'] = PyTorchElasticJobHandler.NAME
    pytorch_job_spec['apiVersion'] = f'{PyTorchElasticJobHandler.GROUP}/{PyTorchElasticJobHandler.VERSION}'
    pytorch_job_spec['metadata'] = client.V1ObjectMeta(generate_name=task.name)
    pytorch_job_spec['spec'] = spec

    pytorch_job = self.api.create_namespaced_custom_object(
        PyTorchElasticJobHandler.GROUP,
        PyTorchElasticJobHandler.VERSION,
        self.namespace,
        PyTorchElasticJobHandler.PLURAL,
        pytorch_job_spec,
    )
    return pytorch_job['metadata']['name']

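# For reference, the custom object assembled above serializes to a manifest
# shaped like the sketch below. The literal kind/apiVersion/restartPolicy
# values are assumptions standing in for the PyTorchElasticJobHandler class
# constants, and the rendezvous endpoint uses etcd's conventional port.
EXAMPLE_ELASTIC_JOB = {
    'kind': 'ElasticJob',                          # PyTorchElasticJobHandler.NAME (assumed)
    'apiVersion': 'elastic.pytorch.org/v1alpha1',  # f'{GROUP}/{VERSION}' (assumed)
    'metadata': {'generateName': 'my-task'},
    'spec': {
        'replicaSpecs': {
            'Worker': {
                'replicas': 2,                     # task.num_workers
                'restartPolicy': 'ExitCode',       # PyTorchElasticJobHandler.EXIT_CODE (assumed)
                'template': '...',                 # the V1PodTemplateSpec built from the task
            },
        },
        'minReplicas': 2,
        'maxReplicas': 2,
        'rdzvEndpoint': '10.0.0.1:2379',           # etcd service cluster IP and ETCD_PORT
    },
}
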
def main():
    """Use `asyncio` to connect to a websocket and request a pipeline, then wait for it to finish."""
    signal.signal(signal.SIGINT, lambda *args, **kwargs: exit(0))

    parser = argparse.ArgumentParser(description='HTTP or Websocket-based Pipeline scheduler')
    parser.add_argument('work', help='Job')
    parser.add_argument('--host', default=ODIN_URL, type=str)
    parser.add_argument('--port', default=ODIN_PORT)
    parser.add_argument('--token', help="File where JWT token can reside", default=os.path.expanduser("~/.odin.token"))
    parser.add_argument('--username', '-u', help="Username", default=getuser())
    parser.add_argument('--password', '-p', help="Password")
    parser.add_argument(
        '--scheme',
        choices={'http', 'wss', 'ws', 'https'},
        default=ODIN_SCHEME,
        help='Connection protocol, use `http` for REST, use `wss` for remote connections and `ws` for localhost',
    )
    args, overrides = parser.parse_known_args()
    context = parse_and_merge_overrides({}, overrides, pre='x')

    url = f'{args.scheme}://{args.host}:{args.port}'
    if args.scheme.startswith('ws'):
        if context:
            LOGGER.warning("Context is ignored by web-socket tier")
        asyncio.get_event_loop().run_until_complete(schedule_pipeline(url, args.work))
    else:
        jwt_token = get_jwt_token(url, args.token, args.username, args.password)
        try:
            schedule_pipeline_http(url, jwt_token, args.work, context)
        except ValueError:
            # The token may have expired: delete the token file and start again
            if os.path.exists(args.token):
                os.remove(args.token)
            jwt_token = get_jwt_token(url, args.token, args.username, args.password)
            schedule_pipeline_http(url, jwt_token, args.work, context)

def expand_dirs(files: List[str]) -> List[str]:
    """Given a list of files and dirs, return a list of all the files, with dirs expanded.

    :param files: The list of files and dirs.
    :returns: The list with dirs expanded into the files contained within them.
    """
    new_files = []
    for f in files:
        f = os.path.expanduser(f)
        if not os.path.exists(f):
            LOGGER.warning("Requested hash of %s but file not found.", f)
            continue
        if os.path.isdir(f):
            new_files.extend(expand_dir(f))
        else:
            new_files.append(f)
    return new_files

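# Behavior sketch: with a layout like
#   data/
#     a.txt
#     b.txt
# expand_dirs(['data', 'c.txt']) yields ['data/a.txt', 'data/b.txt', 'c.txt'];
# the ordering within the dir depends on the `expand_dir` helper.
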
def _authenticate(url, username, passwd):
    """Authenticate against `{url}/v1/auth` and return the token in the response's 'message' field."""
    response = None
    url = f'{url}/v1/auth'
    try:
        # First try form-encoded credentials
        response = requests.post(url, data={'username': username, 'password': passwd})
        results = response.json()
        return results['message']
    except Exception:
        try:
            # Retry with a JSON body
            response = requests.post(url, json={'username': username, 'password': passwd})
            results = response.json()
            return results['message']
        except Exception as ex:
            LOGGER.error(url)
            if response:
                LOGGER.error(response.status_code)
            raise ex

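# Usage sketch: exchange credentials for a token and attach it as a bearer
# header, the same header shape `create_user_http` uses. The URL is
# illustrative.
def _example_auth_headers() -> dict:
    import getpass  # local import; the module does not otherwise need it
    token = _authenticate('http://localhost:9003', getuser(), getpass.getpass())
    return {'Authorization': f'Bearer {token}'}
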
async def request_pipeline_definitions(ws: str, pipeline: str) -> None:
    """Use async to open a connection to serve.py and get a pipeline definition."""
    async with websockets.connect(ws) as websocket:
        await websocket.send(json.dumps({APIField.COMMAND: 'SHOW', APIField.REQUEST: pipeline}))
        result = json.loads(await websocket.recv())
        if result[APIField.STATUS] == APIStatus.ERROR:
            LOGGER.error(result)
            return
        if result[APIField.STATUS] == APIStatus.OK:
            for file_name, file_contents in result[APIField.RESPONSE].items():
                LOGGER.info(file_name)
                LOGGER.info("=" * 100)
                LOGGER.info(file_contents)
                LOGGER.info("")

def _reference_secrets(self, task: Task) -> Optional[List[Secret]]:
    """Generate secrets based on the requirements of the job.

    Eventually we can support custom secrets by having the job create
    secrets from the yaml config. Then this function will combine secrets
    on the job with these injected secrets to yield the final full list.

    :param task: The job we are running to add secrets to.
    :type task: Task
    :returns: A list of Secrets or `None`
    :rtype: Optional[List[Secret]]
    """
    secrets = task.secrets if task.secrets is not None else []
    command = listify(task.command)
    if command[0].startswith('odin'):
        try:
            # Check if the odin-cred secret exists
            _ = self.core_api.read_namespaced_secret(name=ODIN_CRED, namespace=self.namespace)
            cred_secret = Secret(os.path.join(SECRET_LOC, ODIN_CRED_FILE), ODIN_CRED, ODIN_CRED_FILE)
            # Make sure they aren't already requesting this secret
            if not any(s == cred_secret for s in secrets):
                secrets.append(cred_secret)
        except client.rest.ApiException:
            if '--cred' not in task.args:
                LOGGER.warning('No --cred arg found on job %s and no odin-cred secret found to populate container.', task.name)
    if command[0].startswith('odin-chores'):
        try:
            # Check if the ssh-key secret exists
            _ = self.core_api.read_namespaced_secret(name=SSH_KEY, namespace=self.namespace)
            # Make the key permissions -rw-------
            ssh_secret = Secret(os.path.join(SECRET_LOC, SSH_KEY_FILE), SSH_KEY, SSH_KEY_FILE, SSH_MODE)
            # Make sure they aren't already requesting this secret
            if not any(s == ssh_secret for s in secrets):
                secrets.append(ssh_secret)
        except client.rest.ApiException:
            pass
    return secrets if secrets else None

async def request_data(url: str, resource: str) -> None:
    """Get k8s data for some resource.

    :param url: The location of the server
    :param resource: The name of the resource you are asking about.
    """
    async with websockets.connect(url) as websocket:
        await websocket.send(json.dumps({APIField.COMMAND: 'DATA', APIField.REQUEST: {'resource': resource}}))
        resp = json.loads(await websocket.recv())
        if resp[APIField.STATUS] == APIStatus.ERROR:
            LOGGER.error(resp)
            return
        if resp[APIField.STATUS] == APIStatus.OK:
            print(json.dumps(resp[APIField.RESPONSE]))

async def schedule_pipeline(ws, work) -> None:
    """Use async to open a connection to serve.py and launch work

    Blocks until the job completes (the websocket stays open for the duration)
    """
    async with websockets.connect(ws) as websocket:
        await websocket.send(json.dumps({APIField.COMMAND: 'START', APIField.REQUEST: work}))

        result = json.loads(await websocket.recv())
        while result[APIField.STATUS] != APIStatus.END:
            if result[APIField.STATUS] == APIStatus.ERROR:
                LOGGER.error(result)
                return
            if result[APIField.RESPONSE].startswith('PIPE_ID'):
                pipe_id = result[APIField.RESPONSE].split(' ')[-1]
                LOGGER.info('Started %s', pipe_id)
            else:
                LOGGER.info(result[APIField.RESPONSE])
            result = json.loads(await websocket.recv())

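# Wire-format sketch of a scheduling session. The literal field and status
# strings are assumptions standing in for the APIField/APIStatus constants:
#   -> {"command": "START", "request": "my-pipeline"}
#   <- {"status": "OK", "response": "PIPE_ID my-pipeline-4k2b"}
#   <- {"status": "OK", "response": "...progress messages..."}
#   <- {"status": "END", "response": "..."}
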
def create_user_http(url: str, jwt_token: str, username: str, password: str, firstname: str, lastname: str) -> None:
    """Create or update a user over HTTP

    :param url: The base URL
    :param jwt_token: The JWT token representing this authentication
    :param username: The user ID
    :param password: The updated password
    :param firstname: The first name
    :param lastname: The last name
    """
    user = {"username": username, "password": password}
    if firstname:
        user['firstname'] = firstname
    if lastname:
        user['lastname'] = lastname
    headers = {'Authorization': f'Bearer {jwt_token}'}

    try:
        response = requests.get(f'{url}/v1/users/{username}', headers=headers)
        if response.status_code == 401:
            raise ValueError("Invalid login")
        if response.status_code != 200:
            # No such user exists, so do a POST to create one
            response = requests.post(f'{url}/v1/users', headers=headers, json={"user": user})
            if response.status_code != 200:
                raise Exception(f"Failed to create user: {username}")
            results = response.json()
            LOGGER.info("Created new user")
            LOGGER.info(json.dumps(results))
            return
        results = response.json()
        LOGGER.info("Found existing user")
        LOGGER.info(json.dumps(results))
    except Exception as ex:
        LOGGER.error(ex)
        return
    # The user exists, so update them with a PUT
    response = requests.put(f'{url}/v1/users/{username}', json=user, headers=headers)
    results = response.json()
    LOGGER.info(json.dumps(results))

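# REST flow implied above:
#   GET  /v1/users/{username}          -> 200 if the user exists, 401 on a bad or expired token
#   POST /v1/users   {"user": {...}}   -> create a new user
#   PUT  /v1/users/{username}  {...}   -> update the existing user
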