Exemplo n.º 1
0
def cleanup(
    work: str,
    store: Store,
    sched: Optional[TaskManager] = None,
    purge_db: bool = False,
    purge_fs: bool = False,
    data_dir: Optional[str] = None,
) -> List[Cleaned]:
    """Kill the jobs of a pipeline and optionally purge its db and file system records.

    The child jobs are the names stored under ``Store.EXECUTED`` and
    ``Store.EXECUTING`` in the store entry for ``work``. Killing is best
    effort: a failure to kill one job is logged and does not stop the cleanup
    of the remaining jobs.

    :param work: The name of the job. An empty name results in an empty list.
    :param store: The job store
    :param sched: The scheduler used to kill jobs. When not provided a
        `KubernetesTaskManager` built on `store` is used.
    :param purge_db: Should the pipeline also be removed from the job db.
    :param purge_fs: Should the pipeline also be removed from the file system.
    :param data_dir: A directory that combined with work is where all artifacts
        produced by the pipeline live. Required for `purge_fs` to have an effect.

    :returns: One `Cleaned` record for `work` followed by one per child job,
        each built from the job name and whether it was killed, purged from
        the job db, and removed from the file system.
    """
    if not work:
        return []
    sched = sched if sched else KubernetesTaskManager(store)
    parent_details = store.get(work)
    children = list(
        chain(parent_details[Store.EXECUTED], parent_details[Store.EXECUTING]))
    cleaned = set()
    purged = set()
    removed = set()
    if purge_fs:
        if data_dir is None:
            LOGGER.warning(
                "Requested removal from the file system but no data directory provided."
            )
        else:
            # The whole pipeline directory is deleted in one go, so the parent
            # and every child are reported as removed together.
            shutil.rmtree(os.path.join(data_dir, work), ignore_errors=True)
            removed = set(chain([work], children))
    for job in children:
        try:
            sched.kill(job)
        except Exception:  # pylint: disable=broad-except
            # Best effort: keep going with the other jobs, but do not swallow
            # KeyboardInterrupt/SystemExit and leave a trace of what failed.
            LOGGER.debug("Failed to kill job %s during cleanup.", job, exc_info=True)
        else:
            cleaned.add(job)
        if purge_db and store.remove(job):
            purged.add(job)
    # Remove the work entry from the db last so if there is an error before hand we can still use the db entry.
    if purge_db and store.remove(work):
        purged.add(work)
    return [
        Cleaned(j, done(j, cleaned), done(j, purged), done(j, removed))
        for j in chain([work], children)
    ]
Exemplo n.º 2
0
    def submit(self, task: Task) -> str:
        """Submit a Task as a multi-worker PyTorch elastic job custom object.

        The rendezvous endpoint is taken from the `PYTORCH_ELASTIC_ETCD_SVC`
        environment variable. When that is not set, the cluster IP of the
        service named `etcd-service` in the `elastic-job` namespace is used.

        :param task: The task definition
        :type task: Task
        :raises IndexError: If the environment variable is not set and no
            `etcd-service` service can be found.
        :return: A string handle name
        :rtype: str
        """
        secrets = self._reference_secrets(task)
        configmaps = self._generate_configmaps(task)
        # NOTE: this overwrites `num_gpus` on the task object passed in by the caller.
        task.num_gpus = 1
        pod_spec = task_to_pod_spec(task, container_name="pytorch-elasticjob", secrets=secrets, configmaps=configmaps)
        template_metadata = client.V1ObjectMeta(name=task.name)

        template = client.V1PodTemplateSpec(metadata=template_metadata, spec=pod_spec)

        worker_replica_spec = {
            'replicas': task.num_workers,
            'restartPolicy': PyTorchElasticJobHandler.EXIT_CODE,
            'template': template,
        }

        # min and max replicas are both pinned to the requested worker count.
        spec = {
            'replicaSpecs': {'Worker': worker_replica_spec},
            'minReplicas': task.num_workers,
            'maxReplicas': task.num_workers,
        }
        etcd_svc = getenv('PYTORCH_ELASTIC_ETCD_SVC')
        if not etcd_svc:
            LOGGER.warning("No environment variable set for etcd service, looking for first available in elastic-job namespace")
            api = client.CoreV1Api()
            services = api.list_namespaced_service('elastic-job').items
            etcd_service = next((svc for svc in services if svc.metadata.name == 'etcd-service'), None)
            if etcd_service is None:
                # Same exception type the bare `[0]` lookup used to raise, but
                # with a message that says what is missing and how to fix it.
                raise IndexError(
                    "No service named 'etcd-service' found in the 'elastic-job' namespace; "
                    "set PYTORCH_ELASTIC_ETCD_SVC to the etcd service address."
                )
            etcd_svc = etcd_service.spec.cluster_ip
        LOGGER.info("Using etcd service on %s:%d", etcd_svc, PyTorchElasticJobHandler.ETCD_PORT)
        spec['rdzvEndpoint'] = f'{etcd_svc}:{PyTorchElasticJobHandler.ETCD_PORT}'

        pytorch_job_spec = {
            'kind': PyTorchElasticJobHandler.NAME,
            'apiVersion': f'{PyTorchElasticJobHandler.GROUP}/{PyTorchElasticJobHandler.VERSION}',
            # generate_name asks the API server to derive the final name from the
            # task name, which is why the created object's name is what gets returned.
            'metadata': client.V1ObjectMeta(generate_name=task.name),
            'spec': spec,
        }

        pytorch_job = self.api.create_namespaced_custom_object(
            PyTorchElasticJobHandler.GROUP,
            PyTorchElasticJobHandler.VERSION,
            self.namespace,
            PyTorchElasticJobHandler.PLURAL,
            pytorch_job_spec,
        )
        return pytorch_job['metadata']['name']
Exemplo n.º 3
0
def main():
    """Parse the command line and request that a pipeline be scheduled.

    With a `ws`/`wss` scheme the request is sent over a websocket using
    `asyncio`. Otherwise a JWT token is obtained and the request goes over
    HTTP. Unrecognized command line arguments are parsed into a context that
    is only forwarded on the HTTP path.

    If the HTTP request raises a `ValueError` and the token file exists, the
    token file is deleted, a fresh token is obtained, and the request is
    retried once. If there is no token file to delete, the error is re-raised.
    """
    signal.signal(signal.SIGINT, lambda *args, **kwargs: exit(0))

    parser = argparse.ArgumentParser(
        description='HTTP or Websocket-based Pipeline scheduler')
    parser.add_argument('work', help='Job')
    parser.add_argument('--host', default=ODIN_URL, type=str)
    parser.add_argument('--port', default=ODIN_PORT)
    parser.add_argument('--token',
                        help="File where JWT token can reside",
                        default=os.path.expanduser("~/.odin.token"))
    parser.add_argument('--username', '-u', help="Username", default=getuser())
    parser.add_argument('--password', '-p', help="Password")
    parser.add_argument(
        '--scheme',
        # An ordered sequence keeps the `--help` output stable between runs;
        # a set is rendered in hash order.
        choices=('http', 'https', 'ws', 'wss'),
        default=ODIN_SCHEME,
        help=
        'Connection protocol, use `http` for REST, use `wss` for remote connections and `ws` for localhost',
    )

    args, overrides = parser.parse_known_args()
    context = parse_and_merge_overrides({}, overrides, pre='x')

    url = f'{args.scheme}://{args.host}:{args.port}'

    if args.scheme.startswith('ws'):
        if context:
            LOGGER.warning("Context is ignored by web-socket tier")
        asyncio.get_event_loop().run_until_complete(
            schedule_pipeline(url, args.work))
    else:
        jwt_token = get_jwt_token(url, args.token, args.username,
                                  args.password)
        try:
            schedule_pipeline_http(url, jwt_token, args.work, context)
        except ValueError:
            if not os.path.exists(args.token):
                # Nothing to delete and retry with, so surface the failure
                # instead of exiting as if the pipeline had been scheduled.
                raise
            # Try deleting the token file and start again
            os.remove(args.token)
            jwt_token = get_jwt_token(url, args.token, args.username,
                                      args.password)
            schedule_pipeline_http(url, jwt_token, args.work, context)
Exemplo n.º 4
0
def expand_dirs(files: List[str]) -> List[str]:
    """Flatten a mix of file and directory paths into a list of file paths.

    Each path has `~` expanded first. Paths that do not exist are skipped with
    a warning, directories are replaced by whatever `expand_dir` yields for
    them, and every other path is kept as is.

    :param files: The list of files and dirs.

    :returns: The list with dirs expanded into the files contained within them.
    """
    expanded = []
    for entry in files:
        path = os.path.expanduser(entry)
        if os.path.exists(path):
            if os.path.isdir(path):
                expanded.extend(expand_dir(path))
            else:
                expanded.append(path)
        else:
            LOGGER.warning("Requested hash of %s but file not found.", path)
    return expanded
Exemplo n.º 5
0
    def _reference_secrets(self, task: Task) -> Optional[List[Secret]]:
        """Generate secrets based on the requirements of the job.

        Eventually we can support custom secrets by having the job create
        secrets from the yaml config. Then this function will combine secrets
        on the job with these injected secrets to yield the final full list.

        A secret is only injected when it exists in this handler's namespace
        and the task is not already requesting it. When `task.secrets` is not
        `None` the injected secrets are appended to that same list in place.

        :param task: The job we are running to add secrets to.
        :type task: Task
        :returns: A list of Secrets or `None`
        :rtype: Optional[List[Secret]]
        """
        secrets = task.secrets if task.secrets is not None else []
        command = listify(task.command)
        # A task without a command has no program name to inspect, so no
        # secrets are injected for it (previously this raised an IndexError).
        program = command[0] if command else ''
        if program.startswith('odin'):
            try:
                # Check if the odin-cred secret exists
                _ = self.core_api.read_namespaced_secret(
                    name=ODIN_CRED, namespace=self.namespace)
                cred_secret = Secret(os.path.join(SECRET_LOC, ODIN_CRED_FILE),
                                     ODIN_CRED, ODIN_CRED_FILE)
                # Make sure they aren't already requesting this secret
                if not any(s == cred_secret for s in secrets):
                    secrets.append(cred_secret)
            except client.rest.ApiException:
                # `task.args` may be missing; treat that the same as no args.
                if '--cred' not in (task.args or []):
                    LOGGER.warning(
                        'No --cred arg found on job %s and no odin-cred secret found to populate container.',
                        task.name)
        # `odin-chores` also starts with `odin`, so such a task goes through
        # both the credential branch above and the ssh key branch here.
        if program.startswith('odin-chores'):
            try:
                # Check if the ssh-key secret exists
                _ = self.core_api.read_namespaced_secret(
                    name=SSH_KEY, namespace=self.namespace)
                # Make the key permissions -rw-------
                ssh_secret = Secret(os.path.join(SECRET_LOC, SSH_KEY_FILE),
                                    SSH_KEY, SSH_KEY_FILE, SSH_MODE)
                # Make sure they aren't already requesting this secret
                if not any(s == ssh_secret for s in secrets):
                    secrets.append(ssh_secret)
            except client.rest.ApiException:
                # The ssh key is optional: run without it when it cannot be read.
                pass
        return secrets if secrets else None