Example #1
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' job-pull')
        parser.add_argument(
            'id',
            help="Short or long job id, like ef8009d83a9892968097cec05b9467c685d45453"
        )
        parser.add_argument(
            '--model',
            help="Model name like peter/mnist. By default taken from the configuration."
        )
        parser.add_argument(
            '-c',
            '--config',
            help="Defaults to aetros.yml in the current working directory or a directory above."
        )

        parsed_args = parser.parse_args(args)

        if not parsed_args.id:
            parser.print_help()
            sys.exit(1)

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print(
                "No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'."
            )
            sys.exit(2)

        full_id = git_has_remote_job(home_config, model, parsed_args.id)
        if not full_id:
            print("Error: Job not found on remote.")
            sys.exit(1)

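        # Jobs live under a dedicated ref namespace inside the model's local bare repository.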
        ref = 'refs/aetros/job/' + full_id
        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model +
                                   '.git')

        git_remote_url = 'git@%s:%s.git' % (home_config['host'], model)

        if not os.path.isdir(git_dir):
            subprocess.call([
                home_config['git'], 'clone', '--bare', git_remote_url, git_dir
            ])

        print('Pull job %s of %s' % (parsed_args.id, model))
        setup_git_ssh(home_config)
        subprocess.call([
            home_config['git'], '--bare', '--git-dir', git_dir, 'fetch',
            'origin', ref + ':' + ref
        ])
Example #2
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' id')

        parsed_args = parser.parse_args(args)
        config = read_home_config()

        try:
            user = api.user()
        except KeyNotConfiguredException as e:
            self.logger.error(str(e))
            sys.exit(1)

        print("Logged in as %s (%s) on %s" %
              (user['username'], user['name'], config['host']))

        if len(user['accounts']) > 0:
            for orga in six.itervalues(user['accounts']):
                print("  %s of organisation %s (%s)." %
                      ("Owner" if orga['memberType'] == 1 else "Member",
                       orga['username'], orga['name']))
        else:
            print("  Without membership to an organisation.")
Example #3
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' model')

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()
        config_path = find_config_path()

        if not config_path:
            print(
                "No model configuration file (aetros.yml) found. Switch into a directory that contains one first."
            )
            sys.exit(1)

        config = find_config(error_on_missing=True)
        print("Model %s in %s used in all aetros commands." %
              (config['model'], os.path.dirname(config_path)))

        git_remote_url = 'git@%s:%s.git' % (home_config['host'],
                                            config['model'])
        print("Git url: %s" % (git_remote_url, ))
Example #4
def request(path, query=None, body=None, method='get', config=None):
    query = query or {}

    if isinstance(query, dict):
        query = urlencode(query)

    if '?' in path:
        path += '&' + query
    else:
        path += '?' + query

    config = read_home_config() if config is None else config

    if method == 'get' and body is not None:
        method = 'post'

    ssh_stream = create_ssh_stream(config)
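    # The remote end exposes the API over SSH as "api <method> <path>", with the path JSON-encoded.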
    stdin, stdout, stderr = ssh_stream.exec_command('api ' + method + ' ' + simplejson.dumps(path))

    if body is not None:
        payload = six.b(simplejson.dumps(body))
        stdin.write(payload)
        stdin.flush()
        stdin.channel.shutdown_write()

    stdout = drain_stream(stdout)
    stderr = drain_stream(stderr)

    if len(stderr) > 0:
        if hasattr(stderr, 'decode'):
            stderr = stderr.decode('utf-8')

        raise ApiError('Could not request api: ' + config['host'] + path, stderr)

    return stdout
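
A minimal usage sketch for the helper above; the endpoint path and query values are hypothetical placeholders, and ApiError is the exception raised by request() itself:

    # Hypothetical endpoint and model id, for illustration only.
    try:
        raw = request('model', query={'id': 'peter/mnist'})
        print(raw)
    except ApiError as e:
        print('API request failed: %s' % (e,))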
Example #5
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' job-diff')
        parser.add_argument('id_from', help="Short or long job id like ed4d6a204.")
        parser.add_argument('id_to', nargs='?', help="Short or long job id like d55df24a7 or file path")
        parser.add_argument('limit', nargs='?', help="Limit files to diff")
        parser.add_argument('--model', help="Model name like peter/mnist. By default taken from the configuration.")
        parser.add_argument('-c', '--config', help="Defaults to aetros.yml in the current working directory or a directory above.")

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

        id_map = {}

        for job_id in [parsed_args.id_from, parsed_args.id_to]:
            if os.path.exists(job_id):
                continue

            full_id = git_has_local_job(home_config, model, job_id)
            id_map[job_id] = full_id
            if not full_id:
                full_id = git_has_remote_job(home_config, model, job_id)
                id_map[job_id] = full_id
                if full_id:
                    print("Pull job %s to local ... " % (job_id, ))
                    ref = 'refs/aetros/job/' + full_id
                    subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref+':'+ref])
                else:
                    print("Job %s not found." % (job_id, ))
                    sys.exit(2)

        print("Diff jobs %s and %s of %s." %(parsed_args.id_from, parsed_args.id_to, model))

        from_ref = 'refs/aetros/job/' + id_map[parsed_args.id_from]
        args = [home_config['git'], '--bare', '--git-dir', git_dir]

        if os.path.exists(parsed_args.id_to):
            args += ['--work-tree', os.path.abspath(parsed_args.id_to), 'diff', from_ref]
        else:
            to_ref = 'refs/aetros/job/' + id_map[parsed_args.id_to]
            args += ['diff', from_ref+'...'+to_ref]

        if parsed_args.limit:
            args += ['--', parsed_args.limit]

        subprocess.call(args)
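
For example, to diff two jobs (placeholder ids taken from the help texts):

  $ aetros job-diff ed4d6a204 d55df24a7 --model peter/mnist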
Example #6
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' job-files')
        parser.add_argument('job_id', help="Short or long job id like ed4d6a204")
        parser.add_argument('folder', nargs='?', help="Limit the file list to a folder. Defaults to the root ./")
        parser.add_argument('-r', action='store_true', help="List the file tree recursively")
        parser.add_argument('--model', help="Model name like peter/mnist. By default taken from the found configuration.")
        parser.add_argument('-c', '--config', help="Defaults to aetros.yml in the current working directory or a directory above.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.job_id:
            parser.print_help()
            sys.exit()

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

        id_map = {}

        for job_id in [parsed_args.job_id]:
            full_id = git_has_local_job(home_config, model, job_id)
            id_map[job_id] = full_id
            if not full_id:
                full_id = git_has_remote_job(home_config, model, job_id)
                id_map[job_id] = full_id
                if full_id:
                    print("Pull job %s to local ... " % (job_id, ))
                    ref = 'refs/aetros/job/' + full_id
                    subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref+':'+ref])
                else:
                    print("Job %s not found." % (job_id, ))
                    sys.exit(2)

        ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]

        print("List job files of %s of %s" % (parsed_args.job_id, model))
        args = [home_config['git'], '--bare', '--git-dir', git_dir, 'ls-tree', '--long']
        if parsed_args.r:
            args.append('-r')

        args.append(ref)

        if parsed_args.folder:
            args.append(parsed_args.folder)
        subprocess.call(args)
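
For example, to list a job's file tree recursively (placeholder job id):

  $ aetros job-files ed4d6a204 -r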
Example #7
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' jobs')
        parser.add_argument('--all', '-a', action='store_true', help="Show remote jobs as well")
        parser.add_argument('--model', help="Model name like peter/mnist. By default taken from the configuration.")
        parser.add_argument('-c', '--config', help="Defaults to aetros.yml in the current working directory or a directory above.")

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        print("Show jobs of model " + model + ' ('+home_config['host']+')')

        setup_git_ssh(home_config)
        local_job_ids = git_local_job_ids(home_config, model)
        remote_job_ids = []

        try:
            remote_job_ids = git_remote_job_ids(home_config, model)
        except Exception:
            # The remote may be unreachable or not authenticated; fall back to listing local jobs only.
            pass

        job_map = OrderedDict()
        for job_id in local_job_ids:
            job_map[job_id] = {'local': Color('{autogreen}Yes{/autogreen}'), 'remote': Color('{autored}No{/autored}'),}

        for job_id in remote_job_ids:
            if job_id in job_map:
                job_map[job_id]['remote'] = Color('{autogreen}Yes{/autogreen}')
            elif parsed_args.all:
                job_map[job_id] = {'local': Color('{autored}No{/autored}'), 'remote': Color('{autogreen}Yes{/autogreen}')}

        print("%d jobs found. (%d synced to remote)" % (len(job_map), len(remote_job_ids)))
        if not parsed_args.all:
            print("Use --all to show remote-only jobs as well.")

        table_data = [['Short Job ID', 'Local', 'Remote', 'Long Job ID']]

        for job_id, info in six.iteritems(job_map):
            table_data.append([job_id[0:9], info['local'], info['remote'], job_id])

        table = AsciiTable(table_data)
        print(table.table)
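
With the flags above, listing local and remote jobs of the configured model:

  $ aetros jobs --all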
Example #8
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' job-commits')
        parser.add_argument('job_id', help="Short or long job id like ed4d6a204.")
        parser.add_argument('--model', help="Model name like peter/mnist. By default taken from the configuration.")
        parser.add_argument('-c', '--config', help="Defaults to aetros.yml in the current working directory or a directory above.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.job_id:
            parser.print_help()
            sys.exit(1)

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

        id_map = {}

        for job_id in [parsed_args.job_id]:
            full_id = git_has_local_job(home_config, model, job_id)
            id_map[job_id] = full_id
            if not full_id:
                full_id = git_has_remote_job(home_config, model, job_id)
                id_map[job_id] = full_id
                if full_id:
                    print("Pull job %s to local ... " % (job_id, ))
                    ref = 'refs/aetros/job/' + full_id
                    subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref+':'+ref])
                else:
                    print("Job %s not found." % (job_id, ))
                    sys.exit(2)

        ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]
        cmd = [home_config['git'], '--bare', '--git-dir', git_dir]
        cmd += ['log', '--stat', ref]

        subprocess.call(cmd)
Example #9
def http_request(path, query='', json_body=None, method='get', config=None, handle_common_errors=True):
    config = read_home_config() if config is None else config

    try:
        import urllib3
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    except Exception:
        pass

    if query is not None:
        if isinstance(query, dict):
            query = urlencode(query)

        if '?' in path:
            path += '&' + query
        else:
            path += '?' + query

    url = config['url'] + '/api/' + path

    auth = None
    if 'auth_user' in config:
        auth = HTTPBasicAuth(config['auth_user'], config['auth_pw'])

    if json_body is not None and method == 'get':
        method = 'post'

    try:
        response = requests.request(
            method, url, data=json_body,
            auth=auth, verify=config['ssl_verify'],
            headers={'Accept': 'application/json'}
        )
    except requests.exceptions.SSLError:
        if not handle_common_errors:
            raise

        print("Error: Could not connect to " + url + ". Make sure to install a valid SSL cert or disable ssl check by"
                                                     "setting aetros home-config ssl_verify false")
        sys.exit(1)

    if response.status_code >= 400:
        raise_response_exception('Failed request ' + url, response)

    return parse_json(response.content.decode('utf-8'))
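
A minimal usage sketch for the helper above; the 'user' endpoint name is an assumption for illustration, not a documented route:

    # Hypothetical endpoint; returns the parsed JSON response on success.
    me = http_request('user')
    print(me)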
Example #10
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' model')

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()
        config_path = find_config_path()

        if not config_path:
            print("No model configuration file (aetros.yml). Switch to a directory first..")
            sys.exit(1)

        config = find_config(error_on_missing=True)
        print("Model %s in %s used in all aetros commands." % (config['model'], os.path.dirname(config_path)))

        git_remote_url = 'git@%s:%s.git' % (home_config['host'], config['model'])
        print("Git url: %s" % (git_remote_url,))
Example #11
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' job-pull')
        parser.add_argument('id', help="Short or long job id, like ef8009d83a9892968097cec05b9467c685d45453")
        parser.add_argument('--model', help="Model name like peter/mnist. By default taken from the configuration.")
        parser.add_argument('-c', '--config', help="Defaults to aetros.yml in the current working directory or a directory above.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.id:
            parser.print_help()
            sys.exit(1)

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        full_id = git_has_remote_job(home_config, model, parsed_args.id)
        if not full_id:
            print("Error: Job not found on remote.")
            sys.exit(1)

        ref = 'refs/aetros/job/' + full_id
        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

        git_remote_url = 'git@%s:%s.git' % (home_config['host'], model)

        if not os.path.isdir(git_dir):
            subprocess.call([home_config['git'], 'clone', '--bare', git_remote_url, git_dir])

        print('Pull job %s of %s' % (parsed_args.id, model))
        setup_git_ssh(home_config)
        subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref+':'+ref])
Example #12
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' init')
        parser.add_argument('name', nargs='?', help="Model name")
        parser.add_argument(
            '--private',
            action='store_true',
            help="Make the model private. Example: aetros init my-model --private")

        home_config = read_home_config()
        parsed_args = parser.parse_args(args)
        if not parsed_args.name:
            parser.print_help()
            sys.exit(1)

        if os.path.exists('aetros.yml'):
            with open('aetros.yml', 'r') as f:
                config = yaml.safe_load(f)
            if isinstance(config, dict) and 'model' in config:
                print("failed: aetros.yml already exists and is linked to model " + config['model'])
                sys.exit(1)

        name = api.create_model(
            parsed_args.name or (os.path.basename(os.getcwd())),
            parsed_args.private)

        with open('aetros.yml', 'w') as f:
            f.write('model: ' + name)

        print("aetros.yml created linked with model " + name + ' in ' +
              os.getcwd())
        print("Open AETROS Trainer to see the model at https://" +
              home_config['host'] + '/model/' + name)
Example #13
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' job-push')
        parser.add_argument('id', help="Short or long job id, like ef8009d83a9892968097cec05b9467c685d45453")
        parser.add_argument('--model', help="Model name like peter/mnist. By default taken from the current directory.")
        parser.add_argument('-c', '--config', help="Defaults to aetros.yml in the current working directory.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.id:
            parser.print_help()
            sys.exit(1)

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        full_id = git_has_local_job(home_config, model, parsed_args.id)
        if not full_id:
            print("Error: Job not found on local.")
            sys.exit(1)

        ref = 'refs/aetros/job/' + full_id
        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

        if not os.path.isdir(git_dir):
            self.logger.error("Git repository for model %s in %s not found." % (model, git_dir))
            self.logger.error("You do not seem to have created any job for model " + model + " on this machine.")
            sys.exit(1)

        print('Push job %s of %s' % (full_id, model))
        setup_git_ssh(home_config)
        subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'push', 'origin', ref])
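
A typical invocation, with a placeholder job id from the help text:

  $ aetros job-push ef8009d --model peter/mnist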
Example #14
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' id')

        parsed_args = parser.parse_args(args)
        config = read_home_config()

        try:
            user = api.user()
        except KeyNotConfiguredException as e:
            self.logger.error(str(e))
            sys.exit(1)

        print("Logged in as %s (%s) on %s" % (user['username'], user['name'], config['host']))

        if len(user['accounts']) > 0:
            for orga in six.itervalues(user['accounts']):
                print("  %s of organisation %s (%s)." % ("Owner" if orga['memberType'] == 1 else "Member", orga['username'], orga['name']))
        else:
            print("  Without membership to an organisation.")
Example #15
def http_request(path, query='', json_body=None, method='get', config=None):
    config = read_home_config() if config is None else config

    try:
        import urllib3
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    except Exception:
        pass

    if query is not None:
        if isinstance(query, dict):
            query = urlencode(query)

        if '?' in path:
            path += '&' + query
        else:
            path += '?' + query

    url = 'https://' + config['host'] + '/api/' + path
    auth = None
    if 'auth_user' in config:
        auth = HTTPBasicAuth(config['auth_user'], config['auth_pw'])

    if json_body is not None and method == 'get':
        method = 'post'

    response = requests.request(method,
                                url,
                                data=json_body,
                                auth=auth,
                                verify=config['ssl_verify'],
                                headers={'Accept': 'application/json'})

    if response.status_code >= 400:
        raise_response_exception('Failed request ' + path, response)

    return parse_json(response.content.decode('utf-8'))
Example #16
    def main(self, args):
        from aetros.starter import start
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' start')
        parser.add_argument(
            'name',
            nargs='?',
            help='The model name, e.g. aetros/mnist-network, to start a new job; or a job id, '
                 'e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5, to start a pre-created job.'
        )

        parser.add_argument(
            '-i',
            '--image',
            help="Which Docker image to use for the command. By default read from aetros.yml. "
                 "If not specified, the command is executed on the host."
        )
        parser.add_argument(
            '-l',
            '--local',
            action='store_true',
            help="Start the job immediately on the current machine.")
        parser.add_argument(
            '-s',
            '--server',
            action='append',
            help="Limits the server pool to this server. By default there is no limitation; "
                 "it can also be read from aetros.yml. Multiple --server flags are allowed."
        )
        parser.add_argument(
            '-b',
            '--branch',
            help="Overrides the Git branch used when a new job is started."
        )
        parser.add_argument(
            '--priority',
            help="Increases or decreases priority. Default is 0.")

        parser.add_argument(
            '--cpu',
            help="How many CPU cores should be assigned to the job. Docker only.")
        parser.add_argument(
            '--memory',
            help="How much memory should be assigned to the job. Docker only.")
        parser.add_argument(
            '--gpu',
            help="How many GPU cards should be assigned to the job. Docker only.")
        parser.add_argument(
            '--gpu_memory',
            help="Memory requirement for the GPU. Docker only.")

        parser.add_argument(
            '--gpu-device',
            action='append',
            help=
            "Which device id should be mapped into the NVIDIA docker container."
        )

        parser.add_argument(
            '--max-time',
            help=
            "Limit execution time in seconds. Sends SIGINT to the process group when reached."
        )
        parser.add_argument(
            '--max-epochs',
            help=
            "Limit execution epochs. Sends SIGINT to the process group when reached."
        )

        parser.add_argument('--insights',
                            action='store_true',
                            help="Activates insights. Only for simple models.")
        parser.add_argument(
            '--dataset',
            help=
            "Dataset id when model has placeholders. Only for simple models with placeholders as input/output."
        )

        parser.add_argument(
            '-p',
            '--param',
            action='append',
            help=
            "Sets a hyperparameter, example '--param name=value'. Multiple --param allowed."
        )

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()

        hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception(
                        '--param ' + param +
                        ' does not contain a `=`. Please use "--param name=value"'
                    )

                name, value = param.split('=', 1)
                hyperparameter[name] = value

        job_config = {'insights': parsed_args.insights}

        if parsed_args.image:
            job_config['image'] = parsed_args.image

        if parsed_args.branch:
            job_config['sourceGitTree'] = parsed_args.branch

        if parsed_args.max_epochs:
            job_config['maxEpochs'] = int(parsed_args.max_epochs)

        if parsed_args.max_time:
            job_config['maxTime'] = float(parsed_args.max_time)

        job_config['priority'] = 0
        if parsed_args.priority:
            job_config['priority'] = float(parsed_args.priority)

        if 'resources' not in job_config:
            job_config['resources'] = {}

        if parsed_args.server:
            job_config['servers'] = []
            for name in parsed_args.server:
                job_config['servers'].append(name)

        if parsed_args.cpu or parsed_args.memory or parsed_args.gpu is not None or parsed_args.gpu_memory:
            if parsed_args.cpu:
                job_config['resources']['cpu'] = float(parsed_args.cpu)
            if parsed_args.memory:
                job_config['resources']['memory'] = float(parsed_args.memory)
            if parsed_args.gpu is not None:
                job_config['resources']['gpu'] = float(parsed_args.gpu)
            if parsed_args.gpu_memory:
                job_config['resources']['gpu_memory'] = float(
                    parsed_args.gpu_memory)

        model_name = parsed_args.name

        if model_name.count('/') == 1:
            try:
                self.logger.debug("Create job ...")
                created = api.create_job(model_name,
                                         parsed_args.local,
                                         hyperparameter,
                                         parsed_args.dataset,
                                         config=job_config)
            except api.ApiError as e:
                if 'Connection refused' in e.reason:
                    self.logger.error("You are offline")

                raise

            print("Job %s/%s created." % (model_name, created['id']))

            if parsed_args.local:
                start(self.logger,
                      model_name + '/' + created['id'],
                      gpu_devices=parsed_args.gpu_device)
            else:
                print("Open http://%s/model/%s/job/%s to monitor it." %
                      (home_config['host'], model_name, created['id']))

        else:
            start(self.logger, model_name, gpu_devices=parsed_args.gpu_device)
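
Taken together, the flags above compose like this; the model name, server name and parameter values are placeholders:

  $ aetros start peter/mnist --param lr=0.01 --param batch_size=64 -s my-server --gpu 1 --memory 4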
Example #17
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' authenticate',
            description='Authenticates this machine with a user account using a new pair of SSH keys.'
        )

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()
        host = home_config['host']

        installed_key = get_ssh_key_for_host(host)
        key_exists_and_valid = False
        if installed_key:
            try:
                create_ssh_stream(home_config, exit_on_failure=False)
                key_exists_and_valid = True
            except Exception:
                pass

        if key_exists_and_valid:
            choice = six.moves.input(
                "You have already configured a valid SSH key (" +
                installed_key + ") for " + host +
                ".\nWant to create a new key? (y/N): ").lower()
            if choice != 'y' and choice != 'yes':
                print("Aborted.")
                sys.exit(1)

        ssh_key = paramiko.RSAKey.generate(4096)
        ssh_key_private = ssh_key.key.private_bytes(
            serialization.Encoding.PEM,
            serialization.PrivateFormat.TraditionalOpenSSL,
            serialization.NoEncryption()).decode()
        ssh_key_public = 'rsa ' + ssh_key.get_base64()

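        # hashlib.md5 requires bytes under Python 3; the digest is rendered below in the usual aa:bb:cc fingerprint notation.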
        string_key = ssh_key.__str__()
        if not isinstance(string_key, six.binary_type):
            string_key = string_key.encode('utf-8')
        fingerprint = hashlib.md5(string_key).hexdigest()
        fingerprint = ':'.join(
            a + b for a, b in zip(fingerprint[::2], fingerprint[1::2]))

        token = api.http_request('machine-token', None, {
            'host': socket.getfqdn(),
            'key': ssh_key_public
        })

        print(
            "Open following link and login to confirm this machine's SSH key in your account."
        )
        print("Public Key Fingerprint: MD5:" + fingerprint)
        print("\n   https://" + host + "/confirm-machine/" + token)
        print("\nWaiting for confirmation ...")

        while True:
            time.sleep(3)
            response = api.http_request('machine-token/authorized?id=' + token,
                                        method='post')
            if response['status'] == 'confirmed':
                print(
                    "\n" + response['username'] +
                    ' confirmed the public key. Test with "aetros id" or "ssh git@'
                    + host + '".')
                private_key_path = os.path.expanduser('~/.ssh/aetros_' +
                                                      response['username'] +
                                                      '_rsa')
                public_key_path = os.path.expanduser('~/.ssh/aetros_' +
                                                     response['username'] +
                                                     '_rsa.pub')

                if not os.path.exists(os.path.dirname(private_key_path)):
                    os.makedirs(os.path.dirname(private_key_path))

                with open(private_key_path, 'w') as f:
                    f.write(ssh_key_private)

                with open(public_key_path, 'w') as f:
                    f.write(ssh_key_public)

                os.chmod(private_key_path, 0o600)
                os.chmod(public_key_path, 0o600)

                ssh_config_path = os.path.expanduser('~/.ssh/config')

                if not os.path.exists(os.path.dirname(ssh_config_path)):
                    os.makedirs(os.path.dirname(ssh_config_path))

                host_section = 'host ' + host + '\n'
                identity_section = '    IdentityFile ~/.ssh/aetros_' + response[
                    'username'] + '_rsa\n'

                if os.path.exists(ssh_config_path):
                    import re
                    regex = re.compile(r"^host\s+" + re.escape(host) + '\s*',
                                       re.IGNORECASE | re.MULTILINE)
                    with open(ssh_config_path, 'r+') as f:
                        config = f.read()

                        if regex.match(config):
                            config = regex.sub(host_section + identity_section,
                                               config, 1)
                        else:
                            config = host_section + identity_section + config

                        f.seek(0)
                        f.write(config)
                else:
                    with open(ssh_config_path, 'w') as f:
                        f.write(host_section + identity_section)

                print("Private key " + private_key_path +
                      " installed in ~/.ssh/config for " + host + ".\n")
                user = api.user()
                print("Key installed of account %s (%s)." %
                      (user['username'], user['name']))
                sys.exit(0)
            if response['status'] == 'expired':
                print("Token expired.")
                sys.exit(1)
Example #18
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' jobs')
        parser.add_argument('--all',
                            '-a',
                            action='store_true',
                            help="Show remote jobs as well")
        parser.add_argument(
            '--model',
            help="Model name like peter/mnist. By default taken from the configuration."
        )
        parser.add_argument(
            '-c',
            '--config',
            help="Defaults to aetros.yml in the current working directory or a directory above."
        )

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print(
                "No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'."
            )
            sys.exit(2)

        print("Show jobs of model " + model + ' (' + home_config['host'] + ')')

        setup_git_ssh(home_config)
        local_job_ids = git_local_job_ids(home_config, model)
        remote_job_ids = []

        try:
            remote_job_ids = git_remote_job_ids(home_config, model)
        except Exception:
            # The remote may be unreachable or not authenticated; fall back to listing local jobs only.
            pass

        job_map = OrderedDict()
        for job_id in local_job_ids:
            job_map[job_id] = {
                'local': Color('{autogreen}Yes{/autogreen}'),
                'remote': Color('{autored}No{/autored}'),
            }

        for job_id in remote_job_ids:
            if job_id in job_map:
                job_map[job_id]['remote'] = Color('{autogreen}Yes{/autogreen}')
            elif parsed_args.all:
                job_map[job_id] = {
                    'local': Color('{autored}No{/autored}'),
                    'remote': Color('{autogreen}Yes{/autogreen}')
                }

        print("%d jobs found. (%d synced to remote)" %
              (len(job_map), len(remote_job_ids)))
        if not parsed_args.all:
            print("Use --all to show remote-only jobs as well.")

        table_data = [['Short Job ID', 'Local', 'Remote', 'Long Job ID']]

        for job_id, info in six.iteritems(job_map):
            table_data.append(
                [job_id[0:9], info['local'], info['remote'], job_id])

        table = AsciiTable(table_data)
        print(table.table)
Example #19
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' authenticate',
            description='Authenticates this machine with a user account using a new pair of SSH keys.')

        parsed_args = parser.parse_args(args)

        home_config = read_home_config()
        host = home_config['host']

        installed_key = get_ssh_key_for_host(host)
        key_exists_and_valid = False
        if installed_key:
            try:
                create_ssh_stream(home_config, exit_on_failure=False)
                key_exists_and_valid = True
            except Exception:
                pass

        if key_exists_and_valid:
            choice = six.moves.input("You have already configured a valid SSH (ssk_key: "+installed_key+") "
                                     "for "+host+".\nWant to create a new key? The old won't be removed. (y/N): ").lower()
            if choice != 'y' and choice != 'yes':
                print("Aborted.")
                sys.exit(1)

        ssh_key = paramiko.RSAKey.generate(4096)
        ssh_key_private = ssh_key.key.private_bytes(
            serialization.Encoding.PEM, serialization.PrivateFormat.TraditionalOpenSSL, serialization.NoEncryption()
        ).decode()
        ssh_key_public = 'rsa ' + ssh_key.get_base64()

        string_key = ssh_key.__str__()

        if not isinstance(string_key, six.binary_type):
            string_key = string_key.encode('utf-8')
        md5 = hashlib.md5(string_key)

        fingerprint = md5.hexdigest()
        fingerprint = ':'.join(a + b for a, b in zip(fingerprint[::2], fingerprint[1::2]))

        try:
            token = api.http_request('machine-token', None, {
                'host': socket.getfqdn(),
                'key': ssh_key_public
            })
        except requests.exceptions.SSLError:
            sys.exit(1)

        print("Open following link and login to confirm this machine's SSH key in your account.")
        print("Public Key Fingerprint: MD5:" + fingerprint)
        print("\n   " + home_config['url'] + "/confirm-machine/" + token)
        print("\nWaiting for confirmation ...")

        key_prefix = home_config['host'] + '_'

        while True:
            time.sleep(3)
            response = api.http_request('machine-token/authorized?id=' + token, method='post')
            if response['status'] == 'confirmed':
                print("\n" + response['username'] + ' confirmed the public key. Test with "aetros id" or "ssh git@' + host + '".')
                private_key_path = os.path.expanduser('~/.ssh/' + key_prefix + response['username']+'_rsa')
                public_key_path = os.path.expanduser('~/.ssh/' + key_prefix + response['username']+'_rsa.pub')

                if not os.path.exists(os.path.dirname(private_key_path)):
                    os.makedirs(os.path.dirname(private_key_path))

                with open(private_key_path, 'w') as f:
                    f.write(ssh_key_private)

                with open(public_key_path, 'w') as f:
                    f.write(ssh_key_public)

                os.chmod(private_key_path, 0o600)
                os.chmod(public_key_path, 0o600)

                ssh_config_path = os.path.expanduser('~/.ssh/config')

                if not os.path.exists(os.path.dirname(ssh_config_path)):
                    os.makedirs(os.path.dirname(ssh_config_path))

                host_section = 'host '+host+'\n'
                identity_section = '    IdentityFile ~/.ssh/' + key_prefix + response['username']+'_rsa\n'

                if os.path.exists(ssh_config_path):
                    import re
                    regex = re.compile(r"^host\s+" + re.escape(host)+'\s*', re.IGNORECASE | re.MULTILINE)
                    with open(ssh_config_path, 'r+') as f:
                        config = f.read()

                        if regex.match(config):
                            config = regex.sub(host_section + identity_section, config, 1)
                        else:
                            config = host_section + identity_section + config

                        f.seek(0)
                        f.write(config)
                else:
                    with open(ssh_config_path, 'w') as f:
                        f.write(host_section + identity_section)

                print("Private key " + private_key_path + " installed in ~/.ssh/config for "+host+".\n")
                user = api.user()
                print("Key installed of account %s (%s)." % (user['username'], user['name']))
                sys.exit(0)
            if response['status'] == 'expired':
                print("Token expired.")
                sys.exit(1)
Example #20
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' init')
        parser.add_argument('name', help="Model name")
        parser.add_argument('directory',
                            nargs='?',
                            help="Directory, defaults to the current one.")
        parser.add_argument(
            '--organisation',
            '-o',
            help=
            "Create the model in the organisation instead of the user account."
        )
        parser.add_argument(
            '--space',
            '-s',
            help=
            "Create the model in given space. If space does not exist, create it."
        )
        parser.add_argument(
            '--private',
            action='store_true',
            help=
            "Make the model private. Example: aetros init my-model --private")
        parser.add_argument(
            '--force',
            '-f',
            action='store_true',
            help="Force overwriting of already existing configuration file.")

        home_config = read_home_config()
        parsed_args = parser.parse_args(args)
        if not parsed_args.name:
            parser.print_help()
            sys.exit(1)

        path = os.getcwd()
        if parsed_args.directory:
            path = os.path.abspath(parsed_args.directory)

        if os.path.exists(path) and not os.path.isdir(path):
            sys.stderr.write('Path already exists and is not a directory: ' + path)
            sys.exit(1)

        if not os.path.exists(path):
            os.makedirs(path)

        yaml = ruamel.yaml.YAML()
        config = {}

        if os.path.exists(path + '/aetros.yml'):
            with open(path + '/aetros.yml', 'r') as f:
                config = yaml.load(f)

            if isinstance(config, dict) and 'model' in config and not parsed_args.force:
                print("failed: aetros.yml already exists and is linked to model " + config['model'] + '. Use -f to force.')
                sys.exit(1)

        if not parsed_args.private:
            print(
                "Warning: creating public model. Use --private to create private models."
            )

        if '/' in parsed_args.name:
            sys.stderr.write('No / allowed in the name. Use -o if the model should be created in an organisation.')
            sys.exit(1)

        response = api.create_model(
            parsed_args.name or (os.path.basename(os.getcwd())),
            parsed_args.organisation, parsed_args.space, parsed_args.private)
        name = response['name']

        if response['already_exists']:
            print("Notice: Model already exists remotely.")

        config['model'] = name

        with open(path + '/aetros.yml', 'w+') as f:
            yaml.dump(config, f)

        print("aetros.yml created and linked with model " + name + ' in ' +
              path)
        print("Open AETROS Trainer to see the model at https://" +
              home_config['host'] + '/model/' + name)

        git_remote_url = 'git@%s:%s.git' % (home_config['host'], name)

        print(
            "Use git to store your source code. Each model has its own Git repository."
        )
        print("  $ cd " + path)
        print("  $ git init")
        print("  $ git remote add origin " + git_remote_url)
        print("  $ git add .")
        print("  $ git commit -m 'first commit'")
        print("  $ git push origin master")
Example #21
def start_command(logger,
                  job_backend,
                  env=None,
                  volumes=None,
                  gpu_devices=None):
    work_tree = job_backend.git.work_tree
    home_config = read_home_config()

    if not env:
        env = {}

    if 'PYTHONPATH' not in env:
        env['PYTHONPATH'] = os.getenv('PYTHONPATH', '')

    env['PYTHONPATH'] += ':' + os.getcwd()
    env['AETROS_MODEL_NAME'] = job_backend.model_name
    env['AETROS_JOB_ID'] = str(job_backend.job_id)
    env['DEBUG'] = os.getenv('DEBUG', '')
    env['AETROS_ATTY'] = '1'
    env['AETROS_GIT'] = job_backend.git.get_base_command()

    if os.getenv('AETROS_SSH_KEY_BASE64'):
        env['AETROS_SSH_KEY_BASE64'] = os.getenv('AETROS_SSH_KEY_BASE64')
    elif get_ssh_key_for_host(home_config['host']):
        # we need to read the key into env so the docker container can connect to AETROS
        env['AETROS_SSH_KEY_BASE64'] = open(
            get_ssh_key_for_host(home_config['host']), 'r').read()

    job_config = job_backend.job['config']

    if 'command' not in job_config:
        job_backend.fail(
            'No "command" given. See Configuration section in the documentation.'
        )

    command = job_config['command']
    image = job_config['image']

    if job_backend.is_simple_model():
        if image:
            command = ['python']
        else:
            command = [sys.executable]
        command += [
            '-m', 'aetros', 'start-simple',
            job_backend.model_name + '/' + job_backend.job_id
        ]

    if command is None:
        raise Exception('No command specified.')

    # replace {{param}} placeholders such as {{batch_size}} in the command
    if isinstance(job_config['parameters'], dict):
        for key, value in six.iteritems(
                flatten_parameters(job_config['parameters'])):
            if isinstance(command, list):
                for pos, v in enumerate(command):
                    if isinstance(command[pos], six.string_types):
                        command[pos] = command[pos].replace(
                            '{{' + key + '}}', json.dumps(value))
            elif isinstance(command, six.string_types):
                command = command.replace('{{' + key + '}}', json.dumps(value))

    logger.info("Switch working directory to " + work_tree)
    os.chdir(job_backend.git.work_tree)

    docker_image_built = False
    if job_config['dockerfile'] or job_config['install']:
        dockerfile = job_config['dockerfile']
        if isinstance(dockerfile,
                      six.string_types) and os.path.exists(dockerfile):
            pass
        else:
            if isinstance(dockerfile, six.string_types):
                dockerfile_content = dockerfile
            elif isinstance(dockerfile, list) and len(dockerfile) > 0:
                dockerfile_content = "\n".join(dockerfile)
            else:
                if image is None:
                    job_backend.fail(
                        "Image name missing, needed by `install` in aetros.yml"
                    )
                dockerfile_content = 'FROM ' + image + '\nRUN '

                if isinstance(job_config['install'], list):
                    dockerfile_content += '\n RUN '.join(job_config['install'])
                else:
                    dockerfile_content += job_config['install']

            dockerfile_content = '# CREATED BY AETROS because of "install" or "dockerfile" config in aetros.yml.\n' \
                                 + dockerfile_content

            with open('Dockerfile.aetros', 'w') as f:
                f.write(dockerfile_content)

            dockerfile = 'Dockerfile.aetros'
            job_backend.commit_file('Dockerfile.aetros')

        job_backend.set_system_info('image/dockerfile', dockerfile)
        docker_build = [
            home_config['docker'],
            'build',
            '-t',
            job_backend.model_name,
            '-f',
            dockerfile,
            '.',
        ]

        logger.info("Prepare docker image: $ " + (' '.join(docker_build)))
        job_backend.set_status('IMAGE BUILD')
        p = execute_command(args=docker_build,
                            bufsize=1,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)

        if p.returncode:
            job_backend.fail('Image build error')
            sys.exit(p.returncode)

        docker_image_built = True
        image = job_backend.model_name

    docker_command = None
    if image:
        if not docker_image_built:
            logger.info("Pull docker image: $ " + image)
            job_backend.set_status('IMAGE PULL')
            execute_command(args=[home_config['docker'], 'pull', image],
                            bufsize=1,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)

        inspections = execute_command_stdout(
            [home_config['docker'], 'inspect', image])
        inspections = json.loads(inspections.decode('utf-8'))
        if inspections:
            inspection = inspections[0]
            with job_backend.git.batch_commit('Docker image'):
                job_backend.set_system_info('image/id', inspection['Id'])
                job_backend.set_system_info('image/docker_version',
                                            inspection['DockerVersion'])
                job_backend.set_system_info('image/created',
                                            inspection['Created'])
                job_backend.set_system_info('image/container',
                                            inspection['Container'])
                job_backend.set_system_info('image/architecture',
                                            inspection['Architecture'])
                job_backend.set_system_info('image/os', inspection['Os'])
                job_backend.set_system_info('image/size', inspection['Size'])
                job_backend.set_system_info('image/rootfs',
                                            inspection['RootFS'])

        # make sure old container is removed
        subprocess.Popen([home_config['docker'], 'rm', job_backend.job_id],
                         stderr=subprocess.PIPE).wait()

        docker_command = [
            home_config['docker'], 'run', '-t', '--name', job_backend.job_id
        ]
        docker_command += home_config['docker_options']

        env['AETROS_GIT_WORK_DIR'] = '/job'
        docker_command += [
            '--mount', 'type=bind,source=' + job_backend.git.work_tree +
            ',destination=/job'
        ]

        env['AETROS_STORAGE_DIR'] = '/aetros'
        docker_command += [
            '--mount', 'type=bind,source=' + job_backend.git.git_path +
            ',destination=' + '/aetros/' + job_backend.model_name + '.git'
        ]

        home_config_path = os.path.expanduser('~/aetros.yml')
        if os.path.exists(home_config_path):
            env['AETROS_HOME_CONFIG_FILE'] = '/aetros/aetros.yml'
            docker_command += [
                '--mount', 'type=bind,source=' + home_config_path +
                ',destination=' + '/aetros/aetros.yml'
            ]

        docker_command += ['-w', '/job']

        # make sure the docker command receives all environment variables
        for k in six.iterkeys(env):
            docker_command += ['-e', k]

        if volumes:
            for volume in volumes:
                docker_command += ['-v', volume]

        if 'resources' in job_backend.job:
            assigned_resources = job_backend.job['resources']

            cpus = 1
            if 'cpu' in assigned_resources and assigned_resources['cpu']:
                cpus = assigned_resources['cpu']
            docker_command += ['--cpus', str(cpus)]

            memory = 1
            if 'memory' in assigned_resources and assigned_resources['memory']:
                memory = assigned_resources['memory']

            docker_command += ['--memory', str(memory * 1024 * 1024 * 1024)]

        if gpu_devices and (sys.platform == "linux"
                            or sys.platform == "linux2"):
            # only supported on linux
            docker_command += ['--runtime', 'nvidia']
            docker_command += [
                '-e', 'NVIDIA_VISIBLE_DEVICES=' + (','.join(gpu_devices))
            ]
            # support nvidia-docker1 as well
            # docker_command += ['--device', '/dev/nvidia1']

        docker_command.append(image)

        # since Linux doesn't deliver SIGINT to a pid=1 process that has no signal handler installed,
        # we need to make sure we attach one to the pid=1 process
        trap = 'trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM; ' \
               'while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; exit $ec;};'

        if isinstance(command, list):
            command = ' '.join(command)

        docker_command += ['sh', '-c', trap + 'trapIt ' + command]
        command = docker_command

    job_backend.set_system_info('image/name', str(image))

    if not isinstance(command, list):
        command = ['sh', '-c', command]

    p = None
    exited = False
    wait_stdout = None
    wait_stderr = None
    try:
        job_backend.set_status('STARTED')
        logger.warning("$ %s " % (' '.join([json.dumps(a) for a in command])))

        command_env = os.environ.copy()
        command_env.update(env)

        # make sure maxTime limitation is correctly calculated
        job_backend.monitoring_thread.handle_max_time = True
        job_backend.monitoring_thread.handle_max_time_time = time.time()

        # Since JobBackend sends SIGINT to its current process group, it would also hit its parents when they share the group.
        # We therefore move the child into its own process group, so this won't happen.
        # Without this, the master process would receive the SIGINT as well.
        kwargs = {}
        if os.name == 'nt':
            kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            kwargs['preexec_fn'] = os.setsid

        p = subprocess.Popen(args=command,
                             bufsize=1,
                             stderr=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             env=command_env,
                             **kwargs)
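        # sys.stdout/sys.stderr are aetros' stream proxies at this point; attach() pipes the child's
        # output through them and returns a callable that blocks until the stream is drained.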
        wait_stdout = sys.stdout.attach(p.stdout)
        wait_stderr = sys.stderr.attach(p.stderr)

        p.wait()
        wait_stdout()
        wait_stderr()

        exited = True

        sys.exit(p.returncode)
    except SystemExit:
        # We can not send a SIGINT to the child process, as we don't know whether it already received one
        # (pressing CTRL+C signals the whole group) or not (SIGINT sent to this process only), so we assume it did.
        # A second signal would force the exit.
        # sys.__stdout__.write("SystemExit with " + str(p.returncode) + ', exited: ' + str(exited) + ", early: "+str(job_backend.in_early_stop)+"\n")

        # make sure the process dies
        if docker_command:
            # docker run does not proxy INT signals to the docker-engine,
            # so we need to do it on our own directly.
            subprocess.Popen([
                home_config['docker'], 'kill', '--signal', 'INT',
                job_backend.job_id
            ],
                             stderr=subprocess.PIPE,
                             stdout=subprocess.PIPE).wait()
            subprocess.Popen(
                [home_config['docker'], 'wait', job_backend.job_id],
                stdout=subprocess.PIPE).wait()
        elif not exited and p and p.poll() is None:
            p.kill()  # sends SIGKILL
            p.wait()

        if exited:
            if p.returncode == 0:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_DONE)
            elif p.returncode == 1:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_ABORTED)
            else:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_FAILED)
        else:
            # master received SIGINT before the actual command exited.
            if not job_backend.in_early_stop:
                # master did not receive the early_stop signal (maxTime limitation);
                # it received a stop signal from the server or by hand (Ctrl+C), so mark the job as aborted
                job_backend.abort()
            else:
                # let the on_shutdown listener handle the rest
                pass
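The `trapIt` shell wrapper above exists because a PID-1 `sh` in a container does not act on SIGINT/SIGTERM unless a handler is installed: the wrapper backgrounds the real command, traps both signals and forwards them as SIGINT, then exits with the child's exit code. A minimal standalone sketch of the same idea (the command string and function name are placeholders, not part of the original code):

import subprocess

# Background the command, forward INT/TERM to it, loop on wait (wait
# returns early when a signal arrives), and exit with the child's code.
TRAP = ('trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM; '
        'while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; '
        'exit $ec;};')

def run_with_trap(command):
    # command is a single shell string, e.g. 'python train.py'
    return subprocess.call(['sh', '-c', TRAP + 'trapIt ' + command])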
Exemplo n.º 22
0
def start_command(logger,
                  job_backend,
                  env_overwrite=None,
                  volumes=None,
                  cpus=1,
                  memory=1,
                  gpu_devices=None,
                  offline=False):

    home_config = read_home_config()

    env = {}
    if env_overwrite:
        env.update(env_overwrite)

    start_time = time.time()
    env['AETROS_MODEL_NAME'] = job_backend.model_name
    env['AETROS_JOB_ID'] = str(job_backend.job_id)
    env['AETROS_OFFLINE'] = '1' if offline else ''
    env['AETROS_GIT_INDEX_FILE'] = job_backend.git.index_path
    env['DEBUG'] = os.getenv('DEBUG', '')
    env['PYTHONUNBUFFERED'] = os.getenv('PYTHONUNBUFFERED', '1')
    env['PYTHONIOENCODING'] = os.getenv('PYTHONIOENCODING', 'UTF-8')
    env['AETROS_ATTY'] = '1'
    env['AETROS_GIT'] = job_backend.git.get_base_command()

    env['PATH'] = os.getenv('PATH', '')
    if 'PYTHONPATH' not in env:
        env['PYTHONPATH'] = os.getenv('PYTHONPATH', '')

    if os.getenv('AETROS_SSH_KEY_BASE64'):
        env['AETROS_SSH_KEY_BASE64'] = os.getenv('AETROS_SSH_KEY_BASE64')
    elif get_ssh_key_for_host(home_config['host']):
        # we need to read the key into env so the docker container can connect to AETROS
        with open(get_ssh_key_for_host(home_config['host']), 'r') as f:
            env['AETROS_SSH_KEY_BASE64'] = f.read()

    job_config = job_backend.job['config']
    job = job_backend.get_job_model()

    if 'command' not in job_config:
        job_backend.fail(
            'No "command" given. See Configuration section in the documentation.'
        )

    job_commands = job_config['command']
    docker_image = job_config['image']

    if job_backend.is_simple_model():
        if docker_image:
            simple_command = ['python']
        else:
            simple_command = [sys.executable]

        simple_command += [
            '-m', 'aetros', 'start-simple',
            job_backend.model_name + '/' + job_backend.job_id
        ]
        job_commands = {'run': ' '.join(simple_command)}

    if job_commands is None:
        raise Exception('No command specified.')

    if not isinstance(job_commands, list) and not isinstance(
            job_commands, dict):
        job_commands = [job_commands]

    # replace {{parameter}} placeholders in the commands, e.g. {{batch_size}}
    if isinstance(job_config['parameters'], dict):
        for key, value in six.iteritems(
                flatten_parameters(job_config['parameters'])):
            if isinstance(job_commands, list):
                for k, v in enumerate(job_commands):
                    if isinstance(job_commands[k], six.string_types):
                        job_commands[k] = job_commands[k].replace(
                            '{{' + key + '}}', simplejson.dumps(value))

            elif isinstance(job_commands, dict):
                for k, v in six.iteritems(job_commands):
                    if isinstance(job_commands[k], six.string_types):
                        job_commands[k] = job_commands[k].replace(
                            '{{' + key + '}}', simplejson.dumps(value))

    job_backend.set_system_info('commands', job_commands)
    os.chdir(job_backend.git.work_tree)

    docker_image_built = False

    if docker_image and (job_config['dockerfile'] or job_config['install']):
        rebuild_image = job_config.get('rebuild_image', False)
        docker_image = docker_build_image(logger, home_config, job_backend,
                                          rebuild_image)
        docker_image_built = True

    job_backend.collect_device_information(gpu_devices)

    state = {'last_process': None}
    job_backend.set_system_info('processRunning', False, True)

    def pause():
        if not state['last_process'] or state['last_process'].poll() is not None:
            # no running process
            return

        if docker_image:
            if docker_pause(logger, home_config, job_backend):
                job_backend.set_paused(True)
        else:
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGSTOP)
            job_backend.set_paused(True)

    def cont():
        if not state['last_process'] or state['last_process'].poll() is not None:
            # no running process
            return

        job_backend.set_paused(False)
        if docker_image:
            docker_continue(logger, home_config, job_backend)
        else:
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGCONT)

    job_backend.on_pause = pause
    job_backend.on_continue = cont

    if docker_image:
        env['AETROS_GIT_INDEX_FILE'] = '/aetros/' + job_backend.model_name + '.git/' + os.path.basename(
            env['AETROS_GIT_INDEX_FILE'])

        with job_backend.git.batch_commit('JOB_SYSTEM_INFORMATION'):
            aetros_environment = {
                'aetros_version': __version__,
                'variables': env.copy()
            }
            if 'AETROS_SSH_KEY' in aetros_environment['variables']:
                del aetros_environment['variables']['AETROS_SSH_KEY']
            if 'AETROS_SSH_KEY_BASE64' in aetros_environment['variables']:
                del aetros_environment['variables']['AETROS_SSH_KEY_BASE64']
            job_backend.set_system_info('environment', aetros_environment)

            job_backend.set_system_info('memory_total',
                                        memory * 1024 * 1024 * 1024)

            import cpuinfo
            cpu = cpuinfo.get_cpu_info()
            job_backend.set_system_info('cpu_name', cpu['brand'])
            job_backend.set_system_info('cpu', [cpu['hz_actual_raw'][0], cpus])

        job_backend.start_monitoring(cpu_cores=cpus,
                                     gpu_devices=gpu_devices,
                                     docker_container=job_backend.job_id)

        if not docker_image_built:
            docker_pull_image(logger, home_config, job_backend)

        docker_image_information(logger, home_config, job_backend)

        # make sure old container is removed
        subprocess.Popen([home_config['docker'], 'rm', job_backend.job_id],
                         stderr=subprocess.PIPE).wait()

        command = docker_command_wrapper(logger, home_config, job_backend,
                                         volumes, cpus, memory, gpu_devices,
                                         env)

        # since Linux doesn't deliver SIGINT to a PID-1 process that has no signal handler installed,
        # we need to make sure we attach one to the PID-1 process
        trap = 'trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM; ' \
               'while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; exit $ec;};'

        command.append(docker_image)
        command += [
            '/bin/sh', '-c', trap + 'trapIt /bin/sh /job/aetros/command.sh'
        ]
    else:
        # non-docker
        # env['PYTHONPATH'] += ':' + os.getcwd()
        job_backend.collect_system_information()
        job_backend.collect_environment(env)
        job_backend.start_monitoring(gpu_devices=gpu_devices)

        command = ['/bin/sh', job_backend.git.work_tree + '/aetros/command.sh']

    logger.debug("$ %s " % (' '.join([simplejson.dumps(a) for a in command])))
    job_backend.set_system_info('image/name', str(docker_image))

    p = None
    exited = False
    last_return_code = None
    state['last_process'] = None
    all_done = False
    command_stats = None

    def clean():
        # clear working tree
        shutil.rmtree(job_backend.git.work_tree)

    def on_force_exit():
        # make sure the process dies
        clean()

        with open(os.devnull, 'r+b', 0) as DEVNULL:
            if docker_image:
                # docker run does not proxy INT signals to the docker-engine,
                # so we need to do it on our own directly.
                subprocess.Popen(
                    args=[home_config['docker'], 'kill', job_backend.job_id],
                    stdout=DEVNULL,
                    stderr=DEVNULL).wait()
            elif not exited and state[
                    'last_process'] and state['last_process'].poll() is None:
                # wait for last command
                os.killpg(os.getpgid(state['last_process'].pid),
                          signal.SIGKILL)

    job_backend.on_force_exit = on_force_exit

    try:
        job_backend.set_status('STARTED', add_section=False)
        # logger.warning("$ %s " % (str(command),))

        # make sure maxTime limitation is correctly calculated
        job_backend.monitoring_thread.handle_max_time = True
        job_backend.monitoring_thread.handle_max_time_time = time.time()

        # Since JobBackend sends SIGINT to its current process group, it would also hit its parent
        # if they shared a group. We therefore move the child into its own process group.
        # Without this, the master process (e.g. the server command) would receive the SIGINT as well.
        kwargs = {}
        if os.name == 'nt':
            kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            kwargs['preexec_fn'] = os.setsid

        # only use full env when no image used

        command_env = env
        if not docker_image:
            command_env = os.environ.copy()
            command_env.update(env)
            if os.environ.get('LD_LIBRARY_PATH', None):
                command_env['LD_LIBRARY_PATH_ORI'] = command_env[
                    'LD_LIBRARY_PATH']

        def write_command_sh(job_command):
            with open(job_backend.git.work_tree + '/aetros/command.sh', 'w+') as f:
                if not docker_image:
                    # new shells unset LD_LIBRARY_PATH automatically, so we make sure it will be there again
                    f.write('export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_ORI;\n')

                if job.get_working_dir():
                    f.write('cd %s;\n' % (job.get_working_dir(), ))

                f.write(job_command)

        def read_line(line):
            handled, filtered_line, failed = extract_api_calls(
                line,
                job_backend.handle_stdout_api,
                print_traceback=True,
                logger=logger)

            if is_debug():
                for call in handled:
                    logger.debug('STDOUT API CALL: ' + str(call))

            for fail in failed:
                logger.warning(
                    "API call failed '%s': %s %s" %
                    (str(fail['line']), str(type(
                        fail['exception']).__name__), str(fail['exception'])))

            return filtered_line

        def exec_command(index, command, job_command):
            write_command_sh(job_command)

            working_dir = '/'
            if job.get_working_dir():
                working_dir = job.get_working_dir() + '/'

            print('%s $ %s' % (working_dir, job_command.strip()))
            args = command
            logger.debug('$ ' + ' '.join([simplejson.dumps(a) for a in args]))

            command_stats[index]['started'] = time.time() - start_time
            job_backend.set_system_info('command_stats', command_stats, True)

            # important to prefix it, otherwise name='master' would reset all stats in controller backend
            command_env['AETROS_JOB_NAME'] = 'command_' + str(index)

            state['last_process'] = subprocess.Popen(args=args,
                                                     bufsize=0,
                                                     stderr=subprocess.PIPE,
                                                     stdout=subprocess.PIPE,
                                                     env=command_env,
                                                     **kwargs)
            job_backend.set_system_info('processRunning', True, True)
            wait_stdout = sys.stdout.attach(state['last_process'].stdout,
                                            read_line=read_line)
            wait_stderr = sys.stderr.attach(state['last_process'].stderr)
            state['last_process'].wait()
            # record this command's own exit code (last_return_code is only updated by the caller)
            command_stats[index]['rc'] = state['last_process'].poll()
            command_stats[index]['ended'] = time.time() - start_time
            job_backend.set_system_info('command_stats', command_stats, True)
            job_backend.set_system_info('processRunning', False, True)
            wait_stdout()
            wait_stderr()
            # make sure a new line is printed after a command
            print("")

            return state['last_process']

        done = 0
        total = len(job_commands)
        if isinstance(job_commands, list):
            command_stats = [{
                'rc': None,
                'started': None,
                'ended': None
            } for x in job_commands]
            job_backend.set_system_info('command_stats', command_stats, True)
            for k, job_command in enumerate(job_commands):
                job_backend.set_status('Command ' + str(k + 1))

                p = exec_command(k, command, job_command)
                last_return_code = p.poll()

                if last_return_code == 0:
                    done += 1
                else:
                    # one failed, so exit and don't execute next
                    break

        if isinstance(job_commands, dict):
            command_stats = {}
            for name, job_command in six.iteritems(job_commands):
                command_stats[name] = {
                    'rc': None,
                    'started': None,
                    'ended': None
                }
            job_backend.set_system_info('command_stats', command_stats, True)

            for name, job_command in six.iteritems(job_commands):
                job_backend.set_status('Command ' + name)

                p = exec_command(name, command, job_command)
                last_return_code = p.poll()

                if last_return_code == 0:
                    done += 1
                else:
                    # one failed, so exit and don't execute next
                    break

        all_done = done == total
        exited = True

        if state['last_process']:
            sys.exit(state['last_process'].poll())
        else:
            sys.exit(1)

    except SystemExit:
        # since we started the command in a new process group, a SIGINT or CTRL+C on this process won't affect
        # our actual command process. So we need to take care that we stop everything.
        logger.debug(
            "SystemExit, exited=%s, all-done=%s, has-last-process=%s, pid=%s" %
            (str(exited), str(all_done), state['last_process'] is not None,
             state['last_process'].poll() if state['last_process'] else None))

        # make sure the process dies
        if docker_image:
            # docker run does not proxy INT signals to the docker-engine,
            # so we need to do it on our own directly.
            p = subprocess.Popen(
                args=[home_config['docker'], 'inspect', job_backend.job_id],
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE)
            p.wait()
            if p.poll() == 0:
                subprocess.Popen(
                    args=[home_config['docker'], 'kill', job_backend.job_id
                          ]).wait()
        elif not exited and state[
                'last_process'] and state['last_process'].poll() is None:
            # wait for last command
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGINT)
            state['last_process'].wait()

        if 'output' in job_config and job_config['output']:
            upload_output_files(job_backend, job_config['output'])

        if exited:
            if all_done:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_DONE)
            else:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_FAILED)
        else:
            # master received SIGINT before all job commands exited.
            if not job_backend.in_early_stop:
                # in_early_stop indicates a planned stop (e.g. the maxTime limitation),
                # which should mark the job as done, not call abort().
                # If it is not set, the master received a SIGINT without early_stop, so mark the job as aborted.
                job_backend.abort()
            else:
                # let the on_shutdown listener handle the rest
                pass

        clean()
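A note on the process-group handling in `start_command` above: the child is started with `os.setsid` on POSIX (or `CREATE_NEW_PROCESS_GROUP` on Windows) so a SIGINT aimed at the master's group never reaches it implicitly, and signals are then forwarded deliberately via `os.killpg`. A minimal sketch of that pattern (function names are illustrative):

import os
import signal
import subprocess

def popen_in_new_group(args, **popen_kwargs):
    # Start the child in its own process group/session so Ctrl+C on the
    # master's group does not implicitly reach it.
    if os.name == 'nt':
        popen_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
    else:
        popen_kwargs['preexec_fn'] = os.setsid
    return subprocess.Popen(args, **popen_kwargs)

def interrupt_group(process):
    # Deliberately forward an interrupt to the whole child group.
    if os.name == 'nt':
        process.send_signal(signal.CTRL_BREAK_EVENT)
    else:
        os.killpg(os.getpgid(process.pid), signal.SIGINT)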
Exemplo n.º 23
0
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' run')
        parser.add_argument('command', nargs='?', help="The command to run. Default read in configuration file")
        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host.")
        parser.add_argument('--no-image', action='store_true', help="Forces not to use docker, even when image is defined in the configuration file.")

        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default not limitation or read in configuration file. Multiple --server allowed.")
        parser.add_argument('-m', '--model', help="Under which model this job should be listed. Default read in configuration file")
        parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
        parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--offline', '-o', action='store_true', help="Whether the execution should happen offline.")

        parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.")

        parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--gpu-device', action='append', help="Which device id should be mapped into the NVIDIA docker container. Only when --local")

        parser.add_argument('--volume', '-v', action='append', help="Volume into docker. Only when --local")
        parser.add_argument('-e', action='append', help="Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." % (parsed_args.config,))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print("fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print("fatal: can not use volume with jobs on the cluster. Use datasets instead.")
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print("fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    k, v = item.split('=', 1)
                else:
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config or not config['command']) and not parsed_args.command:
            self.logger.error('No command given. Define the command in aetros.yml or use command argument.')
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'], report=False)
        adding_files("done with %d file%s added (%s)."
                     % (files_added, 's' if files_added != 1 else '', human_size(size_added, 2)))

        create_info = {
            'type': 'custom',
            'config': config
        }

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first, transform the simple format into the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters, incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't be sure the base image still fits
            if 'image' in config and config['image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we manually specified an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        create_info['config']['resources'] = create_info['config'].get('resources', {})
        resources = create_info['config']['resources']

        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(parsed_args.memory or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                pass

            if not create_info['config']['image'] and not all([x == 0 for x in six.itervalues(resources)]):
                self.logger.warning("! No Docker virtualization since no `image` defined, resources limitation ignored.")

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning("! Your operating system does not support GPU allocation for "
                                        "Docker virtualization. "
                                        "NVIDIA-Docker2 is only supported on Linux.")

            local_max_resources = {'cpu': cpu['count'], 'memory': ceil(mem / 1024 / 1024 / 1024), 'gpu': gpu}

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call(['run', 'alpine', 'sh', '-c', 'nproc && cat /proc/meminfo | grep MemTotal'])
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                memory = memory.replace('MemTotal:', '').replace('kB', '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' % (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append('memory %dGB -> %dGB' % (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' % (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning("! Resources downgrade due to missing hardware: %s." % (', '.join(warning),))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." % (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        # tasks = []
        #
        # if 'tasks' in config:
        #     for name, task_config in six.iteritems(config['tasks']):
        #         replica = 1
        #         if 'replica' in task_config:
        #             replica = int(task_config['replica'])
        #         for index in range(0, replica):
        #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning("Can not create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to "+home_config['host']+" ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'], clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%smodel/%s/job/%s" % (home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False, offline=parsed_args.offline, push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config']['resources']['gpu'] > 0:
                # if 2 GPUs are requested and we have 3 GPUs with ids [0,1,2], gpu_device should be [0,1]
                parsed_args.gpu_device = []
                for i in range(0, create_info['config']['resources']['gpu']):
                    parsed_args.gpu_device.append(i)

            start_command(self.logger, job_backend, env, parsed_args.volume, cpus=cpus, memory=memory, gpu_devices=parsed_args.gpu_device,
                offline=parsed_args.offline)
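The `-e` flag handling above accepts both `name=value` and a bare `name` that is copied from the caller's environment. A standalone sketch of that parsing (the function name is illustrative), using `split('=', 1)` so values may themselves contain `=`:

import os

def parse_env_flags(items):
    # '-e NAME=value' sets an explicit value; '-e NAME' copies the value
    # from the current environment (None when unset).
    env = {}
    for item in items or []:
        if '=' in item:
            k, v = item.split('=', 1)
        else:
            k, v = item, os.getenv(item)
        env[k] = v
    return env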
Exemplo n.º 24
0
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' job-checkout')
        parser.add_argument('job_id', help="Short or long job id like ed4d6a204")
        parser.add_argument('file', nargs='*', help="Checkout only one file.")
        parser.add_argument('--target', '-t', help="Target directory where job files (or a single file) should be saved. Default current folder")
        parser.add_argument('--overwrite', '-p', action='store_true', help="Overwrite existing files.")
        parser.add_argument('--model', help="Model name like peter/mnist. Per default from current directory")
        parser.add_argument('-c', '--config', help="Default aetros.yml in current working directory.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.job_id:
            parser.print_help()
            sys.exit()

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print("No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'.")
            sys.exit(2)

        target = os.path.normpath(os.path.abspath(parsed_args.target if parsed_args.target else os.getcwd()))
        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model + '.git')

        if not os.path.exists(target):
            os.makedirs(target)

        id_map = {}

        for job_id in [parsed_args.job_id]:
            full_id = git_has_local_job(home_config, model, job_id)
            id_map[job_id] = full_id
            if not full_id:
                full_id = git_has_remote_job(home_config, model, job_id)
                id_map[job_id] = full_id
                if full_id:
                    print("Pull job %s to local ... " % (job_id, ))
                    ref = 'refs/aetros/job/' + full_id
                    subprocess.call([home_config['git'], '--bare', '--git-dir', git_dir, 'fetch', 'origin', ref+':'+ref])
                else:
                    print("Job %s not found." % (job_id, ))
                    sys.exit(2)

        ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]

        if not parsed_args.file:
            print("Checkout all job files %s %s into %s ... " % (model, id_map[parsed_args.job_id], target))
        else:
            print("Checkout job files %s %s into %s ... " % (model, id_map[parsed_args.job_id], target))

        paths = parsed_args.file if parsed_args.file else ['.']

        subprocess.call(
            [home_config['git'], '--bare', '--git-dir', git_dir, '--work-tree', target, 'checkout', ref, '--'] + paths
        )
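The checkout above relies on a Git plumbing detail: a bare repository has no work tree, but passing `--work-tree` lets `checkout` materialize any ref into an arbitrary directory without a clone. A minimal sketch of that call (names are illustrative):

import subprocess

def materialize_ref(git, git_dir, target, ref, paths=('.',)):
    # Check out `ref` from the bare repository at `git_dir` into `target`;
    # `paths` limits the checkout to specific files ('.' means everything).
    return subprocess.call(
        [git, '--bare', '--git-dir', git_dir, '--work-tree', target,
         'checkout', ref, '--'] + list(paths))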
Exemplo n.º 25
0
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' server')
        parser.add_argument('name', nargs='?', help="Server name")
        parser.add_argument('--generate-ssh-key', help="Automatically generates an SSH key, registers it in your "
                                                       "AETROS account, and deletes it when the server exits. "
                                                       "Prefer the 'aetros register' command as it's safer.")

        parser.add_argument('--allow-host-execution', action='store_true', help="Whether a job can run on this server "
            "directly, without a virtual (Docker) container.\nThis is a security risk and makes resource limitation useless.")

        parser.add_argument('--max-memory',
            help="How much RAM is available, in gigabytes. Per default all available memory.")
        parser.add_argument('--max-cpus',
            help="How many cores are available. Per default all available CPU cores.")
        parser.add_argument('--max-gpus',
            help="Which GPUs are available. Comma-separated list of device ids (pciBusId). "
                 "Per default all available GPU cards. Use 'aetros gpu' to see the ids.")

        parser.add_argument('--no-gpus', action='store_true', help="Disable all GPUs")

        parser.add_argument('--max-jobs', help="How many jobs are allowed to run in total until the process exits automatically.")
        parser.add_argument('--host', help="Default trainer.aetros.com. Read from the global configuration ~/aetros.yml.")
        parser.add_argument('--show-stdout', action='store_true', help="Show all stdout of all jobs. Only necessary for debugging.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.name:
            parser.print_help()
            sys.exit()

        self.config = read_home_config()

        if parsed_args.max_jobs:
            self.max_jobs = int(parsed_args.max_jobs)

        if parsed_args.max_memory:
            self.resources_limit['memory'] = int(parsed_args.max_memory)

        if parsed_args.max_cpus:
            self.resources_limit['cpus'] = int(parsed_args.max_cpus)

        self.resources_limit['host_execution'] = parsed_args.allow_host_execution

        gpus = []
        try:
            gpus = aetros.cuda_gpu.get_ordered_devices()
            for i in range(len(gpus)):
                self.enabled_gpus.append(i)
        except Exception:
            pass

        if parsed_args.max_gpus:
            self.enabled_gpus = []

            for i in parsed_args.max_gpus.split(','):
                i = int(i)
                if i < 0 or i >= len(gpus):
                    raise Exception('--max-gpus ' + str(i) + ' not available on the system. GPUs ' + str([i for i in range(len(gpus))])+ ' detected.')

                self.enabled_gpus.append(i)

        elif parsed_args.no_gpus:
            self.enabled_gpus = []

        if parsed_args.show_stdout:
            self.show_stdout = True

        event_listener = EventListener()

        event_listener.on('registration', self.registration_complete)
        event_listener.on('failed', self.connection_failed)
        event_listener.on('jobs', self.sync_jobs)
        event_listener.on('close', self.on_client_close)

        if hasattr(signal, 'SIGUSR1'):
            signal.signal(signal.SIGUSR1, self.on_signusr1)

        ssh_key_registered = False
        if parsed_args.generate_ssh_key:
            self.logger.info('Generate SSH key')

            ssh_key = paramiko.RSAKey.generate(4096)
            self.ssh_key_private = ssh_key.key.private_bytes(
                serialization.Encoding.PEM, serialization.PrivateFormat.TraditionalOpenSSL, serialization.NoEncryption()
            ).decode()
            self.ssh_key_public = 'rsa ' + ssh_key.get_base64() + ' ' + parsed_args.name

            self.logger.info('Register SSH key at ' + self.config['host'])

            data = {
                'name': parsed_args.name,
                'secure_key': parsed_args.generate_ssh_key,
                'key': self.ssh_key_public,
            }

            response = aetros.api.http_request('server/ssh-key', json_body=data, method='post')

            ssh_key_registered = response is True

        def delete_ssh_key():
            self.logger.info('Delete SSH key at ' + self.config['host'])

            data = {
                'secure_key': parsed_args.generate_ssh_key,
                'key': self.ssh_key_public,
            }
            response = aetros.api.http_request('server/ssh-key/delete', json_body=data)
            if not response:
                self.logger.error('Could not delete SSH key in AETROS Trainer.')

        if parsed_args.generate_ssh_key and ssh_key_registered:
            import atexit
            atexit.register(delete_ssh_key)

        if parsed_args.host:
            self.config['host'] = parsed_args.host

        if self.ssh_key_private:
            self.config['ssh_key_base64'] = self.ssh_key_private

        self.server = ServerClient(self.config, event_listener, self.logger)

        self.general_logger_stdout = GeneralLogger(job_backend=self, redirect_to=sys.__stdout__)
        self.general_logger_stderr = GeneralLogger(job_backend=self, redirect_to=sys.__stderr__)

        sys.stdout = self.general_logger_stdout
        sys.stderr = self.general_logger_stderr

        self.server.configure(parsed_args.name)
        self.logger.debug('Connecting to ' + self.config['host'])
        self.server.start()
        self.write_log("\n")

        try:
            while self.active:
                if self.registered:
                    self.server.send_message({'type': 'utilization', 'values': self.collect_system_utilization()})
                    self.check_finished_jobs()

                time.sleep(1)
        except KeyboardInterrupt:
            self.logger.warning('Aborted')
            self.stop()
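The `--max-gpus` handling above validates every requested device id against the devices actually detected. A standalone sketch of that validation (the function name is illustrative):

def parse_gpu_ids(spec, detected_count):
    # spec is a comma-separated id list like '0,2'; raises when an id is
    # outside the range of detected devices, mirroring --max-gpus above.
    enabled = []
    for part in spec.split(','):
        i = int(part)
        if i < 0 or i >= detected_count:
            raise Exception('--max-gpus ' + str(i) + ' not available on the system. '
                            'GPUs ' + str(list(range(detected_count))) + ' detected.')
        enabled.append(i)
    return enabled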
Exemplo n.º 26
0
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' job-files')
        parser.add_argument('job_id',
                            help="Short or long job id like ed4d6a204")
        parser.add_argument('folder',
                            nargs='?',
                            help="Limit files list to folder. Default root ./")
        parser.add_argument('-r',
                            action='store_true',
                            help="Recursive files tree")
        parser.add_argument(
            '--model',
            help=
            "Model name like peter/mnist. Per default from found configuration."
        )
        parser.add_argument(
            '-c',
            '--config',
            help=
            "Default aetros.yml in current working directory or directories above."
        )

        parsed_args = parser.parse_args(args)

        if not parsed_args.job_id:
            parser.print_help()
            sys.exit()

        home_config = read_home_config()
        config = find_config(parsed_args.config)
        model = parsed_args.model if parsed_args.model else config['model']

        if not model:
            print(
                "No model defined. Use --model or switch into a directory where you executed 'aetros init model-name'."
            )
            sys.exit(2)

        git_dir = os.path.normpath(home_config['storage_dir'] + '/' + model +
                                   '.git')

        id_map = {}

        for job_id in [parsed_args.job_id]:
            full_id = git_has_local_job(home_config, model, job_id)
            id_map[job_id] = full_id
            if not full_id:
                full_id = git_has_remote_job(home_config, model, job_id)
                id_map[job_id] = full_id
                if full_id:
                    print("Pull job %s to local ... " % (job_id, ))
                    ref = 'refs/aetros/job/' + full_id
                    subprocess.call([
                        home_config['git'], '--bare', '--git-dir', git_dir,
                        'fetch', 'origin', ref + ':' + ref
                    ])
                else:
                    print("Job %s not found." % (job_id, ))
                    sys.exit(2)

        ref = 'refs/aetros/job/' + id_map[parsed_args.job_id]

        print("List job files of %s of %s" % (parsed_args.job_id, model))
        args = [
            home_config['git'], '--bare', '--git-dir', git_dir, 'ls-tree',
            '--long'
        ]
        if parsed_args.r:
            args.append('-r')

        args.append(ref)

        if parsed_args.folder:
            args.append(parsed_args.folder)
        subprocess.call(args)
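`git ls-tree --long`, as invoked above, prints one record per entry in the form `<mode> <type> <object> <size>\t<path>`, where the size column is `-` for trees. A small parser sketch for that output (the function name is illustrative):

def parse_ls_tree_long(line):
    # Metadata columns are space-separated; the path follows a tab.
    meta, path = line.rstrip('\n').split('\t', 1)
    mode, otype, obj, size = meta.split()
    return {'mode': mode, 'type': otype, 'object': obj,
            'size': None if size == '-' else int(size), 'path': path}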
Exemplo n.º 27
0
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' run')
        parser.add_argument(
            'command',
            nargs='?',
            help="The command to run. Default read in configuration file")
        parser.add_argument(
            '-i',
            '--image',
            help=
            "Which Docker image to use for the command. Default read in configuration file. If not specified, command is executed on the host."
        )
        parser.add_argument(
            '--no-image',
            action='store_true',
            help=
            "Forces not to use docker, even when image is defined in the configuration file."
        )

        parser.add_argument(
            '-s',
            '--server',
            action='append',
            help=
            "Limits the server pool to this server. Default not limitation or read in configuration file. Multiple --server allowed."
        )
        parser.add_argument(
            '-m',
            '--model',
            help=
            "Under which model this job should be listed. Default read in configuration file"
        )
        parser.add_argument(
            '-l',
            '--local',
            action='store_true',
            help="Start the job immediately on the current machine.")
        parser.add_argument(
            '-c',
            '--config',
            help="Default aetros.yml in current working directory.")
        parser.add_argument(
            '--priority',
            help="Increases or decreases priority. Default is 0.")

        parser.add_argument(
            '--cpu',
            help="How many CPU cores should be assigned to job. Docker only.")
        parser.add_argument(
            '--memory',
            help="How much memory should be assigned to job. Docker only.")
        parser.add_argument(
            '--gpu',
            help="How many GPU cards should be assigned to job. Docker only.")
        parser.add_argument(
            '--gpu_memory',
            help="Memory requirement for the GPU. Docker only.")

        parser.add_argument(
            '--offline',
            '-o',
            action='store_true',
            help="Whether the execution should happen offline.")

        parser.add_argument(
            '--rebuild-image',
            action='store_true',
            help="Makes sure the Docker image is re-built without cache.")

        parser.add_argument(
            '--max-time',
            help=
            "Limit execution time in seconds. Sends SIGINT to the process group when reached."
        )
        parser.add_argument(
            '--max-epochs',
            help=
            "Limit execution epochs. Sends SIGINT to the process group when reached."
        )

        parser.add_argument(
            '--gpu-device',
            action='append',
            help=
            "Which device id should be mapped into the NVIDIA docker container. Only when --local"
        )

        parser.add_argument('--volume',
                            '-v',
                            action='append',
                            help="Volume into docker. Only when --local")
        parser.add_argument(
            '-e',
            action='append',
            help=
            "Sets additional environment variables. '-e name=value' to set value, or '-e name' to read from current env"
        )

        parser.add_argument(
            '-p',
            '--param',
            action='append',
            help=
            "Sets a hyperparameter, example '--param name=value'. Multiple --param allowed."
        )

        parsed_args = parser.parse_args(args)

        if parsed_args.config and not os.path.exists(parsed_args.config):
            self.logger.error("fatal: file %s does not exist." %
                              (parsed_args.config, ))
            sys.exit(2)

        config = find_config(parsed_args.config)
        home_config = read_home_config()

        if config['model'] and not parsed_args.model:
            parsed_args.model = config['model']

        if not parsed_args.model:
            print(
                "fatal: no model defined. Use --model or switch into a directory where you executed 'aetros init model-name'."
            )
            sys.exit(2)

        if not parsed_args.local and parsed_args.volume:
            print(
                "fatal: can not use volume with jobs on the cluster. Use datasets instead."
            )
            sys.exit(1)

        if parsed_args.local and parsed_args.priority:
            print(
                "fatal: the priority can only be set for jobs in the cluster.")
            sys.exit(1)

        if config['image']:
            ensure_docker_installed(self.logger)

        env = {}
        if parsed_args.e:
            for item in parsed_args.e:
                if '=' in item:
                    k, v = item.split('=', 1)
                else:
                    k = item
                    v = os.getenv(k)
                env[k] = v

        if ('command' not in config
                or not config['command']) and not parsed_args.command:
            self.logger.error(
                'No command given. Define the command in aetros.yml or use command argument.'
            )
            sys.exit(1)

        job_backend = JobBackend(parsed_args.model, self.logger)

        ignore = []
        if 'ignore' in config:
            ignore = config['ignore']
        job_backend.job = {'config': {'ignore': ignore}}

        adding_files = loading_text("- Adding job files to index ... ")
        files_added, size_added = job_backend.add_files(config['root'],
                                                        report=False)
        adding_files("done with %d file%s added (%s)." %
                     (files_added, 's' if files_added != 1 else '',
                      human_size(size_added, 2)))

        create_info = {'type': 'custom', 'config': config}

        incoming_hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception(
                        '--param ' + param +
                        ' does not contain a `=`. Please use "--param name=value"'
                    )

                name, value = param.split('=', 1)
                incoming_hyperparameter[name] = value

        # first, transform the simple format into the full definition with parameter types
        # (string, number, group, choice_group, etc)
        full_hyperparameters = lose_parameters_to_full(config['parameters'])

        # now extract hyperparameters from full definition, and overwrite stuff using
        # incoming_hyperparameter if available
        hyperparameter = extract_parameters(full_hyperparameters,
                                            incoming_hyperparameter)

        create_info['config']['parameters'] = hyperparameter

        if parsed_args.rebuild_image:
            create_info['config']['rebuild_image'] = True

        if parsed_args.max_epochs:
            create_info['config']['maxEpochs'] = int(parsed_args.max_epochs)

        create_info['config']['priority'] = 0
        if parsed_args.priority:
            create_info['config']['priority'] = float(parsed_args.priority)

        if parsed_args.max_time:
            create_info['config']['maxTime'] = float(parsed_args.max_time)

        if parsed_args.command:
            create_info['config']['command'] = parsed_args.command

        if parsed_args.image:
            # reset install options, since we can't be sure the base image still fits
            if 'image' in config and config[
                    'image'] and config['image'] != parsed_args.image:
                create_info['config']['install'] = None

            # reset dockerfile, since we manually specified an image
            create_info['config']['dockerfile'] = None
            create_info['config']['image'] = parsed_args.image

        if parsed_args.no_image:
            create_info['config']['image'] = None

        if parsed_args.server:
            create_info['config']['servers'] = []
            for name in parsed_args.server:
                create_info['config']['servers'].append(name)

        create_info['config']['resources'] = create_info['config'].get(
            'resources', {})
        resources = create_info['config']['resources']

        default_cpu_and_memory = 1 if create_info['config']['image'] else 0
        resources['cpu'] = int(parsed_args.cpu
                               or resources.get('cpu', default_cpu_and_memory))
        resources['memory'] = int(
            parsed_args.memory
            or resources.get('memory', default_cpu_and_memory))
        resources['gpu'] = int(parsed_args.gpu or resources.get('gpu', 0))
        resources['gpu_memory'] = int(parsed_args.gpu_memory
                                      or resources.get('gpu_memory', 0))

        if parsed_args.local:
            create_info['server'] = 'local'

            # make sure we do not limit the resources to something that is not available on the local machine
            warning = []
            cpu = cpuinfo.get_cpu_info()
            mem = psutil.virtual_memory().total
            gpu = 0
            try:
                gpu = len(get_ordered_devices())
            except CudaNotImplementedException:
                pass

            if not create_info['config']['image'] and not all(
                [x == 0 for x in six.itervalues(resources)]):
                self.logger.warning(
                    "! No Docker virtualization since no `image` defined, resources limitation ignored."
                )

            if create_info['config']['image'] and resources['gpu'] > 0:
                if not (sys.platform == "linux" or sys.platform == "linux2"):
                    self.logger.warning(
                        "! Your operating system does not support GPU allocation for "
                        "Docker virtualization. "
                        "NVIDIA-Docker2 is only supported on Linux.")

            local_max_resources = {
                'cpu': cpu['count'],
                'memory': ceil(mem / 1024 / 1024 / 1024),
                'gpu': gpu
            }

            if create_info['config']['image']:
                # read max hardware within Docker
                out = docker_call([
                    'run', 'alpine', 'sh', '-c',
                    'nproc && cat /proc/meminfo | grep MemTotal'
                ])
                cpus, memory = out.decode('utf-8').strip().split('\n')
                local_max_resources['cpu'] = int(cpus)

                memory = memory.replace('MemTotal:', '').replace('kB',
                                                                 '').strip()
                local_max_resources['memory'] = ceil(int(memory) / 1024 / 1024)

            if local_max_resources['cpu'] < resources['cpu']:
                warning.append('CPU cores %d -> %d' %
                               (resources['cpu'], local_max_resources['cpu']))
                resources['cpu'] = local_max_resources['cpu']

            if local_max_resources['memory'] < resources['memory']:
                warning.append(
                    'memory %dGB -> %dGB' %
                    (resources['memory'], local_max_resources['memory']))
                resources['memory'] = local_max_resources['memory']

            if local_max_resources['gpu'] < resources['gpu']:
                warning.append('GPU cards %d -> %d' %
                               (resources['gpu'], local_max_resources['gpu']))
                resources['gpu'] = local_max_resources['gpu']

            if warning:
                self.logger.warning(
                    "! Resources downgrade due to missing hardware: %s." %
                    (', '.join(warning), ))

        if parsed_args.config and not create_info['config']['configPath']:
            create_info['config']['configPath'] = parsed_args.config

        create_info['config']['sourcesAttached'] = True

        creating_git_job = loading_text("- Create job in local Git ... ")
        if aetros.utils.git.get_current_commit_hash():
            create_info['origin_git_source'] = {
                'origin': aetros.utils.git.get_current_remote_url(),
                'author': aetros.utils.git.get_current_commit_author(),
                'message': aetros.utils.git.get_current_commit_message(),
                'branch': aetros.utils.git.get_current_branch(),
                'commit': aetros.utils.git.get_current_commit_hash(),
            }

        job_backend.create(create_info=create_info, server=None)
        creating_git_job("created %s in %s." %
                         (job_backend.job_id[0:9], job_backend.model_name))

        summary = "➤ Summary: Job running "
        if parsed_args.local:
            summary += 'locally'
        else:
            summary += 'on the cluster'

        if create_info['config']['image']:
            summary += ' in Docker using image %s with %d CPU cores, %dGB memory and %d GPUs.' \
                       % (create_info['config']['image'], resources['cpu'], resources['memory'], resources['gpu'])
        else:
            summary += ' on host using all available resources.'

        print(summary)

        # tasks = []
        #
        # if 'tasks' in config:
        #     for name, task_config in six.iteritems(config['tasks']):
        #         replica = 1
        #         if 'replica' in task_config:
        #             replica = int(task_config['replica'])
        #         for index in range(0, replica):
        #             tasks.append(job_backend.create_task(job_id, task_config, name, index))

        if parsed_args.offline:
            if not parsed_args.local:
                self.logger.warning(
                    "Cannot create a remote job in offline mode.")
                sys.exit(1)

            self.logger.warning("Execution started offline.")
        else:
            adding_files = loading_text("- Connecting to " +
                                        home_config['host'] + " ... ")
            if job_backend.connect():
                adding_files("connected.")
            else:
                parsed_args.offline = True
                adding_files("failed. Continue in offline mode.")

        if not parsed_args.offline:
            sys.stdout.write("- Uploading job data ... ")
            job_backend.git.push()
            job_backend.client.wait_until_queue_empty(['files'],
                                                      clear_end=False)

            sys.stdout.write(" done.\n")

            link = "%s/model/%s/job/%s" % (
                home_config['url'], job_backend.model_name, job_backend.job_id)
            sys.__stdout__.write(u"➤ Monitor job at %s\n" % (link))

        if parsed_args.local:
            job_backend.start(collect_system=False,
                              offline=parsed_args.offline,
                              push=False)

            if not parsed_args.offline:
                job_backend.git.start_push_sync()

            cpus = create_info['config']['resources']['cpu']
            memory = create_info['config']['resources']['memory']

            if not parsed_args.gpu_device and create_info['config'][
                    'resources']['gpu'] > 0:
                # if 2 GPUs are requested and we have 3 GPUs with ids [0, 1, 2],
                # the device list should be [0, 1]
                parsed_args.gpu_device = []
                for i in range(0, create_info['config']['resources']['gpu']):
                    parsed_args.gpu_device.append(i)

            start_command(self.logger,
                          job_backend,
                          env,
                          parsed_args.volume,
                          cpus=cpus,
                          memory=memory,
                          gpu_devices=parsed_args.gpu_device,
                          offline=parsed_args.offline)
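
For orientation, here is a minimal sketch of the create_info structure consumed above; the key names are taken from the accesses in the code, while the concrete values are purely hypothetical:

# Hypothetical skeleton of create_info as read by the code above.
create_info = {
    'config': {
        'image': 'python:3.6',       # Docker image; empty means host execution
        'configPath': 'aetros.yml',
        'sourcesAttached': True,
        'resources': {'cpu': 4, 'memory': 8, 'gpu': 1},  # memory in GB
    }
}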
Example no. 28
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            prog=aetros.const.__prog__ + ' init')
        parser.add_argument('name', help="Model name")
        parser.add_argument('directory', nargs='?', help="Directory; defaults to the current directory.")
        parser.add_argument('--organisation', '-o', help="Create the model in the organisation instead of the user account.")
        parser.add_argument('--space', '-s', help="Create the model in given space. If space does not exist, create it.")
        parser.add_argument('--private', action='store_true', help="Make the model private. Example: aetros init my-model --private")
        parser.add_argument('--force', '-f', action='store_true', help="Force overwriting of already existing configuration file.")

        home_config = read_home_config()
        parsed_args = parser.parse_args(args)
        if not parsed_args.name:
            parser.print_help()
            sys.exit(1)

        path = os.getcwd()
        if parsed_args.directory:
            path = os.path.abspath(parsed_args.directory)

        if os.path.exists(path) and not os.path.isdir(path):
            sys.stderr.write('Path already exists and is not a directory: ' + path + '\n')
            sys.exit(1)

        if not os.path.exists(path):
            os.makedirs(path)

        yaml = ruamel.yaml.YAML()
        config = {}

        if os.path.exists(path+'/aetros.yml'):
            with open(path+'/aetros.yml', 'r') as f:
                config = yaml.load(f)

            if isinstance(config, dict) and 'model' in config and not parsed_args.force:
                print("failed: aetros.yml already exists in with a linked model to " + config['model']+ '. Use -f to force.')
                sys.exit(1)

        if not parsed_args.private:
            print("Warning: creating public model. Use --private to create private models.")

        if '/' in parsed_args.name:
            sys.stderr.write("No '/' allowed in the name. Use -o if the model should be created in an organisation.\n")
            sys.exit(1)

        response = api.create_model(parsed_args.name or (os.path.basename(os.getcwd())), parsed_args.organisation, parsed_args.space, parsed_args.private)
        name = response['name']

        if response['already_exists']:
            print("Notice: Model already exists remotely.")

        config['model'] = name

        with open(path + '/aetros.yml', 'w+') as f:
            yaml.dump(config, f)

        print("aetros.yml created and linked with model " + name + ' in ' + path)
        print("Open AETROS Trainer to see the model at https://" + home_config['host'] + '/model/' + name)

        git_remote_url = 'git@%s:%s.git' % (home_config['host'], name)

        print("Use git to store your source code. Each model has its own Git repository.")
        print("  $ cd " + path)
        print("  $ git init")
        print("  $ git remote add origin " + git_remote_url)
        print("  $ git add .")
        print("  $ git commit -m 'first commit'")
        print("  $ git push origin master")
Example no. 29
def start_command(logger, job_backend, env_overwrite=None, volumes=None, cpus=1, memory=1,
                  gpu_devices=None, offline=False):

    home_config = read_home_config()

    env = {}
    if env_overwrite:
        env.update(env_overwrite)

    start_time = time.time()
    env['AETROS_MODEL_NAME'] = job_backend.model_name
    env['AETROS_JOB_ID'] = str(job_backend.job_id)
    env['AETROS_OFFLINE'] = '1' if offline else ''
    env['AETROS_GIT_INDEX_FILE'] = job_backend.git.index_path
    env['DEBUG'] = os.getenv('DEBUG', '')
    env['PYTHONUNBUFFERED'] = os.getenv('PYTHONUNBUFFERED', '1')
    env['PYTHONIOENCODING'] = os.getenv('PYTHONIOENCODING', 'UTF-8')
    env['AETROS_ATTY'] = '1'
    env['AETROS_GIT'] = job_backend.git.get_base_command()

    env['PATH'] = os.getenv('PATH', '')
    if 'PYTHONPATH' not in env:
        env['PYTHONPATH'] = os.getenv('PYTHONPATH', '')

    if os.getenv('AETROS_SSH_KEY_BASE64'):
        env['AETROS_SSH_KEY_BASE64'] = os.getenv('AETROS_SSH_KEY_BASE64')
    elif get_ssh_key_for_host(home_config['host']):
        # we need to read the key into env so the docker container can connect to AETROS
        with open(get_ssh_key_for_host(home_config['host']), 'r') as f:
            env['AETROS_SSH_KEY_BASE64'] = f.read()

    job_config = job_backend.job['config']
    job = job_backend.get_job_model()

    if 'command' not in job_config:
        job_backend.fail('No "command" given. See Configuration section in the documentation.')

    job_commands = job_config['command']
    docker_image = job_config['image']

    if job_backend.is_simple_model():
        if docker_image:
            simple_command = ['python']
        else:
            simple_command = [sys.executable]

        simple_command += ['-m', 'aetros', 'start-simple', job_backend.model_name + '/' + job_backend.job_id]
        job_commands = {'run': ' '.join(simple_command)}

    if job_commands is None:
        raise Exception('No command specified.')

    if not isinstance(job_commands, list) and not isinstance(job_commands, dict):
        job_commands = [job_commands]

    # replace {{parameter}} placeholders, e.g. {{batch_size}}
    if isinstance(job_config['parameters'], dict):
        for key, value in six.iteritems(flatten_parameters(job_config['parameters'])):
            if isinstance(job_commands, list):
                for k, v in enumerate(job_commands):
                    if isinstance(job_commands[k], six.string_types):
                        job_commands[k] = job_commands[k].replace('{{' + key + '}}', simplejson.dumps(value))

            elif isinstance(job_commands, dict):
                for k, v in six.iteritems(job_commands):
                    if isinstance(job_commands[k], six.string_types):
                        job_commands[k] = job_commands[k].replace('{{' + key + '}}', simplejson.dumps(value))

    job_backend.set_system_info('commands', job_commands)
    os.chdir(job_backend.git.work_tree)

    docker_image_built = False

    if docker_image and (job_config['dockerfile'] or job_config['install']):
        rebuild_image = job_config['rebuild_image'] if 'rebuild_image' in job_config else False
        docker_image = docker_build_image(logger, home_config, job_backend, rebuild_image)
        docker_image_built = True

    job_backend.collect_device_information(gpu_devices)

    state = {'last_process': None}
    job_backend.set_system_info('processRunning', False, True)

    def pause():
        if not state['last_process'] or state['last_process'].poll() is not None:
            # no running process
            return

        if docker_image:
            if docker_pause(logger, home_config, job_backend):
                job_backend.set_paused(True)
        else:
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGSTOP)
            job_backend.set_paused(True)

    def cont():
        if not state['last_process'] or state['last_process'].poll() is not None:
            # no running process
            return

        job_backend.set_paused(False)
        if docker_image:
            docker_continue(logger, home_config, job_backend)
        else:
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGCONT)

    job_backend.on_pause = pause
    job_backend.on_continue = cont

    if docker_image:
        env['AETROS_GIT_INDEX_FILE'] = '/aetros/' + job_backend.model_name + '.git/' + os.path.basename(env['AETROS_GIT_INDEX_FILE'])

        with job_backend.git.batch_commit('JOB_SYSTEM_INFORMATION'):
            aetros_environment = {'aetros_version': __version__, 'variables': env.copy()}
            if 'AETROS_SSH_KEY' in aetros_environment['variables']: del aetros_environment['variables']['AETROS_SSH_KEY']
            if 'AETROS_SSH_KEY_BASE64' in aetros_environment['variables']: del aetros_environment['variables']['AETROS_SSH_KEY_BASE64']
            job_backend.set_system_info('environment', aetros_environment)

            job_backend.set_system_info('memory_total', memory * 1024 * 1024 * 1024)

            import cpuinfo
            cpu = cpuinfo.get_cpu_info()
            job_backend.set_system_info('cpu_name', cpu['brand'])
            job_backend.set_system_info('cpu', [cpu['hz_actual_raw'][0], cpus])

        job_backend.start_monitoring(cpu_cores=cpus, gpu_devices=gpu_devices, docker_container=job_backend.job_id)

        if not docker_image_built:
            docker_pull_image(logger, home_config, job_backend)

        docker_image_information(logger, home_config, job_backend)

        # make sure old container is removed
        subprocess.Popen([home_config['docker'], 'rm', job_backend.job_id], stderr=subprocess.PIPE).wait()

        command = docker_command_wrapper(logger, home_config, job_backend, volumes, cpus, memory, gpu_devices, env)

        # since Linux doesn't deliver SIGINT to a PID 1 process that has no signal
        # handler installed, we make sure one is attached to the PID 1 process
        trap = 'trapIt () { "$@"& pid="$!"; trap "kill -INT $pid" INT TERM; ' \
               'while kill -0 $pid > /dev/null 2>&1; do wait $pid; ec="$?"; done; exit $ec;};'

        command.append(docker_image)
        command += ['/bin/sh', '-c', trap + 'trapIt /bin/sh /job/aetros/command.sh']
    else:
        # non-docker
        # env['PYTHONPATH'] += ':' + os.getcwd()
        job_backend.collect_system_information()
        job_backend.collect_environment(env)
        job_backend.start_monitoring(gpu_devices=gpu_devices)

        command = ['/bin/sh', job_backend.git.work_tree + '/aetros/command.sh']

    logger.debug("$ %s " % (' '.join([simplejson.dumps(a) for a in command])))
    job_backend.set_system_info('image/name', str(docker_image))

    p = None
    exited = False
    last_return_code = None
    state['last_process'] = None
    all_done = False
    command_stats = None
    files = job_backend.file_list()

    def clean():
        # clear working tree
        shutil.rmtree(job_backend.git.work_tree)

    def on_force_exit():
        # make sure the process dies
        clean()

        with open(os.devnull, 'r+b', 0) as DEVNULL:
            if docker_image:
                # docker run does not proxy INT signals to the docker-engine,
                # so we need to do it on our own directly.
                subprocess.Popen(args=[home_config['docker'], 'kill', job_backend.job_id], stdout=DEVNULL, stderr=DEVNULL).wait()
            elif not exited and state['last_process'] and state['last_process'].poll() is None:
                # wait for last command
                os.killpg(os.getpgid(state['last_process'].pid), signal.SIGKILL)

    job_backend.on_force_exit = on_force_exit

    try:
        job_backend.set_status('STARTED', add_section=False)
        # logger.warning("$ %s " % (str(command),))

        # make sure maxTime limitation is correctly calculated
        job_backend.monitoring_thread.handle_max_time = True
        job_backend.monitoring_thread.handle_max_time_time = time.time()

        # Since JobBackend sends SIGINT to its current process group, it would also signal
        # its parents when they share the same process group. We therefore move the child
        # into its own process group; otherwise the master process (e.g. the server
        # command) would receive the SIGINT as well.
        kwargs = {}
        if os.name == 'nt':
            kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            kwargs['preexec_fn'] = os.setsid

        # only use the full host environment when no Docker image is used

        command_env = env
        if not docker_image:
            command_env = os.environ.copy()
            command_env.update(env)
            if os.environ.get('LD_LIBRARY_PATH', None):
                command_env['LD_LIBRARY_PATH_ORI'] = command_env['LD_LIBRARY_PATH']

        def write_command_sh(job_command):
            with open(job_backend.git.work_tree + '/aetros/command.sh', 'w+') as f:
                if not docker_image:
                    # new shells unset LD_LIBRARY_PATH automatically, so we make sure it is restored
                    f.write('export LD_LIBRARY_PATH=$LD_LIBRARY_PATH_ORI;\n')

                if job.get_working_dir():
                    f.write('cd %s;\n' % (job.get_working_dir(),))

                f.write(job_command)

        def read_line(line):
            handled, filtered_line, failed = extract_api_calls(line, job_backend.handle_stdout_api, logger=logger)

            if is_debug():
                for call in handled:
                    logger.debug('STDOUT API CALL: ' + str(call))

            for fail in failed:
                logger.warning("API call failed '%s': %s %s"
                               % (str(fail['line']), str(type(fail['exception']).__name__), str(fail['exception'])))

            return filtered_line

        def exec_command(index, command, job_command):
            write_command_sh(job_command)
            print('%s $ %s' % ('/' + job.get_working_dir(), job_command.strip()))
            args = command
            logger.debug('$ ' + ' '.join([simplejson.dumps(a) for a in args]))

            command_stats[index]['started'] = time.time() - start_time
            job_backend.set_system_info('command_stats', command_stats, True)

            # important to prefix it, otherwise name='master' would reset all stats in controller backend
            command_env['AETROS_JOB_NAME'] = 'command_' + str(index)

            state['last_process'] = subprocess.Popen(
                args=args, bufsize=0, stderr=subprocess.PIPE, stdout=subprocess.PIPE, env=command_env, **kwargs
            )
            job_backend.set_system_info('processRunning', True, True)
            wait_stdout = sys.stdout.attach(state['last_process'].stdout, read_line=read_line)
            wait_stderr = sys.stderr.attach(state['last_process'].stderr)
            state['last_process'].wait()
            # record the return code of this command, not the stale value of the previous one
            command_stats[index]['rc'] = state['last_process'].poll()
            command_stats[index]['ended'] = time.time() - start_time
            job_backend.set_system_info('command_stats', command_stats, True)
            job_backend.set_system_info('processRunning', False, True)
            wait_stdout()
            wait_stderr()
            # make sure a new line is printed after a command
            print("")

            return state['last_process']

        done = 0
        total = len(job_commands)
        if isinstance(job_commands, list):
            command_stats = [{'rc': None, 'started': None, 'ended': None} for x in job_commands]
            for k, job_command in enumerate(job_commands):
                job_backend.set_status('Command ' + str(k+1))

                p = exec_command(k, command, job_command)
                last_return_code = p.poll()

                if last_return_code == 0:
                    done += 1
                else:
                    # one failed, so exit and don't execute next
                    break

        if isinstance(job_commands, dict):
            command_stats = {}
            for name, job_command in six.iteritems(job_commands):
                command_stats[name] = {'rc': None, 'started': None, 'ended': None}

            for name, job_command in six.iteritems(job_commands):
                job_backend.set_status('Command ' + name)

                p = exec_command(name, command, job_command)
                last_return_code = p.poll()

                if last_return_code == 0:
                    done += 1
                else:
                    # one failed, so exit and don't execute next
                    break

        all_done = done == total
        exited = True

        if state['last_process']:
            sys.exit(state['last_process'].poll())
        else:
            sys.exit(1)

    except SystemExit:
        # since we started the command in a new process group, a SIGINT or CTRL+C on this process won't affect
        # our actual command process. So we need to take care that we stop everything.
        logger.debug("SystemExit, exited=%s, all-done=%s, has-last-process=%s, pid=%s" %(
            str(exited),
            str(all_done),
            state['last_process'] is not None,
            state['last_process'].poll() if state['last_process'] else None
        ))

        # make sure the process dies
        if docker_image:
            # docker run does not proxy INT signals to the docker-engine,
            # so we need to do it on our own directly.
            p = subprocess.Popen(args=[home_config['docker'], 'inspect', job_backend.job_id],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            p.wait()
            if p.poll() == 0:
                subprocess.Popen(args=[home_config['docker'], 'kill', job_backend.job_id]).wait()
        elif not exited and state['last_process'] and state['last_process'].poll() is None:
            # wait for last command
            os.killpg(os.getpgid(state['last_process'].pid), signal.SIGINT)
            state['last_process'].wait()

        if 'output' in job_config and job_config['output']:
            upload_output_files(job_backend, job_config['output'])

        if exited:
            if all_done:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_DONE)
            else:
                job_backend.stop(progress=JOB_STATUS.PROGRESS_STATUS_FAILED)
        else:
            # master received SIGINT before all job commands exited.
            if not job_backend.in_early_stop:
                # in_early_stop indicates a planned stop (e.g. the maxTime limitation),
                # which should mark the job as done rather than call abort().
                # If it is not set, the master received a SIGINT without an early stop,
                # so mark the job as aborted.
                job_backend.abort()
            else:
                # let the on_shutdown listener handle the rest
                pass

        clean()
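
A standalone sketch of the {{parameter}} substitution performed in start_command above, with hypothetical parameter names and values:

import simplejson

def render_command(command, parameters):
    # replace each {{name}} placeholder with its JSON-encoded value
    for key, value in parameters.items():
        command = command.replace('{{' + key + '}}', simplejson.dumps(value))
    return command

print(render_command('python train.py --batch-size {{batch_size}}', {'batch_size': 32}))
# -> python train.py --batch-size 32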
Example no. 30
    def diff_objects(self, latest_commit_sha):
        """
                Push all changes to origin, based on objects, not on commits.
                Important: Call this push after every new commit, or we lose commits.
                """
        base = ['git', '--bare', '--git-dir', self.git_path]

        object_shas = []
        summary = {'commits': [], 'trees': [], 'files': []}

        def read_parents_and_tree_from(commit):
            if commit in self.synced_object_shas or commit in object_shas:
                # this commit has already been synced or read
                return None, None

            self.synced_object_shas[commit] = True
            summary['commits'].append(commit)
            object_shas.append(commit)

            object_content = subprocess.check_output(base + ['cat-file', '-p', commit]).decode('utf-8').strip()
            parents = []
            tree = ''
            for line in object_content.splitlines():
                if line.startswith('tree '):
                    tree = line[len('tree '):]
                if line.startswith('parent '):
                    parents.append(line[len('parent '):])

            return parents, tree

        def collect_files_from_tree(tree):
            if tree in self.synced_object_shas or tree in object_shas:
                # we have exactly this tree already synced or read, meaning all its objects as well
                return

            self.synced_object_shas[tree] = True
            summary['trees'].append(tree)
            object_shas.append(tree)

            object_content = subprocess.check_output(base + ['ls-tree', '-r', '-t', tree]).decode('utf-8').strip()

            for line in object_content.splitlines():
                # ls-tree line format: "<mode> <type> <sha>\t<path>"; splitting on
                # spaces leaves "<sha>\t<path>" as the third field
                exploded = line.split(' ')

                if len(exploded) < 3:
                    sys.stderr.write("Error: Wrong line format of ls-tree for %s: %s\n" % (tree, line,))
                    sys.exit(1)

                # the first 40 characters are the SHA-1, then a tab, then the path
                object_to_add = str(exploded[2][:40])
                path = str(exploded[2][41:])

                if object_to_add in self.synced_object_shas or object_to_add in object_shas:
                    # have it already in the list or already synced
                    continue

                object_shas.append(object_to_add)
                self.synced_object_shas[object_to_add] = True
                summary['files'].append([object_to_add, path])

        commits_to_check = [latest_commit_sha]

        while len(commits_to_check):
            sha = commits_to_check.pop(0)
            parents, tree = read_parents_and_tree_from(sha)

            if parents:
                for parent in parents:
                    if parent not in commits_to_check:
                        commits_to_check.append(parent)

            if tree:
                collect_files_from_tree(tree)

        is_debug2() and self.logger.debug("shas_to_check %d: %s " % (len(object_shas), str(object_shas),))

        if not object_shas:
            return [], summary

        try:
            is_debug2() and self.logger.debug("Do git-cat-file-check.sh")

            ssh_stream = create_ssh_stream(read_home_config(), exit_on_failure=False)
            channel = ssh_stream.get_transport().open_session()
            channel.exec_command('git-cat-file-check.sh "%s"' % (self.model_name + '.git',))
            channel.sendall('\n'.join(object_shas))
            channel.shutdown_write()

            def readall(c):
                content = b''
                while True:
                    try:
                        chunk = c.recv(1024)
                        if chunk == b'':
                            break
                        content += chunk
                    except (KeyboardInterrupt, SystemExit):
                        # return what was received so far instead of None
                        return content

                return content

            missing_objects = readall(channel).decode('utf-8').splitlines()
            channel.close()
            ssh_stream.close()

            # make sure we have in summary only SHAs we actually will sync
            for stype in six.iterkeys(summary):
                ids = summary[stype][:]
                for sha in ids:
                    if stype == 'files':
                        if sha[0] not in missing_objects:
                            summary[stype].remove(sha)
                    else:
                        if sha not in missing_objects:
                            summary[stype].remove(sha)

            return missing_objects, summary
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.logger.error("Failed to generate diff_objects: %s" % (str(e),))
            for sha in object_shas:
                if sha in self.synced_object_shas:
                    del self.synced_object_shas[sha]
            return None, None
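
The commit traversal above relies on the plumbing output of git cat-file. A standalone sketch of the same parsing against an arbitrary repository (the git directory and SHA arguments are hypothetical):

import subprocess

def parents_and_tree(git_dir, commit_sha):
    # `git cat-file -p <commit>` prints a "tree <sha>" line and one
    # "parent <sha>" line per parent commit.
    out = subprocess.check_output(
        ['git', '--git-dir', git_dir, 'cat-file', '-p', commit_sha]
    ).decode('utf-8')

    parents, tree = [], ''
    for line in out.splitlines():
        if line.startswith('tree '):
            tree = line[len('tree '):]
        elif line.startswith('parent '):
            parents.append(line[len('parent '):])
    return parents, tree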
Example no. 31
def start_custom(logger, job_backend):
    job_model = job_backend.get_job_model()
    config = job_model.config

    custom_git = False

    if 'gitCustom' in config and config['gitCustom']:
        custom_git = config['gitCustom']

    if custom_git and ('sourceGitUrl' not in config
                       or not config['sourceGitUrl']):
        raise Exception('Server git url is not configured. Aborted')

    if 'sourcePythonScript' not in config or not config['sourcePythonScript']:
        raise Exception('Server python script is not configured. Aborted')

    python_script = config['sourcePythonScript']
    git_tree = 'master'

    if custom_git:
        git_url = config['sourceGitUrl']
    else:
        user_config = read_home_config()
        git_url = 'git@' + user_config['host'] + ':' + job_backend.model_name + '.git'

    if 'sourceGitTree' in config and config['sourceGitTree']:
        git_tree = config['sourceGitTree']

    work_tree = job_backend.git.work_tree

    my_env = os.environ.copy()
    if 'PYTHONPATH' not in my_env:
        my_env['PYTHONPATH'] = ''
    my_env['PYTHONPATH'] += ':' + os.getcwd()
    my_env['AETROS_MODEL_NAME'] = job_backend.model_name
    my_env['AETROS_JOB_ID'] = job_backend.job_id
    my_env['AETROS_ATTY'] = '1'

    logger.info("Setting up git repository %s in %s" % (git_url, work_tree))
    logger.info("Using git tree of '%s'" % (git_tree, ))

    try:
        if os.path.exists(work_tree):
            shutil.rmtree(work_tree)

        args = ['git', 'clone', git_url, work_tree]
        code = subprocess.call(args, stderr=sys.stderr, stdout=sys.stdout)
        if code != 0:
            raise Exception('Could not clone repository %s to %s' %
                            (git_url, work_tree))

        # make sure the requested branch exists in the local git; FETCH_HEAD then points to it.
        git_execute(logger, work_tree, ['fetch', 'origin', git_tree])
        git_execute(logger, work_tree, ['checkout', git_tree])

    except GitCommandException as e:
        raise Exception(
            'Could not run "%s" for repository %s in %s. Look at previous output.'
            % (e.cmd, git_url, work_tree))

    args = (sys.executable, python_script)
    logger.info("Model source code checked out.")
    logger.info("-----------")
    logger.info("-----------")
    logger.info("Switch working directory to " + work_tree)
    logger.warning("$ %s %s" % args)

    try:
        subprocess.Popen(args, close_fds=True, env=my_env,
                         cwd=work_tree).wait()
    except KeyboardInterrupt:
        logger.warning("Job aborted.")
        sys.exit(1)
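
The helpers git_execute and GitCommandException are imported from elsewhere in the package; a minimal sketch of how they could look (hypothetical implementation, for illustration only):

import subprocess

class GitCommandException(Exception):
    def __init__(self, message, cmd=None):
        super(GitCommandException, self).__init__(message)
        self.cmd = cmd

def git_execute(logger, repo_path, args):
    # run a git command inside repo_path and fail loudly on a non-zero exit code
    cmd = ['git', '-C', repo_path] + list(args)
    logger.info('$ ' + ' '.join(cmd))
    code = subprocess.call(cmd)
    if code != 0:
        raise GitCommandException('git exited with code %d' % code, cmd=' '.join(args))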
Example no. 32
    def main(self, args):
        import aetros.const

        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         prog=aetros.const.__prog__ + ' server')
        parser.add_argument('name', nargs='?', help="Server name")
        parser.add_argument('--generate-ssh-key', help="Automatically generates an SSH key, registers it with "
                                                       "your AETROS account, and deletes it when the server exits. "
                                                       "Prefer the 'aetros authenticate' command, as it is safer.")

        parser.add_argument('--allow-host-execution', action='store_true', help="Whether a job can run on this server "
            "directly, without a virtual (Docker) container.\nA security risk, and it makes resource limitation useless.")

        parser.add_argument('--max-memory',
            help="How much RAM is available, in gigabytes. By default, all available memory.")
        parser.add_argument('--max-cpus',
            help="How many cores are available. By default, all available CPU cores.")
        parser.add_argument('--max-gpus',
            help="Which GPUs are available, as a comma-separated list of device ids. "
                 "By default, all available GPU cards. Use 'aetros gpu' to see the ids.")

        parser.add_argument('--no-gpus', action='store_true', help="Disable all GPUs")

        parser.add_argument('--max-jobs', help="How many jobs are allowed to run in total until the process exits automatically.")
        parser.add_argument('--host', help="Default trainer.aetros.com, read from the global configuration ~/aetros.yml.")
        parser.add_argument('--show-stdout', action='store_true', help="Show all stdout of all jobs. Only necessary for debugging.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.name:
            parser.print_help()
            sys.exit()

        self.config = read_home_config()

        if parsed_args.max_jobs:
            self.max_jobs = int(parsed_args.max_jobs)

        if parsed_args.max_memory:
            self.resources_limit['memory'] = int(parsed_args.max_memory)

        if parsed_args.max_cpus:
            self.resources_limit['cpus'] = int(parsed_args.max_cpus)

        self.resources_limit['host_execution'] = parsed_args.allow_host_execution

        gpus = []
        try:
            gpus = aetros.cuda_gpu.get_ordered_devices()
            for i in range(len(gpus)):
                self.enabled_gpus.append(i)
        except aetros.cuda_gpu.CudaNotImplementedException:
            pass

        if parsed_args.max_gpus:
            self.enabled_gpus = []

            for i in parsed_args.max_gpus.split(','):
                i = int(i)
                if i < 0 or i >= len(gpus):
                    raise Exception('--max-gpus ' + str(i) + ' not available on the system. GPUs ' + str([i for i in range(len(gpus))])+ ' detected.')

                self.enabled_gpus.append(i)

        elif parsed_args.no_gpus:
            self.enabled_gpus = []

        if parsed_args.show_stdout:
            self.show_stdout = True

        event_listener = EventListener()

        event_listener.on('registration', self.registration_complete)
        event_listener.on('failed', self.connection_failed)
        event_listener.on('jobs', self.sync_jobs)
        event_listener.on('close', self.on_client_close)

        if hasattr(signal, 'SIGUSR1'):
            signal.signal(signal.SIGUSR1, self.on_signusr1)

        ssh_key_registered = False
        if parsed_args.generate_ssh_key:
            self.logger.info('Generate SSH key')

            ssh_key = paramiko.RSAKey.generate(4096)
            self.ssh_key_private = ssh_key.key.private_bytes(
                serialization.Encoding.PEM, serialization.PrivateFormat.TraditionalOpenSSL, serialization.NoEncryption()
            ).decode()
            self.ssh_key_public = 'ssh-rsa ' + ssh_key.get_base64() + ' ' + parsed_args.name

            self.logger.info('Register SSH key at ' + self.config['host'])

            data = {
                'name': parsed_args.name,
                'secure_key': parsed_args.generate_ssh_key,
                'key': self.ssh_key_public,
            }

            try:
                response = aetros.api.http_request('server/ssh-key', json_body=data, method='post')
            except aetros.api.ApiError as e:
                if 'access_denied' in e.error:
                    print("error: Could not connect to " + self.config['url'] +
                          ': Access denied. --generate-ssh-key seems to be wrong. Incorrect host? See "aetros id"')
                    sys.exit(1)
                raise

            ssh_key_registered = response is True

        def delete_ssh_key():
            self.logger.info('Delete SSH key at ' + self.config['host'])

            data = {
                'secure_key': parsed_args.generate_ssh_key,
                'key': self.ssh_key_public,
            }
            response = aetros.api.http_request('server/ssh-key/delete', json_body=data)
            if not response:
                self.logger.error('Could not delete SSH key in AETROS Trainer.')

        if parsed_args.generate_ssh_key and ssh_key_registered:
            atexit.register(delete_ssh_key)

        if parsed_args.host:
            self.config['host'] = parsed_args.host

        if self.ssh_key_private:
            self.config['ssh_key_base64'] = self.ssh_key_private

        self.server = ServerClient(self.config, event_listener, self.logger)

        self.general_logger_stdout = GeneralLogger(job_backend=self, redirect_to=sys.__stdout__)
        self.general_logger_stderr = GeneralLogger(job_backend=self, redirect_to=sys.__stderr__)

        sys.stdout = self.general_logger_stdout
        sys.stderr = self.general_logger_stderr

        self.server.configure(parsed_args.name)
        self.logger.debug('Connecting to ' + self.config['host'])
        self.server.start()
        self.write_log("\n")

        try:
            while self.active:
                if self.registered:
                    self.server.send_message({'type': 'utilization', 'values': self.collect_system_utilization()}, '')
                    self.check_finished_jobs()

                time.sleep(1)
        except SystemExit:
            self.logger.warning('Killed')
            self.stop()
        except KeyboardInterrupt:
            self.stop()
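
For reference, a minimal sketch of producing an OpenSSH-format public key line with paramiko, mirroring the key generation above (the key size and comment string are hypothetical):

import paramiko

key = paramiko.RSAKey.generate(2048)  # smaller size to keep the sketch fast
public_line = key.get_name() + ' ' + key.get_base64() + ' my-server'
print(public_line)  # -> "ssh-rsa AAAA... my-server"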
Example no. 33
    def main(self, args):
        from aetros.starter import start
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, prog=aetros.const.__prog__ + ' start')
        parser.add_argument('name', help='the model name, e.g. aetros/mnist-network to start a new job, or a job id, e.g. user/modelname/0db75a64acb74c27bd72c22e359de7a4c44a20e5 to start a pre-created job.')

        parser.add_argument('-i', '--image', help="Which Docker image to use for the command. Default read from aetros.yml. If not specified, the command is executed on the host.")
        parser.add_argument('-l', '--local', action='store_true', help="Start the job immediately on the current machine.")
        parser.add_argument('-s', '--server', action='append', help="Limits the server pool to this server. Default: no limitation, or read from aetros.yml. Multiple --server allowed.")
        parser.add_argument('-b', '--branch', help="Overrides the Git branch used when a new job is started.")
        parser.add_argument('-c', '--config', help="Default /aetros.yml in Git root.")
        parser.add_argument('--priority', help="Increases or decreases priority. Default is 0.")

        parser.add_argument('--cpu', help="How many CPU cores should be assigned to the job. Docker only.")
        parser.add_argument('--memory', help="How much memory should be assigned to the job. Docker only.")
        parser.add_argument('--gpu', help="How many GPU cards should be assigned to the job. Docker only.")
        parser.add_argument('--gpu_memory', help="Memory requirement for the GPU. Docker only.")

        parser.add_argument('--rebuild-image', action='store_true', help="Makes sure the Docker image is re-built without cache.")

        parser.add_argument('--gpu-device', action='append', help="Which GPU device id should be mapped into the Docker container. Only with --local.")

        parser.add_argument('--max-time', help="Limit execution time in seconds. Sends SIGINT to the process group when reached.")
        parser.add_argument('--max-epochs', help="Limit execution epochs. Sends SIGINT to the process group when reached.")

        parser.add_argument('--insights', action='store_true', help="activates insights. Only for simple models.")
        parser.add_argument('--dataset', help="Dataset id when model has placeholders. Only for simple models with placeholders as input/output.")

        parser.add_argument('-p', '--param', action='append', help="Sets a hyperparameter, example '--param name=value'. Multiple --param allowed.")

        parsed_args = parser.parse_args(args)

        if not parsed_args.name:
            print("fatal: no model defined. 'aetros start user/model-name'.")
            sys.exit(2)

        if parsed_args.name and parsed_args.name.count('/') > 1:
            # start a concrete job, used by server command
            gpu_devices = []
            if parsed_args.gpu_device:
                gpu_devices = [int(x) for x in parsed_args.gpu_device]

            start(self.logger, parsed_args.name,
                cpus=int(parsed_args.cpu) if parsed_args.cpu else 1,
                memory=int(parsed_args.memory) if parsed_args.memory else 1,
                gpu_devices=gpu_devices)
            return

        home_config = read_home_config()
        model_name = parsed_args.name

        # create a new job
        hyperparameter = {}
        if parsed_args.param:
            for param in parsed_args.param:
                if '=' not in param:
                    raise Exception('--param ' + param + ' does not contain a `=`. Please use "--param name=value"')

                # split on the first '=' only, so values may contain '=' themselves
                name, value = param.split('=', 1)
                hyperparameter[name] = value

        job_config = {'insights': parsed_args.insights}

        if parsed_args.image:
            job_config['image'] = parsed_args.image

        if parsed_args.branch:
            job_config['sourceGitTree'] = parsed_args.branch

        if parsed_args.max_epochs:
            job_config['maxEpochs'] = int(parsed_args.max_epochs)

        if parsed_args.max_time:
            job_config['maxTime'] = float(parsed_args.max_time)

        job_config['priority'] = 0
        if parsed_args.priority:
            job_config['priority'] = float(parsed_args.priority)

        if parsed_args.rebuild_image:
            job_config['rebuild_image'] = True

        if parsed_args.server:
            job_config['servers'] = []
            for name in parsed_args.server:
                job_config['servers'].append(name)

        job_config['resources'] = {}

        if parsed_args.cpu:
            job_config['resources']['cpu'] = int(parsed_args.cpu)

        if parsed_args.memory:
            job_config['resources']['memory'] = int(parsed_args.memory)

        if parsed_args.gpu:
            job_config['resources']['gpu'] = int(parsed_args.gpu)

        if parsed_args.gpu_memory:
            job_config['resources']['gpu_memory'] = int(parsed_args.gpu_memory)

        config_path = parsed_args.config or 'aetros.yml'

        try:
            self.logger.debug("Create job ...")
            created = api.create_job(model_name, config_path, parsed_args.local, hyperparameter, parsed_args.dataset, config=job_config)
        except api.ApiError as e:
            if 'Connection refused' in e.error:
                self.logger.error("You are offline")

            raise

        self.logger.info("Job %s/%s created." % (model_name, created['id']))

        if parsed_args.local:
            start(self.logger, model_name + '/' + created['id'], gpu_devices=parsed_args.gpu_device)
        else:
            print("Open http://%s/model/%s/job/%s to monitor it." % (home_config['host'], model_name, created['id']))