Пример #1
0
def superadmin_client():
    """Build a Rafiki client and log it in with the superadmin credentials.

    The admin endpoint is read from the ``ADMIN_HOST`` and ``ADMIN_PORT``
    environment variables and the password from ``SUPERADMIN_PASSWORD``.

    Returns:
        The ``Client`` instance after ``login`` has been called on it.

    Raises:
        KeyError: If one of the environment variables above is not set.
    """
    # Imported inside the function, as in the original.
    from rafiki.client import Client

    env = os.environ
    superadmin = Client(admin_host=env['ADMIN_HOST'],
                        admin_port=env['ADMIN_PORT'])
    superadmin.login(email=SUPERADMIN_EMAIL,
                     password=env['SUPERADMIN_PASSWORD'])
    return superadmin
Пример #2
0
 def _make_client(self):
     """Create a ``Client`` for the admin and advisor services and log it in.

     Hosts and ports come from the ``ADMIN_HOST``, ``ADMIN_PORT``,
     ``ADVISOR_HOST`` and ``ADVISOR_PORT`` environment variables; the
     login uses ``SUPERADMIN_EMAIL`` / ``SUPERADMIN_PASSWORD``.

     Returns:
         The ``Client`` instance after ``login`` has been called on it.

     Raises:
         KeyError: If one of the environment variables is not set.
     """
     env = os.environ
     endpoints = {
         'admin_host': env['ADMIN_HOST'],
         'admin_port': env['ADMIN_PORT'],
         'advisor_host': env['ADVISOR_HOST'],
         'advisor_port': env['ADVISOR_PORT'],
     }
     credentials = {
         'email': SUPERADMIN_EMAIL,
         'password': SUPERADMIN_PASSWORD,
     }

     client = Client(**endpoints)
     client.login(**credentials)
     return client
Пример #3
0
    def __init__(self, service_id, worker_id, db=None):
        """Store the worker's identifiers and build its DB handle and client.

        Args:
            service_id: Stored as ``self._service_id``.
            worker_id: Stored as ``self._worker_id``.
            db: Database handle to use; a new ``Database()`` is created
                when it is ``None``.

        Raises:
            KeyError: If ``ADMIN_HOST``, ``ADMIN_PORT``, ``ADVISOR_HOST``,
                ``ADVISOR_PORT``, ``WORKDIR_PATH`` or ``PARAMS_DIR_PATH``
                is missing from the environment.
        """
        env = os.environ

        self._db = Database() if db is None else db
        self._service_id = service_id
        self._worker_id = worker_id

        # No trial or sub train job is associated with the worker yet.
        self._trial_id = None
        self._sub_train_job_id = None

        self._client = Client(
            admin_host=env['ADMIN_HOST'],
            admin_port=env['ADMIN_PORT'],
            advisor_host=env['ADVISOR_HOST'],
            advisor_port=env['ADVISOR_PORT'],
        )
        self._params_root_dir = os.path.join(
            env['WORKDIR_PATH'], env['PARAMS_DIR_PATH'])
Пример #4
0
def make_user(user_type, email=None, password=None):
    """Create a user through the superadmin and return a client logged in as it.

    Args:
        user_type: User type passed to ``Client.create_user``.
        email: Email for the new user; generated with ``gen_email()`` when
            falsy.
        password: Password for the new user; generated with ``gen()`` when
            falsy.

    Returns:
        The ``Client`` instance, last logged in with the new user's
        credentials.
    """
    if not email:
        email = gen_email()
    if not password:
        password = gen()

    user_client = Client()
    # The same client first logs in as superadmin to create the account,
    # then logs in again as the freshly created user.
    user_client.login(superadmin_email, superadmin_password)
    user_client.create_user(email, password, user_type)
    user_client.login(email, password)
    return user_client
Пример #5
0
def make_dataset(client: Client, task=None):
    """Create a dataset from ``DATASET_FILE_PATH`` and return its ID.

    Args:
        client: Client used to call ``create_dataset``.
        task: Task for the dataset; generated with ``gen()`` when falsy.

    Returns:
        The ``'id'`` entry of the value returned by ``create_dataset``.
    """
    dataset_name = gen()
    dataset_task = task if task else gen()
    created = client.create_dataset(dataset_name, dataset_task,
                                    DATASET_FILE_PATH)
    return created['id']
Пример #6
0
def make_train_job(client: Client, task=None, app=None, model_id=None):
    """Create a one-trial, zero-GPU train job and return its ID.

    Two datasets (train and validation) are created for the task via
    ``make_dataset``.

    Args:
        client: Client used to create the datasets and the train job.
        task: Task of the job; generated with ``gen()`` when falsy.
        app: App name of the job; generated with ``gen()`` when falsy.
        model_id: Model to train; created with ``make_model(task=task)``
            when falsy.

    Returns:
        The ``'id'`` entry of the value returned by ``create_train_job``.
    """
    if not task:
        task = gen()
    if not app:
        app = gen()

    # Train dataset is created first, then the validation dataset.
    train_id = make_dataset(client, task=task)
    val_id = make_dataset(client, task=task)

    if not model_id:
        model_id = make_model(task=task)

    job_budget = {
        BudgetOption.MODEL_TRIAL_COUNT: 1,
        BudgetOption.GPU_COUNT: 0,
    }
    created_job = client.create_train_job(
        app, task, train_id, val_id, job_budget, models=[model_id])
    return created_job['id']
Пример #7
0
def wait_for_inference_job_status(client: Client, app, status):
    """Poll the app's running inference job until it reaches ``status``.

    The job is fetched once per second. The wait is counted in sleep
    intervals, not wall-clock time.

    Args:
        client: Client used to call ``get_running_inference_job``.
        app: App whose running inference job is polled.
        status: Status value to wait for.

    Raises:
        Exception: If the job's status is ``InferenceJobStatus.ERRORED``
            (and that is not the status being waited for).
        TimeoutError: If ``JOB_TIMEOUT_SECS`` seconds of sleeping have
            accumulated without reaching ``status``.
    """
    max_wait = JOB_TIMEOUT_SECS
    poll_interval = 1
    waited = 0

    while True:
        job = client.get_running_inference_job(app)

        # The requested status is checked before the error status.
        if job['status'] == status:
            return
        if job['status'] == InferenceJobStatus.ERRORED:
            raise Exception('Inference job has errored')

        # Still running...
        if waited >= max_wait:
            raise TimeoutError('Waiting for too long')

        waited += poll_interval
        time.sleep(poll_interval)
Пример #8
0
import pprint
import os

from rafiki.client import Client
from rafiki.config import SUPERADMIN_EMAIL, SUPERADMIN_PASSWORD


def seed_users(client):
    """Create users from ``examples/seeds/users.csv`` and pretty-print the result.

    Args:
        client: Client used to call ``create_users``.
    """
    seed_file_path = 'examples/seeds/users.csv'
    created_users = client.create_users(seed_file_path)
    pprint.pprint(created_users)


if __name__ == '__main__':
    # Connection settings and credentials, each overridable through the
    # environment. The unused `admin_web_port` lookup was removed: it was
    # never read, yet a non-numeric ADMIN_WEB_EXT_PORT would crash the script.
    rafiki_host = os.environ.get('RAFIKI_HOST', 'localhost')
    admin_port = int(os.environ.get('ADMIN_EXT_PORT', 3000))
    user_email = os.environ.get('USER_EMAIL', SUPERADMIN_EMAIL)
    user_password = os.environ.get('USER_PASSWORD', SUPERADMIN_PASSWORD)

    # Initialize client
    client = Client(admin_host=rafiki_host, admin_port=admin_port)
    client.login(email=user_email, password=user_password)

    seed_users(client)
Пример #9
0
        dependencies={ModelDependency.TENSORFLOW: '1.12.0'}
    )
    pprint(model)

    print('Creating train job...')
    budget = { 
        BudgetOption.TIME_HOURS: hours,
        BudgetOption.GPU_COUNT: gpus
    }
    train_job = client.create_train_job(app, task, train_dataset['id'], val_dataset['id'], budget, models=[model['id']])
    pprint(train_job)

    print('Monitor the train job on Rafiki Web Admin')

    # TODO: Evaluate on test dataset?

if __name__ == '__main__':
    # Dataset archive paths handed to `run_enas`.
    out_train_dataset_path = 'data/cifar10_train.zip'
    out_val_dataset_path = 'data/cifar10_val.zip'

    # Command-line options. Unknown arguments are tolerated because
    # `parse_known_args` is used.
    parser = argparse.ArgumentParser()
    parser.add_argument('--email', type=str, default=SUPERADMIN_EMAIL,
                        help='Email of user')
    parser.add_argument('--password', type=str,
                        default=os.environ.get('SUPERADMIN_PASSWORD'),
                        help='Password of user')
    parser.add_argument('--gpus', type=int, default=0,
                        help='How many GPUs to use')
    parser.add_argument('--hours', type=float, default=24,
                        help='How long the train job should run for (in hours)')
    args, _ = parser.parse_known_args()

    # Initialize client
    client = Client()
    client.login(email=args.email, password=args.password)

    run_enas(client, out_train_dataset_path, out_val_dataset_path,
             args.gpus, args.hours)
Пример #10
0
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

import os

from rafiki.client import Client
from rafiki.config import SUPERADMIN_EMAIL

if __name__ == '__main__':
    # Initialize client. Host and port come from the environment, falling
    # back to localhost:3000.
    client = Client(
        admin_host=os.environ.get('RAFIKI_HOST', 'localhost'),
        admin_port=int(os.environ.get('ADMIN_EXT_PORT', 3000)),
    )

    # Log in as superadmin; the password falls back to 'rafiki' when
    # SUPERADMIN_PASSWORD is not set.
    client.login(
        email=SUPERADMIN_EMAIL,
        password=os.environ.get('SUPERADMIN_PASSWORD', 'rafiki'),
    )

    stop_result = client.stop_all_jobs()
    print(stop_result)
Пример #11
0
class TrainWorker(object):
    """Worker that repeatedly runs model-training trials for a sub train job.

    Each iteration of :meth:`start` reads the worker's job details from the
    database, creates a trial, obtains a knob proposal from an advisor,
    trains and evaluates the model with those knobs, writes the pickled
    model parameters to disk and records the outcome in the database.
    """

    def __init__(self, service_id, worker_id, db=None):
        """Store the worker's identifiers and build its DB handle and client.

        Args:
            service_id: ID of the service this worker runs as. It is used to
                look up the worker's DB record and as the advisor ID.
            worker_id: ID recorded on every trial this worker creates.
            db: Database handle to use; a new ``Database()`` is created
                when it is ``None``.

        Raises:
            KeyError: If ``ADMIN_HOST``, ``ADMIN_PORT``, ``ADVISOR_HOST``,
                ``ADVISOR_PORT``, ``WORKDIR_PATH`` or ``PARAMS_DIR_PATH``
                is missing from the environment.
        """
        if db is None:
            db = Database()

        self._service_id = service_id
        self._db = db
        self._worker_id = worker_id
        self._trial_id = None
        self._sub_train_job_id = None
        self._client = Client(admin_host=os.environ['ADMIN_HOST'],
                              admin_port=os.environ['ADMIN_PORT'],
                              advisor_host=os.environ['ADVISOR_HOST'],
                              advisor_port=os.environ['ADVISOR_PORT'])
        self._params_root_dir = os.path.join(os.environ['WORKDIR_PATH'],
                                             os.environ['PARAMS_DIR_PATH'])

    def start(self):
        """Run trials in a loop until the budget is reached or a trial fails.

        The loop exits when :meth:`_if_budget_reached` returns true (the sub
        train job is then stopped and the advisor, if any, deleted) or when
        a trial raises (the trial is then marked as errored). A failure
        while feeding the result back to the advisor is only logged.
        """
        logger.info('Starting train worker for service of ID "{}"...'.format(
            self._service_id))

        # TODO: Break up crazily long & unreadable method
        advisor_id = None
        while True:
            with self._db:
                (self._sub_train_job_id, budget, model_id, model_file_bytes,
                 model_class, train_job_id, train_dataset_uri,
                 test_dataset_uri) = self._read_worker_info()

                self._get_client().send_event(
                    'train_job_worker_started',
                    sub_train_job_id=self._sub_train_job_id)

                if self._if_budget_reached(budget):
                    # If budget reached
                    logger.info('Budget for train job has reached')
                    self._stop_sub_train_job()
                    if advisor_id is not None:
                        self._delete_advisor(advisor_id)
                    break

                # Create a new trial
                logger.info('Creating new trial in DB...')
                trial = self._db.create_trial(
                    sub_train_job_id=self._sub_train_job_id,
                    model_id=model_id,
                    worker_id=self._worker_id)
                self._db.commit()
                self._trial_id = trial.id
                logger.info('Created trial of ID "{}" in DB'.format(
                    self._trial_id))

            # Don't keep DB connection while training model

            # Perform trial & record results
            score = 0
            try:
                logger.info('Starting trial...')

                # Load model class from bytes
                logger.info('Loading model class...')
                clazz = load_model_class(model_file_bytes, model_class)

                # If not created, create a Rafiki advisor for train worker to propose knobs in trials
                if advisor_id is None:
                    logger.info('Creating Rafiki advisor...')
                    advisor_id = self._create_advisor(clazz)
                    logger.info(
                        'Created advisor of ID "{}"'.format(advisor_id))

                # Generate knobs for trial
                logger.info('Requesting for knobs proposal from advisor...')
                knobs = self._get_proposal_from_advisor(advisor_id)
                logger.info('Received proposal of knobs from advisor:')
                logger.info(pprint.pformat(knobs))

                # Mark trial as running in DB
                logger.info('Training & evaluating model...')
                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_running(trial, knobs)

                # Callback that stores each log line of the trial in the DB
                def handle_log(log_line, log_lvl):
                    with self._db:
                        trial = self._db.get_trial(self._trial_id)
                        self._db.add_trial_log(trial, log_line, log_lvl)

                (score, params_file_path) = self._train_and_evaluate_model(
                    clazz, knobs, train_dataset_uri, test_dataset_uri,
                    handle_log)
                logger.info('Trial score: {}'.format(score))

                with self._db:
                    logger.info('Marking trial as complete in DB...')
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_complete(trial, score,
                                                    params_file_path)

                # Report results of trial to advisor
                try:
                    logger.info(
                        'Sending result of trials\' knobs to advisor...')
                    self._feedback_to_advisor(advisor_id, knobs, score)
                except Exception:
                    logger.error(
                        'Error while sending result of proposal to advisor:')
                    logger.error(traceback.format_exc())

                self._trial_id = None

            except Exception:
                logger.error('Error while running trial:')
                logger.error(traceback.format_exc())
                logger.info('Marking trial as errored in DB...')

                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_errored(trial)

                self._trial_id = None
                break  # Exit worker upon trial error

    def stop(self):
        """Mark any in-flight trial as terminated and announce the stop.

        Errors while updating the trial are logged, not raised. The
        ``train_job_worker_stopped`` event is sent only when a sub train job
        ID has been read.
        """
        # If worker is currently running a trial, mark it has terminated
        logger.info('Marking trial as terminated in DB...')
        try:
            if self._trial_id is not None:
                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_terminated(trial)

        except Exception:
            logger.error('Error marking trial as terminated:')
            logger.error(traceback.format_exc())

        if self._sub_train_job_id is not None:
            self._get_client().send_event(
                'train_job_worker_stopped',
                sub_train_job_id=self._sub_train_job_id)

    def _train_and_evaluate_model(self, clazz, knobs, train_dataset_uri,
                                  test_dataset_uri, handle_log):
        """Train and evaluate one model instance and persist its parameters.

        Args:
            clazz: Model class; instantiated as ``clazz(**knobs)``.
            knobs: Keyword arguments for the model constructor.
            train_dataset_uri: Passed to the model's ``train``.
            test_dataset_uri: Passed to the model's ``evaluate``.
            handle_log: Callback handed to ``ModelLoggerHandler``.

        Returns:
            Tuple ``(score, params_file_path)``: the value returned by
            ``evaluate`` and the path of the pickled parameters file.
        """
        # Initialize model
        model_inst = clazz(**knobs)

        # Add logs handlers for trial, including adding handler to root logger
        # to handle logs emitted during model training with level above INFO
        log_handler = ModelLoggerHandler(handle_log)
        root_logger = logging.getLogger()
        py_model_logger = logging.getLogger('{}.trial'.format(__name__))

        try:
            root_logger.addHandler(log_handler)
            py_model_logger.setLevel(logging.INFO)
            py_model_logger.propagate = False  # Avoid duplicate logs in root logger
            py_model_logger.addHandler(log_handler)
            model_logger.set_logger(py_model_logger)

            # Train model
            model_inst.train(train_dataset_uri)

            # Evaluate model
            score = model_inst.evaluate(test_dataset_uri)
        finally:
            # Remove log handlers from loggers for this trial, even when
            # training or evaluation raised, so the handler does not stay
            # attached to the root logger.
            root_logger.removeHandler(log_handler)
            py_model_logger.removeHandler(log_handler)

        # Dump and pickle model parameters
        parameters = model_inst.dump_parameters()
        parameters = pickle.dumps(parameters)
        params_file_path = os.path.join(self._params_root_dir,
                                        '{}.model'.format(self._trial_id))
        with open(params_file_path, 'wb') as f:
            f.write(parameters)

        model_inst.destroy()

        return (score, params_file_path)

    def _get_proposal_from_advisor(self, advisor_id):
        """Return the ``'knobs'`` entry of a proposal generated by the advisor."""
        res = self._get_client()._generate_proposal(advisor_id)
        knobs = res['knobs']
        return knobs

    def _feedback_to_advisor(self, advisor_id, knobs, score):
        """Send the score obtained with ``knobs`` back to the advisor."""
        self._get_client()._feedback_to_advisor(advisor_id, knobs, score)

    def _stop_sub_train_job(self):
        """Send the ``sub_train_job_budget_reached`` event; failures are only logged."""
        # `Logger.warning` replaces the deprecated `Logger.warn` alias and
        # matches `_delete_advisor`.
        logger.warning('Stopping sub train job...')
        try:
            self._get_client().send_event(
                'sub_train_job_budget_reached',
                sub_train_job_id=self._sub_train_job_id)
        except Exception:
            # Throw just a warning - likely that another worker has stopped it
            logger.warning('Error while stopping sub train job:')
            logger.warning(traceback.format_exc())

    def _create_advisor(self, clazz):
        """Create an advisor from the model's knob config and return its ID.

        The advisor is created with this worker's service ID as its
        ``advisor_id``.
        """
        # Retrieve knob config for model of worker
        knob_config = clazz.get_knob_config()
        knob_config_str = serialize_knob_config(knob_config)

        # Create advisor associated with worker
        res = self._get_client()._create_advisor(knob_config_str,
                                                 advisor_id=self._service_id)
        advisor_id = res['id']
        return advisor_id

    def _delete_advisor(self, advisor_id):
        """Delete the advisor; failures are only logged."""
        try:
            self._get_client()._delete_advisor(advisor_id)
        except Exception:
            # Throw just a warning - not critical for advisor to be deleted
            logger.warning('Error while deleting advisor:')
            logger.warning(traceback.format_exc())

    def _if_budget_reached(self, budget):
        """Return whether the sub train job has used up its trial budget.

        Only trials with status COMPLETED or ERRORED are counted.

        Args:
            budget: Mapping that may contain
                ``BudgetType.MODEL_TRIAL_COUNT``; defaults to 5 when absent.
        """
        # By default, budget is model trial count of 5
        max_trials = budget.get(BudgetType.MODEL_TRIAL_COUNT, 5)
        trials = self._db.get_trials_of_sub_train_job(self._sub_train_job_id)
        finished_statuses = (TrialStatus.COMPLETED, TrialStatus.ERRORED)
        finished_trials = [x for x in trials if x.status in finished_statuses]
        return len(finished_trials) >= max_trials

    def _read_worker_info(self):
        """Read the worker's sub train job, train job and model from the DB.

        Returns:
            Tuple ``(sub_train_job_id, budget, model_id, model_file_bytes,
            model_class, train_job_id, train_dataset_uri, test_dataset_uri)``.

        Raises:
            InvalidWorkerException: If no worker record exists for the
                service ID.
            InvalidTrainJobException: If the sub train job or the train job
                is missing.
            InvalidModelException: If the model is missing.
        """
        worker = self._db.get_train_job_worker(self._service_id)

        if worker is None:
            raise InvalidWorkerException()

        sub_train_job = self._db.get_sub_train_job(worker.sub_train_job_id)

        # Must be checked before dereferencing `sub_train_job` below;
        # otherwise a missing record surfaces as an AttributeError.
        if sub_train_job is None:
            raise InvalidTrainJobException()

        train_job = self._db.get_train_job(sub_train_job.train_job_id)
        model = self._db.get_model(sub_train_job.model_id)

        if model is None:
            raise InvalidModelException()

        if train_job is None:
            raise InvalidTrainJobException()

        return (sub_train_job.id, train_job.budget, model.id,
                model.model_file_bytes, model.model_class, train_job.id,
                train_job.train_dataset_uri, train_job.test_dataset_uri)

    def _get_client(self):
        """Log the shared client in as superadmin and return it.

        ``login`` is called on every invocation.
        """
        self._client.login(email=SUPERADMIN_EMAIL,
                           password=SUPERADMIN_PASSWORD)
        return self._client
Пример #12
0
from examples.scripts.client_quickstart import RAFIKI_HOST, ADMIN_PORT, USER_PASSWORD, MODEL_DEVELOPER_EMAIL, \
    create_model
from rafiki.client import Client
from rafiki.constants import TaskType, ModelDependency

if __name__ == '__main__':
    app = 'home_rentals_regression'
    task = TaskType.TABLE_REGRESSION

    client = Client(admin_host=RAFIKI_HOST, admin_port=ADMIN_PORT)

    print('Logging in as model developer...')
    client.login(email=MODEL_DEVELOPER_EMAIL, password=USER_PASSWORD)

    print('Adding models to Rafiki...')
    # 'SkLasso' is passed twice, as the 2nd and 5th positional arguments of
    # `create_model`.
    model_name = 'SkLasso'
    model_file_path = 'examples/models/table_regression/SkLasso.py'
    model_dependencies = {ModelDependency.SCIKIT_LEARN: '0.20.0'}
    create_model(client, model_name, task, model_file_path, model_name,
                 dependencies=model_dependencies)
Пример #13
0
import requests
import json
import sys

sys.path.insert(0, '..')
from rafiki.client import Client
# import pusher

db_filename = "chatbot.db"
app = Flask(__name__)

# SQLAlchemy settings: SQLite file next to the app, modification tracking
# off, SQL statement echo on.
app.config.update({
    'SQLALCHEMY_DATABASE_URI': 'sqlite:///%s' % db_filename,
    'SQLALCHEMY_TRACK_MODIFICATIONS': False,
    'SQLALCHEMY_ECHO': True,
})

# Rafiki client pointed at a local admin on port 3000 (not logged in here).
client = Client(admin_host='localhost', admin_port=3000)

# NOTE(review): `db` is defined outside this view - presumably a
# Flask-SQLAlchemy instance; confirm. It is bound to the app and its
# `create_all()` is run inside an application context.
db.init_app(app)
with app.app_context():
    db.create_all()


@app.route('/')
def index():
    """Render and return the ``index.html`` template for the root URL."""
    page = render_template('index.html')
    return page


# Run the Flask app when this file is executed as a script.
if __name__ == "__main__":
    # `load_dotenv` is called before the server starts.
    # NOTE(review): presumably loads variables from a .env file - its import
    # is not visible here; confirm.
    load_dotenv()
    app.run()
Пример #14
0
def superadmin():
    """Return a default-constructed ``Client`` logged in as the superadmin.

    The login uses the ``superadmin_email`` and ``superadmin_password``
    values defined elsewhere in the module.
    """
    admin_client = Client()
    admin_client.login(superadmin_email, superadmin_password)
    return admin_client
Пример #15
0
from examples.scripts.client_quickstart import make_predictions, RAFIKI_HOST, ADMIN_PORT
from rafiki.client import Client

if __name__ == '__main__':
    # NOTE(review): `queries`, `predictor_host`, `app` and `pprint` are not
    # defined or imported in the visible part of this script - confirm they
    # exist in the full file.
    # The client is created without logging in here.
    client = Client(admin_host=RAFIKI_HOST, admin_port=ADMIN_PORT)

    # Send the queries to the predictor and show what came back.
    print('Making predictions for queries:')
    print(queries)
    predictions = make_predictions(client, predictor_host, queries)
    print('Predictions are:')
    print(predictions)

    # Stop the app's inference job and pretty-print the response.
    print('Stopping inference job...')
    pprint.pprint(client.stop_inference_job(app))
Пример #16
0
                    0, 0, 0, 0, 0, 0, 0, 0
                ],
                [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0
                ],
                [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0
                ],
                [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0
                ]]]

    client = Client(admin_host=RAFIKI_HOST, admin_port=ADMIN_PORT)
    client.login(email=SUPERADMIN_EMAIL, password=USER_PASSWORD)

    print('Creating model developer in Rafiki...')
    create_user(client, MODEL_DEVELOPER_EMAIL, USER_PASSWORD,
                UserType.MODEL_DEVELOPER)

    print('Creating app developer in Rafiki...')
    create_user(client, APP_DEVELOPER_EMAIL, USER_PASSWORD,
                UserType.APP_DEVELOPER)

    print('Logging in as model developer...')
    client.login(email=MODEL_DEVELOPER_EMAIL, password=USER_PASSWORD)

    print('Adding models to Rafiki...')
    create_model(client, 'TfFeedForward', task, 'examples/models/image_classification/TfFeedForward.py', \
Пример #17
0
def make_predictions(client, predictor_host):
    """POST the module-level ``QUERY`` to the predictor and pretty-print the reply.

    Args:
        client: Not used by this function.
        predictor_host: Host part of the predictor URL; the request goes to
            ``http://<predictor_host>/predict``.

    Raises:
        Exception: With the response text when the status code is not 200.
    """
    predict_url = 'http://{}/predict'.format(predictor_host)
    response = requests.post(url=predict_url, json={'query': QUERY})

    if response.status_code == 200:
        pprint.pprint(response.json())
    else:
        raise Exception(response.text)


def stop_inference_job(client):
    """Stop the inference job of the module-level ``APP`` and pretty-print the response.

    Args:
        client: Client used to call ``stop_inference_job``.
    """
    stop_response = client.stop_inference_job(app=APP)
    pprint.pprint(stop_response)


if __name__ == '__main__':
    # Create a client for the admin service and log in with the configured
    # user credentials.
    client = Client(admin_host=ADMIN_HOST, admin_port=ADMIN_PORT)
    client.login(email=USER_EMAIL, password=USER_PASSWORD)

    # Step 1: register the models.
    print('Adding models to Rafiki...')
    create_models(client)

    # Step 2: create the train job for APP.
    print('Creating train job for app "{}" on Rafiki...'.format(APP))
    create_train_job(client)

    # Step 3: wait for the train job before continuing.
    print('Waiting for train job to complete...')
    wait_until_train_job_has_completed(client)
    print('Train job has been completed!')

    # Step 4: list the best trials of the latest train job.
    print(
        'Listing best trials of latest train job for app "{}"...'.format(APP))
    list_best_trials_of_train_job(client)