Пример #1
0
    def _pull_job_info(self):
        """Read this inference worker's job info from the meta store.

        Looks up, in order, the worker record for this service, the worker's
        inference job, the worker's trial and the trial's model. On success
        the inference job ID, the loaded model class, the trial's proposal
        and the trial's stored-params ID are saved on ``self``.

        Raises:
            InvalidWorkerError: The worker or its inference job is not found.
            InvalidTrialError: The trial is not found, has no stored params,
                or its model is not found.
        """
        service_id = self._service_id

        logger.info('Reading job info from meta store...')
        meta_store = self._meta_store
        with meta_store:
            # Worker record registered under this service
            worker = meta_store.get_inference_job_worker(service_id)
            if worker is None:
                raise InvalidWorkerError(
                    'No such worker "{}"'.format(service_id))

            # Inference job the worker belongs to
            job_id = worker.inference_job_id
            inference_job = meta_store.get_inference_job(job_id)
            if inference_job is None:
                raise InvalidWorkerError(
                    'No such inference job with ID "{}"'.format(job_id))

            # Trial served by this worker; it must have its model saved
            trial_id = worker.trial_id
            trial = meta_store.get_trial(trial_id)
            has_saved_model = (trial is not None
                               and trial.store_params_id is not None)
            if not has_saved_model:
                raise InvalidTrialError(
                    'No saved trial with ID "{}"'.format(trial_id))
            logger.info(f'Using trial "{trial.id}"...')

            # Model the trial was run with
            model_id = trial.model_id
            model = meta_store.get_model(model_id)
            if model is None:
                raise InvalidTrialError(
                    'No such model with ID "{}"'.format(model_id))
            logger.info(f'Using model "{model.name}"...')

            # Cache what the worker needs outside of the meta store session
            self._inference_job_id = inference_job.id
            self._py_model_class = load_model_class(
                model.model_file_bytes, model.model_class)
            self._proposal = Proposal.from_jsonable(trial.proposal)
            self._store_params_id = trial.store_params_id
Пример #2
0
    def pull_job_info(self):
        """Read this train worker's job info from the meta store.

        Looks up, in order, the worker record for this service, the worker's
        sub train job, that sub train job's train job and its model. On
        success sets ``train_dataset_path`` and ``val_dataset_path`` (from
        ``self._load_datasets(train_job)``), ``train_args``,
        ``sub_train_job_id`` and ``model_class`` on ``self``.

        Raises:
            InvalidWorkerError: The worker, sub train job, train job or model
                is not found.
        """
        service_id = self._service_id

        logger.info('Reading job info from meta store...')
        with self._meta_store:
            worker = self._meta_store.get_train_job_worker(service_id)
            if worker is None:
                raise InvalidWorkerError('No such worker "{}"'.format(service_id))

            sub_train_job = self._meta_store.get_sub_train_job(worker.sub_train_job_id)
            if sub_train_job is None:
                # Report the ID that was actually looked up. The sub train job
                # is fetched by the worker's `sub_train_job_id`, not via an
                # advisor, so the message names that ID like the checks below.
                raise InvalidWorkerError(
                    'No such sub train job with ID "{}"'.format(worker.sub_train_job_id))

            train_job = self._meta_store.get_train_job(sub_train_job.train_job_id)
            if train_job is None:
                raise InvalidWorkerError('No such train job with ID "{}"'.format(sub_train_job.train_job_id))

            model = self._meta_store.get_model(sub_train_job.model_id)
            if model is None:
                raise InvalidWorkerError('No such model with ID "{}"'.format(sub_train_job.model_id))
            logger.info(f'Using model "{model.name}"...')

            # Cache what the worker needs outside of the meta store session
            (self.train_dataset_path, self.val_dataset_path) = self._load_datasets(train_job)
            self.train_args = train_job.train_args
            self.sub_train_job_id = sub_train_job.id
            self.model_class = load_model_class(model.model_file_bytes, model.model_class)
Пример #3
0
    def pull_job_info(self):
        """Read this advisor's job info from the meta store.

        Finds the sub train job associated with this advisor service, then
        its train job and model, and fetches the sub train job's trials. On
        success sets ``sub_train_job_id``, ``budget``, ``model_class``,
        ``_num_trials`` (number of trials fetched) and ``_model_id`` on
        ``self``.

        Raises:
            InvalidSubTrainJobError: The sub train job, its train job or its
                model is not found.
        """
        service_id = self._service_id

        logger.info('Reading job info from meta store...')
        meta_store = self._meta_store
        with meta_store:
            # Sub train job that this advisor serves
            sub_train_job = meta_store.get_sub_train_job_by_advisor(service_id)
            if sub_train_job is None:
                raise InvalidSubTrainJobError(
                    'No sub train job associated with advisor "{}"'.format(service_id))

            # Parent train job
            train_job_id = sub_train_job.train_job_id
            train_job = meta_store.get_train_job(train_job_id)
            if train_job is None:
                raise InvalidSubTrainJobError(
                    'No such train job with ID "{}"'.format(train_job_id))

            # Model being tuned
            model_id = sub_train_job.model_id
            model = meta_store.get_model(model_id)
            if model is None:
                raise InvalidSubTrainJobError(
                    'No such model with ID "{}"'.format(model_id))
            logger.info(f'Using model "{model.name}"...')
            logger.info(f'Using budget "{train_job.budget}"...')

            existing_trials = meta_store.get_trials_of_sub_train_job(sub_train_job.id)

            # Cache what the advisor needs outside of the meta store session
            self.sub_train_job_id = sub_train_job.id
            self.budget = train_job.budget
            self.model_class = load_model_class(
                model.model_file_bytes, model.model_class)
            self._num_trials = len(existing_trials)
            self._model_id = model.id
Пример #4
0
    def _load_model(self, trial_id):
        """Rebuild the trained model instance of a trial.

        Fetches the trial and its model from the DB, instantiates the model
        class, initializes it with the trial's knobs, and loads the trial's
        unpickled parameters into it.

        Args:
            trial_id: ID of the trial to load the model for.

        Returns:
            The model instance with the trial's parameters loaded.
        """
        db = self._db
        trial_row = db.get_trial(trial_id)
        model_row = db.get_model(trial_row.model_id)

        # Instantiate the model class and configure it with the trial's knobs
        model_cls = load_model_class(model_row.model_file_bytes,
                                     model_row.model_class)
        instance = model_cls()
        instance.init(trial_row.knobs)

        # The trial stores its parameters pickled; restore them into the model
        restored_params = pickle.loads(trial_row.parameters)
        instance.load_parameters(restored_params)

        return instance
Пример #5
0
    def _load_model(self, trial_id):
        """Rebuild the trained model instance of a trial.

        Fetches the trial, its sub train job and that job's model from the
        DB, instantiates the model class with the trial's knobs as keyword
        arguments, and loads the parameters unpickled from the trial's
        params file into it.

        Args:
            trial_id: ID of the trial to load the model for.

        Returns:
            The model instance with the trial's parameters loaded.
        """
        db = self._db
        trial_row = db.get_trial(trial_id)
        sub_job_row = db.get_sub_train_job(trial_row.sub_train_job_id)
        model_row = db.get_model(sub_job_row.model_id)

        # Instantiate the model class, passing the trial's knobs as kwargs
        model_cls = load_model_class(model_row.model_file_bytes,
                                     model_row.model_class)
        instance = model_cls(**trial_row.knobs)

        # Parameters live pickled in a file; read the bytes, then unpickle
        with open(trial_row.params_file_path, 'rb') as params_file:
            pickled_params = params_file.read()
        restored_params = pickle.loads(pickled_params)
        instance.load_parameters(restored_params)

        return instance
Пример #6
0
    def start(self):
        """Run trials in a loop until the budget is reached or a trial errors.

        Each iteration re-reads the worker's info and checks the budget. If
        the budget is reached, the worker is stopped, the advisor (if one was
        created) is deleted, and the loop ends. Otherwise a new trial is
        created in the DB, knobs are requested from the advisor, the model is
        trained & evaluated, the trial is marked as complete, and the result
        is fed back to the advisor.

        Any exception raised while running a trial is logged, the trial is
        marked as errored, and the loop ends. A failure while sending
        feedback to the advisor is only logged.
        """
        logger.info('Starting train worker for service of ID "{}"...' \
            .format(self._service_id))

        # TODO: Break up crazily long & unreadable method
        # The advisor is created lazily on the first trial and reused afterwards
        advisor_id = None
        while True:
            with self._db:
                # NOTE: `train_job_id` is unpacked but not used in this method
                (sub_train_job_id, budget, model_id, model_file_bytes, model_class, \
                    train_job_id, train_dataset_uri, test_dataset_uri) = self._read_worker_info()

                if self._if_budget_reached(budget, sub_train_job_id):
                    # If budget reached
                    logger.info('Budget for train job has reached')
                    self._stop_worker()
                    if advisor_id is not None:
                        self._delete_advisor(advisor_id)
                    break

                # Create a new trial
                logger.info('Creating new trial in DB...')
                trial = self._db.create_trial(
                    sub_train_job_id=sub_train_job_id, model_id=model_id)
                self._db.commit()
                # Only the trial's ID is kept; the trial is re-fetched inside
                # each later `with self._db` block
                self._trial_id = trial.id
                logger.info('Created trial of ID "{}" in DB'.format(
                    self._trial_id))

            # Don't keep DB connection while training model
            # (the `with self._db` block above has been exited at this point)

            # Perform trial & record results
            # NOTE: this initial value is never read; `score` is reassigned by
            # `_train_and_evaluate_model` before every use below
            score = 0
            try:
                logger.info('Starting trial...')

                # Load model class from bytes
                logger.info('Loading model class...')
                clazz = load_model_class(model_file_bytes, model_class)

                # If not created, create a Rafiki advisor for train worker to propose knobs in trials
                if advisor_id is None:
                    logger.info('Creating Rafiki advisor...')
                    advisor_id = self._create_advisor(clazz)
                    logger.info(
                        'Created advisor of ID "{}"'.format(advisor_id))

                # Generate knobs for trial
                logger.info('Requesting for knobs proposal from advisor...')
                knobs = self._get_proposal_from_advisor(advisor_id)
                logger.info('Received proposal of knobs from advisor:')
                logger.info(pprint.pformat(knobs))

                # Mark trial as running in DB
                logger.info('Training & evaluating model...')
                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_running(trial, knobs)

                def handle_log(log_line, log_lvl):
                    """Record one log line for the current trial in the DB."""
                    # Uses its own `with self._db` block per log line, as no
                    # DB block is held open by `start` during training
                    with self._db:
                        trial = self._db.get_trial(self._trial_id)
                        self._db.add_trial_log(trial, log_line, log_lvl)

                (score, parameters) = self._train_and_evaluate_model(
                    clazz, knobs, train_dataset_uri, test_dataset_uri,
                    handle_log)
                logger.info('Trial score: {}'.format(score))

                with self._db:
                    logger.info('Marking trial as complete in DB...')
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_complete(trial, score, parameters)

                # Trial is finished; clear the current trial's ID
                self._trial_id = None

                # Report results of trial to advisor
                # Best-effort: a failure here is logged and does not reach the
                # outer handler, so the completed trial is not marked errored
                try:
                    logger.info(
                        'Sending result of trials\' knobs to advisor...')
                    self._feedback_to_advisor(advisor_id, knobs, score)
                except Exception:
                    logger.error(
                        'Error while sending result of proposal to advisor:')
                    logger.error(traceback.format_exc())

            except Exception:
                logger.error('Error while running trial:')
                logger.error(traceback.format_exc())
                logger.info('Marking trial as errored in DB...')

                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_errored(trial)

                self._trial_id = None
                # NOTE(review): unlike the budget-reached exit, this exit does
                # not call `_stop_worker()` or `_delete_advisor()` - confirm
                # that this is intended
                break  # Exit worker upon trial error
Пример #7
0
    def start(self):
        """Run trials in a loop until the train job's budget is reached.

        Each iteration connects to the DB and re-reads the worker's info. If
        the budget is reached, the worker is stopped, the advisor (if one was
        created) is deleted, and the loop ends. Otherwise the model class is
        loaded, an advisor is created on first use, knobs are requested from
        it, and a new trial is created in the DB. The DB is then disconnected
        while the model is trained & evaluated. Afterwards the trial is marked
        as complete or errored, and the score is fed back to the advisor.

        The DB connection opened at the top of an iteration is always
        released, including on the budget-reached exit and when an error
        propagates out of the connected section.

        Raises:
            Exception: Any error while loading the model class or creating
                the advisor is logged and re-raised. Other errors raised
                before training starts propagate unlogged.
        """
        logger.info('Starting train worker for service of ID "{}"...' \
            .format(self._service_id))

        # The advisor is created lazily on the first trial and reused afterwards
        advisor_id = None
        while True:
            self._db.connect()
            try:
                (budget, model_id, model_file_bytes, model_class, train_job_id,
                 train_dataset_uri, test_dataset_uri) = self._read_worker_info()

                if self._if_budget_reached(budget, train_job_id, model_id):
                    # If budget reached
                    logger.info('Budget for train job has reached')
                    self._stop_worker()
                    if advisor_id is not None:
                        self._delete_advisor(advisor_id)

                    break

                # Load model class from bytes
                try:
                    clazz = load_model_class(model_file_bytes, model_class)
                except Exception:
                    logger.error('Error while loading model class for worker:')
                    logger.error(traceback.format_exc())
                    self._stop_worker()
                    raise

                # If not created, create a Rafiki advisor for train worker to propose knobs in trials
                if advisor_id is None:
                    logger.info('Creating Rafiki advisor...')
                    try:
                        advisor_id = self._create_advisor(clazz)
                        logger.info(
                            'Created advisor of ID "{}"'.format(advisor_id))
                    except Exception:
                        logger.error('Error while creating advisor for worker:')
                        logger.error(traceback.format_exc())
                        raise

                # Create a new trial
                logger.info('Starting trial...')
                logger.info('Requesting for knobs proposal from advisor...')
                knobs = self._get_proposal_from_advisor(advisor_id)
                logger.info('Received proposal of knobs from advisor:')
                logger.info(pprint.pformat(knobs))
                logger.info('Creating new trial in DB...')
                trial = self._create_new_trial(model_id, train_job_id, knobs)
                # Only the trial's ID is kept; the trial is re-fetched inside
                # the `with self._db` blocks below
                self._trial_id = trial.id
                logger.info('Created trial of ID "{}" in DB'.format(trial.id))
            finally:
                # Don't keep DB connection while training model. Being in a
                # `finally`, this also releases the connection on the
                # budget-reached `break` and on any error raised above.
                self._db.disconnect()

            # Perform trial & record results
            # `score` stays 0 if the trial errors; that value is what gets
            # reported to the advisor below.
            # NOTE(review): confirm that reporting 0 for a failed trial is intended
            score = 0
            try:
                logger.info('Starting trial...')
                logger.info('Training & evaluating model...')
                (score, parameters, logs) = self._train_and_evaluate_model(
                    clazz, knobs, train_dataset_uri, test_dataset_uri)
                logger.info('Trial score: {}'.format(score))

                with self._db:
                    logger.info('Marking trial as complete in DB...')
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_complete(trial, score, parameters,
                                                    logs)

                self._trial_id = None
            except Exception:
                logger.error('Error while running trial:')
                logger.error(traceback.format_exc())
                logger.info('Marking trial as errored in DB...')

                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_errored(trial)

                self._trial_id = None

            # Report results of trial to advisor
            # Best-effort: a failure here is logged and the loop continues
            try:
                logger.info('Sending result of trials\' knobs to advisor...')
                self._feedback_to_advisor(advisor_id, knobs, score)
            except Exception:
                logger.error(
                    'Error while sending result of proposal to advisor:')
                logger.error(traceback.format_exc())