Exemplo n.º 1
0
    def get_training_data(self, sim_config: int = None, baseline: bool = True):
        """Return the mean reward per training iteration for a sim config.

        The result is a DataFrame indexed by training iteration id with one
        column per training session id, holding ``reward_mean``.

        :param sim_config: id of the sim config to report on; when ``None``
            the id returned by ``self._get_sim_base_config()`` is used.
        :param baseline: when true, add a ``'baseline'`` column filled with
            the sim config's ``baseline_avg``.
        :raises AssertionError: if ``sim_config`` is given but no such row
            exists in ``sim_config``.
        """
        if sim_config is not None:
            # Confirm the caller-supplied id exists before querying with it.
            check_sql = "SELECT id FROM sim_config WHERE id = {}".format(P_MARKER)
            found = select_record(self.db, sql=check_sql, params=(sim_config, ))
            assert found is not None, "Invalid Sim Config id {}".format(
                sim_config)
            (sim_config, ) = found
        else:
            sim_config = self._get_sim_base_config()

        rewards_sql = '''SELECT training_session_id as session, training_iteration.id as iteration, reward_mean 
                 FROM training_iteration
                 INNER JOIN training_session ON training_iteration.training_session_id = training_session.id
                 INNER JOIN sim_config ON training_session.sim_config_id = sim_config.id
                 WHERE training_session.sim_config_id = {}'''.format(P_MARKER)
        rewards = pd.read_sql_query(rewards_sql, self.db, params=(sim_config, ))
        # One row per iteration, one column per training session.
        df = rewards.pivot(index='iteration',
                           columns='session',
                           values='reward_mean')

        if not baseline:
            return df

        baseline_sql = "SELECT baseline_avg FROM sim_config WHERE id = {}".format(
            P_MARKER)
        (baseline_avg, ) = select_record(self.db,
                                         sql=baseline_sql,
                                         params=(sim_config, ))
        # Repeat the single baseline value once per row of the pivot.
        df['baseline'] = [baseline_avg] * df.shape[0]
        return df
Exemplo n.º 2
0
def get_simulator(trainer_id: int, policy_id: int):
    """Build a ``SimpyEnv`` configured like the given policy of a trainer.

    The simulation model name and the JSON sim config are read from the
    trainer's own database; the model module is then imported from
    ``<trainer path>.models.<model name>``.

    :param trainer_id: id of the trainer cluster in the backoffice database.
    :param policy_id: id of the policy in the trainer database.
    :return: a ``SimpyEnv`` built from the model's ``N_ACTIONS``,
        ``OBSERVATION_SPACE`` and ``SimModel`` plus the stored sim config.
    :raises AssertionError: if the policy is not found in the trainer database.
    :raises Exception: if the model module cannot be found.
    """
    # Get Trainer DB
    trainer_name, cloud_provider = _get_trainer_and_cloud(trainer_id=trainer_id)
    trainer_path = _TRAINER_PATH(trainer_name, cloud_provider)
    trainer_db = db_connect(trainer_path + "/" + TRAINER_DB_NAME)

    sql = '''SELECT sim_model.name, sim_config.config
             FROM policy INNER JOIN sim_model ON policy.sim_model_id = sim_model.id
             INNER JOIN sim_config ON policy.sim_config_id = sim_config.id
             WHERE policy.id = {}'''.format(P_MARKER)
    row = select_record(trainer_db, sql=sql, params=(policy_id,))
    assert row is not None, "Invalid Trainer ID {} and Policy ID {}".format(trainer_id, policy_id)
    model_name, sim_config = row
    sim_config = json.loads(sim_config)

    sim_path = '{}.models.{}'.format(trainer_path, model_name)
    exec_locals = {}
    try:
        # NOTE(review): exec() runs a string built from database values
        # (trainer name, model name). Safe only while those values are
        # trusted; consider importlib.import_module instead.
        exec("from {} import SimBaseline, N_ACTIONS, OBSERVATION_SPACE, SimModel, BASE_CONFIG".format(
            sim_path), {}, exec_locals)
    except ModuleNotFoundError as err:
        # Keep the original import error attached as the cause.
        raise Exception(" Model '{}' not found!!".format(sim_path)) from err

    env_config = {"n_actions"        : exec_locals['N_ACTIONS'],
                  "observation_space": exec_locals['OBSERVATION_SPACE'],
                  "sim_model"        : exec_locals['SimModel'],
                  "sim_config"       : sim_config}

    return SimpyEnv(env_config)
Exemplo n.º 3
0
 def _get_sim_base_config(self):
     """Return the id of this model's default sim config.

     Looks up the ``sim_config`` row whose ``sim_model_id`` is
     ``self._model_id`` and whose ``name`` is
     ``self.default_sim_config_name``.

     :raises AssertionError: if no such row exists.
     """
     query = '''SELECT id FROM sim_config 
              WHERE sim_model_id = {} and name = {}'''.format(
         P_MARKER, P_MARKER)
     lookup_key = (self._model_id, self.default_sim_config_name)
     base_row = select_record(self.db, sql=query, params=lookup_key)
     assert base_row is not None, "Base Sim Config not found!"
     config_id = base_row[0]
     return config_id
Exemplo n.º 4
0
def tear_down_trainer(trainer_id: int):
    """Shut down the cloud cluster of a trainer with ``ray down``.

    Nothing is torn down when the trainer has an empty ``cloud_provider``.

    :param trainer_id: id of the trainer cluster in the backoffice database.
    :raises AssertionError: if the trainer id is unknown, or if ``ray down``
        exits with a non-zero return code.
    """
    # Return value is intentionally unused; presumably this refreshes the
    # trainer data before the cluster goes away -- TODO confirm.
    get_trainer_data(trainer_id=trainer_id)
    # Shared lookup: same query and "Unknown Trainer ID" assertion that were
    # previously duplicated inline here.
    trainer_name, cloud_provider = _get_trainer_and_cloud(trainer_id=trainer_id)
    if cloud_provider != '':
        trainer_yaml = _TRAINER_YAML(trainer_name, cloud_provider)
        result = subprocess.run(_CMD_PREFIX + "ray down {} -y".format(trainer_yaml),
                                shell=True, capture_output=True, text=True, executable=_SHELL)
        assert not result.returncode, "Error on Tear Down {} {}\n{}".format(
            trainer_yaml, _TRAINER_PATH(trainer_name, cloud_provider), result.stderr)
Exemplo n.º 5
0
def launch_trainer(cluster_name: str = None, cloud_provider: str = '', cluster_config: dict = None):
    """Create (if needed) and launch a trainer cluster.

    When the trainer directory does not exist it is copied from
    ``simpy_template`` and a new ``trainer_cluster`` row is inserted.
    Otherwise the id of the existing, not yet stopped, trainer with the same
    name and cloud provider is reused. For a non-empty ``cloud_provider`` the
    cluster YAML file is (re)written and the cluster is started with
    ``ray up``; the remote trainer folder is then cleared and re-synced.

    :param cluster_name: name of the trainer cluster.
    :param cloud_provider: cloud provider name; ``''`` skips the cloud launch.
    :param cluster_config: configuration stored as JSON with the trainer and
        passed to ``trainer_cluster_config``.
    :return: ``(trainer_id, result)`` where ``result`` is the
        ``CompletedProcess`` of ``ray up`` when a cloud provider is given,
        otherwise of the last local ``ls``/``cp`` command.
    :raises AssertionError: if the trainer directory exists but no active
        ``trainer_cluster`` row matches it.
    """
    trainer_path = _TRAINER_PATH(cluster_name, cloud_provider)

    result = subprocess.run(['ls', trainer_path], capture_output=True, text=True)
    # Create the Trainer Cluster if it does not exist.
    # No distinction exists between cloud providers, therefore training results are shared between runs in different
    # clouds
    if result.returncode != 0:
        # Create trainer folder
        result = subprocess.run(['cp', '-r', 'simpy_template', trainer_path], capture_output=True,
                                text=True)
        if result.returncode:
            print("Error Creating Trainer Directory {}".format(trainer_path))
            print(result.stderr)

        cursor = _BACKOFFICE_DB.cursor()
        sql = "INSERT INTO trainer_cluster (name, cloud_provider, start, config) VALUES ({})".format(SQLParamList(4))
        params = (cluster_name, cloud_provider, datetime.now(), json.dumps(cluster_config))
        cursor.execute(sql, params)
        _BACKOFFICE_DB.commit()
        trainer_id = cursor.lastrowid
    else:
        sql = '''SELECT id FROM trainer_cluster 
                 WHERE name = {} and cloud_provider = {} and stop IS NULL'''.format(P_MARKER, P_MARKER)
        row = select_record(_BACKOFFICE_DB, sql=sql, params=(cluster_name, cloud_provider))
        # Fail with a clear message instead of a TypeError from unpacking None.
        assert row is not None, "No active Trainer '{}' found for cloud provider '{}'".format(
            cluster_name, cloud_provider)
        trainer_id, = row
    # Create trainer yaml config file
    # When a cluster with the same name and provider is relaunched the configuration is overridden
    if cloud_provider != '':
        trainer_yaml = _TRAINER_YAML(cluster_name, cloud_provider)
        # ToDo: Test aws
        # Render the config first so a failure here does not leave a
        # truncated YAML file behind.
        yaml_content = trainer_cluster_config(cloud_provider, cluster_name, trainer_path,
                                              config=cluster_config)
        with open(trainer_yaml, "wt") as config_file:
            config_file.write(yaml_content)
        # launch the cluster
        result = subprocess.run(_CMD_PREFIX + "ray up {} --no-config-cache -y".format(trainer_yaml),
                                shell=True, capture_output=True, text=True, executable=_SHELL)
        # The outcomes of the two commands below are not checked; only the
        # ``ray up`` result is returned to the caller.
        subprocess.run(_CMD_PREFIX + "ray exec {} 'rm -r /home/ubuntu/trainer/*'".format(trainer_yaml),
                       shell=True, capture_output=True, text=True, executable=_SHELL)
        subprocess.run(_CMD_PREFIX + "ray rsync_up {} '{}/' '/home/ubuntu/trainer/'".format(
            trainer_yaml, trainer_path),
                       shell=True, capture_output=True, text=True, executable=_SHELL)

    _BACKOFFICE_DB.commit()
    return trainer_id, result
Exemplo n.º 6
0
def delete_trainer(trainer_id: int):
    """Tear down a trainer and remove its files and backoffice record.

    The trainer is torn down first, then its directory (and, for a non-empty
    cloud provider, its YAML file) is removed and the ``trainer_cluster`` row
    is deleted. Failures of the ``rm`` commands are only printed.

    :param trainer_id: id of the trainer cluster in the backoffice database.
    :raises AssertionError: if the trainer still has policies with a
        non-null ``backend_name``.
    """
    deployed_sql = '''SELECT count(*) FROM policy 
             WHERE trainer_id = {} AND backend_name IS NOT NULL'''.format(P_MARKER)
    (deployed, ) = select_record(_BACKOFFICE_DB, sql=deployed_sql, params=(trainer_id,))
    assert deployed == 0, "Can not delete trainer with deployed policies"
    tear_down_trainer(trainer_id=trainer_id)

    trainer_name, cloud_provider = _get_trainer_and_cloud(trainer_id=trainer_id)

    # Remove the trainer directory.
    removal = subprocess.run(['rm', '-r', _TRAINER_PATH(trainer_name, cloud_provider)],
                             capture_output=True, text=True)
    if removal.returncode:
        print(removal.stderr)

    # The YAML file is only looked for when a cloud provider is set.
    has_cloud = cloud_provider != ''
    if has_cloud:
        removal = subprocess.run(['rm', _TRAINER_YAML(trainer_name, cloud_provider)],
                                 capture_output=True, text=True)
        if removal.returncode:
            print(removal.stderr)

    delete_sql = '''DELETE FROM trainer_cluster WHERE id = {}'''.format(P_MARKER)
    db_cursor = _BACKOFFICE_DB.cursor()
    db_cursor.execute(delete_sql, (trainer_id,))
    _BACKOFFICE_DB.commit()
Exemplo n.º 7
0
    def get_policy_run_data(self,
                            sim_config: int = None,
                            baseline: bool = True):
        """Return the rewards of every policy run for a sim config.

        Each stored run holds a JSON list in ``results``; the list is
        expanded so that the returned DataFrame has one row per reward, with
        the columns ``policy`` (run label), ``time`` (``time_start`` of the
        run) and ``reward``.

        :param sim_config: id of the sim config to report on; when ``None``
            the id returned by ``self._get_sim_base_config()`` is used.
        :param baseline: when true, rows of the ``baseline_run`` table for
            the same sim config are appended, labelled ``baseline_run<id>``.
        :raises AssertionError: if ``sim_config`` is given but no such row
            exists in ``sim_config``.
        """
        if sim_config is None:
            sim_config = self._get_sim_base_config()
        else:
            sql = "SELECT id FROM sim_config WHERE id = {}".format(P_MARKER)
            row = select_record(self.db, sql=sql, params=(sim_config, ))
            assert row is not None, "Invalid Sim Config id {}".format(
                sim_config)
            sim_config, = row

        columns = ['policy', 'time', 'reward']

        sql = '''SELECT policy_id, policy_run.id, time_start, results 
                 FROM policy_run
                 INNER JOIN policy ON policy_run.policy_id = policy.id
                 WHERE policy.sim_config_id = {}'''.format(P_MARKER)
        params = (sim_config, )
        policy_runs = select_all(self.db, sql=sql, params=params)
        df = pd.DataFrame(
            [["ai_policy{}_run{}".format(policy_id, run_id), time_start, reward]
             for policy_id, run_id, time_start, results in policy_runs
             for reward in json.loads(results)],
            columns=columns)
        if baseline:
            sql = '''SELECT id, time_start, results 
                     FROM baseline_run
                     WHERE sim_config_id = {}'''.format(P_MARKER)
            params = (sim_config, )
            baseline_runs = select_all(self.db, sql=sql, params=params)
            baseline_df = pd.DataFrame(
                [["baseline_run{}".format(run_id), time_start, reward]
                 for run_id, time_start, results in baseline_runs
                 for reward in json.loads(results)],
                columns=columns)
            # DataFrame.append was removed in pandas 2.0; concat is the
            # equivalent (original row labels are kept, as append did).
            df = pd.concat([df, baseline_df])

        return df
Exemplo n.º 8
0
def _get_trainer_and_cloud(trainer_id: int):
    """Return the ``(name, cloud_provider)`` record of a trainer cluster.

    :param trainer_id: id of the trainer cluster in the backoffice database.
    :raises AssertionError: if no trainer cluster has that id.
    """
    query = "SELECT name, cloud_provider FROM trainer_cluster WHERE id = {}".format(P_MARKER)
    trainer_row = select_record(_BACKOFFICE_DB, sql=query, params=(trainer_id,))
    assert trainer_row is not None, "Unknown Trainer ID {}".format(trainer_id)
    return trainer_row
Exemplo n.º 9
0
def deploy_policy(backend_server: ServeClient, trainer_id: int, policy_id: int, policy_config: dict = None):
    """Deploy a trained policy as a serving backend and record it.

    The policy's model name, checkpoint and agent config are read from the
    trainer's database, a backend named ``trainer<trainer_id>_policy<policy_id>``
    is created on ``backend_server``, and the deployment is recorded in the
    backoffice ``policy`` table (``INSERT OR IGNORE``).

    :param backend_server: serve client on which the backend is created.
    :param trainer_id: id of the trainer cluster in the backoffice database.
    :param policy_id: id of the policy in the trainer database.
    :param policy_config: backend config; defaults to ``{'num_replicas': 1}``.
    :return: the name of the created backend.
    :raises AssertionError: if the policy is not found in the trainer database.
    """
    class ServeModel:
        """Callable backend that restores a PPO trainer and serves actions."""

        def __init__(self, agent_config: dict, checkpoint_path: str, trainer_path: str, model_name: str):
            """Import the simulation model and restore the trainer checkpoint.

            :raises Exception: if the model module cannot be found.
            """
            sim_path = '{}.models.{}'.format(trainer_path, model_name)
            exec_locals = {}
            try:
                # NOTE(review): exec() runs a string built from database
                # values; safe only while those values are trusted.
                exec("from {} import SimBaseline, N_ACTIONS, OBSERVATION_SPACE, SimModel, BASE_CONFIG".format(
                    sim_path), {}, exec_locals)
            except ModuleNotFoundError as err:
                # Keep the original import error attached as the cause.
                raise Exception(" Model '{}' not found!!".format(sim_path)) from err

            agent_config["num_workers"] = 0
            agent_config["env"] = SimpyEnv
            agent_config["env_config"] = {"n_actions"        : exec_locals['N_ACTIONS'],
                                          "observation_space": exec_locals['OBSERVATION_SPACE'],
                                          "sim_model"        : exec_locals['SimModel'],
                                          "sim_config"       : exec_locals['BASE_CONFIG']}
            # The first character of the stored checkpoint path is dropped
            # before prefixing the trainer path (presumably a leading "." of
            # a relative path -- TODO confirm).
            checkpoint_path = trainer_path + checkpoint_path[1:]
            print(checkpoint_path)
            self.trainer = ppo.PPOTrainer(config=agent_config)
            self.trainer.restore(checkpoint_path)

        async def __call__(self, request: Request):
            """Return ``{"action": int}`` for the ``"observation"`` in the request JSON."""
            json_input = await request.json()
            obs = json_input["observation"]

            action = self.trainer.compute_action(obs)
            return {"action": int(action)}

    # Get Trainer DB
    trainer_name, cloud_provider = _get_trainer_and_cloud(trainer_id=trainer_id)
    trainer_path = _TRAINER_PATH(trainer_name, cloud_provider)
    trainer_db = db_connect(trainer_path + "/" + TRAINER_DB_NAME)

    # Get Policy info
    sql = '''SELECT sim_model.name, policy.checkpoint, policy.agent_config
             FROM policy INNER JOIN sim_model ON policy.sim_model_id = sim_model.id
             WHERE policy.id = {}'''.format(P_MARKER)
    row = select_record(trainer_db, sql=sql, params=(policy_id,))
    assert row is not None, "Invalid Trainer ID {} and Policy ID {}".format(trainer_id, policy_id)
    model_name, checkpoint, saved_agent_config = row
    saved_agent_config = json.loads(saved_agent_config)

    if policy_config is None:
        policy_config = {'num_replicas': 1}
    policy_name = "trainer{}_policy{}".format(trainer_id, policy_id)
    backend_server.create_backend(policy_name, ServeModel, saved_agent_config, checkpoint, trainer_path, model_name,
                                  config=policy_config,
                                  ray_actor_options=_POLICY_ACTOR_CONFIG,
                                  env=CondaEnv(_CURRENT_ENV))
    insert_sql = '''INSERT OR IGNORE INTO policy (
                        trainer_id,
                        policy_id,
                        backend_name
                    ) VALUES ({})'''.format(SQLParamList(3))
    cursor = _BACKOFFICE_DB.cursor()
    cursor.execute(insert_sql, (trainer_id, policy_id, policy_name))
    _BACKOFFICE_DB.commit()
    print("# Policy '{}' Deployed".format(policy_name))
    return policy_name
Exemplo n.º 10
0
    def __init__(self,
                 sim_name: str,
                 log_level: str = "ERROR",
                 checkpoint_path=None):
        exec_locals = {}
        try:
            exec(
                "from models.{} import SimBaseline, N_ACTIONS, OBSERVATION_SPACE, SimModel, BASE_CONFIG"
                .format(sim_name), {}, exec_locals)
        except ModuleNotFoundError:
            raise Exception(" Model '{}' not found!!".format(sim_name))
        except Exception as e:
            raise e

        try:
            self.db = db_connect(TRAINER_DB_NAME)
        except Exception as e:
            raise e

        assert isinstance(exec_locals['BASE_CONFIG'],
                          dict), "Simulation Config {} must be a dict!".format(
                              exec_locals['BASE_CONFIG'])

        assert log_level in ["DEBUG", "INFO", "WARN",
                             "ERROR"], "Invalid log_level {}".format(log_level)

        if not ray.is_initialized():
            my_ray_init()

        self._sim_baseline = exec_locals['SimBaseline']

        sql = '''SELECT id FROM sim_model WHERE name = {}'''.format(P_MARKER)
        params = (sim_name, )
        row = select_record(self.db, sql=sql, params=params)
        if row is None:
            cursor = self.db.cursor()
            cursor.execute(
                '''INSERT INTO sim_model (name) VALUES ({})'''.format(
                    P_MARKER), params)
            self._model_id = cursor.lastrowid
            params = (self._model_id, self.default_sim_config_name,
                      self._get_baseline_avg(exec_locals['BASE_CONFIG']),
                      json.dumps(exec_locals['BASE_CONFIG']))
            cursor.execute(
                '''INSERT INTO sim_config (sim_model_id,
                                                      name,
                                                      baseline_avg,
                                                      config) VALUES ({})'''.
                format(SQLParamList(4)), params)
            self.db.commit()
            print("# {} Created!".format(sim_name))
        else:
            self._model_id, = row

        self._config = self.ppo_config.copy()
        self._config["log_level"] = log_level
        self._config["env"] = SimpyEnv
        # ToDo: Change the Observation Space to a fucntion that receive a Sim Config as a parameter.
        #  In this part of the code it received exec_locals['BASE_CONFIG']
        self._config["env_config"] = {
            "n_actions": exec_locals['N_ACTIONS'],
            "observation_space": exec_locals['OBSERVATION_SPACE'],
            "sim_model": exec_locals['SimModel'],
            "sim_config": exec_locals['BASE_CONFIG']
        }
        if checkpoint_path is None:
            self.checkpoint_path = self.default_sim_checkpoint_path