Example #1
    def __init__(self, model_name, env):
        super(DQNAgent, self).__init__(model_name, env)
        self.episode = self.configs.episode
        self.batch_size = self.configs.batch_size
        self.gamma = self.configs.gamma
        self.eps_start = self.configs.eps_start
        self.eps_end = self.configs.eps_end
        self.eps_decay = self.configs.eps_decay
        self.target_update_episode = self.configs.target_update_episode

        self.model_path = self.configs.save_path
        self.save_episode = self.configs.save_episode
        self.plot_episode = self.configs.plot_episode

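        # Policy network and target network, the standard DQN pair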
        self.policy_net = models.DQN(self.configs, env).to(self.device)
        self.target_net = models.DQN(self.configs, env).to(self.device)
        self.load_model(self.model_path)
        self.optimizer = optim.Adam(
            self.policy_net.parameters(),
            lr=self.configs.optimizer_lr,
            betas=(self.configs.optimizer_beta1, self.configs.optimizer_beta2),
            eps=self.configs.optimizer_eps,
            weight_decay=self.configs.optimizer_weight_decay)
        self.memory = utils.ReplayMemory(10000)
        self.num_random_choose = 0

        self.num_choice_per_dim = self.configs.num_choice_per_dim
        self.action_dim = env.action_spec().shape
        self.action_min = env.action_spec().minimum
        self.action_max = env.action_spec().maximum

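        # Enumerate a discrete action set over the continuous action bounds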
        self.action_space = utils.enumerate(self.num_choice_per_dim,
                                            self.action_min, self.action_max)
Example #2
    def __init__(self, env):

        self.env = env
        self.input_size = env.observation_space.shape[0]
        self.output_size = env.action_space.n
        self.gamma = 0.9
        self.num_episodes = 5000
        self.replay_buffer = deque()
        self.MEMORY_SIZE = 50000

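        # Online Q-network and a separate target Q-network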
        self.model = models.DQN(self.input_size, self.output_size, [10])
        self.target_Q = models.DQN(self.input_size, self.output_size, [10])
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)
Example #3
    def __init__(self, actions, params={}):
        self.params = params
        self.model = models.ForwardModel(params).cuda()

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=params['lr'])

        if 'horizon' in params.keys():
            self.horizon = params['horizon']

        self.num_actions = actions
        self.traj = []
        self.replay_buffer = {
            'states': [],
            'actions': [],
            'next_states': [],
            'rewards': [],
            'levels': [],
            'next_levels': []
        }
        self.current_level = 0
        self.started_training = False
        self.action_buffer = []
        self.mode = 'explore'
        self.ucb_c = params['ucb_c']

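        # Q-network (DQN) with its own optimizer, alongside the forward model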
        self.dqn = models.DQN(params).cuda()
        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters(), lr=0.0001)

        print("[NeuraE3] Initialized")
Example #4
def main():

    config, experiment_name = arguments.get_args()

    # Set seed
    random.seed(config.seed)
    numpy.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    experiment = f'{config.results_dir}/{experiment_name}/'
    print("EXPERIMENT NAME: ", experiment_name)

    # Create the experiment folder and logger
    if not os.path.exists(experiment):
        os.makedirs(experiment)
    logger = utils.SimpleLogger(f'{experiment}/log.txt')

    # Copy source code
    srcpath = experiment + '/src/'
    if not os.path.exists(srcpath):
        os.makedirs(srcpath)
        os.system(f'cp *.py {srcpath}')

    # Define log settings
    log_path = experiment + '/train_baseline.log'

    # Create agent and environment
    env = environment.EnvironmentWrapper(config)
    agent = agents.Agent(config)
    if config.cuda == 1: agent = agent.cuda()
    optimizer = optim.Adam(agent.parameters(), lr=config.learning_rate)
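    # Attach a DQN to the agent, trained with its own optimizer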
    agent.dqn = models.DQN(config).cuda()
    optimizer_dqn = optim.Adam(agent.dqn.parameters(),
                               lr=config.dqn_learning_rate)
    agent.best_dqn_params = agent.dqn.state_dict()
    keep_training_dqn = True
    print(f'# parameters: {utils.count_parameters(agent)}')

    # Load checkpoint if one exists
    # if os.path.isfile(experiment + '/agent.pth'):
    #     print(f'[loading checkpoint from {experiment}]')
    #     checkpoint = torch.load(experiment + '/agent.pth')
    #     agent.load_state_dict(checkpoint['agent'].state_dict())
    #     optimizer.load_state_dict(checkpoint['optimizer'].state_dict())
    #     agent.replay_memory = checkpoint['agent'].replay_memory
    #     epoch = checkpoint['ep'] + 1
    #     perf = torch.load(experiment + '/perf.pth')
    #     print(f'[resuming at epoch {epoch}]')
    # else:
    #     epoch = 0
    #     perf = {'losses': [], 'metrics': [], 'rewards': []}
    epoch = 0
    perf = {'losses': [], 'metrics': [], 'rewards': []}
    tensorboard = Tensorboard(
        config.results_dir + f'/tensorboard/{experiment_name}',
        log_dir=config.results_dir + '/tensorboard_logs/')

    best_exploit_perf = -math.inf
    dqn_epochs_completed = 0

    # Start algorithm
    phase = 'explore'
    while epoch < 200:

        if epoch < config.n_exploration_epochs:
            #### Explore phase
            phase = 'explore'
            agent.eval()
            exploration_policy = 'random' if epoch == 0 else config.exploration_policy
            ep_reward, ep_length = agent.act(env,
                                             'train',
                                             config,
                                             policy=exploration_policy,
                                             goal='explore')
            logger.log(
                f'EXPLORE PHASE | mean reward: {ep_reward}, mean episode length: {ep_length}'
            )
            if config.test == 1:
                agent.act(env,
                          'test',
                          config,
                          policy=exploration_policy,
                          goal='explore')
            # train the models
            for i in range(config.n_training_epochs):
                if (i < config.n_training_epochs - 1):
                    split = 'train'
                    agent.train()
                else:
                    split = 'test'
                    agent.eval()
                losses, log_string = agent.train_model(
                    split,
                    'explore',
                    optimizer,
                    config,
                    tensorboard,
                    update=(split == 'train'))
                logger.log(f'TRAINING MODEL | ep {epoch}/{i} | {log_string}')
        else:
            #### Exploit phase
            if 'maze' in config.env:
                # just do search
                ep_reward, ep_length = agent.act(
                    env,
                    'train',
                    config,
                    policy='particle2',
                    goal='exploit',
                    n_episodes=config.n_trajectories)
                logger.log(
                    f'EXPLOIT PHASE: epoch {epoch}, mean reward: {ep_reward}, mean episode length: {ep_length}'
                )
            else:
                if phase == 'explore':
                    # this is our first time exploiting - train the DQN for a while
                    phase = 'exploit'
                    agent.train_policy_dqn('train',
                                           'explore',
                                           optimizer_dqn,
                                           config,
                                           n_updates=config.dqn_model_updates,
                                           logger=logger)
                    agent.best_dqn_params = copy.deepcopy(
                        agent.dqn.state_dict())
                    keep_training_dqn = True
                else:
                    # act in the environment
                    if keep_training_dqn:
                        agent.train_policy_dqn('train',
                                               'explore',
                                               optimizer_dqn,
                                               config,
                                               n_updates=25000,
                                               logger=logger)
                    ep_reward, ep_length = agent.act(
                        env,
                        'test',
                        config,
                        policy='dqn',
                        n_episodes=config.dqn_eval_ep)
                    logger.log(
                        f'EXPLOIT PHASE: epoch {epoch}, mean reward: {ep_reward}, mean episode length: {ep_length}, DQN training: {keep_training_dqn}'
                    )
                    if keep_training_dqn:
                        if ep_reward >= best_exploit_perf or config.checkpoint_dqn == 0:
                            best_exploit_perf = ep_reward
                            agent.best_dqn_params = copy.deepcopy(
                                agent.dqn.state_dict())  # TODO clone!!!!
                        else:
                            agent.dqn.load_state_dict(agent.best_dqn_params)
                            keep_training_dqn = False

        perf['epoch'] = epoch
        perf['rewards'].append(ep_reward)
        torch.save(perf, f'{experiment}/perf.pth')
        torch.save({
            'agent': agent,
            'optimizer': optimizer,
            'ep': epoch
        }, f'{experiment}/agent.epoch{epoch}.pth')
        torch.save({
            'agent': agent,
            'optimizer': optimizer,
            'ep': epoch
        }, f'{experiment}/agent.pth')
        torch.save(agent.replay_memory, f'{experiment}/replay_memory.pth')
        epoch += 1
Example #5
        if render:
            env.render()
        state_t = torch.tensor(state)
        if isinstance(env.action_space, gym.spaces.Discrete):
            action_values = network(state_t.unsqueeze(0))
            # convert to a plain Python int for env.step
            action = torch.argmax(action_values).item()
        else:
            mu, _ = network(state_t)
            action = mu.cpu().numpy()
        state, reward, done, _ = env.step(action)
        total_reward += reward
    env.close()
    return total_reward


if __name__ == "__main__":

    net_state_file_dir = os.getcwd()
    env = preproccessing.make_env(env_name)
    print(net_state_file_dir)
    policy = models.DQN(env.observation_space.shape, env.action_space.n)
    print(f"environment name: {env_name} | Solving reward: {env.spec.reward_threshold} | Episode length: {env.spec.max_episode_steps}")
    while True:
        policy.load_state_dict(torch.load(net_state_file_dir + f'/DQN-{env_name}_network_state.pt'))
        total_return = test_policy(policy, env, render=render, record=record)
        print(f"Total return {total_return}")
        print("Press Enter to play again")
        input('')


Example #6
def prepare():
    if not os.path.exists(CONST.LOG_PATH):
        os.mkdir(CONST.LOG_PATH)
    if not os.path.exists(CONST.LOG_SYSBENCH_PATH):
        os.mkdir(CONST.LOG_SYSBENCH_PATH)

    global opt, task_detail, instance_detail, model_detail

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size',
                        type=int,
                        default=2,
                        help='Training Batch Size')
    parser.add_argument('--memory',
                        type=str,
                        default='',
                        help='add replay memory')
    parser.add_argument('--task_id',
                        type=int,
                        required=True,
                        help='get task info')
    parser.add_argument('--inst_id',
                        type=int,
                        required=True,
                        help='get inst info')
    parser.add_argument('--model_id',
                        type=int,
                        required=True,
                        help='get model info')
    parser.add_argument('--host',
                        type=str,
                        required=True,
                        help='cluster host for set mysql param')

    opt = parser.parse_args()

    task_id = opt.task_id
    inst_id = opt.inst_id
    model_id = opt.model_id

    init_logger(task_id, False, True)
    CONST.TASK_ID = task_id

    logger.info("start cdbtune")
    logger.info(opt)
    # taskdb = database("127.0.0.1",3306,"root","123456","cdbtune")
    taskdb = database("10.249.50.200", 4839, "cdbtune", "123456", "cdbtune")

    rsp_task = taskdb.fetch_all("select * from tb_task where task_id = %d" %
                                task_id)
    rsp_inst = taskdb.fetch_all(
        "select * from tb_mysql_inst where inst_id = %d" % inst_id)
    rsp_model = taskdb.fetch_all(
        "select * from tb_models where model_id = %d" % model_id)

    if len(rsp_task) == 0 or len(rsp_inst) == 0 or len(rsp_model) == 0:
        os_quit(Err.INPUT_ERROR, "task_id, inst_id or model_id doesn't exist")

    task_detail = rsp_task[0]
    instance_detail = rsp_inst[0]
    model_detail = rsp_model[0]

    method = model_detail["method"]
    model_path = model_detail["position"]
    num_knobs = model_detail["knobs"]
    num_metrics = model_detail["dimension"]

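    # Environment wrapper for the target instance, built from the task/instance/model records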
    env = environment.TencentServer(instance=instance_detail,
                                    task_detail=task_detail,
                                    model_detail=model_detail,
                                    host=opt.host)

    # Build models
    if method == 'ddpg':
        ddpg_opt = dict()
        ddpg_opt['tau'] = 0.001
        ddpg_opt['alr'] = 0.00001
        ddpg_opt['clr'] = 0.00001
        ddpg_opt['model'] = model_path

        gamma = 0.99
        memory_size = 100000
        ddpg_opt['gamma'] = gamma
        ddpg_opt['batch_size'] = opt.batch_size
        ddpg_opt['memory_size'] = memory_size

        model = models.DDPG(n_states=num_metrics,
                            n_actions=num_knobs,
                            opt=ddpg_opt,
                            ouprocess=True)
    else:
        model = models.DQN()
        pass

    if len(opt.memory) > 0:
        model.replay_memory.load_memory(opt.memory)
        logger.info("Load Memory: {}".format(len(model.replay_memory)))

    # Load mean value and variance

    current_knob = environment.get_init_knobs()

    return env, model
Example #7
env.reset()
plt.figure()

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

IMAGE_SIZE = 3 * 32 * 48

tb = SummaryWriterWithGlobal('cartpole')

policy_net = models.DQN().to(device)
target_net = models.DQN().to(device)
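# Copy the policy network's weights into the target network and switch it to eval mode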
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

vae = models.ThreeLayerLinearVAE(IMAGE_SIZE, 10).to(device)
vae_optim = optim.Adam(vae.parameters(), lr=1e-3)

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

eye = Eyes(env, vae)
#eye.registerObserver('raw', OpenCV('raw'))
eye.registerObserver('raw', OpenCV('raw'))
eye.registerObserver('recon', OpenCV('recon'))
eye.registerObserver('raw', ImageFileWriter('data/images/fullscreen', 'raw'))
Example #8
        ddpg_opt['model'] = opt.params
        n_states = opt.metric_num
        gamma = 0.99
        memory_size = 100000
        num_actions = opt.default_knobs + opt.other_knob
        ddpg_opt['gamma'] = gamma
        ddpg_opt['batch_size'] = opt.batch_size
        ddpg_opt['memory_size'] = memory_size

        model = models.DDPG(n_states=n_states,
                            n_actions=num_actions,
                            opt=ddpg_opt,
                            ouprocess=not opt.noisy)

    else:
        model = models.DQN()
        pass

    if not os.path.exists('log'):
        os.mkdir('log')

    if not os.path.exists('save_memory'):
        os.mkdir('save_memory')

    if not os.path.exists('save_knobs'):
        os.mkdir('save_knobs')

    if not os.path.exists('save_state_actions'):
        os.mkdir('save_state_actions')

    if not os.path.exists('model_params'):
        os.mkdir('model_params')
Example #9
def _create_agents(config_list):
    """
    Create agents with different hyper-parameters.

    Parameters
    ----------
    config_list : list of dict
        List of configuration dicts. Each dict holds settings
        such as the model name, learning rate, etc.

    Returns
    -------
        Created agents list and core agent object.

    """
    try:
        agents = []
        for config in config_list:
            hyper_parameters = utils.Hyperparameter(
                batch_size=config["batch_size"],
                gamma=config["gamma"],
                eps_start=config["eps_start"],
                eps_end=config["eps_end"],
                eps_decay=config["eps_decay"],
                target_update=config["target_update"],
                default_durability=config["default_durability"],
                learning_rate=config["learning_rate"],
                initial_memory=config["initial_memory"],
                n_episode=config["n_episode"],
                n_actions=config["n_action"],
                default_durability_decreased_level=config[
                    "default_durability_decreased_level"],
                default_durability_increased_level=config[
                    "default_durability_increased_level"],
                default_check_frequency=config["default_check_frequency"],
                default_healing_frequency=config["default_healing_frequency"],
                env_name=config["env_name"],
                exp_name=config["exp_name"],
                render=config["render"],
                run_name=config["run_name"],
                output_directory_path=config["output_directory_path"],
                hyper_dash=config["hyper_dash"],
                model_saving_frequency=config["model_saving_frequency"],
                parameters_name=config["name"],
                roulette_mode=config["roulette_mode"],
                max_reward=config["max_reward"],
                min_reward=config["min_reward"])
            print(config["name"])
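            # Non-"core" configs create regular agents; the "core" config creates the core agent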
            if config["name"] != "core":
                if config["model"] == "DQN":
                    policy_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                elif config["model"] == "DDQN":
                    policy_net = models.DDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                elif config["model"] == "DQNbn":
                    policy_net = models.DQNbn(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DQNbn(n_actions=4).to(
                        hyper_parameters.DEVICE)
                elif config["model"] == "NonBatchNormalizedDQN":
                    policy_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                # elif args["model"] == "RamDQN":
                #     policy_net = models.RamDQN(n_actions=4).to(hyper_parameters.DEVICE)
                #     target_net = models.RamDQN(n_actions=4).to(hyper_parameters.DEVICE)
                else:
                    policy_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                optimizer = optim.Adam(policy_net.parameters(),
                                       lr=hyper_parameters.LEARNING_RATE)
                agents.append(
                    Agent(policy_net, target_net,
                          hyper_parameters.DEFAULT_DURABILITY, optimizer,
                          config["name"], hyper_parameters))
            else:
                # For core agent
                policy_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                    hyper_parameters.DEVICE)
                target_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                    hyper_parameters.DEVICE)
                optimizer = optim.Adam(policy_net.parameters(),
                                       lr=hyper_parameters.LEARNING_RATE)
                core_agent = Agent(policy_net, target_net,
                                   hyper_parameters.DEFAULT_DURABILITY,
                                   optimizer, config["name"], hyper_parameters)
            print("Agent:{} has been done".format(config["name"]))
        try:
            core_agent
        except Exception as e:
            print("P_RuntimeError:0x1000 Core agent has not been defined.")
            tb = sys.exc_info()[2]
            print(e.with_traceback(tb))
            sys.exit(1)
        return agents, core_agent
    except Exception as e:
        print("P_RuntimeError:0x1001 Some arguments are missing.")
        tb = sys.exc_info()[2]
        print(e.with_traceback(tb))
        sys.exit(1)
Example #10
##################################################################
BATCH_SIZE = 32
REPLAY_MEMORY_SIZE = 100000
DOUBLE_DQN = True
EPSILON = 1.0
GAMMA = 0.99
ANNEAL_TO = 0.02
ANNEAL_OVER = 100000   # time steps
ANNEAL_STEP = (EPSILON - ANNEAL_TO) / ANNEAL_OVER
NUM_EPISODES = 10000000
NUM_WARMSTART = 35      # episodes
MAX_NOOP_ITERS = 30
TARGET_UPDATE = 1000     # time steps
PROGRESS_INTERVAL = 10   # episodes

policy_net = models.DQN(FRAME_HISTORY_SIZE, NUM_ACTIONS).to(device)
target_net = models.DQN(FRAME_HISTORY_SIZE, NUM_ACTIONS).to(device)
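# Initialize the target network from the policy network and put it in eval mode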
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

#optimizer = optim.RMSprop(policy_net.parameters(), lr=0.00025, momentum=0.95, eps=0.01)
#optimizer = optim.RMSprop(policy_net.parameters(), lr=0.00005)
#optimizer = optim.Adam(policy_net.parameters(), lr=0.0001) #7
optimizer = optim.RMSprop(policy_net.parameters(),  lr=0.0025, alpha=0.9, eps=1e-02, momentum=0.0) #8
#optimizer = optim.RMSprop(policy_net.parameters(),  lr=0.00025, alpha=0.95, eps=0.01, momentum=0.0) #9
memory = ReplayMemory(REPLAY_MEMORY_SIZE)

# Select an action randomly without annealing EPSILON
def select_random_action():
    return torch.tensor([[random.randrange(NUM_ACTIONS)]], device=device, dtype=torch.long)
Example #11
        print(
            '-------------------------------DQN----------------------------------'
        )
        print(
            '----------------------------------------------------------------------------------------'
        )
        print(
            f"environment name: {env_name}         | number of iterations: {iters}"
        )
        print(
            f"Hyperparameters: learning rate: {lr} | gamma: {gamma} | Epsilon: {eps}"
        )
        print(
            f"Hyperparameters: Epsilon decay rate: {eps_rate} | Buffer size: {buffer_size} | Batch size: {batch_size}"
        )

        DQN_net = models.DQN(env.observation_space.shape, env.action_space.n)
        target_net = models.DQN(env.observation_space.shape,
                                env.action_space.n)

        buffer = ReplayBuffer(buffer_size)
        epsilone = rlUtils.Epsilon(eps, eps_min, eps_rate)
        experience = collections.namedtuple(
            'Experience', ['obs', 'action', 'reward', 'next_obs', 'done'])

        print(DQN_net)

        print(
            '----------------------------------------------------------------------------------------'
        )

        optimizer = opt.Adam(DQN_net.parameters(), lr=lr)